Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)

This commit is contained in:
2026-06-21 00:10:11 +02:00
commit d3bb91394b
71 changed files with 8138 additions and 0 deletions

View File

View File

@@ -0,0 +1,267 @@
"""Parsing EPUB -> structure de livre normalisee.
Strategie :
- ebooklib lit l'archive (manifest + spine + ncx).
- L'ordre de lecture vient du spine.
- Les titres viennent de la table des matieres (ncx/nav), mappes par href.
- Le texte de chaque document est extrait via BeautifulSoup (paragraphes).
- On classe chaque item en front / chapter / back et on decide s'il faut le lire.
Sorties ecrites dans data/<slug>/ :
- book.json : metadonnees + liste des chapitres (modele Book)
- chapters/chNN.json : texte normalise par chapitre (modele ChapterText)
- cover.<ext> : couverture extraite (si presente)
"""
from __future__ import annotations
import re
import warnings
from pathlib import Path
from typing import Optional
from urllib.parse import unquote, urldefrag
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
# Les xhtml d'epub declenchent un avertissement bs4 inoffensif ; on le tait.
try:
from bs4 import XMLParsedAsHTMLWarning
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
except ImportError: # pragma: no cover
pass
from ..config import book_data_dir
from ..models import Book, Chapter, ChapterKind, ChapterText
from ..util import safe_filename, slugify
# A chapter title starts with a number, PROLOGUE or EPILOGUE (case-insensitive,
# with or without the accent on the leading "E").
_CHAPTER_RE = re.compile(r"^\s*(\d+|prologue|[ée]pilogue)\b", re.IGNORECASE)
# Captures "<number> - <POV>" (hyphen, en dash or em dash as separator)
# or just "<number>".
_TITLE_PARTS_RE = re.compile(r"^\s*([^-\n]+?)(?:\s*[-–—]\s*(.+))?\s*$")
# Minimum word count for a back-matter item (acknowledgements...) to be read.
_BACK_MATTER_MIN_WORDS = 40
def _build_toc_titles(book: epub.EpubBook) -> dict[str, str]:
    """Map each TOC href (fragment removed, percent-decoded) to its title.

    ``book.toc`` is walked recursively. It may contain ``epub.Link`` entries,
    nested lists, and ``(head, children)`` tuples whose head is either an
    ``epub.Link`` or an ``epub.Section``.

    Link titles take precedence, and the first link seen for an href wins.
    A section heading that carries its own href only supplies a title for a
    document that no link names, so a document referenced solely by a section
    heading is not left untitled. Blank titles are ignored.

    Args:
        book: the EPUB whose ``toc`` attribute is flattened.

    Returns:
        A dict keyed by document href, with stripped, non-empty titles.
    """
    link_titles: dict[str, str] = {}
    section_titles: dict[str, str] = {}

    def _add(entry, into: dict[str, str]) -> None:
        title = (entry.title or "").strip()
        if not title or not entry.href:
            return
        # Drop the fragment *before* decoding, so an encoded "%23" inside a
        # file name is not mistaken for a fragment separator.
        href = unquote(urldefrag(entry.href)[0])
        if href:
            into.setdefault(href, title)  # first entry for an href wins

    def walk(items) -> None:
        for it in items:
            if isinstance(it, tuple):  # (head, [children])
                head, children = it
                if isinstance(head, epub.Link):
                    _add(head, link_titles)
                elif isinstance(head, epub.Section):
                    _add(head, section_titles)
                walk(children)
            elif isinstance(it, list):
                walk(it)
            elif isinstance(it, epub.Link):
                _add(it, link_titles)

    walk(book.toc)
    # Section headings only fill the gaps left by links.
    for href, title in section_titles.items():
        link_titles.setdefault(href, title)
    return link_titles
def _extract_paragraphs(html: bytes) -> list[str]:
    """Extract the readable paragraphs of an XHTML document.

    ``script``, ``style``, ``sup`` and ``table`` elements are removed first.
    Every ``p``/``h1``-``h4``/``blockquote``/``li`` element then yields one
    paragraph with whitespace collapsed. When blocks are nested (for example
    ``<blockquote><p>...</p></blockquote>``), each block only contributes the
    text that is not inside a deeper block, so no text is emitted twice.
    If the document has none of these elements, the whole ``body`` (when
    present) is returned as a single paragraph.

    Args:
        html: raw document content, parsed with the ``lxml`` parser.

    Returns:
        The non-empty paragraphs, in the order ``find_all`` returns the blocks.
    """
    block_tags = ["p", "h1", "h2", "h3", "h4", "blockquote", "li"]
    soup = BeautifulSoup(html, "lxml")
    # Remove non-narrative elements.
    for tag in soup(["script", "style", "sup", "table"]):
        tag.decompose()

    blocks = soup.find_all(block_tags)
    if not blocks:
        # No block-level markup at all: fall back to the body as one paragraph.
        if soup.body is None:
            return []
        text = re.sub(r"\s+", " ", soup.body.get_text(" ", strip=True)).strip()
        return [text] if text else []

    paragraphs: list[str] = []
    for block in blocks:
        # Keep only the strings whose nearest block-level ancestor is this
        # block; text living in a nested block is emitted by that block.
        pieces = [
            s.strip()
            for s in block.strings
            if s.find_parent(block_tags) is block
        ]
        text = re.sub(r"\s+", " ", " ".join(p for p in pieces if p)).strip()
        if text:
            paragraphs.append(text)
    return paragraphs
def _parse_title(title: str) -> tuple[Optional[str], Optional[str]]:
    """Split a chapter title into ``(number, pov)``.

    Each part is stripped; a part that is missing or blank becomes ``None``.
    ``(None, None)`` is returned when the title does not match the pattern.
    """
    match = _TITLE_PARTS_RE.match(title)
    if match is None:
        return None, None
    number, pov = (
        (match.group(idx) or "").strip() or None
        for idx in (1, 2)
    )
    return number, pov
def _output_name(seq: int, kind: ChapterKind, number: Optional[str], title: str) -> str:
    """Build the mp3 file name, following the sample layout ``NN-<label>.mp3``.

    For a chapter with a number part, the label is "Prologue", "Épilogue",
    "Chapitre <n>" for a purely numeric number, or the capitalized number
    text otherwise. For anything else the label is the title, capitalized
    when it is written entirely in upper case (e.g. "REMERCIEMENTS").
    """
    if kind is not ChapterKind.CHAPTER or not number:
        label = title.capitalize() if title.isupper() else title
    else:
        fixed_labels = {
            "prologue": "Prologue",
            "epilogue": "Épilogue",
            "épilogue": "Épilogue",
        }
        lowered = number.lower()
        if lowered in fixed_labels:
            label = fixed_labels[lowered]
        elif number.isdigit():
            label = f"Chapitre {int(number)}"
        else:
            label = number.capitalize()
    stem = safe_filename(f"{seq:02d}-{label}")
    return stem + ".mp3"
def _classify(ordered: list[dict]) -> None:
    """Assign ``kind`` and ``render`` to every item (mutates the dicts in place).

    - chapter: the title matches the chapter-title pattern; rendered when the
      item has at least one word.
    - front: any non-chapter item located before the first chapter (cover,
      title page...); never rendered.
    - back: every other non-chapter item, i.e. anything after the first
      chapter, including items sitting between two chapters; rendered only
      when it reaches the back-matter word threshold.

    When no item is a chapter, every item is classified as front matter.

    Args:
        ordered: items in reading order, each with ``title`` and
            ``word_count`` keys; ``kind`` and ``render`` keys are written.
    """
    # Evaluate the title pattern once per item.
    chapter_flags = [
        bool(it["title"]) and _CHAPTER_RE.match(it["title"]) is not None
        for it in ordered
    ]
    first = chapter_flags.index(True) if True in chapter_flags else len(ordered)

    for i, (it, is_chapter) in enumerate(zip(ordered, chapter_flags)):
        if is_chapter:
            it["kind"] = ChapterKind.CHAPTER
            it["render"] = it["word_count"] > 0
        elif i < first:
            it["kind"] = ChapterKind.FRONT
            it["render"] = False
        else:  # non-chapter item after the first chapter
            it["kind"] = ChapterKind.BACK
            it["render"] = it["word_count"] >= _BACK_MATTER_MIN_WORDS
def _extract_cover(book: epub.EpubBook, dest_dir: Path) -> Optional[str]:
    """Write the book cover into ``dest_dir`` and return its file name.

    The first item of type ``ITEM_COVER`` is used; failing that, the first
    image item whose name contains "cover" (case-insensitive). The file is
    written as ``cover<ext>``, with ``.jpg`` used when the item name has no
    suffix. Returns ``None`` when no cover item is found.
    """
    chosen = next(iter(book.get_items_of_type(ebooklib.ITEM_COVER)), None)
    if chosen is None:
        # Fallback: an image item named like a cover.
        chosen = next(
            (
                image
                for image in book.get_items_of_type(ebooklib.ITEM_IMAGE)
                if "cover" in image.get_name().lower()
            ),
            None,
        )
    if chosen is None:
        return None

    suffix = Path(chosen.get_name()).suffix
    if not suffix:
        suffix = ".jpg"
    target = dest_dir / f"cover{suffix}"
    target.write_bytes(chosen.get_content())
    return target.name
def parse_epub(epub_path: str | Path, slug: Optional[str] = None) -> Book:
    """Parse an EPUB and write ``book.json`` plus ``chapters/chNN.json`` files.

    Output goes to the directory returned by ``book_data_dir(slug)``. The slug
    defaults to ``slugify(title)``; the title defaults to the EPUB file stem
    and the language to "fr" when the metadata does not provide them.

    Args:
        epub_path: path of the EPUB file to read.
        slug: optional directory slug overriding the one derived from the title.

    Returns:
        The ``Book`` model that was serialized to ``book.json``.
    """
    source = Path(epub_path)
    parsed = epub.read_epub(str(source), options={"ignore_ncx": False})

    title = _meta(parsed, "title") or source.stem
    author = _meta(parsed, "creator")
    description = _meta(parsed, "description")
    language = _meta(parsed, "language") or "fr"

    if not slug:
        slug = slugify(title)
    data_dir = book_data_dir(slug)
    (data_dir / "chapters").mkdir(parents=True, exist_ok=True)

    toc_titles = _build_toc_titles(parsed)

    # Documents, in spine (reading) order.
    items_by_id = {entry.get_id(): entry for entry in parsed.get_items()}
    docs: list[dict] = []
    for idref, _linear in parsed.spine:
        doc_item = items_by_id.get(idref)
        if doc_item is not None and doc_item.get_type() == ebooklib.ITEM_DOCUMENT:
            href = unquote(doc_item.get_name())
            paragraphs = _extract_paragraphs(doc_item.get_content())
            docs.append({
                "item_id": idref,
                "src": href,
                "title": toc_titles.get(href, ""),
                "paragraphs": paragraphs,
                "word_count": sum(len(p.split()) for p in paragraphs),
            })

    _classify(docs)
    cover_file = _extract_cover(parsed, data_dir)

    chapters: list[Chapter] = []
    rendered = 0  # file-name prefix counter, advanced for rendered items only
    for position, doc in enumerate(docs):
        kind = doc["kind"]
        shown_title = doc["title"] or doc["src"]

        if kind is ChapterKind.CHAPTER:
            number, pov = _parse_title(doc["title"])
        else:
            number, pov = None, None

        if doc["render"]:
            rendered += 1
            payload = ChapterText(
                index=position,
                title=shown_title,
                paragraphs=doc["paragraphs"],
            )
            text_file = f"chapters/ch{position:02d}.json"
            (data_dir / text_file).write_text(
                payload.model_dump_json(indent=2), encoding="utf-8")
            output_name = _output_name(rendered, kind, number, doc["title"] or "")
        else:
            text_file = None
            output_name = None

        chapters.append(Chapter(
            index=position,
            item_id=doc["item_id"],
            src=doc["src"],
            title=shown_title,
            kind=kind,
            render=doc["render"],
            number=number,
            pov=pov,
            word_count=doc["word_count"],
            text_file=text_file,
            output_name=output_name,
        ))

    book = Book(
        slug=slug,
        title=title,
        author=author,
        # ``language`` is never empty here (it falls back to "fr" above).
        language=language[:2],
        description=description,
        cover_file=cover_file,
        chapters=chapters,
    )
    (data_dir / "book.json").write_text(
        book.model_dump_json(indent=2), encoding="utf-8")
    return book
def _meta(book: epub.EpubBook, name: str) -> Optional[str]:
    """Return the first non-blank Dublin Core metadata value for ``name``.

    ``book.get_metadata("DC", name)`` yields a sequence of entries whose first
    element is the value. Values are stripped of surrounding whitespace;
    missing, ``None`` or blank values are skipped.

    Args:
        book: object exposing ``get_metadata(namespace, name)``.
        name: Dublin Core element name (e.g. "title", "creator").

    Returns:
        The stripped value, or ``None`` when no usable value exists.
    """
    for entry in book.get_metadata("DC", name) or ():
        value = entry[0] if entry else None
        if isinstance(value, str):
            value = value.strip()
            if value:
                return value
    return None
def load_book(slug: str) -> Book:
    """Load the ``Book`` model stored in ``book.json`` under the slug's data directory."""
    raw_json = (book_data_dir(slug) / "book.json").read_text(encoding="utf-8")
    return Book.model_validate_json(raw_json)
def load_chapter_text(slug: str, chapter: Chapter) -> ChapterText:
    """Load the normalized text of ``chapter`` from the slug's data directory.

    Args:
        slug: book slug, resolved through ``book_data_dir``.
        chapter: chapter whose ``text_file`` (a path relative to the book
            data directory) points at the JSON file to read.

    Returns:
        The ``ChapterText`` model parsed from that file.

    Raises:
        ValueError: if ``chapter.text_file`` is empty, which is the case for
            chapters that were not rendered when the EPUB was parsed.
    """
    if not chapter.text_file:
        # Fail with an explicit message instead of the opaque TypeError that
        # joining a path with None would raise.
        raise ValueError(
            f"Chapter {chapter.index} ({chapter.title!r}) has no text file"
        )
    path = book_data_dir(slug) / chapter.text_file
    return ChapterText.model_validate_json(path.read_text(encoding="utf-8"))