Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)

2026-06-21 00:10:11 +02:00
commit d3bb91394b
71 changed files with 8138 additions and 0 deletions
--- a/backend/inkflow/epub/parser.py
+++ b/backend/inkflow/epub/parser.py
@@ -0,0 +1,267 @@
+"""Parsing EPUB -> structure de livre normalisee.
+
+Strategie :
+- ebooklib lit l'archive (manifest + spine + ncx).
+- L'ordre de lecture vient du spine.
+- Les titres viennent de la table des matieres (ncx/nav), mappes par href.
+- Le texte de chaque document est extrait via BeautifulSoup (paragraphes).
+- On classe chaque item en front / chapter / back et on decide s'il faut le lire.
+
+Sorties ecrites dans data/<slug>/ :
+- book.json                : metadonnees + liste des chapitres (modele Book)
+- chapters/chNN.json       : texte normalise par chapitre (modele ChapterText)
+- cover.<ext>              : couverture extraite (si presente)
+"""
+from __future__ import annotations
+
+import re
+import warnings
+from pathlib import Path
+from typing import Optional
+from urllib.parse import unquote, urldefrag
+
+import ebooklib
+from bs4 import BeautifulSoup
+from ebooklib import epub
+
+# EPUB XHTML documents trigger a harmless bs4 warning; silence it.
+# The import is guarded so that a bs4 build that does not export the warning
+# class is tolerated.
+try:
+    from bs4 import XMLParsedAsHTMLWarning
+    warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
+except ImportError:  # pragma: no cover
+    pass
+
+from ..config import book_data_dir
+from ..models import Book, Chapter, ChapterKind, ChapterText
+from ..util import safe_filename, slugify
+
+# A chapter title starts with a number, PROLOGUE or EPILOGUE (case-insensitive,
+# leading whitespace allowed).
+_CHAPTER_RE = re.compile(r"^\s*(\d+|prologue|[ée]pilogue)\b", re.IGNORECASE)
+# Captures "<number> - <POV>" or just "<number>"; the separator may be a
+# hyphen, an en dash or an em dash.
+_TITLE_PARTS_RE = re.compile(r"^\s*([^-\n]+?)(?:\s*[-–—]\s*(.+))?\s*$")
+
+# Minimum word count for a back-matter item (acknowledgements...) to be read.
+_BACK_MATTER_MIN_WORDS = 40
+
+
+def _build_toc_titles(book: epub.EpubBook) -> dict[str, str]:
+    """Map each TOC href (fragment removed, URL-decoded) to its title.
+
+    The table of contents is flattened depth-first. It may contain plain
+    entries, nested lists, and ``(head, children)`` tuples for entries that
+    have sub-entries. The head of such a tuple is registered whenever it
+    carries an href, whether it is a ``Link`` or a ``Section``; otherwise a
+    chapter that owns sub-entries would never get a title.
+
+    The first title seen for an href wins. Entries without an href or with an
+    empty / whitespace-only title are ignored.
+    """
+    titles: dict[str, str] = {}
+
+    def _add(entry) -> None:
+        # Link and Section both expose ``href`` and ``title``; getattr keeps
+        # this tolerant of entries that lack either attribute or hold None.
+        raw_href = getattr(entry, "href", None) or ""
+        title = (getattr(entry, "title", None) or "").strip()
+        href = unquote(urldefrag(raw_href)[0])
+        if href and title and href not in titles:
+            titles[href] = title
+
+    def walk(items) -> None:
+        for it in items:
+            if isinstance(it, tuple):  # (head, [children])
+                head, children = it
+                _add(head)
+                walk(children)
+            elif isinstance(it, list):
+                walk(it)
+            elif isinstance(it, (epub.Link, epub.Section)):
+                _add(it)
+
+    walk(book.toc)
+    return titles
+
+
+def _extract_paragraphs(html: bytes) -> list[str]:
+    """Extract the readable paragraphs of an XHTML document, in document order.
+
+    ``script``, ``style``, ``sup`` and ``table`` elements are removed first.
+    Text is then taken from ``p``, ``h1``-``h4``, ``blockquote`` and ``li``
+    elements. When none of these exist, the whole ``body`` is used as a
+    single block. Whitespace runs are collapsed to one space and empty blocks
+    are dropped.
+
+    Only leaf-most blocks are emitted: a block that contains another block
+    (e.g. ``<blockquote><p>..</p></blockquote>`` or ``<li><p>..</p></li>``)
+    is skipped, because its text would otherwise be returned twice, once for
+    the container and once for the nested element. As a consequence, loose
+    text placed directly inside such a container next to nested blocks is not
+    returned.
+    """
+    block_tags = ["p", "h1", "h2", "h3", "h4", "blockquote", "li"]
+
+    soup = BeautifulSoup(html, "lxml")
+    # Drop non-narrative elements.
+    for tag in soup(["script", "style", "sup", "table"]):
+        tag.decompose()
+
+    blocks = soup.find_all(block_tags)
+    if blocks:
+        # find_all also returns nested matches; keep the innermost ones only
+        # so that no text is read twice.
+        blocks = [block for block in blocks if block.find(block_tags) is None]
+    elif soup.body:
+        blocks = [soup.body]
+
+    paragraphs: list[str] = []
+    for block in blocks:
+        text = block.get_text(" ", strip=True)
+        text = re.sub(r"\s+", " ", text).strip()
+        if text:
+            paragraphs.append(text)
+    return paragraphs
+
+
+def _parse_title(title: str) -> tuple[Optional[str], Optional[str]]:
+    """Split a chapter title into ``(number, pov)``.
+
+    The title is matched against ``_TITLE_PARTS_RE``: the part before the
+    first dash separator is the number, the part after it (if any) is the
+    point of view. Each part is stripped, and an empty part becomes ``None``.
+    ``(None, None)`` is returned when the title does not match at all.
+    """
+    match = _TITLE_PARTS_RE.match(title)
+    if match is None:
+        return None, None
+    # A group that did not participate in the match is reported as "".
+    head, tail = match.groups(default="")
+    return head.strip() or None, tail.strip() or None
+
+
+def _output_name(seq: int, kind: ChapterKind, number: Optional[str], title: str) -> str:
+    """Build the mp3 file name, following the sample's ``NN-<label>.mp3`` format.
+
+    ``seq`` is rendered as a two-digit prefix. For a chapter with a number,
+    the label is "Prologue", "Épilogue", "Chapitre <n>" for a purely numeric
+    number, or the capitalized number otherwise. For anything else the label
+    is ``title``. An all-uppercase label (e.g. "REMERCIEMENTS") is
+    capitalized. The result goes through ``safe_filename`` before the
+    extension is appended.
+    """
+    fixed_labels = {
+        "prologue": "Prologue",
+        "epilogue": "Épilogue",
+        "épilogue": "Épilogue",
+    }
+
+    if kind is ChapterKind.CHAPTER and number:
+        label = fixed_labels.get(number.lower())
+        if label is None:
+            if number.isdigit():
+                label = f"Chapitre {int(number)}"
+            else:
+                label = number.capitalize()
+    else:
+        label = title
+
+    # All-uppercase titles read better with only the first letter capitalized.
+    if label.isupper():
+        label = label.capitalize()
+
+    stem = safe_filename(f"{seq:02d}-{label}")
+    return stem + ".mp3"
+
+
+def _classify(ordered: list[dict]) -> None:
+    """Assign ``kind`` and ``render`` to every item (mutates the dicts in place).
+
+    An item is a chapter when its title is non-empty and matches
+    ``_CHAPTER_RE``.
+
+    - chapter: ``kind = CHAPTER``; rendered when it has at least one word.
+    - front:   any non-chapter item located before the first chapter (cover,
+               title page...); never rendered.
+    - back:    any non-chapter item located after the first chapter; rendered
+               when it has at least ``_BACK_MATTER_MIN_WORDS`` words. This
+               includes untitled or non-matching items that sit *between* two
+               chapters, not only those after the last chapter.
+
+    When no title matches the chapter pattern, every item is classified as
+    front matter and nothing is rendered.
+    """
+    # Evaluate the chapter pattern once per item.
+    chapter_flags = [
+        bool(it["title"]) and bool(_CHAPTER_RE.match(it["title"]))
+        for it in ordered
+    ]
+    first = chapter_flags.index(True) if True in chapter_flags else len(ordered)
+
+    for i, (it, is_chapter) in enumerate(zip(ordered, chapter_flags)):
+        if is_chapter:
+            it["kind"] = ChapterKind.CHAPTER
+            it["render"] = it["word_count"] > 0
+        elif i < first:
+            it["kind"] = ChapterKind.FRONT
+            it["render"] = False
+        else:  # non-chapter item after the first chapter
+            it["kind"] = ChapterKind.BACK
+            it["render"] = it["word_count"] >= _BACK_MATTER_MIN_WORDS
+
+
+def _extract_cover(book: epub.EpubBook, dest_dir: Path) -> Optional[str]:
+    """Write the book's cover image into ``dest_dir`` and return its file name.
+
+    The first item of type ``ITEM_COVER`` is used. Failing that, the first
+    image item whose name contains "cover" (case-insensitive) is used. The
+    file is written as ``cover<ext>``, reusing the extension of the item's
+    name, or ``.jpg`` when the name has none. Returns ``None`` when no
+    candidate is found.
+    """
+    candidate = next(iter(book.get_items_of_type(ebooklib.ITEM_COVER)), None)
+    if candidate is None:
+        # Fallback: an image item whose name looks like a cover.
+        candidate = next(
+            (
+                image
+                for image in book.get_items_of_type(ebooklib.ITEM_IMAGE)
+                if "cover" in image.get_name().lower()
+            ),
+            None,
+        )
+    if candidate is None:
+        return None
+
+    suffix = Path(candidate.get_name()).suffix or ".jpg"
+    target = dest_dir / f"cover{suffix}"
+    target.write_bytes(candidate.get_content())
+    return target.name
+
+
+def parse_epub(epub_path: str | Path, slug: Optional[str] = None) -> Book:
+    """Parse an EPUB and write book.json + chapters/chNN.json under data/<slug>/.
+
+    Steps:
+    1. Read the archive and its Dublin Core metadata. The title defaults to
+       the file stem and the language to "fr"; the slug defaults to the
+       slugified title.
+    2. Walk the spine in reading order, keeping document items only, and
+       extract each document's paragraphs and word count. Titles come from
+       the table of contents, looked up by document name.
+    3. Classify every item (front / chapter / back) and decide whether it is
+       rendered.
+    4. Extract the cover, write one ``chapters/chNN.json`` per rendered item
+       (``NN`` is the item's position in the spine-ordered list), then write
+       ``book.json``.
+
+    Args:
+        epub_path: Path of the EPUB file.
+        slug: Name of the output directory; derived from the title when
+            omitted.
+
+    Returns:
+        The ``Book`` model that was written to ``book.json``.
+    """
+    epub_path = Path(epub_path)
+    book_ml = epub.read_epub(str(epub_path), options={"ignore_ncx": False})
+
+    title = _meta(book_ml, "title") or epub_path.stem
+    author = _meta(book_ml, "creator")
+    description = _meta(book_ml, "description")
+    language = _meta(book_ml, "language") or "fr"
+    slug = slug or slugify(title)
+
+    data_dir = book_data_dir(slug)
+    chapters_dir = data_dir / "chapters"
+    chapters_dir.mkdir(parents=True, exist_ok=True)
+
+    toc_titles = _build_toc_titles(book_ml)
+
+    # Documents in spine (reading) order.
+    id_to_item = {it.get_id(): it for it in book_ml.get_items()}
+    ordered: list[dict] = []
+    for idref, _linear in book_ml.spine:
+        item = id_to_item.get(idref)
+        if item is None or item.get_type() != ebooklib.ITEM_DOCUMENT:
+            continue
+        href = unquote(item.get_name())
+        paragraphs = _extract_paragraphs(item.get_content())
+        ordered.append({
+            "item_id": idref,
+            "src": href,
+            "title": toc_titles.get(href, ""),
+            "paragraphs": paragraphs,
+            "word_count": sum(len(p.split()) for p in paragraphs),
+        })
+
+    _classify(ordered)
+
+    cover_file = _extract_cover(book_ml, data_dir)
+
+    chapters: list[Chapter] = []
+    seq = 0  # file-name prefix counter, incremented for rendered items only
+    for index, it in enumerate(ordered):
+        display_title = it["title"] or it["src"]
+
+        number = pov = None
+        if it["kind"] is ChapterKind.CHAPTER:
+            number, pov = _parse_title(it["title"])
+
+        text_file = None
+        output_name = None
+        if it["render"]:
+            seq += 1
+            ct = ChapterText(index=index, title=display_title,
+                             paragraphs=it["paragraphs"])
+            text_file = f"chapters/ch{index:02d}.json"
+            (data_dir / text_file).write_text(
+                ct.model_dump_json(indent=2), encoding="utf-8")
+            # Items without a TOC title fall back to the stem of their
+            # document name so the mp3 label is never empty.
+            label = it["title"] or Path(it["src"]).stem
+            output_name = _output_name(seq, it["kind"], number, label)
+
+        chapters.append(Chapter(
+            index=index,
+            item_id=it["item_id"],
+            src=it["src"],
+            title=display_title,
+            kind=it["kind"],
+            render=it["render"],
+            number=number,
+            pov=pov,
+            word_count=it["word_count"],
+            text_file=text_file,
+            output_name=output_name,
+        ))
+
+    book = Book(
+        slug=slug,
+        title=title,
+        author=author,
+        # Keep the first two characters only (e.g. "fr-FR" -> "fr").
+        language=language[:2],
+        description=description,
+        cover_file=cover_file,
+        chapters=chapters,
+    )
+    (data_dir / "book.json").write_text(
+        book.model_dump_json(indent=2), encoding="utf-8")
+    return book
+
+
+def _meta(book: epub.EpubBook, name: str) -> Optional[str]:
+    """Return the first Dublin Core metadata value for ``name``, or ``None``.
+
+    Each metadata entry is indexed as a sequence whose first element is the
+    value; only the first entry is considered.
+    """
+    entries = book.get_metadata("DC", name)
+    if not entries:
+        return None
+    first_entry = entries[0]
+    return first_entry[0]
+
+
+def load_book(slug: str) -> Book:
+    """Load the ``Book`` model stored in ``book.json`` of the book's data directory."""
+    raw_json = (book_data_dir(slug) / "book.json").read_text(encoding="utf-8")
+    return Book.model_validate_json(raw_json)
+
+
+def load_chapter_text(slug: str, chapter: Chapter) -> ChapterText:
+    """Load the ``ChapterText`` referenced by ``chapter.text_file``.
+
+    ``text_file`` is resolved relative to the book's data directory. It is
+    only set for rendered chapters (``parse_epub`` leaves it as ``None``
+    otherwise), so callers should pass rendered chapters only.
+    """
+    text_path = book_data_dir(slug) / chapter.text_file
+    raw_json = text_path.read_text(encoding="utf-8")
+    return ChapterText.model_validate_json(raw_json)