# InkFlow/backend/inkflow/epub/parser.py

"""Parsing EPUB -> structure de livre normalisee.

Strategie :
- ebooklib lit l'archive (manifest + spine + ncx).
- L'ordre de lecture vient du spine.
- Les titres viennent de la table des matieres (ncx/nav), mappes par href.
- Le texte de chaque document est extrait via BeautifulSoup (paragraphes).
- On classe chaque item en front / chapter / back et on decide s'il faut le lire.

Sorties ecrites dans data/<slug>/ :
- book.json                : metadonnees + liste des chapitres (modele Book)
- chapters/chNN.json       : texte normalise par chapitre (modele ChapterText)
- cover.<ext>              : couverture extraite (si presente)
"""
from __future__ import annotations

import re
import warnings
from pathlib import Path
from typing import Optional
from urllib.parse import unquote, urldefrag

import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub

# EPUB XHTML documents trigger a harmless bs4 warning when parsed; silence it.
try:
    from bs4 import XMLParsedAsHTMLWarning
    warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
except ImportError:  # pragma: no cover
    # Presumably a bs4 release that does not ship this warning class; there is
    # nothing to silence in that case.
    pass

from ..config import book_data_dir
from ..models import Book, Chapter, ChapterKind, ChapterText
from ..util import safe_filename, slugify

# A chapter title starts with a number, PROLOGUE or EPILOGUE
# (case-insensitive; the accented spelling "épilogue" is accepted as well).
_CHAPTER_RE = re.compile(r"^\s*(\d+|prologue|[ée]pilogue)\b", re.IGNORECASE)
# Captures "<number> - <POV>" or just "<number>"; the separator may be a
# hyphen, an en dash or an em dash.
_TITLE_PARTS_RE = re.compile(r"^\s*([^-\n]+?)(?:\s*[-–—]\s*(.+))?\s*$")

# Minimum word count for a back-matter item (acknowledgements...) to be read.
_BACK_MATTER_MIN_WORDS = 40


def _build_toc_titles(book: epub.EpubBook) -> dict[str, str]:
    """Map each TOC href (fragment stripped, URL-decoded) to its title.

    ``book.toc`` is a nested structure mixing plain entries, lists of entries
    and ``(parent, [children])`` tuples.  ebooklib represents a parent that
    has sub-entries as an ``epub.Section`` carrying its own ``href`` and
    ``title``, so every entry exposing those two attributes is recorded, not
    only ``epub.Link`` instances; otherwise a chapter with nested TOC entries
    would lose its title.

    Args:
        book: The parsed EPUB whose ``toc`` attribute is walked.

    Returns:
        A dict ``href -> stripped title``.  When several entries point to the
        same document, the first one in TOC order wins.  Entries without an
        href or without a non-blank title are ignored.
    """
    titles: dict[str, str] = {}

    def _add(entry) -> None:
        # Duck-typed on purpose: Link and Section both expose href/title.
        raw_href = getattr(entry, "href", None) or ""
        title = (getattr(entry, "title", None) or "").strip()
        href = unquote(urldefrag(raw_href)[0])
        if href and title and href not in titles:
            titles[href] = title

    def walk(node) -> None:
        # A (parent, children) tuple is visited like any other sequence:
        # parent first, then its children, which preserves TOC order.
        if isinstance(node, (list, tuple)):
            for child in node:
                walk(child)
        else:
            _add(node)

    for top_level in book.toc:
        walk(top_level)
    return titles


def _extract_paragraphs(html: bytes) -> list[str]:
    """Extract the readable paragraphs of an XHTML document.

    ``script``, ``style``, ``sup`` and ``table`` elements are removed first.
    Text is then taken from ``p``, ``h1``-``h4``, ``blockquote`` and ``li``
    elements; if the document has none of them, the whole ``body`` is used as
    a single block.

    Only the outermost matching element is kept when blocks are nested
    (``<blockquote><p>..</p></blockquote>``, ``<li><p>..</p></li>``, nested
    lists).  ``find_all`` returns both the container and its descendants, so
    without this filter the nested text would be emitted twice.  The
    trade-off is that the paragraphs of such a container are merged into one
    entry, which never drops text.

    Args:
        html: Raw document content.

    Returns:
        The non-empty paragraphs in document order, with runs of whitespace
        collapsed to a single space.
    """
    soup = BeautifulSoup(html, "lxml")
    # Drop the elements that carry no narrative text.
    for tag in soup(["script", "style", "sup", "table"]):
        tag.decompose()

    block_tags = ["p", "h1", "h2", "h3", "h4", "blockquote", "li"]
    blocks = soup.find_all(block_tags)
    if not blocks and soup.body:
        blocks = [soup.body]

    paragraphs: list[str] = []
    for block in blocks:
        # Already covered by the text of an enclosing block: skip it so the
        # same sentence is not read twice.
        if block.find_parent(block_tags) is not None:
            continue
        text = block.get_text(" ", strip=True)
        text = re.sub(r"\s+", " ", text).strip()
        if text:
            paragraphs.append(text)
    return paragraphs


def _parse_title(title: str) -> tuple[Optional[str], Optional[str]]:
    """Split a chapter title into ``(number, pov)``.

    The title is matched against ``_TITLE_PARTS_RE``.  Each part is stripped
    and turned into ``None`` when empty; ``(None, None)`` is returned when the
    title does not match at all.
    """
    match = _TITLE_PARTS_RE.match(title)
    if match is None:
        return None, None
    # An unmatched optional group is reported as "" so both parts can be
    # normalised the same way.
    head, tail = match.groups(default="")
    return head.strip() or None, tail.strip() or None


def _output_name(seq: int, kind: ChapterKind, number: Optional[str], title: str) -> str:
    """Build the mp3 file name, following the sample format ``NN-<label>.mp3``.

    For a chapter with a parsed ``number`` the label is "Prologue",
    "Épilogue", "Chapitre <n>" (leading zeros dropped) or the capitalised
    number text.  For anything else the label is ``title``.  An all-uppercase
    label (e.g. "REMERCIEMENTS") is capitalised.  ``seq`` is rendered as a
    two-digit prefix and the stem goes through ``safe_filename``.
    """
    fixed_labels = {
        "prologue": "Prologue",
        "epilogue": "Épilogue",
        "épilogue": "Épilogue",
    }

    label = title
    if kind is ChapterKind.CHAPTER and number:
        known = fixed_labels.get(number.lower())
        if known is not None:
            label = known
        elif number.isdigit():
            label = f"Chapitre {int(number)}"
        else:
            label = number.capitalize()

    # All-caps titles read better with only the first letter capitalised.
    if label.isupper():
        label = label.capitalize()

    stem = safe_filename(f"{seq:02d}-{label}")
    return stem + ".mp3"


def _classify(ordered: list[dict]) -> None:
    """Assign ``kind`` and ``render`` to every item (mutates ``ordered``).

    * chapter: the title matches ``_CHAPTER_RE``; rendered when the document
      contains at least one word.
    * front: any other item located before the first chapter (cover, title
      page...); never rendered.
    * back: any other item located after the first chapter.  This covers the
      items after the last chapter (acknowledgements, glossary...) and also
      untitled or non-chapter documents sitting between two chapters;
      rendered only when they reach ``_BACK_MATTER_MIN_WORDS`` words.

    When no title matches the chapter pattern, every item is front matter.

    Args:
        ordered: Items in reading order; each dict provides ``title`` and
            ``word_count`` and receives ``kind`` and ``render``.
    """
    # Evaluate the chapter pattern once per item.
    chapter_flags = [
        bool(it["title"] and _CHAPTER_RE.match(it["title"])) for it in ordered
    ]
    first = chapter_flags.index(True) if True in chapter_flags else len(ordered)

    for i, (it, is_chapter) in enumerate(zip(ordered, chapter_flags)):
        if is_chapter:
            it["kind"] = ChapterKind.CHAPTER
            it["render"] = it["word_count"] > 0
        elif i < first:
            it["kind"] = ChapterKind.FRONT
            it["render"] = False
        else:
            # Non-chapter item after the first chapter, not necessarily after
            # the last one.
            it["kind"] = ChapterKind.BACK
            it["render"] = it["word_count"] >= _BACK_MATTER_MIN_WORDS


def _extract_cover(book: epub.EpubBook, dest_dir: Path) -> Optional[str]:
    """Write the cover image into ``dest_dir`` and return its file name.

    The first item of type ``ITEM_COVER`` is used; failing that, the first
    image item whose name contains "cover" (case-insensitive).  The file is
    written as ``cover<ext>``, keeping the extension of the source item and
    defaulting to ``.jpg`` when it has none.  Returns ``None`` when no cover
    candidate exists.
    """
    cover = next(iter(book.get_items_of_type(ebooklib.ITEM_COVER)), None)
    if cover is None:
        # Fallback: an image whose name mentions "cover".
        cover = next(
            (
                image
                for image in book.get_items_of_type(ebooklib.ITEM_IMAGE)
                if "cover" in image.get_name().lower()
            ),
            None,
        )
    if cover is None:
        return None

    suffix = Path(cover.get_name()).suffix
    if not suffix:
        suffix = ".jpg"
    target = dest_dir / f"cover{suffix}"
    target.write_bytes(cover.get_content())
    return target.name


def _collect_spine_documents(
    book_ml: epub.EpubBook, toc_titles: dict[str, str]
) -> list[dict]:
    """Return one record per document of the spine, in reading order."""
    id_to_item = {it.get_id(): it for it in book_ml.get_items()}
    documents: list[dict] = []
    for idref, _linear in book_ml.spine:
        item = id_to_item.get(idref)
        if item is None or item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        href = unquote(item.get_name())
        paragraphs = _extract_paragraphs(item.get_content())
        documents.append({
            "item_id": idref,
            "src": href,
            # Documents absent from the table of contents have no title.
            "title": toc_titles.get(href, ""),
            "paragraphs": paragraphs,
            "word_count": sum(len(p.split()) for p in paragraphs),
        })
    return documents


def parse_epub(epub_path: str | Path, slug: Optional[str] = None) -> Book:
    """Parse an EPUB and write its normalised structure under data/<slug>/.

    Files written in the book data directory:

    * ``chapters/chNN.json``: one ``ChapterText`` per rendered item, where
      ``NN`` is the item index in the spine-ordered document list.
    * ``cover.<ext>``: the extracted cover, when one is found.
    * ``book.json``: the ``Book`` model (metadata plus chapter list).

    Args:
        epub_path: Path of the EPUB file.
        slug: Directory name to use; defaults to ``slugify(title)``.

    Returns:
        The ``Book`` that was serialised to ``book.json``.
    """
    epub_path = Path(epub_path)
    book_ml = epub.read_epub(str(epub_path), options={"ignore_ncx": False})

    title = _meta(book_ml, "title") or epub_path.stem
    author = _meta(book_ml, "creator")
    description = _meta(book_ml, "description")
    language = _meta(book_ml, "language") or "fr"
    slug = slug or slugify(title)

    data_dir = book_data_dir(slug)
    chapters_dir = data_dir / "chapters"
    chapters_dir.mkdir(parents=True, exist_ok=True)

    ordered = _collect_spine_documents(book_ml, _build_toc_titles(book_ml))
    _classify(ordered)

    cover_file = _extract_cover(book_ml, data_dir)

    chapters: list[Chapter] = []
    seq = 0  # file-name prefix counter, incremented for rendered items only
    for index, it in enumerate(ordered):
        display_title = it["title"] or it["src"]

        number = pov = None
        if it["kind"] is ChapterKind.CHAPTER:
            number, pov = _parse_title(it["title"])

        text_file = None
        output_name = None
        if it["render"]:
            seq += 1
            ct = ChapterText(index=index, title=display_title,
                             paragraphs=it["paragraphs"])
            text_file = f"chapters/ch{index:02d}.json"
            (data_dir / text_file).write_text(
                ct.model_dump_json(indent=2), encoding="utf-8")
            # A rendered item may have no TOC title (e.g. untitled back
            # matter); fall back to the document stem so the mp3 name never
            # ends up with an empty label.
            label_source = it["title"] or Path(it["src"]).stem
            output_name = _output_name(seq, it["kind"], number, label_source)

        chapters.append(Chapter(
            index=index,
            item_id=it["item_id"],
            src=it["src"],
            title=display_title,
            kind=it["kind"],
            render=it["render"],
            number=number,
            pov=pov,
            word_count=it["word_count"],
            text_file=text_file,
            output_name=output_name,
        ))

    book = Book(
        slug=slug,
        title=title,
        author=author,
        # ``language`` is never empty here (defaulted to "fr" above); only
        # its first two characters are kept.
        language=language[:2],
        description=description,
        cover_file=cover_file,
        chapters=chapters,
    )
    (data_dir / "book.json").write_text(
        book.model_dump_json(indent=2), encoding="utf-8")
    return book


def _meta(book: epub.EpubBook, name: str) -> Optional[str]:
    """Return the first ``"DC"`` metadata value stored under ``name``.

    ``get_metadata`` yields a sequence of entries whose first element is the
    value; ``None`` is returned when that sequence is empty.
    """
    entries = book.get_metadata("DC", name)
    return entries[0][0] if entries else None


def load_book(slug: str) -> Book:
    """Load the ``Book`` model stored in ``book.json`` for ``slug``."""
    raw = (book_data_dir(slug) / "book.json").read_text(encoding="utf-8")
    return Book.model_validate_json(raw)


def load_chapter_text(slug: str, chapter: Chapter) -> ChapterText:
    """Load the normalised text of ``chapter`` for the book ``slug``.

    Args:
        slug: Book directory name.
        chapter: A chapter of that book; its ``text_file`` is a path relative
            to the book data directory.

    Returns:
        The ``ChapterText`` stored in the chapter's text file.

    Raises:
        ValueError: If the chapter has no ``text_file``, which is the case
            for items that are not rendered.
    """
    # Checked up front: joining a path with None would otherwise fail with an
    # opaque TypeError.
    if not chapter.text_file:
        raise ValueError(
            f"chapter {chapter.index} ({chapter.title!r}) has no text file; "
            "only rendered chapters have extracted text"
        )
    path = book_data_dir(slug) / chapter.text_file
    return ChapterText.model_validate_json(path.read_text(encoding="utf-8"))