"""Parsing EPUB -> structure de livre normalisee. Strategie : - ebooklib lit l'archive (manifest + spine + ncx). - L'ordre de lecture vient du spine. - Les titres viennent de la table des matieres (ncx/nav), mappes par href. - Le texte de chaque document est extrait via BeautifulSoup (paragraphes). - On classe chaque item en front / chapter / back et on decide s'il faut le lire. Sorties ecrites dans data// : - book.json : metadonnees + liste des chapitres (modele Book) - chapters/chNN.json : texte normalise par chapitre (modele ChapterText) - cover. : couverture extraite (si presente) """ from __future__ import annotations import re import warnings from pathlib import Path from typing import Optional from urllib.parse import unquote, urldefrag import ebooklib from bs4 import BeautifulSoup from ebooklib import epub # Les xhtml d'epub declenchent un avertissement bs4 inoffensif ; on le tait. try: from bs4 import XMLParsedAsHTMLWarning warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning) except ImportError: # pragma: no cover pass from ..config import book_data_dir from ..models import Book, Chapter, ChapterKind, ChapterText from ..util import safe_filename, slugify # Un titre de chapitre commence par un numero, PROLOGUE ou EPILOGUE. _CHAPTER_RE = re.compile(r"^\s*(\d+|prologue|[ée]pilogue)\b", re.IGNORECASE) # Capture " - " ou juste "". _TITLE_PARTS_RE = re.compile(r"^\s*([^-\n]+?)(?:\s*[-–—]\s*(.+))?\s*$") # Seuil de mots pour qu'un element de back matter (remerciements...) soit lu. _BACK_MATTER_MIN_WORDS = 40 def _build_toc_titles(book: epub.EpubBook) -> dict[str, str]: """Mappe href (sans fragment) -> titre, en aplatissant la toc ncx/nav.""" titles: dict[str, str] = {} def walk(items) -> None: for it in items: if isinstance(it, tuple): # (Section, [children]) section, children = it if isinstance(section, epub.Link): _add(section) walk(children) elif isinstance(it, list): walk(it) elif isinstance(it, epub.Link): _add(it) def _add(link: epub.Link) -> None: href = unquote(urldefrag(link.href)[0]) if href and href not in titles and link.title: titles[href] = link.title.strip() walk(book.toc) return titles def _extract_paragraphs(html: bytes) -> list[str]: """Extrait les paragraphes lisibles d'un document xhtml.""" soup = BeautifulSoup(html, "lxml") # Retire les elements non narratifs. for tag in soup(["script", "style", "sup", "table"]): tag.decompose() paragraphs: list[str] = [] blocks = soup.find_all(["p", "h1", "h2", "h3", "h4", "blockquote", "li"]) if not blocks and soup.body: blocks = [soup.body] for block in blocks: text = block.get_text(" ", strip=True) text = re.sub(r"\s+", " ", text).strip() if text: paragraphs.append(text) return paragraphs def _parse_title(title: str) -> tuple[Optional[str], Optional[str]]: """Decoupe un titre de chapitre en (numero, pov).""" m = _TITLE_PARTS_RE.match(title) if not m: return None, None number = (m.group(1) or "").strip() or None pov = (m.group(2) or "").strip() or None return number, pov def _output_name(seq: int, kind: ChapterKind, number: Optional[str], title: str) -> str: """Nom de mp3 calque sur le format du sample (NN-.mp3).""" prefix = f"{seq:02d}" label: str if kind is ChapterKind.CHAPTER and number: low = number.lower() if low == "prologue": label = "Prologue" elif low in ("epilogue", "épilogue"): label = "Épilogue" elif number.isdigit(): label = f"Chapitre {int(number)}" else: label = number.capitalize() else: label = title if label.isupper(): # titres tout-majuscule (ex "REMERCIEMENTS") label = label.capitalize() return safe_filename(f"{prefix}-{label}") + ".mp3" def _classify(ordered: list[dict]) -> None: """Affecte kind/render a chaque item (mutation en place). front = avant le premier chapitre numerote (couverture, page de titre...) chapter = correspond au motif de titre de chapitre back = apres le dernier chapitre (remerciements, glossaire...) """ chapter_idxs = [ i for i, it in enumerate(ordered) if it["title"] and _CHAPTER_RE.match(it["title"]) ] first = chapter_idxs[0] if chapter_idxs else len(ordered) last = chapter_idxs[-1] if chapter_idxs else -1 for i, it in enumerate(ordered): is_chapter = bool(it["title"]) and bool(_CHAPTER_RE.match(it["title"])) if is_chapter: it["kind"] = ChapterKind.CHAPTER it["render"] = it["word_count"] > 0 elif i < first: it["kind"] = ChapterKind.FRONT it["render"] = False else: # i > last (back matter) it["kind"] = ChapterKind.BACK it["render"] = it["word_count"] >= _BACK_MATTER_MIN_WORDS def _extract_cover(book: epub.EpubBook, dest_dir: Path) -> Optional[str]: cover_item = None for item in book.get_items_of_type(ebooklib.ITEM_COVER): cover_item = item break if cover_item is None: # fallback : item nomme "cover" for item in book.get_items_of_type(ebooklib.ITEM_IMAGE): if "cover" in item.get_name().lower(): cover_item = item break if cover_item is None: return None ext = Path(cover_item.get_name()).suffix or ".jpg" dest = dest_dir / f"cover{ext}" dest.write_bytes(cover_item.get_content()) return dest.name def parse_epub(epub_path: str | Path, slug: Optional[str] = None) -> Book: """Parse un EPUB et ecrit book.json + chapters/chNN.json dans data//.""" epub_path = Path(epub_path) book_ml = epub.read_epub(str(epub_path), options={"ignore_ncx": False}) title = _meta(book_ml, "title") or epub_path.stem author = _meta(book_ml, "creator") description = _meta(book_ml, "description") language = _meta(book_ml, "language") or "fr" slug = slug or slugify(title) data_dir = book_data_dir(slug) chapters_dir = data_dir / "chapters" chapters_dir.mkdir(parents=True, exist_ok=True) toc_titles = _build_toc_titles(book_ml) # Documents dans l'ordre du spine. id_to_item = {it.get_id(): it for it in book_ml.get_items()} ordered: list[dict] = [] for idref, _linear in book_ml.spine: item = id_to_item.get(idref) if item is None or item.get_type() != ebooklib.ITEM_DOCUMENT: continue href = unquote(item.get_name()) paragraphs = _extract_paragraphs(item.get_content()) title_txt = toc_titles.get(href, "") ordered.append({ "item_id": idref, "src": href, "title": title_txt, "paragraphs": paragraphs, "word_count": sum(len(p.split()) for p in paragraphs), }) _classify(ordered) cover_file = _extract_cover(book_ml, data_dir) chapters: list[Chapter] = [] seq = 0 # compteur de prefixe sur les seuls chapitres rendus for index, it in enumerate(ordered): number = pov = None if it["kind"] is ChapterKind.CHAPTER: number, pov = _parse_title(it["title"]) text_file = None output_name = None if it["render"]: seq += 1 ct = ChapterText(index=index, title=it["title"] or it["src"], paragraphs=it["paragraphs"]) text_file = f"chapters/ch{index:02d}.json" (data_dir / text_file).write_text( ct.model_dump_json(indent=2), encoding="utf-8") output_name = _output_name(seq, it["kind"], number, it["title"] or "") chapters.append(Chapter( index=index, item_id=it["item_id"], src=it["src"], title=it["title"] or it["src"], kind=it["kind"], render=it["render"], number=number, pov=pov, word_count=it["word_count"], text_file=text_file, output_name=output_name, )) book = Book( slug=slug, title=title, author=author, language=(language[:2] if language else "fr"), description=description, cover_file=cover_file, chapters=chapters, ) (data_dir / "book.json").write_text( book.model_dump_json(indent=2), encoding="utf-8") return book def _meta(book: epub.EpubBook, name: str) -> Optional[str]: values = book.get_metadata("DC", name) if values: return values[0][0] return None def load_book(slug: str) -> Book: path = book_data_dir(slug) / "book.json" return Book.model_validate_json(path.read_text(encoding="utf-8")) def load_chapter_text(slug: str, chapter: Chapter) -> ChapterText: path = book_data_dir(slug) / chapter.text_file return ChapterText.model_validate_json(path.read_text(encoding="utf-8"))