Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)
This commit is contained in:
267
backend/inkflow/epub/parser.py
Normal file
267
backend/inkflow/epub/parser.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Parsing EPUB -> structure de livre normalisee.
|
||||
|
||||
Strategie :
|
||||
- ebooklib lit l'archive (manifest + spine + ncx).
|
||||
- L'ordre de lecture vient du spine.
|
||||
- Les titres viennent de la table des matieres (ncx/nav), mappes par href.
|
||||
- Le texte de chaque document est extrait via BeautifulSoup (paragraphes).
|
||||
- On classe chaque item en front / chapter / back et on decide s'il faut le lire.
|
||||
|
||||
Sorties ecrites dans data/<slug>/ :
|
||||
- book.json : metadonnees + liste des chapitres (modele Book)
|
||||
- chapters/chNN.json : texte normalise par chapitre (modele ChapterText)
|
||||
- cover.<ext> : couverture extraite (si presente)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from urllib.parse import unquote, urldefrag
|
||||
|
||||
import ebooklib
|
||||
from bs4 import BeautifulSoup
|
||||
from ebooklib import epub
|
||||
|
||||
# Les xhtml d'epub declenchent un avertissement bs4 inoffensif ; on le tait.
|
||||
try:
|
||||
from bs4 import XMLParsedAsHTMLWarning
|
||||
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
|
||||
except ImportError: # pragma: no cover
|
||||
pass
|
||||
|
||||
from ..config import book_data_dir
|
||||
from ..models import Book, Chapter, ChapterKind, ChapterText
|
||||
from ..util import safe_filename, slugify
|
||||
|
||||
# Un titre de chapitre commence par un numero, PROLOGUE ou EPILOGUE.
|
||||
_CHAPTER_RE = re.compile(r"^\s*(\d+|prologue|[ée]pilogue)\b", re.IGNORECASE)
|
||||
# Capture "<numero> - <POV>" ou juste "<numero>".
|
||||
_TITLE_PARTS_RE = re.compile(r"^\s*([^-\n]+?)(?:\s*[-–—]\s*(.+))?\s*$")
|
||||
|
||||
# Seuil de mots pour qu'un element de back matter (remerciements...) soit lu.
|
||||
_BACK_MATTER_MIN_WORDS = 40
|
||||
|
||||
|
||||
def _build_toc_titles(book: epub.EpubBook) -> dict[str, str]:
|
||||
"""Mappe href (sans fragment) -> titre, en aplatissant la toc ncx/nav."""
|
||||
titles: dict[str, str] = {}
|
||||
|
||||
def walk(items) -> None:
|
||||
for it in items:
|
||||
if isinstance(it, tuple): # (Section, [children])
|
||||
section, children = it
|
||||
if isinstance(section, epub.Link):
|
||||
_add(section)
|
||||
walk(children)
|
||||
elif isinstance(it, list):
|
||||
walk(it)
|
||||
elif isinstance(it, epub.Link):
|
||||
_add(it)
|
||||
|
||||
def _add(link: epub.Link) -> None:
|
||||
href = unquote(urldefrag(link.href)[0])
|
||||
if href and href not in titles and link.title:
|
||||
titles[href] = link.title.strip()
|
||||
|
||||
walk(book.toc)
|
||||
return titles
|
||||
|
||||
|
||||
def _extract_paragraphs(html: bytes) -> list[str]:
    """Extract the readable paragraphs of an XHTML document.

    ``script``, ``style``, ``sup`` and ``table`` elements are removed first.
    Text is then collected from ``p``, ``h1``-``h4``, ``blockquote`` and ``li``
    elements, falling back to the whole ``body`` when none of them is present.
    Whitespace runs are collapsed to single spaces and empty results dropped.

    Nested blocks (for example a ``p`` inside a ``blockquote`` or ``li``) are
    emitted once only: each block contributes just the text that does not
    belong to a block nested inside it.

    Args:
        html: Raw document content.

    Returns:
        Non-empty paragraph strings, ordered by where each block opens in the
        document.
    """
    soup = BeautifulSoup(html, "lxml")
    # Drop non-narrative elements.
    for tag in soup(["script", "style", "sup", "table"]):
        tag.decompose()

    blocks = soup.find_all(["p", "h1", "h2", "h3", "h4", "blockquote", "li"])
    if not blocks and soup.body:
        blocks = [soup.body]

    paragraphs: list[str] = []
    # find_all returns a container before the blocks nested in it. Walking
    # the list backwards and detaching each block once read means a container
    # is reached after its nested blocks are gone, so their text is not
    # repeated in the container's own paragraph.
    for block in reversed(blocks):
        text = re.sub(r"\s+", " ", block.get_text(" ", strip=True)).strip()
        block.decompose()
        if text:
            paragraphs.append(text)
    paragraphs.reverse()  # restore document order
    return paragraphs
|
||||
|
||||
|
||||
def _parse_title(title: str) -> tuple[Optional[str], Optional[str]]:
    """Split a chapter title into ``(number, pov)``.

    The title is matched against ``_TITLE_PARTS_RE``. Each captured part is
    stripped; a part that is missing or blank is returned as ``None``. When
    the pattern does not match at all, ``(None, None)`` is returned.
    """
    match = _TITLE_PARTS_RE.match(title)
    if match is None:
        return None, None
    raw_number, raw_pov = match.groups(default="")
    return raw_number.strip() or None, raw_pov.strip() or None
|
||||
|
||||
|
||||
def _output_name(seq: int, kind: ChapterKind, number: Optional[str], title: str) -> str:
    """Build the mp3 file name in the form ``NN-<label>.mp3``.

    ``NN`` is ``seq`` zero-padded to two digits. For a chapter with a
    ``number`` the label is "Prologue", "Épilogue", "Chapitre <n>" for an
    all-digit number, or the capitalized number otherwise. For anything else
    the label is ``title``, capitalized when it is entirely upper case
    (e.g. "REMERCIEMENTS"). The ``NN-<label>`` stem goes through
    ``safe_filename`` before the extension is appended.
    """
    if kind is ChapterKind.CHAPTER and number:
        fixed_labels = {
            "prologue": "Prologue",
            "epilogue": "Épilogue",
            "épilogue": "Épilogue",
        }
        label = fixed_labels.get(number.lower())
        if label is None:
            if number.isdigit():
                label = f"Chapitre {int(number)}"
            else:
                label = number.capitalize()
    else:
        # All-caps titles are softened to sentence case.
        label = title.capitalize() if title.isupper() else title
    return safe_filename(f"{seq:02d}-{label}") + ".mp3"
|
||||
|
||||
|
||||
def _classify(ordered: list[dict]) -> None:
    """Assign ``kind`` and ``render`` to every item, mutating it in place.

    An item is a chapter when its title is non-empty and matches
    ``_CHAPTER_RE``. The other items are split around the first chapter:

    * chapter: rendered when it contains at least one word;
    * front: any non-chapter item located before the first chapter; never
      rendered;
    * back: any non-chapter item located after the first chapter. This
      includes items sitting between two chapters as well as those after the
      last one. Rendered when it has at least ``_BACK_MATTER_MIN_WORDS`` words.

    When no item is a chapter, everything is classified as front.

    Args:
        ordered: Items in reading order; each dict must provide ``title`` and
            ``word_count`` and receives ``kind`` and ``render``.
    """
    seen_chapter = False
    for it in ordered:
        title = it["title"]
        if title and _CHAPTER_RE.match(title):
            seen_chapter = True
            it["kind"] = ChapterKind.CHAPTER
            it["render"] = it["word_count"] > 0
        elif not seen_chapter:
            it["kind"] = ChapterKind.FRONT
            it["render"] = False
        else:
            # Non-chapter item after the first chapter (interlude or back matter).
            it["kind"] = ChapterKind.BACK
            it["render"] = it["word_count"] >= _BACK_MATTER_MIN_WORDS
|
||||
|
||||
|
||||
def _extract_cover(book: epub.EpubBook, dest_dir: Path) -> Optional[str]:
    """Write the book cover into ``dest_dir`` and return its file name.

    The first item of type ``ebooklib.ITEM_COVER`` is used. Failing that, the
    first ``ebooklib.ITEM_IMAGE`` item whose name contains "cover"
    (case-insensitive) is used. The file is written as ``cover<ext>``, where
    ``<ext>`` is the suffix of the item name or ``.jpg`` when it has none.

    Returns:
        The written file name, or ``None`` when no cover item is found.
    """
    cover = next(iter(book.get_items_of_type(ebooklib.ITEM_COVER)), None)
    if cover is None:
        # Fallback: an image whose name mentions "cover".
        cover = next(
            (
                image
                for image in book.get_items_of_type(ebooklib.ITEM_IMAGE)
                if "cover" in image.get_name().lower()
            ),
            None,
        )
    if cover is None:
        return None

    suffix = Path(cover.get_name()).suffix or ".jpg"
    target = dest_dir / f"cover{suffix}"
    target.write_bytes(cover.get_content())
    return target.name
|
||||
|
||||
|
||||
def parse_epub(epub_path: str | Path, slug: Optional[str] = None) -> Book:
    """Parse an EPUB and write its normalized form to the book data directory.

    Steps, in order:

    1. Read the archive and its metadata (title, creator, description,
       language). The title falls back to the file stem, the language to "fr".
    2. Resolve the output directory with ``book_data_dir(slug)``; ``slug``
       defaults to ``slugify(title)``.
    3. Collect the spine documents in reading order, with their TOC title,
       paragraphs and word count, then classify them with ``_classify``.
    4. Extract the cover, write one ``chapters/chNN.json`` per rendered item
       and finally ``book.json``.

    Args:
        epub_path: Path to the EPUB file.
        slug: Optional directory slug overriding the one derived from the title.

    Returns:
        The ``Book`` model that was serialized to ``book.json``.
    """
    epub_path = Path(epub_path)
    book_ml = epub.read_epub(str(epub_path), options={"ignore_ncx": False})

    title = _meta(book_ml, "title") or epub_path.stem
    author = _meta(book_ml, "creator")
    description = _meta(book_ml, "description")
    language = _meta(book_ml, "language") or "fr"
    slug = slug or slugify(title)

    data_dir = book_data_dir(slug)
    (data_dir / "chapters").mkdir(parents=True, exist_ok=True)

    toc_titles = _build_toc_titles(book_ml)

    # Documents in spine order.
    items_by_id = {}
    for manifest_item in book_ml.get_items():
        items_by_id[manifest_item.get_id()] = manifest_item

    ordered: list[dict] = []
    for idref, _linear in book_ml.spine:
        doc = items_by_id.get(idref)
        if doc is not None and doc.get_type() == ebooklib.ITEM_DOCUMENT:
            src = unquote(doc.get_name())
            paragraphs = _extract_paragraphs(doc.get_content())
            word_count = sum(len(paragraph.split()) for paragraph in paragraphs)
            ordered.append({
                "item_id": idref,
                "src": src,
                "title": toc_titles.get(src, ""),
                "paragraphs": paragraphs,
                "word_count": word_count,
            })

    _classify(ordered)

    cover_file = _extract_cover(book_ml, data_dir)

    chapters: list[Chapter] = []
    rendered = 0  # file-name prefix counter, advanced for rendered items only
    for index, entry in enumerate(ordered):
        display_title = entry["title"] or entry["src"]

        if entry["kind"] is ChapterKind.CHAPTER:
            number, pov = _parse_title(entry["title"])
        else:
            number = pov = None

        text_file = output_name = None
        if entry["render"]:
            rendered += 1
            text_file = f"chapters/ch{index:02d}.json"
            chapter_text = ChapterText(
                index=index,
                title=display_title,
                paragraphs=entry["paragraphs"],
            )
            (data_dir / text_file).write_text(
                chapter_text.model_dump_json(indent=2), encoding="utf-8")
            output_name = _output_name(
                rendered, entry["kind"], number, entry["title"] or "")

        chapters.append(Chapter(
            index=index,
            item_id=entry["item_id"],
            src=entry["src"],
            title=display_title,
            kind=entry["kind"],
            render=entry["render"],
            number=number,
            pov=pov,
            word_count=entry["word_count"],
            text_file=text_file,
            output_name=output_name,
        ))

    book = Book(
        slug=slug,
        title=title,
        author=author,
        # `language` is never empty here thanks to the "fr" fallback above.
        language=language[:2],
        description=description,
        cover_file=cover_file,
        chapters=chapters,
    )
    (data_dir / "book.json").write_text(
        book.model_dump_json(indent=2), encoding="utf-8")
    return book
|
||||
|
||||
|
||||
def _meta(book: epub.EpubBook, name: str) -> Optional[str]:
|
||||
values = book.get_metadata("DC", name)
|
||||
if values:
|
||||
return values[0][0]
|
||||
return None
|
||||
|
||||
|
||||
def load_book(slug: str) -> Book:
    """Load the ``Book`` stored as ``book.json`` in the data directory of ``slug``."""
    raw_json = (book_data_dir(slug) / "book.json").read_text(encoding="utf-8")
    return Book.model_validate_json(raw_json)
|
||||
|
||||
|
||||
def load_chapter_text(slug: str, chapter: Chapter) -> ChapterText:
    """Load the ``ChapterText`` JSON referenced by ``chapter.text_file``.

    Args:
        slug: Book slug, resolved to a directory through ``book_data_dir``.
        chapter: Chapter whose ``text_file`` is a path relative to that
            directory.

    Returns:
        The validated ``ChapterText`` model.

    Raises:
        ValueError: If ``chapter.text_file`` is empty or ``None``, which is
            how ``parse_epub`` leaves chapters that are not rendered.
    """
    if not chapter.text_file:
        # Fail with an explicit message instead of the opaque TypeError that
        # joining a path with None would produce.
        raise ValueError(
            f"chapter {chapter.index} has no text file (not rendered)")
    raw_json = (book_data_dir(slug) / chapter.text_file).read_text(encoding="utf-8")
    return ChapterText.model_validate_json(raw_json)
|
||||
Reference in New Issue
Block a user