268 lines
9.1 KiB
Python
268 lines
9.1 KiB
Python
"""Parsing EPUB -> structure de livre normalisee.
|
|
|
|
Strategie :
|
|
- ebooklib lit l'archive (manifest + spine + ncx).
|
|
- L'ordre de lecture vient du spine.
|
|
- Les titres viennent de la table des matieres (ncx/nav), mappes par href.
|
|
- Le texte de chaque document est extrait via BeautifulSoup (paragraphes).
|
|
- On classe chaque item en front / chapter / back et on decide s'il faut le lire.
|
|
|
|
Sorties ecrites dans data/<slug>/ :
|
|
- book.json : metadonnees + liste des chapitres (modele Book)
|
|
- chapters/chNN.json : texte normalise par chapitre (modele ChapterText)
|
|
- cover.<ext> : couverture extraite (si presente)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import warnings
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
from urllib.parse import unquote, urldefrag
|
|
|
|
import ebooklib
|
|
from bs4 import BeautifulSoup
|
|
from ebooklib import epub
|
|
|
|
# Les xhtml d'epub declenchent un avertissement bs4 inoffensif ; on le tait.
|
|
try:
|
|
from bs4 import XMLParsedAsHTMLWarning
|
|
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
|
|
except ImportError: # pragma: no cover
|
|
pass
|
|
|
|
from ..config import book_data_dir
|
|
from ..models import Book, Chapter, ChapterKind, ChapterText
|
|
from ..util import safe_filename, slugify
|
|
|
|
# Un titre de chapitre commence par un numero, PROLOGUE ou EPILOGUE.
|
|
_CHAPTER_RE = re.compile(r"^\s*(\d+|prologue|[ée]pilogue)\b", re.IGNORECASE)
|
|
# Capture "<numero> - <POV>" ou juste "<numero>".
|
|
_TITLE_PARTS_RE = re.compile(r"^\s*([^-\n]+?)(?:\s*[-–—]\s*(.+))?\s*$")
|
|
|
|
# Seuil de mots pour qu'un element de back matter (remerciements...) soit lu.
|
|
_BACK_MATTER_MIN_WORDS = 40
|
|
|
|
|
|
def _build_toc_titles(book: epub.EpubBook) -> dict[str, str]:
|
|
"""Mappe href (sans fragment) -> titre, en aplatissant la toc ncx/nav."""
|
|
titles: dict[str, str] = {}
|
|
|
|
def walk(items) -> None:
|
|
for it in items:
|
|
if isinstance(it, tuple): # (Section, [children])
|
|
section, children = it
|
|
if isinstance(section, epub.Link):
|
|
_add(section)
|
|
walk(children)
|
|
elif isinstance(it, list):
|
|
walk(it)
|
|
elif isinstance(it, epub.Link):
|
|
_add(it)
|
|
|
|
def _add(link: epub.Link) -> None:
|
|
href = unquote(urldefrag(link.href)[0])
|
|
if href and href not in titles and link.title:
|
|
titles[href] = link.title.strip()
|
|
|
|
walk(book.toc)
|
|
return titles
|
|
|
|
|
|
def _extract_paragraphs(html: bytes) -> list[str]:
    """Extract the readable paragraphs of an XHTML document.

    ``script``, ``style``, ``sup`` and ``table`` elements are removed first
    (non-narrative content). Each ``p``, ``h1``-``h4``, ``blockquote`` and
    ``li`` element then yields one paragraph.

    Block elements may be nested (``blockquote > p``, ``li > p``, nested
    lists). Each block therefore contributes only the text it owns directly,
    i.e. strings whose nearest block ancestor is that block. Taking the full
    text of every block would emit nested text once per enclosing block.

    If the document has none of these block elements, the whole ``<body>``
    text is returned as a single paragraph.

    Args:
        html: Raw document content.

    Returns:
        Non-empty paragraphs in document order of their block element, with
        runs of whitespace collapsed to a single space.
    """
    block_tags = ["p", "h1", "h2", "h3", "h4", "blockquote", "li"]

    soup = BeautifulSoup(html, "lxml")
    # Drop non-narrative elements.
    for tag in soup(["script", "style", "sup", "table"]):
        tag.decompose()

    blocks = soup.find_all(block_tags)
    if not blocks:
        # No recognised block element: fall back to the whole body.
        if soup.body is None:
            return []
        text = re.sub(r"\s+", " ", soup.body.get_text(" ", strip=True)).strip()
        return [text] if text else []

    paragraphs: list[str] = []
    for block in blocks:
        own_parts: list[str] = []
        for node in block.strings:
            # Skip strings owned by a nested block; that block is visited on
            # its own turn, so its text is emitted exactly once.
            if node.find_parent(block_tags) is not block:
                continue
            piece = node.strip()
            if piece:
                own_parts.append(piece)
        text = re.sub(r"\s+", " ", " ".join(own_parts)).strip()
        if text:
            paragraphs.append(text)
    return paragraphs
|
|
|
|
|
|
def _parse_title(title: str) -> tuple[Optional[str], Optional[str]]:
    """Split a chapter title into ``(number, pov)``.

    The title is matched against ``_TITLE_PARTS_RE``: the part before the
    dash separator becomes ``number``, the part after it becomes ``pov``.
    Either element is ``None`` when missing or blank; both are ``None`` when
    the title does not match at all.
    """
    match = _TITLE_PARTS_RE.match(title)
    if match is None:
        return None, None

    def clean(raw: Optional[str]) -> Optional[str]:
        # Normalise an unmatched group or a blank capture to None.
        stripped = raw.strip() if raw else ""
        return stripped if stripped else None

    return clean(match.group(1)), clean(match.group(2))
|
|
|
|
|
|
def _output_name(seq: int, kind: ChapterKind, number: Optional[str], title: str) -> str:
    """Build the MP3 file name, following the sample format ``NN-<label>.mp3``.

    Args:
        seq: Sequence number, used as a zero-padded two-digit prefix.
        kind: Classification of the item.
        number: Chapter "number" part of the title (may be ``None``).
        title: Full title, used as the label for non-chapter items and for
            chapters without a number.

    Returns:
        ``safe_filename("<NN>-<label>")`` with ``.mp3`` appended.

    Label rules for chapters that have a number:
        * "prologue"            -> "Prologue"
        * "epilogue"/"épilogue" -> "Épilogue"
        * decimal digits        -> "Chapitre <int>" (leading zeros dropped)
        * anything else         -> the number, capitalised
    """
    prefix = f"{seq:02d}"

    if kind is ChapterKind.CHAPTER and number:
        lowered = number.lower()
        if lowered == "prologue":
            label = "Prologue"
        elif lowered in ("epilogue", "épilogue"):
            label = "Épilogue"
        elif number.isdecimal():
            # isdecimal(), not isdigit(): isdigit() also accepts characters
            # such as "²" that int() rejects with ValueError.
            label = f"Chapitre {int(number)}"
        else:
            label = number.capitalize()
    else:
        # All-uppercase titles (e.g. "REMERCIEMENTS") are softened.
        label = title.capitalize() if title.isupper() else title

    return safe_filename(f"{prefix}-{label}") + ".mp3"
|
|
|
|
|
|
def _classify(ordered: list[dict]) -> None:
    """Assign ``kind`` and ``render`` to every item (mutates in place).

    Each item is a dict that must provide ``"title"`` and ``"word_count"``;
    this function adds ``"kind"`` and ``"render"``.

    * chapter: the title matches ``_CHAPTER_RE``. Rendered when it has at
      least one word.
    * front: a non-chapter item located before the first chapter (cover,
      title page...). Never rendered.
    * back: every other non-chapter item. This covers items after the last
      chapter (acknowledgements, glossary...) and also non-chapter items
      sitting between two chapters. Rendered when it has at least
      ``_BACK_MATTER_MIN_WORDS`` words.

    With no chapter at all, every item is classified as front matter.
    """
    # Evaluate the chapter pattern once per item.
    chapter_flags = [
        bool(it["title"]) and bool(_CHAPTER_RE.match(it["title"]))
        for it in ordered
    ]
    # Index of the first chapter; len(ordered) when there is none.
    first = chapter_flags.index(True) if True in chapter_flags else len(ordered)

    for i, (it, is_chapter) in enumerate(zip(ordered, chapter_flags)):
        if is_chapter:
            it["kind"] = ChapterKind.CHAPTER
            it["render"] = it["word_count"] > 0
        elif i < first:
            it["kind"] = ChapterKind.FRONT
            it["render"] = False
        else:  # non-chapter item at or after the first chapter
            it["kind"] = ChapterKind.BACK
            it["render"] = it["word_count"] >= _BACK_MATTER_MIN_WORDS
|
|
|
|
|
|
def _extract_cover(book: epub.EpubBook, dest_dir: Path) -> Optional[str]:
    """Write the book cover into ``dest_dir`` and return its file name.

    The first item of type ``ITEM_COVER`` is used. If there is none, the
    first image whose name contains "cover" (case-insensitive) is used.
    The file is written as ``cover<ext>``, keeping the item's own extension
    and defaulting to ``.jpg`` when it has none.

    Returns:
        The written file name, or ``None`` when no cover was found.
    """
    candidate = next(iter(book.get_items_of_type(ebooklib.ITEM_COVER)), None)

    if candidate is None:
        # Fallback: an image item named like a cover.
        candidate = next(
            (
                image
                for image in book.get_items_of_type(ebooklib.ITEM_IMAGE)
                if "cover" in image.get_name().lower()
            ),
            None,
        )

    if candidate is None:
        return None

    suffix = Path(candidate.get_name()).suffix
    target = dest_dir / ("cover" + (suffix if suffix else ".jpg"))
    target.write_bytes(candidate.get_content())
    return target.name
|
|
|
|
|
|
def parse_epub(epub_path: str | Path, slug: Optional[str] = None) -> Book:
    """Parse an EPUB and write ``book.json`` + ``chapters/chNN.json``.

    Steps:
        1. Read the archive and its Dublin Core metadata. The title falls
           back to the file stem and the language to "fr".
        2. Walk the spine in order, keeping document items only, and extract
           each document's paragraphs and TOC title.
        3. Classify every item (front / chapter / back) and decide whether
           it is rendered.
        4. Extract the cover, write one ``chapters/chNN.json`` per rendered
           item, then write ``book.json``.

    Args:
        epub_path: Path of the EPUB file.
        slug: Name of the output directory; derived from the title with
            ``slugify`` when omitted or empty.

    Returns:
        The ``Book`` model that was serialised to ``book.json``.
    """
    epub_path = Path(epub_path)
    book_ml = epub.read_epub(str(epub_path), options={"ignore_ncx": False})

    title = _meta(book_ml, "title") or epub_path.stem
    author = _meta(book_ml, "creator")
    description = _meta(book_ml, "description")
    language = _meta(book_ml, "language") or "fr"
    slug = slug or slugify(title)

    data_dir = book_data_dir(slug)
    (data_dir / "chapters").mkdir(parents=True, exist_ok=True)

    toc_titles = _build_toc_titles(book_ml)

    # Documents in spine (reading) order.
    id_to_item = {it.get_id(): it for it in book_ml.get_items()}
    ordered: list[dict] = []
    for idref, _linear in book_ml.spine:
        item = id_to_item.get(idref)
        if item is None or item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        href = unquote(item.get_name())
        paragraphs = _extract_paragraphs(item.get_content())
        ordered.append({
            "item_id": idref,
            "src": href,
            "title": toc_titles.get(href, ""),
            "paragraphs": paragraphs,
            "word_count": sum(len(p.split()) for p in paragraphs),
        })

    _classify(ordered)

    cover_file = _extract_cover(book_ml, data_dir)

    chapters: list[Chapter] = []
    seq = 0  # file-name prefix counter, advanced for rendered items only
    for index, it in enumerate(ordered):
        # Items absent from the TOC are identified by their source path.
        display_title = it["title"] or it["src"]

        number = pov = None
        if it["kind"] is ChapterKind.CHAPTER:
            number, pov = _parse_title(it["title"])

        text_file = None
        output_name = None
        if it["render"]:
            seq += 1
            chapter_text = ChapterText(
                index=index,
                title=display_title,
                paragraphs=it["paragraphs"],
            )
            text_file = f"chapters/ch{index:02d}.json"
            (data_dir / text_file).write_text(
                chapter_text.model_dump_json(indent=2), encoding="utf-8")
            # An untitled item would otherwise get an empty label
            # ("NN-.mp3"); use the source file stem instead.
            label_title = it["title"] or Path(it["src"]).stem
            output_name = _output_name(seq, it["kind"], number, label_title)

        chapters.append(Chapter(
            index=index,
            item_id=it["item_id"],
            src=it["src"],
            title=display_title,
            kind=it["kind"],
            render=it["render"],
            number=number,
            pov=pov,
            word_count=it["word_count"],
            text_file=text_file,
            output_name=output_name,
        ))

    book = Book(
        slug=slug,
        title=title,
        author=author,
        # Keep the first two characters only (e.g. "fr-FR" -> "fr");
        # ``language`` is never empty thanks to the default above.
        language=language[:2],
        description=description,
        cover_file=cover_file,
        chapters=chapters,
    )
    (data_dir / "book.json").write_text(
        book.model_dump_json(indent=2), encoding="utf-8")
    return book
|
|
|
|
|
|
def _meta(book: epub.EpubBook, name: str) -> Optional[str]:
|
|
values = book.get_metadata("DC", name)
|
|
if values:
|
|
return values[0][0]
|
|
return None
|
|
|
|
|
|
def load_book(slug: str) -> Book:
    """Load the ``Book`` model stored in ``book.json`` of the book's data directory."""
    raw_json = (book_data_dir(slug) / "book.json").read_text(encoding="utf-8")
    return Book.model_validate_json(raw_json)
|
|
|
|
|
|
def load_chapter_text(slug: str, chapter: Chapter) -> ChapterText:
    """Load the normalised text of ``chapter`` from the book's data directory.

    Args:
        slug: Book identifier, resolved to a directory by ``book_data_dir``.
        chapter: Chapter whose ``text_file`` (a path relative to the data
            directory) points at the stored ``ChapterText`` JSON.

    Returns:
        The ``ChapterText`` parsed from that file.

    Raises:
        ValueError: If ``chapter.text_file`` is empty. ``parse_epub`` leaves
            it as ``None`` for items that are not rendered, so such a
            chapter has no text file to load.
    """
    if not chapter.text_file:
        # Fail with an explicit message instead of the opaque TypeError that
        # ``Path / None`` would raise.
        raise ValueError(
            f"chapter {chapter.index} has no text file (not rendered)")
    path = book_data_dir(slug) / chapter.text_file
    return ChapterText.model_validate_json(path.read_text(encoding="utf-8"))
|