Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)

2026-06-21 00:10:11 +02:00
commit d3bb91394b
71 changed files with 8138 additions and 0 deletions
--- a/backend/inkflow/pipeline/render.py
+++ b/backend/inkflow/pipeline/render.py
@@ -0,0 +1,158 @@
+"""Audio rendering of a chapter: (segments + voices) -> WAV -> MP3.
+
+A `RenderUnit` is a piece of text plus the voice to use. We build the list of
+units (single narrator or multi-voice, depending on the casting), synthesize each
+one, concatenate them with silences, normalize, then encode to MP3.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Optional
+
+from ..analysis.pronunciation import apply_pronunciation
+from ..audio.postprocess import concat_segments, encode_mp3, normalize_loudness, write_wav
+from ..config import book_data_dir, book_output_dir
+from ..models import (
+    Book,
+    Chapter,
+    ChapterAnalysis,
+    ChapterText,
+    Pronunciation,
+    SegmentType,
+)
+from ..tts.base import TTSBackend, VoiceSpec
+
+# Resolves a speaker name to a concrete voice (callers in this file guard the result with `or`).
+VoiceResolver = Callable[[str], VoiceSpec]
+
+
+@dataclass
+class RenderUnit:  # one piece of text plus the voice that should read it
+    text: str  # text handed to the TTS backend (after optional pronunciation rewriting)
+    voice: VoiceSpec  # voice passed to `backend.synthesize` for this text
+    speaker: str = "narrateur"  # speaker label; defaults to the narrator
+    glued_to_prev: bool = False   # incise (narrator clause) -> reduced gap with the previous unit
+
+
+def build_units_mono(ct: ChapterText, narrator: VoiceSpec) -> list[RenderUnit]:
+    """Mono-narrator: return one narrator-voiced RenderUnit per non-blank paragraph of `ct`."""
+    return [RenderUnit(text=p, voice=narrator) for p in ct.paragraphs if p.strip()]
+
+
+def make_voice_resolver(cast, voicebank, engine: str) -> VoiceResolver:
+    """Build a speaker -> VoiceSpec resolver from the cast and the voicebank.
+
+    Falls back to the narrator voice if the speaker has none; yields None if no voicebank entry exists.
+    """
+    from ..casting.assign import resolve_speaker_voice  # imported at call time; presumably avoids an import cycle - TODO confirm
+    from ..casting.voicebank import voice_spec_for
+
+    def resolve(speaker: str):  # NOTE(review): may return None although VoiceResolver is annotated -> VoiceSpec
+        vid = resolve_speaker_voice(speaker, cast, voicebank)
+        if vid is None:  # no voice assigned to this speaker -> use the narrator's voice id
+            vid = cast.narrator_voice_id
+        entry = voicebank.by_id(vid) if vid else None  # skip the lookup when there is no usable id
+        if entry is None:
+            return None  # no voice found; original note: the backend will then use its default voice
+        return voice_spec_for(entry, engine)
+
+    return resolve
+
+
+def build_units_multi(
+    analysis: ChapterAnalysis,
+    resolve: VoiceResolver,
+    default_voice: "VoiceSpec",
+) -> list[RenderUnit]:
+    """Multi-voice: narration -> narrator, dialogue -> the character's voice.
+
+    Incises (narrator clauses annotated on a line of dialogue as text bounds) are
+    split off here, at the last moment: each incise substring gets the narrator
+    voice (`glued_to_prev` set to shorten the silence), the rest the character's
+    voice. Lines without incises are rendered whole. Blank segments are skipped.
+    """
+    from ..analysis.segmenter import iter_incise_pieces
+
+    narrator = resolve("narrateur") or default_voice  # default_voice when the resolver gives a falsy result
+    units: list[RenderUnit] = []
+    for seg in analysis.segments:
+        if not seg.text.strip():  # nothing to read aloud
+            continue
+        if seg.type is SegmentType.NARRATION:
+            units.append(RenderUnit(text=seg.text, voice=narrator,
+                                    speaker="narrateur",
+                                    glued_to_prev=seg.glued_to_prev))
+            continue
+
+        char_voice = resolve(seg.speaker) or default_voice  # every non-narration segment is voiced by its speaker
+        if not seg.incises:  # no annotated incise: emit the whole line in the character's voice
+            units.append(RenderUnit(text=seg.text, voice=char_voice,
+                                    speaker=seg.speaker,
+                                    glued_to_prev=seg.glued_to_prev))
+            continue
+
+        for k, (is_incise, piece) in enumerate(
+                iter_incise_pieces(seg.text, seg.incises)):  # NOTE(review): pieces are not blank-checked like segments - confirm
+            glued = seg.glued_to_prev if k == 0 else True  # first piece keeps the segment's flag; later pieces are always glued
+            if is_incise:
+                units.append(RenderUnit(text=piece, voice=narrator,
+                                        speaker="narrateur", glued_to_prev=glued))
+            else:
+                units.append(RenderUnit(text=piece, voice=char_voice,
+                                        speaker=seg.speaker, glued_to_prev=glued))
+    return units
+
+
+def render_units(
+    units: list[RenderUnit],
+    backend: TTSBackend,
+    *,
+    pron: Optional[Pronunciation] = None,
+    progress: Optional[Callable[[int, int], None]] = None,
+) -> tuple["list", int]:
+    """Synthesize every unit in order and return (list of (audio, sr) pairs, number of units)."""
+    parts = []
+    total = len(units)
+    for i, unit in enumerate(units):
+        text = apply_pronunciation(unit.text, pron) if pron else unit.text  # rewrite only when `pron` is truthy
+        audio, sr = backend.synthesize(text, unit.voice)
+        parts.append((audio, sr))  # one entry per unit, in the same order as `units`
+        if progress:
+            progress(i + 1, total)  # report (units done so far, total) after each unit
+    return parts, total
+
+
+def render_chapter_to_mp3(
+    book: Book,
+    chapter: Chapter,
+    units: list[RenderUnit],
+    backend: TTSBackend,
+    *,
+    pron: Optional[Pronunciation] = None,
+    track: Optional[int] = None,
+    progress: Optional[Callable[[int, int], None]] = None,
+) -> Path:
+    """Full pipeline for one chapter -> output/<book>/NN-...mp3; returns the path of the MP3."""
+    parts, _ = render_units(units, backend, pron=pron, progress=progress)
+    # parts is aligned 1:1 with units -> pass the incise (glue) markers along.
+    audio, sr = concat_segments(parts, glued=[u.glued_to_prev for u in units])
+    audio = normalize_loudness(audio)
+
+    # Intermediate WAV under data/, final MP3 under output/.
+    wav_path = book_data_dir(book.slug) / "audio" / f"ch{chapter.index:02d}.wav"
+    write_wav(wav_path, audio, sr)
+
+    out_dir = book_output_dir(book.title)  # NOTE(review): keyed by title while the data dir is keyed by slug - confirm intended
+    mp3_path = out_dir / (chapter.output_name or f"ch{chapter.index:02d}.mp3")  # chNN.mp3 when output_name is empty
+    cover = None
+    if book.cover_file:
+        candidate = book_data_dir(book.slug) / book.cover_file
+        cover = candidate if candidate.exists() else None  # ignore a cover file that is missing on disk
+
+    encode_mp3(
+        wav_path, mp3_path,
+        title=chapter.title, album=book.title, artist=book.author,
+        track=track, cover_path=cover,
+    )
+    return mp3_path