Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)

2026-06-21 00:10:11 +02:00
commit d3bb91394b
71 changed files with 8138 additions and 0 deletions
--- a/backend/inkflow/casting/init.py
+++ b/backend/inkflow/casting/init.py
--- a/backend/inkflow/casting/assign.py
+++ b/backend/inkflow/casting/assign.py
@@ -0,0 +1,86 @@
+"""Auto-casting : attribue une voix distincte a chaque personnage.
+
+Strategie deterministe :
+- Narrateur : voix FR native par defaut (id "fr_f_siwis", voix Kokoro ff_siwis), sinon premiere voix de la banque.
+- Personnages : voix du meme genre, distinctes tant qu'il en reste ; au-dela on
+  recycle en repartissant le plus equitablement possible. Genre inconnu -> pool
+  mixte. L'ordre (tri par nom) garantit la reproductibilite.
+L'utilisateur pourra surcharger ces choix dans l'UI.
+"""
+from __future__ import annotations
+
+from collections import Counter
+from typing import Optional
+
+from ..models import Cast, Character, Voicebank
+
+# Preferred narrator voice id (native French voice); assign_voices falls back
+# to the first voicebank entry when this id is not present in the bank.
+PREFERRED_NARRATOR = "fr_f_siwis"
+
+
+def _pick_pool(vb: Voicebank, gender: Optional[str], narrator_id: str) -> list[str]:
+    """Return the candidate voice ids for a character of the given gender.
+
+    Gender is honoured STRICTLY (even if that means reusing voices): the pool
+    only spans every entry when the gender is unknown or when no voice of that
+    gender exists. The narrator's voice is left out as long as another
+    candidate remains, so the narrator stays distinguishable.
+    """
+    candidates: list[str] = []
+    if gender in ("male", "female"):
+        candidates = [entry.id for entry in vb.by_gender(gender)]
+    if not candidates:
+        # Unknown gender, or no voice of that gender: use the whole bank.
+        candidates = [entry.id for entry in vb.entries]
+
+    without_narrator = [voice for voice in candidates if voice != narrator_id]
+    if without_narrator:
+        return without_narrator
+    # The narrator's voice is the only candidate: keep it rather than nothing.
+    return candidates
+
+
+def assign_voices(
+    characters: list[Character],
+    vb: Voicebank,
+    *,
+    narrator_voice_id: Optional[str] = None,
+    respect_existing: bool = False,
+) -> Cast:
+    """Build a Cast: one narrator voice plus one voice per character.
+
+    The characters are mutated in place (their `voice_id` is set) and the same
+    list is handed to the returned Cast.
+
+    With `respect_existing=True`, a character whose current `voice_id` exists in
+    the voicebank keeps it (UI override); otherwise every voice is recomputed.
+    When the voicebank has no entries, nothing is assigned and the narrator id
+    is passed through unchanged.
+    """
+    if not vb.entries:
+        return Cast(narrator_voice_id=narrator_voice_id, characters=characters)
+
+    # Narrator: explicit choice, else the preferred voice, else the first entry.
+    if narrator_voice_id:
+        narrator_id = narrator_voice_id
+    elif vb.by_id(PREFERRED_NARRATOR):
+        narrator_id = PREFERRED_NARRATOR
+    else:
+        narrator_id = vb.entries[0].id
+
+    # The narrator already counts as one use of its voice.
+    usage: Counter[str] = Counter({narrator_id: 1})
+
+    # Sorting by lower-cased name makes the assignment reproducible.
+    ordered = sorted(characters, key=lambda c: c.name.lower())
+    for character in ordered:
+        current = character.voice_id
+        if respect_existing and current and vb.by_id(current):
+            usage[current] += 1
+            continue  # keep an existing (user-overridden) assignment
+
+        candidates = _pick_pool(vb, character.gender, narrator_id)
+        # Least-used voice first (so unused voices are taken before any reuse);
+        # min() returns the first minimal element, so ties follow pool order.
+        chosen = min(candidates, key=lambda voice: usage[voice])
+        character.voice_id = chosen
+        usage[chosen] += 1
+
+    return Cast(narrator_voice_id=narrator_id, characters=characters)
+
+
+def resolve_speaker_voice(
+    speaker: str, cast: Cast, vb: Voicebank
+) -> Optional[str]:
+    """Map a segment's speaker name to a voice id.
+
+    Tries an exact, case-insensitive match on character names and aliases
+    first (fast), then falls back to the token heuristic from `dedup` (e.g. a
+    "Jim" that has not yet been absorbed as an alias of "James Holden").
+    Returns None for an unknown speaker.
+    """
+    if speaker == "narrateur":
+        return cast.narrator_voice_id
+
+    wanted = speaker.lower()
+    for character in cast.characters:
+        known_forms = [character.name, *character.aliases]
+        if any(form.lower() == wanted for form in known_forms):
+            return character.voice_id
+
+    from .dedup import heuristic_match
+    candidate = heuristic_match(speaker, cast.characters)
+    if not isinstance(candidate, Character):
+        return None  # unknown -> rendering falls back to the narrator
+    return candidate.voice_id
--- a/backend/inkflow/casting/dedup.py
+++ b/backend/inkflow/casting/dedup.py
@@ -0,0 +1,345 @@
+"""Reconciliation du casting : deduplication des variantes de noms.
+
+Probleme : un meme personnage apparait sous plusieurs formes ("Holden",
+"James Holden", "James", "Jim"). Sans reconciliation, chaque forme devient un
+personnage distinct avec sa propre voix -> incoherence a l'ecoute.
+
+Strategie hybride :
+1. Heuristique (sans LLM) : match exact sur nom/alias, puis sous-ensemble de
+   tokens ("Holden" contenu dans "James Holden").
+2. Gemma tranche les cas ambigus (plusieurs candidats compatibles, ou variante
+   non evidente type "Jim" <-> "James") a l'aide des descriptions.
+
+Chaque variante rencontree est conservee comme `alias` du personnage canonique ;
+le nom canonique est la forme la plus complete vue ("James Holden"). Les
+artefacts d'analyse (segments) ne sont PAS modifies : la resolution de voix au
+rendu s'appuie sur les aliases (`casting/assign.py`).
+"""
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+from ..models import Character
+from ..settings import get_settings
+
+# Internal sentinels.
+_AMBIGUOUS = object()   # heuristic: several candidates -> defer to Gemma
+_NEW = object()         # Gemma decision: new character
+
+# Stopwords / titles ignored when matching names by tokens.
+_STOPWORDS = {
+    "le", "la", "les", "un", "une", "de", "du", "des", "monsieur", "madame",
+    "mademoiselle", "m", "mme", "mlle", "mr", "dr", "docteur", "capitaine",
+    "lieutenant", "sergent", "general", "amiral", "the", "of",
+}
+# Token separator: any run of characters that are neither `\w` nor in the
+# À-ÿ range.
+_SPLIT_RE = re.compile(r"[^\wÀ-ÿ]+")
+
+# Context guard (in characters) for the Gemma prompt.
+_MAX_PROMPT_CHARS = 24000
+
+
+def _norm(name: str) -> str:
+    """Return the comparison key of a name: stripped, then lower-cased."""
+    trimmed = name.strip()
+    return trimmed.lower()
+
+
+def _tokens(name: str) -> set[str]:
+    """Return the significant tokens of a name.
+
+    Tokens are lower-cased; pieces shorter than two characters and anything
+    listed in `_STOPWORDS` (titles, articles) are discarded.
+    """
+    significant: set[str] = set()
+    for piece in _SPLIT_RE.split(name.strip()):
+        lowered = piece.lower()
+        # The length test also drops the empty pieces produced by the split.
+        if len(piece) >= 2 and lowered not in _STOPWORDS:
+            significant.add(lowered)
+    return significant
+
+
+def _completeness(name: str) -> tuple[int, int]:
+    """Sort key for the most "complete" name: more tokens, then more characters."""
+    trimmed = name.strip()
+    token_count = len(_tokens(trimmed))
+    return token_count, len(trimmed)
+
+
+def _forms(c: Character) -> list[str]:
+    """Return every surface form of a character: its name, then its aliases."""
+    surfaces = [c.name]
+    surfaces.extend(c.aliases)
+    return surfaces
+
+
+def _token_freq(characters: list[Character], extra: Optional[list[str]] = None):
+    """Count, for each token, how many distinct surfaces contain it.
+
+    Used to judge how distinctive a token is: "holden" found in a single family
+    of names is safe to merge on, whereas "alex" found in several is not.
+    Surfaces are the normalised names and aliases of `characters`, plus the
+    normalised strings in `extra`.
+    """
+    from collections import Counter
+    surfaces = {_norm(form) for ch in characters for form in _forms(ch)}
+    if extra:
+        surfaces.update(_norm(raw) for raw in extra)
+    # _tokens returns a set, so each surface counts a given token only once.
+    return Counter(token for text in surfaces for token in _tokens(text))
+
+
+def heuristic_match(surface: str, characters: list[Character], tokfreq=None):
+    """Match `surface` to a known character without an LLM (conservative).
+
+    Returns the matching `Character`, `None` when nothing links, or the
+    `_AMBIGUOUS` sentinel when a link is plausible but uncertain (the decision
+    is then left to Gemma).
+
+    A token-subset link is only considered SAFE when the smaller side has >=2
+    tokens, or when every shared token is globally distinctive (present in <=2
+    surfaces). Otherwise the link is ambiguous (e.g. a common first name
+    "Alex" shared by several characters).
+
+    `tokfreq` is an optional precomputed token-frequency table; when omitted it
+    is built from `characters` plus `surface`.
+    """
+    s_norm = _norm(surface)
+    # Pass 1: an exact (normalised) match on a name or alias wins immediately.
+    for c in characters:
+        if _norm(c.name) == s_norm or any(_norm(a) == s_norm for a in c.aliases):
+            return c
+    s_tok = _tokens(surface)
+    if not s_tok:
+        # No significant token left (only stopwords / short pieces).
+        return None
+    if tokfreq is None:
+        tokfreq = _token_freq(characters, [surface])
+
+    # Pass 2: token-subset links, classified per character as safe or not.
+    safe: list[Character] = []
+    ambiguous = False
+    for c in characters:
+        linked = is_safe = False
+        for form in _forms(c):
+            f_tok = _tokens(form)
+            # Only forms whose tokens contain, or are contained in, the
+            # surface's tokens are considered linked.
+            if not f_tok or not (s_tok <= f_tok or f_tok <= s_tok):
+                continue
+            linked = True
+            shared = s_tok & f_tok
+            if min(len(s_tok), len(f_tok)) >= 2 or all(tokfreq[t] <= 2 for t in shared):
+                is_safe = True
+        if is_safe:
+            safe.append(c)
+        elif linked:
+            ambiguous = True
+    # A single safe candidate with no competing uncertain link is a match.
+    if len(safe) == 1 and not ambiguous:
+        return safe[0]
+    # Several safe candidates, or any uncertain link: let the caller decide.
+    if safe or ambiguous:
+        return _AMBIGUOUS
+    return None
+
+
+def canonical_of(a: str, b: str) -> str:
+    """Return the canonical form of two variants: the more complete one.
+
+    On a tie, `a` is returned.
+    """
+    if _completeness(a) >= _completeness(b):
+        return a
+    return b
+
+
+def _absorb(
+    target: Character,
+    name: str,
+    *,
+    gender: Optional[str] = None,
+    age: Optional[str] = None,
+    description: Optional[str] = None,
+    voice_id: Optional[str] = None,
+) -> None:
+    """Merge the variant `name` into `target` (in-place mutation).
+
+    Fills in the attributes `target` is missing (existing values win), then
+    recomputes the canonical name as the most complete known form and stores
+    every other form as an alias (sorted).
+
+    When neither `target` nor `name` provides a non-blank form, only the
+    attributes are enriched and the name/aliases are left untouched.
+    """
+    target.gender = target.gender or gender
+    target.age = target.age or age
+    target.description = target.description or description
+    target.voice_id = target.voice_id or voice_id
+
+    # normalised form -> original spelling (the first spelling seen is kept)
+    forms: dict[str, str] = {}
+    for f in [target.name, *target.aliases, name]:
+        f = (f or "").strip()
+        if f:
+            forms.setdefault(_norm(f), f)
+    if not forms:
+        # Every form is blank: max() below would raise ValueError on an empty
+        # dict, and there is no name to choose anyway.
+        return
+    # On a completeness tie max() keeps the earliest entry, i.e. the current
+    # name wins over aliases and over the incoming variant.
+    canon = max(forms, key=lambda n: _completeness(forms[n]))
+    target.name = forms[canon]
+    target.aliases = sorted(v for k, v in forms.items() if k != canon)
+
+
+def _item(c) -> dict:
+    """Normalise a `Character` or a raw name into a reconciliation entry.
+
+    Anything that is not a `Character` is converted with `str()` and gets no
+    attributes.
+    """
+    if isinstance(c, Character):
+        name, gender, age = c.name, c.gender, c.age
+        description, voice_id = c.description, c.voice_id
+    else:
+        name = str(c)
+        gender = age = description = voice_id = None
+    return {
+        "name": name,
+        "gender": gender,
+        "age": age,
+        "description": description,
+        "voice_id": voice_id,
+    }
+
+
+def _find(chars: list[Character], name: str) -> Optional[Character]:
+    """Return the first character whose name or alias equals `name` once normalised."""
+    wanted = _norm(name)
+    for candidate in chars:
+        if _norm(candidate.name) == wanted:
+            return candidate
+        if any(_norm(alias) == wanted for alias in candidate.aliases):
+            return candidate
+    return None
+
+
+def _create(chars: list[Character], it: dict, name_map: dict[str, str]) -> None:
+    """Append a new character built from entry `it` and record it in `name_map`."""
+    raw_name = it["name"]
+    created = Character(
+        name=raw_name.strip(),
+        gender=it["gender"],
+        age=it["age"],
+        description=it["description"],
+        voice_id=it["voice_id"],
+    )
+    chars.append(created)
+    # Map the normalised surface form to the name stored on the new character.
+    name_map[_norm(raw_name)] = created.name
+
+
+def reconcile_characters(
+    book_chars: list[Character],
+    new_chars,
+    gemma=None,
+    *,
+    speaker_names: Optional[list[str]] = None,
+) -> tuple[list[Character], dict[str, str]]:
+    """Reconcile newly detected characters into the book's cast.
+
+    `new_chars`: characters extracted from the chapter(s) (`Character` objects;
+    anything else is treated as a raw name).
+    `speaker_names`: raw speaker forms seen in the segments, absorbed as
+    aliases so that voice resolution matches them at render time.
+    `gemma`: when provided, settles the ambiguous cases; otherwise the
+    heuristic is used alone.
+
+    `book_chars` is not mutated: the function works on deep copies.
+
+    Returns (updated canonical list, map normalised_surface_name -> canonical).
+    """
+    chars = [c.model_copy(deep=True) for c in book_chars]
+    name_map: dict[str, str] = {}
+
+    items = [_item(c) for c in new_chars]
+    seen = {_norm(it["name"]) for it in items}
+    # Add speaker forms not already covered, skipping blanks and the
+    # narrator / unknown placeholders.
+    for sp in (speaker_names or []):
+        n = _norm(sp or "")
+        if n and n not in seen and n not in {"narrateur", "inconnu", "?"}:
+            items.append(_item(sp))
+            seen.add(n)
+
+    # Global token frequency (existing + incoming) -> stable distinctiveness,
+    # independent of the processing order.
+    tokfreq = _token_freq(chars, [it["name"] for it in items])
+
+    # First pass: heuristic only; uncertain items are queued in `pending`.
+    pending: list[dict] = []
+    for it in items:
+        m = heuristic_match(it["name"], chars, tokfreq)
+        if m is _AMBIGUOUS:
+            pending.append(it)
+        elif m is not None:
+            _absorb(m, it["name"], gender=it["gender"], age=it["age"],
+                    description=it["description"], voice_id=it["voice_id"])
+            name_map[_norm(it["name"])] = m.name
+        elif gemma is not None:
+            pending.append(it)  # may be a non-obvious variant ("Jim")
+        else:
+            _create(chars, it, name_map)
+
+    if pending and gemma is not None:
+        # Second pass: one grouped Gemma call for everything still pending.
+        decisions = _gemma_reconcile(chars, pending, gemma)
+        for it in pending:
+            canon = decisions.get(_norm(it["name"]))
+            target = _find(chars, canon) if isinstance(canon, str) else None
+            if target is None:  # Gemma says NEW/unknown: last heuristic attempt
+                hm = heuristic_match(it["name"], chars, tokfreq)
+                target = hm if isinstance(hm, Character) else None
+            if target is not None:
+                _absorb(target, it["name"], gender=it["gender"], age=it["age"],
+                        description=it["description"], voice_id=it["voice_id"])
+                name_map[_norm(it["name"])] = target.name
+            else:
+                _create(chars, it, name_map)
+    elif pending:
+        # Without Gemma: do not guess ambiguous cases, keep them distinct.
+        for it in pending:
+            _create(chars, it, name_map)
+
+    return chars, name_map
+
+
+def dedup_cast(characters: list[Character], gemma=None) -> list[Character]:
+    """Fold the duplicates of an existing cast (assigned voices are kept).
+
+    Two phases: (1) heuristic-only grouping (`gemma=None`) producing a reduced,
+    safe list; (2) when `gemma` is provided, a Gemma pass over the short forms
+    to merge the variants the heuristic leaves aside (e.g. "Okoye" ->
+    "Elvi Okoye").
+
+    Aliases already attached to the input characters are preserved: phase 1
+    only carries name/gender/age/description/voice_id for each incoming
+    character, so the aliases are re-attached to whichever character the
+    original name ended up in.
+    """
+    base, _ = reconcile_characters([], characters, gemma=None)
+
+    # Restore pre-existing aliases, which phase 1 does not transport.
+    for original in characters:
+        aliases = getattr(original, "aliases", None)
+        if not aliases:
+            continue
+        owner = _find(base, original.name)
+        if owner is None:
+            continue
+        for alias in aliases:
+            _absorb(owner, alias)
+
+    if gemma is None:
+        return base
+    return _gemma_merge_pass(base, gemma)
+
+
+def _gemma_merge_pass(base: list[Character], gemma) -> list[Character]:
+    """Attach short forms to a full name (anchor) with Gemma's help.
+
+    The task is deliberately constrained (and more reliable than free-form
+    grouping): a "short form" is a name whose tokens are strictly included in
+    those of another name (e.g. "Okoye" vs "Elvi Okoye"). Gemma maps each short
+    form to the EXACT canonical name of an anchor, or to "NOUVEAU". Short forms
+    are processed in small batches to stay within the model's reliable range.
+
+    Returns the (copied) anchors with merged short forms, followed by the short
+    forms that could not be attached.
+    """
+    # Token set of every name, computed once.
+    name_tokens = [_tokens(c.name) for c in base]
+
+    shorts: list[Character] = []
+    anchors: list[Character] = []
+    for idx, candidate in enumerate(base):
+        own = name_tokens[idx]
+        is_short = bool(own) and any(
+            other_idx != idx and own < other
+            for other_idx, other in enumerate(name_tokens)
+        )
+        if is_short:
+            shorts.append(candidate)
+        else:
+            anchors.append(candidate)
+    if not shorts:
+        return base
+
+    merged = [anchor.model_copy(deep=True) for anchor in anchors]
+    unmatched: list[Character] = []
+    batch_size = 12
+    for offset in range(0, len(shorts), batch_size):
+        batch = shorts[offset:offset + batch_size]
+        verdicts = _gemma_reconcile(merged, [_item(s) for s in batch], gemma)
+        for short in batch:
+            verdict = verdicts.get(_norm(short.name))
+            target = _find(merged, verdict) if isinstance(verdict, str) else None
+            if target is None:
+                fallback = heuristic_match(short.name, merged)
+                if isinstance(fallback, Character):
+                    target = fallback
+
+            # Safeguard: never merge two known, opposite genders.
+            genders_clash = (
+                target is not None
+                and short.gender
+                and target.gender
+                and short.gender != target.gender
+            )
+            if target is None or genders_clash:
+                unmatched.append(short)
+                continue
+
+            _absorb(target, short.name, gender=short.gender, age=short.age,
+                    description=short.description, voice_id=short.voice_id)
+            for alias in short.aliases:
+                _absorb(target, alias)
+    return merged + unmatched
+
+
+def _gemma_reconcile(
+    chars: list[Character], pending: list[dict], gemma
+) -> dict[str, object]:
+    """One grouped call: for each pending name, its canonical name or `_NEW`.
+
+    Returns a dict keyed by the normalised pending name. Names for which the
+    model gave no usable answer are simply absent from the result.
+
+    The prompt is kept under `_MAX_PROMPT_CHARS` by dropping known-character
+    lines from the end of the list: the detected names, the instructions and
+    the expected JSON format sit at the END of the prompt, so cutting the tail
+    would remove exactly what the model needs. A tail cut remains only as a
+    last resort when the detected names alone exceed the limit.
+    """
+    known = []
+    for c in chars:
+        al = f" (alias: {', '.join(c.aliases)})" if c.aliases else ""
+        desc = f" — {c.description}" if c.description else ""
+        known.append(f"- {c.name}{al}{desc}")
+    new_lines = []
+    for n, it in enumerate(pending):
+        desc = f" — {it['description']}" if it.get("description") else ""
+        new_lines.append(f"[{n}] {it['name']}{desc}")
+
+    header = "Personnages DEJA connus du livre :\n"
+    detected = "\n\nNoms DETECTES a classer :\n" + "\n".join(new_lines)
+    instructions = (
+        "\n\nPour chaque nom detecte, indique s'il designe un personnage deja "
+        "connu (donne alors son nom canonique EXACT tel qu'ecrit ci-dessus) ou "
+        "s'il s'agit d'un nouveau personnage (\"NOUVEAU\"). Ne fusionne que si "
+        "c'est, avec certitude, la meme personne. EN CAS DE DOUTE, ou si "
+        "plusieurs personnages connus pourraient correspondre, reponds "
+        "\"NOUVEAU\". Ne rapproche jamais deux personnes differentes qui "
+        "partagent seulement un prenom ou un nom de famille.\n\n"
+        'Reponds par un tableau JSON: '
+        '[{"i":0,"canonical":"James Holden"},{"i":1,"canonical":"NOUVEAU"}]'
+    )
+
+    known_block = "\n".join(known) if known else "(aucun)"
+    budget = _MAX_PROMPT_CHARS - len(header) - len(detected) - len(instructions)
+    if len(known_block) > budget:
+        # Keep as many known characters as fit, in order.
+        kept: list[str] = []
+        used = 0
+        for line in known:
+            cost = len(line) + (1 if kept else 0)  # +1 for the joining newline
+            if used + cost > budget:
+                break
+            kept.append(line)
+            used += cost
+        known_block = "\n".join(kept) if kept else "(aucun)"
+
+    prompt = header + known_block + detected + instructions
+    if len(prompt) > _MAX_PROMPT_CHARS:
+        # Last resort: the detected names alone do not fit.
+        prompt = prompt[:_MAX_PROMPT_CHARS]
+    result = gemma.generate_json(prompt, system=get_settings().prompt_dedup)
+
+    decisions: dict[str, object] = {}
+    if not isinstance(result, (list, tuple)):
+        # Not the expected JSON array (e.g. None): no decision at all.
+        return decisions
+    for item in result:
+        if not isinstance(item, dict) or "i" not in item:
+            continue
+        n = item["i"]
+        canon = str(item.get("canonical") or "").strip()
+        # bool is a subclass of int: reject it so `true` is not read as index 1.
+        valid_index = (
+            isinstance(n, int) and not isinstance(n, bool)
+            and 0 <= n < len(pending)
+        )
+        if valid_index and canon:
+            decisions[_norm(pending[n]["name"])] = (
+                _NEW if canon.upper() == "NOUVEAU" else canon)
+    return decisions
--- a/backend/inkflow/casting/voicebank.py
+++ b/backend/inkflow/casting/voicebank.py
@@ -0,0 +1,91 @@
+"""Banque de voix : un jeu de voix variees (genre/age) auto-suffisant.
+
+Chaque voix s'appuie sur une voix Kokoro (identite + clip de reference). Le clip
+de reference est genere une fois en lisant un passage francais standard ; il sert
+de reference de timbre pour le clonage Qwen3 (rendu final). Aucune ressource
+externe a sourcer.
+
+Resolution moteur :
+- Kokoro -> VoiceSpec(preset=kokoro_voice)        (rapide, preview / draft)
+- Qwen3  -> VoiceSpec(ref_audio=clip, ref_text=…) (qualite, clonage)
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+import soundfile as sf
+
+from ..config import VOICEBANK_DIR
+from ..models import VoiceEntry, Voicebank
+from ..tts.base import VoiceSpec
+
+# Reference passage read by each voice to create its cloning clip.
+REFERENCE_TEXT = (
+    "L'univers est toujours plus étrange qu'on ne le croit. "
+    "Chaque nouvelle merveille pose les bases d'une découverte plus éblouissante encore."
+)
+
+# Default voice set (varied in gender). ff_siwis is the only native French
+# voice; the others borrow an English timbre but read text phonemized in French.
+SEED: list[VoiceEntry] = [
+    VoiceEntry(id="fr_f_siwis",  kokoro_voice="ff_siwis",   gender="female", age="adult", label="Siwis (FR)"),
+    VoiceEntry(id="f_bella",     kokoro_voice="af_bella",   gender="female", age="adult", label="Bella"),
+    VoiceEntry(id="f_heart",     kokoro_voice="af_heart",   gender="female", age="young", label="Heart"),
+    VoiceEntry(id="f_emma",      kokoro_voice="bf_emma",    gender="female", age="adult", label="Emma"),
+    VoiceEntry(id="f_nicole",    kokoro_voice="af_nicole",  gender="female", age="adult", label="Nicole"),
+    VoiceEntry(id="m_fenrir",    kokoro_voice="am_fenrir",  gender="male",   age="adult", label="Fenrir"),
+    VoiceEntry(id="m_michael",   kokoro_voice="am_michael", gender="male",   age="adult", label="Michael"),
+    VoiceEntry(id="m_george",    kokoro_voice="bm_george",  gender="male",   age="adult", label="George"),
+    VoiceEntry(id="m_lewis",     kokoro_voice="bm_lewis",   gender="male",   age="adult", label="Lewis"),
+    VoiceEntry(id="m_eric",      kokoro_voice="am_eric",    gender="male",   age="young", label="Eric"),
+    VoiceEntry(id="m_santa",     kokoro_voice="am_santa",   gender="male",   age="old",   label="Santa"),
+]
+
+
+def metadata_path() -> Path:
+    """Return the location of the voicebank's metadata.json file."""
+    return VOICEBANK_DIR.joinpath("metadata.json")
+
+
+def clips_dir() -> Path:
+    """Return the directory that holds the reference clips."""
+    return VOICEBANK_DIR.joinpath("clips")
+
+
+def load_voicebank() -> Voicebank:
+    """Load the voicebank from metadata.json, or build it from SEED if absent."""
+    source = metadata_path()
+    if not source.exists():
+        return Voicebank(entries=list(SEED))
+    raw = source.read_text(encoding="utf-8")
+    return Voicebank.model_validate_json(raw)
+
+
+def save_voicebank(vb: Voicebank) -> Path:
+    """Write the voicebank to metadata.json (creating the folder) and return its path."""
+    VOICEBANK_DIR.mkdir(parents=True, exist_ok=True)
+    target = metadata_path()
+    payload = vb.model_dump_json(indent=2)
+    target.write_text(payload, encoding="utf-8")
+    return target
+
+
+def build_voicebank(*, regenerate: bool = False) -> Voicebank:
+    """Generate the missing reference clips and write metadata.json.
+
+    Each SEED voice reads REFERENCE_TEXT through the Kokoro backend; a clip is
+    synthesized only when it does not exist yet, or always when `regenerate` is
+    true. Every entry of the returned voicebank points to its clip (path
+    relative to the voicebank folder) and to the reference text.
+    """
+    from ..tts.kokoro import KokoroBackend
+
+    clips_dir().mkdir(parents=True, exist_ok=True)
+    backend = KokoroBackend()
+
+    entries: list[VoiceEntry] = []
+    for seed in SEED:
+        relative = f"clips/{seed.id}.wav"
+        target = VOICEBANK_DIR / relative
+        needs_clip = regenerate or not target.exists()
+        if needs_clip:
+            spec = VoiceSpec(preset=seed.kokoro_voice)
+            audio, sr = backend.synthesize(REFERENCE_TEXT, spec)
+            sf.write(str(target), audio, sr)
+        entries.append(
+            seed.model_copy(update={"ref_audio": relative, "ref_text": REFERENCE_TEXT})
+        )
+
+    vb = Voicebank(entries=entries)
+    save_voicebank(vb)
+    return vb
+
+
+def voice_spec_for(entry: VoiceEntry, engine: str, *, speed: float = 1.0) -> VoiceSpec:
+    """Build the VoiceSpec suited to the target engine.
+
+    For "qwen3" with a reference clip available, the spec carries the clip
+    path (resolved under the voicebank folder) and its text; in every other
+    case it carries the entry's Kokoro preset.
+    """
+    use_reference_clip = engine == "qwen3" and bool(entry.ref_audio)
+    if not use_reference_clip:
+        return VoiceSpec(preset=entry.kokoro_voice, speed=speed)
+    clip_path = str(VOICEBANK_DIR / entry.ref_audio)
+    return VoiceSpec(ref_audio=clip_path, ref_text=entry.ref_text, speed=speed)