Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)
This commit is contained in:
0
backend/inkflow/casting/__init__.py
Normal file
0
backend/inkflow/casting/__init__.py
Normal file
86
backend/inkflow/casting/assign.py
Normal file
86
backend/inkflow/casting/assign.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Auto-casting : attribue une voix distincte a chaque personnage.
|
||||
|
||||
Strategie deterministe :
|
||||
- Narrateur : voix FR native par defaut (id `fr_f_siwis`, voix Kokoro ff_siwis), sinon premiere voix.
|
||||
- Personnages : voix du meme genre, distinctes tant qu'il en reste ; au-dela on
|
||||
recycle en repartissant le plus equitablement possible. Genre inconnu -> pool
|
||||
mixte. L'ordre (tri par nom) garantit la reproductibilite.
|
||||
L'utilisateur pourra surcharger ces choix dans l'UI.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import Counter
|
||||
from typing import Optional
|
||||
|
||||
from ..models import Cast, Character, Voicebank
|
||||
|
||||
# Voix narrateur preferee (FR native).
|
||||
PREFERRED_NARRATOR = "fr_f_siwis"
|
||||
|
||||
|
||||
def _pick_pool(vb: Voicebank, gender: Optional[str], narrator_id: str) -> list[str]:
|
||||
"""Voix candidates : on privilegie STRICTEMENT le genre (quitte a reutiliser).
|
||||
|
||||
On ne croise le genre que si aucune voix du bon genre n'existe. Le narrateur
|
||||
est exclu tant qu'il reste d'autres options, pour le distinguer.
|
||||
"""
|
||||
same = [e.id for e in vb.by_gender(gender)] if gender in ("male", "female") else []
|
||||
pool = same if same else [e.id for e in vb.entries]
|
||||
non_narrator = [vid for vid in pool if vid != narrator_id]
|
||||
return non_narrator or pool # garde le narrateur seulement s'il est seul
|
||||
|
||||
|
||||
def assign_voices(
    characters: list[Character],
    vb: Voicebank,
    *,
    narrator_voice_id: Optional[str] = None,
    respect_existing: bool = False,
) -> Cast:
    """Build a Cast holding the narrator voice and one voice per character.

    The characters are mutated in place (their ``voice_id`` is set) and are
    processed in case-insensitive name order so the result is reproducible.

    With ``respect_existing=True`` a character keeps its current voice when
    that voice exists in the bank (UI overrides); otherwise every voice is
    recomputed from scratch.
    """
    if not vb.entries:
        # Empty bank: nothing can be assigned, return the inputs as they are.
        return Cast(narrator_voice_id=narrator_voice_id, characters=characters)

    if narrator_voice_id:
        narrator_id = narrator_voice_id
    elif vb.by_id(PREFERRED_NARRATOR):
        narrator_id = PREFERRED_NARRATOR
    else:
        narrator_id = vb.entries[0].id

    # The narrator already counts as one use of its voice.
    usage: Counter[str] = Counter({narrator_id: 1})

    ordered = sorted(characters, key=lambda c: c.name.lower())
    for character in ordered:
        keep_current = (
            respect_existing
            and character.voice_id
            and vb.by_id(character.voice_id)
        )
        if keep_current:
            # Honour an existing assignment (user override) but still count it.
            usage[character.voice_id] += 1
            continue

        pool = _pick_pool(vb, character.gender, narrator_id)
        # Least-used voice of the pool, so unused voices go first; ``min``
        # returns the earliest pool entry on ties.
        chosen = min(pool, key=lambda voice_id: usage[voice_id])
        character.voice_id = chosen
        usage[chosen] += 1

    return Cast(narrator_voice_id=narrator_id, characters=characters)
|
||||
|
||||
|
||||
def resolve_speaker_voice(
    speaker: str, cast: Cast, vb: Voicebank
) -> Optional[str]:
    """Map a segment's speaker label to a voice id.

    The label is compared after trimming and lower-casing, the same
    normalisation the reconciliation step applies, so " Narrateur " resolves
    to the narrator voice just like "narrateur".

    Lookup order: narrator label, then exact character name/alias, then the
    token heuristic from ``dedup.heuristic_match`` as a last resort (e.g. a
    "Jim" not yet recorded as an alias of "James Holden").

    Returns the voice id, or ``None`` when the speaker is unknown (the
    original note says rendering then falls back to the narrator). ``vb`` is
    accepted for interface compatibility and is not read here.
    """
    wanted = (speaker or "").strip().lower()
    if not wanted:
        return None  # a blank label cannot identify anyone
    if wanted == "narrateur":
        return cast.narrator_voice_id

    for ch in cast.characters:
        if ch.name.strip().lower() == wanted:
            return ch.voice_id
        if any(alias.strip().lower() == wanted for alias in ch.aliases):
            return ch.voice_id

    # Imported locally, as in the original (presumably to avoid an import cycle).
    from .dedup import heuristic_match

    match = heuristic_match(speaker, cast.characters)
    if isinstance(match, Character):
        return match.voice_id
    return None  # unknown speaker (or ambiguous heuristic result)
|
||||
345
backend/inkflow/casting/dedup.py
Normal file
345
backend/inkflow/casting/dedup.py
Normal file
@@ -0,0 +1,345 @@
|
||||
"""Reconciliation du casting : deduplication des variantes de noms.
|
||||
|
||||
Probleme : un meme personnage apparait sous plusieurs formes ("Holden",
|
||||
"James Holden", "James", "Jim"). Sans reconciliation, chaque forme devient un
|
||||
personnage distinct avec sa propre voix -> incoherence a l'ecoute.
|
||||
|
||||
Strategie hybride :
|
||||
1. Heuristique (sans LLM) : match exact sur nom/alias, puis sous-ensemble de
|
||||
tokens ("Holden" contenu dans "James Holden").
|
||||
2. Gemma tranche les cas ambigus (plusieurs candidats compatibles, ou variante
|
||||
non evidente type "Jim" <-> "James") a l'aide des descriptions.
|
||||
|
||||
Chaque variante rencontree est conservee comme `alias` du personnage canonique ;
|
||||
le nom canonique est la forme la plus complete vue ("James Holden"). Les
|
||||
artefacts d'analyse (segments) ne sont PAS modifies : la resolution de voix au
|
||||
rendu s'appuie sur les aliases (`casting/assign.py`).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from ..models import Character
|
||||
from ..settings import get_settings
|
||||
|
||||
# Sentinelles internes.
|
||||
_AMBIGUOUS = object() # heuristique : plusieurs candidats -> on delegue a Gemma
|
||||
_NEW = object() # decision Gemma : nouveau personnage
|
||||
|
||||
# Mots vides / titres a ignorer pour le rapprochement par tokens.
|
||||
_STOPWORDS = {
|
||||
"le", "la", "les", "un", "une", "de", "du", "des", "monsieur", "madame",
|
||||
"mademoiselle", "m", "mme", "mlle", "mr", "dr", "docteur", "capitaine",
|
||||
"lieutenant", "sergent", "general", "amiral", "the", "of",
|
||||
}
|
||||
_SPLIT_RE = re.compile(r"[^\wÀ-ÿ]+")
|
||||
|
||||
# Garde-fou de contexte (caracteres) pour le prompt Gemma.
|
||||
_MAX_PROMPT_CHARS = 24000
|
||||
|
||||
|
||||
def _norm(name: str) -> str:
|
||||
return name.strip().lower()
|
||||
|
||||
|
||||
def _tokens(name: str) -> set[str]:
    """Return the significant tokens of a name.

    Tokens are lower-cased; pieces shorter than two characters and entries of
    ``_STOPWORDS`` (articles, titles) are discarded.
    """
    significant: set[str] = set()
    for piece in _SPLIT_RE.split(name.strip()):
        if len(piece) < 2:
            continue  # also skips the empty strings produced by the split
        lowered = piece.lower()
        if lowered not in _STOPWORDS:
            significant.add(lowered)
    return significant
|
||||
|
||||
|
||||
def _completeness(name: str) -> tuple[int, int]:
    """Sort key for the "most complete" name: token count, then trimmed length."""
    token_count = len(_tokens(name))
    char_count = len(name.strip())
    return (token_count, char_count)
|
||||
|
||||
|
||||
def _forms(c: Character) -> list[str]:
|
||||
return [c.name, *c.aliases]
|
||||
|
||||
|
||||
def _token_freq(characters: list[Character], extra: Optional[list[str]] = None):
    """Count, for each token, how many distinct surfaces contain it.

    A surface is a normalised name or alias of ``characters``, plus the
    normalised strings of ``extra``. The count is used to judge how
    distinctive a token is: "holden" found in a single family is safe to
    merge on, "alex" found in several is not.

    Returns a ``collections.Counter`` keyed by token.
    """
    from collections import Counter

    surfaces: set[str] = set()
    for character in characters:
        for form in _forms(character):
            surfaces.add(_norm(form))
    for raw in extra or []:
        surfaces.add(_norm(raw))

    freq: Counter[str] = Counter()
    for surface in surfaces:
        # _tokens returns a set, so each token counts once per surface.
        freq.update(_tokens(surface))
    return freq
|
||||
|
||||
|
||||
def heuristic_match(surface: str, characters: list[Character], tokfreq=None):
    """Match ``surface`` to a known character without any LLM (conservative).

    Returns the matching ``Character``, ``None`` when nothing matches, or the
    ``_AMBIGUOUS`` sentinel when a link is plausible but uncertain (the
    decision is then left to Gemma).

    An exact match on a normalised name or alias wins immediately. Otherwise
    a form is linked when its token set contains, or is contained in, the
    surface's token set. Such a link is only SAFE when the smaller side has at
    least two tokens, or when every shared token is distinctive (present in at
    most two surfaces according to ``tokfreq``). Any other link is ambiguous,
    e.g. a common first name "Alex" shared by several characters.

    ``tokfreq`` defaults to the frequencies of ``characters`` plus ``surface``.
    """
    wanted = _norm(surface)
    for candidate in characters:
        if _norm(candidate.name) == wanted:
            return candidate
        if any(_norm(alias) == wanted for alias in candidate.aliases):
            return candidate

    surface_tokens = _tokens(surface)
    if not surface_tokens:
        return None
    if tokfreq is None:
        tokfreq = _token_freq(characters, [surface])

    safe: list[Character] = []
    ambiguous = False
    for candidate in characters:
        linked = False
        is_safe = False
        for form in _forms(candidate):
            form_tokens = _tokens(form)
            if not form_tokens:
                continue
            nested = surface_tokens <= form_tokens or form_tokens <= surface_tokens
            if not nested:
                continue
            linked = True
            shared = surface_tokens & form_tokens
            smaller_side = min(len(surface_tokens), len(form_tokens))
            if smaller_side >= 2 or all(tokfreq[t] <= 2 for t in shared):
                is_safe = True
        if is_safe:
            safe.append(candidate)
        elif linked:
            ambiguous = True

    if not safe and not ambiguous:
        return None
    if ambiguous or len(safe) > 1:
        return _AMBIGUOUS
    return safe[0]
|
||||
|
||||
|
||||
def canonical_of(a: str, b: str) -> str:
    """Return the more complete of two name variants (``a`` wins ties)."""
    if _completeness(a) >= _completeness(b):
        return a
    return b
|
||||
|
||||
|
||||
def _absorb(
    target: Character,
    name: str,
    *,
    gender: Optional[str] = None,
    age: Optional[str] = None,
    description: Optional[str] = None,
    voice_id: Optional[str] = None,
) -> None:
    """Merge the variant ``name`` into ``target`` (in-place mutation).

    Missing attributes of ``target`` are filled from the keyword arguments
    (existing truthy values are kept). The canonical name is then recomputed
    as the most complete form seen, and every other form is stored, sorted,
    in ``target.aliases``.
    """
    incoming = (
        ("gender", gender),
        ("age", age),
        ("description", description),
        ("voice_id", voice_id),
    )
    for attr, value in incoming:
        setattr(target, attr, getattr(target, attr) or value)

    # normalised form -> original spelling (the first spelling seen is kept)
    spellings: dict[str, str] = {}
    for raw in [target.name, *target.aliases, name]:
        cleaned = (raw or "").strip()
        if cleaned:
            spellings.setdefault(_norm(cleaned), cleaned)

    canon_key, canon_spelling = max(
        spellings.items(), key=lambda pair: _completeness(pair[1])
    )
    target.name = canon_spelling
    target.aliases = sorted(
        spelling for key, spelling in spellings.items() if key != canon_key
    )
|
||||
|
||||
|
||||
def _item(c) -> dict:
    """Normalise a ``Character`` or a raw name into a reconciliation entry.

    The entry always has the keys ``name``, ``gender``, ``age``,
    ``description`` and ``voice_id``; for a raw name everything but ``name``
    is ``None``.
    """
    optional_fields = ("gender", "age", "description", "voice_id")
    if isinstance(c, Character):
        entry = {"name": c.name}
        for field in optional_fields:
            entry[field] = getattr(c, field)
        return entry
    entry = {"name": str(c)}
    for field in optional_fields:
        entry[field] = None
    return entry
|
||||
|
||||
|
||||
def _find(chars: list[Character], name: str) -> Optional[Character]:
    """Return the first character whose name or alias equals ``name`` once normalised."""
    wanted = _norm(name)
    for candidate in chars:
        if _norm(candidate.name) == wanted:
            return candidate
        if any(_norm(alias) == wanted for alias in candidate.aliases):
            return candidate
    return None
|
||||
|
||||
|
||||
def _create(chars: list[Character], it: dict, name_map: dict[str, str]) -> None:
    """Append a new character built from entry ``it`` and record it in ``name_map``."""
    created = Character(
        name=it["name"].strip(),
        gender=it["gender"],
        age=it["age"],
        description=it["description"],
        voice_id=it["voice_id"],
    )
    chars.append(created)
    surface_key = _norm(it["name"])
    name_map[surface_key] = created.name
|
||||
|
||||
|
||||
def reconcile_characters(
    book_chars: list[Character],
    new_chars,
    gemma=None,
    *,
    speaker_names: Optional[list[str]] = None,
) -> tuple[list[Character], dict[str, str]]:
    """Reconcile newly detected characters into the book's cast.

    ``new_chars``: characters extracted from the chapter(s), as ``Character``
    objects (raw names are accepted too). Aliases carried by incoming
    ``Character`` objects are kept on the character they end up in, unless
    another character already owns that form.
    ``speaker_names``: raw speaker labels seen in the segments; they are
    absorbed as aliases so that voice resolution matches at render time.
    ``gemma``: when given, it settles the ambiguous cases; otherwise only the
    heuristic is used and ambiguous names stay separate characters.

    ``book_chars`` is not mutated (deep copies are reconciled).

    Returns ``(updated canonical list, map normalised surface -> canonical
    name)``. The map values are the FINAL canonical names, i.e. they are
    refreshed after all merges.
    """
    chars = [c.model_copy(deep=True) for c in book_chars]
    name_map: dict[str, str] = {}

    incoming = list(new_chars)  # iterated twice below
    items = [_item(c) for c in incoming]
    seen = {_norm(it["name"]) for it in items}
    for sp in (speaker_names or []):
        n = _norm(sp or "")
        if n and n not in seen and n not in {"narrateur", "inconnu", "?"}:
            items.append(_item(sp))
            seen.add(n)

    # Global token frequency (base + incoming) -> distinctiveness is stable
    # and does not depend on processing order.
    tokfreq = _token_freq(chars, [it["name"] for it in items])

    pending: list[dict] = []
    for it in items:
        m = heuristic_match(it["name"], chars, tokfreq)
        if m is _AMBIGUOUS:
            pending.append(it)
        elif m is not None:
            _absorb(m, it["name"], gender=it["gender"], age=it["age"],
                    description=it["description"], voice_id=it["voice_id"])
            name_map[_norm(it["name"])] = m.name
        elif gemma is not None:
            pending.append(it)  # may be a non-obvious variant ("Jim")
        else:
            _create(chars, it, name_map)

    if pending and gemma is not None:
        decisions = _gemma_reconcile(chars, pending, gemma)
        for it in pending:
            canon = decisions.get(_norm(it["name"]))
            target = _find(chars, canon) if isinstance(canon, str) else None
            if target is None:  # Gemma says NEW/unknown: last heuristic attempt
                hm = heuristic_match(it["name"], chars, tokfreq)
                target = hm if isinstance(hm, Character) else None
            if target is not None:
                _absorb(target, it["name"], gender=it["gender"], age=it["age"],
                        description=it["description"], voice_id=it["voice_id"])
                name_map[_norm(it["name"])] = target.name
            else:
                _create(chars, it, name_map)
    elif pending:
        # Without Gemma: ambiguous cases are not guessed, they stay distinct.
        for it in pending:
            _create(chars, it, name_map)

    # Carry over the aliases of incoming Character objects: _item() does not
    # transport them, so without this step re-reconciling an existing cast
    # would drop every alias that voice resolution relies on.
    for c in incoming:
        if not isinstance(c, Character) or not c.aliases:
            continue
        owner = _find(chars, c.name)
        if owner is None:
            continue
        for alias in c.aliases:
            if not (alias or "").strip():
                continue
            holder = _find(chars, alias)
            if holder is not None and holder is not owner:
                continue  # form already owned by another character: do not duplicate it
            _absorb(owner, alias)
            name_map[_norm(alias)] = owner.name

    # A later absorb can promote a more complete form to canonical name, which
    # leaves earlier map values stale; point every surface at the final name.
    for surface in name_map:
        owner = _find(chars, surface)
        if owner is not None:
            name_map[surface] = owner.name

    return chars, name_map
|
||||
|
||||
|
||||
def dedup_cast(characters: list[Character], gemma=None) -> list[Character]:
    """Collapse duplicates in an existing cast (assigned voices are kept).

    Two phases: (1) a heuristic-only reconciliation of the cast against an
    empty base, giving a reduced and safe list; (2) when ``gemma`` is given,
    a Gemma pass that attaches the short forms the heuristic left alone to a
    fuller name (e.g. "Okoye" -> "Elvi Okoye").
    """
    merged, _name_map = reconcile_characters([], characters, gemma=None)
    if gemma is not None:
        return _gemma_merge_pass(merged, gemma)
    return merged
|
||||
|
||||
|
||||
def _gemma_merge_pass(base: list[Character], gemma) -> list[Character]:
    """Attach short forms to a full name (anchor) with Gemma's help.

    The task is deliberately narrow: a "short form" is a character whose name
    tokens are a strict subset of another character's name tokens (e.g.
    "Okoye" vs "Elvi Okoye"); everything else is an anchor. Gemma maps each
    short form to the exact canonical name of an anchor, or to "NOUVEAU".
    Short forms are submitted in batches of 12.

    Returns ``base`` itself when there is no short form; otherwise deep copies
    of the anchors (with merged forms absorbed) followed by the short forms
    that could not be attached.
    """
    token_sets = [_tokens(character.name) for character in base]
    shorts: list[Character] = []
    anchors: list[Character] = []
    for idx, character in enumerate(base):
        own = token_sets[idx]
        is_short = bool(own) and any(
            other_idx != idx and own < other
            for other_idx, other in enumerate(token_sets)
        )
        if is_short:
            shorts.append(character)
        else:
            anchors.append(character)
    if not shorts:
        return base

    merged = [anchor.model_copy(deep=True) for anchor in anchors]
    unattached: list[Character] = []
    batch_size = 12
    for offset in range(0, len(shorts), batch_size):
        batch = shorts[offset:offset + batch_size]
        verdicts = _gemma_reconcile(merged, [_item(s) for s in batch], gemma)
        for short in batch:
            verdict = verdicts.get(_norm(short.name))
            target = _find(merged, verdict) if isinstance(verdict, str) else None
            if target is None:
                guess = heuristic_match(short.name, merged)
                if isinstance(guess, Character):
                    target = guess
            # Safety net: never merge two known, different genders.
            genders_clash = (
                target is not None
                and short.gender
                and target.gender
                and short.gender != target.gender
            )
            if genders_clash:
                target = None
            if target is None:
                unattached.append(short)
                continue
            _absorb(target, short.name, gender=short.gender, age=short.age,
                    description=short.description, voice_id=short.voice_id)
            for alias in short.aliases:
                _absorb(target, alias)
    return merged + unattached
|
||||
|
||||
|
||||
def _gemma_reconcile(
    chars: list[Character], pending: list[dict], gemma
) -> dict[str, object]:
    """Ask Gemma, in one grouped call, where each pending name belongs.

    Returns a dict keyed by the normalised pending name; each value is either
    the canonical name Gemma answered (a ``str``) or the ``_NEW`` sentinel.
    Names Gemma did not answer for (or answered with an invalid index) are
    absent from the dict.

    When the prompt would exceed ``_MAX_PROMPT_CHARS``, the list of known
    characters is shortened (trailing entries dropped) so that the names to
    classify, the instructions and the expected JSON format stay intact. A
    reply that is not a list is treated as "no decision".
    """
    known = []
    for c in chars:
        al = f" (alias: {', '.join(c.aliases)})" if c.aliases else ""
        desc = f" — {c.description}" if c.description else ""
        known.append(f"- {c.name}{al}{desc}")
    new_lines = []
    for n, it in enumerate(pending):
        desc = f" — {it['description']}" if it.get("description") else ""
        new_lines.append(f"[{n}] {it['name']}{desc}")

    head = "Personnages DEJA connus du livre :\n"
    tail = (
        "\n\nNoms DETECTES a classer :\n" + "\n".join(new_lines)
        + "\n\nPour chaque nom detecte, indique s'il designe un personnage deja "
        "connu (donne alors son nom canonique EXACT tel qu'ecrit ci-dessus) ou "
        "s'il s'agit d'un nouveau personnage (\"NOUVEAU\"). Ne fusionne que si "
        "c'est, avec certitude, la meme personne. EN CAS DE DOUTE, ou si "
        "plusieurs personnages connus pourraient correspondre, reponds "
        "\"NOUVEAU\". Ne rapproche jamais deux personnes differentes qui "
        "partagent seulement un prenom ou un nom de famille.\n\n"
        'Reponds par un tableau JSON: '
        '[{"i":0,"canonical":"James Holden"},{"i":1,"canonical":"NOUVEAU"}]'
    )

    # Spend the size budget on the instructions first and fit as many known
    # characters as possible; cutting the prompt's tail would remove the task
    # description and the JSON format instead.
    budget = _MAX_PROMPT_CHARS - len(head) - len(tail)
    kept: list[str] = []
    used = 0
    for line in known:
        cost = len(line) + (1 if kept else 0)  # +1 for the joining newline
        if used + cost > budget:
            break
        kept.append(line)
        used += cost
    prompt = head + ("\n".join(kept) if kept else "(aucun)") + tail
    if len(prompt) > _MAX_PROMPT_CHARS:
        # The pending list alone overflows: keep the hard cap as a last resort.
        prompt = prompt[:_MAX_PROMPT_CHARS]

    result = gemma.generate_json(prompt, system=get_settings().prompt_dedup)
    if not isinstance(result, (list, tuple)):
        return {}  # unusable reply: callers fall back to the heuristic

    decisions: dict[str, object] = {}
    for item in result:
        if not isinstance(item, dict) or "i" not in item:
            continue
        n = item["i"]
        canon = str(item.get("canonical") or "").strip()
        if isinstance(n, int) and 0 <= n < len(pending) and canon:
            decisions[_norm(pending[n]["name"])] = (
                _NEW if canon.upper() == "NOUVEAU" else canon)
    return decisions
|
||||
91
backend/inkflow/casting/voicebank.py
Normal file
91
backend/inkflow/casting/voicebank.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""Banque de voix : un jeu de voix variees (genre/age) auto-suffisant.
|
||||
|
||||
Chaque voix s'appuie sur une voix Kokoro (identite + clip de reference). Le clip
|
||||
de reference est genere une fois en lisant un passage francais standard ; il sert
|
||||
de reference de timbre pour le clonage Qwen3 (rendu final). Aucune ressource
|
||||
externe a sourcer.
|
||||
|
||||
Resolution moteur :
|
||||
- Kokoro -> VoiceSpec(preset=kokoro_voice) (rapide, preview / draft)
|
||||
- Qwen3 -> VoiceSpec(ref_audio=clip, ref_text=…) (qualite, clonage)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import soundfile as sf
|
||||
|
||||
from ..config import VOICEBANK_DIR
|
||||
from ..models import VoiceEntry, Voicebank
|
||||
from ..tts.base import VoiceSpec
|
||||
|
||||
# Reference passage read by every voice to create its cloning clip.
REFERENCE_TEXT = " ".join((
    "L'univers est toujours plus étrange qu'on ne le croit.",
    "Chaque nouvelle merveille pose les bases d'une découverte plus éblouissante encore.",
))

# Default voice set (varied in gender). ff_siwis is the only native French
# voice; the others borrow an English timbre but read text phonemised in French.
SEED: list[VoiceEntry] = [
    VoiceEntry(
        id="fr_f_siwis", kokoro_voice="ff_siwis",
        gender="female", age="adult", label="Siwis (FR)",
    ),
    VoiceEntry(
        id="f_bella", kokoro_voice="af_bella",
        gender="female", age="adult", label="Bella",
    ),
    VoiceEntry(
        id="f_heart", kokoro_voice="af_heart",
        gender="female", age="young", label="Heart",
    ),
    VoiceEntry(
        id="f_emma", kokoro_voice="bf_emma",
        gender="female", age="adult", label="Emma",
    ),
    VoiceEntry(
        id="f_nicole", kokoro_voice="af_nicole",
        gender="female", age="adult", label="Nicole",
    ),
    VoiceEntry(
        id="m_fenrir", kokoro_voice="am_fenrir",
        gender="male", age="adult", label="Fenrir",
    ),
    VoiceEntry(
        id="m_michael", kokoro_voice="am_michael",
        gender="male", age="adult", label="Michael",
    ),
    VoiceEntry(
        id="m_george", kokoro_voice="bm_george",
        gender="male", age="adult", label="George",
    ),
    VoiceEntry(
        id="m_lewis", kokoro_voice="bm_lewis",
        gender="male", age="adult", label="Lewis",
    ),
    VoiceEntry(
        id="m_eric", kokoro_voice="am_eric",
        gender="male", age="young", label="Eric",
    ),
    VoiceEntry(
        id="m_santa", kokoro_voice="am_santa",
        gender="male", age="old", label="Santa",
    ),
]
|
||||
|
||||
|
||||
def metadata_path() -> Path:
    """Return the path of the voice bank's ``metadata.json`` file."""
    return VOICEBANK_DIR.joinpath("metadata.json")
|
||||
|
||||
|
||||
def clips_dir() -> Path:
    """Return the directory that holds the reference clips."""
    return VOICEBANK_DIR.joinpath("clips")
|
||||
|
||||
|
||||
def load_voicebank() -> Voicebank:
    """Load the voice bank from ``metadata.json``, or fall back to the seed set.

    When the metadata file does not exist, a bank built from a copy of the
    ``SEED`` list is returned (nothing is written to disk).
    """
    meta_file = metadata_path()
    if not meta_file.exists():
        return Voicebank(entries=list(SEED))
    raw_json = meta_file.read_text(encoding="utf-8")
    return Voicebank.model_validate_json(raw_json)
|
||||
|
||||
|
||||
def save_voicebank(vb: Voicebank) -> Path:
    """Write the bank to ``metadata.json`` (UTF-8, indented) and return that path.

    The voice bank directory is created first when it is missing.
    """
    VOICEBANK_DIR.mkdir(parents=True, exist_ok=True)
    destination = metadata_path()
    serialized = vb.model_dump_json(indent=2)
    destination.write_text(serialized, encoding="utf-8")
    return destination
|
||||
|
||||
|
||||
def build_voicebank(*, regenerate: bool = False) -> Voicebank:
    """Generate the missing reference clips and write ``metadata.json``.

    For every ``SEED`` voice, the clip ``clips/<id>.wav`` is synthesised with
    the Kokoro backend when it does not exist yet, or unconditionally when
    ``regenerate`` is true. Each resulting entry records the clip's relative
    path and the reference text. The bank is saved and returned.
    """
    from ..tts.kokoro import KokoroBackend

    clips_dir().mkdir(parents=True, exist_ok=True)
    engine = KokoroBackend()

    built: list[VoiceEntry] = []
    for voice in SEED:
        relative_clip = f"clips/{voice.id}.wav"
        clip_file = VOICEBANK_DIR / relative_clip
        must_render = regenerate or not clip_file.exists()
        if must_render:
            samples, sample_rate = engine.synthesize(
                REFERENCE_TEXT, VoiceSpec(preset=voice.kokoro_voice)
            )
            sf.write(str(clip_file), samples, sample_rate)
        # The relative path is stored in the entry.
        enriched = voice.model_copy(
            update={"ref_audio": relative_clip, "ref_text": REFERENCE_TEXT}
        )
        built.append(enriched)

    bank = Voicebank(entries=built)
    save_voicebank(bank)
    return bank
|
||||
|
||||
|
||||
def voice_spec_for(entry: VoiceEntry, engine: str, *, speed: float = 1.0) -> VoiceSpec:
    """Build the VoiceSpec suited to the target engine.

    For ``engine == "qwen3"`` with a reference clip available, the spec points
    at the clip (resolved under the voice bank directory) and its reference
    text. In every other case the Kokoro preset of the entry is used.
    """
    wants_clone = engine == "qwen3" and entry.ref_audio
    if not wants_clone:
        return VoiceSpec(preset=entry.kokoro_voice, speed=speed)
    clip_path = str(VOICEBANK_DIR / entry.ref_audio)
    return VoiceSpec(ref_audio=clip_path, ref_text=entry.ref_text, speed=speed)
|
||||
Reference in New Issue
Block a user