Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)
This commit is contained in:
93
backend/inkflow/tts/kokoro.py
Normal file
93
backend/inkflow/tts/kokoro.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Backend Kokoro (rapide, voix preglees) — ideal pour les previews.
|
||||
|
||||
Kokoro tronque les textes longs : on synthetise morceau par morceau (decoupage
|
||||
par phrases) puis on concatene. Le francais passe par espeak-ng via phonemizer.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..config import setup_espeak
|
||||
from ..settings import get_settings
|
||||
from .base import TTSBackend, VoiceSpec, to_mono_float32
|
||||
from .chunk import chunk_text
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Le port MLX de Kokoro a un bug d'alignement intermittent (mx.random.normal
|
||||
# dans le generateur harmonique) qui leve un broadcast_shapes sur certains
|
||||
# tirages. Comme c'est aleatoire, un simple retry suffit le plus souvent ;
|
||||
# en dernier recours on coupe le morceau en deux.
|
||||
# Number of generation attempts made on one chunk before falling back to
# splitting that chunk in two.
_KOKORO_RETRIES = 8
|
||||
|
||||
|
||||
class KokoroBackend(TTSBackend):
    """TTS backend driving a Kokoro model loaded through ``mlx_audio``.

    The model is loaded lazily on the first synthesis call. Input text is
    synthesized chunk by chunk (as produced by ``chunk_text``) and the
    resulting audio pieces are concatenated into a single array.
    """

    name = "kokoro"

    def __init__(self, model_id: str | None = None, lang_code: str | None = None):
        """Configure the backend without loading the model.

        Args:
            model_id: Model identifier; falls back to ``settings.kokoro_model``
                when omitted or empty.
            lang_code: Language code forwarded to the model; falls back to
                ``settings.kokoro_lang_code`` when omitted or empty.
        """
        setup_espeak()
        settings = get_settings()
        self.model_id = model_id or settings.kokoro_model
        self.lang_code = lang_code or settings.kokoro_lang_code
        self._model = None
        # Initial value; replaced by whatever the generation results report.
        self._sample_rate = 24000

    def _ensure_loaded(self) -> None:
        """Load the model on first use; later calls are no-ops."""
        if self._model is None:
            # Imported here so the dependency is only required once a
            # synthesis is actually requested.
            from mlx_audio.tts.utils import load_model

            self._model = load_model(self.model_id)

    def default_voice(self) -> VoiceSpec:
        """Return a ``VoiceSpec`` using the preset configured in the settings."""
        return VoiceSpec(preset=get_settings().kokoro_default_voice)

    def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]:
        """Synthesize ``text`` and return ``(audio, sample_rate)``.

        Args:
            text: Text to speak; it is split with ``chunk_text`` first.
            voice: Voice to use. ``voice.preset`` falls back to the configured
                default voice when empty; ``voice.speed`` is forwarded as-is.

        Returns:
            The concatenated audio and the current sample rate. When nothing
            could be generated, an empty ``float32`` array is returned.
        """
        self._ensure_loaded()
        preset = voice.preset or get_settings().kokoro_default_voice
        pieces: list[np.ndarray] = []
        for chunk in chunk_text(text):
            pieces.extend(self._gen_resilient(chunk, preset, voice.speed))
        if not pieces:
            return np.zeros(0, dtype=np.float32), self._sample_rate
        return np.concatenate(pieces), self._sample_rate

    def _gen_once(self, text: str, preset: str, speed: float) -> list[np.ndarray]:
        """Run a single generation pass and collect the audio of each result.

        Exceptions raised by the model propagate to the caller; any audio
        collected before the failure is discarded with the local list.
        """
        out: list[np.ndarray] = []
        for result in self._model.generate(
            text=text, voice=preset, speed=speed, lang_code=self.lang_code,
        ):
            # Keep the previous rate when a result does not expose one.
            self._sample_rate = getattr(result, "sample_rate", self._sample_rate)
            out.append(to_mono_float32(result.audio))
        return out

    def _gen_resilient(self, text: str, preset: str, speed: float,
                       depth: int = 0) -> list[np.ndarray]:
        """Generate one chunk with retries, then fall back to splitting it.

        Args:
            text: Chunk to synthesize.
            preset: Voice preset forwarded to the model.
            speed: Speed forwarded to the model.
            depth: Current split depth; splitting stops once it reaches 3.

        Returns:
            The generated audio pieces, or an empty list when the chunk is
            blank or had to be abandoned. This method never raises for a
            generation failure.
        """
        # Nothing to say: do not burn the retry/split budget on blank text.
        if not text.strip():
            return []

        last_error: Exception | None = None
        for attempt in range(1, _KOKORO_RETRIES + 1):
            try:
                return self._gen_once(text, preset, speed)
            except Exception as exc:  # noqa: BLE001 — intermittent vocoder bug
                # Remember the cause so that a persistent failure can be
                # diagnosed instead of vanishing silently.
                last_error = exc
                logger.debug(
                    "Kokoro: attempt %d/%d failed: %r",
                    attempt, _KOKORO_RETRIES, exc,
                )

        # Still failing: cut the chunk in two and retry each half.
        if depth < 3 and len(text) > 40:
            mid = _split_point(text)
            left = self._gen_resilient(text[:mid].strip(), preset, speed, depth + 1)
            right = self._gen_resilient(text[mid:].strip(), preset, speed, depth + 1)
            return left + right

        logger.warning(
            "Kokoro: morceau abandonne apres echecs: %r", text[:60],
            exc_info=last_error,
        )
        return []
|
||||
|
||||
|
||||
def _split_point(text: str) -> int:
|
||||
"""Point de coupe au plus proche du milieu (espace de preference)."""
|
||||
mid = len(text) // 2
|
||||
left = text.rfind(" ", 0, mid)
|
||||
right = text.find(" ", mid)
|
||||
if left == -1 and right == -1:
|
||||
return mid
|
||||
if left == -1:
|
||||
return right
|
||||
if right == -1:
|
||||
return left
|
||||
return left if (mid - left) <= (right - mid) else right
|
||||
Reference in New Issue
Block a user