"""Backend Kokoro (rapide, voix preglees) — ideal pour les previews. Kokoro tronque les textes longs : on synthetise morceau par morceau (decoupage par phrases) puis on concatene. Le francais passe par espeak-ng via phonemizer. """ from __future__ import annotations import logging import numpy as np from ..config import setup_espeak from ..settings import get_settings from .base import TTSBackend, VoiceSpec, to_mono_float32 from .chunk import chunk_text logger = logging.getLogger(__name__) # Le port MLX de Kokoro a un bug d'alignement intermittent (mx.random.normal # dans le generateur harmonique) qui leve un broadcast_shapes sur certains # tirages. Comme c'est aleatoire, un simple retry suffit le plus souvent ; # en dernier recours on coupe le morceau en deux. _KOKORO_RETRIES = 8 class KokoroBackend(TTSBackend): name = "kokoro" def __init__(self, model_id: str | None = None, lang_code: str | None = None): setup_espeak() settings = get_settings() self.model_id = model_id or settings.kokoro_model self.lang_code = lang_code or settings.kokoro_lang_code self._model = None self._sample_rate = 24000 def _ensure_loaded(self) -> None: if self._model is None: from mlx_audio.tts.utils import load_model self._model = load_model(self.model_id) def default_voice(self) -> VoiceSpec: return VoiceSpec(preset=get_settings().kokoro_default_voice) def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]: self._ensure_loaded() preset = voice.preset or get_settings().kokoro_default_voice pieces: list[np.ndarray] = [] for chunk in chunk_text(text): pieces.extend(self._gen_resilient(chunk, preset, voice.speed)) if not pieces: return np.zeros(0, dtype=np.float32), self._sample_rate return np.concatenate(pieces), self._sample_rate def _gen_once(self, text: str, preset: str, speed: float) -> list[np.ndarray]: out: list[np.ndarray] = [] for result in self._model.generate( text=text, voice=preset, speed=speed, lang_code=self.lang_code, ): self._sample_rate = getattr(result, "sample_rate", self._sample_rate) out.append(to_mono_float32(result.audio)) return out def _gen_resilient(self, text: str, preset: str, speed: float, depth: int = 0) -> list[np.ndarray]: """Genere un morceau avec retries, puis re-decoupe en secours.""" for _ in range(_KOKORO_RETRIES): try: return self._gen_once(text, preset, speed) except Exception: # noqa: BLE001 — bug intermittent du vocoder continue # Toujours en echec : on coupe en deux et on reessaie chaque moitie. if depth < 3 and len(text) > 40: mid = _split_point(text) left = self._gen_resilient(text[:mid].strip(), preset, speed, depth + 1) right = self._gen_resilient(text[mid:].strip(), preset, speed, depth + 1) return left + right logger.warning("Kokoro: morceau abandonne apres echecs: %r", text[:60]) return [] def _split_point(text: str) -> int: """Point de coupe au plus proche du milieu (espace de preference).""" mid = len(text) // 2 left = text.rfind(" ", 0, mid) right = text.find(" ", mid) if left == -1 and right == -1: return mid if left == -1: return right if right == -1: return left return left if (mid - left) <= (right - mid) else right