"""Backend Qwen3-TTS (qualite + clonage par audio de reference) — rendu final. Deux modes : - voix preglee : `voice` (ex "Chelsie") + `language` ("French"). - clonage : `ref_audio` (+ `ref_text` transcription du clip) pour imiter une voix de la voicebank, attribuee a un personnage. """ from __future__ import annotations import logging import numpy as np from ..settings import get_settings from .base import TTSBackend, VoiceSpec, to_mono_float32 from .chunk import chunk_text logger = logging.getLogger(__name__) # Qwen3 tolere des sequences plus longues que Kokoro, mais on borne quand meme. _QWEN_MAX_CHARS = 500 # Garde-fou anti-derive : Qwen3 part parfois en boucle (audio 50x trop long) ou # s'arrete net (sortie ~0 s). On estime la duree plausible d'un chunk depuis sa # longueur (~15 caracteres/s en francais) et on rejette/reessaie les sorties hors # bornes. Stochastique (temperature) -> un retry change le tirage. _CHARS_PER_SEC = 15.0 _QWEN_RETRIES = 3 _MIN_FLOOR_SEC = 0.3 # en deca = generation echouee (silence) def _bounds(n_chars: int) -> tuple[float, float, float]: """(attendu, min, max) en secondes pour un chunk de `n_chars` caracteres.""" expected = max(1.0, n_chars / _CHARS_PER_SEC) return expected, max(_MIN_FLOOR_SEC, 0.4 * expected), 2.5 * expected + 2.0 class Qwen3Backend(TTSBackend): name = "qwen3" def __init__(self, model_id: str | None = None, language: str | None = None): settings = get_settings() self.model_id = model_id or settings.qwen3_model self.language = language or settings.language self._model = None self._sample_rate = 24000 def _ensure_loaded(self) -> None: if self._model is None: from mlx_audio.tts.utils import load_model self._model = load_model(self.model_id) def default_voice(self) -> VoiceSpec: return VoiceSpec(preset=get_settings().qwen3_default_voice) def _gen_kwargs(self, voice: VoiceSpec) -> dict: kwargs: dict = {"language": self.language, "speed": voice.speed} if voice.ref_audio: # mode clonage kwargs["ref_audio"] = voice.ref_audio if voice.ref_text: kwargs["ref_text"] = voice.ref_text else: # mode voix preglee kwargs["voice"] = voice.preset or get_settings().qwen3_default_voice return kwargs def _gen_chunk_once(self, chunk: str, kwargs: dict) -> np.ndarray: """Genere l'audio (concatene) d'un chunk en un tirage.""" out: list[np.ndarray] = [] for result in self._model.generate(text=chunk, **kwargs): self._sample_rate = getattr(result, "sample_rate", self._sample_rate) out.append(to_mono_float32(result.audio)) return np.concatenate(out) if out else np.zeros(0, dtype=np.float32) def _gen_chunk_guarded(self, chunk: str, kwargs: dict) -> np.ndarray: """Genere un chunk en rejetant les sorties aberrantes (boucle / coupure). Retourne le 1er tirage dans les bornes ; sinon la tentative la plus proche de la duree attendue (en excluant les silences et les derives extremes). """ sr = self._sample_rate expected, lo, hi = _bounds(len(chunk)) attempts: list[np.ndarray] = [] for i in range(_QWEN_RETRIES): audio = self._gen_chunk_once(chunk, kwargs) dur = len(audio) / sr if lo <= dur <= hi: if i: logger.info("Qwen3: chunk OK au retry %d (%.1fs)", i, dur) return audio logger.warning("Qwen3: sortie aberrante %.1fs (attendu ~%.1fs) — retry", dur, expected) attempts.append(audio) # Aucune tentative dans les bornes : on garde la moins mauvaise (ni # silence ni derive), la plus proche de l'attendu. valid = [a for a in attempts if _MIN_FLOOR_SEC <= len(a) / sr <= hi] or attempts best = min(valid, key=lambda a: abs(len(a) / sr - expected)) logger.warning("Qwen3: chunk non stabilise apres %d essais, garde %.1fs: %r", _QWEN_RETRIES, len(best) / sr, chunk[:60]) return best def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]: self._ensure_loaded() kwargs = self._gen_kwargs(voice) pieces = [self._gen_chunk_guarded(chunk, kwargs) for chunk in chunk_text(text, max_chars=_QWEN_MAX_CHARS)] pieces = [p for p in pieces if len(p)] if not pieces: return np.zeros(0, dtype=np.float32), self._sample_rate return np.concatenate(pieces), self._sample_rate