"""Backend Qwen3-TTS (qualite + clonage par audio de reference) — rendu final. Deux modes : - voix preglee : `voice` (ex "Chelsie") + `language` ("French"). - clonage : `ref_audio` (+ `ref_text` transcription du clip) pour imiter une voix de la voicebank, attribuee a un personnage. """ from __future__ import annotations import numpy as np from ..settings import get_settings from .base import TTSBackend, VoiceSpec, to_mono_float32 from .chunk import chunk_text # Qwen3 tolere des sequences plus longues que Kokoro, mais on borne quand meme. _QWEN_MAX_CHARS = 500 class Qwen3Backend(TTSBackend): name = "qwen3" def __init__(self, model_id: str | None = None, language: str | None = None): settings = get_settings() self.model_id = model_id or settings.qwen3_model self.language = language or settings.language self._model = None self._sample_rate = 24000 def _ensure_loaded(self) -> None: if self._model is None: from mlx_audio.tts.utils import load_model self._model = load_model(self.model_id) def default_voice(self) -> VoiceSpec: return VoiceSpec(preset=get_settings().qwen3_default_voice) def _gen_kwargs(self, voice: VoiceSpec) -> dict: kwargs: dict = {"language": self.language, "speed": voice.speed} if voice.ref_audio: # mode clonage kwargs["ref_audio"] = voice.ref_audio if voice.ref_text: kwargs["ref_text"] = voice.ref_text else: # mode voix preglee kwargs["voice"] = voice.preset or get_settings().qwen3_default_voice return kwargs def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]: self._ensure_loaded() kwargs = self._gen_kwargs(voice) pieces: list[np.ndarray] = [] for chunk in chunk_text(text, max_chars=_QWEN_MAX_CHARS): for result in self._model.generate(text=chunk, **kwargs): self._sample_rate = getattr(result, "sample_rate", self._sample_rate) pieces.append(to_mono_float32(result.audio)) if not pieces: return np.zeros(0, dtype=np.float32), self._sample_rate return np.concatenate(pieces), self._sample_rate