Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)
This commit is contained in:
58
backend/inkflow/tts/qwen3.py
Normal file
58
backend/inkflow/tts/qwen3.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Backend Qwen3-TTS (qualite + clonage par audio de reference) — rendu final.
|
||||
|
||||
Deux modes :
|
||||
- voix prereglee : `voice` (ex "Chelsie") + `language` ("French").
|
||||
- clonage : `ref_audio` (+ `ref_text` transcription du clip) pour imiter une
|
||||
voix de la voicebank, attribuee a un personnage.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..settings import get_settings
|
||||
from .base import TTSBackend, VoiceSpec, to_mono_float32
|
||||
from .chunk import chunk_text
|
||||
|
||||
# Qwen3 tolerates longer sequences than Kokoro, but we still bound the length.
# Passed as `max_chars` to `chunk_text` in `Qwen3Backend.synthesize`.
|
||||
_QWEN_MAX_CHARS = 500
|
||||
|
||||
|
||||
class Qwen3Backend(TTSBackend):
    """Qwen3-TTS backend with two modes: preset voice or reference-audio cloning.

    The model is not loaded at construction time; it is loaded on the first
    call to `synthesize` (see `_ensure_loaded`).
    """

    name = "qwen3"

    def __init__(self, model_id: str | None = None, language: str | None = None):
        """Store the model identifier and language, without loading the model.

        Args:
            model_id: Model identifier handed to `load_model`; when falsy, the
                `qwen3_model` setting is used instead.
            language: Language forwarded to generation; when falsy, the
                `language` setting is used instead.
        """
        cfg = get_settings()
        self.model_id = model_id if model_id else cfg.qwen3_model
        self.language = language if language else cfg.language
        # Populated lazily by `_ensure_loaded`.
        self._model = None
        # Initial sample rate; overwritten by any generation result that
        # exposes a `sample_rate` attribute.
        self._sample_rate = 24000

    def _ensure_loaded(self) -> None:
        """Load the model on first use; do nothing if it is already loaded."""
        if self._model is not None:
            return
        # Imported here rather than at module level, so `mlx_audio` is only
        # needed once a model is actually loaded.
        from mlx_audio.tts.utils import load_model

        self._model = load_model(self.model_id)

    def default_voice(self) -> VoiceSpec:
        """Return a `VoiceSpec` whose preset is the `qwen3_default_voice` setting."""
        preset_name = get_settings().qwen3_default_voice
        return VoiceSpec(preset=preset_name)

    def _gen_kwargs(self, voice: VoiceSpec) -> dict:
        """Build the keyword arguments passed to the model's `generate` call.

        `language` and `speed` are always present. When `voice.ref_audio` is
        truthy (cloning mode) `ref_audio` is added, plus `ref_text` if it is
        truthy. Otherwise (preset mode) `voice` is set to `voice.preset`, or to
        the `qwen3_default_voice` setting when the preset is falsy.
        """
        options: dict = {"language": self.language, "speed": voice.speed}
        if not voice.ref_audio:
            # Preset-voice mode.
            options["voice"] = voice.preset or get_settings().qwen3_default_voice
            return options
        # Cloning mode.
        options["ref_audio"] = voice.ref_audio
        if voice.ref_text:
            options["ref_text"] = voice.ref_text
        return options

    def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]:
        """Synthesize `text` chunk by chunk and return `(audio, sample_rate)`.

        The text is split with `chunk_text` (at most `_QWEN_MAX_CHARS` per
        chunk). Every result yielded by the model for each chunk is converted
        with `to_mono_float32` and the pieces are concatenated in order.

        Returns:
            The concatenated audio and the current sample rate. If nothing was
            generated, the audio is an empty float32 array.
        """
        self._ensure_loaded()
        gen_options = self._gen_kwargs(voice)
        segments: list[np.ndarray] = []
        for part in chunk_text(text, max_chars=_QWEN_MAX_CHARS):
            for out in self._model.generate(text=part, **gen_options):
                # Keep the rate reported by the result when it provides one.
                self._sample_rate = getattr(out, "sample_rate", self._sample_rate)
                segments.append(to_mono_float32(out.audio))
        audio = np.concatenate(segments) if segments else np.zeros(0, dtype=np.float32)
        return audio, self._sample_rate
|
||||
Reference in New Issue
Block a user