Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)
This commit is contained in:
0
backend/inkflow/tts/__init__.py
Normal file
0
backend/inkflow/tts/__init__.py
Normal file
48
backend/inkflow/tts/base.py
Normal file
48
backend/inkflow/tts/base.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""Abstraction des moteurs TTS (backend pluggable).
|
||||
|
||||
Deux implementations : Kokoro (rapide, voix preglees -> previews) et Qwen3-TTS
|
||||
(qualite + clonage par audio de reference -> rendu final). Toutes deux renvoient
|
||||
de l'audio mono float32 + une frequence d'echantillonnage.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@dataclass
class VoiceSpec:
    """Describe the voice to use for one synthesis call.

    - `preset`: name of a built-in voice (Kokoro: "ff_siwis"; Qwen3: "Chelsie").
    - `ref_audio` / `ref_text`: reference clip used for voice cloning (Qwen3).
    - `speed`: speed value handed to the engine; defaults to 1.0.
    """
    preset: Optional[str] = None
    ref_audio: Optional[str] = None
    ref_text: Optional[str] = None
    speed: float = 1.0
|
||||
|
||||
|
||||
class TTSBackend(ABC):
    """Common interface shared by every TTS engine."""

    # Short identifier of the engine; the base class uses "base".
    name: str = "base"

    @abstractmethod
    def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]:
        """Synthesize `text` and return (mono float32 audio, sample_rate)."""

    def default_voice(self) -> VoiceSpec:
        """Return this backend's default voice.

        The base implementation returns a `VoiceSpec` with all defaults.
        """
        return VoiceSpec()
|
||||
|
||||
|
||||
def to_mono_float32(audio) -> np.ndarray:
    """Normalize a model output (mx.array / np / list) to mono float32.

    Args:
        audio: Any value `np.asarray` can convert to a float32 array.

    Returns:
        A contiguous 1-D float32 array. Multi-dimensional input is averaged
        down to a single axis.
    """
    arr = np.asarray(audio, dtype=np.float32)
    # Collapse one axis per pass until only one is left. A single pass is
    # enough for (channels, n) / (n, channels); repeating it also handles
    # inputs with a leading batch axis such as (1, channels, n), which would
    # otherwise be flattened with the channels laid end to end.
    while arr.ndim > 1:
        # The shorter of the first/last axis is taken to be the channel axis.
        arr = arr.mean(axis=0) if arr.shape[0] < arr.shape[-1] else arr.mean(axis=-1)
    return np.ascontiguousarray(arr.reshape(-1))
|
||||
62
backend/inkflow/tts/chunk.py
Normal file
62
backend/inkflow/tts/chunk.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""Decoupage de texte en morceaux synthese-friendly.
|
||||
|
||||
Les modeles TTS (Kokoro notamment) tronquent les textes trop longs. On decoupe
|
||||
donc sur les frontieres de phrases en respectant une longueur max par morceau.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# Sentence end: strong punctuation followed by whitespace, or a run of newlines.
_SENTENCE_END_RE = re.compile(r"(?<=[.!?…])\s+|\n+")
# For very long sentences, also break after commas, semicolons and colons.
_SOFT_BREAK_RE = re.compile(r"(?<=[,;:])\s+")

# Default upper bound, in characters, for one chunk.
DEFAULT_MAX_CHARS = 350
|
||||
|
||||
|
||||
def split_sentences(text: str) -> list[str]:
    """Split `text` on sentence boundaries.

    Each fragment is stripped of surrounding whitespace; empty fragments are
    dropped.
    """
    sentences: list[str] = []
    for fragment in _SENTENCE_END_RE.split(text):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
|
||||
|
||||
|
||||
def _split_long(sentence: str, max_chars: int) -> list[str]:
|
||||
"""Coupe une phrase trop longue sur les virgules, puis par fenetre dure."""
|
||||
if len(sentence) <= max_chars:
|
||||
return [sentence]
|
||||
out: list[str] = []
|
||||
buf = ""
|
||||
for piece in _SOFT_BREAK_RE.split(sentence):
|
||||
cand = f"{buf} {piece}".strip()
|
||||
if len(cand) <= max_chars:
|
||||
buf = cand
|
||||
else:
|
||||
if buf:
|
||||
out.append(buf)
|
||||
if len(piece) <= max_chars:
|
||||
buf = piece
|
||||
else: # mot/segment plus long que la fenetre : coupe brute
|
||||
for i in range(0, len(piece), max_chars):
|
||||
out.append(piece[i:i + max_chars])
|
||||
buf = ""
|
||||
if buf:
|
||||
out.append(buf)
|
||||
return out
|
||||
|
||||
|
||||
def chunk_text(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> list[str]:
    """Group sentences into chunks of at most `max_chars` characters.

    The text is split into sentences, each sentence is passed through
    `_split_long`, and consecutive parts are joined with a single space for
    as long as the result still fits. Sentences stay whole unless
    `_split_long` had to break them up.
    """
    parts = (
        part
        for sentence in split_sentences(text)
        for part in _split_long(sentence, max_chars)
    )
    result: list[str] = []
    pending = ""
    for part in parts:
        merged = f"{pending} {part}".strip()
        if len(merged) <= max_chars:
            pending = merged
            continue
        # The part does not fit: flush what is pending and start a new chunk.
        if pending:
            result.append(pending)
        pending = part
    if pending:
        result.append(pending)
    return result
|
||||
20
backend/inkflow/tts/factory.py
Normal file
20
backend/inkflow/tts/factory.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Selection du backend TTS par nom (pluggable)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
|
||||
from .base import TTSBackend
|
||||
|
||||
BACKENDS = ("kokoro", "qwen3")


@lru_cache(maxsize=len(BACKENDS))
def _load_backend(name: str) -> TTSBackend:
    """Build the backend for an already lower-cased `name` (cached)."""
    if name == "kokoro":
        from .kokoro import KokoroBackend
        return KokoroBackend()
    if name == "qwen3":
        from .qwen3 import Qwen3Backend
        return Qwen3Backend()
    raise ValueError(f"Backend TTS inconnu: {name!r} (dispo: {', '.join(BACKENDS)})")


def get_backend(name: str = "kokoro") -> TTSBackend:
    """Return the shared backend instance registered under `name`.

    The name is lower-cased before the cache lookup, so `get_backend()`,
    `get_backend("kokoro")`, `get_backend(name="kokoro")` and
    `get_backend("Kokoro")` all return the same instance instead of each
    building its own backend.

    Args:
        name: Backend name, case-insensitive; one of `BACKENDS`.

    Returns:
        The cached backend instance.

    Raises:
        ValueError: If `name` is not a known backend.
    """
    return _load_backend(name.lower())


# Keep the lru_cache helpers available on the public function.
get_backend.cache_clear = _load_backend.cache_clear
get_backend.cache_info = _load_backend.cache_info
|
||||
93
backend/inkflow/tts/kokoro.py
Normal file
93
backend/inkflow/tts/kokoro.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Backend Kokoro (rapide, voix preglees) — ideal pour les previews.
|
||||
|
||||
Kokoro tronque les textes longs : on synthetise morceau par morceau (decoupage
|
||||
par phrases) puis on concatene. Le francais passe par espeak-ng via phonemizer.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..config import setup_espeak
|
||||
from ..settings import get_settings
|
||||
from .base import TTSBackend, VoiceSpec, to_mono_float32
|
||||
from .chunk import chunk_text
|
||||
|
||||
logger = logging.getLogger(__name__)

# The MLX port of Kokoro has an intermittent alignment bug (mx.random.normal
# in the harmonic generator) that raises a broadcast_shapes error on some
# draws. Because it is random, a plain retry is usually enough; as a last
# resort the chunk is cut in two.
_KOKORO_RETRIES = 8
|
||||
|
||||
|
||||
class KokoroBackend(TTSBackend):
    """Kokoro engine: text is synthesized chunk by chunk, then concatenated."""

    name = "kokoro"

    def __init__(self, model_id: str | None = None, lang_code: str | None = None):
        """Configure the backend; the model itself is loaded lazily.

        Args:
            model_id: Model identifier; falls back to `settings.kokoro_model`.
            lang_code: Language code; falls back to `settings.kokoro_lang_code`.
        """
        setup_espeak()
        settings = get_settings()
        self.model_id = model_id or settings.kokoro_model
        self.lang_code = lang_code or settings.kokoro_lang_code
        self._model = None
        # Starting value; replaced by whatever the model results report.
        self._sample_rate = 24000

    def _ensure_loaded(self) -> None:
        """Load the model on first use."""
        if self._model is None:
            from mlx_audio.tts.utils import load_model
            self._model = load_model(self.model_id)

    def default_voice(self) -> VoiceSpec:
        """Return a `VoiceSpec` using the configured default Kokoro preset."""
        return VoiceSpec(preset=get_settings().kokoro_default_voice)

    def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]:
        """Synthesize `text` and return (mono float32 audio, sample_rate).

        The text is cut with `chunk_text`; each chunk is generated with
        retries. An empty float32 array is returned when nothing was produced.
        """
        self._ensure_loaded()
        preset = voice.preset or get_settings().kokoro_default_voice
        pieces: list[np.ndarray] = []
        for chunk in chunk_text(text):
            pieces.extend(self._gen_resilient(chunk, preset, voice.speed))
        if not pieces:
            return np.zeros(0, dtype=np.float32), self._sample_rate
        return np.concatenate(pieces), self._sample_rate

    def _gen_once(self, text: str, preset: str, speed: float) -> list[np.ndarray]:
        """Run one generation pass and collect the audio of every result."""
        out: list[np.ndarray] = []
        for result in self._model.generate(
            text=text, voice=preset, speed=speed, lang_code=self.lang_code,
        ):
            # Adopt the sample rate reported by the result when it has one.
            self._sample_rate = getattr(result, "sample_rate", self._sample_rate)
            out.append(to_mono_float32(result.audio))
        return out

    def _gen_resilient(self, text: str, preset: str, speed: float,
                       depth: int = 0) -> list[np.ndarray]:
        """Generate one chunk with retries, then fall back to re-splitting.

        Args:
            text: Chunk to synthesize.
            preset: Voice preset name.
            speed: Speed value forwarded to the model.
            depth: Current re-split depth (bounded at 3).

        Returns:
            The generated audio pieces, or an empty list when the chunk had
            to be abandoned (a warning carrying the last error is logged).
        """
        last_error: Exception | None = None
        for attempt in range(1, _KOKORO_RETRIES + 1):
            try:
                return self._gen_once(text, preset, speed)
            except Exception as exc:  # noqa: BLE001 — intermittent vocoder bug
                # Keep the cause so a persistent failure is diagnosable
                # instead of vanishing silently.
                last_error = exc
                logger.debug(
                    "Kokoro: attempt %d/%d failed: %r",
                    attempt, _KOKORO_RETRIES, exc,
                )
        # Still failing: cut the chunk in two and retry each half.
        if depth < 3 and len(text) > 40:
            mid = _split_point(text)
            left = self._gen_resilient(text[:mid].strip(), preset, speed, depth + 1)
            right = self._gen_resilient(text[mid:].strip(), preset, speed, depth + 1)
            return left + right
        logger.warning(
            "Kokoro: morceau abandonne apres echecs: %r", text[:60],
            exc_info=last_error,
        )
        return []
|
||||
|
||||
|
||||
def _split_point(text: str) -> int:
|
||||
"""Point de coupe au plus proche du milieu (espace de preference)."""
|
||||
mid = len(text) // 2
|
||||
left = text.rfind(" ", 0, mid)
|
||||
right = text.find(" ", mid)
|
||||
if left == -1 and right == -1:
|
||||
return mid
|
||||
if left == -1:
|
||||
return right
|
||||
if right == -1:
|
||||
return left
|
||||
return left if (mid - left) <= (right - mid) else right
|
||||
58
backend/inkflow/tts/qwen3.py
Normal file
58
backend/inkflow/tts/qwen3.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Backend Qwen3-TTS (qualite + clonage par audio de reference) — rendu final.
|
||||
|
||||
Deux modes :
|
||||
- voix preglee : `voice` (ex "Chelsie") + `language` ("French").
|
||||
- clonage : `ref_audio` (+ `ref_text` transcription du clip) pour imiter une
|
||||
voix de la voicebank, attribuee a un personnage.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..settings import get_settings
|
||||
from .base import TTSBackend, VoiceSpec, to_mono_float32
|
||||
from .chunk import chunk_text
|
||||
|
||||
# Qwen3 tolerates longer sequences than Kokoro, but the length is still capped.
_QWEN_MAX_CHARS = 500
|
||||
|
||||
|
||||
class Qwen3Backend(TTSBackend):
    """Qwen3-TTS engine with preset voices or cloning from a reference clip."""

    name = "qwen3"

    def __init__(self, model_id: str | None = None, language: str | None = None):
        """Store the configuration; the model is loaded on first synthesis."""
        cfg = get_settings()
        self.model_id = model_id or cfg.qwen3_model
        self.language = language or cfg.language
        self._model = None
        self._sample_rate = 24000

    def _ensure_loaded(self) -> None:
        """Load the model unless it is already in memory."""
        if self._model is not None:
            return
        from mlx_audio.tts.utils import load_model
        self._model = load_model(self.model_id)

    def default_voice(self) -> VoiceSpec:
        """Return a `VoiceSpec` using the configured default Qwen3 preset."""
        return VoiceSpec(preset=get_settings().qwen3_default_voice)

    def _gen_kwargs(self, voice: VoiceSpec) -> dict:
        """Build the keyword arguments passed to the model for `voice`."""
        options: dict = {"language": self.language, "speed": voice.speed}
        if not voice.ref_audio:
            # Preset mode: named voice, falling back to the configured default.
            options["voice"] = voice.preset or get_settings().qwen3_default_voice
            return options
        # Cloning mode: reference clip, plus its text when provided.
        options["ref_audio"] = voice.ref_audio
        if voice.ref_text:
            options["ref_text"] = voice.ref_text
        return options

    def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]:
        """Synthesize `text` chunk by chunk; return (mono float32, sample_rate)."""
        self._ensure_loaded()
        options = self._gen_kwargs(voice)
        segments: list[np.ndarray] = []
        for chunk in chunk_text(text, max_chars=_QWEN_MAX_CHARS):
            for result in self._model.generate(text=chunk, **options):
                # Adopt the sample rate reported by the result when it has one.
                self._sample_rate = getattr(result, "sample_rate", self._sample_rate)
                segments.append(to_mono_float32(result.audio))
        if segments:
            return np.concatenate(segments), self._sample_rate
        return np.zeros(0, dtype=np.float32), self._sample_rate
|
||||
Reference in New Issue
Block a user