Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)

This commit is contained in:
2026-06-21 00:10:11 +02:00
commit d3bb91394b
71 changed files with 8138 additions and 0 deletions

View File

View File

@@ -0,0 +1,48 @@
"""Abstraction des moteurs TTS (backend pluggable).
Deux implementations : Kokoro (rapide, voix preglees -> previews) et Qwen3-TTS
(qualite + clonage par audio de reference -> rendu final). Toutes deux renvoient
de l'audio mono float32 + une frequence d'echantillonnage.
"""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Optional
import numpy as np
@dataclass
class VoiceSpec:
    """Describe the voice to use for one synthesis call.

    A spec either names a built-in voice (`preset`) or supplies a reference
    clip for voice cloning (`ref_audio`, optionally with `ref_text`).
    """

    # Name of a built-in voice (Kokoro: "ff_siwis"; Qwen3: "Chelsie").
    preset: Optional[str] = None
    # Reference clip used for voice cloning (Qwen3).
    # NOTE(review): presumably a path to an audio file -- confirm with callers.
    ref_audio: Optional[str] = None
    # Text that goes with the reference clip (Qwen3 cloning).
    ref_text: Optional[str] = None
    # Speed value handed to the backend; 1.0 is the default.
    speed: float = 1.0
class TTSBackend(ABC):
    """Common interface implemented by every TTS engine."""

    # Short identifier of the backend; the base class uses "base".
    name: str = "base"

    @abstractmethod
    def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]:
        """Synthesize `text` and return (mono float32 audio, sample_rate)."""

    def default_voice(self) -> VoiceSpec:
        """Return the voice to use when the caller does not pick one.

        The base implementation returns a `VoiceSpec` with all defaults.
        """
        return VoiceSpec()
def to_mono_float32(audio) -> np.ndarray:
    """Normalize a model output (mx.array / np / list) to mono float32.

    Args:
        audio: Anything `np.asarray` can convert: a 1-D sample vector, a 2-D
            (channels, n) or (n, channels) array, or a higher-rank array
            with extra leading/trailing axes such as (1, channels, n).

    Returns:
        A contiguous 1-D float32 array.
    """
    arr = np.asarray(audio, dtype=np.float32)
    # Collapse one axis per pass until a single sample axis remains. The
    # shorter of the first/last axes is taken to be the channel (or batch)
    # axis and is averaged away. Reducing only once would leave a 3-D input
    # such as (1, channels, n) as (channels, n), and the final reshape would
    # then lay the channels end to end instead of mixing them.
    while arr.ndim > 1:
        axis = 0 if arr.shape[0] < arr.shape[-1] else -1
        arr = arr.mean(axis=axis)
    return np.ascontiguousarray(arr.reshape(-1))

View File

@@ -0,0 +1,62 @@
"""Decoupage de texte en morceaux synthese-friendly.
Les modeles TTS (Kokoro notamment) tronquent les textes trop longs. On decoupe
donc sur les frontieres de phrases en respectant une longueur max par morceau.
"""
from __future__ import annotations
import re
# Sentence boundary: whitespace that follows strong punctuation (. ! ? …),
# or any run of newlines.
_SENTENCE_END_RE = re.compile(r"(?<=[.!?…])\s+|\n+")
# Very long sentences are also split on the whitespace that follows a comma,
# semicolon or colon.
_SOFT_BREAK_RE = re.compile(r"(?<=[,;:])\s+")
# Default upper bound, in characters, for a single chunk.
DEFAULT_MAX_CHARS = 350
def split_sentences(text: str) -> list[str]:
    """Split `text` at sentence boundaries, dropping empty fragments.

    Each returned sentence is stripped of surrounding whitespace.
    """
    sentences: list[str] = []
    for fragment in _SENTENCE_END_RE.split(text):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
def _split_long(sentence: str, max_chars: int) -> list[str]:
    """Break an over-long sentence into pieces of at most `max_chars`.

    Pieces are cut, in order of preference:
    1. on the whitespace following a comma, semicolon or colon;
    2. on whitespace between words, when a comma-delimited segment is still
       too long;
    3. on a fixed character window, only for a single token that is longer
       than `max_chars` on its own.

    Args:
        sentence: The sentence to split.
        max_chars: Maximum length of each returned piece.

    Returns:
        The pieces in reading order. A sentence that already fits is
        returned unchanged as a one-element list.

    Raises:
        ValueError: If the sentence needs splitting and `max_chars` < 1.
    """
    if len(sentence) <= max_chars:
        return [sentence]
    if max_chars < 1:
        # A non-positive window can never make progress.
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")

    out: list[str] = []
    buf = ""
    for piece in _SOFT_BREAK_RE.split(sentence):
        cand = f"{buf} {piece}".strip()
        if len(cand) <= max_chars:
            buf = cand
            continue
        # The piece does not fit after the buffer: flush and start afresh.
        if buf:
            out.append(buf)
            buf = ""
        if len(piece) <= max_chars:
            buf = piece
            continue
        # Segment longer than the window: wrap on word boundaries so that no
        # word is cut in the middle.
        for word in piece.split():
            cand = f"{buf} {word}".strip()
            if len(cand) <= max_chars:
                buf = cand
                continue
            if buf:
                out.append(buf)
            # Last resort: a single token longer than the window.
            while len(word) > max_chars:
                out.append(word[:max_chars])
                word = word[max_chars:]
            buf = word
    if buf:
        out.append(buf)
    return out
def chunk_text(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> list[str]:
    """Pack the sentences of `text` into chunks of at most `max_chars`.

    Sentences are kept whole whenever they fit; a sentence longer than
    `max_chars` is first broken up by `_split_long`. Consecutive parts are
    joined with a single space for as long as the result stays within the
    limit.
    """
    chunks: list[str] = []
    pending = ""
    parts = (
        part
        for sentence in split_sentences(text)
        for part in _split_long(sentence, max_chars)
    )
    for part in parts:
        merged = f"{pending} {part}".strip()
        if len(merged) > max_chars:
            # Adding this part would overflow: emit what we have so far.
            if pending:
                chunks.append(pending)
            pending = part
        else:
            pending = merged
    if pending:
        chunks.append(pending)
    return chunks

View File

@@ -0,0 +1,20 @@
"""Selection du backend TTS par nom (pluggable)."""
from __future__ import annotations
from functools import lru_cache
from .base import TTSBackend
# Backend names accepted by `get_backend`.
BACKENDS = ("kokoro", "qwen3")


@lru_cache(maxsize=4)
def _load_backend(name: str) -> TTSBackend:
    """Build the backend for an already-normalized (lowercase) name.

    Backend modules are imported lazily so that selecting one backend does
    not import the other.
    """
    if name == "kokoro":
        from .kokoro import KokoroBackend
        return KokoroBackend()
    if name == "qwen3":
        from .qwen3 import Qwen3Backend
        return Qwen3Backend()
    raise ValueError(f"Backend TTS inconnu: {name!r} (dispo: {', '.join(BACKENDS)})")


def get_backend(name: str = "kokoro") -> TTSBackend:
    """Return the shared backend instance registered under `name`.

    The name is matched case-insensitively. It is normalized *before* the
    cache lookup, so every spelling and call form (`get_backend()`,
    `get_backend("kokoro")`, `get_backend(name="Kokoro")`) resolves to the
    same instance instead of building a backend per distinct argument
    pattern.

    Args:
        name: One of `BACKENDS`, in any letter case.

    Returns:
        The cached backend instance.

    Raises:
        ValueError: If `name` is not a known backend.
    """
    return _load_backend(name.lower())


# Keep the cache-control helpers that the decorated function used to expose.
get_backend.cache_clear = _load_backend.cache_clear
get_backend.cache_info = _load_backend.cache_info

View File

@@ -0,0 +1,93 @@
"""Backend Kokoro (rapide, voix preglees) — ideal pour les previews.
Kokoro tronque les textes longs : on synthetise morceau par morceau (decoupage
par phrases) puis on concatene. Le francais passe par espeak-ng via phonemizer.
"""
from __future__ import annotations
import logging
import numpy as np
from ..config import setup_espeak
from ..settings import get_settings
from .base import TTSBackend, VoiceSpec, to_mono_float32
from .chunk import chunk_text
logger = logging.getLogger(__name__)
# The MLX port of Kokoro has an intermittent alignment bug (mx.random.normal
# in the harmonic generator) that raises a broadcast_shapes error on some
# draws. Because it is random, a plain retry is usually enough; as a last
# resort the chunk is split in two.
_KOKORO_RETRIES = 8
class KokoroBackend(TTSBackend):
    """Kokoro backend: synthesizes text chunk by chunk with a preset voice."""

    name = "kokoro"

    def __init__(self, model_id: str | None = None, lang_code: str | None = None):
        """Configure the backend; the model itself is loaded on first use.

        Args:
            model_id: Model identifier; defaults to `settings.kokoro_model`.
            lang_code: Language code passed to the model; defaults to
                `settings.kokoro_lang_code`.
        """
        setup_espeak()
        settings = get_settings()
        self.model_id = model_id or settings.kokoro_model
        self.lang_code = lang_code or settings.kokoro_lang_code
        self._model = None  # set by _ensure_loaded()
        # Initial value; replaced by the rate reported on generation results.
        self._sample_rate = 24000

    def _ensure_loaded(self) -> None:
        """Load the model once, on first use."""
        if self._model is None:
            from mlx_audio.tts.utils import load_model
            self._model = load_model(self.model_id)

    def default_voice(self) -> VoiceSpec:
        """Return a spec for the preset voice configured in the settings."""
        return VoiceSpec(preset=get_settings().kokoro_default_voice)

    def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]:
        """Synthesize `text` and return (mono float32 audio, sample_rate).

        The text is cut into chunks, each chunk is generated separately and
        the audio is concatenated. Chunks that cannot be generated are
        dropped; if nothing was produced an empty array is returned.
        """
        self._ensure_loaded()
        preset = voice.preset or get_settings().kokoro_default_voice
        pieces: list[np.ndarray] = []
        for chunk in chunk_text(text):
            pieces.extend(self._gen_resilient(chunk, preset, voice.speed))
        if not pieces:
            return np.zeros(0, dtype=np.float32), self._sample_rate
        return np.concatenate(pieces), self._sample_rate

    def _gen_once(self, text: str, preset: str, speed: float) -> list[np.ndarray]:
        """Run one generation pass and collect the audio of every result."""
        out: list[np.ndarray] = []
        for result in self._model.generate(
            text=text, voice=preset, speed=speed, lang_code=self.lang_code,
        ):
            # Results may report their own rate; keep the last one seen.
            self._sample_rate = getattr(result, "sample_rate", self._sample_rate)
            out.append(to_mono_float32(result.audio))
        return out

    def _gen_resilient(self, text: str, preset: str, speed: float,
                       depth: int = 0) -> list[np.ndarray]:
        """Generate one chunk with retries, then fall back to re-splitting.

        The chunk is attempted up to `_KOKORO_RETRIES` times. If every
        attempt fails and the text is long enough, it is cut in two near the
        middle and each half is retried recursively (at most three levels).
        When nothing works the chunk is dropped and an empty list returned;
        this method never raises for a generation failure.

        Args:
            text: The chunk to synthesize.
            preset: Name of the preset voice.
            speed: Speed value passed to the model.
            depth: Current re-split depth (internal).
        """
        last_error: Exception | None = None
        for attempt in range(1, _KOKORO_RETRIES + 1):
            try:
                return self._gen_once(text, preset, speed)
            except Exception as exc:  # noqa: BLE001 -- intermittent vocoder bug
                # Remember the cause so that a persistent (non-random)
                # failure can be diagnosed instead of vanishing silently.
                last_error = exc
                logger.debug(
                    "Kokoro: attempt %d/%d failed: %r",
                    attempt, _KOKORO_RETRIES, exc,
                )
        # Still failing: cut in two and retry each half.
        if depth < 3 and len(text) > 40:
            mid = _split_point(text)
            pieces: list[np.ndarray] = []
            for half in (text[:mid].strip(), text[mid:].strip()):
                if half:  # nothing to synthesize for an empty half
                    pieces.extend(
                        self._gen_resilient(half, preset, speed, depth + 1)
                    )
            return pieces
        logger.warning(
            "Kokoro: morceau abandonne apres echecs: %r", text[:60],
            exc_info=last_error,
        )
        return []
def _split_point(text: str) -> int:
"""Point de coupe au plus proche du milieu (espace de preference)."""
mid = len(text) // 2
left = text.rfind(" ", 0, mid)
right = text.find(" ", mid)
if left == -1 and right == -1:
return mid
if left == -1:
return right
if right == -1:
return left
return left if (mid - left) <= (right - mid) else right

View File

@@ -0,0 +1,58 @@
"""Backend Qwen3-TTS (qualite + clonage par audio de reference) — rendu final.
Deux modes :
- voix preglee : `voice` (ex "Chelsie") + `language` ("French").
- clonage : `ref_audio` (+ `ref_text` transcription du clip) pour imiter une
voix de la voicebank, attribuee a un personnage.
"""
from __future__ import annotations
import numpy as np
from ..settings import get_settings
from .base import TTSBackend, VoiceSpec, to_mono_float32
from .chunk import chunk_text
# Qwen3 tolerates longer sequences than Kokoro, but the length is still capped.
_QWEN_MAX_CHARS = 500
class Qwen3Backend(TTSBackend):
    """Qwen3-TTS backend: preset voices or cloning from a reference clip."""

    name = "qwen3"

    def __init__(self, model_id: str | None = None, language: str | None = None):
        """Record the configuration; the model is loaded on first use.

        Explicit arguments win; anything left unset is read from the
        settings (`qwen3_model`, `language`).
        """
        cfg = get_settings()
        self._model = None
        self._sample_rate = 24000
        self.model_id = model_id or cfg.qwen3_model
        self.language = language or cfg.language

    def _ensure_loaded(self) -> None:
        """Load the model unless it is already in memory."""
        if self._model is not None:
            return
        from mlx_audio.tts.utils import load_model
        self._model = load_model(self.model_id)

    def default_voice(self) -> VoiceSpec:
        """Return a spec for the preset voice configured in the settings."""
        fallback = get_settings().qwen3_default_voice
        return VoiceSpec(preset=fallback)

    def _gen_kwargs(self, voice: VoiceSpec) -> dict:
        """Translate a `VoiceSpec` into keyword arguments for `generate`."""
        options: dict = {"language": self.language, "speed": voice.speed}
        if not voice.ref_audio:
            # Preset-voice mode.
            options["voice"] = voice.preset or get_settings().qwen3_default_voice
            return options
        # Cloning mode: the transcript is passed along only when provided.
        options["ref_audio"] = voice.ref_audio
        if voice.ref_text:
            options["ref_text"] = voice.ref_text
        return options

    def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]:
        """Synthesize `text` and return (mono float32 audio, sample_rate).

        The text is generated chunk by chunk and the audio concatenated; an
        empty array is returned when nothing was produced.
        """
        self._ensure_loaded()
        gen_options = self._gen_kwargs(voice)
        segments: list[np.ndarray] = []
        for chunk in chunk_text(text, max_chars=_QWEN_MAX_CHARS):
            for result in self._model.generate(text=chunk, **gen_options):
                # Results may report their own rate; keep the last one seen.
                self._sample_rate = getattr(result, "sample_rate", self._sample_rate)
                segments.append(to_mono_float32(result.audio))
        if segments:
            return np.concatenate(segments), self._sample_rate
        return np.zeros(0, dtype=np.float32), self._sample_rate