Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)
This commit is contained in:
48
backend/inkflow/tts/base.py
Normal file
48
backend/inkflow/tts/base.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""Abstraction over the TTS engines (pluggable backend).

Two implementations: Kokoro (fast, preset voices -> previews) and Qwen3-TTS
(quality + cloning from a reference audio clip -> final render). Both return
mono float32 audio plus a sample rate.
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
@dataclass
class VoiceSpec:
    """Describe the voice to use for one synthesis call.

    A voice is selected either by a preset name or by a reference clip used
    for cloning; every field is optional and has a default.
    """

    # Name of a preset voice (Kokoro: "ff_siwis"; Qwen3: "Chelsie").
    preset: Optional[str] = None
    # Reference clip used for voice cloning (Qwen3).
    # NOTE(review): presumably a path to an audio file -- confirm with backends.
    ref_audio: Optional[str] = None
    # Text that accompanies the reference clip (Qwen3 cloning).
    # NOTE(review): presumably the transcript of `ref_audio` -- confirm.
    ref_text: Optional[str] = None
    # Speed setting, 1.0 by default.
    # NOTE(review): presumably a rate multiplier; interpretation is up to
    # each backend -- confirm.
    speed: float = 1.0
|
||||
|
||||
|
||||
class TTSBackend(ABC):
    """Common interface shared by every TTS engine.

    Concrete engines subclass this, override `name`, and implement
    `synthesize`.
    """

    # Identifier of the backend; subclasses are expected to override it.
    name: str = "base"

    @abstractmethod
    def synthesize(self, text: str, voice: VoiceSpec) -> tuple[np.ndarray, int]:
        """Synthesize `text` and return ``(mono float32 audio, sample_rate)``.

        Args:
            text: The text to speak.
            voice: The voice description to synthesize with.

        Returns:
            A 2-tuple of the audio samples and their sample rate.
        """

    def default_voice(self) -> VoiceSpec:
        """Return the voice used when none is given: an all-defaults `VoiceSpec`."""
        fallback = VoiceSpec()
        return fallback
|
||||
|
||||
|
||||
def to_mono_float32(audio) -> np.ndarray:
    """Normalize a model output (mx.array / ndarray / list) to mono float32.

    The input is converted to a float32 NumPy array. For multi-dimensional
    input the longest axis is taken to be the time axis and every other axis
    (channels, plus any leading batch dimension) is averaged away, so both
    ``(channels, n)`` and ``(n, channels)`` layouts are accepted, as well as
    batched shapes such as ``(1, channels, n)``.

    Args:
        audio: Any object ``np.asarray`` can convert to a float32 array.

    Returns:
        A C-contiguous 1-D float32 array. Empty input yields an empty array.
    """
    arr = np.asarray(audio, dtype=np.float32)
    # Skip the reduction for empty input: averaging over a zero-length axis
    # would produce NaN samples instead of empty audio.
    if arr.ndim > 1 and arr.size > 0:
        # The longest axis is the time axis. On ties the first one wins, which
        # for square 2-D input means averaging over the last axis.
        time_axis = int(np.argmax(arr.shape))
        other_axes = tuple(ax for ax in range(arr.ndim) if ax != time_axis)
        # Reduce every non-time axis at once so that inputs with more than two
        # dimensions do not end up with channels concatenated by the reshape.
        arr = arr.mean(axis=other_axes)
    return np.ascontiguousarray(arr.reshape(-1))
|
||||
Reference in New Issue
Block a user