Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)
This commit is contained in:
62
backend/inkflow/tts/chunk.py
Normal file
62
backend/inkflow/tts/chunk.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""Decoupage de texte en morceaux synthese-friendly.
|
||||
|
||||
Les modeles TTS (Kokoro notamment) tronquent les textes trop longs. On decoupe
|
||||
donc sur les frontieres de phrases en respectant une longueur max par morceau.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# Sentence boundary: whitespace that follows strong punctuation (. ! ? …),
# or any run of newlines even when no punctuation precedes it.
_SENTENCE_END_RE = re.compile(r"(?<=[.!?…])\s+|\n+")
# Soft boundary, used only for sentences that are too long on their own:
# whitespace that follows a comma, semicolon or colon.
_SOFT_BREAK_RE = re.compile(r"(?<=[,;:])\s+")

# Default upper bound, in characters, for a single chunk.
DEFAULT_MAX_CHARS = 350
|
||||
|
||||
|
||||
def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences.

    The text is cut wherever ``_SENTENCE_END_RE`` matches. Each fragment is
    stripped of surrounding whitespace and fragments that end up empty are
    discarded, so the result never contains blank entries.

    Args:
        text: The raw text to split.

    Returns:
        The non-empty, stripped fragments in their original order.
    """
    stripped = (fragment.strip() for fragment in _SENTENCE_END_RE.split(text))
    return [fragment for fragment in stripped if fragment]
|
||||
|
||||
|
||||
def _split_long(sentence: str, max_chars: int) -> list[str]:
    """Break one sentence into pieces of at most *max_chars* characters.

    A sentence that already fits is returned unchanged as a single-item list.
    Otherwise it is split on ``_SOFT_BREAK_RE`` and consecutive fragments are
    re-joined with a single space for as long as the result still fits. A
    fragment that is longer than *max_chars* on its own is cut into
    fixed-size windows, which may fall in the middle of a word.

    Args:
        sentence: The sentence to split.
        max_chars: Maximum length of each returned piece; must be >= 1.

    Returns:
        The pieces in their original order, each no longer than *max_chars*.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    # Without this guard a negative limit makes the hard-window range() below
    # empty, silently dropping text, and zero fails with an opaque range()
    # error.
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars!r}")
    if len(sentence) <= max_chars:
        return [sentence]

    out: list[str] = []
    buf = ""
    for piece in _SOFT_BREAK_RE.split(sentence):
        cand = f"{buf} {piece}".strip()
        if len(cand) <= max_chars:
            buf = cand
            continue
        # The fragment does not fit next to the buffer: flush what we have.
        if buf:
            out.append(buf)
        if len(piece) <= max_chars:
            buf = piece
        else:
            # Fragment longer than the window on its own: hard cut.
            out.extend(
                piece[start:start + max_chars]
                for start in range(0, len(piece), max_chars)
            )
            buf = ""
    if buf:
        out.append(buf)
    return out
|
||||
|
||||
|
||||
def chunk_text(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> list[str]:
    """Group the sentences of *text* into chunks of at most *max_chars*.

    The text is split with ``split_sentences`` and consecutive sentences are
    joined with a single space for as long as the chunk still fits. A sentence
    is kept whole whenever it fits; one that is longer than *max_chars* on its
    own is first broken up by ``_split_long``.

    Args:
        text: The text to chunk.
        max_chars: Maximum length of each chunk; must be >= 1.

    Returns:
        The chunks in reading order. Empty when *text* contains no sentence.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    # Reject limits that cannot hold any character; otherwise a negative
    # value would make the splitter return nothing and the text would be
    # lost without any error.
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars!r}")

    chunks: list[str] = []
    buf = ""
    for sentence in split_sentences(text):
        for part in _split_long(sentence, max_chars):
            cand = f"{buf} {part}".strip()
            if len(cand) <= max_chars:
                buf = cand
                continue
            # Adding this part would overflow: close the current chunk and
            # start a new one with the part.
            if buf:
                chunks.append(buf)
            buf = part
    if buf:
        chunks.append(buf)
    return chunks
|
||||
Reference in New Issue
Block a user