63 lines
2.0 KiB
Python
63 lines
2.0 KiB
Python
"""Decoupage de texte en morceaux synthese-friendly.
|
|
|
|
Les modeles TTS (Kokoro notamment) tronquent les textes trop longs. On decoupe
|
|
donc sur les frontieres de phrases en respectant une longueur max par morceau.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# Sentence end: whitespace following strong punctuation (. ! ? …), or any run of newlines.
|
|
_SENTENCE_END_RE = re.compile(r"(?<=[.!?…])\s+|\n+")
|
|
# For very long sentences, also break on whitespace following a comma, semicolon or colon.
|
|
_SOFT_BREAK_RE = re.compile(r"(?<=[,;:])\s+")
|
|
|
|
DEFAULT_MAX_CHARS = 350
|
|
|
|
|
|
def split_sentences(text: str) -> list[str]:
    """Split *text* into sentences.

    The text is cut wherever ``_SENTENCE_END_RE`` matches. Each fragment is
    stripped of surrounding whitespace and empty fragments are discarded, so
    an empty or whitespace-only input yields an empty list.
    """
    sentences: list[str] = []
    for fragment in _SENTENCE_END_RE.split(text):
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
|
|
|
|
|
|
def _split_long(sentence: str, max_chars: int) -> list[str]:
    """Split an over-long sentence into pieces of at most ``max_chars`` characters.

    A sentence that already fits is returned unchanged as a one-element list.
    Otherwise it is cut at the soft breaks matched by ``_SOFT_BREAK_RE`` and
    consecutive pieces are re-joined with a single space for as long as the
    result still fits. A piece that is itself longer than ``max_chars`` is
    sliced into fixed-size windows, which may cut through a word.

    Args:
        sentence: The sentence to split.
        max_chars: Maximum length of each returned piece; must be >= 1.

    Returns:
        The pieces, in their original order.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    if max_chars < 1:
        # A zero window makes range() fail with an unrelated message, and a
        # negative one yields no windows at all, silently dropping the text.
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if len(sentence) <= max_chars:
        return [sentence]

    out: list[str] = []
    buf = ""
    for piece in _SOFT_BREAK_RE.split(sentence):
        cand = f"{buf} {piece}".strip()
        if len(cand) <= max_chars:
            buf = cand
            continue
        # The piece does not fit next to the buffered text: flush the buffer.
        if buf:
            out.append(buf)
        if len(piece) <= max_chars:
            buf = piece
            continue
        # The piece alone exceeds the window: fall back to fixed-size slices.
        buf = ""
        for start in range(0, len(piece), max_chars):
            # Trim each slice so no piece starts or ends with whitespace,
            # and skip slices that contained nothing else.
            window = piece[start:start + max_chars].strip()
            if window:
                out.append(window)
    if buf:
        out.append(buf)
    return out
|
|
|
|
|
|
def chunk_text(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> list[str]:
    """Group the sentences of *text* into chunks of at most ``max_chars`` characters.

    Sentences come from ``split_sentences`` and are packed greedily, joined
    by a single space. A sentence is kept whole whenever it fits; one that is
    longer than ``max_chars`` is first broken up by ``_split_long``.

    Args:
        text: The text to chunk.
        max_chars: Maximum length of each chunk; must be >= 1.

    Returns:
        The chunks, in reading order. Empty when *text* contains no sentence.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    if max_chars < 1:
        # Reject a non-positive limit up front: no chunk could satisfy it,
        # and a negative value would otherwise return an empty list.
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")

    chunks: list[str] = []
    buf = ""
    for sentence in split_sentences(text):
        for part in _split_long(sentence, max_chars):
            # Trim each part so that no chunk starts or ends with stray
            # whitespace and the length check sees the real content.
            part = part.strip()
            if not part:
                continue
            cand = f"{buf} {part}".strip()
            if len(cand) <= max_chars:
                buf = cand
            else:
                # The part does not fit: emit the current chunk and start a
                # new one with this part.
                if buf:
                    chunks.append(buf)
                buf = part
    if buf:
        chunks.append(buf)
    return chunks
|