Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)

This commit is contained in:
2026-06-21 00:10:11 +02:00
commit d3bb91394b
71 changed files with 8138 additions and 0 deletions

View File

@@ -0,0 +1,62 @@
"""Decoupage de texte en morceaux synthese-friendly.
Les modeles TTS (Kokoro notamment) tronquent les textes trop longs. On decoupe
donc sur les frontieres de phrases en respectant une longueur max par morceau.
"""
from __future__ import annotations
import re
# Sentence boundary: a run of whitespace that follows strong punctuation
# (. ! ? or the ellipsis character), or any run of newlines.
_SENTENCE_END_RE = re.compile(r"(?<=[.!?…])\s+|\n+")
# For very long sentences, a break is also allowed on the whitespace that
# follows a comma, semicolon or colon.
_SOFT_BREAK_RE = re.compile(r"(?<=[,;:])\s+")
# Default maximum chunk length, in characters (default of chunk_text).
DEFAULT_MAX_CHARS = 350
def split_sentences(text: str) -> list[str]:
    """Split ``text`` into sentences.

    The text is cut wherever ``_SENTENCE_END_RE`` matches. Every fragment is
    stripped of surrounding whitespace, and fragments that end up empty are
    discarded.

    Args:
        text: The text to split.

    Returns:
        The non-empty, stripped fragments in their original order.
    """
    fragments = map(str.strip, _SENTENCE_END_RE.split(text))
    return [fragment for fragment in fragments if fragment]
def _split_long(sentence: str, max_chars: int) -> list[str]:
    """Split an over-long sentence into fragments of at most ``max_chars``.

    Break points are tried from least to most disruptive:

    1. soft breaks, i.e. wherever ``_SOFT_BREAK_RE`` matches;
    2. word boundaries, for a clause that is still too long on its own;
    3. a fixed-size window, only for a single token longer than ``max_chars``.

    Consecutive units are greedily re-joined with a single space while the
    result still fits. Window fragments of an oversized token are emitted as
    they are and are not merged with their neighbours.

    Args:
        sentence: The sentence to split.
        max_chars: Maximum length of each returned fragment; must be >= 1.

    Returns:
        ``[sentence]`` unchanged when it already fits, otherwise the fragments
        in reading order.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    if max_chars < 1:
        # A zero step would crash range() below and a negative one would
        # silently drop the text, so reject both explicitly.
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    if len(sentence) <= max_chars:
        return [sentence]

    # A clause that fits is kept intact. One that does not is broken into
    # words, so the hard window below only ever cuts inside a single oversized
    # token instead of slicing through ordinary words.
    units: list[str] = []
    for clause in _SOFT_BREAK_RE.split(sentence):
        if len(clause) <= max_chars:
            units.append(clause)
        else:
            units.extend(clause.split())

    out: list[str] = []
    buf = ""
    for unit in units:
        cand = f"{buf} {unit}".strip()
        if len(cand) <= max_chars:
            buf = cand
            continue
        # The unit does not fit next to the buffer: flush what we have.
        if buf:
            out.append(buf)
        if len(unit) <= max_chars:
            buf = unit
        else:
            # Single token wider than the window: hard cut.
            out.extend(
                unit[i:i + max_chars] for i in range(0, len(unit), max_chars)
            )
            buf = ""
    if buf:
        out.append(buf)
    return out
def chunk_text(text: str, max_chars: int = DEFAULT_MAX_CHARS) -> list[str]:
    """Group the sentences of ``text`` into chunks for synthesis.

    The text is split with ``split_sentences`` and each sentence is passed
    through ``_split_long``, so a sentence longer than ``max_chars`` is itself
    broken into several parts. The resulting parts are then greedily joined
    with a single space for as long as the chunk stays within ``max_chars``.

    Args:
        text: The text to chunk.
        max_chars: Maximum chunk length in characters; must be >= 1.

    Returns:
        The chunks in reading order; an empty list when ``split_sentences``
        yields nothing. Chunks stay within ``max_chars`` provided that
        ``_split_long`` only returns parts within that limit.

    Raises:
        ValueError: If ``max_chars`` is less than 1.
    """
    if max_chars < 1:
        # Fail here with a clear message: a non-positive limit would otherwise
        # surface as an unrelated error further down, or lose the text.
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")

    chunks: list[str] = []
    buf = ""
    for sentence in split_sentences(text):
        for part in _split_long(sentence, max_chars):
            cand = f"{buf} {part}".strip()
            if len(cand) <= max_chars:
                buf = cand
            else:
                # The part does not fit: close the current chunk and start a
                # new one with this part.
                if buf:
                    chunks.append(buf)
                buf = part
    if buf:
        chunks.append(buf)
    return chunks