# InkFlow/backend/inkflow/pipeline/render.py

"""Rendu audio d'un chapitre : (segments + voix) -> WAV -> MP3.

Une `RenderUnit` = un bout de texte + la voix a employer. On construit la liste
d'unites (mono-narrateur ou multi-voix selon le casting), on synthetise chacune,
on concatene avec des silences, on normalise puis on encode en MP3.
"""
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Callable, Optional

from ..analysis.pronunciation import apply_pronunciation
from ..audio.postprocess import concat_segments, encode_mp3, normalize_loudness, write_wav
from ..config import book_data_dir, book_output_dir
from ..models import (
    Book,
    Chapter,
    ChapterAnalysis,
    ChapterText,
    Pronunciation,
    SegmentType,
)
from ..tts.base import TTSBackend, VoiceSpec

# Resolves a speaker name to a concrete voice. The resolver may return None
# when no voice entry is found; callers then substitute their own default
# (see the `resolve(...) or default_voice` pattern in build_units_multi).
VoiceResolver = Callable[[str], Optional[VoiceSpec]]


@dataclass
class RenderUnit:
    """One piece of text to synthesize, paired with the voice that reads it."""

    text: str
    voice: VoiceSpec
    # Speaker label attached to the unit; defaults to the narrator.
    speaker: str = "narrateur"
    glued_to_prev: bool = False   # interjection (incise) -> reduced gap with the previous unit

def build_units_mono(ct: ChapterText, narrator: VoiceSpec) -> list[RenderUnit]:
    """Single-narrator mode: every non-blank paragraph is read by `narrator`.

    Paragraphs that are empty or whitespace-only are skipped; the remaining
    ones keep their original text and order.
    """
    units = []
    for paragraph in ct.paragraphs:
        if not paragraph.strip():
            # Nothing to read aloud for a blank paragraph.
            continue
        units.append(RenderUnit(text=paragraph, voice=narrator))
    return units


def make_voice_resolver(cast, voicebank, engine: str) -> VoiceResolver:
    """Build a speaker -> VoiceSpec resolver from the casting and the voicebank.

    The returned callable looks up the voice id assigned to a speaker; when
    the speaker has none, it falls back to the narrator's voice id from
    `cast`. If no usable voice id or voicebank entry is found, the callable
    returns None so the caller (or backend) can use its own default voice.
    """
    from ..casting.assign import resolve_speaker_voice
    from ..casting.voicebank import voice_spec_for

    def resolve(speaker: str):
        voice_id = resolve_speaker_voice(speaker, cast, voicebank)
        if voice_id is None:
            # Speaker has no assigned voice: fall back to the narrator's.
            voice_id = cast.narrator_voice_id
        if not voice_id:
            return None  # the backend will use its default voice
        bank_entry = voicebank.by_id(voice_id)
        if bank_entry is None:
            return None  # the backend will use its default voice
        return voice_spec_for(bank_entry, engine)

    return resolve


def build_units_multi(
    analysis: ChapterAnalysis,
    resolve: VoiceResolver,
    default_voice: "VoiceSpec",
) -> list[RenderUnit]:
    """Multi-voice mode: narration -> narrator, dialogue -> character voice.

    Interjections (incises) annotated on a line of dialogue are split off
    here, at the last moment: each interjection piece is read by the narrator
    voice and the remaining pieces by the character voice. Every piece after
    the first one of a split segment is flagged `glued_to_prev`; the first
    piece inherits the segment's own flag. Dialogue without interjections is
    emitted as a single unit. Segments whose text is blank are skipped.

    `default_voice` is used whenever `resolve` returns a falsy value.
    """
    from ..analysis.segmenter import iter_incise_pieces

    narrator_voice = resolve("narrateur") or default_voice
    rendered: list[RenderUnit] = []

    def emit(text, voice, speaker, glued):
        # Single place where units are appended, to keep the branches short.
        rendered.append(RenderUnit(text=text, voice=voice,
                                   speaker=speaker, glued_to_prev=glued))

    for segment in analysis.segments:
        if not segment.text.strip():
            continue

        if segment.type is SegmentType.NARRATION:
            emit(segment.text, narrator_voice, "narrateur",
                 segment.glued_to_prev)
        else:
            speaker_voice = resolve(segment.speaker) or default_voice
            if segment.incises:
                pieces = iter_incise_pieces(segment.text, segment.incises)
                for position, (is_incise, piece) in enumerate(pieces):
                    # Only the leading piece keeps the segment's own flag;
                    # later pieces are always glued to what precedes them.
                    glued = True if position != 0 else segment.glued_to_prev
                    if is_incise:
                        emit(piece, narrator_voice, "narrateur", glued)
                    else:
                        emit(piece, speaker_voice, segment.speaker, glued)
            else:
                emit(segment.text, speaker_voice, segment.speaker,
                     segment.glued_to_prev)
    return rendered


def render_units(
    units: list[RenderUnit],
    backend: TTSBackend,
    *,
    pron: Optional[Pronunciation] = None,
    progress: Optional[Callable[[int, int], None]] = None,
) -> tuple["list", int]:
    """Synthesize every unit and return (list of (audio, sr) pairs, unit count).

    When `pron` is truthy, each unit's text goes through `apply_pronunciation`
    before synthesis. When `progress` is truthy, it is called after each unit
    with (number of units done, total number of units).
    """
    unit_count = len(units)
    rendered = []
    for done, item in enumerate(units, start=1):
        if pron:
            spoken = apply_pronunciation(item.text, pron)
        else:
            spoken = item.text
        waveform, rate = backend.synthesize(spoken, item.voice)
        rendered.append((waveform, rate))
        if progress:
            progress(done, unit_count)
    return rendered, unit_count


def render_chapter_to_mp3(
    book: Book,
    chapter: Chapter,
    units: list[RenderUnit],
    backend: TTSBackend,
    *,
    pron: Optional[Pronunciation] = None,
    track: Optional[int] = None,
    progress: Optional[Callable[[int, int], None]] = None,
) -> Path:
    """Full pipeline for one chapter: synthesize, concatenate, normalize, encode.

    The intermediate WAV is written under the book's data directory
    (`audio/chNN.wav`); the final MP3 goes to the book's output directory,
    named `chapter.output_name` when set, otherwise `chNN.mp3`. The book's
    cover file is passed to the encoder only if it exists on disk.

    Returns the path of the MP3 file.
    """
    rendered, _count = render_units(units, backend, pron=pron, progress=progress)
    # `rendered` is aligned 1:1 with `units`, so the per-unit glue flags can be
    # handed to the concatenation step as-is.
    glue_flags = [unit.glued_to_prev for unit in units]
    mixed, sample_rate = concat_segments(rendered, glued=glue_flags)
    mixed = normalize_loudness(mixed)

    # Intermediate WAV lives in data/, final MP3 in output/.
    audio_dir = book_data_dir(book.slug) / "audio"
    wav_path = audio_dir / f"ch{chapter.index:02d}.wav"
    write_wav(wav_path, mixed, sample_rate)

    mp3_name = chapter.output_name or f"ch{chapter.index:02d}.mp3"
    mp3_path = book_output_dir(book.title) / mp3_name

    cover = None
    if book.cover_file:
        cover_candidate = book_data_dir(book.slug) / book.cover_file
        if cover_candidate.exists():
            cover = cover_candidate

    encode_mp3(
        wav_path,
        mp3_path,
        title=chapter.title,
        album=book.title,
        artist=book.author,
        track=track,
        cover_path=cover,
    )
    return mp3_path