Initial commit: InkFlow — EPUB vers livre audio local (MLX/Kokoro)
This commit is contained in:
0
backend/inkflow/analysis/__init__.py
Normal file
0
backend/inkflow/analysis/__init__.py
Normal file
123
backend/inkflow/analysis/gemma.py
Normal file
123
backend/inkflow/analysis/gemma.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""Wrapper mlx-lm autour de Gemma pour l'analyse de texte.
|
||||
|
||||
Charge le modele paresseusement (une seule fois par process) et expose des
|
||||
helpers de generation, dont un `generate_json` tolerant qui extrait le premier
|
||||
objet/array JSON valide de la sortie du modele.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from functools import lru_cache
|
||||
from typing import Any, Optional
|
||||
|
||||
from ..settings import get_settings
|
||||
|
||||
# Bornes d'un bloc JSON dans une reponse potentiellement bavarde.
|
||||
_JSON_SPAN_RE = re.compile(r"(\{.*\}|\[.*\])", re.DOTALL)
|
||||
_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)```", re.DOTALL)
|
||||
|
||||
|
||||
@lru_cache(maxsize=2)
def _load(model_id: str):
    """Load the mlx-lm objects for `model_id`, memoized per model id.

    The result is unpacked by callers as a (model, tokenizer) pair.
    """
    # Deferred import: mlx is only pulled in once an analysis actually runs.
    import mlx_lm

    return mlx_lm.load(model_id)
|
||||
|
||||
|
||||
class Gemma:
    """Small facade around mlx-lm used to drive a Gemma model.

    The model and tokenizer are fetched lazily through the module-level
    `_load` cache the first time text is generated.
    """

    def __init__(self, model_id: Optional[str] = None):
        # Fall back to the configured model when no explicit id is given.
        self.model_id = model_id or get_settings().gemma_model
        self._model = None
        self._tokenizer = None

    def _ensure_loaded(self) -> None:
        """Load the model/tokenizer pair on first use."""
        if self._model is None:
            self._model, self._tokenizer = _load(self.model_id)

    def generate(
        self,
        prompt: str,
        *,
        system: Optional[str] = None,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
    ) -> str:
        """Generate a text answer for `prompt` using the chat template.

        Args:
            prompt: User message.
            system: Optional system message placed before the user message.
            max_tokens: Generation cap; defaults to `settings.gemma_max_tokens`.
            temperature: Sampling temperature; defaults to
                `settings.gemma_temperature`.

        Returns:
            The value returned by `mlx_lm.generate`.
        """
        self._ensure_loaded()
        settings = get_settings()
        if max_tokens is None:
            max_tokens = settings.gemma_max_tokens
        if temperature is None:
            temperature = settings.gemma_temperature

        # Deferred imports: keep mlx out of the import path until needed.
        from mlx_lm import generate
        from mlx_lm.sample_utils import make_sampler

        messages = []
        if system:
            # NOTE(review): some Gemma chat templates reject a "system" role;
            # confirm the configured model's template accepts it.
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        formatted = self._tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False
        )
        sampler = make_sampler(temp=temperature)
        return generate(
            self._model,
            self._tokenizer,
            prompt=formatted,
            max_tokens=max_tokens,
            sampler=sampler,
            verbose=False,
        )

    def generate_json(
        self,
        prompt: str,
        *,
        system: Optional[str] = None,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        retries: int = 1,
    ) -> Any:
        """Generate an answer and parse it as JSON, retrying on parse failure.

        The first attempt uses `temperature` (or the configured default when
        None); every retry is forced to temperature 0.0.

        Args:
            prompt: User message.
            system: Optional system message.
            max_tokens: Generation cap forwarded to `generate`.
            temperature: Temperature of the first attempt.
            retries: Number of extra attempts after the first one. Negative
                values are treated as 0 so at least one attempt is made.

        Returns:
            The decoded JSON value.

        Raises:
            ValueError: If no attempt yields parseable JSON. The last parsing
                error is attached as `__cause__`.
        """
        attempts = max(retries, 0) + 1
        last_err: Optional[Exception] = None
        for attempt in range(attempts):
            raw = self.generate(
                prompt,
                system=system,
                max_tokens=max_tokens,
                temperature=temperature if attempt == 0 else 0.0,
            )
            try:
                return _extract_json(raw)
            except Exception as exc:  # noqa: BLE001 - best effort: retry on any parse failure
                last_err = exc
        # Chain the root cause so the traceback shows why parsing failed.
        raise ValueError(
            f"Reponse JSON invalide apres {attempts} essais: {last_err}"
        ) from last_err
|
||||
|
||||
|
||||
def _extract_json(text: str) -> Any:
    """Extract the first JSON object/array from a free-form model answer.

    The content of the first Markdown code fence is searched first; if it
    holds no decodable JSON (or there is no fence) the whole answer is
    searched. `raw_decode` stops at the end of the first complete JSON value,
    so text before or after it (including a second JSON block) is ignored.

    Raises:
        ValueError: If no `{`/`[` position decodes to valid JSON.
    """
    text = text.strip()
    candidates = []
    fence = _FENCE_RE.search(text)
    if fence:
        candidates.append(fence.group(1).strip())
    # Fallback: the fenced block may be prose while the JSON sits outside it.
    candidates.append(text)

    decoder = json.JSONDecoder()
    for candidate in candidates:
        for start, ch in enumerate(candidate):
            if ch not in "[{":
                continue
            try:
                # Decode in place from `start`: avoids copying the tail of the
                # string for every candidate bracket.
                obj, _end = decoder.raw_decode(candidate, start)
            except json.JSONDecodeError:
                continue
            return obj
    raise ValueError("aucun JSON trouve dans la reponse")
|
||||
59
backend/inkflow/analysis/pronunciation.py
Normal file
59
backend/inkflow/analysis/pronunciation.py
Normal file
@@ -0,0 +1,59 @@
|
||||
"""Dictionnaire de prononciation : application + proposition de candidats.
|
||||
|
||||
L'application est une simple reecriture de surface du texte (graphie guidee)
|
||||
avant synthese. Les candidats (noms propres, termes SF) peuvent etre proposes
|
||||
par Gemma puis valides par l'utilisateur dans l'UI.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Iterable
|
||||
|
||||
from ..models import Pronunciation, PronunciationEntry
|
||||
from ..settings import get_settings
|
||||
from .gemma import Gemma
|
||||
|
||||
|
||||
def apply_pronunciation(text: str, pron: Pronunciation) -> str:
    """Replace every enabled term with its phonetic spelling (whole words only).

    Entries are applied in order, each one on the output of the previous one.
    Disabled entries and entries with an empty term are skipped.

    Args:
        text: Text to rewrite before synthesis.
        pron: Dictionary whose `entries` carry `enabled`, `term` and
            `replacement`.

    Returns:
        The rewritten text.
    """
    for entry in pron.entries:
        if not entry.enabled or not entry.term:
            continue
        # Lookarounds instead of `\b`: identical for terms made of word
        # characters, but also correct when the term starts or ends with
        # punctuation ("C++"), where `\b` would never match.
        pattern = re.compile(rf"(?<!\w){re.escape(entry.term)}(?!\w)")
        replacement = entry.replacement
        # A callable inserts the replacement verbatim; a plain string would be
        # parsed as a regex template (backslashes, group references).
        text = pattern.sub(lambda _match, _value=replacement: _value, text)
    return text
|
||||
|
||||
|
||||
# Le prompt systeme est editable dans les reglages (settings.prompt_pronunciation).
|
||||
|
||||
|
||||
def propose_pronunciations(text: str, gemma: Gemma, *, max_chars: int = 16000) -> list[PronunciationEntry]:
    """Ask the model for pronunciation candidates to be validated later.

    Args:
        text: Source text; only its first `max_chars` characters are sent.
        gemma: Model facade exposing `generate_json`.
        max_chars: Size of the excerpt included in the prompt.

    Returns:
        One `PronunciationEntry` per usable item of the model answer. Items
        that are not objects, or whose term/replacement is missing or blank,
        are skipped.
    """
    sample = text[:max_chars]
    prompt = (
        "Repere dans cet extrait les mots a risque de mauvaise prononciation par "
        "une voix de synthese francaise. Pour chacun, propose une graphie "
        "phonetique francaise (replacement) qui guide la prononciation.\n\n"
        f"EXTRAIT:\n{sample}\n\n"
        'Reponds par un tableau JSON: '
        '[{"term":"Tiamat","replacement":"Tia-matt","note":"nom propre"}]'
    )
    result = gemma.generate_json(prompt, system=get_settings().prompt_pronunciation)
    # The model sometimes answers with a single object instead of an array.
    if isinstance(result, dict):
        result = [result]

    entries: list[PronunciationEntry] = []
    for item in result:
        if not isinstance(item, dict):
            continue
        term = str(item.get("term") or "").strip()
        replacement = str(item.get("replacement") or "").strip()
        # Skip blanks after stripping: an empty term can never be applied.
        if not term or not replacement:
            continue
        entries.append(PronunciationEntry(
            term=term,
            replacement=replacement,
            note=item.get("note"),
        ))
    return entries
|
||||
|
||||
|
||||
def merge_pronunciations(
    existing: Pronunciation, new: Iterable[PronunciationEntry]
) -> Pronunciation:
    """Merge `new` entries into `existing`, keyed by lower-cased term.

    An entry already present in `existing` always wins; a new entry is only
    added when its term is not known yet.
    """
    merged = {}
    for entry in existing.entries:
        merged[entry.term.lower()] = entry
    for candidate in new:
        key = candidate.term.lower()
        if key not in merged:
            merged[key] = candidate
    return Pronunciation(entries=list(merged.values()))
|
||||
622
backend/inkflow/analysis/segmenter.py
Normal file
622
backend/inkflow/analysis/segmenter.py
Normal file
@@ -0,0 +1,622 @@
|
||||
"""Segmentation narration/dialogue + attribution de locuteur + casting.
|
||||
|
||||
Approche hybride :
|
||||
1. Pre-segmentation deterministe au niveau paragraphe (regles de ponctuation
|
||||
francaise : un paragraphe commencant par un cadratin "—" est une replique).
|
||||
2. Gemma attribue un locuteur a chaque replique, par lots de repliques
(liste numerotee + contexte), et extrait le casting (personnages + attributs).
|
||||
|
||||
Les incises ("..., dit-il") sont detectees de facon deterministe et annotees par
des bornes sur la replique (non destructif) ; au rendu, elles sont portees par
la voix du narrateur, le reste de la replique par la voix du personnage.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from ..models import (
|
||||
Cast,
|
||||
Chapter,
|
||||
ChapterAnalysis,
|
||||
ChapterText,
|
||||
Character,
|
||||
Incise,
|
||||
Segment,
|
||||
SegmentType,
|
||||
)
|
||||
from ..settings import get_settings
|
||||
from .gemma import Gemma
|
||||
|
||||
# Un paragraphe de dialogue commence par un cadratin (U+2014) ou un tiret long.
|
||||
_DIALOGUE_LEAD_RE = re.compile(r"^\s*[—―]\s*")
|
||||
|
||||
# --- Detection des incises (inversion verbe-sujet francaise) ------------------
|
||||
# Une incise est un groupe de narration insere dans une replique ("..., dit-il.").
|
||||
# On exclut tu/nous/vous (imperatifs "Donne-le-moi", "Crois-tu ?") pour limiter
|
||||
# les faux positifs. Voir `detect_incises` plus bas pour les deux passes
|
||||
# (inversion verbe-pronom + nominale "lanca Drummer", conscience du casting).
|
||||
_INCISE_PRON = r"(?:il|elle|on|ils|elles|je)"
|
||||
# Verbe de parole, eventuellement reflechi ("s'ecria", "s'exclama").
|
||||
_INCISE_VERB = r"(?:[A-Za-zÀ-ÿ]+['’])?[A-Za-zÀ-ÿ]{2,}"
|
||||
|
||||
|
||||
def segment_chapter_text(ct: ChapterText) -> list[Segment]:
    """Split a chapter into narration/dialogue segments using rules only.

    A paragraph matching `_DIALOGUE_LEAD_RE` becomes a DIALOGUE segment with
    the leading dash removed and speaker "?"; any other paragraph becomes a
    NARRATION segment spoken by "narrateur".
    """

    def _to_segment(para: str) -> Segment:
        lead = _DIALOGUE_LEAD_RE.match(para)
        if lead is None:
            return Segment(
                type=SegmentType.NARRATION, text=para, speaker="narrateur")
        # The pattern is anchored at the start, so dropping the matched prefix
        # is the same as substituting it away.
        spoken = para[lead.end():].strip()
        return Segment(type=SegmentType.DIALOGUE, text=spoken, speaker="?")

    return [_to_segment(para) for para in ct.paragraphs]
|
||||
|
||||
|
||||
# --- Attribution des locuteurs (Gemma) --------------------------------------
|
||||
# Le prompt systeme est editable dans les reglages (settings.prompt_speakers).
|
||||
|
||||
|
||||
_UNKNOWN = {"", "?", "inconnu", "narrateur"}
|
||||
_CTX_CHARS = 160 # troncature du contexte narratif avant/apres
|
||||
_CHUNK_MAX_DIALOGUES = 30 # repliques par appel (fiabilite du modele)
|
||||
|
||||
|
||||
def attribute_speakers(
    segments: list[Segment],
    gemma: Gemma,
    *,
    characters: Optional[list[Character]] = None,
    pov: Optional[str] = None,
) -> dict[int, str]:
    """Fill in `speaker` on every dialogue segment (mutates in place).

    The model receives the character list (name, gender, description) and,
    for each line, the narration right BEFORE and AFTER it (the attribution
    clause often follows the line: "— Bonjour. dit Marie.").

    Lines whose speaker is already resolved (e.g. seeded from an incise) are
    shown to the model as fixed context and never overwritten.

    Args:
        segments: Chapter segments; dialogue segments are updated in place.
        gemma: Model facade exposing `generate_json`.
        characters: Known characters of the chapter, if any.
        pov: Point-of-view character of the chapter, if any.

    Returns:
        A map {segment index: confidence} with values "high"/"medium"/"low",
        meant to drive the retroactive second pass. A name outside the
        supplied character list is kept but downgraded to "low".
    """
    dialogues = [(i, s) for i, s in enumerate(segments)
                 if s.type is SegmentType.DIALOGUE]
    if not dialogues:
        return {}

    # Already-resolved lines: shown as fixed context, never asked again.
    locked = {i for i, s in dialogues if _is_resolved(s.speaker)}
    if len(locked) == len(dialogues):
        return {i: "high" for i, _ in dialogues}

    hint = _speakers_hint(characters, pov)
    valid = {c.name.strip().lower() for c in (characters or [])}
    confidence: dict[int, str] = {}

    for chunk in _chunk_dialogues(dialogues, segments, hint):
        prompt = (
            "Voici les repliques de dialogue d'un extrait, numerotees, avec la "
            "narration qui precede et qui suit chaque replique. Les repliques "
            "deja attribuees affichent (locuteur: X) : ne les modifie pas, "
            "sers-t'en comme contexte (alternance des tours). Pour les AUTRES, "
            "indique le personnage qui parle (recopie son nom depuis la liste "
            "fournie ; 'inconnu' si vraiment indeterminable) et ta confiance "
            "(high/medium/low)."
            f"{hint}\n\n" + "\n".join(line for _, line in chunk) +
            '\n\nReponds par un tableau JSON: '
            '[{"i": 0, "speaker": "Holden", "confidence": "high"}, ...]'
        )
        result = gemma.generate_json(prompt, system=get_settings().prompt_speakers)

        # Index answers by local line number. The model may return "i" as a
        # string ("0"); coerce it so the lookup by integer below still hits,
        # and skip values that are not numbers at all.
        by_i: dict[int, dict] = {}
        for item in ([result] if isinstance(result, dict) else result):
            if not isinstance(item, dict) or "i" not in item:
                continue
            try:
                by_i[int(item["i"])] = item
            except (TypeError, ValueError):
                continue

        for j, (seg_idx, _line) in enumerate(chunk):
            if seg_idx in locked:  # seeded speaker is kept as is
                confidence[seg_idx] = "high"
                continue
            seg = segments[seg_idx]
            item = by_i.get(j) or {}
            speaker = (str(item.get("speaker") or "inconnu").strip()
                       or "inconnu")
            conf = str(item.get("confidence") or "low").strip().lower()
            if conf not in {"high", "medium", "low"}:
                conf = "low"
            # Name outside the known list: keep it but flag it for review.
            if (valid and speaker.lower() not in _UNKNOWN
                    and speaker.lower() not in valid):
                conf = "low"
            seg.speaker = speaker
            confidence[seg_idx] = conf
    return confidence
|
||||
|
||||
|
||||
def _speakers_hint(characters: Optional[list[Character]], pov: Optional[str]) -> str:
|
||||
hint = ""
|
||||
if characters:
|
||||
lines = []
|
||||
for c in characters:
|
||||
attrs = c.gender or ""
|
||||
desc = f" — {c.description}" if c.description else ""
|
||||
lines.append(f"- {c.name}" + (f" ({attrs})" if attrs else "") + desc)
|
||||
hint += "\nPersonnages du chapitre:\n" + "\n".join(lines)
|
||||
if pov:
|
||||
hint += f"\nLe point de vue de ce chapitre est: {pov}."
|
||||
return hint
|
||||
|
||||
|
||||
def _is_resolved(speaker: str) -> bool:
    """Return True when `speaker` is a real name, i.e. not in `_UNKNOWN`.

    Falsy values (None, "") count as unresolved.
    """
    normalized = speaker.strip().lower() if speaker else ""
    return normalized not in _UNKNOWN
|
||||
|
||||
|
||||
def _dialogue_line(n: int, segments: list[Segment], idx: int) -> str:
    """Render dialogue segment `idx` as prompt line number `n`.

    A resolved line is shown with its speaker and no narration context; an
    unresolved one is wrapped with the adjacent narration, when there is any.
    """
    seg = segments[idx]
    if _is_resolved(seg.speaker):
        # Already attributed (e.g. seeded from an incise): fixed context.
        return f"[{n}] (locuteur: {seg.speaker}) REPLIQUE: {seg.text!r}"

    before = _adjacent_narration(segments, idx, -1)
    after = _adjacent_narration(segments, idx, +1)
    prefix = f" (avant: {before!r})" if before else ""
    suffix = f" (apres: {after!r})" if after else ""
    return f"[{n}]{prefix} REPLIQUE: {seg.text!r}{suffix}"
|
||||
|
||||
|
||||
def _adjacent_narration(segments: list[Segment], idx: int, direction: int) -> str:
    """Return the narration right next to segment `idx`, cut to `_CTX_CHARS`.

    `direction` is the offset to look at (-1 before, +1 after). An empty
    string is returned when the neighbour is out of range or not narration.
    """
    neighbour = idx + direction
    if neighbour < 0 or neighbour >= len(segments):
        return ""
    candidate = segments[neighbour]
    if candidate.type is not SegmentType.NARRATION:
        return ""
    return candidate.text[:_CTX_CHARS]
|
||||
|
||||
|
||||
def _chunk_dialogues(
    dialogues: list[tuple[int, Segment]],
    segments: list[Segment],
    hint: str,
) -> list[list[tuple[int, str]]]:
    """Group dialogue lines into batches that fit one prompt.

    A batch is closed when adding the next line would exceed the character
    budget (`_MAX_PROMPT_CHARS` minus the hint and a 400-character margin for
    the instructions) or when it already holds `_CHUNK_MAX_DIALOGUES` lines.

    Each batch is a list of (segment index, rendered line). Lines are numbered
    locally (0..k) for the prompt; the segment index is used to map the
    answers back.
    """
    budget = _MAX_PROMPT_CHARS - len(hint) - 400  # margin for the instructions
    batches: list[list[tuple[int, str]]] = []
    batch: list[tuple[int, str]] = []
    used = 0
    for seg_idx, _unused in dialogues:
        rendered = _dialogue_line(len(batch), segments, seg_idx)
        overflow = used + len(rendered) > budget
        full = len(batch) >= _CHUNK_MAX_DIALOGUES
        if batch and (overflow or full):
            batches.append(batch)
            batch, used = [], 0
            # Re-render: the line is now number 0 of a fresh batch.
            rendered = _dialogue_line(0, segments, seg_idx)
        batch.append((seg_idx, rendered))
        used += len(rendered) + 1  # +1 for the joining newline
    if batch:
        batches.append(batch)
    return batches
|
||||
|
||||
|
||||
# --- Passe retroactive : re-resolution des repliques indeterminees ----------
|
||||
# Le prompt systeme est editable (settings.prompt_speakers_refine).
|
||||
|
||||
|
||||
def _refine_unknown_speakers(
    segments: list[Segment],
    gemma: Gemma,
    *,
    characters: Optional[list[Character]] = None,
    confidence: dict[int, str],
) -> None:
    """Second pass: re-resolve lines left unknown or flagged "low".

    Each doubtful line is shown with its neighbouring dialogue lines and
    their current speakers (turn taking) plus the adjacent narration, so the
    model can use information coming from the *following* lines. Mutates
    `segments` in place; no model call is made when nothing is doubtful.

    Doubtful lines are sent in batches that respect the same prompt budget
    and per-call line cap as the first pass, so a long chapter no longer has
    the end of its prompt (last lines and the JSON instruction) cut off.

    Args:
        segments: Chapter segments; doubtful dialogue speakers are updated.
        gemma: Model facade exposing `generate_json`.
        characters: Known characters of the chapter, if any.
        confidence: Map {segment index: confidence} from the first pass.
    """
    dialogues = [(i, s) for i, s in enumerate(segments)
                 if s.type is SegmentType.DIALOGUE]
    if not dialogues:
        return
    pos = {seg_idx: n for n, (seg_idx, _s) in enumerate(dialogues)}
    # `_is_resolved` tolerates a missing speaker, unlike a bare `.strip()`.
    doubtful = [seg_idx for seg_idx, seg in dialogues
                if not _is_resolved(seg.speaker)
                or confidence.get(seg_idx) == "low"]
    if not doubtful:
        return

    hint = _speakers_hint(characters, pov=None)

    def _neighbour(label: str, other_idx: int) -> str:
        other = segments[other_idx]
        return (f"replique {label} (dite par {other.speaker}): "
                f"{other.text[:_CTX_CHARS]!r}")

    # Render every doubtful line once, without its local number.
    entries: list[tuple[int, str]] = []
    for seg_idx in doubtful:
        n = pos[seg_idx]
        ctx = []
        if n > 0:
            ctx.append(_neighbour("precedente", dialogues[n - 1][0]))
        before = _adjacent_narration(segments, seg_idx, -1)
        if before:
            ctx.append(f"narration avant: {before!r}")
        after = _adjacent_narration(segments, seg_idx, +1)
        if after:
            ctx.append(f"narration apres: {after!r}")
        if n < len(dialogues) - 1:
            ctx.append(_neighbour("suivante", dialogues[n + 1][0]))
        ctx_str = (" [" + " ; ".join(ctx) + "]") if ctx else ""
        entries.append(
            (seg_idx, f"{ctx_str} REPLIQUE: {segments[seg_idx].text!r}"))

    # Pack the lines into batches that stay within the prompt budget.
    budget = _MAX_PROMPT_CHARS - len(hint) - 400  # margin for the instructions
    batches: list[list[tuple[int, str]]] = []
    batch: list[tuple[int, str]] = []
    used = 0
    for entry in entries:
        cost = len(entry[1]) + 8  # room for the "[n]" prefix and the newline
        if batch and (used + cost > budget
                      or len(batch) >= _CHUNK_MAX_DIALOGUES):
            batches.append(batch)
            batch, used = [], 0
        batch.append(entry)
        used += cost
    if batch:
        batches.append(batch)

    for batch in batches:
        lines = [f"[{j}]{body}" for j, (_idx, body) in enumerate(batch)]
        prompt = (
            "Repliques au locuteur indetermine. Pour chacune, en t'appuyant sur les "
            "repliques voisines DEJA attribuees (alternance des tours) et le "
            "contexte, indique qui parle (recopie le nom depuis la liste ; "
            "'inconnu' si toujours indeterminable)."
            f"{hint}\n\n" + "\n".join(lines) +
            '\n\nReponds par un tableau JSON: [{"i": 0, "speaker": "Holden"}, ...]'
        )
        # `_truncate` stays as a last resort for a single oversized line.
        result = gemma.generate_json(_truncate(prompt),
                                     system=get_settings().prompt_speakers_refine)

        # The model may return "i" as a string; coerce it to the local number.
        by_i: dict[int, object] = {}
        for item in ([result] if isinstance(result, dict) else result):
            if not isinstance(item, dict) or "i" not in item:
                continue
            try:
                by_i[int(item["i"])] = item.get("speaker")
            except (TypeError, ValueError):
                continue

        for j, (seg_idx, _body) in enumerate(batch):
            new = str(by_i.get(j) or "").strip()
            # Only overwrite when the model gives an actual name.
            if new and new.lower() not in _UNKNOWN:
                segments[seg_idx].speaker = new
|
||||
|
||||
|
||||
# --- Extraction du casting (Gemma) ------------------------------------------
|
||||
# Le prompt systeme est editable dans les reglages (settings.prompt_characters).
|
||||
|
||||
|
||||
def extract_characters(text: str, gemma: Gemma) -> list[Character]:
    """Ask the model for the characters of `text` and their attributes.

    The text is cut to the prompt budget first. Answer items that are not
    objects or carry no name are ignored; gender and age go through `_norm`.
    """
    prompt = (
        "A partir de l'extrait suivant, liste les personnages qui parlent ou "
        "sont nommes. Pour chacun, donne: name (nom court canonique), gender "
        "(male/female/unknown), age (child/young/adult/old/unknown), et une "
        "courte description. Ignore les figurants sans nom.\n\n"
        f"EXTRAIT:\n{_truncate(text)}\n\n"
        'Reponds par un tableau JSON: '
        '[{"name":"Holden","gender":"male","age":"adult","description":"..."}]'
    )
    system = get_settings().prompt_characters
    answer = gemma.generate_json(prompt, system=system)
    return [
        Character(
            name=str(item["name"]).strip(),
            gender=_norm(item.get("gender")),
            age=_norm(item.get("age")),
            description=(item.get("description") or None),
        )
        for item in answer
        if isinstance(item, dict) and item.get("name")
    ]
|
||||
|
||||
|
||||
def merge_characters(existing: list[Character], new: list[Character]) -> list[Character]:
    """Merge two character lists by case-insensitive name.

    A character already in `existing` is kept (and updated in place): only its
    empty `gender`, `age` and `description` fields are filled from the new
    one. Unknown names are appended in order of appearance.
    """
    merged = {}
    for known in existing:
        merged[known.name.lower()] = known
    for incoming in new:
        slot = incoming.name.lower()
        if slot not in merged:
            merged[slot] = incoming
            continue
        current = merged[slot]
        for attr in ("gender", "age", "description"):
            setattr(current, attr,
                    getattr(current, attr) or getattr(incoming, attr))
    return list(merged.values())
|
||||
|
||||
|
||||
def _norm(value) -> Optional[str]:
|
||||
if not value:
|
||||
return None
|
||||
v = str(value).strip().lower()
|
||||
return v if v and v != "unknown" else None
|
||||
|
||||
|
||||
# --- Helpers -----------------------------------------------------------------
|
||||
|
||||
# Garde-fou de contexte (caracteres) pour rester dans une fenetre raisonnable.
|
||||
_MAX_PROMPT_CHARS = 24000
|
||||
|
||||
|
||||
def _truncate(text: str) -> str:
|
||||
return text if len(text) <= _MAX_PROMPT_CHARS else text[:_MAX_PROMPT_CHARS]
|
||||
|
||||
|
||||
# --- Detection des incises (deterministe, conscience du casting) -------------
|
||||
# Les incises sont annotees par des bornes (offsets) sur la replique persistee
|
||||
# (non destructif) ; le rendu les fait porter par la voix du narrateur. Deux
|
||||
# passes complementaires :
|
||||
# 1. inversion verbe-pronom ("dit-il", "coupa-t-elle") ;
|
||||
# 2. nominale : verbe de parole + sujet connu (nom du casting OU nom de role,
|
||||
# ex: "compatit Holden", "lanca Drummer", "informa le soldat").
|
||||
# La passe nominale s'appuie sur la liste des personnages -> peu de faux positifs
|
||||
# et permet d'extraire le locuteur explicite (seeding de l'attribution).
|
||||
|
||||
# Pronom objet eventuel devant le verbe ("lui demanda un garde").
|
||||
_CLITIC = r"(?:lui|leur|nous|vous|me|te|se|y|en|[mts]['’])"
|
||||
|
||||
# Formes conjuguees de verbes de parole (3e pers., passe simple / present /
|
||||
# imparfait). Liste curee : on prefere rater une incise que d'en inventer une.
|
||||
_SPEECH_VERBS = {
|
||||
"dit", "disait", "redit", "répondit", "repondit", "répond", "repond",
|
||||
"répondait", "repondait", "demanda", "demandait", "demande", "interrogea",
|
||||
"questionna", "ecria", "écria", "exclama", "enquit", "lança", "lanca",
|
||||
"lançait", "lance", "murmura", "chuchota", "souffla", "soupira", "ajouta",
|
||||
"ajoute", "reprit", "poursuivit", "poursuit", "continua", "enchaîna",
|
||||
"enchaina", "fit", "faisait", "remarqua", "observa", "nota", "déclara",
|
||||
"declara", "affirma", "assura", "rétorqua", "retorqua", "répliqua",
|
||||
"repliqua", "riposta", "objecta", "protesta", "insista", "renchérit",
|
||||
"rencherit", "acquiesça", "acquiesca", "admit", "avoua", "convint",
|
||||
"concéda", "conceda", "rectifia", "corrigea", "précisa", "precisa",
|
||||
"expliqua", "raconta", "annonça", "annonca", "proclama", "ordonna",
|
||||
"commanda", "supplia", "implora", "gémit", "gemit", "grogna", "ronchonna",
|
||||
"maugréa", "maugrea", "marmonna", "glissa", "lâcha", "lacha", "coupa",
|
||||
"interrompit", "conclut", "compléta", "completa", "suggéra", "suggera",
|
||||
"proposa", "promit", "jura", "menaça", "menaca", "ironisa", "plaisanta",
|
||||
"railla", "cria", "hurla", "tonna", "gronda", "rugit", "susurra",
|
||||
"compatit", "salua", "appela", "héla", "hela", "interpella", "balbutia",
|
||||
"bredouilla", "bafouilla", "gloussa", "ricana", "siffla", "tempêta",
|
||||
"tempeta", "rétorque", "lâche", "informa", "renseigna", "indiqua",
|
||||
"rappela", "avertit", "prévint", "prevint", "intima", "rétorquait",
|
||||
"lançait", "questionnait", "reconnut", "constata", "répéta", "repeta",
|
||||
}
|
||||
|
||||
# Noms de role pouvant etre sujet d'une incise ("informa le soldat").
|
||||
_ROLE_NOUNS = {
|
||||
"garde", "soldat", "sentinelle", "gardien", "prêtre", "pretre", "homme",
|
||||
"femme", "fille", "garçon", "garcon", "vieille", "vieillard", "capitaine",
|
||||
"lieutenant", "sergent", "général", "general", "amiral", "officier", "voix",
|
||||
"inconnu", "inconnue", "étranger", "etranger", "enfant", "serviteur",
|
||||
"servante", "messager", "domestique", "médecin", "medecin",
|
||||
}
|
||||
|
||||
# Mots vides ignores quand on indexe les tokens d'un nom de personnage.
|
||||
_NAME_STOP = {
|
||||
"le", "la", "les", "un", "une", "de", "du", "des", "monsieur", "madame",
|
||||
"mademoiselle", "m", "mme", "mlle", "mr", "dr", "docteur", "saint", "sainte",
|
||||
}
|
||||
|
||||
# Ponctuations qui terminent la partie parlee : si l'incise les suit, tout le
|
||||
# reste de la replique est de la narration (la parole est finie). Apres une
|
||||
# simple virgule au contraire, le dialogue reprend apres l'incise.
|
||||
_SENTENCE_FINAL = {"", ".", "!", "?", "…"}
|
||||
|
||||
|
||||
def _incise_end(text: str, close_end: int, lead: str) -> int:
|
||||
"""Fin effective de l'incise : jusqu'au bout de la replique si la parole
|
||||
etait deja close a gauche (`.`/`!`/`?`/`…` ou debut), sinon la cloture."""
|
||||
return len(text) if lead in _SENTENCE_FINAL else close_end
|
||||
|
||||
|
||||
# Passe 1 : inversion verbe-(t-)pronom, ancree sur une ponctuation a gauche
|
||||
# (virgule, point, ?, !, …) ou le debut de la replique.
|
||||
_INVERSION_RE = re.compile(
|
||||
r"(?P<lead>[,.!?…]|^)\s*"
|
||||
r"(?P<inc>" + _INCISE_VERB + r"-(?:t-)?" + _INCISE_PRON +
|
||||
r"(?:\s+[^.!?…»\",;]*?)?)" # complements eventuels ("dit-il en souriant")
|
||||
r"(?P<close>[.!?…,])", # cloture : ponctuation forte OU virgule
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def _inversion_spans(text: str) -> list[tuple[int, int]]:
    """Return (start, end) offsets of verb-pronoun inversion incises in `text`.

    The start is where the incise group of `_INVERSION_RE` begins; the end is
    decided by `_incise_end` from the punctuation found on its left.
    """
    # NOTE(review): the verb part of the pattern is not limited to speech
    # verbs, so an inverted question such as "Est-il la ?" looks like it would
    # be flagged too — confirm against real chapters.
    spans: list[tuple[int, int]] = []
    for hit in _INVERSION_RE.finditer(text):
        stop = _incise_end(text, hit.end("close"), hit.group("lead"))
        spans.append((hit.start("inc"), stop))
    return spans
|
||||
|
||||
|
||||
def _name_token_index(names) -> dict[str, str]:
|
||||
"""Index token -> nom canonique (tokens distinctifs uniquement).
|
||||
|
||||
Un token partage par plusieurs personnages est ambigu et ecarte.
|
||||
"""
|
||||
idx: dict[str, str] = {}
|
||||
ambiguous: set[str] = set()
|
||||
for name in names or ():
|
||||
for tok in re.split(r"[^\wÀ-ÿ]+", name):
|
||||
t = tok.lower()
|
||||
if len(t) < 2 or t in _NAME_STOP:
|
||||
continue
|
||||
if t in idx and idx[t] != name:
|
||||
ambiguous.add(t)
|
||||
else:
|
||||
idx[t] = name
|
||||
for t in ambiguous:
|
||||
idx.pop(t, None)
|
||||
return idx
|
||||
|
||||
|
||||
# Nom propre : initiale majuscule (motif sensible a la casse).
|
||||
_PROPER = r"[A-ZÀ-Ÿ][\wÀ-ÿ’'\-]+"
|
||||
_REJECT = object() # le sujet n'en est pas un -> pas une incise
|
||||
|
||||
|
||||
def _classify_subject(subj: str, idx: dict[str, str]):
|
||||
"""Locuteur porte par le sujet d'une incise nominale.
|
||||
|
||||
- personnage connu -> nom canonique ;
|
||||
- nom propre (capitalise) inconnu -> nom de surface (seed quand meme : le
|
||||
texte le nomme, independamment de la fiabilite de l'extraction) ;
|
||||
- nom de role generique ("le soldat") -> None (incise reelle, pas de seed) ;
|
||||
- mot quelconque -> _REJECT (pas une incise).
|
||||
"""
|
||||
low = subj.lower()
|
||||
if low in idx:
|
||||
return idx[low]
|
||||
if low in _ROLE_NOUNS:
|
||||
return None
|
||||
if subj[:1].isupper() and len(low) >= 2 and low not in _NAME_STOP:
|
||||
return subj.strip("’'")
|
||||
return _REJECT
|
||||
|
||||
|
||||
def _nominal_matches(text: str, names) -> list[tuple[int, int, Optional[str]]]:
    """Pass 2: return (start, end, speaker) for every nominal incise in `text`.

    A nominal incise is a speech verb followed by a subject: a name token of
    `names`, a role noun, or any capitalized word. The speaker comes from
    `_classify_subject`; matches it rejects are dropped.
    """
    token_index = _name_token_index(names)
    longest_first = {"key": len, "reverse": True}

    known = sorted(set(token_index) | _ROLE_NOUNS, **longest_first)
    known_alt = "|".join(re.escape(word) for word in known)
    # Known names/roles match case-insensitively; the proper-noun fallback is
    # case-sensitive so it does not pick up a determiner such as "un"/"le".
    # This is why the pattern carries no global IGNORECASE flag.
    subject_alt = f"(?i:{known_alt})|{_PROPER}" if known_alt else _PROPER
    verb_alt = "|".join(
        re.escape(verb) for verb in sorted(_SPEECH_VERBS, **longest_first))

    pattern = re.compile("".join([
        r"(?P<lead>[,.!?…]|^)\s*",
        r"(?P<inc>(?:(?i:" + _CLITIC + r")\s+)?",
        r"(?i:" + verb_alt + r")\b",
        r"[^.!?…»\",;]{0,40}?\b",
        r"(?P<subj>" + subject_alt + r")\b",
        r"[^.!?…»\",;]*?)",
        r"(?P<close>[.!?…,])",
    ]))

    found: list[tuple[int, int, Optional[str]]] = []
    for hit in pattern.finditer(text):
        speaker = _classify_subject(hit.group("subj"), token_index)
        if speaker is not _REJECT:
            stop = _incise_end(text, hit.end("close"), hit.group("lead"))
            found.append((hit.start("inc"), stop, speaker))
    return found
|
||||
|
||||
|
||||
def _merge_spans(spans: list[tuple[int, int]]) -> list[Incise]:
|
||||
"""Trie et fusionne (sans chevauchement) une liste de bornes -> Incise."""
|
||||
out: list[Incise] = []
|
||||
last_end = -1
|
||||
for s, e in sorted(set(spans)):
|
||||
if s < last_end: # chevauchement -> on garde le premier vu
|
||||
continue
|
||||
out.append(Incise(start=s, end=e))
|
||||
last_end = e
|
||||
return out
|
||||
|
||||
|
||||
def detect_incises(text: str, *, names=None) -> list[Incise]:
    """Return the incise bounds found in a dialogue line.

    Combines the verb-pronoun inversion pass with the nominal pass (which
    uses `names`), then removes overlaps through `_merge_spans`.
    """
    inversion = _inversion_spans(text)
    nominal = [(start, end)
               for start, end, _speaker in _nominal_matches(text, names or set())]
    return _merge_spans(inversion + nominal)
|
||||
|
||||
|
||||
def incise_speaker(text: str, incise: Incise, names) -> Optional[str]:
    """Return the speaker named by a nominal incise ("compatit Holden").

    Looks for the first nominal match with exactly the bounds of `incise` and
    returns its speaker, which may be None for a role noun. Returns None when
    no nominal match has those bounds.
    """
    same_bounds = (
        speaker
        for start, end, speaker in _nominal_matches(text, names)
        if start == incise.start and end == incise.end
    )
    return next(same_bounds, None)
|
||||
|
||||
|
||||
def iter_incise_pieces(
    text: str, incises: list[Incise]
) -> list[tuple[bool, str]]:
    """Cut `text` into (is_incise, sub_text) pieces along the incise bounds.

    Used at render time: dialogue pieces get the character voice, incise
    pieces the narrator voice. Pieces are stripped and blank ones dropped.
    An incise starting before the end of the previous one is ignored.
    """
    pieces: list[tuple[bool, str]] = []

    def _emit(is_incise: bool, chunk: str) -> None:
        chunk = chunk.strip()
        if chunk:
            pieces.append((is_incise, chunk))

    pos = 0
    for span in sorted(incises, key=lambda item: item.start):
        if span.start < pos:  # overlap guard
            continue
        _emit(False, text[pos:span.start])
        _emit(True, text[span.start:span.end])
        pos = span.end
    _emit(False, text[pos:])
    return pieces
|
||||
|
||||
|
||||
def analyze_chapter(
    chapter: Chapter,
    ct: ChapterText,
    gemma: Gemma,
    *,
    book_chars: Optional[list[Character]] = None,
    dedup_gemma: Optional[Gemma] = None,
) -> tuple[ChapterAnalysis, list[Character]]:
    """Run the full analysis of one chapter.

    Steps, in order: rule-based segmentation, character extraction,
    reconciliation against the cast accumulated for the book, incise
    annotation with seeding of explicitly named speakers, model attribution of
    the remaining lines, optional retroactive pass. Dialogue lines are kept
    whole; incises are stored as bounds on the segment.

    Args:
        chapter: Chapter metadata (`index`, `pov`).
        ct: Chapter text (`paragraphs`, `title`).
        gemma: Model used for extraction and attribution.
        book_chars: Cast accumulated so far for the book.
        dedup_gemma: Forwarded to `reconcile_characters` for the first
            reconciliation.

    Returns:
        (analysis, cast), where the cast is the reconciled list returned by
        the last `reconcile_characters` call.
    """
    from ..casting.dedup import reconcile_characters

    segments = segment_chapter_text(ct)
    chapter_text = "\n".join(ct.paragraphs)
    found = extract_characters(chapter_text, gemma)

    # Dedup BEFORE attribution so the model is given canonical names.
    chars, name_map = reconcile_characters(book_chars or [], found, dedup_gemma)

    # Canonical list limited to this chapter (detected characters + POV).
    chapter_canon = set()
    for person in found:
        canonical = name_map.get(person.name.strip().lower()) or person.name
        chapter_canon.add(canonical.strip().lower())
    chapter_chars = [c for c in chars if c.name.strip().lower() in chapter_canon]
    if chapter.pov:
        pov_key = chapter.pov.strip().lower()
        for c in chars:
            if c in chapter_chars:
                continue
            if (pov_key in c.name.lower()
                    or any(pov_key in alias.lower() for alias in c.aliases)):
                chapter_chars.append(c)

    # Deterministic incise annotation (bounds only) + seeding: a nominal
    # incise naming a character fixes the speaker BEFORE the model is asked.
    names = {c.name for c in chars}
    for seg in segments:
        if seg.type is not SegmentType.DIALOGUE:
            continue
        seg.incises = detect_incises(seg.text, names=names)
        # Lazy scan: stops at the first incise that yields a speaker.
        seeded = next(
            filter(None, (incise_speaker(seg.text, inc, names)
                          for inc in seg.incises)),
            None,
        )
        if seeded:
            seg.speaker = seeded

    conf = attribute_speakers(
        segments, gemma, characters=chapter_chars, pov=chapter.pov)
    if get_settings().retro_pass_use_gemma:
        _refine_unknown_speakers(
            segments, gemma, characters=chapter_chars, confidence=conf)

    # Second reconciliation: no new characters and no model, only the speaker
    # names found on the segments are passed along.
    speaker_names = [s.speaker for s in segments]
    chars, _ = reconcile_characters(
        chars, [], None, speaker_names=speaker_names)

    analysis = ChapterAnalysis(
        index=chapter.index, title=ct.title, segments=segments)
    return analysis, chars
|
||||
Reference in New Issue
Block a user