Ajout d'un outil de benchmark des modèles d'analyse + support des modèles à raisonnement

- Nouvelle commande `inkflow benchmark` : compare la sortie d'analyse aux fichiers
  de référence (data/<slug>/reference/), met plusieurs modèles en concurrence,
  table rich + rapport JSON. Métriques : attribution de locuteur, incises, type/glued.
  Flags --models, --temperature, --reasoning, --stream, --use-cached + suivi par chapitre.
- analysis/benchmark.py : scoring pur (testable) + runner multi-modèles (un MLX à la fois).
- gemma.py : support des modèles à raisonnement (retrait de la pensée, désactivation
  via enable_thinking hors --reasoning, arrêt anticipé sur JSON complet, plafond +
  température dédiés anti-boucle), récupération du chat_template manquant (fix Mistral),
  streaming des tokens (set_token_sink).
- settings.py : gemma_reasoning, gemma_reasoning_max_tokens, gemma_reasoning_temperature.
- Tests : test_benchmark.py (scoring pur), test_gemma_reasoning.py.

Conclusion benchmark : Qwen3.6-27B-8bit non-raisonnant = meilleur modèle d'analyse.
This commit is contained in:
2026-06-21 03:25:50 +02:00
parent c1ab679686
commit 141df5f04e
6 changed files with 952 additions and 12 deletions

View File

@@ -0,0 +1,159 @@
"""Tests purs du scoring de benchmark (sans Gemma ni disque).
Monte des `ChapterAnalysis` synthetiques et verifie les metriques :
alignement, attribution du locuteur (avec normalisation d'alias), incises
(exact vs chevauchement), type/glued, et micro-moyenne sur plusieurs chapitres.
"""
from __future__ import annotations
from inkflow.analysis.benchmark import (
_score_counts,
align,
aggregate,
score_chapter,
)
from inkflow.models import (
Cast,
Character,
ChapterAnalysis,
Incise,
Segment,
SegmentType,
)
def _seg(text, *, type="narration", speaker="narrateur", glued=False, incises=None):
    """Build a synthetic ``Segment`` for the scoring tests.

    ``type`` is converted through ``SegmentType``, ``glued`` is forwarded as
    ``glued_to_prev``, and a falsy ``incises`` is replaced by a fresh empty
    list so segments never share one list instance.
    """
    # NOTE: `type` shadows the builtin, but it is the keyword every caller
    # in this module already uses, so the name is kept.
    kind = SegmentType(type)
    spans = incises if incises else []
    return Segment(
        type=kind,
        text=text,
        speaker=speaker,
        glued_to_prev=glued,
        incises=spans,
    )
def _chap(index, segments):
    """Wrap ``segments`` in a ``ChapterAnalysis`` titled ``ch<index>``."""
    chapter_title = f"ch{index}"
    return ChapterAnalysis(index=index, title=chapter_title, segments=segments)
# --- Alignement --------------------------------------------------------------
def test_alignement_1_1_textes_identiques():
    """Segments differing only by case/whitespace must pair up one-to-one."""
    reference = _chap(5, [_seg("Bonjour."), _seg("— Salut.", type="dialogue")])
    # The hypothesis differs from the reference only by case and a trailing space.
    hypothesis = _chap(5, [_seg("bonjour. "), _seg("— salut.", type="dialogue")])

    matched = align(reference, hypothesis)

    assert len(matched) == 2
    # Neither side of any pair may be left unmatched.
    for ref_seg, hyp_seg in matched:
        assert ref_seg is not None and hyp_seg is not None
def test_alignement_segment_hypothese_en_trop():
    """An extra hypothesis segment must produce a 'sans correspondance' warning."""
    reference = _chap(5, [_seg(label) for label in ("A", "B")])
    hypothesis = _chap(5, [_seg(label) for label in ("A", "X", "B")])

    counts = _score_counts(reference, hypothesis, None)

    # Segment "X" has no counterpart in the reference, so a warning is expected.
    flagged = [msg for msg in counts.warnings if "sans correspondance" in msg]
    assert flagged
# --- Attribution du locuteur -------------------------------------------------
def test_speaker_parfait():
    """Identical reference and hypothesis give perfect speaker accuracy."""
    template = [
        _seg("narration"),
        _seg("— Bonjour.", type="dialogue", speaker="Holden"),
        _seg("— Salut.", type="dialogue", speaker="Kajri"),
    ]

    def _clone():
        # Deep copies so the two chapters do not share segment objects.
        return [segment.model_copy(deep=True) for segment in template]

    reference = _chap(5, _clone())
    hypothesis = _chap(5, _clone())

    result = score_chapter(reference, hypothesis)

    assert result.speaker_acc_dialogue == 1.0
    assert result.speaker_acc_all == 1.0
    assert result.errors == []
def test_speaker_avec_erreurs():
    """One wrong speaker out of two dialogues: 0.5 accuracy, one logged error."""
    reference_segments = [
        _seg("— A.", type="dialogue", speaker="Holden"),
        _seg("— B.", type="dialogue", speaker="Kajri"),
    ]
    hypothesis_segments = [
        _seg("— A.", type="dialogue", speaker="Holden"),
        # Wrong attribution: the reference says "Kajri".
        _seg("— B.", type="dialogue", speaker="Drummer"),
    ]
    reference = _chap(5, reference_segments)
    hypothesis = _chap(5, hypothesis_segments)

    result = score_chapter(reference, hypothesis)

    assert result.speaker_acc_dialogue == 0.5
    assert len(result.errors) == 1
    mistake = result.errors[0]
    assert mistake.expected == "Kajri"
    assert mistake.got == "Drummer"
    # The confusion table is indexed as [expected][got].
    assert result.confusion["Kajri"]["Drummer"] == 1
def test_speaker_normalisation_alias():
    """An alias counts as correct only when a cast is supplied to resolve it."""
    reference = _chap(
        5, [_seg("— A.", type="dialogue", speaker="Camina Drummer")]
    )
    # The hypothesis uses the alias instead of the full name.
    hypothesis = _chap(5, [_seg("— A.", type="dialogue", speaker="Drummer")])
    drummer = Character(name="Camina Drummer", aliases=["Drummer"])
    cast = Cast(characters=[drummer])

    # Without a cast the names differ, so this is scored as an error.
    without_cast = score_chapter(reference, hypothesis, None)
    assert without_cast.speaker_acc_dialogue == 0.0

    # With the cast the alias is resolved, so this is scored as correct.
    with_cast = score_chapter(reference, hypothesis, cast)
    assert with_cast.speaker_acc_dialogue == 1.0
# --- Incises -----------------------------------------------------------------
def test_incises_exact_vs_overlap():
    """A slightly shifted incise span matches by overlap but not exactly."""
    line = "— A dit-il."
    expected_span = Incise(start=4, end=11)
    # Off by one at the end: still largely overlapping, no longer identical.
    shifted_span = Incise(start=4, end=10)
    reference = _chap(
        5, [_seg(line, type="dialogue", speaker="X", incises=[expected_span])]
    )
    hypothesis = _chap(
        5, [_seg(line, type="dialogue", speaker="X", incises=[shifted_span])]
    )

    result = score_chapter(reference, hypothesis)

    assert result.incise_exact_f1 < 1.0
    assert result.incise_overlap_f1 == 1.0
def test_incises_faux_positif_baisse_precision():
    """An invented incise lowers overlap precision and leaves recall at 1.0."""
    reference = _chap(5, [_seg("— A.", type="dialogue", speaker="X", incises=[])])
    # The hypothesis invents an incise that the reference does not have.
    invented = Incise(start=0, end=3)
    hypothesis = _chap(
        5, [_seg("— A.", type="dialogue", speaker="X", incises=[invented])]
    )

    result = score_chapter(reference, hypothesis)

    assert result.incise_overlap_p < 1.0
    # The reference has no incise, so there is nothing to recall.
    assert result.incise_overlap_r == 1.0
def test_incises_manque_baisse_rappel():
    """A missed incise lowers overlap recall and leaves precision at 1.0."""
    line = "— A dit-il."
    expected_span = Incise(start=4, end=11)
    reference = _chap(
        5, [_seg(line, type="dialogue", speaker="X", incises=[expected_span])]
    )
    # The hypothesis reports no incise at all.
    hypothesis = _chap(5, [_seg(line, type="dialogue", speaker="X", incises=[])])

    result = score_chapter(reference, hypothesis)

    assert result.incise_overlap_r < 1.0
    assert result.incise_overlap_p == 1.0
# --- Type / glued ------------------------------------------------------------
def test_type_et_glued():
    """One type mismatch and one glued mismatch out of two give 0.5 each."""
    reference = _chap(
        5,
        [
            _seg("A", type="narration"),
            _seg("— B", type="dialogue", glued=True),
        ],
    )
    hypothesis = _chap(
        5,
        [
            # Wrong type on the first segment, wrong glued flag on the second.
            _seg("A", type="dialogue"),
            _seg("— B", type="dialogue", glued=False),
        ],
    )

    result = score_chapter(reference, hypothesis)

    assert result.type_acc == 0.5
    assert result.glued_acc == 0.5
# --- Agregat (micro-moyenne) -------------------------------------------------
def test_aggregate_micro_moyenne():
    """The aggregate is a micro-average over dialogues, not a mean of chapters."""
    # Chapter 1: a single dialogue, correctly attributed.
    ref_one = _chap(1, [_seg("— A.", type="dialogue", speaker="X")])
    hyp_one = _chap(1, [_seg("— A.", type="dialogue", speaker="X")])

    # Chapter 2: three dialogues, the last one wrongly attributed.
    ref_two = _chap(
        2,
        [
            _seg("— B.", type="dialogue", speaker="X"),
            _seg("— C.", type="dialogue", speaker="Y"),
            _seg("— D.", type="dialogue", speaker="Z"),
        ],
    )
    hyp_two = _chap(
        2,
        [
            _seg("— B.", type="dialogue", speaker="X"),
            _seg("— C.", type="dialogue", speaker="Y"),
            _seg("— D.", type="dialogue", speaker="WRONG"),
        ],
    )

    counts_one = _score_counts(ref_one, hyp_one, None)
    counts_two = _score_counts(ref_two, hyp_two, None)
    score_one = score_chapter(ref_one, hyp_one)
    score_two = score_chapter(ref_two, hyp_two)

    overall = aggregate([score_one, score_two], [counts_one, counts_two])

    # Micro: 3 correct / 4 dialogues = 0.75 (not the mean of 1.0 and 0.666).
    assert overall.n_dialogue == 4
    assert abs(overall.speaker_acc_dialogue - 0.75) < 1e-9
    assert overall.index == -1