Ajout d'un outil de benchmark des modèles d'analyse + support des modèles à raisonnement

- Nouvelle commande `inkflow benchmark` : compare la sortie d'analyse aux fichiers
  de référence (data/<slug>/reference/), met plusieurs modèles en concurrence,
  table rich + rapport JSON. Métriques : attribution de locuteur, incises, type/glued.
  Flags --models, --temperature, --reasoning, --stream, --use-cached + suivi par chapitre.
- analysis/benchmark.py : scoring pur (testable) + runner multi-modèles (un MLX à la fois).
- gemma.py : support des modèles à raisonnement (retrait de la pensée, désactivation
  via enable_thinking hors --reasoning, arrêt anticipé sur JSON complet, plafond +
  température dédiés anti-boucle), récupération du chat_template manquant (fix Mistral),
  streaming des tokens (set_token_sink).
- settings.py : gemma_reasoning, gemma_reasoning_max_tokens, gemma_reasoning_temperature.
- Tests : test_benchmark.py (scoring pur), test_gemma_reasoning.py.

Conclusion benchmark : Qwen3.6-27B-8bit non-raisonnant = meilleur modèle d'analyse.
This commit is contained in:
2026-06-21 03:25:50 +02:00
parent c1ab679686
commit 141df5f04e
6 changed files with 952 additions and 12 deletions

View File

@@ -0,0 +1,159 @@
"""Tests purs du scoring de benchmark (sans Gemma ni disque).
Monte des `ChapterAnalysis` synthetiques et verifie les metriques :
alignement, attribution du locuteur (avec normalisation d'alias), incises
(exact vs chevauchement), type/glued, et micro-moyenne sur plusieurs chapitres.
"""
from __future__ import annotations
from inkflow.analysis.benchmark import (
_score_counts,
align,
aggregate,
score_chapter,
)
from inkflow.models import (
Cast,
Character,
ChapterAnalysis,
Incise,
Segment,
SegmentType,
)
def _seg(text, *, type="narration", speaker="narrateur", glued=False, incises=None):
    """Build a ``Segment`` for tests, with keyword-only overrides.

    ``type`` is converted through ``SegmentType``; a falsy ``incises``
    (including the default ``None``) is replaced by a new empty list.
    """
    segment_kind = SegmentType(type)
    incise_list = incises or []
    return Segment(
        type=segment_kind,
        text=text,
        speaker=speaker,
        glued_to_prev=glued,
        incises=incise_list,
    )
def _chap(index, segments):
    """Wrap ``segments`` in a ``ChapterAnalysis`` titled ``ch<index>``."""
    title = f"ch{index}"
    return ChapterAnalysis(index=index, title=title, segments=segments)
# --- Alignement --------------------------------------------------------------
def test_alignement_1_1_textes_identiques():
    """Texts differing only by case/whitespace must pair up one-to-one."""
    reference = _chap(5, [_seg("Bonjour."), _seg("— Salut.", type="dialogue")])
    # Same texts as the reference, lower-cased and with a trailing space.
    hypothesis = _chap(5, [_seg("bonjour. "), _seg("— salut.", type="dialogue")])

    matched = align(reference, hypothesis)

    assert len(matched) == 2
    # Every pair must have both a reference side and a hypothesis side.
    for ref_seg, hyp_seg in matched:
        assert ref_seg is not None and hyp_seg is not None
def test_alignement_segment_hypothese_en_trop():
    """An extra hypothesis segment must be reported through a warning."""
    reference = _chap(5, [_seg("A"), _seg("B")])
    hypothesis = _chap(5, [_seg("A"), _seg("X"), _seg("B")])

    counts = _score_counts(reference, hypothesis, None)

    # Segment "X" is not covered by the reference, so a warning is expected.
    unmatched_warnings = [w for w in counts.warnings if "sans correspondance" in w]
    assert unmatched_warnings
# --- Attribution du locuteur -------------------------------------------------
def test_speaker_parfait():
    """Identical reference and hypothesis give perfect speaker accuracy."""
    template = [
        _seg("narration"),
        _seg("— Bonjour.", type="dialogue", speaker="Holden"),
        _seg("— Salut.", type="dialogue", speaker="Kajri"),
    ]

    def fresh_chapter():
        # Each chapter gets its own deep copies of the template segments.
        return _chap(5, [seg.model_copy(deep=True) for seg in template])

    reference = fresh_chapter()
    hypothesis = fresh_chapter()

    result = score_chapter(reference, hypothesis)

    assert result.speaker_acc_dialogue == 1.0
    assert result.speaker_acc_all == 1.0
    assert result.errors == []
def test_speaker_avec_erreurs():
    """One wrong speaker out of two dialogues: accuracy, error and confusion."""
    first_line = ("— A.", "Holden")
    reference = _chap(5, [
        _seg(first_line[0], type="dialogue", speaker=first_line[1]),
        _seg("— B.", type="dialogue", speaker="Kajri"),
    ])
    hypothesis = _chap(5, [
        _seg(first_line[0], type="dialogue", speaker=first_line[1]),
        # Wrong attribution: the reference says "Kajri".
        _seg("— B.", type="dialogue", speaker="Drummer"),
    ])

    result = score_chapter(reference, hypothesis)

    assert result.speaker_acc_dialogue == 0.5
    assert len(result.errors) == 1
    mistake = result.errors[0]
    assert mistake.expected == "Kajri"
    assert mistake.got == "Drummer"
    assert result.confusion["Kajri"]["Drummer"] == 1
def test_speaker_normalisation_alias():
    """An alias only counts as the right speaker when a cast is supplied."""
    reference = _chap(5, [_seg("— A.", type="dialogue", speaker="Camina Drummer")])
    # The hypothesis uses the alias instead of the full name.
    hypothesis = _chap(5, [_seg("— A.", type="dialogue", speaker="Drummer")])
    drummer = Character(name="Camina Drummer", aliases=["Drummer"])
    cast = Cast(characters=[drummer])

    # Without a cast the two names differ, so this counts as an error.
    without_cast = score_chapter(reference, hypothesis, None)
    assert without_cast.speaker_acc_dialogue == 0.0

    # With the cast the alias is resolved, so the attribution is correct.
    with_cast = score_chapter(reference, hypothesis, cast)
    assert with_cast.speaker_acc_dialogue == 1.0
# --- Incises -----------------------------------------------------------------
def test_incises_exact_vs_overlap():
    """A slightly shifted span passes the overlap metric but not the exact one."""
    text = "— A dit-il."

    def chapter_with_span(start, end):
        span = Incise(start=start, end=end)
        return _chap(5, [_seg(text, type="dialogue", speaker="X", incises=[span])])

    reference = chapter_with_span(4, 11)
    # Span ends one character earlier: large overlap, not an exact match.
    hypothesis = chapter_with_span(4, 10)

    result = score_chapter(reference, hypothesis)

    assert result.incise_exact_f1 < 1.0
    assert result.incise_overlap_f1 == 1.0
def test_incises_faux_positif_baisse_precision():
    """An invented incise lowers overlap precision and leaves recall at 1.0."""
    reference = _chap(5, [_seg("— A.", type="dialogue", speaker="X", incises=[])])
    # The hypothesis makes up an incise that the reference does not have.
    invented = Incise(start=0, end=3)
    hypothesis = _chap(
        5, [_seg("— A.", type="dialogue", speaker="X", incises=[invented])]
    )

    result = score_chapter(reference, hypothesis)

    assert result.incise_overlap_p < 1.0
    # The reference has no incise, so there is nothing to recall.
    assert result.incise_overlap_r == 1.0
def test_incises_manque_baisse_rappel():
    """A missed incise lowers overlap recall and leaves precision at 1.0."""
    text = "— A dit-il."
    expected_span = Incise(start=4, end=11)
    reference = _chap(
        5, [_seg(text, type="dialogue", speaker="X", incises=[expected_span])]
    )
    # The hypothesis reports no incise at all.
    hypothesis = _chap(5, [_seg(text, type="dialogue", speaker="X", incises=[])])

    result = score_chapter(reference, hypothesis)

    assert result.incise_overlap_r < 1.0
    assert result.incise_overlap_p == 1.0
# --- Type / glued ------------------------------------------------------------
def test_type_et_glued():
    """One type mismatch and one glued mismatch out of two segments each."""
    reference = _chap(5, [
        _seg("A", type="narration"),
        _seg("— B", type="dialogue", glued=True),
    ])
    hypothesis = _chap(5, [
        # Wrong type on the first segment.
        _seg("A", type="dialogue"),
        # Wrong glued flag on the second segment.
        _seg("— B", type="dialogue", glued=False),
    ])

    result = score_chapter(reference, hypothesis)

    assert result.type_acc == 0.5
    assert result.glued_acc == 0.5
# --- Agregat (micro-moyenne) -------------------------------------------------
def test_aggregate_micro_moyenne():
    """Aggregation is a micro-average over dialogues, not a mean of chapters."""
    # Chapter 1: a single dialogue, correctly attributed.
    ref1 = _chap(1, [_seg("— A.", type="dialogue", speaker="X")])
    hyp1 = _chap(1, [_seg("— A.", type="dialogue", speaker="X")])

    # Chapter 2: three dialogues, the last one attributed to the wrong speaker.
    chapter_two_texts = ("— B.", "— C.", "— D.")

    def chapter_two(speakers):
        return _chap(2, [
            _seg(line, type="dialogue", speaker=who)
            for line, who in zip(chapter_two_texts, speakers)
        ])

    ref2 = chapter_two(("X", "Y", "Z"))
    hyp2 = chapter_two(("X", "Y", "WRONG"))

    counts = [_score_counts(ref1, hyp1, None), _score_counts(ref2, hyp2, None)]
    scores = [score_chapter(ref1, hyp1), score_chapter(ref2, hyp2)]
    overall = aggregate(scores, counts)

    # Micro-average: 3 correct out of 4 dialogues = 0.75, not the mean of
    # the per-chapter accuracies 1.0 and 0.666.
    assert overall.n_dialogue == 4
    deviation = abs(overall.speaker_acc_dialogue - 0.75)
    assert deviation < 1e-9
    assert overall.index == -1

View File

@@ -0,0 +1,67 @@
"""Tests purs de `_strip_reasoning` (retrait de la chaine de pensee).
Sans charger de modele : on verifie que la pensee est retiree et que
`_extract_json` recupere bien la reponse FINALE (et non un fragment JSON
parasite present dans la pensee).
"""
from __future__ import annotations
from inkflow.analysis.gemma import (
_extract_json,
_has_complete_json,
_strip_reasoning,
)
def test_has_complete_json_arret_anticipe():
    """``_has_complete_json`` is true only once a full JSON value is present."""
    # Complete JSON -> True (generation can be stopped).
    for finished in ('voici: {"speaker": "Marie"}', '[{"a": 1}]'):
        assert _has_complete_json(finished)

    # Truncated JSON or no JSON at all -> False (generation continues).
    for unfinished in ('{"speaker": "Mar', 'texte sans json'):
        assert not _has_complete_json(unfinished)

    # Streaming case: closed thinking block, then a json fence that is still
    # open but already holds a complete object.
    streamed = '<think>...</think>```json\n{"speaker": "Marie"}'
    buffer = _strip_reasoning(streamed)
    assert _has_complete_json(buffer)
def test_format_a_canaux_gemma4():
    """Channel-style thinking is removed and the final JSON is the one parsed."""
    thinking = (
        '<|channel>thought\n'
        'Thinking Process: la capitale est Paris. Exemple: {"capitale": "..."}\n'
    )
    answer = '<channel|>```json\n{"capitale": "Paris"}\n```'
    raw = thinking + answer

    cleaned = _strip_reasoning(raw)

    # The thinking text and its stray example JSON are gone.
    assert "Thinking Process" not in cleaned
    assert '"..."' not in cleaned
    # The extracted JSON is the final answer.
    parsed = _extract_json(cleaned)
    assert parsed == {"capitale": "Paris"}
def test_balises_think_deepseek():
    """A ``<think>`` block is dropped; the JSON after it is what gets parsed."""
    raw = '<think>je reflechis, peut-etre [1,2]</think>\n[{"speaker": "Holden"}]'

    cleaned = _strip_reasoning(raw)

    assert "reflechis" not in cleaned
    parsed = _extract_json(cleaned)
    assert parsed == [{"speaker": "Holden"}]
def test_sans_raisonnement_inchange():
    """Output without any reasoning markers passes through unchanged."""
    raw = '{"speaker": "Kajri"}'

    untouched = _strip_reasoning(raw)
    assert untouched == raw

    parsed = _extract_json(_strip_reasoning(raw))
    assert parsed == {"speaker": "Kajri"}
def test_pensee_tronquee_sans_fermeture():
    """An unclosed thinking block must not leak its opening channel marker."""
    # Thinking never closed (token budget exhausted): the channel prefix is
    # expected to be skipped rather than returned.
    raw = "<|channel>thought\nje commence a reflechir mais c'est coupe"

    cleaned = _strip_reasoning(raw)

    starts_with_marker = cleaned.startswith("<|channel")
    assert not starts_with_marker
    assert "<channel" not in cleaned
def test_dernier_marqueur_gagne():
    """With several thinking blocks, only the text after the last one is kept."""
    raw = "<think>a</think>milieu<think>b</think>FINAL"

    cleaned = _strip_reasoning(raw)

    assert cleaned == "FINAL"