Ajout d'un outil de benchmark des modèles d'analyse + support des modèles à raisonnement
- Nouvelle commande `inkflow benchmark` : compare la sortie d'analyse aux fichiers de référence (data/<slug>/reference/), met plusieurs modèles en concurrence, table rich + rapport JSON. Métriques : attribution de locuteur, incises, type/glued. Flags --models, --temperature, --reasoning, --stream, --use-cached + suivi par chapitre.
- analysis/benchmark.py : scoring pur (testable) + runner multi-modèles (un MLX à la fois).
- gemma.py : support des modèles à raisonnement (retrait de la pensée, désactivation via enable_thinking hors --reasoning, arrêt anticipé sur JSON complet, plafond + température dédiés anti-boucle), récupération du chat_template manquant (fix Mistral), streaming des tokens (set_token_sink).
- settings.py : gemma_reasoning, gemma_reasoning_max_tokens, gemma_reasoning_temperature.
- Tests : test_benchmark.py (scoring pur), test_gemma_reasoning.py.

Conclusion benchmark : Qwen3.6-27B-8bit non-raisonnant = meilleur modèle d'analyse.
This commit is contained in:
159
backend/tests/test_benchmark.py
Normal file
159
backend/tests/test_benchmark.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""Tests purs du scoring de benchmark (sans Gemma ni disque).
|
||||
|
||||
Monte des `ChapterAnalysis` synthetiques et verifie les metriques :
|
||||
alignement, attribution du locuteur (avec normalisation d'alias), incises
|
||||
(exact vs chevauchement), type/glued, et micro-moyenne sur plusieurs chapitres.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from inkflow.analysis.benchmark import (
|
||||
_score_counts,
|
||||
align,
|
||||
aggregate,
|
||||
score_chapter,
|
||||
)
|
||||
from inkflow.models import (
|
||||
Cast,
|
||||
Character,
|
||||
ChapterAnalysis,
|
||||
Incise,
|
||||
Segment,
|
||||
SegmentType,
|
||||
)
|
||||
|
||||
|
||||
def _seg(text, *, type="narration", speaker="narrateur", glued=False, incises=None):
    """Build a `Segment` for tests.

    Defaults describe an unglued narration line spoken by "narrateur".
    `type` is converted through `SegmentType`; a falsy `incises` (None or
    an empty list) is replaced by a fresh empty list.
    """
    segment_kind = SegmentType(type)
    incise_list = incises if incises else []
    return Segment(
        type=segment_kind,
        text=text,
        speaker=speaker,
        glued_to_prev=glued,
        incises=incise_list,
    )
|
||||
|
||||
|
||||
def _chap(index, segments):
    """Wrap `segments` in a `ChapterAnalysis` titled ``ch<index>``."""
    title = f"ch{index}"
    return ChapterAnalysis(index=index, title=title, segments=segments)
|
||||
|
||||
|
||||
# --- Alignement --------------------------------------------------------------
|
||||
|
||||
def test_alignement_1_1_textes_identiques():
    """Texts differing only by case/whitespace yield two fully matched pairs."""
    reference = _chap(5, [_seg("Bonjour."), _seg("— Salut.", type="dialogue")])
    # The hypothesis differs only by letter case and trailing whitespace.
    hypothesis = _chap(5, [_seg("bonjour. "), _seg("— salut.", type="dialogue")])
    pairs = align(reference, hypothesis)
    assert len(pairs) == 2
    # Neither side of any pair may be missing.
    for ref_seg, hyp_seg in pairs:
        assert ref_seg is not None and hyp_seg is not None
|
||||
|
||||
|
||||
def test_alignement_segment_hypothese_en_trop():
    """An extra hypothesis segment must be reported through a warning."""
    reference = _chap(5, [_seg("A"), _seg("B")])
    hypothesis = _chap(5, [_seg("A"), _seg("X"), _seg("B")])
    counts = _score_counts(reference, hypothesis, None)
    # Segment "X" is not covered by the reference, so a warning is expected.
    has_unmatched_warning = any(
        "sans correspondance" in message for message in counts.warnings
    )
    assert has_unmatched_warning
|
||||
|
||||
|
||||
# --- Attribution du locuteur -------------------------------------------------
|
||||
|
||||
def test_speaker_parfait():
    """Identical reference and hypothesis give perfect speaker accuracy."""
    template = [
        _seg("narration"),
        _seg("— Bonjour.", type="dialogue", speaker="Holden"),
        _seg("— Salut.", type="dialogue", speaker="Kajri"),
    ]
    # Each chapter gets its own deep copies of the template segments.
    reference = _chap(5, [seg.model_copy(deep=True) for seg in template])
    hypothesis = _chap(5, [seg.model_copy(deep=True) for seg in template])
    result = score_chapter(reference, hypothesis)
    assert result.speaker_acc_dialogue == 1.0
    assert result.speaker_acc_all == 1.0
    assert result.errors == []
|
||||
|
||||
|
||||
def test_speaker_avec_erreurs():
    """One wrong speaker out of two dialogues is scored and reported."""
    ref_segments = [
        _seg("— A.", type="dialogue", speaker="Holden"),
        _seg("— B.", type="dialogue", speaker="Kajri"),
    ]
    hyp_segments = [
        _seg("— A.", type="dialogue", speaker="Holden"),
        _seg("— B.", type="dialogue", speaker="Drummer"),  # wrong speaker
    ]
    result = score_chapter(_chap(5, ref_segments), _chap(5, hyp_segments))
    assert result.speaker_acc_dialogue == 0.5
    assert len(result.errors) == 1
    # The single error records what was expected and what was produced.
    mistake = result.errors[0]
    assert mistake.expected == "Kajri"
    assert mistake.got == "Drummer"
    assert result.confusion["Kajri"]["Drummer"] == 1
|
||||
|
||||
|
||||
def test_speaker_normalisation_alias():
    """An alias only counts as a correct speaker when a cast is supplied."""
    reference = _chap(5, [_seg("— A.", type="dialogue", speaker="Camina Drummer")])
    # The hypothesis names the character by an alias.
    hypothesis = _chap(5, [_seg("— A.", type="dialogue", speaker="Drummer")])
    drummer = Character(name="Camina Drummer", aliases=["Drummer"])
    cast = Cast(characters=[drummer])
    # Without a cast the differing names count as an error.
    without_cast = score_chapter(reference, hypothesis, None)
    assert without_cast.speaker_acc_dialogue == 0.0
    # With the cast the alias is resolved, so the attribution is correct.
    with_cast = score_chapter(reference, hypothesis, cast)
    assert with_cast.speaker_acc_dialogue == 1.0
|
||||
|
||||
|
||||
# --- Incises -----------------------------------------------------------------
|
||||
|
||||
def test_incises_exact_vs_overlap():
    """A slightly shifted span fails exact matching but passes overlap matching."""
    reference = _chap(5, [
        _seg("— A dit-il.", type="dialogue", speaker="X",
             incises=[Incise(start=4, end=11)]),
    ])
    # Shifted span that still largely overlaps: overlap matches, exact does not.
    hypothesis = _chap(5, [
        _seg("— A dit-il.", type="dialogue", speaker="X",
             incises=[Incise(start=4, end=10)]),
    ])
    result = score_chapter(reference, hypothesis)
    assert result.incise_exact_f1 < 1.0
    assert result.incise_overlap_f1 == 1.0
|
||||
|
||||
|
||||
def test_incises_faux_positif_baisse_precision():
    """An invented incise lowers precision while recall stays at 1.0."""
    reference = _chap(5, [_seg("— A.", type="dialogue", speaker="X", incises=[])])
    invented = Incise(start=0, end=3)  # has no counterpart in the reference
    hypothesis = _chap(5, [
        _seg("— A.", type="dialogue", speaker="X", incises=[invented]),
    ])
    result = score_chapter(reference, hypothesis)
    assert result.incise_overlap_p < 1.0
    assert result.incise_overlap_r == 1.0  # nothing to recall
|
||||
|
||||
|
||||
def test_incises_manque_baisse_rappel():
    """A missed incise lowers recall while precision stays at 1.0."""
    expected = Incise(start=4, end=11)
    reference = _chap(5, [
        _seg("— A dit-il.", type="dialogue", speaker="X", incises=[expected]),
    ])
    # The hypothesis reports no incise at all for the same text.
    hypothesis = _chap(5, [
        _seg("— A dit-il.", type="dialogue", speaker="X", incises=[]),
    ])
    result = score_chapter(reference, hypothesis)
    assert result.incise_overlap_r < 1.0
    assert result.incise_overlap_p == 1.0
|
||||
|
||||
|
||||
# --- Type / glued ------------------------------------------------------------
|
||||
|
||||
def test_type_et_glued():
    """One type mismatch and one glued mismatch over two segments give 0.5 each."""
    reference = _chap(5, [
        _seg("A", type="narration"),
        _seg("— B", type="dialogue", glued=True),
    ])
    hypothesis = _chap(5, [
        _seg("A", type="dialogue"),  # type differs from the reference
        _seg("— B", type="dialogue", glued=False),  # glued flag differs
    ])
    result = score_chapter(reference, hypothesis)
    assert result.type_acc == 0.5
    assert result.glued_acc == 0.5
|
||||
|
||||
|
||||
# --- Agregat (micro-moyenne) -------------------------------------------------
|
||||
|
||||
def test_aggregate_micro_moyenne():
    """Aggregation micro-averages over dialogues rather than over chapters."""

    def dialogue(text, speaker):
        # Local shorthand: every segment in this test is a dialogue line.
        return _seg(text, type="dialogue", speaker=speaker)

    # Chapter 1: one correct dialogue. Chapter 2: three dialogues, one wrong.
    ref1 = _chap(1, [dialogue("— A.", "X")])
    hyp1 = _chap(1, [dialogue("— A.", "X")])
    ref2 = _chap(2, [
        dialogue("— B.", "X"),
        dialogue("— C.", "Y"),
        dialogue("— D.", "Z"),
    ])
    hyp2 = _chap(2, [
        dialogue("— B.", "X"),
        dialogue("— C.", "Y"),
        dialogue("— D.", "WRONG"),
    ])
    chapters = [(ref1, hyp1), (ref2, hyp2)]
    counts = [_score_counts(ref, hyp, None) for ref, hyp in chapters]
    scores = [score_chapter(ref, hyp) for ref, hyp in chapters]
    total = aggregate(scores, counts)
    # Micro-average: 3 correct / 4 dialogues = 0.75 (not the mean of 1.0 and 0.666).
    assert total.n_dialogue == 4
    assert abs(total.speaker_acc_dialogue - 0.75) < 1e-9
    # The aggregated score is expected to carry index -1.
    assert total.index == -1
|
||||
67
backend/tests/test_gemma_reasoning.py
Normal file
67
backend/tests/test_gemma_reasoning.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Tests purs de `_strip_reasoning` (retrait de la chaine de pensee).
|
||||
|
||||
Sans charger de modele : on verifie que la pensee est retiree et que
|
||||
`_extract_json` recupere bien la reponse FINALE (et non un fragment JSON
|
||||
parasite present dans la pensee).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from inkflow.analysis.gemma import (
|
||||
_extract_json,
|
||||
_has_complete_json,
|
||||
_strip_reasoning,
|
||||
)
|
||||
|
||||
|
||||
def test_has_complete_json_arret_anticipe():
    """`_has_complete_json` is truthy only once a full JSON value is present."""
    # Complete JSON -> truthy (generation can be stopped).
    for finished in ('voici: {"speaker": "Marie"}', '[{"a": 1}]'):
        assert _has_complete_json(finished)
    # Truncated JSON or no JSON at all -> falsy (generation continues).
    for unfinished in ('{"speaker": "Mar', 'texte sans json'):
        assert not _has_complete_json(unfinished)
    # Realistic streaming case: thought closed, json fence still open,
    # but the object itself is complete.
    streamed = _strip_reasoning('<think>...</think>```json\n{"speaker": "Marie"}')
    assert _has_complete_json(streamed)
|
||||
|
||||
|
||||
def test_format_a_canaux_gemma4():
    """A channel-delimited thought block is stripped before JSON extraction."""
    thought = (
        '<|channel>thought\n'
        'Thinking Process: la capitale est Paris. Exemple: {"capitale": "..."}\n'
    )
    answer = '<channel|>```json\n{"capitale": "Paris"}\n```'
    stripped = _strip_reasoning(thought + answer)
    # The thought, including its stray example JSON, must be gone.
    assert "Thinking Process" not in stripped
    assert '"..."' not in stripped
    # The extracted JSON is the final answer.
    assert _extract_json(stripped) == {"capitale": "Paris"}
|
||||
|
||||
|
||||
def test_balises_think_deepseek():
    """A `<think>...</think>` block is removed and the final JSON list is kept."""
    payload = '<think>je reflechis, peut-etre [1,2]</think>\n[{"speaker": "Holden"}]'
    stripped = _strip_reasoning(payload)
    assert "reflechis" not in stripped
    # The list inside the thought must not be mistaken for the answer.
    assert _extract_json(stripped) == [{"speaker": "Holden"}]
|
||||
|
||||
|
||||
def test_sans_raisonnement_inchange():
    """Input without any reasoning markers passes through unchanged."""
    payload = '{"speaker": "Kajri"}'
    untouched = _strip_reasoning(payload)
    assert untouched == payload
    parsed = _extract_json(_strip_reasoning(payload))
    assert parsed == {"speaker": "Kajri"}
|
||||
|
||||
|
||||
def test_pensee_tronquee_sans_fermeture():
    """An unterminated thought must not leak its opening channel marker."""
    # Unclosed thought (token budget exhausted): the channel prefix is
    # dropped and the opening marker is not returned.
    truncated = "<|channel>thought\nje commence a reflechir mais c'est coupe"
    stripped = _strip_reasoning(truncated)
    assert not stripped.startswith("<|channel")
    assert "<channel" not in stripped
|
||||
|
||||
|
||||
def test_dernier_marqueur_gagne():
    """With several thought blocks, only the text after the last one is kept."""
    payload = "<think>a</think>milieu<think>b</think>FINAL"
    stripped = _strip_reasoning(payload)
    assert stripped == "FINAL"
|
||||
Reference in New Issue
Block a user