"""Pure tests for benchmark scoring (no Gemma, no disk access).

Builds synthetic ``ChapterAnalysis`` objects and checks the metrics:
alignment, speaker attribution (with alias normalisation), incises
(exact vs. overlap), type/glued, and the micro-average over several chapters.
"""

from __future__ import annotations

from inkflow.analysis.benchmark import (
    _score_counts,
    align,
    aggregate,
    score_chapter,
)
from inkflow.models import (
    Cast,
    Character,
    ChapterAnalysis,
    Incise,
    Segment,
    SegmentType,
)


def _seg(
    text: str,
    *,
    type: str = "narration",  # noqa: A002 - keyword name used by every caller
    speaker: str = "narrateur",
    glued: bool = False,
    incises: list[Incise] | None = None,
) -> Segment:
    """Build a ``Segment`` with test-friendly defaults.

    ``type`` is passed through ``SegmentType(...)``; ``glued`` maps to the
    ``glued_to_prev`` field; a missing ``incises`` becomes a fresh empty list.
    """
    return Segment(
        type=SegmentType(type),
        text=text,
        speaker=speaker,
        glued_to_prev=glued,
        incises=incises or [],
    )


def _chap(index: int, segments: list[Segment]) -> ChapterAnalysis:
    """Build a ``ChapterAnalysis`` titled ``ch<index>`` holding ``segments``."""
    return ChapterAnalysis(index=index, title=f"ch{index}", segments=segments)


# --- Alignment ---------------------------------------------------------------

def test_alignement_1_1_textes_identiques() -> None:
    """Texts differing only by case/whitespace are expected to pair 1:1."""
    ref = _chap(5, [_seg("Bonjour."), _seg("— Salut.", type="dialogue")])
    # Hypothesis differs from the reference by whitespace and letter case only.
    hyp = _chap(5, [_seg("bonjour. \n"), _seg("— salut.", type="dialogue")])
    pairs = align(ref, hyp)
    assert len(pairs) == 2
    assert all(r is not None and h is not None for r, h in pairs)


def test_alignement_segment_hypothese_en_trop() -> None:
    """An extra hypothesis segment must surface as a warning."""
    ref = _chap(5, [_seg("A"), _seg("B")])
    hyp = _chap(5, [_seg("A"), _seg("X"), _seg("B")])
    cnt = _score_counts(ref, hyp, None)
    # Segment "X" is not covered by the reference -> warning expected.
    assert any("sans correspondance" in w for w in cnt.warnings)


# --- Speaker attribution -----------------------------------------------------

def test_speaker_parfait() -> None:
    """Identical reference and hypothesis give perfect speaker accuracy."""
    segs = [
        _seg("narration"),
        _seg("— Bonjour.", type="dialogue", speaker="Holden"),
        _seg("— Salut.", type="dialogue", speaker="Kajri"),
    ]
    # Deep copies so reference and hypothesis do not share segment objects.
    ref = _chap(5, [s.model_copy(deep=True) for s in segs])
    hyp = _chap(5, [s.model_copy(deep=True) for s in segs])
    score = score_chapter(ref, hyp)
    assert score.speaker_acc_dialogue == 1.0
    assert score.speaker_acc_all == 1.0
    assert score.errors == []


def test_speaker_avec_erreurs() -> None:
    """One wrong speaker out of two dialogues is reported and counted."""
    ref = _chap(5, [
        _seg("— A.", type="dialogue", speaker="Holden"),
        _seg("— B.", type="dialogue", speaker="Kajri"),
    ])
    hyp = _chap(5, [
        _seg("— A.", type="dialogue", speaker="Holden"),
        _seg("— B.", type="dialogue", speaker="Drummer"),  # wrong speaker
    ])
    score = score_chapter(ref, hyp)
    assert score.speaker_acc_dialogue == 0.5
    assert len(score.errors) == 1
    assert score.errors[0].expected == "Kajri"
    assert score.errors[0].got == "Drummer"
    assert score.confusion["Kajri"]["Drummer"] == 1


def test_speaker_normalisation_alias() -> None:
    """An alias only counts as correct when a cast is supplied."""
    ref = _chap(5, [_seg("— A.", type="dialogue", speaker="Camina Drummer")])
    hyp = _chap(5, [_seg("— A.", type="dialogue", speaker="Drummer")])  # alias
    cast = Cast(characters=[Character(name="Camina Drummer", aliases=["Drummer"])])
    # Without a cast: counted as an error (the names differ).
    assert score_chapter(ref, hyp, None).speaker_acc_dialogue == 0.0
    # With a cast: the alias is resolved -> correct.
    assert score_chapter(ref, hyp, cast).speaker_acc_dialogue == 1.0


# --- Incises -----------------------------------------------------------------

def test_incises_exact_vs_overlap() -> None:
    """A slightly shifted span passes the overlap metric but not the exact one."""
    ref = _chap(5, [_seg("— A dit-il.", type="dialogue", speaker="X",
                         incises=[Incise(start=4, end=11)])])
    # Span is shifted but largely overlapping -> overlap OK, exact not.
    hyp = _chap(5, [_seg("— A dit-il.", type="dialogue", speaker="X",
                         incises=[Incise(start=4, end=10)])])
    score = score_chapter(ref, hyp)
    assert score.incise_exact_f1 < 1.0
    assert score.incise_overlap_f1 == 1.0


def test_incises_faux_positif_baisse_precision() -> None:
    """An invented incise lowers precision while recall stays at 1.0."""
    ref = _chap(5, [_seg("— A.", type="dialogue", speaker="X", incises=[])])
    hyp = _chap(5, [_seg("— A.", type="dialogue", speaker="X",
                         incises=[Incise(start=0, end=3)])])  # invented incise
    score = score_chapter(ref, hyp)
    assert score.incise_overlap_p < 1.0
    assert score.incise_overlap_r == 1.0  # nothing to recall


def test_incises_manque_baisse_rappel() -> None:
    """A missed incise lowers recall while precision stays at 1.0."""
    ref = _chap(5, [_seg("— A dit-il.", type="dialogue", speaker="X",
                         incises=[Incise(start=4, end=11)])])
    hyp = _chap(5, [_seg("— A dit-il.", type="dialogue", speaker="X", incises=[])])
    score = score_chapter(ref, hyp)
    assert score.incise_overlap_r < 1.0
    assert score.incise_overlap_p == 1.0


# --- Type / glued ------------------------------------------------------------

def test_type_et_glued() -> None:
    """One type mismatch and one glued mismatch out of two segments each."""
    ref = _chap(5, [_seg("A", type="narration"),
                    _seg("— B", type="dialogue", glued=True)])
    hyp = _chap(5, [_seg("A", type="dialogue"),
                    _seg("— B", type="dialogue", glued=False)])
    score = score_chapter(ref, hyp)
    assert score.type_acc == 0.5
    assert score.glued_acc == 0.5


# --- Aggregate (micro-average) -----------------------------------------------

def test_aggregate_micro_moyenne() -> None:
    """The aggregate pools counts across chapters instead of averaging scores."""
    # ch1: 1 correct dialogue; ch2: 3 dialogues, one of them wrong.
    ref1 = _chap(1, [_seg("— A.", type="dialogue", speaker="X")])
    hyp1 = _chap(1, [_seg("— A.", type="dialogue", speaker="X")])
    ref2 = _chap(2, [
        _seg("— B.", type="dialogue", speaker="X"),
        _seg("— C.", type="dialogue", speaker="Y"),
        _seg("— D.", type="dialogue", speaker="Z"),
    ])
    hyp2 = _chap(2, [
        _seg("— B.", type="dialogue", speaker="X"),
        _seg("— C.", type="dialogue", speaker="Y"),
        _seg("— D.", type="dialogue", speaker="WRONG"),
    ])
    c1, c2 = _score_counts(ref1, hyp1, None), _score_counts(ref2, hyp2, None)
    s1, s2 = score_chapter(ref1, hyp1), score_chapter(ref2, hyp2)
    agg = aggregate([s1, s2], [c1, c2])
    # Micro: 3 correct / 4 dialogues = 0.75 (not the mean of 1.0 and 0.666).
    assert agg.n_dialogue == 4
    assert abs(agg.speaker_acc_dialogue - 0.75) < 1e-9
    assert agg.index == -1