Ajout d'un outil de benchmark des modèles d'analyse + support des modèles à raisonnement
- Nouvelle commande `inkflow benchmark` : compare la sortie d'analyse aux fichiers de référence (data/<slug>/reference/), met plusieurs modèles en concurrence, table rich + rapport JSON. Métriques : attribution de locuteur, incises, type/glued. Flags --models, --temperature, --reasoning, --stream, --use-cached + suivi par chapitre.
- analysis/benchmark.py : scoring pur (testable) + runner multi-modèles (un MLX à la fois).
- gemma.py : support des modèles à raisonnement (retrait de la pensée, désactivation via enable_thinking hors --reasoning, arrêt anticipé sur JSON complet, plafond + température dédiés anti-boucle), récupération du chat_template manquant (fix Mistral), streaming des tokens (set_token_sink).
- settings.py : gemma_reasoning, gemma_reasoning_max_tokens, gemma_reasoning_temperature.
- Tests : test_benchmark.py (scoring pur), test_gemma_reasoning.py.

Conclusion benchmark : Qwen3.6-27B-8bit non-raisonnant = meilleur modèle d'analyse.
This commit is contained in:
@@ -13,7 +13,7 @@ import typer
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
from .config import ensure_dirs
|
||||
from .config import book_data_dir, ensure_dirs
|
||||
from .epub.parser import load_book, load_chapter_text, parse_epub
|
||||
from .models import Cast
|
||||
from .store import artifacts
|
||||
@@ -95,6 +95,104 @@ def analyze(
|
||||
console.print(f"[green]Casting[/] : {len(chars)} personnages -> cast.json")
|
||||
|
||||
|
||||
@app.command()
def benchmark(
    slug: str,
    models: Optional[str] = typer.Option(
        None, help="Modeles a comparer, separes par des virgules (def: modele courant)."),
    chapter: Optional[int] = typer.Option(
        None, help="Restreindre a un chapitre (def: tous ceux avec reference)."),
    temperature: Optional[float] = typer.Option(
        None, help="Epingle la temperature Gemma (repro). Ex: 0.0."),
    reasoning: bool = typer.Option(
        False, "--reasoning", help="Modeles a raisonnement : retire la pensee + budget tokens accru."),
    use_cached: bool = typer.Option(
        False, "--use-cached", help="Compare les analysis/chNN.json existants (pas de modele)."),
    stream: bool = typer.Option(
        False, "--stream", help="Affiche les tokens generes en temps reel (pensee + reponse)."),
):
    """Met des modeles en concurrence sur les chapitres de reference (vs reference/)."""
    # NOTE: the docstring and the option help strings are user-facing CLI help
    # text (Typer displays them), so they are intentionally left untouched.
    #
    # Flow of this command:
    #   1. resolve the list of model ids (comma-separated option, otherwise the
    #      model configured in the settings);
    #   2. run the benchmark, optionally echoing generated tokens to stdout;
    #   3. print one comparison row per model, then the first speaker
    #      attribution errors of each model;
    #   4. write a timestamped JSON report under <book data dir>/benchmark/.
    import sys
    from datetime import datetime

    from rich.markup import escape

    from .analysis import gemma as _gemma
    from .analysis.benchmark import run_benchmark
    from .settings import get_settings

    # Maximum number of speaker errors listed per model before truncating.
    max_errors_shown = 15

    if models:
        model_ids = [m.strip() for m in models.split(",") if m.strip()]
    else:
        model_ids = [get_settings().gemma_model]
    chapters = [chapter] if chapter is not None else None

    label = "artefacts en cache" if use_cached else f"{len(model_ids)} modele(s)"
    console.print(f"[blue]Benchmark[/] {slug} ({label}) — suivi par chapitre :")

    # Plain, persistent progress lines with a timestamp so the progress of a
    # long run stays visible. console.status (spinner) is deliberately avoided
    # because it leaves no printed trace.
    def _progress(msg: str) -> None:
        console.print(f"[dim]{datetime.now():%H:%M:%S}[/] {msg}")

    # Token streaming: raw writes to stdout (no rich markup) so that both the
    # thinking and the answer scroll by. Requires an unbuffered stdout on the
    # shell side.
    if stream:
        def _sink(piece: str) -> None:
            sys.stdout.write(piece)
            sys.stdout.flush()

        _gemma.set_token_sink(_sink)
    try:
        report = run_benchmark(
            slug, model_ids, chapters=chapters,
            temperature=temperature,
            # None (rather than False) is passed when --reasoning is not given.
            # NOTE(review): presumably lets run_benchmark fall back to the
            # configured default — confirm against run_benchmark.
            reasoning=reasoning or None,
            use_cached=use_cached,
            progress=_progress)
    finally:
        # Always detach the sink, even when the run fails.
        if stream:
            _gemma.set_token_sink(None)
    report.generated_at = datetime.now().isoformat(timespec="seconds")

    # Comparison table: one row per model (micro-averaged aggregate).
    table = Table(title=f"Benchmark {slug} — chapitres {report.chapters}")
    table.add_column("modele")
    for col in ("speaker_dlg", "speaker_all", "incise_f1", "type", "glued", "temps(s)"):
        table.add_column(col, justify="right")
    for ms in report.models:
        # Dynamic text is escaped so that brackets in a model id or an error
        # message are shown literally instead of being parsed as rich markup.
        model_cell = escape(str(ms.model_id))
        if ms.error or ms.aggregate is None:
            reason = escape(str(ms.error or 'aucun chapitre'))
            table.add_row(model_cell, f"[red]{reason}[/]")
            continue
        a = ms.aggregate
        table.add_row(
            model_cell,
            f"{a.speaker_acc_dialogue:.1%}",
            f"{a.speaker_acc_all:.1%}",
            f"{a.incise_overlap_f1:.2f}",
            f"{a.type_acc:.1%}",
            f"{a.glued_acc:.1%}",
            f"{ms.elapsed_s:.0f}",
        )
    console.print(table)

    # Speaker attribution errors per model: the first `max_errors_shown`, in
    # chapter/segment iteration order (no ranking is applied here).
    for ms in report.models:
        errs = [e for cs in ms.per_chapter for e in cs.errors]
        if not errs:
            continue
        console.print(
            f"\n[bold]{escape(str(ms.model_id))}[/] — {len(errs)} erreur(s) de locuteur:")
        for e in errs[:max_errors_shown]:
            # Speaker names and book excerpts may contain square brackets;
            # escape them so rich prints them verbatim.
            expected = escape(str(e.expected))
            got = escape(str(e.got))
            excerpt = escape(repr(e.text_excerpt))
            console.print(
                f" ch·seg{e.index:>3} attendu=[green]{expected}[/] "
                f"obtenu=[red]{got}[/] — {excerpt}")
        if len(errs) > max_errors_shown:
            console.print(f" [dim]… +{len(errs) - max_errors_shown} autres[/]")

    # Timestamped JSON report; ':' and '-' are stripped from the ISO timestamp
    # to build the file name.
    out_dir = book_data_dir(slug) / "benchmark"
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = report.generated_at.replace(":", "").replace("-", "")
    out_path = out_dir / f"{stamp}.json"
    out_path.write_text(report.model_dump_json(indent=2), encoding="utf-8")
    console.print(f"\n[green]Rapport[/] -> {out_path}")
|
||||
|
||||
|
||||
@app.command()
|
||||
def pronounce(
|
||||
slug: str,
|
||||
|
||||
Reference in New Issue
Block a user