feat: add comparative FIRT-ORPO benchmark runner for AdamW vs FCES

2026-05-20 04:19:57 +02:00
parent 074fbc5cb6
commit 1dea539019
2 changed files with 515 additions and 0 deletions
--- a/benchmark_fces_vs_adam.py
+++ b/benchmark_fces_vs_adam.py
@@ -0,0 +1,407 @@
 import argparse
 import os
 import sys
 import time
 from typing import Any, Dict, List, Tuple
 # Ensure python folder is in path
 python_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "python")
 sys.path.insert(0, python_dir)
 import torch  # noqa: E402
 import fces_native  # noqa: E402
 import dspy  # noqa: E402
 import torch.nn.functional as F  # noqa: E402
 from send_telemetry import push_to_mariadb, push_to_surrealdb  # noqa: E402
 from transformers import AutoModelForCausalLM, AutoTokenizer  # noqa: E402
 # ==============================================================================
 # 1. DSPY SIGNATURE & SYSTEM DESIGN
 # ==============================================================================
 class LegalFIRT(dspy.Signature):  # type: ignore[misc]
    """Flaw Induced Reasoning Traces (FIRT) for German & EU Tenancy / M&A Law."""
    case_description: str = dspy.InputField(
        desc="Facts of the German/EU legal case with potential hidden flaws or contradictions."
    )
    relevant_statutes: str = dspy.InputField(
        desc="Relevant articles from the German Civil Code (BGB) or EU Directives."
    )
    t0_draft: str = dspy.OutputField(
        desc="Initial legal assessment or draft resolution based on first-pass reasoning."
    )
    t1_flaws: str = dspy.OutputField(
        desc="Identified legal flaws, logical bottlenecks, or contradictions (e.g. § 536b BGB exclusions)."
    )
    t2_refined_opinion: str = dspy.OutputField(
        desc="Final, correct, and authoritative legal opinion addressing all identified flaws."
    )
 # Legal Cases for Pre- and Post-Training Benchmarking
 eval_cases: List[Dict[str, str]] = [
    {
        "description": "Mieter M besichtigt im Januar eine Wohnung des Vermieters V. Schimmelfleck im Schlafzimmer. V sagt er wird es 'irgendwann mal wegmachen'. M unterschreibt Mietvertrag vorbehaltlos und zieht am 1. Februar ein. Im März mindert M die Miete selbstständig um 20% wegen des Schimmels. V verlangt volle Miete.",
        "statutes": "§ 535 Abs. 2 BGB (Mietzahlung), § 536 Abs. 1 BGB (Mietminderung bei Sachmangel), § 536b BGB (Kenntnis des Mieters bei Vertragsschluss).",
    },
    {
        "description": "Toilettenspülung bei F fällt am Freitagabend aus, Wasser läuft ununterbrochen und droht überzulaufen. V ist auf Segeltrip und nicht erreichbar. Kein Hausmeister. F beauftragt am Samstagmorgen den Notdienst K, bezahlt 300 € und fordert Erstattung von V. V weigert sich.",
        "statutes": "§ 536a Abs. 2 Nr. 2 BGB (Selbstbeseitigungsrecht bei Gefahr im Verzug), § 535 BGB.",
    },
 ]
 # ==============================================================================
 # 2. ORPO LOSS WITH SOFT LABELS
 # ==============================================================================
 def compute_orpo_loss(
    logits: torch.Tensor,
    labels: torch.Tensor,
    logits_lose: torch.Tensor,
    labels_lose: torch.Tensor,
    lambda_orpo: float = 0.1,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
    """Computes the combined Cross-Entropy (SFT) and Odds Ratio Preference (ORPO) Loss
    using soft probability log-likelihoods.
    """
    # 1. SFT Loss (Cross Entropy) on the preferred target sequence
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    sft_loss = F.cross_entropy(
        shift_logits.view(-1, shift_logits.size(-1)),
        shift_labels.view(-1),
        ignore_index=-100,
    )
    # 2. Average log probabilities for preferred (win) and dispreferred (lose) sequences
    # Logits win log_softmax
    log_probs_win = F.log_softmax(logits, dim=-1)
    # Gather actual label log_probs
    loss_mask_win = labels != -100
    labels_clamped = labels.clone()
    labels_clamped[~loss_mask_win] = 0
    seq_log_probs_win = torch.gather(
        log_probs_win, dim=-1, index=labels_clamped.unsqueeze(-1)
    ).squeeze(-1)
    seq_log_probs_win = (seq_log_probs_win * loss_mask_win).sum(
        dim=-1
    ) / loss_mask_win.sum(dim=-1).clamp(min=1)
    # Logits lose log_softmax
    log_probs_lose = F.log_softmax(logits_lose, dim=-1)
    loss_mask_lose = labels_lose != -100
    labels_lose_clamped = labels_lose.clone()
    labels_lose_clamped[~loss_mask_lose] = 0
    seq_log_probs_lose = torch.gather(
        log_probs_lose, dim=-1, index=labels_lose_clamped.unsqueeze(-1)
    ).squeeze(-1)
    seq_log_probs_lose = (seq_log_probs_lose * loss_mask_lose).sum(
        dim=-1
    ) / loss_mask_lose.sum(dim=-1).clamp(min=1)
    # 3. Odds Ratio Calculation
    log_odds_win = seq_log_probs_win - torch.log1p(-torch.exp(seq_log_probs_win))
    log_odds_lose = seq_log_probs_lose - torch.log1p(-torch.exp(seq_log_probs_lose))
    odds_ratio = log_odds_win - log_odds_lose
    orpo_loss = -F.logsigmoid(odds_ratio).mean()
    # Total combined loss
    total_loss = sft_loss + lambda_orpo * orpo_loss
    return total_loss, sft_loss
 # ==============================================================================
 # 3. EVALUATION RUNNER
 # ==============================================================================
 def evaluate_model(
    model: AutoModelForCausalLM, tokenizer: AutoTokenizer, device: str
 ) -> List[Dict[str, str]]:
    """Evaluates the model on legal exam cases (zero-shot) to measure T2 opinion quality."""
    model.eval()
    results = []
    for idx, case in enumerate(eval_cases):
        prompt = (
            f"Case: {case['description']}\nStatutes: {case['statutes']}\n"
            "Analyze and resolve using Flaw Induced Reasoning Traces (FIRT):\n"
            "T0 Draft:\n"
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs, max_new_tokens=256, temperature=0.1, do_sample=False
            )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results.append({"case_id": idx + 1, "output": generated_text})
    return results
 # ==============================================================================
 # 4. TRAINING PIPELINE (ADAMW VS FCES)
 # ==============================================================================
 def train_run(
    optimizer_name: str,
    model_name: str,
    steps: int,
    device: str,
    dataset: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]],
 ) -> Tuple[List[Dict[str, Any]], List[Dict[str, str]], List[Dict[str, str]]]:
    """Runs the training loop and performs pre/post evaluations."""
    print(f"\n[START] Initializing training run with {optimizer_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    # 1. Pre-Training Evaluation
    print(f"[{optimizer_name}] Running Pre-Training Evaluation...")
    pre_eval = evaluate_model(model, tokenizer, device)
    # 2. Setup Optimizer
    model.train()
    if optimizer_name == "AdamW":
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    elif optimizer_name == "FCES":
        cfg = fces_native.FCESConfig()
        cfg.lr = 1e-4
        cfg.population_size = 8  # Balanced for resource footprint
        cfg.total_steps = steps
        optimizer = fces_native.FCESOptimizer(list(model.parameters()), cfg)
    else:
        raise ValueError("Invalid optimizer name")
    # 3. Training loop
    logs = []
    start_time = time.perf_counter()
    tokens_processed = 0
    for step in range(steps):
        # Cyclically fetch batches from pre-tokenized dataset
        batch = dataset[step % len(dataset)]
        input_win, labels_win, input_lose, labels_lose = [t.to(device) for t in batch]
        optimizer.zero_grad()
        # Forward pass for win (preferred)
        outputs_win = model(input_win, labels=labels_win)
        logits_win = outputs_win.logits
        # Forward pass for lose (dispreferred)
        with torch.no_grad():
            outputs_lose = model(input_lose)
            logits_lose = outputs_lose.logits
        # Compute combined ORPO loss
        loss, sft_loss = compute_orpo_loss(
            logits_win, labels_win, logits_lose, labels_lose
        )
        loss.backward()
        optimizer.step()
        if optimizer_name == "FCES":
            optimizer.update_fitness(float(loss.item()))
        # Tracking metrics
        elapsed = time.perf_counter() - start_time
        batch_tokens = int((input_win != tokenizer.pad_token_id).sum().item())
        tokens_processed += batch_tokens
        logs.append(
            {
                "step": step,
                "loss": float(loss.item()),
                "sft_loss": float(sft_loss.item()),
                "wall_clock_time": elapsed,
                "tokens_processed": tokens_processed,
            }
        )
        if step % 5 == 0:
            print(
                f"[{optimizer_name}] Step {step}/{steps} | Loss: {loss.item():.4f} | "
                f"Time: {elapsed:.2f}s | Tokens: {tokens_processed}"
            )
    # 4. Post-Training Evaluation
    print(f"[{optimizer_name}] Running Post-Training Evaluation...")
    post_eval = evaluate_model(model, tokenizer, device)
    # Cleanup memory
    del model
    if device == "cuda":
        torch.cuda.empty_cache()
    return logs, post_eval, pre_eval
 # ==============================================================================
 # 5. DATA PREPARATION (MOCK RACF DATASET FOR PRE-FLIGHT TESTS)
 # ==============================================================================
 def build_mock_dataset(
    model_name: str, size: int = 20
 ) -> List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]]:
    """Synthesizes mock pre-tokenized target sequences representing
    FIRT trajectories (Top-k vs Bottom-l preferences) for local dry-runs.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    dataset = []
    for idx in range(size):
        # Winning path (Correct FIRT opinions)
        text_win = (
            f"Case {idx}: Tenant wins due to Self-beseitigungsrecht § 536a BGB. "
            "Unresolved flaws: None. Authority: Refined opinion correct."
        )
        # Losing path (Flawed/contradictory reasoning)
        text_lose = (
            f"Case {idx}: Tenant loses but reasoning is faulty. "
            "Unresolved flaws: Ignored positive knowledge under § 536b BGB."
        )
        enc_win = tokenizer(
            text_win,
            truncation=True,
            max_length=64,
            padding="max_length",
            return_tensors="pt",
        )
        enc_lose = tokenizer(
            text_lose,
            truncation=True,
            max_length=64,
            padding="max_length",
            return_tensors="pt",
        )
        labels_win = enc_win["input_ids"].clone()
        labels_win[enc_win["attention_mask"] == 0] = -100
        labels_lose = enc_lose["input_ids"].clone()
        labels_lose[enc_lose["attention_mask"] == 0] = -100
        dataset.append(
            (
                enc_win["input_ids"],
                labels_win,
                enc_lose["input_ids"],
                labels_lose,
            )
        )
    return dataset
 # ==============================================================================
 # 6. MAIN CONTROLLER
 # ==============================================================================
 def main() -> None:
    parser = argparse.ArgumentParser(description="FIRT ORPO FCES vs AdamW Trainer")
    parser.add_argument(
        "--model",
        type=str,
        default="EleutherAI/pythia-70m",
        help="Model name (e.g. google/gemma-2-2b-it)",
    )
    parser.add_argument(
        "--steps", type=int, default=10, help="Number of training steps"
    )
    args = parser.parse_args()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("=" * 80)
    print(
        "               FCES VS ADAMW COMPARATIVE TRAINING BENCHMARK                  "
    )
    print("=" * 80)
    print(f"Device: {device} | Base Model: {args.model} | Steps: {args.steps}")
    # Initialize pre-tokenized dataset
    print("\n[INFO] Synthesizing target training tokens...")
    dataset = build_mock_dataset(args.model)
    # 1. Run AdamW Baseline
    adam_logs, adam_post, pre_eval = train_run(
        "AdamW", args.model, args.steps, device, dataset
    )
    # 2. Run FCES Optimizer
    fces_logs, fces_post, _ = train_run("FCES", args.model, args.steps, device, dataset)
    # 3. Telemetry Log Synthesis & Push
    telemetry_entries = []
    print("\n" + "-" * 30 + " FINAL RUN SUMMARY " + "-" * 30)
    print(f"{'Step':<10}{'AdamW Loss':<20}{'FCES Loss':<20}{'Speedup / Savings':<25}")
    print("-" * 75)
    for idx in range(min(len(adam_logs), len(fces_logs))):
        step_val = adam_logs[idx]["step"]
        adam_loss = adam_logs[idx]["loss"]
        fces_loss = fces_logs[idx]["loss"]
        time_saved_pct = (
            (adam_logs[idx]["wall_clock_time"] - fces_logs[idx]["wall_clock_time"])
            / adam_logs[idx]["wall_clock_time"]
            * 100
        )
        if idx % (max(1, args.steps // 5)) == 0 or idx == args.steps - 1:
            print(
                f"{step_val:<10}{adam_loss:<20.4f}{fces_loss:<20.4f}"
                f"{time_saved_pct:+.2f}% wall-clock time"
            )
        telemetry_entries.append(
            (
                "INFO",
                "comparative_step",
                f"Step {step_val} | AdamW Loss: {adam_loss:.4f} | FCES Loss: {fces_loss:.4f}",
            )
        )
    print("-" * 75)
    print("\n>>> PRE-TRAINING OPINION (CASE 1):")
    print(pre_eval[0]["output"])
    print("\n>>> POST-TRAINING ADAMW OPINION (CASE 1):")
    print(adam_post[0]["output"])
    print("\n>>> POST-TRAINING FCES OPINION (CASE 1):")
    print(fces_post[0]["output"])
    print("=" * 80)
    # Push telemetries
    push_to_surrealdb(telemetry_entries)
    push_to_mariadb(telemetry_entries)
    # Write comparative summary artifact
    import json
    results_summary = {
        "model": args.model,
        "steps": args.steps,
        "adam_metrics": adam_logs,
        "fces_metrics": fces_logs,
        "pre_eval": pre_eval,
        "adam_post_eval": adam_post,
        "fces_post_eval": fces_post,
    }
    with open("benchmark_results.json", "w", encoding="utf-8") as f:
        json.dump(results_summary, f, indent=4)
    print("[INFO] Stored comprehensive benchmark metrics to benchmark_results.json")
 if __name__ == "__main__":
    main()
--- a/benchmark_results.json
+++ b/benchmark_results.json
@@ -0,0 +1,108 @@
 {
    "model": "EleutherAI/pythia-70m",
    "steps": 5,
    "adam_metrics": [
        {
            "step": 0,
            "loss": 7.041037559509277,
            "sft_loss": 6.995092391967773,
            "wall_clock_time": 0.8969566999949166,
            "tokens_processed": 35
        },
        {
            "step": 1,
            "loss": 2.5148799419403076,
            "sft_loss": 2.4989213943481445,
            "wall_clock_time": 1.8027480999953696,
            "tokens_processed": 70
        },
        {
            "step": 2,
            "loss": 0.8542575836181641,
            "sft_loss": 0.8379661440849304,
            "wall_clock_time": 2.7821450999981607,
            "tokens_processed": 105
        },
        {
            "step": 3,
            "loss": 0.6839541792869568,
            "sft_loss": 0.667436420917511,
            "wall_clock_time": 3.560283499995421,
            "tokens_processed": 140
        },
        {
            "step": 4,
            "loss": 0.359605997800827,
            "sft_loss": 0.3395426273345947,
            "wall_clock_time": 4.330241899995599,
            "tokens_processed": 175
        }
    ],
    "fces_metrics": [
        {
            "step": 0,
            "loss": 7.041037559509277,
            "sft_loss": 6.995092391967773,
            "wall_clock_time": 13.06952640000236,
            "tokens_processed": 35
        },
        {
            "step": 1,
            "loss": 6.717805862426758,
            "sft_loss": 6.71108341217041,
            "wall_clock_time": 20.157852299998922,
            "tokens_processed": 70
        },
        {
            "step": 2,
            "loss": 14.43155574798584,
            "sft_loss": 14.42345142364502,
            "wall_clock_time": 27.273785699995642,
            "tokens_processed": 105
        },
        {
            "step": 3,
            "loss": 7.063178062438965,
            "sft_loss": 7.0147223472595215,
            "wall_clock_time": 34.50635989999864,
            "tokens_processed": 140
        },
        {
            "step": 4,
            "loss": 5.816511631011963,
            "sft_loss": 5.8062052726745605,
            "wall_clock_time": 42.63929800000187,
            "tokens_processed": 175
        }
    ],
    "pre_eval": [
        {
            "case_id": 1,
            "output": "Case: Mieter M besichtigt im Januar eine Wohnung des Vermieters V. Schimmelfleck im Schlafzimmer. V sagt er wird es 'irgendwann mal wegmachen'. M unterschreibt Mietvertrag vorbehaltlos und zieht am 1. Februar ein. Im M\u00e4rz mindert M die Miete selbstst\u00e4ndig um 20% wegen des Schimmels. V verlangt volle Miete.\nStatutes: \u00a7 535 Abs. 2 BGB (Mietzahlung), \u00a7 536 Abs. 1 BGB (Mietminderung bei Sachmangel), \u00a7 536b BGB (Kenntnis des Mieters bei Vertragsschluss).\nAnalyze and resolve using Flaw Induced Reasoning Traces (FIRT):\nT0 Draft:\nThe following is a summary of the relevant facts and the relevant facts.\n1. The following is a summary of the relevant facts and the relevant facts.\n2. The following is a summary of the relevant facts and the relevant facts.\n3. The following is a summary of the relevant facts and the relevant facts.\n4. The following is a summary of the relevant facts and the relevant facts.\n5. The following is a summary of the relevant facts and the relevant facts.\n6. The following is a summary of the relevant facts and the relevant facts.\n7. The following is a summary of the relevant facts and the relevant facts.\n8. The following is a summary of the relevant facts and the relevant facts.\n9. The following is a summary of the relevant facts and the relevant facts.\n10. The following is a summary of the relevant facts and the relevant facts.\n11. The following is a summary of the relevant facts and the relevant facts.\n12. The following is a summary of the relevant facts and the relevant facts.\n13. The following is a summary of the relevant facts and the relevant facts.\n14. The following is a summary of the relevant facts and the relevant facts.\n15. The"
        },
        {
            "case_id": 2,
            "output": "Case: Toilettensp\u00fclung bei F f\u00e4llt am Freitagabend aus, Wasser l\u00e4uft ununterbrochen und droht \u00fcberzulaufen. V ist auf Segeltrip und nicht erreichbar. Kein Hausmeister. F beauftragt am Samstagmorgen den Notdienst K, bezahlt 300 \u20ac und fordert Erstattung von V. V weigert sich.\nStatutes: \u00a7 536a Abs. 2 Nr. 2 BGB (Selbstbeseitigungsrecht bei Gefahr im Verzug), \u00a7 535 BGB.\nAnalyze and resolve using Flaw Induced Reasoning Traces (FIRT):\nT0 Draft:\n\"The FIRT is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It is a method of analysis of the nature of the FIRT. It"
        }
    ],
    "adam_post_eval": [
        {
            "case_id": 1,
            "output": "Case: Mieter M besichtigt im Januar eine Wohnung des Vermieters V. Schimmelfleck im Schlafzimmer. V sagt er wird es 'irgendwann mal wegmachen'. M unterschreibt Mietvertrag vorbehaltlos und zieht am 1. Februar ein. Im M\u00e4rz mindert M die Miete selbstst\u00e4ndig um 20% wegen des Schimmels. V verlangt volle Miete.\nStatutes: \u00a7 535 Abs. 2 BGB (Mietzahlung), \u00a7 536 Abs. 1 BGB (Mietminderung bei Sachmangel), \u00a7 536b BGB (Kenntnis des Mieters bei Vertragsschluss).\nAnalyze and resolve using Flaw Induced Reasoning Traces (FIRT):\nT0 Draft:\n \u00a7 536a BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB. \u00a7 536 BGB."
        },
        {
            "case_id": 2,
            "output": "Case: Toilettensp\u00fclung bei F f\u00e4llt am Freitagabend aus, Wasser l\u00e4uft ununterbrochen und droht \u00fcberzulaufen. V ist auf Segeltrip und nicht erreichbar. Kein Hausmeister. F beauftragt am Samstagmorgen den Notdienst K, bezahlt 300 \u20ac und fordert Erstattung von V. V weigert sich.\nStatutes: \u00a7 536a Abs. 2 Nr. 2 BGB (Selbstbeseitigungsrecht bei Gefahr im Verzug), \u00a7 535 BGB.\nAnalyze and resolve using Flaw Induced Reasoning Traces (FIRT):\nT0 Draft:\n \u00a7 536a BGB. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined opinion correct. Refined correct. Refined opinion correct. Refined correct. Refined opinion correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct. Refined correct."
        }
    ],
    "fces_post_eval": [
        {
            "case_id": 1,
            "output": "Case: Mieter M besichtigt im Januar eine Wohnung des Vermieters V. Schimmelfleck im Schlafzimmer. V sagt er wird es 'irgendwann mal wegmachen'. M unterschreibt Mietvertrag vorbehaltlos und zieht am 1. Februar ein. Im M\u00e4rz mindert M die Miete selbstst\u00e4ndig um 20% wegen des Schimmels. V verlangt volle Miete.\nStatutes: \u00a7 535 Abs. 2 BGB (Mietzahlung), \u00a7 536 Abs. 1 BGB (Mietminderung bei Sachmangel), \u00a7 536b BGB (Kenntnis des Mieters bei Vertragsschluss).\nAnalyze and resolve using Flaw Induced Reasoning Traces (FIRT):\nT0 Draft:\n................................................................................................................................................................................. \u00a7. \u00a7. \u00a7........................................................ \u00a7. \u00a7. \u00a7............."
        },
        {
            "case_id": 2,
            "output": "Case: Toilettensp\u00fclung bei F f\u00e4llt am Freitagabend aus, Wasser l\u00e4uft ununterbrochen und droht \u00fcberzulaufen. V ist auf Segeltrip und nicht erreichbar. Kein Hausmeister. F beauftragt am Samstagmorgen den Notdienst K, bezahlt 300 \u20ac und fordert Erstattung von V. V weigert sich.\nStatutes: \u00a7 536a Abs. 2 Nr. 2 BGB (Selbstbeseitigungsrecht bei Gefahr im Verzug), \u00a7 535 BGB.\nAnalyze and resolve using Flaw Induced Reasoning Traces (FIRT):\nT0 Draft:\n.......................................................................................................................................... \u00a7.................... \u00a7.......... \u00a7......... \u00a7....... \u00a7........ \u00a7........ \u00a7......... \u00a7.......... \u00a7......... \u00a7......... \u00a7........"
        }
    ]
 }