FCES-native/kaggle_fces_prep.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# FCES-native Kaggle Preparation Pipeline\n",
    "\n",
    "Führt alle 5 Vorbereitungsschritte durch:\n",
    "1. Hyperparameter Sweep\n",
    "2. Adapter Library aufbauen\n",
    "3. MoE Gate vortrainieren\n",
    "4. FCES Warm-Start Checkpoint\n",
    "5. Klausuren-Dataset tokenisieren\n",
    "\n",
    "Alle Artefakte landen im Ausgabeverzeichnis der Pipeline (Cell 3 liest sie aus `/kaggle/working/FCES-native/kaggle_output/`) und können anschließend als Kaggle-Dataset veröffentlicht werden."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── Cell 1: Environment Setup ──────────────────────────────────────────────\n",
    "# Prepares the Kaggle session: clones the repo, installs the Python deps and\n",
    "# tries to build the C++ extension. Safe to re-run within the same session.\n",
    "import os, subprocess, sys\n",
    "\n",
    "REPO_URL = 'https://git.zky.de/sven/FCES-native'\n",
    "REPO_DIR = '/kaggle/working/FCES-native'\n",
    "\n",
    "# Clone repo. `git clone` refuses to write into an existing non-empty directory,\n",
    "# so an existing checkout is reused instead of failing when the cell is re-run.\n",
    "if os.path.isdir(os.path.join(REPO_DIR, '.git')):\n",
    "    print(f'[OK] repo already present at {REPO_DIR} — skipping clone')\n",
    "else:\n",
    "    subprocess.run(['git', 'clone', '--depth=1', REPO_URL, REPO_DIR], check=True)\n",
    "\n",
    "# Install Python deps into the kernel's own interpreter (sys.executable)\n",
    "subprocess.run([sys.executable, '-m', 'pip', 'install', '-q',\n",
    "    'transformers', 'dspy-ai', 'pydantic', 'structlog'],\n",
    "    check=True)\n",
    "\n",
    "# Compile C++ extension. Deliberately no check=True: a failed build only prints\n",
    "# a warning plus the tail of stderr, and the notebook keeps going.\n",
    "result = subprocess.run(\n",
    "    [sys.executable, 'python/setup.py', 'build_ext', '--inplace'],\n",
    "    cwd=REPO_DIR,\n",
    "    capture_output=True, text=True\n",
    ")\n",
    "if result.returncode != 0:\n",
    "    print('[WARN] fces_native compile failed — will use AdamW fallback for Step 4')\n",
    "    print(result.stderr[-500:])\n",
    "else:\n",
    "    print('[OK] fces_native compiled successfully')\n",
    "\n",
    "print('Setup complete!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── Cell 2: Run Full Pipeline ──────────────────────────────────────────────\n",
    "# Launches kaggle_prep.py from the repo checkout with the settings below.\n",
    "# Suggested settings per Kaggle GPU quota:\n",
    "#   T4  (15h/week):  pythia-70m,  sweep=10, adapters=30, warm=20\n",
    "#   P100 (30h/week): pythia-160m, sweep=15, adapters=50, warm=30\n",
    "#   L4   (30h/week): pythia-410m, sweep=20, adapters=80, warm=50\n",
    "\n",
    "MODEL = 'EleutherAI/pythia-160m'   # ← increase for better adapters\n",
    "SWEEP_STEPS = 15\n",
    "ADAPTER_STEPS = 50\n",
    "WARM_STEPS = 30\n",
    "SKIP = ''  # e.g. '1,3' to skip steps\n",
    "\n",
    "# Assemble the command line first so it can be inspected before launching.\n",
    "prep_cmd = [sys.executable, 'kaggle_prep.py']\n",
    "prep_cmd += ['--model', MODEL]\n",
    "prep_cmd += ['--sweep-steps', str(SWEEP_STEPS)]\n",
    "prep_cmd += ['--adapter-steps', str(ADAPTER_STEPS)]\n",
    "prep_cmd += ['--warm-steps', str(WARM_STEPS)]\n",
    "prep_cmd += ['--skip-steps', SKIP]\n",
    "\n",
    "# check=True: a non-zero exit status of the pipeline raises CalledProcessError.\n",
    "subprocess.run(prep_cmd, cwd='/kaggle/working/FCES-native', check=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── Cell 3: Inspect Artifacts ──────────────────────────────────────────────\n",
    "# Prints the key figures from prep_summary.json and, when the file exists, the\n",
    "# 'best' entry of step1_best_config.json.\n",
    "import json, os\n",
    "\n",
    "def _read_json(path):\n",
    "    \"\"\"Open the file at `path` and return its decoded JSON content.\"\"\"\n",
    "    with open(path) as fh:\n",
    "        return json.load(fh)\n",
    "\n",
    "out = '/kaggle/working/FCES-native/kaggle_output'\n",
    "summary_path = os.path.join(out, 'prep_summary.json')\n",
    "cfg_path = os.path.join(out, 'step1_best_config.json')\n",
    "\n",
    "# The summary is required: a missing file raises here.\n",
    "summary = _read_json(summary_path)\n",
    "\n",
    "print(f\"Total time : {summary['total_wall_time_s']}s\")\n",
    "print(f\"Total size : {summary['total_size_mb']} MB\")\n",
    "print(f\"Artifacts  : {len(summary['output_files'])} files\")\n",
    "print('\\nRunPod hint:')\n",
    "print(summary['runpod_savings_hint'])\n",
    "\n",
    "# Show best hyperparams — optional, only when the config file was written\n",
    "if os.path.exists(cfg_path):\n",
    "    cfg = _read_json(cfg_path)\n",
    "    print(f\"\\nBest Hparams: {cfg['best']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ── Cell 4: Publish as Kaggle Dataset ─────────────────────────────────────\n",
    "# (Optional — only if the Kaggle API is configured)\n",
    "# NOTE(review): Cell 3 reads the artifacts from\n",
    "# '/kaggle/working/FCES-native/kaggle_output', while the folder below is\n",
    "# '/kaggle/working/output' — confirm which one the pipeline writes to before\n",
    "# enabling this cell.\n",
    "# NOTE(review): dataset_create_new presumably needs a dataset-metadata.json\n",
    "# inside the folder — verify against the Kaggle API docs.\n",
    "# import kaggle\n",
    "# kaggle.api.dataset_create_new(\n",
    "#     folder='/kaggle/working/output',\n",
    "#     dir_mode='zip',\n",
    "#     convert_to_csv=False,\n",
    "# )"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}