144 lines
4.7 KiB
Plaintext
144 lines
4.7 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# FCES-native Kaggle Preparation Pipeline\n",
|
|
"\n",
|
|
"Führt alle 5 Vorbereitungsschritte durch:\n",
|
|
"1. Hyperparameter Sweep\n",
|
|
"2. Adapter Library aufbauen\n",
|
|
"3. MoE Gate vortrainieren\n",
|
|
"4. FCES Warm-Start Checkpoint\n",
|
|
"5. Klausuren-Dataset tokenisieren\n",
|
|
"\n",
|
|
"Alle Artefakte landen in `/kaggle/working/FCES-native/kaggle_output/` und können als Dataset veröffentlicht werden."
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# ── Cell 1: Environment Setup ──────────────────────────────────────────────\n",
  "# Clones the FCES-native repo, installs the Python dependencies and tries to\n",
  "# build the C++ extension in place. A failed build is reported, not raised.\n",
  "import os, subprocess, sys\n",
  "\n",
  "REPO_URL = 'https://git.zky.de/sven/FCES-native'\n",
  "REPO_DIR = '/kaggle/working/FCES-native'\n",
  "\n",
  "# Clone repo. Skipped when a checkout already exists so the cell can be re-run:\n",
  "# `git clone` refuses a non-empty target directory and check=True would raise.\n",
  "if os.path.isdir(os.path.join(REPO_DIR, '.git')):\n",
  "    print(f'[OK] repo already present at {REPO_DIR} — skipping clone')\n",
  "else:\n",
  "    subprocess.run(['git', 'clone', '--depth=1', REPO_URL, REPO_DIR], check=True)\n",
  "\n",
  "# Install Python deps\n",
  "subprocess.run([sys.executable, '-m', 'pip', 'install', '-q',\n",
  "                'transformers', 'dspy-ai', 'pydantic', 'structlog'],\n",
  "               check=True)\n",
  "\n",
  "# Compile C++ extension. Deliberately no check=True: a failed build only\n",
  "# produces the warning below, and the captured stderr tail is shown for diagnosis.\n",
  "result = subprocess.run(\n",
  "    [sys.executable, 'python/setup.py', 'build_ext', '--inplace'],\n",
  "    cwd=REPO_DIR,\n",
  "    capture_output=True, text=True\n",
  ")\n",
  "if result.returncode != 0:\n",
  "    print('[WARN] fces_native compile failed — will use AdamW fallback for Step 4')\n",
  "    print(result.stderr[-500:])\n",
  "else:\n",
  "    print('[OK] fces_native compiled successfully')\n",
  "\n",
  "print('Setup complete!')"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# ── Cell 2: Run Full Pipeline ──────────────────────────────────────────────\n",
  "# Adjust model and steps for your Kaggle GPU quota:\n",
  "#   T4   (15h/week): pythia-70m,  sweep=10, adapters=30, warm=20\n",
  "#   P100 (30h/week): pythia-160m, sweep=15, adapters=50, warm=30\n",
  "#   L4   (30h/week): pythia-410m, sweep=20, adapters=80, warm=50\n",
  "\n",
  "MODEL = 'EleutherAI/pythia-160m'  # <- pick a larger model for better adapters\n",
  "SWEEP_STEPS = 15\n",
  "ADAPTER_STEPS = 50\n",
  "WARM_STEPS = 30\n",
  "SKIP = ''  # e.g. '1,3' to skip steps\n",
  "\n",
  "# Command line for kaggle_prep.py; numeric settings are passed as strings.\n",
  "prep_cmd = [sys.executable, 'kaggle_prep.py']\n",
  "prep_cmd += ['--model', MODEL]\n",
  "prep_cmd += ['--sweep-steps', str(SWEEP_STEPS)]\n",
  "prep_cmd += ['--adapter-steps', str(ADAPTER_STEPS)]\n",
  "prep_cmd += ['--warm-steps', str(WARM_STEPS)]\n",
  "prep_cmd += ['--skip-steps', SKIP]\n",
  "\n",
  "# Runs inside the cloned repo; check=True raises if the script exits non-zero.\n",
  "subprocess.run(prep_cmd, cwd='/kaggle/working/FCES-native', check=True)"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# ── Cell 3: Inspect artifacts ──────────────────────────────────────────────\n",
  "# Reads prep_summary.json from the pipeline output directory and prints the\n",
  "# headline numbers, then the best hyperparameters if the sweep wrote them.\n",
  "import json, os\n",
  "\n",
  "out = '/kaggle/working/FCES-native/kaggle_output'\n",
  "summary_path = os.path.join(out, 'prep_summary.json')\n",
  "\n",
  "# Explicit encoding so the read does not depend on the locale default.\n",
  "with open(summary_path, encoding='utf-8') as f:\n",
  "    summary = json.load(f)\n",
  "\n",
  "print(f\"Total time : {summary['total_wall_time_s']}s\")\n",
  "print(f\"Total size : {summary['total_size_mb']} MB\")\n",
  "print(f\"Artifacts : {len(summary['output_files'])} files\")\n",
  "print()\n",
  "print('RunPod hint:')\n",
  "print(summary['runpod_savings_hint'])\n",
  "\n",
  "# Show best hyperparams (the file is optional, hence the existence check)\n",
  "cfg_path = os.path.join(out, 'step1_best_config.json')\n",
  "if os.path.exists(cfg_path):\n",
  "    with open(cfg_path, encoding='utf-8') as f:\n",
  "        cfg = json.load(f)\n",
  "    print(f\"\\nBest Hparams: {cfg['best']}\")"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# ── Cell 4: Publish als Kaggle Dataset ────────────────────────────────────\n",
|
|
"# (Optional — nur wenn Kaggle API konfiguriert)\n",
|
|
"# import kaggle\n",
|
|
"# kaggle.api.dataset_create_new(\n",
|
|
"# folder='/kaggle/working/FCES-native/kaggle_output',\n",
|
|
"# dir_mode='zip',\n",
|
|
"# convert_to_csv=False,\n",
|
|
"# )"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python",
|
|
"version": "3.10.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|