feat: Add playbook vector extraction and activation steering routing to FCES training pipeline

2026-05-23 08:33:27 +02:00
parent e0d8a32823
commit 4c9a550f8b
4 changed files with 884 additions and 46 deletions
--- a/python/run_representation_pipeline.py
+++ b/python/run_representation_pipeline.py
@@ -0,0 +1,473 @@
+"""Orchestration script to compile Skill and Process libraries, train Gating-MLP, and validate steering.
+
+Discovers local playbooks, extracts representation vectors using Pythia-70m (or dummy fallback),
+trains the gating network on hidden states, and runs validation.
+"""
+
+from __future__ import annotations
+
+import argparse
+import logging
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+
+# Import our representation engineering and router modules
+import sys
+
+sys.path.append(str(Path(__file__).parent.absolute()))
+
+from representation_engineering import (
+    PlaybookParser,
+    PlaybookMetadata,
+    RepresentationVectorExtractor,
+    SkillVectorLibrary,
+    ProcessVectorLibrary,
+)
+from adapter_moe_router import LearnableGate, ExpertAdapterRouter
+
+logger = logging.getLogger("run_representation_pipeline")
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+
+
+# --- Dummy Model & Tokenizer for Offline Fallback & Testing ---
+
+
+class DummyTransformerLayer(nn.Module):  # type: ignore[misc]
+    def __init__(self, hidden_dim: int) -> None:
+        super().__init__()
+        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
+        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
+
+    def forward(
+        self, x: torch.Tensor, *args: Any, **kwargs: Any
+    ) -> Tuple[torch.Tensor]:
+        h = self.linear2(torch.relu(self.linear1(x)))
+        return (x + h,)
+
+
+class DummyModel(nn.Module):  # type: ignore[misc]
+    def __init__(
+        self, vocab_size: int = 1000, hidden_dim: int = 32, num_layers: int = 4
+    ) -> None:
+        super().__init__()
+        self.vocab_size = vocab_size
+        self.hidden_dim = hidden_dim
+        self.embedding = nn.Embedding(vocab_size, hidden_dim)
+        self.layers = nn.ModuleList(
+            [DummyTransformerLayer(hidden_dim) for _ in range(num_layers)]
+        )
+        self.lm_head = nn.Linear(hidden_dim, vocab_size)
+
+    def forward(self, input_ids: torch.Tensor, *args: Any, **kwargs: Any) -> Any:
+        x = self.embedding(input_ids)
+        for layer in self.layers:
+            x = layer(x)[0]
+        logits = self.lm_head(x)
+
+        class Output:
+            def __init__(self, logits: torch.Tensor) -> None:
+                self.logits = logits
+
+        return Output(logits=logits)
+
+
+class DummyTokenizer:
+    def __init__(self) -> None:
+        pass
+
+    def __call__(
+        self, text: str | List[str], return_tensors: str = "pt", **kwargs: Any
+    ) -> Dict[str, torch.Tensor]:
+        if isinstance(text, str):
+            text = [text]
+        batch_ids = []
+        max_len = 0
+        for t in text:
+            words = t.split()
+            ids = [abs(hash(w)) % 1000 for w in words]
+            if not ids:
+                ids = [0]
+            batch_ids.append(ids)
+            max_len = max(max_len, len(ids))
+
+        padded_ids = []
+        for ids in batch_ids:
+            padded_ids.append(ids + [0] * (max_len - len(ids)))
+        return {"input_ids": torch.tensor(padded_ids)}
+
+
+# --- Helper to parse process steps from playbooks ---
+
+
+def parse_playbook_steps(path: Path) -> List[Tuple[str, str]]:
+    """Extracts step names and short descriptions from sequential lists in a playbook."""
+    content = path.read_text(encoding="utf-8")
+    steps = []
+
+    # Locate numbered items under headers like Fallback Procedure, Operational Protocol, etc.
+    lines = content.splitlines()
+    in_procedure = False
+
+    for line in lines:
+        line_str = line.strip()
+        if not line_str:
+            continue
+
+        if line_str.startswith("#"):
+            lower_header = line_str.lower()
+            in_procedure = any(
+                x in lower_header
+                for x in ["procedure", "protocol", "schritt", "ablauf", "loop"]
+            )
+            continue
+
+        if in_procedure:
+            # Match 1. Step name: Description or just 1. Description
+            match = re.match(r"^(\d+)\.\s*(.*?)$", line_str)
+            if match:
+                step_num = match.group(1)
+                step_text = match.group(2).strip()
+                # Split step name and description if separated by double asterisks or colon
+                parts = re.split(r"\*\*|:\s*", step_text, maxsplit=1)
+                if len(parts) > 1 and parts[0].strip():
+                    step_name = parts[0].strip()
+                    step_desc = parts[1].strip()
+                else:
+                    step_name = f"Step_{step_num}"
+                    step_desc = step_text
+                steps.append((step_name, step_desc))
+
+    return steps
+
+
+# --- Pipeline Orchestrator ---
+
+
+def run_pipeline(
+    model_id: str,
+    skills_dirs: List[str],
+    output_dir: str,
+    use_dummy: bool = False,
+    epochs: int = 50,
+    lr: float = 0.01,
+) -> None:
+    logger.info("Initializing Representation Engineering Pipeline...")
+
+    # 1. Resolve output directory
+    out_path = Path(output_dir)
+    out_path.mkdir(parents=True, exist_ok=True)
+
+    # 2. Load model and tokenizer (with dummy fallback)
+    model = None
+    tokenizer = None
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    if not use_dummy:
+        try:
+            logger.info(f"Attempting to load model '{model_id}' from HuggingFace...")
+            from transformers import AutoModelForCausalLM, AutoTokenizer
+
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
+            logger.info("Model loaded successfully.")
+        except Exception as e:
+            logger.warning(
+                f"Failed to load model from HuggingFace: {e}. Falling back to Dummy Model."
+            )
+            use_dummy = True
+
+    if use_dummy:
+        logger.info(
+            "Initializing lightweight Dummy Model and Tokenizer for execution/testing..."
+        )
+        model = DummyModel(vocab_size=1000, hidden_dim=32, num_layers=4).to(device)
+        tokenizer = DummyTokenizer()
+        device = "cpu"
+
+    assert model is not None
+    assert tokenizer is not None
+
+    # 3. Discover playbooks
+    logger.info("Scanning directories for playbooks...")
+    playbooks: List[PlaybookMetadata] = []
+    parsed_files: List[Path] = []
+
+    for s_dir in skills_dirs:
+        path = Path(s_dir)
+        if path.exists():
+            logger.info(f"Scanning directory: {path}")
+            found = PlaybookParser.parse_directory(path)
+            playbooks.extend(found)
+            # Find actual files for process parsing
+            for p_file in path.glob("**/SKILL.md"):
+                parsed_files.append(p_file)
+            for p_file in path.glob("*_SKILL.md"):
+                parsed_files.append(p_file)
+
+    if not playbooks:
+        logger.warning(
+            "No playbooks discovered! Creating a default mock playbook for execution..."
+        )
+        mock_skill = PlaybookMetadata(
+            name="MockSkill",
+            description="A temporary skill for pipeline validation",
+            objectives=[
+                "Validate the routing pipeline",
+                "Verify steering functionality",
+            ],
+            trigger_examples=[
+                "Run the mock pipeline check",
+                "Test activation steering on mock",
+            ],
+            file_path="mock_SKILL.md",
+        )
+        playbooks.append(mock_skill)
+
+    logger.info(f"Discovered {len(playbooks)} playbooks.")
+
+    # 4. Extract Skill and Process Libraries
+    skill_library = SkillVectorLibrary()
+    process_library = ProcessVectorLibrary()
+    extractor = RepresentationVectorExtractor(model, tokenizer, device=device)
+
+    # Determine model layers
+    transformer_layers = []
+    for name, module in model.named_modules():
+        if re.match(r".*layers?\.\d+$", name):
+            transformer_layers.append(name)
+    num_layers = len(transformer_layers)
+    logger.info(f"Detected {num_layers} transformer layers in base model.")
+
+    # Choose layers to extract: middle/late layers
+    layers_to_extract = (
+        list(range(num_layers // 2, num_layers)) if num_layers > 0 else [0]
+    )
+
+    # Extract skill vectors
+    for pb in playbooks:
+        logger.info(f"Extracting vectors for skill: {pb.name}")
+        vec = extractor.extract_steering_vector(pb, layers_to_extract=layers_to_extract)
+        skill_library.add_vector(vec)
+
+        # Extract sequential process vectors if file exists
+        p_path = Path(pb.file_path)
+        if p_path.exists():
+            steps = parse_playbook_steps(p_path)
+            if steps:
+                logger.info(
+                    f"Found {len(steps)} sequential steps in process '{pb.name}'"
+                )
+                step_vectors = []
+                for step_name, step_desc in steps:
+                    step_pb = PlaybookMetadata(
+                        name=f"{pb.name}_{step_name}",
+                        description=step_desc,
+                        objectives=[step_desc],
+                        trigger_examples=pb.trigger_examples,
+                        file_path=str(p_path),
+                    )
+                    step_vec = extractor.extract_steering_vector(
+                        step_pb, layers_to_extract=layers_to_extract
+                    )
+                    step_vectors.append(step_vec)
+                process_library.add_process(
+                    pb.name.lower().replace(" ", "_"), step_vectors
+                )
+
+    # Save libraries
+    skill_lib_path = out_path / "skill_library.pt"
+    process_lib_path = out_path / "process_library.pt"
+    skill_library.save(skill_lib_path)
+    process_library.save(process_lib_path)
+
+    # 5. Train Gating-MLP on hidden states
+    skill_ids = sorted(list(skill_library.vectors.keys()))
+    if not skill_ids:
+        logger.error("No skills available to train Gating-MLP.")
+        return
+
+    logger.info(f"Training Gating-MLP router over {len(skill_ids)} skills...")
+
+    # Gating features layer (we extract hidden states from early/middle layer to predict routing)
+    gate_layer_idx = layers_to_extract[0] if layers_to_extract else 0
+    hidden_dim = model.hidden_dim if hasattr(model, "hidden_dim") else 32
+    if hasattr(model, "config") and hasattr(model.config, "hidden_size"):
+        hidden_dim = model.config.hidden_size
+
+    gate_net = LearnableGate(in_features=hidden_dim, num_adapters=len(skill_ids)).to(
+        device
+    )
+    optimizer = optim.Adam(gate_net.parameters(), lr=lr)
+    criterion = nn.CrossEntropyLoss()
+
+    # Collect training dataset: (hidden_state, label)
+    # We pass the win prompts for each skill, hook the gate layer, and collect states
+    training_data: List[Tuple[torch.Tensor, int]] = []
+
+    # Temporary hook to collect states
+    collected_states: List[torch.Tensor] = []
+
+    def collect_hook(module: nn.Module, input_t: Any, output_t: Any) -> None:
+        x = output_t[0] if isinstance(output_t, tuple) else output_t
+        # Pool to sequence mean
+        collected_states.append(x.detach().mean(dim=1).squeeze(0))
+
+    # Register hook on gate layer
+    hook_handle = None
+    target_layer_name = (
+        transformer_layers[gate_layer_idx]
+        if gate_layer_idx < len(transformer_layers)
+        else None
+    )
+
+    if target_layer_name:
+        for name, module in model.named_modules():
+            if name == target_layer_name:
+                hook_handle = module.register_forward_hook(collect_hook)
+                break
+
+    # Run win prompts to collect activations
+    for label_idx, skill_id in enumerate(skill_ids):
+        # Retrieve parsed metadata for this skill ID
+        pb_match = next(
+            (p for p in playbooks if p.name.lower().replace(" ", "_") == skill_id), None
+        )
+        if pb_match:
+            for trigger in pb_match.trigger_examples:
+                win_prompt = (
+                    f"Instructions: You are acting with the following skill: {pb_match.name}.\n"
+                    f"Request: {trigger}\nOutput:"
+                )
+                inputs = tokenizer(win_prompt, return_tensors="pt")
+                # Move inputs to device
+                inputs = {k: v.to(device) for k, v in inputs.items()}
+
+                collected_states.clear()
+                with torch.no_grad():
+                    model(**inputs)
+
+                if collected_states:
+                    state = collected_states[0].cpu()  # shape [hidden_dim]
+                    training_data.append((state, label_idx))
+
+    if hook_handle:
+        hook_handle.remove()
+
+    if not training_data:
+        logger.warning(
+            "Could not collect training data. Using synthetic data to train Gating-MLP."
+        )
+        # Fallback to random features for testing compilation flow
+        for i in range(100):
+            label = i % len(skill_ids)
+            feat = torch.randn(hidden_dim) + (label * 2.0)  # separate them a bit
+            training_data.append((feat, label))
+
+    # Train MLP
+    gate_net.train()
+    X = torch.stack([x for x, y in training_data]).to(device)
+    Y = torch.tensor([y for x, y in training_data]).to(device)
+
+    logger.info(
+        f"Collected {len(training_data)} training samples. Starting optimization..."
+    )
+
+    for epoch in range(epochs):
+        optimizer.zero_grad()
+        outputs = gate_net(X)
+        loss = criterion(outputs, Y)
+        loss.backward()
+        optimizer.step()
+
+        if (epoch + 1) % max(1, epochs // 5) == 0 or epoch == epochs - 1:
+            # Calculate accuracy
+            _, preds = torch.max(outputs, 1)
+            correct = (preds == Y).sum().item()
+            acc = correct / len(Y)
+            logger.info(
+                f"Epoch {epoch+1:02d}/{epochs:02d} | Loss: {loss.item():.4f} | Training Accuracy: {acc * 100:.1f}%"
+            )
+
+    # Save gate weights
+    gate_weights_path = out_path / "gate_weights.pt"
+    torch.save(gate_net.state_dict(), gate_weights_path)
+    logger.info(f"Gating MLP weights saved to {gate_weights_path}")
+
+    # 6. Validate steerability & routing performance
+    logger.info("Running pipeline verification...")
+    router = ExpertAdapterRouter(
+        base_model=model,
+        skill_library=skill_library,
+        process_library=process_library,
+        in_features=hidden_dim,
+        steering_alpha=1.0,
+    )
+    router.skill_gate.load_state_dict(
+        torch.load(gate_weights_path, map_location=device)
+    )
+    router.register_hooks()
+
+    # Run test prompt with routing enabled
+    test_prompt = "Validate this test prompt"
+    inputs = tokenizer(test_prompt, return_tensors="pt")
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    # Test that forwarding goes through hooks without crash
+    with torch.no_grad():
+        _ = model(**inputs)
+    logger.info("Successfully executed forward pass with dynamic activation steering.")
+
+    router.unregister_hooks()
+    logger.info("Pipeline run completed successfully.")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="FCES Representation Engineering Pipeline"
+    )
+    parser.add_argument(
+        "--model_id",
+        type=str,
+        default="EleutherAI/pythia-70m",
+        help="HuggingFace model identifier",
+    )
+    parser.add_argument(
+        "--skills_dirs",
+        type=str,
+        default="C:/Users/Sven/Documents/svenco-knowledge/skills,C:/Users/Sven/Documents/everything-claude-code/skills",
+        help="Comma-separated paths to search for playbooks",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./python/output",
+        help="Directory to save compiled libraries",
+    )
+    parser.add_argument(
+        "--use_dummy", action="store_true", help="Force using lightweight dummy model"
+    )
+    parser.add_argument("--epochs", type=int, default=50, help="Gating training epochs")
+    parser.add_argument(
+        "--lr", type=float, default=0.01, help="Gating training learning rate"
+    )
+
+    args = parser.parse_args()
+
+    # Split skills dirs
+    dirs_list = [d.strip() for d in args.skills_dirs.split(",") if d.strip()]
+
+    run_pipeline(
+        model_id=args.model_id,
+        skills_dirs=dirs_list,
+        output_dir=args.output_dir,
+        use_dummy=args.use_dummy,
+        epochs=args.epochs,
+        lr=args.lr,
+    )