# FCES-native/python/run_representation_pipeline.py

"""Orchestration script to compile Skill and Process libraries, train Gating-MLP, and validate steering.

Discovers local playbooks, extracts representation vectors using Pythia-70m (or dummy fallback),
trains the gating network on hidden states, and runs validation.
"""

from __future__ import annotations

import argparse
import logging
import re
import zlib
from pathlib import Path
from typing import Any, Dict, List, Tuple

import torch
import torch.nn as nn
import torch.optim as optim

# Import our representation engineering and router modules
import sys

sys.path.append(str(Path(__file__).parent.absolute()))

from representation_engineering import (
    PlaybookParser,
    PlaybookMetadata,
    RepresentationVectorExtractor,
    SkillVectorLibrary,
    ProcessVectorLibrary,
)
from adapter_moe_router import LearnableGate, ExpertAdapterRouter

# Module-level logger shared by every function in this script. The basicConfig
# call runs at import time and requests INFO-level output with a
# "timestamp - logger name - level - message" record format.
logger = logging.getLogger("run_representation_pipeline")
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)


# --- Dummy Model & Tokenizer for Offline Fallback & Testing ---


class DummyTransformerLayer(nn.Module):  # type: ignore[misc]
    """Residual two-layer perceptron used as a stand-in for a transformer block.

    The layer keeps the feature width unchanged and wraps its result in a
    one-element tuple, so callers read the hidden state from index 0.
    """

    def __init__(self, hidden_dim: int) -> None:
        """Create the two square ``hidden_dim`` x ``hidden_dim`` projections."""
        super().__init__()
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)

    def forward(
        self, x: torch.Tensor, *args: Any, **kwargs: Any
    ) -> Tuple[torch.Tensor]:
        """Return ``(x + linear2(relu(linear1(x))),)``.

        Extra positional and keyword arguments are accepted and ignored.
        """
        projected = self.linear1(x)
        activated = torch.relu(projected)
        residual = self.linear2(activated)
        return (x + residual,)


class DummyModel(nn.Module):  # type: ignore[misc]
    """Tiny embedding -> residual blocks -> LM-head network for offline runs.

    The blocks are stored under the attribute ``layers`` so that their module
    names take the form ``layers.<index>``.
    """

    def __init__(
        self, vocab_size: int = 1000, hidden_dim: int = 32, num_layers: int = 4
    ) -> None:
        """Build the embedding table, ``num_layers`` blocks and the output head."""
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        blocks = []
        for _ in range(num_layers):
            blocks.append(DummyTransformerLayer(hidden_dim))
        self.layers = nn.ModuleList(blocks)
        self.lm_head = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids: torch.Tensor, *args: Any, **kwargs: Any) -> Any:
        """Embed ``input_ids``, apply every block in order and project to logits.

        Returns:
            An object whose only attribute is ``logits``. Extra positional and
            keyword arguments are accepted and ignored.
        """

        class Output:
            # Minimal result container exposing just the ``logits`` attribute.
            def __init__(self, logits: torch.Tensor) -> None:
                self.logits = logits

        hidden = self.embedding(input_ids)
        for block in self.layers:
            # Each block returns a tuple; the hidden state is its first element.
            block_result = block(hidden)
            hidden = block_result[0]
        return Output(logits=self.lm_head(hidden))


class DummyTokenizer:
    """Whitespace tokenizer that maps words to stable pseudo token IDs.

    IDs are derived from a CRC-32 checksum of each word's UTF-8 bytes rather
    than the builtin ``hash()``: string hashes are salted per interpreter
    process, which made token IDs (and everything computed from them) differ
    between runs.
    """

    def __init__(self, vocab_size: int = 1000) -> None:
        """Store the ID range.

        Args:
            vocab_size: Exclusive upper bound for generated IDs. The default
                of 1000 matches the modulus that was previously hard-coded.

        Raises:
            ValueError: If ``vocab_size`` is smaller than 1.
        """
        if vocab_size < 1:
            raise ValueError("vocab_size must be a positive integer")
        self.vocab_size = vocab_size

    def __call__(
        self, text: str | List[str], return_tensors: str = "pt", **kwargs: Any
    ) -> Dict[str, torch.Tensor]:
        """Tokenize one string or a batch of strings.

        Args:
            text: A single string or a list of strings.
            return_tensors: Accepted for call compatibility; not used.
            **kwargs: Accepted for call compatibility; not used.

        Returns:
            A dict with a single key ``"input_ids"`` holding one row per input
            text. Shorter rows are right-padded with 0, and a text without any
            words becomes the single ID 0.
        """
        texts = [text] if isinstance(text, str) else text

        batch_ids = []
        for item in texts:
            ids = [
                zlib.crc32(word.encode("utf-8", "surrogatepass")) % self.vocab_size
                for word in item.split()
            ]
            # Keep at least one position per row so every row is non-empty.
            batch_ids.append(ids or [0])

        max_len = max((len(ids) for ids in batch_ids), default=0)
        padded_ids = [ids + [0] * (max_len - len(ids)) for ids in batch_ids]
        return {"input_ids": torch.tensor(padded_ids)}


# --- Helper to parse process steps from playbooks ---


# Header keywords (English and German) that mark a section holding ordered steps.
_PROCEDURE_HEADER_KEYWORDS = ("procedure", "protocol", "schritt", "ablauf", "loop")
# "<number>. <text>" list item, matched against an already stripped line.
_NUMBERED_ITEM_RE = re.compile(r"^(\d+)\.\s*(.*?)$")
# Item text that opens with a bold name: "**Name**: description".
_LEADING_BOLD_RE = re.compile(r"^\*\*(.+?)\*\*\s*:?\s*(.*)$")
# Fallback separator between name and description: "**" or a colon.
_NAME_SEPARATOR_RE = re.compile(r"\*\*|:\s*")


def _split_step_text(step_num: str, step_text: str) -> Tuple[str, str]:
    """Split the text of one numbered item into a ``(name, description)`` pair."""
    # A leading "**Name**" has to be handled before the generic split: the
    # generic split cuts at the opening "**", leaving an empty name, so the
    # step would be reported as "Step_<n>" with the markup still in its text.
    bold = _LEADING_BOLD_RE.match(step_text)
    if bold:
        name = bold.group(1).strip().rstrip(":").strip()
        if name:
            # A bold-only item has no description; reuse the name so the
            # description is never empty.
            return name, bold.group(2).strip() or name

    parts = _NAME_SEPARATOR_RE.split(step_text, maxsplit=1)
    if len(parts) > 1 and parts[0].strip():
        return parts[0].strip(), parts[1].strip()
    return f"Step_{step_num}", step_text


def parse_playbook_steps(path: Path) -> List[Tuple[str, str]]:
    """Extract ``(step name, description)`` pairs from a playbook's numbered lists.

    Only numbered items (``1. ...``) are collected, and only while the most
    recent ``#`` header line contains one of the procedure keywords. Any other
    header line ends the procedure section.

    Args:
        path: Playbook file, read as UTF-8.

    Returns:
        The steps in file order. Items written as ``**Name**: text`` or
        ``Name: text`` yield ``("Name", "text")``; items without a recognizable
        name yield ``("Step_<number>", <full item text>)``.
    """
    content = path.read_text(encoding="utf-8")
    steps: List[Tuple[str, str]] = []
    in_procedure = False

    for raw_line in content.splitlines():
        line_str = raw_line.strip()
        if not line_str:
            continue

        if line_str.startswith("#"):
            lower_header = line_str.lower()
            in_procedure = any(
                keyword in lower_header for keyword in _PROCEDURE_HEADER_KEYWORDS
            )
            continue

        if not in_procedure:
            continue

        match = _NUMBERED_ITEM_RE.match(line_str)
        if match:
            steps.append(_split_step_text(match.group(1), match.group(2).strip()))

    return steps


# --- Pipeline Orchestrator ---


def _load_model_and_tokenizer(model_id: str, use_dummy: bool) -> Tuple[Any, Any, str]:
    """Load the HuggingFace model/tokenizer pair, or the offline dummy pair."""
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if not use_dummy:
        try:
            logger.info(f"Attempting to load model '{model_id}' from HuggingFace...")
            from transformers import AutoModelForCausalLM, AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained(model_id)
            model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
        except Exception as e:
            # Deliberate best-effort: any failure (missing package, no network,
            # unknown model id) falls through to the dummy model below.
            logger.warning(
                f"Failed to load model from HuggingFace: {e}. Falling back to Dummy Model."
            )
        else:
            logger.info("Model loaded successfully.")
            return model, tokenizer, device

    logger.info(
        "Initializing lightweight Dummy Model and Tokenizer for execution/testing..."
    )
    # The dummy path always runs on CPU. The device must be chosen *before* the
    # model is moved; otherwise the model can land on CUDA while every input
    # tensor is later created for "cpu".
    device = "cpu"
    model = DummyModel(vocab_size=1000, hidden_dim=32, num_layers=4).to(device)
    return model, DummyTokenizer(), device


def _discover_playbooks(skills_dirs: List[str]) -> List[PlaybookMetadata]:
    """Parse playbooks from every existing directory, or return one mock playbook."""
    playbooks: List[PlaybookMetadata] = []

    for s_dir in skills_dirs:
        path = Path(s_dir)
        if not path.exists():
            logger.warning(f"Skills directory not found, skipping: {path}")
            continue
        logger.info(f"Scanning directory: {path}")
        playbooks.extend(PlaybookParser.parse_directory(path))

    if not playbooks:
        logger.warning(
            "No playbooks discovered! Creating a default mock playbook for execution..."
        )
        playbooks.append(
            PlaybookMetadata(
                name="MockSkill",
                description="A temporary skill for pipeline validation",
                objectives=[
                    "Validate the routing pipeline",
                    "Verify steering functionality",
                ],
                trigger_examples=[
                    "Run the mock pipeline check",
                    "Test activation steering on mock",
                ],
                file_path="mock_SKILL.md",
            )
        )

    return playbooks


def _collect_gate_training_data(
    model: Any,
    tokenizer: Any,
    device: str,
    gate_module: nn.Module | None,
    playbooks: List[PlaybookMetadata],
    skill_ids: List[str],
) -> List[Tuple[torch.Tensor, int]]:
    """Run each skill's trigger prompts and capture pooled gate-layer states."""
    if gate_module is None:
        # Without a hooked layer nothing can be captured, so skip the forward passes.
        return []

    captured: List[torch.Tensor] = []

    def collect_hook(module: nn.Module, input_t: Any, output_t: Any) -> None:
        x = output_t[0] if isinstance(output_t, tuple) else output_t
        # Mean over dim 1, then drop dim 0. Assumes the output is laid out as
        # (batch=1, sequence, hidden) -- TODO confirm for non-dummy models.
        captured.append(x.detach().mean(dim=1).squeeze(0))

    # Index playbooks by normalized name once; the first playbook wins on
    # duplicate names, matching a first-match linear search.
    playbook_by_id: Dict[str, PlaybookMetadata] = {}
    for pb in playbooks:
        playbook_by_id.setdefault(pb.name.lower().replace(" ", "_"), pb)

    training_data: List[Tuple[torch.Tensor, int]] = []
    hook_handle = gate_module.register_forward_hook(collect_hook)
    try:
        for label_idx, skill_id in enumerate(skill_ids):
            pb_match = playbook_by_id.get(skill_id)
            if not pb_match:
                continue
            for trigger in pb_match.trigger_examples:
                win_prompt = (
                    f"Instructions: You are acting with the following skill: {pb_match.name}.\n"
                    f"Request: {trigger}\nOutput:"
                )
                inputs = tokenizer(win_prompt, return_tensors="pt")
                inputs = {k: v.to(device) for k, v in inputs.items()}

                captured.clear()
                with torch.no_grad():
                    model(**inputs)

                if captured:
                    training_data.append((captured[0].cpu(), label_idx))
    finally:
        # Always detach the hook so a failed forward pass cannot leave it on the model.
        hook_handle.remove()

    return training_data


def _train_gate(
    gate_net: nn.Module,
    training_data: List[Tuple[torch.Tensor, int]],
    device: str,
    epochs: int,
    lr: float,
) -> None:
    """Fit the gate with full-batch Adam and cross-entropy, logging progress."""
    optimizer = optim.Adam(gate_net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    gate_net.train()
    X = torch.stack([feat for feat, _ in training_data]).to(device)
    Y = torch.tensor([label for _, label in training_data]).to(device)

    logger.info(
        f"Collected {len(training_data)} training samples. Starting optimization..."
    )

    log_every = max(1, epochs // 5)
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = gate_net(X)
        loss = criterion(outputs, Y)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % log_every == 0 or epoch == epochs - 1:
            # Accuracy is computed from the outputs produced before this step's update.
            preds = outputs.argmax(dim=1)
            acc = (preds == Y).sum().item() / len(Y)
            logger.info(
                f"Epoch {epoch+1:02d}/{epochs:02d} | Loss: {loss.item():.4f} | Training Accuracy: {acc * 100:.1f}%"
            )


def run_pipeline(
    model_id: str,
    skills_dirs: List[str],
    output_dir: str,
    use_dummy: bool = False,
    epochs: int = 50,
    lr: float = 0.01,
) -> None:
    """Compile the skill/process libraries, train the gate and smoke-test routing.

    Steps, in order: create ``output_dir``; load the model (falling back to the
    dummy model if loading fails); discover playbooks; extract one steering
    vector per playbook plus one per parsed process step; save both libraries;
    train the gating network on hidden states captured from one layer; save the
    gate weights; run one forward pass with the router hooks registered.

    Args:
        model_id: HuggingFace model identifier to load.
        skills_dirs: Directories to scan for playbooks; missing ones are skipped.
        output_dir: Directory receiving ``skill_library.pt``,
            ``process_library.pt`` and ``gate_weights.pt``.
        use_dummy: Skip HuggingFace loading and use the dummy model/tokenizer.
        epochs: Number of full-batch optimization steps for the gate.
        lr: Adam learning rate for the gate.

    Returns:
        None. Returns early, after saving the libraries, if the skill library
        contains no vectors.
    """
    logger.info("Initializing Representation Engineering Pipeline...")

    # 1. Resolve output directory
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # 2. Load model and tokenizer (with dummy fallback)
    model, tokenizer, device = _load_model_and_tokenizer(model_id, use_dummy)

    # 3. Discover playbooks
    logger.info("Scanning directories for playbooks...")
    playbooks = _discover_playbooks(skills_dirs)
    logger.info(f"Discovered {len(playbooks)} playbooks.")

    # 4. Extract Skill and Process Libraries
    skill_library = SkillVectorLibrary()
    process_library = ProcessVectorLibrary()
    extractor = RepresentationVectorExtractor(model, tokenizer, device=device)

    # Transformer blocks are identified purely by module name ("...layer(s).<n>").
    layer_modules = [
        (name, module)
        for name, module in model.named_modules()
        if re.match(r".*layers?\.\d+$", name)
    ]
    num_layers = len(layer_modules)
    logger.info(f"Detected {num_layers} transformer layers in base model.")

    # Use the second half of the detected layers; fall back to index 0 if none matched.
    layers_to_extract = (
        list(range(num_layers // 2, num_layers)) if num_layers > 0 else [0]
    )

    for pb in playbooks:
        logger.info(f"Extracting vectors for skill: {pb.name}")
        vec = extractor.extract_steering_vector(pb, layers_to_extract=layers_to_extract)
        skill_library.add_vector(vec)

        # Sequential process vectors are only built when the playbook file exists.
        p_path = Path(pb.file_path)
        if not p_path.exists():
            continue
        steps = parse_playbook_steps(p_path)
        if not steps:
            continue

        logger.info(f"Found {len(steps)} sequential steps in process '{pb.name}'")
        step_vectors = []
        for step_name, step_desc in steps:
            step_pb = PlaybookMetadata(
                name=f"{pb.name}_{step_name}",
                description=step_desc,
                objectives=[step_desc],
                trigger_examples=pb.trigger_examples,
                file_path=str(p_path),
            )
            step_vectors.append(
                extractor.extract_steering_vector(
                    step_pb, layers_to_extract=layers_to_extract
                )
            )
        process_library.add_process(pb.name.lower().replace(" ", "_"), step_vectors)

    # Save libraries
    skill_library.save(out_path / "skill_library.pt")
    process_library.save(out_path / "process_library.pt")

    # 5. Train Gating-MLP on hidden states
    skill_ids = sorted(skill_library.vectors.keys())
    if not skill_ids:
        logger.error("No skills available to train Gating-MLP.")
        return

    logger.info(f"Training Gating-MLP router over {len(skill_ids)} skills...")

    # The gate reads hidden states from the first of the extracted layers.
    gate_layer_idx = layers_to_extract[0] if layers_to_extract else 0

    # Prefer the model config's hidden size; otherwise the model's own
    # ``hidden_dim`` attribute; otherwise 32.
    hidden_dim = getattr(model, "hidden_dim", 32)
    if hasattr(model, "config") and hasattr(model.config, "hidden_size"):
        hidden_dim = model.config.hidden_size

    gate_net = LearnableGate(in_features=hidden_dim, num_adapters=len(skill_ids)).to(
        device
    )

    gate_module = (
        layer_modules[gate_layer_idx][1] if gate_layer_idx < num_layers else None
    )
    training_data = _collect_gate_training_data(
        model, tokenizer, device, gate_module, playbooks, skill_ids
    )

    if not training_data:
        logger.warning(
            "Could not collect training data. Using synthetic data to train Gating-MLP."
        )
        # Random features offset per label, so the flow can still be exercised.
        for i in range(100):
            label = i % len(skill_ids)
            feat = torch.randn(hidden_dim) + (label * 2.0)
            training_data.append((feat, label))

    _train_gate(gate_net, training_data, device, epochs, lr)

    # Save gate weights
    gate_weights_path = out_path / "gate_weights.pt"
    torch.save(gate_net.state_dict(), gate_weights_path)
    logger.info(f"Gating MLP weights saved to {gate_weights_path}")

    # 6. Validate steerability & routing performance
    logger.info("Running pipeline verification...")
    router = ExpertAdapterRouter(
        base_model=model,
        skill_library=skill_library,
        process_library=process_library,
        in_features=hidden_dim,
        steering_alpha=1.0,
    )
    router.skill_gate.load_state_dict(
        torch.load(gate_weights_path, map_location=device)
    )
    router.register_hooks()
    try:
        test_prompt = "Validate this test prompt"
        inputs = tokenizer(test_prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Smoke test only: the forward pass must complete with the hooks active.
        with torch.no_grad():
            _ = model(**inputs)
        logger.info(
            "Successfully executed forward pass with dynamic activation steering."
        )
    finally:
        # Release the router hooks even if the forward pass raised.
        router.unregister_hooks()

    logger.info("Pipeline run completed successfully.")


if __name__ == "__main__":
    # Command-line entry point: declare the options as a table, parse them,
    # split the comma-separated directory list and hand everything to
    # run_pipeline.
    cli = argparse.ArgumentParser(
        description="FCES Representation Engineering Pipeline"
    )

    # (flag, add_argument keyword arguments), in the order shown by --help.
    option_table = (
        (
            "--model_id",
            {
                "type": str,
                "default": "EleutherAI/pythia-70m",
                "help": "HuggingFace model identifier",
            },
        ),
        (
            "--skills_dirs",
            {
                "type": str,
                "default": "C:/Users/Sven/Documents/svenco-knowledge/skills,C:/Users/Sven/Documents/everything-claude-code/skills",
                "help": "Comma-separated paths to search for playbooks",
            },
        ),
        (
            "--output_dir",
            {
                "type": str,
                "default": "./python/output",
                "help": "Directory to save compiled libraries",
            },
        ),
        (
            "--use_dummy",
            {
                "action": "store_true",
                "help": "Force using lightweight dummy model",
            },
        ),
        (
            "--epochs",
            {"type": int, "default": 50, "help": "Gating training epochs"},
        ),
        (
            "--lr",
            {
                "type": float,
                "default": 0.01,
                "help": "Gating training learning rate",
            },
        ),
    )
    for flag, spec in option_table:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()

    # Drop empty entries so stray or trailing commas do not produce blank paths.
    skill_roots = [
        entry for entry in map(str.strip, options.skills_dirs.split(",")) if entry
    ]

    run_pipeline(
        model_id=options.model_id,
        skills_dirs=skill_roots,
        output_dir=options.output_dir,
        use_dummy=options.use_dummy,
        epochs=options.epochs,
        lr=options.lr,
    )