"""Orchestration script to compile Skill and Process libraries, train Gating-MLP, and validate steering. Discovers local playbooks, extracts representation vectors using Pythia-70m (or dummy fallback), trains the gating network on hidden states, and runs validation. """ from __future__ import annotations import argparse import logging import re from pathlib import Path from typing import Any, Dict, List, Tuple import torch import torch.nn as nn import torch.optim as optim # Import our representation engineering and router modules import sys sys.path.append(str(Path(__file__).parent.absolute())) from representation_engineering import ( PlaybookParser, PlaybookMetadata, RepresentationVectorExtractor, SkillVectorLibrary, ProcessVectorLibrary, ) from adapter_moe_router import LearnableGate, ExpertAdapterRouter logger = logging.getLogger("run_representation_pipeline") logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" ) # --- Dummy Model & Tokenizer for Offline Fallback & Testing --- class DummyTransformerLayer(nn.Module): # type: ignore[misc] def __init__(self, hidden_dim: int) -> None: super().__init__() self.linear1 = nn.Linear(hidden_dim, hidden_dim) self.linear2 = nn.Linear(hidden_dim, hidden_dim) def forward( self, x: torch.Tensor, *args: Any, **kwargs: Any ) -> Tuple[torch.Tensor]: h = self.linear2(torch.relu(self.linear1(x))) return (x + h,) class DummyModel(nn.Module): # type: ignore[misc] def __init__( self, vocab_size: int = 1000, hidden_dim: int = 32, num_layers: int = 4 ) -> None: super().__init__() self.vocab_size = vocab_size self.hidden_dim = hidden_dim self.embedding = nn.Embedding(vocab_size, hidden_dim) self.layers = nn.ModuleList( [DummyTransformerLayer(hidden_dim) for _ in range(num_layers)] ) self.lm_head = nn.Linear(hidden_dim, vocab_size) def forward(self, input_ids: torch.Tensor, *args: Any, **kwargs: Any) -> Any: x = self.embedding(input_ids) for layer in self.layers: x = layer(x)[0] logits = self.lm_head(x) class Output: def __init__(self, logits: torch.Tensor) -> None: self.logits = logits return Output(logits=logits) class DummyTokenizer: def __init__(self) -> None: pass def __call__( self, text: str | List[str], return_tensors: str = "pt", **kwargs: Any ) -> Dict[str, torch.Tensor]: if isinstance(text, str): text = [text] batch_ids = [] max_len = 0 for t in text: words = t.split() ids = [abs(hash(w)) % 1000 for w in words] if not ids: ids = [0] batch_ids.append(ids) max_len = max(max_len, len(ids)) padded_ids = [] for ids in batch_ids: padded_ids.append(ids + [0] * (max_len - len(ids))) return {"input_ids": torch.tensor(padded_ids)} # --- Helper to parse process steps from playbooks --- def parse_playbook_steps(path: Path) -> List[Tuple[str, str]]: """Extracts step names and short descriptions from sequential lists in a playbook.""" content = path.read_text(encoding="utf-8") steps = [] # Locate numbered items under headers like Fallback Procedure, Operational Protocol, etc. lines = content.splitlines() in_procedure = False for line in lines: line_str = line.strip() if not line_str: continue if line_str.startswith("#"): lower_header = line_str.lower() in_procedure = any( x in lower_header for x in ["procedure", "protocol", "schritt", "ablauf", "loop"] ) continue if in_procedure: # Match 1. Step name: Description or just 1. Description match = re.match(r"^(\d+)\.\s*(.*?)$", line_str) if match: step_num = match.group(1) step_text = match.group(2).strip() # Split step name and description if separated by double asterisks or colon parts = re.split(r"\*\*|:\s*", step_text, maxsplit=1) if len(parts) > 1 and parts[0].strip(): step_name = parts[0].strip() step_desc = parts[1].strip() else: step_name = f"Step_{step_num}" step_desc = step_text steps.append((step_name, step_desc)) return steps # --- Pipeline Orchestrator --- def run_pipeline( model_id: str, skills_dirs: List[str], output_dir: str, use_dummy: bool = False, epochs: int = 50, lr: float = 0.01, ) -> None: logger.info("Initializing Representation Engineering Pipeline...") # 1. Resolve output directory out_path = Path(output_dir) out_path.mkdir(parents=True, exist_ok=True) # 2. Load model and tokenizer (with dummy fallback) model = None tokenizer = None device = "cuda" if torch.cuda.is_available() else "cpu" if not use_dummy: try: logger.info(f"Attempting to load model '{model_id}' from HuggingFace...") from transformers import AutoModelForCausalLM, AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_id) model = AutoModelForCausalLM.from_pretrained(model_id).to(device) logger.info("Model loaded successfully.") except Exception as e: logger.warning( f"Failed to load model from HuggingFace: {e}. Falling back to Dummy Model." ) use_dummy = True if use_dummy: logger.info( "Initializing lightweight Dummy Model and Tokenizer for execution/testing..." ) model = DummyModel(vocab_size=1000, hidden_dim=32, num_layers=4).to(device) tokenizer = DummyTokenizer() device = "cpu" assert model is not None assert tokenizer is not None # 3. Discover playbooks logger.info("Scanning directories for playbooks...") playbooks: List[PlaybookMetadata] = [] parsed_files: List[Path] = [] for s_dir in skills_dirs: path = Path(s_dir) if path.exists(): logger.info(f"Scanning directory: {path}") found = PlaybookParser.parse_directory(path) playbooks.extend(found) # Find actual files for process parsing for p_file in path.glob("**/SKILL.md"): parsed_files.append(p_file) for p_file in path.glob("*_SKILL.md"): parsed_files.append(p_file) if not playbooks: logger.warning( "No playbooks discovered! Creating a default mock playbook for execution..." ) mock_skill = PlaybookMetadata( name="MockSkill", description="A temporary skill for pipeline validation", objectives=[ "Validate the routing pipeline", "Verify steering functionality", ], trigger_examples=[ "Run the mock pipeline check", "Test activation steering on mock", ], file_path="mock_SKILL.md", ) playbooks.append(mock_skill) logger.info(f"Discovered {len(playbooks)} playbooks.") # 4. Extract Skill and Process Libraries skill_library = SkillVectorLibrary() process_library = ProcessVectorLibrary() extractor = RepresentationVectorExtractor(model, tokenizer, device=device) # Determine model layers transformer_layers = [] for name, module in model.named_modules(): if re.match(r".*layers?\.\d+$", name): transformer_layers.append(name) num_layers = len(transformer_layers) logger.info(f"Detected {num_layers} transformer layers in base model.") # Choose layers to extract: middle/late layers layers_to_extract = ( list(range(num_layers // 2, num_layers)) if num_layers > 0 else [0] ) # Extract skill vectors for pb in playbooks: logger.info(f"Extracting vectors for skill: {pb.name}") vec = extractor.extract_steering_vector(pb, layers_to_extract=layers_to_extract) skill_library.add_vector(vec) # Extract sequential process vectors if file exists p_path = Path(pb.file_path) if p_path.exists(): steps = parse_playbook_steps(p_path) if steps: logger.info( f"Found {len(steps)} sequential steps in process '{pb.name}'" ) step_vectors = [] for step_name, step_desc in steps: step_pb = PlaybookMetadata( name=f"{pb.name}_{step_name}", description=step_desc, objectives=[step_desc], trigger_examples=pb.trigger_examples, file_path=str(p_path), ) step_vec = extractor.extract_steering_vector( step_pb, layers_to_extract=layers_to_extract ) step_vectors.append(step_vec) process_library.add_process( pb.name.lower().replace(" ", "_"), step_vectors ) # Save libraries skill_lib_path = out_path / "skill_library.pt" process_lib_path = out_path / "process_library.pt" skill_library.save(skill_lib_path) process_library.save(process_lib_path) # 5. Train Gating-MLP on hidden states skill_ids = sorted(list(skill_library.vectors.keys())) if not skill_ids: logger.error("No skills available to train Gating-MLP.") return logger.info(f"Training Gating-MLP router over {len(skill_ids)} skills...") # Gating features layer (we extract hidden states from early/middle layer to predict routing) gate_layer_idx = layers_to_extract[0] if layers_to_extract else 0 hidden_dim = model.hidden_dim if hasattr(model, "hidden_dim") else 32 if hasattr(model, "config") and hasattr(model.config, "hidden_size"): hidden_dim = model.config.hidden_size gate_net = LearnableGate(in_features=hidden_dim, num_adapters=len(skill_ids)).to( device ) optimizer = optim.Adam(gate_net.parameters(), lr=lr) criterion = nn.CrossEntropyLoss() # Collect training dataset: (hidden_state, label) # We pass the win prompts for each skill, hook the gate layer, and collect states training_data: List[Tuple[torch.Tensor, int]] = [] # Temporary hook to collect states collected_states: List[torch.Tensor] = [] def collect_hook(module: nn.Module, input_t: Any, output_t: Any) -> None: x = output_t[0] if isinstance(output_t, tuple) else output_t # Pool to sequence mean collected_states.append(x.detach().mean(dim=1).squeeze(0)) # Register hook on gate layer hook_handle = None target_layer_name = ( transformer_layers[gate_layer_idx] if gate_layer_idx < len(transformer_layers) else None ) if target_layer_name: for name, module in model.named_modules(): if name == target_layer_name: hook_handle = module.register_forward_hook(collect_hook) break # Run win prompts to collect activations for label_idx, skill_id in enumerate(skill_ids): # Retrieve parsed metadata for this skill ID pb_match = next( (p for p in playbooks if p.name.lower().replace(" ", "_") == skill_id), None ) if pb_match: for trigger in pb_match.trigger_examples: win_prompt = ( f"Instructions: You are acting with the following skill: {pb_match.name}.\n" f"Request: {trigger}\nOutput:" ) inputs = tokenizer(win_prompt, return_tensors="pt") # Move inputs to device inputs = {k: v.to(device) for k, v in inputs.items()} collected_states.clear() with torch.no_grad(): model(**inputs) if collected_states: state = collected_states[0].cpu() # shape [hidden_dim] training_data.append((state, label_idx)) if hook_handle: hook_handle.remove() if not training_data: logger.warning( "Could not collect training data. Using synthetic data to train Gating-MLP." ) # Fallback to random features for testing compilation flow for i in range(100): label = i % len(skill_ids) feat = torch.randn(hidden_dim) + (label * 2.0) # separate them a bit training_data.append((feat, label)) # Train MLP gate_net.train() X = torch.stack([x for x, y in training_data]).to(device) Y = torch.tensor([y for x, y in training_data]).to(device) logger.info( f"Collected {len(training_data)} training samples. Starting optimization..." ) for epoch in range(epochs): optimizer.zero_grad() outputs = gate_net(X) loss = criterion(outputs, Y) loss.backward() optimizer.step() if (epoch + 1) % max(1, epochs // 5) == 0 or epoch == epochs - 1: # Calculate accuracy _, preds = torch.max(outputs, 1) correct = (preds == Y).sum().item() acc = correct / len(Y) logger.info( f"Epoch {epoch+1:02d}/{epochs:02d} | Loss: {loss.item():.4f} | Training Accuracy: {acc * 100:.1f}%" ) # Save gate weights gate_weights_path = out_path / "gate_weights.pt" torch.save(gate_net.state_dict(), gate_weights_path) logger.info(f"Gating MLP weights saved to {gate_weights_path}") # 6. Validate steerability & routing performance logger.info("Running pipeline verification...") router = ExpertAdapterRouter( base_model=model, skill_library=skill_library, process_library=process_library, in_features=hidden_dim, steering_alpha=1.0, ) router.skill_gate.load_state_dict( torch.load(gate_weights_path, map_location=device) ) router.register_hooks() # Run test prompt with routing enabled test_prompt = "Validate this test prompt" inputs = tokenizer(test_prompt, return_tensors="pt") inputs = {k: v.to(device) for k, v in inputs.items()} # Test that forwarding goes through hooks without crash with torch.no_grad(): _ = model(**inputs) logger.info("Successfully executed forward pass with dynamic activation steering.") router.unregister_hooks() logger.info("Pipeline run completed successfully.") if __name__ == "__main__": parser = argparse.ArgumentParser( description="FCES Representation Engineering Pipeline" ) parser.add_argument( "--model_id", type=str, default="EleutherAI/pythia-70m", help="HuggingFace model identifier", ) parser.add_argument( "--skills_dirs", type=str, default="C:/Users/Sven/Documents/svenco-knowledge/skills,C:/Users/Sven/Documents/everything-claude-code/skills", help="Comma-separated paths to search for playbooks", ) parser.add_argument( "--output_dir", type=str, default="./python/output", help="Directory to save compiled libraries", ) parser.add_argument( "--use_dummy", action="store_true", help="Force using lightweight dummy model" ) parser.add_argument("--epochs", type=int, default=50, help="Gating training epochs") parser.add_argument( "--lr", type=float, default=0.01, help="Gating training learning rate" ) args = parser.parse_args() # Split skills dirs dirs_list = [d.strip() for d in args.skills_dirs.split(",") if d.strip()] run_pipeline( model_id=args.model_id, skills_dirs=dirs_list, output_dir=args.output_dir, use_dummy=args.use_dummy, epochs=args.epochs, lr=args.lr, )