feat: Add playbook vector extraction and activation steering routing to FCES training pipeline

2026-05-23 08:33:27 +02:00
parent e0d8a32823
commit 4c9a550f8b
4 changed files with 884 additions and 46 deletions
--- a/tests/test_representation_engineering.py
+++ b/tests/test_representation_engineering.py
@@ -0,0 +1,251 @@
+"""Unit tests for Representation Engineering and Vector Library Compilation."""
+
+from __future__ import annotations
+
+import os
+import sys
+import tempfile
+import unittest
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+
+# Ensure python directory is in path
+sys.path.append(
+    os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "python")
+)
+
+from representation_engineering import (
+    PlaybookParser,
+    PlaybookMetadata,
+    RepresentationVector,
+    RepresentationVectorExtractor,
+    SkillVectorLibrary,
+    ProcessVectorLibrary,
+)
+from adapter_moe_router import ExpertAdapterRouter
+
+
+# --- Mock Classes for Testing ---
+
+
+class SimpleTransformerLayer(nn.Module):  # type: ignore[misc]
+    def __init__(self, hidden_dim: int) -> None:
+        super().__init__()
+        self.linear = nn.Linear(hidden_dim, hidden_dim, bias=False)
+        with torch.no_grad():
+            self.linear.weight.fill_(0.1)
+
+    def forward(
+        self, x: torch.Tensor, *args: Any, **kwargs: Any
+    ) -> Tuple[torch.Tensor]:
+        return (x + self.linear(x),)
+
+
+class SimpleModel(nn.Module):  # type: ignore[misc]
+    def __init__(self, hidden_dim: int = 16) -> None:
+        super().__init__()
+        self.hidden_dim = hidden_dim
+        # We name layers matching .layers.\d+$ to verify registration
+        self.layers = nn.ModuleList(
+            [SimpleTransformerLayer(hidden_dim), SimpleTransformerLayer(hidden_dim)]
+        )
+
+    def forward(self, input_ids: torch.Tensor, *args: Any, **kwargs: Any) -> Any:
+        # Simple embedding emulation: shape [batch, seq_len, hidden_dim]
+        # input_ids shape: [batch, seq_len]
+        batch_size, seq_len = input_ids.shape
+        x = (
+            torch.ones(batch_size, seq_len, self.hidden_dim, dtype=torch.float32)
+            * input_ids.unsqueeze(-1).float()
+        )
+        for layer in self.layers:
+            x = layer(x)[0]
+
+        class Output:
+            def __init__(self, logits: torch.Tensor) -> None:
+                self.logits = logits
+
+        return Output(logits=x)
+
+
+class SimpleTokenizer:
+    def __call__(
+        self, text: str | List[str], return_tensors: str = "pt", **kwargs: Any
+    ) -> Dict[str, torch.Tensor]:
+        if isinstance(text, str):
+            text = [text]
+        val = 2 if any("skill" in t.lower() for t in text) else 1
+        return {"input_ids": torch.ones(len(text), 5, dtype=torch.long) * val}
+
+
+# --- Test Cases ---
+
+
+class TestRepresentationEngineering(unittest.TestCase):
+    def setUp(self) -> None:
+        self.model = SimpleModel(hidden_dim=16)
+        self.tokenizer = SimpleTokenizer()
+        self.extractor = RepresentationVectorExtractor(
+            self.model, self.tokenizer, device="cpu"
+        )
+
+    def test_playbook_parser(self) -> None:
+        # Create a mock SKILL.md content
+        mock_content = """---
+name: Mock Test Skill
+description: >-
+  This is a description
+  with multiline text.
+---
+# Skill: Mock Test Skill
+
+## Objective
+- Analyze inputs dynamically.
+- Perform steering optimization.
+
+## Activation Trigger
+- Use this when testing the parser.
+- Trigger example two.
+"""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            file_path = Path(tmpdir) / "SKILL.md"
+            file_path.write_text(mock_content, encoding="utf-8")
+
+            pb = PlaybookParser.parse_file(file_path)
+            assert pb is not None
+            self.assertEqual(pb.name, "Mock Test Skill")
+            self.assertEqual(
+                pb.description, "This is a description with multiline text."
+            )
+            self.assertIn("Analyze inputs dynamically.", pb.objectives)
+            self.assertIn("Use this when testing the parser.", pb.trigger_examples)
+            self.assertIn("Trigger example two.", pb.trigger_examples)
+
+    def test_vector_extraction(self) -> None:
+        mock_pb = PlaybookMetadata(
+            name="TestSteering",
+            description="Steering description",
+            objectives=["Objective 1"],
+            trigger_examples=["Trigger 1"],
+            file_path="mock_path.md",
+        )
+        # Extract from layer index 1
+        vec = self.extractor.extract_steering_vector(mock_pb, layers_to_extract=[1])
+        self.assertEqual(vec.skill_id, "teststeering")
+        self.assertIn(1, vec.layer_vectors)
+        self.assertEqual(vec.layer_vectors[1].shape, (16,))
+        # Assert normalized to unit norm
+        self.assertAlmostEqual(torch.norm(vec.layer_vectors[1]).item(), 1.0, places=5)
+
+    def test_libraries_save_and_load(self) -> None:
+        skill_lib = SkillVectorLibrary()
+        process_lib = ProcessVectorLibrary()
+
+        # Add mock vectors
+        v1 = RepresentationVector(
+            skill_id="skill_a",
+            layer_vectors={0: torch.ones(16), 1: torch.zeros(16)},
+            metadata={"name": "Skill A"},
+        )
+        skill_lib.add_vector(v1)
+
+        v2 = RepresentationVector(
+            skill_id="step_1",
+            layer_vectors={0: torch.ones(16) * 0.5},
+            metadata={"name": "Step 1"},
+        )
+        process_lib.add_process("process_a", [v2])
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            skill_path = Path(tmpdir) / "skills.pt"
+            proc_path = Path(tmpdir) / "processes.pt"
+
+            skill_lib.save(skill_path)
+            process_lib.save(proc_path)
+
+            self.assertTrue(skill_path.exists())
+            self.assertTrue(proc_path.exists())
+
+            # Load into new libraries
+            new_skill_lib = SkillVectorLibrary()
+            new_proc_lib = ProcessVectorLibrary()
+
+            new_skill_lib.load(skill_path)
+            new_proc_lib.load(proc_path)
+
+            loaded_v1 = new_skill_lib.get_vector("skill_a")
+            assert loaded_v1 is not None
+            self.assertEqual(loaded_v1.skill_id, "skill_a")
+            self.assertTrue(torch.allclose(loaded_v1.layer_vectors[0], torch.ones(16)))
+
+            loaded_v2 = new_proc_lib.get_process_step("process_a", 0)
+            assert loaded_v2 is not None
+            self.assertEqual(loaded_v2.skill_id, "step_1")
+            self.assertTrue(
+                torch.allclose(loaded_v2.layer_vectors[0], torch.ones(16) * 0.5)
+            )
+
+    def test_router_activation_steering(self) -> None:
+        # Create a skill library and add a steering vector
+        skill_lib = SkillVectorLibrary()
+        v = RepresentationVector(
+            skill_id="skill_steer",
+            layer_vectors={0: torch.ones(16) * 0.5, 1: torch.ones(16) * -0.5},
+            metadata={"name": "Steer"},
+        )
+        skill_lib.add_vector(v)
+
+        # Setup router
+        router = ExpertAdapterRouter(
+            base_model=self.model,
+            skill_library=skill_lib,
+            in_features=16,
+            steering_alpha=2.0,
+            steering_mode="prompt",
+        )
+
+        # Test 1: Forward without hooks (baseline)
+        x = torch.ones(1, 4, dtype=torch.long)  # batch=1, seq_len=4
+        with torch.no_grad():
+            out_base = self.model(x).logits
+
+        # Test 2: Register hooks and set active routing to skill_steer (index 0)
+        router.register_hooks()
+        self.assertEqual(len(router.hooks), 2)  # Should hook both layers
+
+        priors = torch.tensor([1.0])
+        router.set_active_routing(priors)
+
+        with torch.no_grad():
+            out_steered = self.model(x).logits
+
+        # Verify that steering changed the output
+        self.assertFalse(torch.allclose(out_base, out_steered))
+
+        # Test 3: Process sequential routing
+        router.unregister_hooks()
+        proc_lib = ProcessVectorLibrary()
+        v_proc = RepresentationVector(
+            skill_id="step_a",
+            layer_vectors={0: torch.ones(16) * 10.0},  # large steering value
+            metadata={"name": "Step A"},
+        )
+        proc_lib.add_process("my_workflow", [v_proc])
+
+        router.process_library = proc_lib
+        router.active_process_id = "my_workflow"
+        router.active_process_step = 0
+        router.register_hooks()
+
+        with torch.no_grad():
+            out_proc = self.model(x).logits
+
+        self.assertFalse(torch.allclose(out_base, out_proc))
+        router.unregister_hooks()
+
+
+if __name__ == "__main__":
+    unittest.main()