# FCES-native/tests/test_representation_engineering.py

"""Unit tests for Representation Engineering and Vector Library Compilation."""

from __future__ import annotations

import os
import sys
import tempfile
import unittest
from pathlib import Path
from typing import Any, Dict, List, Tuple

import torch
import torch.nn as nn

# Make the ``python`` directory that sits beside this file's parent directory
# importable, so the project modules imported below can be resolved.
_TESTS_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(os.path.dirname(_TESTS_DIR), "python"))

from representation_engineering import (
    PlaybookParser,
    PlaybookMetadata,
    RepresentationVector,
    RepresentationVectorExtractor,
    SkillVectorLibrary,
    ProcessVectorLibrary,
)
from adapter_moe_router import ExpertAdapterRouter


# --- Mock Classes for Testing ---


class SimpleTransformerLayer(nn.Module):  # type: ignore[misc]
    """Minimal residual layer standing in for a transformer block in tests.

    The layer owns one bias-free square linear map whose weights are all set
    to 0.1, so its output is fully deterministic.
    """

    def __init__(self, hidden_dim: int) -> None:
        """Create the ``hidden_dim`` x ``hidden_dim`` projection with constant weights."""
        super().__init__()
        self.linear = nn.Linear(hidden_dim, hidden_dim, bias=False)
        # Overwrite the random initialisation so results do not depend on the RNG.
        nn.init.constant_(self.linear.weight, 0.1)

    def forward(
        self, x: torch.Tensor, *args: Any, **kwargs: Any
    ) -> Tuple[torch.Tensor]:
        """Return a 1-tuple holding ``x`` plus its linear projection.

        Extra positional and keyword arguments are accepted and ignored.
        """
        projected = self.linear(x)
        return (x + projected,)


class SimpleModel(nn.Module):  # type: ignore[misc]
    """Two-layer stand-in model whose forward pass is cheap and deterministic."""

    def __init__(self, hidden_dim: int = 16) -> None:
        """Store ``hidden_dim`` and build two ``SimpleTransformerLayer`` children."""
        super().__init__()
        self.hidden_dim = hidden_dim
        # We name layers matching .layers.\d+$ to verify registration
        self.layers = nn.ModuleList(
            SimpleTransformerLayer(hidden_dim) for _ in range(2)
        )

    def forward(self, input_ids: torch.Tensor, *args: Any, **kwargs: Any) -> Any:
        """Run the fake embedding and every layer, returning an object with ``.logits``.

        ``input_ids`` must be two-dimensional (``[batch, seq_len]``); each id is
        broadcast across ``hidden_dim`` to emulate an embedding lookup. The
        returned ``logits`` attribute is the final hidden state of shape
        ``[batch, seq_len, hidden_dim]``. Extra arguments are ignored.
        """
        n_batch, n_tokens = input_ids.shape
        # Embedding emulation: every feature of a position equals its token id.
        id_column = input_ids.unsqueeze(-1).float()
        hidden = (
            torch.ones(n_batch, n_tokens, self.hidden_dim, dtype=torch.float32)
            * id_column
        )
        # Each layer returns a tuple; only its first element is carried forward.
        for block in self.layers:
            hidden = block(hidden)[0]

        class Output:
            """Bare container exposing the result under the ``logits`` attribute."""

            def __init__(self, logits: torch.Tensor) -> None:
                self.logits = logits

        return Output(logits=hidden)


class SimpleTokenizer:
    """Callable tokenizer stub that emits constant-valued ``input_ids``."""

    def __call__(
        self, text: str | List[str], return_tensors: str = "pt", **kwargs: Any
    ) -> Dict[str, torch.Tensor]:
        """Return ``{"input_ids": tensor}`` of shape ``[len(text), 5]``.

        A single string is treated as a batch of one. Every id is 2 when any
        text in the batch contains "skill" (case-insensitive), otherwise 1.
        ``return_tensors`` and other keyword arguments are accepted and ignored.
        """
        prompts = [text] if isinstance(text, str) else text

        mentions_skill = False
        for prompt in prompts:
            if "skill" in prompt.lower():
                mentions_skill = True
                break

        token_id = 2 if mentions_skill else 1
        ids = torch.full((len(prompts), 5), token_id, dtype=torch.long)
        return {"input_ids": ids}


# --- Test Cases ---


class TestRepresentationEngineering(unittest.TestCase):
    """Tests for playbook parsing, vector extraction, library persistence and steering."""

    # Hidden width shared by the mock model and every hand-built vector.
    _HIDDEN_DIM = 16

    def setUp(self) -> None:
        """Build a fresh mock model, tokenizer and extractor before each test."""
        self.model = SimpleModel(hidden_dim=self._HIDDEN_DIM)
        self.tokenizer = SimpleTokenizer()
        self.extractor = RepresentationVectorExtractor(
            self.model, self.tokenizer, device="cpu"
        )

    def test_playbook_parser(self) -> None:
        """Parsing a SKILL.md file yields its name, description and bullet lists."""
        # File content is kept flush-left so the front matter and the markdown
        # headings start at column zero, exactly as they would on disk.
        skill_markdown = """---
name: Mock Test Skill
description: >-
  This is a description
  with multiline text.
---
# Skill: Mock Test Skill

## Objective
- Analyze inputs dynamically.
- Perform steering optimization.

## Activation Trigger
- Use this when testing the parser.
- Trigger example two.
"""
        with tempfile.TemporaryDirectory() as workdir:
            skill_file = Path(workdir) / "SKILL.md"
            skill_file.write_text(skill_markdown, encoding="utf-8")

            playbook = PlaybookParser.parse_file(skill_file)
            assert playbook is not None

            self.assertEqual(playbook.name, "Mock Test Skill")
            # The folded multi-line description is expected as a single line.
            self.assertEqual(
                playbook.description, "This is a description with multiline text."
            )
            self.assertIn("Analyze inputs dynamically.", playbook.objectives)
            for expected_trigger in (
                "Use this when testing the parser.",
                "Trigger example two.",
            ):
                self.assertIn(expected_trigger, playbook.trigger_examples)

    def test_vector_extraction(self) -> None:
        """Extraction returns a unit-norm vector of the model width for the requested layer."""
        playbook = PlaybookMetadata(
            name="TestSteering",
            description="Steering description",
            objectives=["Objective 1"],
            trigger_examples=["Trigger 1"],
            file_path="mock_path.md",
        )
        target_layer = 1
        steering = self.extractor.extract_steering_vector(
            playbook, layers_to_extract=[target_layer]
        )

        self.assertEqual(steering.skill_id, "teststeering")
        self.assertIn(target_layer, steering.layer_vectors)

        direction = steering.layer_vectors[target_layer]
        self.assertEqual(direction.shape, (self._HIDDEN_DIM,))
        # The extracted direction must be normalized to unit length.
        self.assertAlmostEqual(torch.norm(direction).item(), 1.0, places=5)

    def test_libraries_save_and_load(self) -> None:
        """Skill and process libraries survive a save/load round trip through disk."""
        dim = self._HIDDEN_DIM
        skill_lib = SkillVectorLibrary()
        process_lib = ProcessVectorLibrary()

        # Populate one skill vector and one single-step process.
        skill_vector = RepresentationVector(
            skill_id="skill_a",
            layer_vectors={0: torch.ones(dim), 1: torch.zeros(dim)},
            metadata={"name": "Skill A"},
        )
        skill_lib.add_vector(skill_vector)

        step_vector = RepresentationVector(
            skill_id="step_1",
            layer_vectors={0: torch.ones(dim) * 0.5},
            metadata={"name": "Step 1"},
        )
        process_lib.add_process("process_a", [step_vector])

        with tempfile.TemporaryDirectory() as workdir:
            root = Path(workdir)
            skill_path = root / "skills.pt"
            proc_path = root / "processes.pt"

            skill_lib.save(skill_path)
            process_lib.save(proc_path)

            self.assertTrue(skill_path.exists())
            self.assertTrue(proc_path.exists())

            # Reload into brand-new, empty libraries.
            restored_skills = SkillVectorLibrary()
            restored_processes = ProcessVectorLibrary()

            restored_skills.load(skill_path)
            restored_processes.load(proc_path)

            restored_skill = restored_skills.get_vector("skill_a")
            assert restored_skill is not None
            self.assertEqual(restored_skill.skill_id, "skill_a")
            self.assertTrue(
                torch.allclose(restored_skill.layer_vectors[0], torch.ones(dim))
            )

            restored_step = restored_processes.get_process_step("process_a", 0)
            assert restored_step is not None
            self.assertEqual(restored_step.skill_id, "step_1")
            self.assertTrue(
                torch.allclose(restored_step.layer_vectors[0], torch.ones(dim) * 0.5)
            )

    def test_router_activation_steering(self) -> None:
        """Router hooks change model output for both skill and process steering."""
        dim = self._HIDDEN_DIM

        # A library holding a single skill with a vector for each model layer.
        skill_lib = SkillVectorLibrary()
        skill_vector = RepresentationVector(
            skill_id="skill_steer",
            layer_vectors={0: torch.ones(dim) * 0.5, 1: torch.ones(dim) * -0.5},
            metadata={"name": "Steer"},
        )
        skill_lib.add_vector(skill_vector)

        router = ExpertAdapterRouter(
            base_model=self.model,
            skill_library=skill_lib,
            in_features=dim,
            steering_alpha=2.0,
            steering_mode="prompt",
        )

        # Phase 1: baseline forward pass before any hook is installed.
        token_ids = torch.ones(1, 4, dtype=torch.long)  # batch=1, seq_len=4
        with torch.no_grad():
            baseline_logits = self.model(token_ids).logits

        # Phase 2: install hooks and route to skill_steer (index 0).
        router.register_hooks()
        self.assertEqual(len(router.hooks), 2)  # Should hook both layers

        router.set_active_routing(torch.tensor([1.0]))

        with torch.no_grad():
            skill_logits = self.model(token_ids).logits

        # Steering must have altered the output.
        self.assertFalse(torch.allclose(baseline_logits, skill_logits))

        # Phase 3: switch to sequential process routing.
        router.unregister_hooks()
        proc_lib = ProcessVectorLibrary()
        process_step = RepresentationVector(
            skill_id="step_a",
            layer_vectors={0: torch.ones(dim) * 10.0},  # large steering value
            metadata={"name": "Step A"},
        )
        proc_lib.add_process("my_workflow", [process_step])

        router.process_library = proc_lib
        router.active_process_id = "my_workflow"
        router.active_process_step = 0
        router.register_hooks()

        with torch.no_grad():
            process_logits = self.model(token_ids).logits

        self.assertFalse(torch.allclose(baseline_logits, process_logits))
        router.unregister_hooks()


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()