feat: Add playbook vector extraction and activation steering routing to FCES training pipeline

2026-05-23 08:33:27 +02:00
parent e0d8a32823
commit 4c9a550f8b
4 changed files with 884 additions and 46 deletions
--- a/python/representation_engineering.py
+++ b/python/representation_engineering.py
@@ -41,24 +41,29 @@ class PlaybookParser:
            logger.warning(f"File not found: {path}")
            return None

-        content = path.read_text(encoding="utf-8")
+        content = path.read_text(encoding="utf-8").lstrip()

        # Parse frontmatter if present
        name = path.parent.name if path.parent else "unknown"
        description = ""

-        frontmatter_match = re.match(r"^---\s*\n(.*?)\n---\s*\n", content, re.DOTALL)
+        frontmatter_match = re.match(r"^---\r?\n(.*?)\r?\n---\r?\n", content, re.DOTALL)
        if frontmatter_match:
            fm_text = frontmatter_match.group(1)
            name_match = re.search(r"^name:\s*(.*?)$", fm_text, re.MULTILINE)
            if name_match:
                name = name_match.group(1).strip()
-            desc_match = re.search(r"^description:\s*(.*?)$", fm_text, re.MULTILINE)
+            desc_match = re.search(
+                r"^description:\s*(.*?)(?=\r?\n\w+:|\Z)",
+                fm_text,
+                re.DOTALL | re.MULTILINE,
+            )
            if desc_match:
                description = desc_match.group(1).strip()
                # Remove multiline YAML indicators if any
                description = re.sub(r"^>-?\s*", "", description)
-                description = re.sub(r"\n\s+", " ", description)
+                description = re.sub(r"^\|\s*", "", description)
+                description = re.sub(r"\r?\n\s*", " ", description).strip()

        # Parse headers if no name/description in frontmatter
        if name == "unknown" or not description:
@@ -247,7 +252,8 @@ class RepresentationVectorExtractor:

        # Run forward pass for win prompts
        for prompt in win_prompts:
-            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+            raw_inputs = self.tokenizer(prompt, return_tensors="pt")
+            inputs = {k: v.to(self.device) for k, v in raw_inputs.items()}
            with torch.no_grad():
                self.model(**inputs)

@@ -264,7 +270,8 @@ class RepresentationVectorExtractor:

        # Run forward pass for lose prompts
        for prompt in lose_prompts:
-            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
+            raw_inputs = self.tokenizer(prompt, return_tensors="pt")
+            inputs = {k: v.to(self.device) for k, v in raw_inputs.items()}
            with torch.no_grad():
                self.model(**inputs)