feat: Add playbook vector extraction and activation steering routing to FCES training pipeline

This commit is contained in:
AI-anonymous
2026-05-23 08:33:27 +02:00
parent e0d8a32823
commit 4c9a550f8b
4 changed files with 884 additions and 46 deletions

View File

@@ -41,24 +41,29 @@ class PlaybookParser:
logger.warning(f"File not found: {path}")
return None
content = path.read_text(encoding="utf-8")
content = path.read_text(encoding="utf-8").lstrip()
# Parse frontmatter if present
name = path.parent.name if path.parent else "unknown"
description = ""
frontmatter_match = re.match(r"^---\s*\n(.*?)\n---\s*\n", content, re.DOTALL)
frontmatter_match = re.match(r"^---\r?\n(.*?)\r?\n---\r?\n", content, re.DOTALL)
if frontmatter_match:
fm_text = frontmatter_match.group(1)
name_match = re.search(r"^name:\s*(.*?)$", fm_text, re.MULTILINE)
if name_match:
name = name_match.group(1).strip()
desc_match = re.search(r"^description:\s*(.*?)$", fm_text, re.MULTILINE)
desc_match = re.search(
r"^description:\s*(.*?)(?=\r?\n\w+:|\Z)",
fm_text,
re.DOTALL | re.MULTILINE,
)
if desc_match:
description = desc_match.group(1).strip()
# Remove multiline YAML indicators if any
description = re.sub(r"^>-?\s*", "", description)
description = re.sub(r"\n\s+", " ", description)
description = re.sub(r"^\|\s*", "", description)
description = re.sub(r"\r?\n\s*", " ", description).strip()
# Parse headers if no name/description in frontmatter
if name == "unknown" or not description:
@@ -247,7 +252,8 @@ class RepresentationVectorExtractor:
# Run forward pass for win prompts
for prompt in win_prompts:
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
raw_inputs = self.tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(self.device) for k, v in raw_inputs.items()}
with torch.no_grad():
self.model(**inputs)
@@ -264,7 +270,8 @@ class RepresentationVectorExtractor:
# Run forward pass for lose prompts
for prompt in lose_prompts:
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
raw_inputs = self.tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(self.device) for k, v in raw_inputs.items()}
with torch.no_grad():
self.model(**inputs)