feat: complete phase 5 - Klausuren validation with MoE routing, dynamic dimensional matching, and full mypy type safety

2026-05-22 01:32:02 +02:00
parent 1d358fa5ad
commit 306372bb5b
5 changed files with 1041 additions and 7 deletions
--- a/python/adapter_moe_router.py
+++ b/python/adapter_moe_router.py
@@ -15,11 +15,12 @@ import torch.nn as nn
 from parasitic_qlora import ExpertAdapter


-class LearnableGate(nn.Module):  # type: ignore[misc]
+class LearnableGate(nn.Module):  # type: ignore[misc, unused-ignore]
    """A lightweight learnable MLP gating network for routing."""

    def __init__(self, in_features: int, num_adapters: int) -> None:
        super().__init__()
+        self.in_features = in_features
        self.gate = nn.Sequential(
            nn.Linear(in_features, in_features // 2),
            nn.ReLU(),
@@ -27,9 +28,16 @@ class LearnableGate(nn.Module):  # type: ignore[misc]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # Input: [..., in_features]
+        # Input: [..., in_features] or other dimension to be pooled
        # Output: [..., num_adapters] (unnormalized scores)
-        return self.gate(x)
+        if x.shape[-1] != self.in_features:
+            import torch.nn.functional as F
+
+            orig_shape = x.shape
+            flat_x = x.view(-1, orig_shape[-1]).unsqueeze(1)
+            pooled_x = F.adaptive_avg_pool1d(flat_x, self.in_features)
+            x = pooled_x.squeeze(1).view(*orig_shape[:-1], self.in_features)
+        return self.gate(x)  # type: ignore[no-any-return, unused-ignore]


 class ExpertAdapterRouter:
@@ -157,8 +165,8 @@ class ExpertAdapterRouter:
                if layer_name in adapter.layers:
                    lm = adapter.layers[layer_name]
                    # Ensure tensors are on the correct device
-                    lora_A = lm.lora_A.to(x.device)
-                    lora_B = lm.lora_B.to(x.device)
+                    lora_A = lm.lora_A.to(device=x.device, dtype=x.dtype)
+                    lora_B = lm.lora_B.to(device=x.device, dtype=x.dtype)

                    # Dynamic scaling: gate_weight for this adapter
                    # weights has shape [batch, num_adapters] or [batch, seq_len, num_adapters]