Add telemetry_and_inference example and finalize previous DLL/ABI fixes
This commit is contained in:
@@ -129,6 +129,9 @@ if(FCES_BUILD_EXAMPLES)
|
||||
|
||||
add_executable(pytorch_integration examples/pytorch_integration.cpp)
|
||||
target_link_libraries(pytorch_integration PRIVATE fces)
|
||||
|
||||
add_executable(telemetry_and_inference examples/telemetry_and_inference.cpp)
|
||||
target_link_libraries(telemetry_and_inference PRIVATE fces)
|
||||
endif()
|
||||
|
||||
# ============================================================================
|
||||
|
||||
114
examples/telemetry_and_inference.cpp
Normal file
114
examples/telemetry_and_inference.cpp
Normal file
@@ -0,0 +1,114 @@
|
||||
/**
|
||||
* @file telemetry_and_inference.cpp
|
||||
* @brief Example showcasing telemetry instrumentation and model inference.
|
||||
*/
|
||||
|
||||
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <torch/torch.h>

#include "fces/optimizer.hpp"
#include "fces/telemetry.hpp"
|
||||
|
||||
// Define a simple neural network for nonlinear regression: y = x^2
|
||||
struct RegressionNet : torch::nn::Module {
|
||||
torch::nn::Linear fc1{nullptr}, fc2{nullptr};
|
||||
|
||||
RegressionNet() {
|
||||
fc1 = register_module("fc1", torch::nn::Linear(1, 16));
|
||||
fc2 = register_module("fc2", torch::nn::Linear(16, 1));
|
||||
}
|
||||
|
||||
torch::Tensor forward(torch::Tensor x) {
|
||||
x = torch::tanh(fc1->forward(x));
|
||||
return fc2->forward(x);
|
||||
}
|
||||
};
|
||||
|
||||
int main() {
|
||||
fces::Telemetry::get().info("app_start", "Telemetry and Inference demo initialized.");
|
||||
|
||||
// 1. Create Model and Data
|
||||
auto model = std::make_shared<RegressionNet>();
|
||||
|
||||
// Generate training data: x in [-2, 2], y = x^2 + noise
|
||||
auto x_train = torch::linspace(-2.0, 2.0, 100).unsqueeze(1);
|
||||
auto y_train = x_train.pow(2) + 0.1 * torch::randn({100, 1});
|
||||
|
||||
// 2. Configure Optimizer
|
||||
std::vector<torch::Tensor> params;
|
||||
for (auto& p : model->parameters()) {
|
||||
params.push_back(p);
|
||||
}
|
||||
|
||||
fces::FCESOptimizer optimizer(
|
||||
params,
|
||||
fces::FCESConfig{}
|
||||
.set_lr(2e-3f)
|
||||
.set_population_size(150)
|
||||
.set_total_steps(100)
|
||||
);
|
||||
|
||||
fces::Telemetry::get().info("training_start", "Beginning neural net optimization with FCES.");
|
||||
|
||||
auto start_train = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// 3. Optimization Loop
|
||||
for (int epoch = 0; epoch <= 100; ++epoch) {
|
||||
optimizer.zero_grad();
|
||||
auto pred = model->forward(x_train);
|
||||
auto loss = torch::mse_loss(pred, y_train);
|
||||
loss.backward();
|
||||
optimizer.step();
|
||||
optimizer.update_fitness(loss.item<float>());
|
||||
|
||||
if (epoch % 20 == 0) {
|
||||
fces::Telemetry::get().info("epoch_checkpoint",
|
||||
"Epoch " + std::to_string(epoch) + " | Loss: " + std::to_string(loss.item<float>()));
|
||||
}
|
||||
}
|
||||
|
||||
auto end_train = std::chrono::high_resolution_clock::now();
|
||||
double train_duration = std::chrono::duration<double, std::milli>(end_train - start_train).count();
|
||||
|
||||
fces::Telemetry::get().info("training_complete",
|
||||
"Duration: " + std::to_string(train_duration) + " ms");
|
||||
|
||||
// 4. Inference Phase
|
||||
fces::Telemetry::get().info("inference_phase_start", "Evaluating model on new test inputs.");
|
||||
|
||||
// Generate test inputs
|
||||
auto x_test = torch::tensor({-1.5f, -0.5f, 0.0f, 0.5f, 1.5f}).unsqueeze(1);
|
||||
auto y_expected = x_test.pow(2);
|
||||
|
||||
// Switch model to evaluation mode
|
||||
model->eval();
|
||||
|
||||
// Run inference and measure latency
|
||||
auto start_inf = std::chrono::high_resolution_clock::now();
|
||||
torch::Tensor y_pred;
|
||||
{
|
||||
torch::NoGradGuard no_grad;
|
||||
y_pred = model->forward(x_test);
|
||||
}
|
||||
auto end_inf = std::chrono::high_resolution_clock::now();
|
||||
double inf_duration = std::chrono::duration<double, std::milli>(end_inf - start_inf).count();
|
||||
|
||||
// Log telemetry for inference performance
|
||||
fces::Telemetry::get().info("inference_perf",
|
||||
"Inputs: " + std::to_string(x_test.size(0)) + " | Latency: " + std::to_string(inf_duration) + " ms");
|
||||
|
||||
// Print predictions and expected values side-by-side
|
||||
std::cout << "\n================ INFERENCE RESULTS ================" << std::endl;
|
||||
std::cout << "Input (x) | Predicted (y_pred) | Expected (y_expected)" << std::endl;
|
||||
std::cout << "----------------------------------------------------" << std::endl;
|
||||
for (int i = 0; i < x_test.size(0); ++i) {
|
||||
float x_val = x_test[i][0].item<float>();
|
||||
float pred_val = y_pred[i][0].item<float>();
|
||||
float exp_val = y_expected[i][0].item<float>();
|
||||
std::printf(" %7.2f | %7.4f | %7.4f\n", x_val, pred_val, exp_val);
|
||||
}
|
||||
std::cout << "====================================================\n" << std::endl;
|
||||
|
||||
fces::Telemetry::get().info("app_finish", "Exiting demo successfully.");
|
||||
return 0;
|
||||
}
|
||||
@@ -9,6 +9,7 @@
|
||||
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
namespace fces {
|
||||
|
||||
|
||||
@@ -32,6 +32,15 @@ namespace fces {
|
||||
* optimizer.step();
|
||||
* optimizer.update_fitness(loss.item<float>());
|
||||
*/
|
||||
// Optimizer options type used by FCESOptimizer. It stores a single value, the
// learning rate, and exposes it through the get_lr()/set_lr() overrides.
// NOTE(review): the Doxygen block that closes just above this struct shows
// FCESOptimizer usage, so it looks like it was written for the class that
// follows rather than for this struct — confirm and relocate if so.
struct FCESOptimizerOptions : public torch::optim::OptimizerCloneableOptions<FCESOptimizerOptions> {
    // Current learning rate.
    double lr_;

    // lr: initial learning rate (0.01 when omitted).
    explicit FCESOptimizerOptions(double lr = 0.01) : lr_{lr} {}

    // Returns the stored learning rate.
    double get_lr() const override { return lr_; }

    // Replaces the stored learning rate.
    void set_lr(const double lr) override { lr_ = lr; }
};
|
||||
|
||||
class FCESOptimizer : public torch::optim::Optimizer {
|
||||
public:
|
||||
explicit FCESOptimizer(
|
||||
|
||||
@@ -13,6 +13,8 @@ class OscillationDetector {
|
||||
public:
|
||||
static constexpr int WINDOW_SIZE = 64;
|
||||
static constexpr float POWER_THRESHOLD = 0.5f;
|
||||
static constexpr int MIN_PERIOD = 4;
|
||||
static constexpr int MAX_PERIOD = 16;
|
||||
|
||||
void update(float loss);
|
||||
bool detect() const;
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
namespace fces {
|
||||
|
||||
// Static members
|
||||
std::atomic<uint64_t> FuzzyController::next_id_{0};
|
||||
std::atomic<uint64_t> FuzzyController::next_id_{1};
|
||||
thread_local std::mt19937 FuzzyController::rng_{std::random_device{}()};
|
||||
|
||||
// ---------------------------------------------------------------
|
||||
@@ -30,7 +30,7 @@ Genome Genome::clone() const {
|
||||
// ---------------------------------------------------------------
|
||||
|
||||
FuzzyController::FuzzyController()
|
||||
: id(next_id_++), origin("genesis") {
|
||||
: id(next_id_++), origin("random") {
|
||||
genome.randomize(rng_);
|
||||
// Bias output toward acceleration (V2.1 insight)
|
||||
// Set output biases (last GENOME_OUTPUT_DIM elements) to +2.0, -1.0, 0.0 with noise
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
namespace fces {
|
||||
|
||||
|
||||
@@ -1,16 +1,84 @@
|
||||
#include "fces/optimizer.hpp"
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
namespace fces {
|
||||
|
||||
namespace {
|
||||
|
||||
// Maps a parameter tensor to an integer layer-type code using only its shape.
//
// Codes returned (labels are the heuristic categories used by this file):
//   0  Embedding        - 2-D with either dimension above 10000
//   1  Attention QKV    - 2-D where one dimension is exactly 3x the other
//   2  Attention Proj   - any other non-square 2-D tensor
//   3  MLP/FFN          - square 2-D tensor
//   4  LayerNorm        - 1-D with fewer than 128 elements
//   5  Other / bias     - longer 1-D tensors and every other rank
int classify_layer_by_shape(const torch::Tensor& p) {
    const auto shape = p.sizes();

    switch (shape.size()) {
        case 1:
            return shape[0] < 128 ? 4 : 5;

        case 2: {
            const int64_t rows = shape[0];
            const int64_t cols = shape[1];

            // Checks are ordered: the size test wins over the ratio tests.
            if (rows > 10000 || cols > 10000) {
                return 0;
            }
            if (rows * 3 == cols || rows == cols * 3) {
                return 1;
            }
            return rows == cols ? 3 : 2;
        }

        default:
            return 5;
    }
}
|
||||
|
||||
// Limits the size of a proposed update relative to the parameter it targets.
//
// p                  parameter the update will be applied to
// update             proposed update; scaled in place (mul_) when it is clipped
// trust_region_clip  largest allowed ratio of update norm to parameter norm
//
// Returns the (possibly rescaled) update, or an all-zero tensor of the same
// shape whenever the update contains NaN/Inf or its norm is not finite.
// Clipping is skipped when the parameter norm is not above 1e-6.
torch::Tensor apply_trust_clipping(const torch::Tensor& p, torch::Tensor update, float trust_region_clip) {
    const auto contains_non_finite = [](const torch::Tensor& t) {
        return torch::isnan(t).any().item<bool>() || torch::isinf(t).any().item<bool>();
    };

    if (contains_non_finite(update)) {
        return torch::zeros_like(update);
    }

    const float param_norm = p.norm().item<float>();
    if (param_norm > 1e-6f) {
        const float update_norm = update.norm().item<float>();
        if (!std::isfinite(update_norm)) {
            return torch::zeros_like(update);
        }

        const float limit = trust_region_clip * param_norm;
        if (update_norm > limit) {
            // The small epsilon keeps the ratio defined for tiny norms.
            const float scale = limit / (update_norm + 1e-8f);
            update.mul_(scale);
        }
    }

    // Re-validate after scaling before handing the tensor back.
    if (contains_non_finite(update)) {
        return torch::zeros_like(update);
    }
    return update;
}
|
||||
|
||||
// Computes the per-parameter reward term used in parasitic mode.
//
// p                  parameter whose gradient is inspected
// mult               multiplier applied to this parameter's update
// grad_norm_tracker  running statistics used to z-score the gradient magnitude
//
// Returns 0 when the parameter has no gradient; otherwise the z-score of the
// mean absolute gradient multiplied by (mult - 1).
float calculate_parasitic_reward(const torch::Tensor& p, float mult, const RunningStats& grad_norm_tracker) {
    const auto& gradient = p.grad();
    if (!gradient.defined()) {
        return 0.0f;
    }

    const float mean_abs_grad = gradient.abs().mean().item<float>();
    const float grad_z = grad_norm_tracker.z_score(mean_abs_grad);
    return (mult - 1.0f) * grad_z;
}
|
||||
|
||||
std::unique_ptr<torch::optim::OptimizerOptions> make_optimizer_options(double lr) {
|
||||
return std::make_unique<FCESOptimizerOptions>(lr);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
FCESOptimizer::FCESOptimizer(
|
||||
std::vector<torch::Tensor> params,
|
||||
FCESConfig config
|
||||
)
|
||||
: torch::optim::Optimizer(
|
||||
{torch::optim::OptimizerParamGroup(std::move(params))},
|
||||
std::make_unique<torch::optim::OptimizerOptions>(config.lr)
|
||||
make_optimizer_options(config.lr)
|
||||
),
|
||||
config_(std::move(config)),
|
||||
population_(config_.population_size, 10000,
|
||||
@@ -24,6 +92,8 @@ FCESOptimizer::FCESOptimizer(
|
||||
population_, 50, config_.auto_population, config_.direct_construction
|
||||
);
|
||||
|
||||
spectral_sensor_ = std::make_unique<SpectralSensor>();
|
||||
|
||||
// Initial RAM backup
|
||||
backup_to_ram();
|
||||
|
||||
@@ -37,24 +107,107 @@ torch::Tensor FCESOptimizer::step(LossClosure closure) {
|
||||
|
||||
torch::Tensor loss = {};
|
||||
if (closure) {
|
||||
torch::AutoGradMode grad_mode(true);
|
||||
loss = closure();
|
||||
}
|
||||
|
||||
// TODO: Port full step logic from Python:
|
||||
// 1. _gather_stats()
|
||||
// 2. get_active_controller()
|
||||
// 3. _get_actions()
|
||||
// 4. _apply_parameter_updates()
|
||||
// 5. Evolution & maintenance
|
||||
// 1. Gather Statistics
|
||||
gather_stats();
|
||||
|
||||
// Minimal stub: apply sign-SGD update
|
||||
// 2. Strategy: Population Selection & Dynamics
|
||||
auto& active_controller = evolution_manager_->get_active_controller();
|
||||
|
||||
// 3. Decision: Neural Decisions from Controllers
|
||||
float current_loss_val = (loss.defined()) ? loss.item<float>() : last_step_loss_;
|
||||
|
||||
// Emergency Brake - NaN/Inf Detection
|
||||
if (std::isnan(current_loss_val) || !std::isfinite(current_loss_val)) {
|
||||
Telemetry::get().error("emergency_brake_nan", "NaN/Inf loss detected in step " + std::to_string(step_counter_));
|
||||
handle_rollback();
|
||||
return loss;
|
||||
}
|
||||
|
||||
float loss_velocity = fitness_engine_.calculate_loss_signal(current_loss_val, ema_loss_, config_.signal_mode);
|
||||
last_loss_velocity_ = loss_velocity;
|
||||
|
||||
float progress = std::min(1.0f, static_cast<float>(step_counter_) / std::max(1, config_.total_steps));
|
||||
float grad_cv = grad_norm_tracker_.get_std() / (grad_norm_tracker_.get_mean() + 1e-8f);
|
||||
|
||||
float csr_factor = 1.0f;
|
||||
if (config_.csr_enabled) {
|
||||
if (step_counter_ < config_.csr_warmup_steps) {
|
||||
csr_factor = 0.0f;
|
||||
} else {
|
||||
float steps_since_warmup = static_cast<float>(step_counter_ - config_.csr_warmup_steps);
|
||||
csr_factor = std::min(1.0f, steps_since_warmup / std::max(1.0f, static_cast<float>(config_.csr_ramp_steps)));
|
||||
}
|
||||
}
|
||||
|
||||
// Update spectral sensing rank
|
||||
float spectral_alpha = 0.0f;
|
||||
if (config_.grokking_coefficient > 0.0f && spectral_sensor_) {
|
||||
if (step_counter_ % config_.spectral_frequency == 0 || last_spectral_rank_ == 0.0f) {
|
||||
int param_idx = 0;
|
||||
for (auto& group : param_groups()) {
|
||||
for (auto& p : group.params()) {
|
||||
if (!p.grad().defined()) continue;
|
||||
|
||||
auto update = torch::sign(p.grad());
|
||||
p.data().add_(update, -config_.lr);
|
||||
if (p.dim() >= 2) {
|
||||
std::string name = "layer_" + std::to_string(param_idx);
|
||||
spectral_sensor_->track_layer(name, p);
|
||||
}
|
||||
param_idx++;
|
||||
}
|
||||
}
|
||||
last_spectral_rank_ = spectral_sensor_->get_global_rank();
|
||||
}
|
||||
spectral_alpha = last_spectral_rank_;
|
||||
}
|
||||
|
||||
float effective_alpha = spectral_alpha * csr_factor;
|
||||
float kzm_damping = fitness_engine_.compute_kzm_damping(effective_alpha);
|
||||
float stagnation_intensity = std::min(1.0f, static_cast<float>(stagnation_counter_) / 500.0f);
|
||||
float log_spectral_alpha = std::log(effective_alpha + 1e-6f);
|
||||
|
||||
// Call decide_update
|
||||
auto actions = active_controller.decide_update(
|
||||
layer_stats_,
|
||||
loss_velocity,
|
||||
progress,
|
||||
rollback_ema_,
|
||||
grad_cv,
|
||||
log_spectral_alpha,
|
||||
stagnation_intensity,
|
||||
kzm_damping,
|
||||
loss_velocity
|
||||
);
|
||||
|
||||
// Bandit-style Early Stopping
|
||||
if (step_counter_ % 5 == 0 && loss_velocity > 0.05f) {
|
||||
Telemetry::get().warning("early_stopping_poor_controller",
|
||||
"controller_id=" + std::to_string(active_controller.id) + " velocity=" + std::to_string(loss_velocity));
|
||||
evolution_manager_->steps_active = evolution_manager_->selection_interval;
|
||||
}
|
||||
|
||||
if (torch::isnan(actions).any().item<bool>()) {
|
||||
Telemetry::get().error("controller_nan_actions", "NaN actions returned by controller ID " + std::to_string(active_controller.id));
|
||||
population_.kill(active_controller);
|
||||
auto& new_controller = evolution_manager_->get_active_controller();
|
||||
actions = torch::zeros_like(actions);
|
||||
for (int i = 0; i < actions.size(0); ++i) {
|
||||
actions[i][0] = 0.5f; // log_mult default
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Action: Apply Updates
|
||||
apply_parameter_updates(actions);
|
||||
|
||||
// 5. Evolution & Maintenance
|
||||
if (current_loss_val > 0.0f) {
|
||||
evolution_manager_->update_population_dynamics(
|
||||
loss_velocity,
|
||||
ema_loss_,
|
||||
step_counter_,
|
||||
config_.total_steps
|
||||
);
|
||||
}
|
||||
|
||||
if (step_counter_ % 50 == 0) {
|
||||
@@ -65,21 +218,77 @@ torch::Tensor FCESOptimizer::step(LossClosure closure) {
|
||||
}
|
||||
|
||||
void FCESOptimizer::update_fitness(float loss) {
|
||||
// EMA loss tracking
|
||||
if (step_counter_ == 1) {
|
||||
ema_loss_ = loss;
|
||||
} else {
|
||||
ema_loss_ = 0.95f * ema_loss_ + 0.05f * loss;
|
||||
// 1. Divergence Safety
|
||||
bool is_nan = std::isnan(loss) || !std::isfinite(loss);
|
||||
bool is_spike = (step_counter_ > 1) && (ema_loss_ > 0.0f) && (loss > config_.rollback_threshold * ema_loss_) && (ema_loss_ > 0.1f);
|
||||
if (is_nan || is_spike) {
|
||||
Telemetry::get().warning("divergence_detected", "loss=" + std::to_string(loss) + " ema=" + std::to_string(ema_loss_));
|
||||
handle_rollback();
|
||||
return;
|
||||
}
|
||||
last_step_loss_ = loss;
|
||||
|
||||
// Update best loss window
|
||||
if (loss < best_loss_window_) {
|
||||
if (step_counter_ == 1 || ema_loss_ == 0.0f) {
|
||||
ema_loss_ = loss;
|
||||
last_step_loss_ = loss;
|
||||
last_sparsity_ = calculate_sparsity();
|
||||
return;
|
||||
}
|
||||
|
||||
// 2. Metric Calculation
|
||||
float train_adv = ema_loss_ - loss;
|
||||
float val_adv = 0.0f;
|
||||
float current_sparsity = calculate_sparsity();
|
||||
float sparsity_delta = current_sparsity - last_sparsity_;
|
||||
float consistency_gap = std::max(0.0f, train_adv - val_adv);
|
||||
|
||||
float grad_std = grad_norm_tracker_.get_std();
|
||||
float grad_mean = grad_norm_tracker_.get_mean();
|
||||
float grad_cv = grad_std / (grad_mean + 1e-8f);
|
||||
|
||||
float raw_rank = (spectral_sensor_) ? spectral_sensor_->get_global_rank() : 0.0f;
|
||||
float csr_factor = 1.0f;
|
||||
if (config_.csr_enabled) {
|
||||
if (step_counter_ < config_.csr_warmup_steps) {
|
||||
csr_factor = 0.0f;
|
||||
} else {
|
||||
float steps_since_warmup = static_cast<float>(step_counter_ - config_.csr_warmup_steps);
|
||||
csr_factor = std::min(1.0f, steps_since_warmup / std::max(1.0f, static_cast<float>(config_.csr_ramp_steps)));
|
||||
}
|
||||
}
|
||||
float effective_rank = config_.csr_enabled ? raw_rank * csr_factor : raw_rank;
|
||||
|
||||
FitnessMetrics metrics;
|
||||
metrics.training_advantage = train_adv;
|
||||
metrics.validation_advantage = val_adv;
|
||||
metrics.grad_cv = grad_cv;
|
||||
metrics.sparsity_delta = sparsity_delta;
|
||||
metrics.consistency_gap = consistency_gap;
|
||||
metrics.stable_rank = effective_rank;
|
||||
|
||||
// 3. Fuzzy Evaluation
|
||||
float final_fitness = fitness_evaluator_.evaluate(metrics);
|
||||
|
||||
// 4. State Update
|
||||
ema_loss_ = 0.95f * ema_loss_ + 0.05f * loss;
|
||||
last_step_loss_ = loss;
|
||||
last_sparsity_ = current_sparsity;
|
||||
|
||||
// Stagnation logic
|
||||
if (loss < best_loss_window_ * 0.995f) {
|
||||
best_loss_window_ = loss;
|
||||
stagnation_counter_ = 0;
|
||||
} else {
|
||||
stagnation_counter_++;
|
||||
}
|
||||
|
||||
// 5. Apply to Population
|
||||
auto& active_controller = evolution_manager_->get_active_controller();
|
||||
population_.update_controller_fitness(active_controller, final_fitness);
|
||||
|
||||
Telemetry::get().info("fitness_calculated",
|
||||
"loss=" + std::to_string(loss) +
|
||||
" ema_loss=" + std::to_string(ema_loss_) +
|
||||
" fitness=" + std::to_string(final_fitness));
|
||||
}
|
||||
|
||||
void FCESOptimizer::backup_to_ram() {
|
||||
@@ -115,17 +324,152 @@ float FCESOptimizer::calculate_sparsity() const {
|
||||
}
|
||||
|
||||
void FCESOptimizer::gather_stats() {
|
||||
// TODO: Port _gather_stats from Python
|
||||
layer_stats_.clear();
|
||||
param_group_mapping_.clear();
|
||||
|
||||
int param_idx = 0;
|
||||
bool has_nan_or_inf = false;
|
||||
float max_grad_norm = 0.0f;
|
||||
|
||||
for (auto& group : param_groups()) {
|
||||
for (auto& p : group.params()) {
|
||||
if (!p.grad().defined()) {
|
||||
param_group_mapping_.push_back(-1);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto grad = p.grad();
|
||||
if (torch::isnan(grad).any().item<bool>() || torch::isinf(grad).any().item<bool>()) {
|
||||
has_nan_or_inf = true;
|
||||
}
|
||||
|
||||
float grad_norm = grad.norm().item<float>();
|
||||
if (std::isnan(grad_norm) || !std::isfinite(grad_norm)) {
|
||||
has_nan_or_inf = true;
|
||||
grad_norm = 0.0f;
|
||||
}
|
||||
|
||||
if (grad_norm > max_grad_norm) {
|
||||
max_grad_norm = grad_norm;
|
||||
}
|
||||
|
||||
int64_t total_elements = grad.numel();
|
||||
int64_t zeros = (grad.abs() < 1e-5f).sum().item<int64_t>();
|
||||
float sparsity = (total_elements > 0) ? static_cast<float>(zeros) / total_elements : 0.0f;
|
||||
|
||||
int layer_type = classify_layer_by_shape(p);
|
||||
int group_idx = static_cast<int>(layer_stats_.size());
|
||||
layer_stats_.push_back({grad_norm, sparsity, static_cast<float>(layer_type)});
|
||||
param_group_mapping_.push_back(group_idx);
|
||||
|
||||
if (spectral_sensor_ && p.dim() >= 2) {
|
||||
std::string name = "layer_" + std::to_string(param_idx);
|
||||
spectral_sensor_->track_layer(name, p);
|
||||
}
|
||||
|
||||
param_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
if (has_nan_or_inf) {
|
||||
Telemetry::get().error("poisoned_gradients_detected",
|
||||
"NaN/Inf detected in gradients during step " + std::to_string(step_counter_));
|
||||
handle_rollback();
|
||||
return;
|
||||
}
|
||||
|
||||
if (step_counter_ == 1 && max_grad_norm > 1.0f) {
|
||||
float safe_lr = 0.01f / (max_grad_norm + 1e-8f);
|
||||
for (auto& group : param_groups()) {
|
||||
if (group.options().get_lr() > safe_lr) {
|
||||
Telemetry::get().info("auto_calibration_throttled_lr",
|
||||
"old=" + std::to_string(group.options().get_lr()) + " new=" + std::to_string(safe_lr));
|
||||
group.options().set_lr(safe_lr);
|
||||
config_.lr = safe_lr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!layer_stats_.empty()) {
|
||||
float first_grad_norm = layer_stats_[0][0];
|
||||
grad_norm_tracker_.update(first_grad_norm);
|
||||
}
|
||||
}
|
||||
|
||||
void FCESOptimizer::apply_parameter_updates(const torch::Tensor& /*actions*/) {
|
||||
// TODO: Port _apply_parameter_updates from Python
|
||||
void FCESOptimizer::apply_parameter_updates(const torch::Tensor& actions) {
|
||||
int param_idx = 0;
|
||||
float parasitic_accum = 0.0f;
|
||||
int count_updated = 0;
|
||||
|
||||
auto& active_controller = evolution_manager_->get_active_controller();
|
||||
|
||||
for (auto& group : param_groups()) {
|
||||
float lr = static_cast<float>(group.options().get_lr());
|
||||
float wd = config_.weight_decay;
|
||||
|
||||
for (auto& p : group.params()) {
|
||||
if (!p.grad().defined()) {
|
||||
param_idx++;
|
||||
continue;
|
||||
}
|
||||
|
||||
int g_idx = param_group_mapping_[param_idx];
|
||||
if (g_idx < 0 || g_idx >= actions.size(0)) {
|
||||
param_idx++;
|
||||
continue;
|
||||
}
|
||||
|
||||
float mult = actions[g_idx][0].item<float>();
|
||||
float sign_gate = actions[g_idx][1].item<float>();
|
||||
float wd_mult = (actions.size(1) > 2) ? actions[g_idx][2].item<float>() : 1.0f;
|
||||
|
||||
bool use_sign = sign_gate > 0.0f;
|
||||
if (config_.ablation_mode == "force_sign") {
|
||||
use_sign = true;
|
||||
} else if (config_.ablation_mode == "force_grad") {
|
||||
use_sign = false;
|
||||
}
|
||||
|
||||
if (wd > 0.0f) {
|
||||
float effective_wd = wd;
|
||||
if (config_.adaptive_wd) {
|
||||
effective_wd *= wd_mult;
|
||||
}
|
||||
p.data().mul_(1.0f - lr * effective_wd);
|
||||
}
|
||||
|
||||
torch::Tensor update_vec = use_sign ? torch::sign(p.grad()) : p.grad();
|
||||
torch::Tensor update = -lr * mult * update_vec;
|
||||
|
||||
update = apply_trust_clipping(p, update, config_.trust_region_clip);
|
||||
p.data().add_(update);
|
||||
|
||||
if (config_.parasitic_mode) {
|
||||
parasitic_accum += calculate_parasitic_reward(p, mult, grad_norm_tracker_);
|
||||
}
|
||||
|
||||
param_idx++;
|
||||
count_updated++;
|
||||
}
|
||||
}
|
||||
|
||||
if (config_.parasitic_mode && count_updated > 0) {
|
||||
float reward = parasitic_accum / static_cast<float>(count_updated);
|
||||
population_.update_controller_fitness(active_controller, reward * 10.0f, false);
|
||||
}
|
||||
}
|
||||
|
||||
// Recovery path used after divergence or poisoned gradients are detected:
// restores the RAM backup, calms the population, and clears the running
// loss/gradient state so later steps start from a clean slate.
void FCESOptimizer::handle_rollback() {
    // Bring back the last backed-up state and settle the population.
    restore_from_ram();
    population_.calm_down();

    // Exponential moving average of rollback events: move 10% toward 1.
    rollback_ema_ = 0.1f + 0.9f * rollback_ema_;

    // Drop loss history, gradient statistics and any pending gradients.
    last_step_loss_ = 0.0f;
    ema_loss_ = 0.0f;
    grad_norm_tracker_.reset();
    zero_grad();

    Telemetry::get().warning("hard_reset_executed", "rollback_sanitization");
}
|
||||
|
||||
} // namespace fces
|
||||
|
||||
@@ -26,10 +26,11 @@ void SpectralSensor::reset() {
|
||||
|
||||
/**
 * Effective rank of a weight matrix: exp of the Shannon entropy of its
 * singular values after normalizing them to sum to 1.
 *
 * @param weight Matrix to analyse; converted to float32 before decomposition.
 * @return The effective rank, or 0 when the singular values sum to zero or to
 *         a non-finite value (e.g. an all-zero matrix), where the normalized
 *         spectrum is undefined.
 */
float SpectralSensor::compute_effective_rank(const torch::Tensor& weight) {
    // Only the singular values are used, so skip computing U and V.
    auto svd_result = torch::svd(weight.to(torch::kFloat32), /*some=*/true, /*compute_uv=*/false);
    auto singular_values = std::get<1>(svd_result);

    // Guard the normalization: dividing by a zero (or NaN/Inf) sum would
    // propagate NaN into the entropy and out to callers.
    const float total = singular_values.sum().item<float>();
    if (!std::isfinite(total) || !(total > 0.0f)) {
        return 0.0f;
    }

    auto s = singular_values.div(total);
    // The small offset keeps log() finite for zero singular values.
    auto log_s = (s + 1e-10f).log();
    const float entropy = -s.mul(log_s).sum().item<float>();
    return std::exp(entropy);
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user