Source code for unilab.base.curriculum

"""Curriculum learning for adaptive difficulty adjustment."""

from __future__ import annotations

from typing import Any

import numpy as np


class EpisodeLengthTracker:
    """Keep a batch-weighted running mean of episode lengths.

    Attributes:
        num_envs: Number of environments, stored as given.
        window_size: Effective averaging window (always at least 1).
        average_length: Current running mean; starts at ``0.0``.
    """

    def __init__(self, num_envs: int, window_size: int = 1000):
        """Create a tracker.

        Args:
            num_envs: Number of environments feeding the tracker.
            window_size: Nominal window. It is rescaled by
                ``num_envs / 4096``, truncated to an int and floored at 1.
        """
        scaled_window = int(window_size * num_envs / 4096)
        self.num_envs = num_envs
        # Never let the window collapse to zero; it is used as a divisor.
        self.window_size = scaled_window if scaled_window > 1 else 1
        self.average_length = 0.0

    def update(self, episode_lengths: np.ndarray) -> None:
        """Blend the mean of a new batch of episode lengths into the average.

        The batch is weighted by ``len(batch) / window_size``, capped at 1.0,
        so a batch at least as large as the window replaces the previous
        average outright. An empty batch leaves the average untouched.

        Args:
            episode_lengths: Lengths of the episodes to fold in.
        """
        batch_size = len(episode_lengths)
        if batch_size == 0:
            return

        blend = min(batch_size / self.window_size, 1.0)
        batch_mean = float(np.mean(episode_lengths))
        self.average_length = self.average_length * (1 - blend) + batch_mean * blend
class PenaltyCurriculum:
    """Adaptive penalty scaling based on episode length.

    Every entry of ``env.cfg.reward_config.scales`` whose value is negative
    is treated as a penalty. Each penalty is rewritten in place as
    ``original_weight * current_scale``; ``current_scale`` shrinks when the
    average episode length is below ``level_down_threshold`` and grows when
    it is above ``level_up_threshold``, always staying inside
    ``[min_scale, max_scale]``.
    """

    def __init__(
        self,
        env: Any,
        enabled: bool = True,
        initial_scale: float = 0.5,
        min_scale: float = 0.5,
        max_scale: float = 1.0,
        level_down_threshold: float = 150.0,
        level_up_threshold: float = 750.0,
        degree: float = 0.001,
    ):
        """Set up the curriculum and, if enabled, apply the initial scale.

        Args:
            env: Object exposing a mutable mapping at
                ``env.cfg.reward_config.scales`` (reward name -> weight).
            enabled: When False the env's scales are never read or modified.
            initial_scale: Starting multiplier; clamped to
                ``[min_scale, max_scale]``.
            min_scale: Lower bound of the multiplier.
            max_scale: Upper bound of the multiplier.
            level_down_threshold: Average episode length below which the
                multiplier is decreased.
            level_up_threshold: Average episode length above which the
                multiplier is increased.
            degree: Relative step; each adjustment multiplies the scale by
                ``1 - degree`` or ``1 + degree``.
        """
        self.env = env
        self.enabled = enabled
        self.min_scale = min_scale
        self.max_scale = max_scale
        # Clamp up front so the scale applied at construction already
        # respects the bounds, instead of only after the first update().
        self.current_scale = float(np.clip(initial_scale, min_scale, max_scale))
        self.level_down_threshold = level_down_threshold
        self.level_up_threshold = level_up_threshold
        self.degree = degree

        # Store original penalty weights
        self.penalty_names: list[str] = []
        self.original_weights: dict[str, float] = {}

        if enabled:
            self._identify_penalties()
            self._apply_initial_scale()

    def _identify_penalties(self) -> None:
        """Identify penalty rewards (negative scales) and record their weights."""
        for name, scale in self.env.cfg.reward_config.scales.items():
            if scale < 0:
                self.penalty_names.append(name)
                self.original_weights[name] = scale

    def _apply_scale(self) -> None:
        """Write ``original_weight * current_scale`` for every penalty."""
        scales = self.env.cfg.reward_config.scales
        for name in self.penalty_names:
            scales[name] = self.original_weights[name] * self.current_scale

    def _apply_initial_scale(self) -> None:
        """Apply initial penalty scaling."""
        self._apply_scale()

    def update(self, average_episode_length: float) -> None:
        """Update penalty scale based on average episode length.

        Does nothing when the curriculum is disabled. Otherwise the scale is
        adjusted by one multiplicative step (or left alone when the average
        lies between the two thresholds), clamped, and written back to every
        penalty reward.

        Args:
            average_episode_length: Current average episode length.
        """
        if not self.enabled:
            return

        # Adjust scale
        if average_episode_length < self.level_down_threshold:
            self.current_scale *= 1.0 - self.degree
        elif average_episode_length > self.level_up_threshold:
            self.current_scale *= 1.0 + self.degree

        # Clamp
        self.current_scale = float(
            np.clip(self.current_scale, self.min_scale, self.max_scale)
        )

        # Apply to all penalty rewards
        self._apply_scale()