Source code for unilab.base.curriculum
"""Curriculum learning for adaptive difficulty adjustment."""
from __future__ import annotations
from typing import Any
import numpy as np
[docs]
class EpisodeLengthTracker:
"""Track moving average of episode length."""
[docs]
def __init__(self, num_envs: int, window_size: int = 1000):
self.num_envs = num_envs
self.window_size = max(1, int(window_size * num_envs / 4096))
self.average_length = 0.0
[docs]
def update(self, episode_lengths: np.ndarray) -> None:
"""Update average with new episode lengths."""
if len(episode_lengths) == 0:
return
current_avg = float(np.mean(episode_lengths))
weight = min(len(episode_lengths) / self.window_size, 1.0)
self.average_length = self.average_length * (1 - weight) + current_avg * weight
[docs]
class PenaltyCurriculum:
"""Adaptive penalty scaling based on episode length."""
[docs]
def __init__(
self,
env: Any,
enabled: bool = True,
initial_scale: float = 0.5,
min_scale: float = 0.5,
max_scale: float = 1.0,
level_down_threshold: float = 150.0,
level_up_threshold: float = 750.0,
degree: float = 0.001,
):
self.env = env
self.enabled = enabled
self.current_scale = initial_scale
self.min_scale = min_scale
self.max_scale = max_scale
self.level_down_threshold = level_down_threshold
self.level_up_threshold = level_up_threshold
self.degree = degree
# Store original penalty weights
self.penalty_names: list[str] = []
self.original_weights: dict[str, float] = {}
if enabled:
self._identify_penalties()
self._apply_initial_scale()
def _identify_penalties(self) -> None:
"""Identify penalty rewards (negative scales)."""
for name, scale in self.env.cfg.reward_config.scales.items():
if scale < 0:
self.penalty_names.append(name)
self.original_weights[name] = scale
def _apply_initial_scale(self) -> None:
"""Apply initial penalty scaling."""
for name in self.penalty_names:
self.env.cfg.reward_config.scales[name] = (
self.original_weights[name] * self.current_scale
)
[docs]
def update(self, average_episode_length: float) -> None:
"""Update penalty scale based on average episode length."""
if not self.enabled:
return
# Adjust scale
if average_episode_length < self.level_down_threshold:
self.current_scale *= 1.0 - self.degree
elif average_episode_length > self.level_up_threshold:
self.current_scale *= 1.0 + self.degree
# Clamp
self.current_scale = float(np.clip(self.current_scale, self.min_scale, self.max_scale))
# Apply to all penalty rewards
for name in self.penalty_names:
self.env.cfg.reward_config.scales[name] = (
self.original_weights[name] * self.current_scale
)