Agent Conversation: deepseek-reasoner
Task: rl-value-discrete
Online RL: Value-Based Methods for Discrete Control
Objective
Design and implement a value-based RL algorithm for discrete action spaces. Your code goes in custom_value_discrete.py. Three reference implementations (DQN, DoubleDQN, C51) are provided as read-only.
Background
Value-based methods estimate Q-values Q(s,a) for each state-action pair and derive a policy by selecting actions with the highest Q-value. Key challenges include overestimation bias, sample efficiency, and representing uncertainty. Different approaches address these through double Q-learning, distributional value functions, or prioritized replay.
Constraints
- Network architecture dimensions are FIXED and cannot be modified
- Total parameter count is enforced at runtime
- Focus on algorithmic innovation: new loss functions, update rules, exploration strategies, etc.
- Do NOT simply copy a reference implementation with minor changes
Evaluation
Trained and evaluated on CartPole-v1, LunarLander-v2, Acrobot-v1. Additional held-out environments (not shown during intermediate testing) are used to assess generalization. Metric: mean episodic return over 10 evaluation episodes (higher is better).
cleanrl/cleanrl/custom_value_discrete.py [EDITABLE — lines 174–242 only]
1: # Custom value-based discrete RL algorithm for MLS-Bench
2: #
3: # EDITABLE section: QNetwork head and ValueAlgorithm classes.
4: # FIXED sections: everything else (config, env, buffer, encoder, utility, training loop).
5: import os
6: import random
7: import time
8: from dataclasses import dataclass
9:
10: import gymnasium as gym
11: import numpy as np
12: import torch
13: import torch.nn as nn
14: import torch.nn.functional as F
15: import torch.optim as optim
16: import tyro
17:
18:
19: # =====================================================================
20: # FIXED: Configuration
21: # =====================================================================
22: @dataclass
23: class Args:
24: exp_name: str = os.path.basename(__file__)[: -len(".py")]
25: """the name of this experiment"""
26: seed: int = 1
27: """seed of the experiment"""
28: torch_deterministic: bool = True
29: """if toggled, `torch.backends.cudnn.deterministic=False`"""
30: cuda: bool = True
31: """if toggled, cuda will be enabled by default"""
32:
33: # Algorithm specific arguments
34: env_id: str = "CartPole-v1"
35: """the id of the environment"""
36: total_timesteps: int = 500000
37: """total timesteps of the experiments"""
38: learning_rate: float = 2.5e-4
39: """the learning rate of the optimizer"""
40: buffer_size: int = 10000
41: """the replay memory buffer size"""
42: gamma: float = 0.99
"""the discount factor gamma"""

Tool Input
1{
2 "op": "replace",
3 "filename": "cleanrl/cleanrl/custom_value_discrete.py",
4 "start_line": 174,
5 "end_line": 242,
6 "content": "class QNetwork(nn.Module):\n \"\"\"Q-network with dropout for uncertainty estimation.\"\"\"\n def __init__(self, obs_dim, n_actions, dropout_rate=0.1):\n super().__init__()\n self.encoder = MLPEncoder(obs_dim)\n # Add dropout layer for uncertainty estimation\n self.dropout = nn.Dropout(p=dropout_rate)\n self.head = nn.Linear(ENCODER_FEATURE_DIM, n_actions)\n \n def forward(self, obs, mc_dropout=False):\n features = self.encoder(obs)\n if mc_dropout:\n # Apply dropout during inference for uncertainty estimation\n features = self.dropout(features)\n return self.head(features)\n \n def mc_forward(self, obs, n_samples=10):\n \"\"\"Run multiple forward passes with dropout enabled, return mean and std.\"\"\"\n # Enable dropout\n self.train()\n with torch.no_grad():\n samples = []\n for _ in range(n_samples):\n features = self.encoder(obs)\n features = self.dropout(features)\n q = self.head(features)\n samples.append(q)\n # Switch back to eval mode (but keep dropout enabled for future mc calls)\n self.eval()\n samples = torch.stack(samples, dim=0) # [n_samples, batch, n_actions]\n mean = samples.mean(dim=0)\n std = samples.std(dim=0, unbiased=False)\n return mean, std\n\n\nclass ValueAlgorithm:\n \"\"\"MC Dropout DQN with uncertainty-weighted optimistic targets.\"\"\"\n def __init__(self, obs_dim, n_actions, device, args):\n self.device = device\n self.n_actions = n_actions\n self.gamma = args.gamma\n self.total_it = 0\n \n # Hyperparameters for uncertainty weighting\n self.dropout_rate = 0.1\n self.mc_samples = 10\n self.beta = 1.0 # optimism coefficient\n self.uncertainty_weight = True\n self.eps = 1e-6\n \n # Networks\n self.q_network = QNetwork(obs_dim, n_actions, self.dropout_rate).to(device)\n self.target_network = QNetwork(obs_dim, n_actions, self.dropout_rate).to(device)\n self.target_network.load_state_dict(self.q_network.state_dict())\n \n self.optimizer = optim.Adam(self.q_network.parameters(), lr=args.learning_rate)\n \n def select_action(self, 
obs, epsilon):\n \"\"\"Optimistic exploration using MC dropout uncertainty.\"\"\"\n if random.random() < epsilon:\n return random.randint(0, self.n_actions - 1)\n obs_t = torch.tensor(obs.reshape(1, -1), device=self.device, dtype=torch.float32)\n with torch.no_grad():\n mean, std = self.q_network.mc_forward(obs_t, n_samples=self.mc_samples)\n # UCB: mean + beta * std\n optimistic_q = mean + self.beta * std\n return torch.argmax(optimistic_q, dim=1).item()\n \n def update(self, batch, global_step):\n self.total_it += 1\n obs, next_obs, actions, rewards, dones = batch\n \n with torch.no_grad():\n # Compute optimistic target using MC dropout on target network\n next_mean, next_std = self.target_network.mc_forward(next_obs, n_samples=self.mc_samples)\n # UCB target: mean + beta * std\n next_q_optimistic = next_mean + self.beta * next_std\n target_max, _ = next_q_optimistic.max(dim=1)\n td_target = rewards + (1 - dones) * self.gamma * target_max\n \n # Get current Q values with dropout enabled (single sample)\n current_q = self.q_network(obs, mc_dropout=True) # enable dropout for training\n old_val = current_q.gather(1, actions.unsqueeze(1)).squeeze(1)\n \n # Compute uncertainty weights if enabled\n if self.uncertainty_weight:\n # Estimate uncertainty per sample using MC dropout\n with torch.no_grad():\n # Compute variance across multiple dropout samples for each state-action\n mc_q_samples = []\n for _ in range(self.mc_samples):\n q_sample = self.q_network(obs, mc_dropout=True)\n mc_q_samples.append(q_sample)\n mc_q_samples = torch.stack(mc_q_samples, dim=0) # [mc_samples, batch, n_actions]\n # Variance per state-action\n selected_q_samples = mc_q_samples[:, torch.arange(len(obs)), actions]\n variance = selected_q_samples.var(dim=0, unbiased=False)\n uncertainty = torch.sqrt(variance + self.eps)\n # Weight inversely proportional to uncertainty\n weights = 1.0 / (uncertainty + 1.0)\n # Normalize weights to have mean 1\n weights = weights / weights.mean()\n # Weighted 
MSE loss\n td_error = td_target - old_val\n td_loss = (weights * td_error ** 2).mean()\n else:\n td_loss = F.mse_loss(td_target, old_val)\n \n self.optimizer.zero_grad()\n td_loss.backward()\n torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), max_norm=10.0)\n self.optimizer.step()\n \n # Logging\n with torch.no_grad():\n mean_q = old_val.mean().item()\n if self.uncertainty_weight:\n avg_weight = weights.mean().item()\n avg_uncertainty = uncertainty.mean().item()\n else:\n avg_weight = 1.0\n avg_uncertainty = 0.0\n \n return {\n \"td_loss\": td_loss.item(),\n \"q_values\": mean_q,\n \"uncertainty\": avg_uncertainty,\n \"weight\": avg_weight,\n }"
7}

OK: Replaced lines 174..242 with 129 line(s) in cleanrl/cleanrl/custom_value_discrete.py. Editable range: 174–302. [Current file: cleanrl/cleanrl/custom_value_discrete.py | editable: 174–302 | total: 389 lines] 171| # ===================================================================== 172| # EDITABLE: QNetwork head and ValueAlgorithm 173| # ===================================================================== 174 class QNetwork(nn.Module): 175 """Q-network with dropout for uncertainty estimation.""" 176 def __init__(self, obs_dim, n_actions, dropout_rate=0.1): 177 super().__init__() 178 self.encoder = MLPEncoder(obs_dim) 179 # Add dropout layer for uncertainty estimation 180 self.dropout = nn.Dropout(p=dropout_rate) 181 self.head = nn.Linear(ENCODER_FEATURE_DIM, n_actions) 182 183 def forward(self, obs, mc_dropout=False): 184 features = self.encoder(obs) 185
[Test #1] (2 tests remaining; call submit(n=1) to submit this result) ## Seed 42 ### cartpole-v1 (scripts/cartpole.sh) Training metrics (last steps): TRAIN_METRICS step=495000 td_loss=1214498048.0000 q_values=277551.0938 uncertainty=0.0010 weight=1.0000 TRAIN_METRICS step=496000 td_loss=1261210496.0000 q_values=282532.5625 uncertainty=0.0010 weight=1.0000 TRAIN_METRICS step=497000 td_loss=2526375680.0000 q_values=285740.0625 uncertainty=0.0010 weight=1.0000 TRAIN_METRICS step=498000 td_loss=2545784576.0000 q_values=287017.6875 uncertainty=0.0010 weight=1.0000 TRAIN_METRICS step=499000 td_loss=3178588672.0000 q_values=287783.8125 uncertainty=0.0010 weight=1.0000 Evaluation (cartpole-v1): Eval episodic_return: 52.00 Eval episodic_return: 90.30 Eval episodic_return: 13.90 Final eval return: 13.90 ### lunarlander-v2 (scripts/lunarlander.sh) Training metrics (last steps): TRAIN_METRICS step=495000 td_loss=147635910475776.0000 q_values=55743072.0000 uncertainty=0.0010 weight=1.0000 TRAIN
Tool Input
1{
2 "op": "replace",
3 "filename": "cleanrl/cleanrl/custom_value_discrete.py",
4 "start_line": 174,
5 "end_line": 302,
6 "content": "class QNetwork(nn.Module):\n \"\"\"Q-network with dropout for uncertainty estimation.\"\"\"\n def __init__(self, obs_dim, n_actions, dropout_rate=0.5):\n super().__init__()\n self.encoder = MLPEncoder(obs_dim)\n # Add dropout layer for uncertainty estimation\n self.dropout = nn.Dropout(p=dropout_rate)\n self.head = nn.Linear(ENCODER_FEATURE_DIM, n_actions)\n # Optimistic initialization: small positive bias for exploration\n nn.init.constant_(self.head.bias, 0.1)\n \n def forward(self, obs, mc_dropout=False):\n features = self.encoder(obs)\n if mc_dropout:\n # Apply dropout during inference for uncertainty estimation\n features = self.dropout(features)\n return self.head(features)\n \n def mc_forward(self, obs, n_samples=10):\n \"\"\"Run multiple forward passes with dropout enabled, return mean and std.\"\"\"\n # Enable dropout\n self.train()\n with torch.no_grad():\n samples = []\n for _ in range(n_samples):\n features = self.encoder(obs)\n features = self.dropout(features)\n q = self.head(features)\n samples.append(q)\n # Switch back to eval mode (but keep dropout enabled for future mc calls)\n self.eval()\n samples = torch.stack(samples, dim=0) # [n_samples, batch, n_actions]\n mean = samples.mean(dim=0)\n std = samples.std(dim=0, unbiased=False)\n return mean, std\n\n\nclass ValueAlgorithm:\n \"\"\"Uncertainty-Penalized DQN with dropout uncertainty and regularization.\"\"\"\n def __init__(self, obs_dim, n_actions, device, args):\n self.device = device\n self.n_actions = n_actions\n self.gamma = args.gamma\n self.total_it = 0\n \n # Hyperparameters\n self.dropout_rate = 0.5\n self.mc_samples = 10\n self.beta = 1.0 # uncertainty penalty coefficient\n self.uncertainty_weight = True\n self.eps = 1e-6\n self.huber_delta = 10.0 # delta for Huber loss\n \n # Networks\n self.q_network = QNetwork(obs_dim, n_actions, self.dropout_rate).to(device)\n self.target_network = QNetwork(obs_dim, n_actions, self.dropout_rate).to(device)\n 
self.target_network.load_state_dict(self.q_network.state_dict())\n \n # Optimizer with weight decay\n self.optimizer = optim.Adam(self.q_network.parameters(), lr=args.learning_rate, weight_decay=1e-5)\n \n def select_action(self, obs, epsilon):\n \"\"\"Optimistic exploration using MC dropout uncertainty.\"\"\"\n if random.random() < epsilon:\n return random.randint(0, self.n_actions - 1)\n obs_t = torch.tensor(obs.reshape(1, -1), device=self.device, dtype=torch.float32)\n with torch.no_grad():\n mean, std = self.q_network.mc_forward(obs_t, n_samples=self.mc_samples)\n # UCB: mean + beta * std\n optimistic_q = mean + self.beta * std\n return torch.argmax(optimistic_q, dim=1).item()\n \n def update(self, batch, global_step):\n self.total_it += 1\n obs, next_obs, actions, rewards, dones = batch\n \n with torch.no_grad():\n # Compute pessimistic target using MC dropout on target network\n next_mean, next_std = self.target_network.mc_forward(next_obs, n_samples=self.mc_samples)\n # Lower confidence bound: mean - beta * std (penalize uncertainty)\n next_q_pessimistic = next_mean - self.beta * next_std\n target_max, _ = next_q_pessimistic.max(dim=1)\n td_target = rewards + (1 - dones) * self.gamma * target_max\n # Clip td_target to prevent explosion (optional)\n td_target = td_target.clamp(-1000, 1000)\n \n # Get current Q values with dropout enabled (single sample)\n current_q = self.q_network(obs, mc_dropout=True) # enable dropout for training\n old_val = current_q.gather(1, actions.unsqueeze(1)).squeeze(1)\n \n # Compute uncertainty weights if enabled\n if self.uncertainty_weight:\n # Estimate uncertainty per sample using MC dropout\n with torch.no_grad():\n # Compute variance across multiple dropout samples for each state-action\n mc_q_samples = []\n for _ in range(self.mc_samples):\n q_sample = self.q_network(obs, mc_dropout=True)\n mc_q_samples.append(q_sample)\n mc_q_samples = torch.stack(mc_q_samples, dim=0) # [mc_samples, batch, n_actions]\n # Variance per 
state-action\n selected_q_samples = mc_q_samples[:, torch.arange(len(obs)), actions]\n variance = selected_q_samples.var(dim=0, unbiased=False)\n uncertainty = torch.sqrt(variance + self.eps)\n # Weight inversely proportional to uncertainty\n weights = 1.0 / (uncertainty + 1.0)\n # Normalize weights to have mean 1\n weights = weights / weights.mean()\n # Weighted Huber loss\n td_error = td_target - old_val\n huber_loss = F.huber_loss(td_error, torch.zeros_like(td_error), reduction='none', delta=self.huber_delta)\n td_loss = (weights * huber_loss).mean()\n else:\n td_loss = F.huber_loss(td_target, old_val, reduction='mean', delta=self.huber_delta)\n \n self.optimizer.zero_grad()\n td_loss.backward()\n torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), max_norm=10.0)\n self.optimizer.step()\n \n # Logging\n with torch.no_grad():\n mean_q = old_val.mean().item()\n if self.uncertainty_weight:\n avg_weight = weights.mean().item()\n avg_uncertainty = uncertainty.mean().item()\n else:\n avg_weight = 1.0\n avg_uncertainty = 0.0\n \n return {\n \"td_loss\": td_loss.item(),\n \"q_values\": mean_q,\n \"uncertainty\": avg_uncertainty,\n \"weight\": avg_weight,\n }"
7}

OK: Replaced lines 174..302 with 136 line(s) in cleanrl/cleanrl/custom_value_discrete.py. Editable range: 174–309. [Current file: cleanrl/cleanrl/custom_value_discrete.py | editable: 174–309 | total: 396 lines] 171| # ===================================================================== 172| # EDITABLE: QNetwork head and ValueAlgorithm 173| # ===================================================================== 174 class QNetwork(nn.Module): 175 """Q-network with dropout for uncertainty estimation.""" 176 def __init__(self, obs_dim, n_actions, dropout_rate=0.5): 177 super().__init__() 178 self.encoder = MLPEncoder(obs_dim) 179 # Add dropout layer for uncertainty estimation 180 self.dropout = nn.Dropout(p=dropout_rate) 181 self.head = nn.Linear(ENCODER_FEATURE_DIM, n_actions) 182 # Optimistic initialization: small positive bias for exploration 183 nn.init.constant_(self.head
[Test #2] (1 test remaining; call submit(n=2) to submit this result) ### cartpole-v1 (scripts/cartpole.sh) Training metrics (last steps): TRAIN_METRICS step=495000 td_loss=0.1430 q_values=2.3103 uncertainty=0.0010 weight=1.0000 TRAIN_METRICS step=496000 td_loss=0.2369 q_values=1.8540 uncertainty=0.0010 weight=1.0000 TRAIN_METRICS step=497000 td_loss=0.1940 q_values=2.3045 uncertainty=0.0010 weight=1.0000 TRAIN_METRICS step=498000 td_loss=0.1320 q_values=2.1621 uncertainty=0.0010 weight=1.0000 TRAIN_METRICS step=499000 td_loss=0.2308 q_values=1.8689 uncertainty=0.0010 weight=1.0000 Evaluation (cartpole-v1): Eval episodic_return: 65.40 Eval episodic_return: 67.30 Eval episodic_return: 70.80 Final eval return: 70.80 ### lunarlander-v2 (scripts/lunarlander.sh) Training metrics (last steps): TRAIN_METRICS step=495000 td_loss=478.9758 q_values=-996.2295 uncertainty=0.0010 weight=1.0000 TRAIN_METRICS step=496000 td_loss=270.4600 q_values=-998.9423 uncertainty=0.0010 weight=1.0000 TRAIN_METR
Tool Input
1{
2 "op": "replace",
3 "filename": "cleanrl/cleanrl/custom_value_discrete.py",
4 "start_line": 174,
5 "end_line": 309,
6 "content": "class QNetwork(nn.Module):\n \"\"\"Ensemble Q-network with multiple heads for uncertainty estimation.\"\"\"\n def __init__(self, obs_dim, n_actions, ensemble_size=5):\n super().__init__()\n self.encoder = MLPEncoder(obs_dim)\n self.n_actions = n_actions\n self.ensemble_size = ensemble_size\n # Separate heads for each ensemble member\n self.heads = nn.ModuleList([\n nn.Linear(ENCODER_FEATURE_DIM, n_actions) for _ in range(ensemble_size)\n ])\n # Initialize each head with small random variations\n for head in self.heads:\n nn.init.normal_(head.weight, mean=0.0, std=0.01)\n nn.init.constant_(head.bias, 0.1)\n \n def forward(self, obs):\n \"\"\"Return mean Q-values across ensemble.\"\"\"\n features = self.encoder(obs)\n q_vals = [head(features) for head in self.heads] # list of [batch, n_actions]\n q_stack = torch.stack(q_vals, dim=0) # [ensemble, batch, n_actions]\n return q_stack.mean(dim=0)\n \n def forward_all(self, obs):\n \"\"\"Return Q-values for each ensemble member: [ensemble, batch, n_actions].\"\"\"\n features = self.encoder(obs)\n q_vals = [head(features) for head in self.heads]\n return torch.stack(q_vals, dim=0)\n \n def forward_head(self, obs, idx):\n \"\"\"Return Q-values for a specific head.\"\"\"\n features = self.encoder(obs)\n return self.heads[idx](features)\n\n\nclass ValueAlgorithm:\n \"\"\"Ensemble-DQN with variance-weighted pessimistic targets.\"\"\"\n def __init__(self, obs_dim, n_actions, device, args):\n self.device = device\n self.n_actions = n_actions\n self.gamma = args.gamma\n self.total_it = 0\n \n # Hyperparameters\n self.ensemble_size = 5\n self.beta = 1.0 # uncertainty penalty coefficient\n self.optimistic_exploration = True\n self.uncertainty_weight = True\n self.eps = 1e-6\n \n # Networks\n self.q_network = QNetwork(obs_dim, n_actions, self.ensemble_size).to(device)\n self.target_network = QNetwork(obs_dim, n_actions, self.ensemble_size).to(device)\n self.target_network.load_state_dict(self.q_network.state_dict())\n 
\n # Optimizer with weight decay\n self.optimizer = optim.Adam(self.q_network.parameters(), lr=args.learning_rate, weight_decay=1e-5)\n \n def select_action(self, obs, epsilon):\n \"\"\"Optimistic exploration using ensemble variance.\"\"\"\n if random.random() < epsilon:\n return random.randint(0, self.n_actions - 1)\n obs_t = torch.tensor(obs.reshape(1, -1), device=self.device, dtype=torch.float32)\n with torch.no_grad():\n q_ensemble = self.q_network.forward_all(obs_t) # [ensemble, 1, n_actions]\n mean = q_ensemble.mean(dim=0).squeeze(0) # [n_actions]\n std = q_ensemble.std(dim=0, unbiased=False).squeeze(0) # [n_actions]\n if self.optimistic_exploration:\n # UCB: mean + beta * std\n optimistic_q = mean + self.beta * std\n else:\n optimistic_q = mean\n return torch.argmax(optimistic_q).item()\n \n def update(self, batch, global_step):\n self.total_it += 1\n obs, next_obs, actions, rewards, dones = batch\n \n with torch.no_grad():\n # Compute ensemble Q-values for next state from target network\n next_q_ensemble = self.target_network.forward_all(next_obs) # [ensemble, batch, n_actions]\n next_mean = next_q_ensemble.mean(dim=0) # [batch, n_actions]\n next_std = next_q_ensemble.std(dim=0, unbiased=False) # [batch, n_actions]\n # Pessimistic target: mean - beta * std\n next_q_pessimistic = next_mean - self.beta * next_std\n target_max, _ = next_q_pessimistic.max(dim=1)\n td_target = rewards + (1 - dones) * self.gamma * target_max\n # Clip td_target to reasonable range\n td_target = td_target.clamp(-1000, 1000)\n \n # Get current Q-values from each head\n q_ensemble = self.q_network.forward_all(obs) # [ensemble, batch, n_actions]\n # Select Q-values for taken actions\n actions_expanded = actions.unsqueeze(0).unsqueeze(2).expand(self.ensemble_size, -1, -1) # [ensemble, batch, 1]\n old_vals = q_ensemble.gather(2, actions_expanded).squeeze(2) # [ensemble, batch]\n \n # Compute uncertainty weights per sample\n if self.uncertainty_weight:\n # Variance across ensemble per 
state-action\n variance = old_vals.var(dim=0, unbiased=False) # [batch]\n uncertainty = torch.sqrt(variance + self.eps)\n weights = 1.0 / (uncertainty + 1.0)\n weights = weights / weights.mean() # normalize mean to 1\n else:\n weights = torch.ones_like(rewards)\n \n # Compute loss for each ensemble member\n td_target_expanded = td_target.unsqueeze(0).expand(self.ensemble_size, -1) # [ensemble, batch]\n td_errors = td_target_expanded - old_vals\n huber_loss = F.huber_loss(td_errors, torch.zeros_like(td_errors), reduction='none', delta=10.0)\n # Weight each sample\n huber_loss = huber_loss * weights.unsqueeze(0)\n td_loss = huber_loss.mean()\n \n self.optimizer.zero_grad()\n td_loss.backward()\n torch.nn.utils.clip_grad_norm_(self.q_network.parameters(), max_norm=10.0)\n self.optimizer.step()\n \n # Logging\n with torch.no_grad():\n mean_q = old_vals.mean().item()\n if self.uncertainty_weight:\n avg_weight = weights.mean().item()\n avg_uncertainty = uncertainty.mean().item()\n else:\n avg_weight = 1.0\n avg_uncertainty = 0.0\n \n return {\n \"td_loss\": td_loss.item(),\n \"q_values\": mean_q,\n \"uncertainty\": avg_uncertainty,\n \"weight\": avg_weight,\n }"
7}

OK: Replaced lines 174..309 with 135 line(s) in cleanrl/cleanrl/custom_value_discrete.py. Editable range: 174–308. [Current file: cleanrl/cleanrl/custom_value_discrete.py | editable: 174–308 | total: 395 lines] 171| # ===================================================================== 172| # EDITABLE: QNetwork head and ValueAlgorithm 173| # ===================================================================== 174 class QNetwork(nn.Module): 175 """Ensemble Q-network with multiple heads for uncertainty estimation.""" 176 def __init__(self, obs_dim, n_actions, ensemble_size=5): 177 super().__init__() 178 self.encoder = MLPEncoder(obs_dim) 179 self.n_actions = n_actions 180 self.ensemble_size = ensemble_size 181 # Separate heads for each ensemble member 182 self.heads = nn.ModuleList([ 183 nn.Linear(ENCODER_FEATURE_DIM, n_actions) for _ in range(ensemble_size) 184
[Test #3] (0 tests remaining) ## Seed 42 ### cartpole-v1 (scripts/cartpole.sh) Training metrics (last steps): TRAIN_METRICS step=495000 td_loss=0.0007 q_values=98.4470 uncertainty=0.0146 weight=1.0000 TRAIN_METRICS step=496000 td_loss=0.0011 q_values=98.4688 uncertainty=0.0188 weight=1.0000 TRAIN_METRICS step=497000 td_loss=0.0016 q_values=98.5447 uncertainty=0.0230 weight=1.0000 TRAIN_METRICS step=498000 td_loss=0.0014 q_values=98.4986 uncertainty=0.0268 weight=1.0000 TRAIN_METRICS step=499000 td_loss=0.0018 q_values=98.2919 uncertainty=0.0319 weight=1.0000 Evaluation (cartpole-v1): Eval episodic_return: 286.40 Eval episodic_return: 500.00 Eval episodic_return: 500.00 Final eval return: 500.00 ### lunarlander-v2 (scripts/lunarlander.sh) Training metrics (last steps): TRAIN_METRICS step=495000 td_loss=14.8493 q_values=115.5665 uncertainty=1669.3303 weight=1.0000 TRAIN_METRICS step=496000 td_loss=13.9367 q_values=58.0183 uncertainty=1520.8800 weight=1.0000 TRAIN_METRICS step=497000