From a99b828bfa9b48501c0eef135b7d846dcd5c3d7b Mon Sep 17 00:00:00 2001 From: "Justin M. Lidard" Date: Thu, 14 Nov 2024 21:01:37 -0500 Subject: [PATCH 1/7] add diffusion versions of rlpd/ibrl/cal-ql --- agent/finetune/train_calql_diffusion_agent.py | 478 ++++++++++++++++++ agent/finetune/train_ibrl_diffusion_agent.py | 354 +++++++++++++ agent/finetune/train_rlpd_diffusion_agent.py | 381 ++++++++++++++ .../can/calql_diffusion_mlp_online.yaml | 121 +++++ .../finetune/can/ibrl_diffusion_mlp.yaml | 119 +++++ .../square/calql_diffusion_mlp_online.yaml | 122 +++++ .../finetune/square/ibrl_diffusion_mlp.yaml | 120 +++++ .../can/calql_diffusion_mlp_offline.yaml | 117 +++++ .../square/calql_diffusion_mlp_offline.yaml | 118 +++++ .../scratch/can/rlpd_diffusion_mlp.yaml | 116 +++++ .../scratch/square/rlpd_diffusion_mlp.yaml | 117 +++++ model/diffusion/diffusion_calql.py | 223 ++++++++ model/diffusion/diffusion_ibrl.py | 286 +++++++++++ model/diffusion/diffusion_rlpd.py | 152 ++++++ 14 files changed, 2824 insertions(+) create mode 100644 agent/finetune/train_calql_diffusion_agent.py create mode 100644 agent/finetune/train_ibrl_diffusion_agent.py create mode 100644 agent/finetune/train_rlpd_diffusion_agent.py create mode 100644 cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml create mode 100644 cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml create mode 100644 cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml create mode 100644 cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml create mode 100644 cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml create mode 100644 cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml create mode 100644 cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml create mode 100644 cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml create mode 100644 model/diffusion/diffusion_calql.py create mode 100644 model/diffusion/diffusion_ibrl.py create mode 100644 model/diffusion/diffusion_rlpd.py diff --git a/agent/finetune/train_calql_diffusion_agent.py b/agent/finetune/train_calql_diffusion_agent.py new file mode 100644 index 00000000..78a364ff --- /dev/null +++ b/agent/finetune/train_calql_diffusion_agent.py @@ -0,0 +1,478 @@ +""" +Reinforcement Learning with Prior Data (RLPD) agent training script. + +Does not support image observations right now. 
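+
+This agent implements Cal-QL (Calibrated Q-Learning) with a diffusion policy: the
+offline phase calibrates the conservative critic loss with Monte-Carlo reward-to-go,
+and the online phase mixes offline and online replay samples 50/50 in each batch.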
+""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainCalQLDiffusionAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + assert self.n_envs == 1, "Cal-QL only supports single env for now" + + # Train mode (offline or online) + self.train_online = cfg.train.train_online + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.critic.parameters(), + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Number of random actions to sample for Cal-QL + self.n_random_actions = cfg.train.n_random_actions + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic updates + self.num_update = cfg.train.num_update + + # Buffer size + self.buffer_size = cfg.train.buffer_size + + # Online only configs + if self.train_online: + # number of episode to colect per epoch for training + self.n_episode_per_epoch = cfg.train.n_episode_per_epoch + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + reward_to_go_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) + if not self.train_online: + obs_array = np.array(obs_buffer) + next_obs_array = np.array(next_obs_buffer) + actions_array = np.array(action_buffer) + rewards_array = np.array(reward_buffer) + reward_to_go_array = np.array(reward_to_go_buffer) + terminated_array = np.array(terminated_buffer) + + # load offline dataset into replay buffer + dataloader_offline = torch.utils.data.DataLoader( + self.dataset_offline, + batch_size=len(self.dataset_offline), + drop_last=False, + ) + for batch in dataloader_offline: + actions, states_and_next, rewards, terminated, reward_to_go = batch + states = states_and_next["state"] + next_states = 
states_and_next["next_state"] + obs_buffer_off = states.cpu().numpy() + next_obs_buffer_off = next_states.cpu().numpy() + action_buffer_off = actions.cpu().numpy() + reward_buffer_off = rewards.cpu().numpy().flatten() + reward_to_go_buffer_off = reward_to_go.cpu().numpy().flatten() + terminated_buffer_off = terminated.cpu().numpy().flatten() + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr >= self.n_explore_steps + and not self.force_train + ) + # during eval, we collect a fixed number of episodes, so we set n_steps to a large value + if eval_mode: + n_steps = int(1e5) + elif not self.train_online: + n_steps = 0 + else: + n_steps = int(1e5) # use episodes + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((n_steps, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for step in range(n_steps): + if step % 100 == 0: + print(f"Completed environment step {step}") + + # Select action + if self.itr < self.n_explore_steps: + action_venv = self.venv.action_space.sample() + else: + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv + + # add to buffer in train mode + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + if truncated_venv[i]: + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: # first obs in new episode + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= 
self.n_eval_episode: + break + if not eval_mode and cnt_episode >= self.n_episode_per_epoch: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. + episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + + # compute episode returns + returns_trajs_split = [ + np.zeros_like(reward_trajs) for reward_trajs in reward_trajs_split + ] + for traj_rewards, traj_returns in zip( + reward_trajs_split, returns_trajs_split + ): + prev_return = 0 + for t in range(len(traj_rewards)): + traj_returns[-t - 1] = ( + traj_rewards[-t - 1] + self.gamma * prev_return + ) + prev_return = traj_returns[-t - 1] + + # flatten (note: only works for single env!) + returns_trajs_split = np.concatenate(returns_trajs_split) + + # extend buffer + reward_to_go_buffer.extend(returns_trajs_split) + + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if not eval_mode and self.itr >= self.n_explore_steps: + # TODO: is this slow in online? 
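+                # Snapshot the online FIFO deques as numpy arrays once per iteration so
+                # the update loop below can index them with np.random.choice.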
+ if self.train_online: + obs_array = np.array(obs_buffer) + next_obs_array = np.array(next_obs_buffer) + actions_array = np.array(action_buffer) + rewards_array = np.array(reward_buffer) + reward_to_go_array = np.array(reward_to_go_buffer) + terminated_array = np.array(terminated_buffer) + + # override num_update + if self.train_online: + # the amount of new transitions(single env) + num_update = len(reward_trajs_split[0]) + else: + num_update = self.num_update + for _ in range(num_update): + # Sample from OFFLINE buffer + inds = np.random.choice( + len(obs_buffer_off), + self.batch_size // 2 if self.train_online else self.batch_size, + ) + obs_b = ( + torch.from_numpy(obs_buffer_off[inds]).float().to(self.device) + ) + next_obs_b = ( + torch.from_numpy(next_obs_buffer_off[inds]) + .float() + .to(self.device) + ) + actions_b = ( + torch.from_numpy(action_buffer_off[inds]) + .float() + .to(self.device) + ) + rewards_b = ( + torch.from_numpy(reward_buffer_off[inds]) + .float() + .to(self.device) + ) + terminated_b = ( + torch.from_numpy(terminated_buffer_off[inds]) + .float() + .to(self.device) + ) + reward_to_go_b = ( + torch.from_numpy(reward_to_go_buffer_off[inds]) + .float() + .to(self.device) + ) + + # Sample from ONLINE buffer + if self.train_online: + inds = np.random.choice(len(obs_buffer), self.batch_size // 2) + obs_b_on = ( + torch.from_numpy(obs_array[inds]).float().to(self.device) + ) + next_obs_b_on = ( + torch.from_numpy(next_obs_array[inds]) + .float() + .to(self.device) + ) + actions_b_on = ( + torch.from_numpy(actions_array[inds]) + .float() + .to(self.device) + ) + rewards_b_on = ( + torch.from_numpy(rewards_array[inds]) + .float() + .to(self.device) + ) + terminated_b_on = ( + torch.from_numpy(terminated_array[inds]) + .float() + .to(self.device) + ) + reward_to_go_b_on = ( + torch.from_numpy(reward_to_go_array[inds]) + .float() + .to(self.device) + ) + + # merge offline and online data + obs_b = torch.cat([obs_b, obs_b_on], dim=0) + next_obs_b = torch.cat([next_obs_b, next_obs_b_on], dim=0) + actions_b = torch.cat([actions_b, actions_b_on], dim=0) + rewards_b = torch.cat([rewards_b, rewards_b_on], dim=0) + terminated_b = torch.cat([terminated_b, terminated_b_on], dim=0) + reward_to_go_b = torch.cat( + [reward_to_go_b, reward_to_go_b_on], dim=0 + ) + + # Get a random action for Cal-QL + random_actions = ( + torch.rand( + ( + self.batch_size, + self.n_random_actions, + self.horizon_steps, + self.action_dim, + ) + ).to(self.device) + * 2 + - 1 + ) # scale to [-1, 1] + + # Update critic + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + random_actions, + rewards_b, + reward_to_go_b, + terminated_b, + self.gamma, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic + self.model.update_target_critic(self.target_ema_rate) + + # Update actor + loss_actor = self.model.loss_actor( + {"state": obs_b}, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr >= self.n_explore_steps: + time = timer() + run_results[-1]["time"] = time + if eval_mode: + log.info( + f"eval: success 
rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/agent/finetune/train_ibrl_diffusion_agent.py b/agent/finetune/train_ibrl_diffusion_agent.py new file mode 100644 index 00000000..27e4580b --- /dev/null +++ b/agent/finetune/train_ibrl_diffusion_agent.py @@ -0,0 +1,354 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) agent training script. + +Does not support image observations right now. +""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainIBRLDiffusionAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.ensemble_params.values(), # https://github.com/pytorch/pytorch/issues/120581 + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic updates + self.critic_num_update = cfg.train.critic_num_update + + # Update frequency + self.update_freq = cfg.train.update_freq + + # Buffer size + self.buffer_size = 
cfg.train.buffer_size + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) + + # load offline dataset into replay buffer + dataloader_offline = torch.utils.data.DataLoader( + self.dataset_offline, + batch_size=len(self.dataset_offline), + drop_last=False, + ) + for batch in dataloader_offline: + actions, states_and_next, rewards, terminated = batch + states = states_and_next["state"] + next_states = states_and_next["next_state"] + obs_buffer.extend(states.cpu().numpy()) + next_obs_buffer.extend(next_states.cpu().numpy()) + action_buffer.extend(actions.cpu().numpy()) + reward_buffer.extend(rewards.cpu().numpy().flatten()) + terminated_buffer.extend(terminated.cpu().numpy().flatten()) + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr > self.n_explore_steps + and not self.force_train + ) + n_steps = ( + self.n_steps if not eval_mode else int(1e5) + ) # large number for eval mode + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((n_steps, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for step in range(n_steps): + # Select action + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv + + # add to buffer in train mode + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + if "final_obs" in info_venv[i]: # truncated + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: # first obs in new 
episode + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= self.n_eval_episode: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. + episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if ( + not eval_mode + and self.itr > self.n_explore_steps + and self.itr % self.update_freq == 0 + ): + # Update critic more frequently + for _ in range(self.critic_num_update): + # Sample from online buffer + inds = np.random.choice(len(obs_buffer), self.batch_size) + obs_b = ( + torch.from_numpy(np.array([obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + next_obs_b = ( + torch.from_numpy(np.array([next_obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + actions_b = ( + torch.from_numpy(np.array([action_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + rewards_b = ( + torch.from_numpy(np.array([reward_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + terminated_b = ( + torch.from_numpy(np.array([terminated_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + rewards_b, + terminated_b, + self.gamma, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic every critic update + self.model.update_target_critic(self.target_ema_rate) + + # Update actor once with the final batch + loss_actor = self.model.loss_actor( + {"state": obs_b}, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update target actor + self.model.update_target_actor(self.target_ema_rate) + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": 
cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps: + time = timer() + run_results[-1]["time"] = time + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/agent/finetune/train_rlpd_diffusion_agent.py b/agent/finetune/train_rlpd_diffusion_agent.py new file mode 100644 index 00000000..31587d76 --- /dev/null +++ b/agent/finetune/train_rlpd_diffusion_agent.py @@ -0,0 +1,381 @@ +""" +Reinforcement Learning with Prior Data (RLPD) agent training script. + +Does not support image observations right now. +""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainRLPDDiffusionAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.ensemble_params.values(), # https://github.com/pytorch/pytorch/issues/120581 + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic 
updates + self.critic_num_update = cfg.train.critic_num_update + + # Buffer size + self.buffer_size = cfg.train.buffer_size + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) + + # load offline dataset into replay buffer + dataloader_offline = torch.utils.data.DataLoader( + self.dataset_offline, + batch_size=len(self.dataset_offline), + drop_last=False, + ) + for batch in dataloader_offline: + actions, states_and_next, rewards, terminated = batch + states = states_and_next["state"] + next_states = states_and_next["next_state"] + obs_buffer_off = states.cpu().numpy() + next_obs_buffer_off = next_states.cpu().numpy() + action_buffer_off = actions.cpu().numpy() + reward_buffer_off = rewards.cpu().numpy().flatten() + terminated_buffer_off = terminated.cpu().numpy().flatten() + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr >= self.n_explore_steps + and not self.force_train + ) + n_steps = ( + self.n_steps if not eval_mode else int(1e5) + ) # large number for eval mode + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, then the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((n_steps, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for step in range(n_steps): + # Select action + if self.itr < self.n_explore_steps: + action_venv = self.venv.action_space.sample() + else: + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv + + # add to buffer in train mode + if not eval_mode: + for i in 
range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + if truncated_venv[i]: + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= self.n_eval_episode: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. + episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if not eval_mode and self.itr >= self.n_explore_steps: + # Update critic more frequently + for _ in range(self.critic_num_update): + # Sample from OFFLINE buffer + inds = np.random.choice(len(obs_buffer_off), self.batch_size // 2) + obs_b_off = ( + torch.from_numpy(obs_buffer_off[inds]).float().to(self.device) + ) + next_obs_b_off = ( + torch.from_numpy(next_obs_buffer_off[inds]) + .float() + .to(self.device) + ) + actions_b_off = ( + torch.from_numpy(action_buffer_off[inds]) + .float() + .to(self.device) + ) + rewards_b_off = ( + torch.from_numpy(reward_buffer_off[inds]) + .float() + .to(self.device) + ) + terminated_b_off = ( + torch.from_numpy(terminated_buffer_off[inds]) + .float() + .to(self.device) + ) + + # Sample from ONLINE buffer + inds = np.random.choice(len(obs_buffer), self.batch_size // 2) + obs_b_on = ( + torch.from_numpy(np.array([obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + next_obs_b_on = ( + torch.from_numpy(np.array([next_obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + actions_b_on = ( + torch.from_numpy(np.array([action_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + rewards_b_on = ( + torch.from_numpy(np.array([reward_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + terminated_b_on = ( + torch.from_numpy(np.array([terminated_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + + # merge offline and online data + obs_b = torch.cat([obs_b_off, obs_b_on], dim=0) + next_obs_b = torch.cat([next_obs_b_off, next_obs_b_on], dim=0) + actions_b = torch.cat([actions_b_off, 
actions_b_on], dim=0) + rewards_b = torch.cat([rewards_b_off, rewards_b_on], dim=0) + terminated_b = torch.cat([terminated_b_off, terminated_b_on], dim=0) + + # Update critic + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + rewards_b, + terminated_b, + self.gamma, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic every critic update + self.model.update_target_critic(self.target_ema_rate) + + # Update actor once with the final batch + loss_actor = self.model.loss_actor( + {"state": obs_b}, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps: + time = timer() + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "entropy coeff": 0, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml new file mode 100644 index 00000000..bf35e0a6 --- /dev/null +++ b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml @@ -0,0 +1,121 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 
'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 3 # 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml new file mode 100644 index 00000000..85d603ae --- /dev/null +++ b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml @@ -0,0 +1,119 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_diffusion_agent.TrainIBRLDiffusionAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + 
reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ibrl-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.diffusion.diffusion_ibrl.IBRL_Diffusion + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml new file mode 100644 index 00000000..ad5dce08 --- /dev/null +++ b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml @@ -0,0 +1,122 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 
+ actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 3 # 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml new file mode 100644 index 00000000..82043b23 --- /dev/null +++ b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml @@ -0,0 +1,120 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_diffusion_agent.TrainIBRLDiffusionAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ibrl-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + 
first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.diffusion.diffusion_ibrl.IBRL_Diffusion + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml new file mode 100644 index 00000000..4afaf5d7 --- /dev/null +++ b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + 
scale_reward_factor: 1.0 + num_update: 3 # 1000 + buffer_size: 1000000 + n_eval_episode: 3 # 10 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml new file mode 100644 index 00000000..653cba85 --- /dev/null +++ b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml @@ -0,0 +1,118 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 3 # 1000 + buffer_size: 1000000 + n_eval_episode: 3 # 10 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: 
model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml new file mode 100644 index 00000000..6dc91eca --- /dev/null +++ b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml @@ -0,0 +1,116 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-rlpd-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 0 + +model: + _target_: model.diffusion.diffusion_rlpd.RLPD_Diffusion + randn_clip_value: 10 + backup_entropy: False + n_critics: 5 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + 
_target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml new file mode 100644 index 00000000..7567a57a --- /dev/null +++ b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-rlpd-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 0 + +model: + _target_: model.diffusion.diffusion_rlpd.RLPD_Diffusion + randn_clip_value: 10 + backup_entropy: False + n_critics: 5 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + 
horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/model/diffusion/diffusion_calql.py b/model/diffusion/diffusion_calql.py new file mode 100644 index 00000000..31b486d1 --- /dev/null +++ b/model/diffusion/diffusion_calql.py @@ -0,0 +1,223 @@ +""" +Calibrated Conservative Q-Learning (CalQL) for Gaussian policy. + +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy +import numpy as np +import einops + +from model.diffusion.diffusion_rwr import RWRDiffusion +from model.diffusion.sampling import make_timesteps + +log = logging.getLogger(__name__) + + +class CalQL_Diffusion(RWRDiffusion): + def __init__( + self, + actor, + critic, + network_path=None, + cql_clip_diff_min=-np.inf, + cql_clip_diff_max=np.inf, + cql_min_q_weight=5.0, + cql_n_actions=10, + **kwargs, + ): + super().__init__(network=actor, network_path=None, **kwargs) + self.cql_clip_diff_min = cql_clip_diff_min + self.cql_clip_diff_max = cql_clip_diff_max + self.cql_min_q_weight = cql_min_q_weight + self.cql_n_actions = cql_n_actions + + # initialize critic networks + self.critic = critic.to(self.device) + self.target_critic = deepcopy(critic).to(self.device) + + # Load pre-trained checkpoint - note we are also loading the pre-trained critic here + if network_path is not None: + checkpoint = torch.load( + network_path, + map_location=self.device, + weights_only=True, + ) + self.load_state_dict( + checkpoint["model"], + strict=True, + ) + log.info("Loaded actor from %s", network_path) + log.info( + f"Number of network parameters: {sum(p.numel() for p in self.parameters())}" + ) + + def loss_critic( + self, + obs, + next_obs, + actions, + random_actions, + rewards, + returns, + terminated, + gamma, + ): + B = len(actions) + + # Get initial TD loss + q_data1, q_data2 = self.critic(obs, actions) + with torch.no_grad(): + # repeat for action samples + next_obs_repeated = { + "state": next_obs["state"].repeat_interleave(self.cql_n_actions, dim=0) + } + + # Get the next actions + next_actions = self.forward( + next_obs_repeated, + deterministic=False, + ) + next_q1, next_q2 = self.target_critic(next_obs_repeated, next_actions) + next_q = torch.min(next_q1, next_q2) + + # Reshape the next_q to match the number of samples + next_q = next_q.view(B, self.cql_n_actions) # (B, n_sample) + + # Get the max indices over the samples, and index into the next_q + max_idx = torch.argmax(next_q, dim=1) + next_q = next_q[torch.arange(B), max_idx] + + # Get the target Q values + target_q = rewards + gamma * (1 - terminated) * next_q + + # TD loss + td_loss_1 = nn.functional.mse_loss(q_data1, target_q) + td_loss_2 = nn.functional.mse_loss(q_data2, target_q) + + # Get actions + pi_actions = self.forward( + obs, + deterministic=False, + ) # no gradient + pi_next_actions = self.forward( + next_obs, + deterministic=False, + ) # no gradient + + # Random action Q values + n_random_actions = random_actions.shape[1] + obs_sample_state = { + "state": obs["state"].repeat_interleave(n_random_actions, dim=0) + } + random_actions = einops.rearrange(random_actions, "B N H A -> (B N) H A") + + # Get the random action Q-values + q_rand_1, q_rand_2 = self.critic(obs_sample_state, random_actions) + 
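        # CQL conservative term: the Q-values of these uniformly sampled random
        # actions (reshaped below to (B, n_random_actions)) are concatenated with
        # the Q-values of policy actions at s and s', and a logsumexp over that set
        # forms a soft maximum over out-of-distribution actions, which is penalized
        # relative to the Q-values of the dataset actions (q_data1, q_data2).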
q_rand_1 = q_rand_1 + q_rand_2 = q_rand_2 + + # Reshape the random action Q values to match the number of samples + q_rand_1 = q_rand_1.view(B, n_random_actions) # (n_sample, B) + q_rand_2 = q_rand_2.view(B, n_random_actions) + + # Policy action Q values + q_pi_1, q_pi_2 = self.critic(obs, pi_actions) + q_pi_next_1, q_pi_next_2 = self.critic(next_obs, pi_next_actions) + + # Ensure calibration w.r.t. value function estimate + q_pi_1 = torch.max(q_pi_1, returns)[:, None] # (B, 1) + q_pi_2 = torch.max(q_pi_2, returns)[:, None] # (B, 1) + q_pi_next_1 = torch.max(q_pi_next_1, returns)[:, None] # (B, 1) + q_pi_next_2 = torch.max(q_pi_next_2, returns)[:, None] # (B, 1) + + # cql_importance_sample + q_pi_1 = q_pi_1 + q_pi_2 = q_pi_2 + q_pi_next_1 = q_pi_next_1 + q_pi_next_2 = q_pi_next_2 + cat_q_1 = torch.cat( + [q_rand_1, q_pi_1, q_pi_next_1], dim=-1 + ) # (B, num_samples+1) + cql_qf1_ood = torch.logsumexp(cat_q_1, dim=-1) # max over num_samples + cat_q_2 = torch.cat( + [q_rand_2, q_pi_2, q_pi_next_2], dim=-1 + ) # (B, num_samples+1) + cql_qf2_ood = torch.logsumexp(cat_q_2, dim=-1) # sum over num_samples + + # skip cal_lagrange since the paper shows cql_target_action_gap not used in kitchen + + # Subtract the log likelihood of the data + cql_qf1_diff = torch.clamp( + cql_qf1_ood - q_data1, + min=self.cql_clip_diff_min, + max=self.cql_clip_diff_max, + ).mean() + cql_qf2_diff = torch.clamp( + cql_qf2_ood - q_data2, + min=self.cql_clip_diff_min, + max=self.cql_clip_diff_max, + ).mean() + cql_min_qf1_loss = cql_qf1_diff * self.cql_min_q_weight + cql_min_qf2_loss = cql_qf2_diff * self.cql_min_q_weight + + # Sum the two losses + critic_loss = td_loss_1 + td_loss_2 + cql_min_qf1_loss + cql_min_qf2_loss + return critic_loss + + def loss_actor(self, obs): + action = self.forward_train( + obs, + deterministic=False, + ) + q1, q2 = self.critic(obs, action) + actor_loss = -torch.min(q1, q2) + return actor_loss.mean() + + def update_target_critic(self, tau): + for target_param, param in zip( + self.target_critic.parameters(), self.critic.parameters() + ): + target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) + + def forward_train( + self, + cond, + deterministic=False, + ): + """ + Differentiable forward pass used in actor training. + """ + device = self.betas.device + B = len(cond["state"]) + + # Loop + x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) + t_all = list(reversed(range(self.denoising_steps))) + for i, t in enumerate(t_all): + t_b = make_timesteps(B, t, device) + mean, logvar = self.p_mean_var( + x=x, + t=t_b, + cond=cond, + ) + std = torch.exp(0.5 * logvar) + + # Determine the noise level + if deterministic and t == 0: + std = torch.zeros_like(std) + elif deterministic: # For DDPM, sample with noise + std = torch.clip(std, min=1e-3) + else: + std = torch.clip(std, min=self.min_sampling_denoising_std) + noise = torch.randn_like(x).clamp_( + -self.randn_clip_value, self.randn_clip_value + ) + x = mean + std * noise + + # clamp action at final step + if self.final_action_clip_value and i == len(t_all) - 1: + x = torch.clamp(x, -1, 1) + return x diff --git a/model/diffusion/diffusion_ibrl.py b/model/diffusion/diffusion_ibrl.py new file mode 100644 index 00000000..51d545b8 --- /dev/null +++ b/model/diffusion/diffusion_ibrl.py @@ -0,0 +1,286 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) for Diffusion policy. 
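Keeps a frozen copy of the pre-trained (BC) diffusion policy next to the online RL
policy. At action-selection time both policies propose an action and the one with the
higher ensemble-critic Q-value is executed (greedy, or soft-sampled with probability
increasing in soft_action_sample_beta * Q); the critic target likewise bootstraps from
the better of the two proposals.

A minimal usage sketch (an assumption for illustration: the model block of
cfg/robomimic/finetune/*/ibrl_diffusion_mlp.yaml is built with hydra.utils.instantiate,
and obs holds the stacked low-dim observations, e.g. (B, cond_steps, obs_dim)):

    model = hydra.utils.instantiate(cfg.model)
    action = model(cond={"state": obs}, deterministic=False)  # (B, horizon_steps, action_dim)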
+ +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy + +from model.diffusion.diffusion_rwr import RWRDiffusion +from model.diffusion.sampling import make_timesteps + +log = logging.getLogger(__name__) + + +class IBRL_Diffusion(RWRDiffusion): + def __init__( + self, + actor, + critic, + n_critics, + soft_action_sample=False, + soft_action_sample_beta=10, + **kwargs, + ): + super().__init__(network=actor, **kwargs) + self.soft_action_sample = soft_action_sample + self.soft_action_sample_beta = soft_action_sample_beta + + # Set up target actor + self.target_actor = deepcopy(actor) + + # Frozen pre-trained policy + self.bc_policy = deepcopy(actor) + for param in self.bc_policy.parameters(): + param.requires_grad = False + + # initialize critic networks + self.critic_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.critic_networks = nn.ModuleList(self.critic_networks) + + # initialize target networks + self.target_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.target_networks = nn.ModuleList(self.target_networks) + + # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage. + base_model = deepcopy(self.critic_networks[0]) + self.base_model = base_model.to("meta") + self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( + self.critic_networks + ) + + def critic_wrapper(self, params, buffers, data): + """for vmap""" + return torch.func.functional_call(self.base_model, (params, buffers), data) + + def get_random_indices(self, sz=None, num_ind=2): + """get num_ind random indices from a set of size sz (used for getting critic targets)""" + if sz is None: + sz = len(self.critic_networks) + perm = torch.randperm(sz) + ind = perm[:num_ind].to(self.device) + return ind + + def loss_critic( + self, + obs, + next_obs, + actions, + rewards, + terminated, + gamma, + ): + # get random critic index + q1_ind, q2_ind = self.get_random_indices() + with torch.no_grad(): + next_actions_bc = self.forward_sample( + cond=next_obs, + network_override=self.bc_policy, + ) + next_actions_rl = self.forward_sample( + cond=next_obs, + deterministic=False, + network_override=self.target_actor, + ) + + # get the BC Q value + next_q1_bc = self.target_networks[q1_ind](next_obs, next_actions_bc) + next_q2_bc = self.target_networks[q2_ind](next_obs, next_actions_bc) + next_q_bc = torch.min(next_q1_bc, next_q2_bc) + + # get the RL Q value + next_q1_rl = self.target_networks[q1_ind](next_obs, next_actions_rl) + next_q2_rl = self.target_networks[q2_ind](next_obs, next_actions_rl) + next_q_rl = torch.min(next_q1_rl, next_q2_rl) + + # take the max Q value + next_q = torch.where(next_q_bc > next_q_rl, next_q_bc, next_q_rl) + + # target value + target_q = rewards + gamma * (1 - terminated) * next_q # (B,) + + # run all critics in batch + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, actions) + ) # (n_critics, B) + loss_critic = torch.mean((current_q - target_q[None]) ** 2) + return loss_critic + + def loss_actor(self, obs): + action = self.forward_train( + obs, + deterministic=False, + ) # use online policy only, also IBRL does not use tanh squashing + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, action) + ) # (n_critics, B) + current_q = current_q.min( + dim=0 + ).values 
# unlike RLPD, IBRL uses the min Q value for actor update + loss_actor = -torch.mean(current_q) + return loss_actor + + def update_target_critic(self, tau): + """need to use ensemble_params instead of critic_networks""" + for target_ind, target_critic in enumerate(self.target_networks): + for target_param_name, target_param in target_critic.named_parameters(): + source_param = self.ensemble_params[target_param_name][target_ind] + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def update_target_actor(self, tau): + for target_param, source_param in zip( + self.target_actor.parameters(), self.network.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + # ---------- Sampling ----------# + + def forward( + self, + cond, + deterministic=False, + reparameterize=False, + ): + """use both pre-trained and online policies""" + q1_ind, q2_ind = self.get_random_indices() + + # sample an action from the BC policy + bc_action = self.forward_sample( + cond=cond, + deterministic=True, + network_override=self.bc_policy, + ) + + # sample an action from the RL policy + rl_action = super().forward( + cond=cond, + deterministic=deterministic, + ) + + # compute Q value of BC policy + q_bc_1 = self.critic_networks[q1_ind](cond, bc_action) # (B,) + q_bc_2 = self.critic_networks[q2_ind](cond, bc_action) + q_bc = torch.min(q_bc_1, q_bc_2) + + # compute Q value of RL policy + q_rl_1 = self.critic_networks[q1_ind](cond, rl_action) + q_rl_2 = self.critic_networks[q2_ind](cond, rl_action) + q_rl = torch.min(q_rl_1, q_rl_2) + + # soft sample or greedy + if deterministic or not self.soft_action_sample: + action = torch.where( + (q_bc > q_rl)[:, None, None], + bc_action, + rl_action, + ) + else: + # compute the Q weights with probability proportional to exp(\beta * Q(a)) + qw_bc = torch.exp(q_bc * self.soft_action_sample_beta) + qw_rl = torch.exp(q_rl * self.soft_action_sample_beta) + q_weights = torch.softmax( + torch.stack([qw_bc, qw_rl], dim=-1), + dim=-1, + ) + + # sample according to the weights + q_indices = torch.multinomial(q_weights, 1) + action = torch.where( + (q_indices == 0)[:, None], + bc_action, + rl_action, + ) + return action + + # override + @torch.no_grad() + def forward_sample( + self, + cond, + deterministic=False, + network_override=None, + ): + device = cond["state"].device + B = len(cond["state"]) + + # Loop + x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) + t_all = list(reversed(range(self.denoising_steps))) + for i, t in enumerate(t_all): + t_b = make_timesteps(B, t, device) + mean, logvar = self.p_mean_var( + x=x, + t=t_b, + cond=cond, + network_override=network_override, + ) + std = torch.exp(0.5 * logvar) + + # Determine the noise level + if deterministic and t == 0: + std = torch.zeros_like(std) + elif deterministic: + std = torch.clip(std, min=1e-3) + else: + std = torch.clip(std, min=self.min_sampling_denoising_std) + noise = torch.randn_like(x).clamp_( + -self.randn_clip_value, self.randn_clip_value + ) + x = mean + std * noise + + # clamp action at final step + if self.final_action_clip_value is not None and i == len(t_all) - 1: + x = torch.clamp( + x, -self.final_action_clip_value, self.final_action_clip_value + ) + return x + + def forward_train( + self, + cond, + deterministic=False, + ): + """ + Differentiable forward pass used in actor training. 
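        Runs the full reverse-diffusion (DDPM) loop without torch.no_grad so that
        critic gradients can flow through the sampled action back to the actor
        network; the sampling std is clipped from below, and the action is clamped
        to [-1, 1] at the final denoising step when final_action_clip_value is set.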
+ """ + device = self.betas.device + B = len(cond["state"]) + + # Loop + x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) + t_all = list(reversed(range(self.denoising_steps))) + for i, t in enumerate(t_all): + t_b = make_timesteps(B, t, device) + mean, logvar = self.p_mean_var( + x=x, + t=t_b, + cond=cond, + ) + std = torch.exp(0.5 * logvar) + + # Determine the noise level + if deterministic and t == 0: + std = torch.zeros_like(std) + elif deterministic: # For DDPM, sample with noise + std = torch.clip(std, min=1e-3) + else: + std = torch.clip(std, min=self.min_sampling_denoising_std) + noise = torch.randn_like(x).clamp_( + -self.randn_clip_value, self.randn_clip_value + ) + x = mean + std * noise + + # clamp action at final step + if self.final_action_clip_value and i == len(t_all) - 1: + x = torch.clamp(x, -1, 1) + return x diff --git a/model/diffusion/diffusion_rlpd.py b/model/diffusion/diffusion_rlpd.py new file mode 100644 index 00000000..8126b445 --- /dev/null +++ b/model/diffusion/diffusion_rlpd.py @@ -0,0 +1,152 @@ +""" +Reinforcement learning with prior data (RLPD) for Diffusion policy. + +Use ensemble of critics. + +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy + +from model.diffusion.diffusion_rwr import RWRDiffusion +from model.diffusion.sampling import make_timesteps + +log = logging.getLogger(__name__) + + +class RLPD_Diffusion(RWRDiffusion): + def __init__( + self, + actor, + critic, + n_critics, + backup_entropy=False, + **kwargs, + ): + super().__init__(network=actor, **kwargs) + self.n_critics = n_critics + self.backup_entropy = backup_entropy + + # initialize critic networks + self.critic_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.critic_networks = nn.ModuleList(self.critic_networks) + + # initialize target networks + self.target_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.target_networks = nn.ModuleList(self.target_networks) + + # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage. 
+ base_model = deepcopy(self.critic_networks[0]) + self.base_model = base_model.to("meta") + self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( + self.critic_networks + ) + + def critic_wrapper(self, params, buffers, data): + """for vmap""" + return torch.func.functional_call(self.base_model, (params, buffers), data) + + def get_random_indices(self, sz=None, num_ind=2): + """get num_ind random indices from a set of size sz (used for getting critic targets)""" + if sz is None: + sz = len(self.critic_networks) + perm = torch.randperm(sz) + ind = perm[:num_ind].to(self.device) + return ind + + def loss_critic( + self, + obs, + next_obs, + actions, + rewards, + terminated, + gamma, + ): + # get random critic index + q1_ind, q2_ind = self.get_random_indices() + with torch.no_grad(): + next_actions = self.forward( + cond=next_obs, + deterministic=False, + ) + next_q1 = self.target_networks[q1_ind](next_obs, next_actions) + next_q2 = self.target_networks[q2_ind](next_obs, next_actions) + next_q = torch.min(next_q1, next_q2) + + # target value + target_q = rewards + gamma * (1 - terminated) * next_q # (B,) + + # run all critics in batch + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, actions) + ) # (n_critics, B) + loss_critic = torch.mean((current_q - target_q[None]) ** 2) + return loss_critic + + def loss_actor(self, obs): + action = self.forward_train( + obs, + deterministic=False, + ) + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, action) + ) # (n_critics, B) + current_q = current_q.mean(dim=0) + loss_actor = -torch.mean(current_q) + return loss_actor + + def update_target_critic(self, tau): + """need to use ensemble_params instead of critic_networks""" + for target_ind, target_critic in enumerate(self.target_networks): + for target_param_name, target_param in target_critic.named_parameters(): + source_param = self.ensemble_params[target_param_name][target_ind] + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def forward_train( + self, + cond, + deterministic=False, + ): + """ + Differentiable forward pass used in actor training. + """ + device = self.betas.device + B = len(cond["state"]) + + # Loop + x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) + t_all = list(reversed(range(self.denoising_steps))) + for i, t in enumerate(t_all): + t_b = make_timesteps(B, t, device) + mean, logvar = self.p_mean_var( + x=x, + t=t_b, + cond=cond, + ) + std = torch.exp(0.5 * logvar) + + # Determine the noise level + if deterministic and t == 0: + std = torch.zeros_like(std) + elif deterministic: # For DDPM, sample with noise + std = torch.clip(std, min=1e-3) + else: + std = torch.clip(std, min=self.min_sampling_denoising_std) + noise = torch.randn_like(x).clamp_( + -self.randn_clip_value, self.randn_clip_value + ) + x = mean + std * noise + + # clamp action at final step + if self.final_action_clip_value and i == len(t_all) - 1: + x = torch.clamp(x, -1, 1) + return x From 4888a1752271810a124c2fb42eeef38313d9f5b0 Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Thu, 14 Nov 2024 21:03:27 -0500 Subject: [PATCH 2/7] fix typo: --- model/diffusion/diffusion_calql.py | 2 +- model/diffusion/diffusion_ibrl.py | 2 +- model/diffusion/diffusion_rlpd.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/model/diffusion/diffusion_calql.py b/model/diffusion/diffusion_calql.py index 31b486d1..02c7dd50 100644 --- a/model/diffusion/diffusion_calql.py +++ b/model/diffusion/diffusion_calql.py @@ -1,5 +1,5 @@ """ -Calibrated Conservative Q-Learning (CalQL) for Gaussian policy. +Calibrated Conservative Q-Learning (CalQL) for Diffusion Policy. """ diff --git a/model/diffusion/diffusion_ibrl.py b/model/diffusion/diffusion_ibrl.py index 51d545b8..245ed76c 100644 --- a/model/diffusion/diffusion_ibrl.py +++ b/model/diffusion/diffusion_ibrl.py @@ -1,5 +1,5 @@ """ -Imitation Bootstrapped Reinforcement Learning (IBRL) for Diffusion policy. +Imitation Bootstrapped Reinforcement Learning (IBRL) for Diffusion Policy. """ diff --git a/model/diffusion/diffusion_rlpd.py b/model/diffusion/diffusion_rlpd.py index 8126b445..7a49d81f 100644 --- a/model/diffusion/diffusion_rlpd.py +++ b/model/diffusion/diffusion_rlpd.py @@ -1,5 +1,5 @@ """ -Reinforcement learning with prior data (RLPD) for Diffusion policy. +Reinforcement learning with prior data (RLPD) for Diffusion Policy. Use ensemble of critics. From 92d2dac13af312e84dc61b3bcc8093e91e50bd2c Mon Sep 17 00:00:00 2001 From: "Justin M. Lidard" Date: Thu, 14 Nov 2024 21:21:21 -0500 Subject: [PATCH 3/7] config fix --- cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml | 4 ++-- cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml | 2 +- cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml | 4 ++-- cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml | 2 +- cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml | 4 ++-- .../pretrain/square/calql_diffusion_mlp_offline.yaml | 4 ++-- cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml | 2 +- cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml index bf35e0a6..88dc1c64 100644 --- a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml +++ b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml @@ -76,10 +76,10 @@ train: n_random_actions: 4 target_ema_rate: 0.005 scale_reward_factor: 1.0 - num_update: 3 # 1000 + num_update: 1000 buffer_size: 1000000 online_utd_ratio: 1 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 model: diff --git a/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml index 85d603ae..36c6de6d 100644 --- a/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml @@ -76,7 +76,7 @@ train: scale_reward_factor: 1 critic_num_update: 3 buffer_size: 400000 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 update_freq: 2 diff --git a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml index ad5dce08..29a40dde 100644 --- a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml +++ b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml @@ -76,10 +76,10 @@ train: n_random_actions: 4 target_ema_rate: 0.005 scale_reward_factor: 1.0 - num_update: 3 # 1000 + num_update: 1000 buffer_size: 1000000 online_utd_ratio: 1 - 
n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 model: diff --git a/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml index 82043b23..7486d16c 100644 --- a/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml @@ -76,7 +76,7 @@ train: scale_reward_factor: 1 critic_num_update: 3 buffer_size: 400000 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 update_freq: 2 diff --git a/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml index 4afaf5d7..eff67779 100644 --- a/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml +++ b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml @@ -74,9 +74,9 @@ train: n_random_actions: 10 target_ema_rate: 0.005 scale_reward_factor: 1.0 - num_update: 3 # 1000 + num_update: 1000 buffer_size: 1000000 - n_eval_episode: 3 # 10 + n_eval_episode: 40 n_explore_steps: 0 model: diff --git a/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml index 653cba85..fe5acb22 100644 --- a/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml +++ b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml @@ -74,9 +74,9 @@ train: n_random_actions: 10 target_ema_rate: 0.005 scale_reward_factor: 1.0 - num_update: 3 # 1000 + num_update: 1000 buffer_size: 1000000 - n_eval_episode: 3 # 10 + n_eval_episode: 40 n_explore_steps: 0 model: diff --git a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml index 6dc91eca..1ac0138d 100644 --- a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml +++ b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml @@ -74,7 +74,7 @@ train: scale_reward_factor: 1 critic_num_update: 3 buffer_size: 400000 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} init_temperature: 0 diff --git a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml index 7567a57a..1677d1c8 100644 --- a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml +++ b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml @@ -74,7 +74,7 @@ train: scale_reward_factor: 1 critic_num_update: 3 buffer_size: 400000 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} init_temperature: 0 From b0caba9f8a4d0610cd013113bdcaf6caf3879a2d Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Thu, 14 Nov 2024 21:38:47 -0500 Subject: [PATCH 4/7] make forward pass differentiable --- model/diffusion/diffusion_ibrl.py | 46 ++----------------------------- 1 file changed, 2 insertions(+), 44 deletions(-) diff --git a/model/diffusion/diffusion_ibrl.py b/model/diffusion/diffusion_ibrl.py index 245ed76c..bcbd4cfe 100644 --- a/model/diffusion/diffusion_ibrl.py +++ b/model/diffusion/diffusion_ibrl.py @@ -113,7 +113,7 @@ def loss_critic( return loss_critic def loss_actor(self, obs): - action = self.forward_train( + action = self.forward( obs, deterministic=False, ) # use online policy only, also IBRL does not use tanh squashing @@ -149,7 +149,6 @@ def forward( self, cond, deterministic=False, - reparameterize=False, ): """use both pre-trained and online policies""" q1_ind, q2_ind = self.get_random_indices() @@ -162,7 +161,7 @@ def forward( ) # sample an action from the RL policy - rl_action = super().forward( + rl_action = self.forward_sample( cond=cond, deterministic=deterministic, ) @@ -203,7 +202,6 @@ def forward( return action # override - @torch.no_grad() def forward_sample( self, cond, @@ -244,43 +242,3 @@ def forward_sample( x, -self.final_action_clip_value, self.final_action_clip_value ) return x - - def forward_train( - self, - cond, - deterministic=False, - ): - """ - Differentiable forward pass used in actor training. - """ - device = self.betas.device - B = len(cond["state"]) - - # Loop - x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) - t_all = list(reversed(range(self.denoising_steps))) - for i, t in enumerate(t_all): - t_b = make_timesteps(B, t, device) - mean, logvar = self.p_mean_var( - x=x, - t=t_b, - cond=cond, - ) - std = torch.exp(0.5 * logvar) - - # Determine the noise level - if deterministic and t == 0: - std = torch.zeros_like(std) - elif deterministic: # For DDPM, sample with noise - std = torch.clip(std, min=1e-3) - else: - std = torch.clip(std, min=self.min_sampling_denoising_std) - noise = torch.randn_like(x).clamp_( - -self.randn_clip_value, self.randn_clip_value - ) - x = mean + std * noise - - # clamp action at final step - if self.final_action_clip_value and i == len(t_all) - 1: - x = torch.clamp(x, -1, 1) - return x From c367feb7fdcacc070b6aa48017fef206e94ea4e4 Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Fri, 22 Nov 2024 18:39:04 -0500 Subject: [PATCH 5/7] update configs --- .../can/calql_diffusion_mlp_online.yaml | 2 +- .../can/calql_diffusion_mlp_online_ph.yaml | 121 +++++++++++++++++ .../finetune/can/ibrl_diffusion_mlp_ph.yaml | 119 +++++++++++++++++ .../square/calql_diffusion_mlp_online.yaml | 2 +- .../square/calql_diffusion_mlp_online_ph.yaml | 122 ++++++++++++++++++ .../square/ibrl_diffusion_mlp_ph.yaml | 120 +++++++++++++++++ .../can/calql_diffusion_mlp_offline_ph.yaml | 117 +++++++++++++++++ .../pretrain/can/pre_diffusion_mlp_ph.yaml | 63 +++++++++ .../calql_diffusion_mlp_offline_ph.yaml | 118 +++++++++++++++++ .../pretrain/square/pre_diffusion_mlp_ph.yaml | 64 +++++++++ .../scratch/can/rlpd_diffusion_mlp.yaml | 4 +- .../scratch/can/rlpd_diffusion_mlp_ph.yaml | 116 +++++++++++++++++ .../scratch/square/rlpd_diffusion_mlp.yaml | 4 +- .../scratch/square/rlpd_diffusion_mlp_ph.yaml | 117 +++++++++++++++++ 14 files changed, 1083 insertions(+), 6 deletions(-) create mode 100644 cfg/robomimic/finetune/can/calql_diffusion_mlp_online_ph.yaml create mode 100644 cfg/robomimic/finetune/can/ibrl_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/finetune/square/calql_diffusion_mlp_online_ph.yaml create mode 100644 cfg/robomimic/finetune/square/ibrl_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline_ph.yaml create mode 100644 cfg/robomimic/pretrain/can/pre_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline_ph.yaml create mode 100644 cfg/robomimic/pretrain/square/pre_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/scratch/can/rlpd_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/scratch/square/rlpd_diffusion_mlp_ph.yaml diff --git a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml index 88dc1c64..fbbe2a51 100644 --- a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml +++ b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml @@ -48,7 +48,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 10000 n_steps: 1 # not used n_episode_per_epoch: 1 gamma: 0.99 diff --git a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online_ph.yaml b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online_ph.yaml new file mode 100644 index 00000000..4448bf60 --- /dev/null +++ b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online_ph.yaml @@ -0,0 +1,121 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: 
+ n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/ibrl_diffusion_mlp_ph.yaml b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..13080491 --- /dev/null +++ b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp_ph.yaml @@ -0,0 +1,119 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_diffusion_agent.TrainIBRLDiffusionAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: 
diffusion-ibrl-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.diffusion.diffusion_ibrl.IBRL_Diffusion + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml index 29a40dde..1987bacd 100644 --- a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml +++ b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml @@ -48,7 +48,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 10000 n_steps: 1 # not used n_episode_per_epoch: 1 gamma: 0.99 diff --git a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online_ph.yaml b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online_ph.yaml new file mode 100644 index 00000000..c3cf5285 --- /dev/null +++ b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online_ph.yaml @@ -0,0 +1,122 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of 
preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/ibrl_diffusion_mlp_ph.yaml b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..f2ffe2cc --- /dev/null +++ b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp_ph.yaml @@ -0,0 +1,120 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_diffusion_agent.TrainIBRLDiffusionAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + 
reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ibrl-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.diffusion.diffusion_ibrl.IBRL_Diffusion + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline_ph.yaml new file mode 100644 index 00000000..35fed253 --- /dev/null +++ b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline_ph.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ph-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 
1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ph.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..6607a971 --- /dev/null +++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ph.yaml @@ -0,0 +1,63 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz + +seed: 42 +device: cuda:0 +env: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-ph-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 10000 + warmup_steps: 100 + min_lr: 1e-5 + save_model_freq: 500 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline_ph.yaml new 
file mode 100644 index 00000000..521f8c4f --- /dev/null +++ b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline_ph.yaml @@ -0,0 +1,118 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ph-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ph.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..60f138d7 --- /dev/null +++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ph.yaml @@ -0,0 +1,64 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: 
${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz + +seed: 42 +device: cuda:0 +env: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-ph-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 10000 + warmup_steps: 100 + min_lr: 1e-5 + save_model_freq: 500 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml index 1ac0138d..ac240a5b 100644 --- a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml +++ b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml @@ -8,8 +8,8 @@ _target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent name: ${env_name}_rlpd_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json -normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz -offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz seed: 42 device: cuda:0 diff --git a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp_ph.yaml b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..10921f4d --- /dev/null +++ b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp_ph.yaml @@ -0,0 +1,116 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 
'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-rlpd-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 0 + +model: + _target_: model.diffusion.diffusion_rlpd.RLPD_Diffusion + randn_clip_value: 10 + backup_entropy: False + n_critics: 5 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml index 1677d1c8..b7e916b1 100644 --- a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml +++ b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml @@ -8,8 +8,8 @@ _target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent name: ${env_name}_rlpd_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json -normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz -offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz seed: 42 device: cuda:0 diff --git a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp_ph.yaml b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..0b1228c7 --- /dev/null +++ b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp_ph.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} 
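The RLPD-style configs keep a static offline_dataset alongside an online replay buffer of size buffer_size that the training agent fills. The RLPD recipe draws each update batch half from the prior (offline) data and half from online experience, so-called symmetric sampling. A minimal sketch of that sampling step, assuming both sources are dicts of aligned numpy arrays with matching keys; the function name and keys are illustrative, not the repository's API.

import numpy as np

def sample_symmetric(offline, online, batch_size, rng=None):
    # Draw 50% of the batch from the offline (prior) data and 50% from the online buffer.
    if rng is None:
        rng = np.random.default_rng()
    n_off = batch_size // 2
    idx_off = rng.integers(0, len(offline["obs"]), size=n_off)
    idx_on = rng.integers(0, len(online["obs"]), size=batch_size - n_off)
    return {
        key: np.concatenate([offline[key][idx_off], online[key][idx_on]], axis=0)
        for key in offline
    }

With batch_size: 256 this yields 128 offline and 128 online transitions per critic update. Note also that the critic here is an ensemble (double_q: False with n_critics: 5), which the RLPD recipe typically pairs with a min over a small random subset of target critics when forming the TD target.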
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-rlpd-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 0 + +model: + _target_: model.diffusion.diffusion_rlpd.RLPD_Diffusion + randn_clip_value: 10 + backup_entropy: False + n_critics: 5 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file From 583e1916836abde389550f5fd4828261fcfed107 Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Fri, 22 Nov 2024 19:50:23 -0500 Subject: [PATCH 6/7] add ppo with summed likelihood --- ...train_ppo_diffusion_agent_sumlikelihood.py | 484 ++++++++++++++++++ .../ft_ppo_diffusion_mlp_sumlikelihood.yaml | 111 ++++ .../diffusion/diffusion_ppo_sumlikelihood.py | 221 ++++++++ 3 files changed, 816 insertions(+) create mode 100644 agent/finetune/train_ppo_diffusion_agent_sumlikelihood.py create mode 100644 cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_sumlikelihood.yaml create mode 100644 model/diffusion/diffusion_ppo_sumlikelihood.py diff --git a/agent/finetune/train_ppo_diffusion_agent_sumlikelihood.py b/agent/finetune/train_ppo_diffusion_agent_sumlikelihood.py new file mode 100644 index 00000000..5f7005af --- /dev/null +++ b/agent/finetune/train_ppo_diffusion_agent_sumlikelihood.py @@ -0,0 +1,484 @@ +""" +DPPO fine-tuning. + +""" + +import os +import pickle +import einops +import numpy as np +import torch +import logging +import wandb +import math + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_ppo_agent import TrainPPOAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainPPODiffusionAgentSumLikelihood(TrainPPOAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Reward horizon --- always set to act_steps for now + self.reward_horizon = cfg.get("reward_horizon", self.act_steps) + + # Eta - between DDIM (=0 for eval) and DDPM (=1 for training) + self.learn_eta = self.model.learn_eta + if self.learn_eta: + self.eta_update_interval = cfg.train.eta_update_interval + self.eta_optimizer = torch.optim.AdamW( + self.model.eta.parameters(), + lr=cfg.train.eta_lr, + weight_decay=cfg.train.eta_weight_decay, + ) + self.eta_lr_scheduler = CosineAnnealingWarmupRestarts( + self.eta_optimizer, + first_cycle_steps=cfg.train.eta_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.eta_lr, + min_lr=cfg.train.eta_lr_scheduler.min_lr, + warmup_steps=cfg.train.eta_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + def run(self): + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + last_itr_eval = False + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = self.itr % self.val_freq == 0 and not self.force_train + self.model.eval() if eval_mode else self.model.train() + last_itr_eval = eval_mode + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode + firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or last_itr_eval: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + + # Holder + obs_trajs = { + "state": np.zeros( + (self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim) + ) + } + chains_trajs = np.zeros( + ( + self.n_steps, + self.n_envs, + self.model.ft_denoising_steps + 1, + self.horizon_steps, + self.action_dim, + ) + 
) + terminated_trajs = np.zeros((self.n_steps, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) + if self.save_full_observations: # state-only + obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim)) + obs_full_trajs = np.vstack( + (obs_full_trajs, prev_obs_venv["state"][:, -1][None]) + ) + + # Collect a set of trajectories from env + for step in range(self.n_steps): + if step % 10 == 0: + print(f"Processed step {step} of {self.n_steps}") + + # Select action + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = self.model( + cond=cond, + deterministic=eval_mode, + return_chain=True, + ) + output_venv = ( + samples.trajectories.cpu().numpy() + ) # n_env x horizon x act + chains_venv = ( + samples.chains.cpu().numpy() + ) # n_env x denoising x horizon x act + action_venv = output_venv[:, : self.act_steps] + + # Apply multi-step action + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) + done_venv = terminated_venv | truncated_venv + if self.save_full_observations: # state-only + obs_full_venv = np.array( + [info["full_obs"]["state"] for info in info_venv] + ) # n_envs x act_steps x obs_dim + obs_full_trajs = np.vstack( + (obs_full_trajs, obs_full_venv.transpose(1, 0, 2)) + ) + obs_trajs["state"][step] = prev_obs_venv["state"] + chains_trajs[step] = chains_venv + reward_trajs[step] = reward_venv + terminated_trajs[step] = terminated_venv + firsts_trajs[step + 1] = done_venv + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
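            # For a single env, firsts_trajs[:, env] = [1, 0, 0, 1, 0, 0, 0, 1] gives flag
            # indices [0, 3, 7], so two finished episodes are extracted (steps 0-2 and 3-6);
            # a trailing segment that never sees another flag is dropped, which is why the
            # warning below is logged when no episode completes within the iteration.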
+ episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + if ( + self.furniture_sparse_reward + ): # only for furniture tasks, where reward only occurs in one env step + episode_best_reward = episode_reward + else: + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + log.info("[WARNING] No episode completed within the iteration!") + + # Update models + if not eval_mode: + with torch.no_grad(): + obs_trajs["state"] = ( + torch.from_numpy(obs_trajs["state"]).float().to(self.device) + ) + + # Calculate value and logprobs - split into batches to prevent out of memory + num_split = math.ceil( + self.n_envs * self.n_steps / self.logprob_batch_size + ) + obs_ts = [{} for _ in range(num_split)] + obs_k = einops.rearrange( + obs_trajs["state"], + "s e ... -> (s e) ...", + ) + obs_ts_k = torch.split(obs_k, self.logprob_batch_size, dim=0) + for i, obs_t in enumerate(obs_ts_k): + obs_ts[i]["state"] = obs_t + values_trajs = np.empty((0, self.n_envs)) + for obs in obs_ts: + values = self.model.critic(obs).cpu().numpy().flatten() + values_trajs = np.vstack( + (values_trajs, values.reshape(-1, self.n_envs)) + ) + chains_t = einops.rearrange( + torch.from_numpy(chains_trajs).float().to(self.device), + "s e t h d -> (s e) t h d", + ) + chains_ts = torch.split(chains_t, self.logprob_batch_size, dim=0) + logprobs_trajs = np.empty( + ( + 0, + self.model.ft_denoising_steps, + self.horizon_steps, + self.action_dim, + ) + ) + for obs, chains in zip(obs_ts, chains_ts): + logprobs = self.model.get_logprobs(obs, chains).cpu().numpy() + logprobs_trajs = np.vstack( + ( + logprobs_trajs, + logprobs.reshape(-1, *logprobs_trajs.shape[1:]), + ) + ) + + # normalize reward with running variance if specified + if self.reward_scale_running: + reward_trajs_transpose = self.running_reward_scaler( + reward=reward_trajs.T, first=firsts_trajs[:-1].T + ) + reward_trajs = reward_trajs_transpose.T + + # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified + obs_venv_ts = { + "state": torch.from_numpy(obs_venv["state"]) + .float() + .to(self.device) + } + advantages_trajs = np.zeros_like(reward_trajs) + lastgaelam = 0 + for t in reversed(range(self.n_steps)): + if t == self.n_steps - 1: + nextvalues = ( + self.model.critic(obs_venv_ts) + .reshape(1, -1) + .cpu() + .numpy() + ) + else: + nextvalues = values_trajs[t + 1] + nonterminal = 1.0 - terminated_trajs[t] + # delta = r + gamma*V(st+1) - V(st) + delta = ( + reward_trajs[t] * self.reward_scale_const + + self.gamma * nextvalues * nonterminal + - values_trajs[t] + ) + # A = delta_t + gamma*lamdba*delta_{t+1} + ... 
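                    # Generalized Advantage Estimation, computed backward in time:
                    #   A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
                    # and the value targets used below are returns_t = A_t + V(s_t)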
+ advantages_trajs[t] = lastgaelam = ( + delta + + self.gamma * self.gae_lambda * nonterminal * lastgaelam + ) + returns_trajs = advantages_trajs + values_trajs + + # k for environment step + obs_k = { + "state": einops.rearrange( + obs_trajs["state"], + "s e ... -> (s e) ...", + ) + } + chains_k = einops.rearrange( + torch.tensor(chains_trajs, device=self.device).float(), + "s e t h d -> (s e) t h d", + ) + returns_k = ( + torch.tensor(returns_trajs, device=self.device).float().reshape(-1) + ) + values_k = ( + torch.tensor(values_trajs, device=self.device).float().reshape(-1) + ) + advantages_k = ( + torch.tensor(advantages_trajs, device=self.device) + .float() + .reshape(-1) + ) + logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float() + + # Update policy and critic + total_steps = self.n_steps * self.n_envs + clipfracs = [] + for update_epoch in range(self.update_epochs): + # for each epoch, go through all data in batches + flag_break = False + inds_k = torch.randperm(total_steps, device=self.device) + num_batch = max(1, total_steps // self.batch_size) # skip last ones + + # get the full range of denoising indices + denoising_inds_b = torch.arange(self.model.ft_denoising_steps) + + for batch in range(num_batch): + start = batch * self.batch_size + end = start + self.batch_size + batch_inds_b = inds_k[start:end] # b for batch + + obs_b = {"state": obs_k["state"][batch_inds_b]} + chains_prev_b = chains_k[batch_inds_b, :-1] + chains_next_b = chains_k[batch_inds_b, 1:] + returns_b = returns_k[batch_inds_b] + values_b = values_k[batch_inds_b] + advantages_b = advantages_k[batch_inds_b] + logprobs_b = logprobs_k[batch_inds_b, :].sum(1) + + # get loss + ( + pg_loss, + entropy_loss, + v_loss, + clipfrac, + approx_kl, + ratio, + bc_loss, + eta, + ) = self.model.loss( + obs_b, + chains_prev_b, + chains_next_b, + denoising_inds_b, + returns_b, + values_b, + advantages_b, + logprobs_b, + use_bc_loss=self.use_bc_loss, + reward_horizon=self.reward_horizon, + ) + loss = ( + pg_loss + + entropy_loss * self.ent_coef + + v_loss * self.vf_coef + + bc_loss * self.bc_loss_coeff + ) + clipfracs += [clipfrac] + + # update policy and critic + self.actor_optimizer.zero_grad() + self.critic_optimizer.zero_grad() + if self.learn_eta: + self.eta_optimizer.zero_grad() + loss.backward() + if self.itr >= self.n_critic_warmup_itr: + if self.max_grad_norm is not None: + torch.nn.utils.clip_grad_norm_( + self.model.actor_ft.parameters(), self.max_grad_norm + ) + self.actor_optimizer.step() + if self.learn_eta and batch % self.eta_update_interval == 0: + self.eta_optimizer.step() + self.critic_optimizer.step() + log.info( + f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}" + ) + + # Stop gradient update if KL difference reaches target + if self.target_kl is not None and approx_kl > self.target_kl: + flag_break = True + break + if flag_break: + break + + # Explained variation of future rewards using value function + y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy() + var_y = np.var(y_true) + explained_var = ( + np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y + ) + + # Plot state trajectories (only in D3IL) + if ( + self.itr % self.render_freq == 0 + and self.n_render > 0 + and self.traj_plotter is not None + ): + self.traj_plotter( + obs_full_trajs=obs_full_trajs, + n_render=self.n_render, + max_episode_steps=self.max_episode_steps, + render_dir=self.render_dir, + itr=self.itr, + ) + + # Update lr, min_sampling_std + if self.itr >= 
self.n_critic_warmup_itr: + self.actor_lr_scheduler.step() + if self.learn_eta: + self.eta_lr_scheduler.step() + self.critic_lr_scheduler.step() + self.model.step() + diffusion_min_sampling_std = self.model.get_min_sampling_denoising_std() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.save_trajs: + run_results[-1]["obs_full_trajs"] = obs_full_trajs + run_results[-1]["obs_trajs"] = obs_trajs + run_results[-1]["chains_trajs"] = chains_trajs + run_results[-1]["reward_trajs"] = reward_trajs + if self.itr % self.log_freq == 0: + time = timer() + run_results[-1]["time"] = time + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss": loss, + "pg loss": pg_loss, + "value loss": v_loss, + "bc loss": bc_loss, + "eta": eta, + "approx kl": approx_kl, + "ratio": ratio, + "clipfrac": np.mean(clipfracs), + "explained variance": explained_var, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + "diffusion - min sampling std": diffusion_min_sampling_std, + "actor lr": self.actor_optimizer.param_groups[0]["lr"], + "critic lr": self.critic_optimizer.param_groups[0][ + "lr" + ], + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_sumlikelihood.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_sumlikelihood.yaml new file mode 100644 index 00000000..df7288e4 --- /dev/null +++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_sumlikelihood.yaml @@ -0,0 +1,111 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent_sumlikelihood.TrainPPODiffusionAgentSumLikelihood + +name: ${env_name}_ft_diffusion_sumlikelihood_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt # use 8000 for comparing policy parameterizations +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 4 +act_steps: 4 + +env: + 
n_envs: 5 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: dppo-sumlikelihood-robomimic-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 151 + n_critic_warmup_itr: 2 + n_steps: 10 + gamma: 0.999 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 7500 + update_epochs: 10 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo_sumlikelihood.PPODiffusionSumLikelihood + # HP to tune + gamma_denoising: 0.99 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.001 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/model/diffusion/diffusion_ppo_sumlikelihood.py b/model/diffusion/diffusion_ppo_sumlikelihood.py new file mode 100644 index 00000000..c9f4b952 --- /dev/null +++ b/model/diffusion/diffusion_ppo_sumlikelihood.py @@ -0,0 +1,221 @@ +""" +DPPO: Diffusion Policy Policy Optimization. 
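Summed-likelihood variant: the log-probability of a sampled action chunk is taken as the sum of the per-step denoising log-likelihoods over the fine-tuned steps, log pi(a | s) = sum_k log p(x^{k-1} | x^k, s), so the PPO importance ratio is formed once per environment action rather than once per denoising step.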
+ +K: number of denoising steps +To: observation sequence length +Ta: action chunk size +Do: observation dimension +Da: action dimension + +C: image channels +H, W: image height and width + +""" + +from typing import Optional +import torch +import logging +import math + +log = logging.getLogger(__name__) +from model.diffusion.diffusion_vpg import VPGDiffusion + + +class PPODiffusionSumLikelihood(VPGDiffusion): + def __init__( + self, + gamma_denoising: float, + clip_ploss_coef: float, + clip_ploss_coef_base: float = 1e-3, + clip_ploss_coef_rate: float = 3, + clip_vloss_coef: Optional[float] = None, + clip_advantage_lower_quantile: float = 0, + clip_advantage_upper_quantile: float = 1, + norm_adv: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + + # Whether to normalize advantages within batch + self.norm_adv = norm_adv + + # Clipping value for policy loss + self.clip_ploss_coef = clip_ploss_coef + self.clip_ploss_coef_base = clip_ploss_coef_base + self.clip_ploss_coef_rate = clip_ploss_coef_rate + + # Clipping value for value loss + self.clip_vloss_coef = clip_vloss_coef + + # Discount factor for diffusion MDP + self.gamma_denoising = gamma_denoising + + # Quantiles for clipping advantages + self.clip_advantage_lower_quantile = clip_advantage_lower_quantile + self.clip_advantage_upper_quantile = clip_advantage_upper_quantile + + def loss( + self, + obs, + chains_prev, + chains_next, + denoising_inds, + returns, + oldvalues, + advantages, + oldlogprobs, + use_bc_loss=False, + reward_horizon=4, + ): + """ + PPO loss + + obs: dict with key state/rgb; more recent obs at the end + state: (B, To, Do) + rgb: (B, To, C, H, W) + chains: (B, K+1, Ta, Da) + returns: (B, ) + values: (B, ) + advantages: (B,) + oldlogprobs: (B, K, Ta, Da) + use_bc_loss: whether to add BC regularization loss + reward_horizon: action horizon that backpropagates gradient + """ + # Get new logprobs for denoising steps from T-1 to 0 - entropy is fixed fod diffusion + # repeat the obs for each denoising step + B = chains_next.shape[0] + obs_repeat = { + "state": obs["state"].repeat_interleave(self.ft_denoising_steps, dim=0) + } + denoising_inds = denoising_inds.repeat_interleave(chains_prev.shape[0]) + + # flatten the chains along the first and second dim + chains_prev = chains_prev.view(-1, *chains_prev.shape[2:]) + chains_next = chains_next.view(-1, *chains_next.shape[2:]) + + # get the logprobs for all denosing steps + newlogprobs, eta = self.get_logprobs_subsample( + obs_repeat, + chains_prev, + chains_next, + denoising_inds, + get_ent=True, + ) + + entropy_loss = -eta.mean() + newlogprobs = newlogprobs.clamp(min=-5, max=2) + oldlogprobs = oldlogprobs.clamp(min=-5, max=2) + + # expand newlogprobs to shape (B, K, Ta, Da) and sum along K + newlogprobs = newlogprobs.view( + -1, self.ft_denoising_steps, *newlogprobs.shape[1:] + ) + newlogprobs = newlogprobs.sum(dim=1) + + # only backpropagate through the earlier steps (e.g., ones actually executed in the environment) + newlogprobs = newlogprobs[:, :reward_horizon, :] + oldlogprobs = oldlogprobs[:, :reward_horizon, :] + + # Get the logprobs - batch over B and denoising steps + newlogprobs = newlogprobs.mean(dim=(-1, -2)).view(-1) + oldlogprobs = oldlogprobs.mean(dim=(-1, -2)).view(-1) + + bc_loss = 0 + if use_bc_loss: + # See Eqn. 2 of https://arxiv.org/pdf/2403.03949.pdf + # Give a reward for maximizing probability of teacher policy's action with current policy. + # Actions are chosen along trajectory induced by current policy. 
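            # That is: sample denoising chains from the frozen base (teacher) policy for the
            # same observations, evaluate their log-probability under the current fine-tuned
            # policy, and penalize -E[log pi_theta(a_teacher | s)] as a regularizer
            # (weighted by bc_loss_coeff in the training agent).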
+ + # Get counterfactual teacher actions + samples = self.forward( + cond=obs, + deterministic=False, + return_chain=True, + use_base_policy=True, + ) + # Get logprobs of teacher actions under this policy + bc_logprobs = self.get_logprobs( + obs, + samples.chains, + get_ent=False, + use_base_policy=False, + ) + bc_logprobs = bc_logprobs.clamp(min=-5, max=2) + bc_logprobs = bc_logprobs.mean(dim=(-1, -2)).view(-1) + bc_loss = -bc_logprobs.mean() + + # normalize advantages + if self.norm_adv: + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + # Clip advantages by 5th and 95th percentile + advantage_min = torch.quantile(advantages, self.clip_advantage_lower_quantile) + advantage_max = torch.quantile(advantages, self.clip_advantage_upper_quantile) + advantages = advantages.clamp(min=advantage_min, max=advantage_max) + + # denoising discount + # discount = torch.tensor( + # [ + # self.gamma_denoising ** (self.ft_denoising_steps - i - 1) + # for i in denoising_inds + # ] + # ).to(self.device) + # advantages *= discount + + # get ratio + logratio = newlogprobs - oldlogprobs + ratio = logratio.exp() + + # exponentially interpolate between the base and the current clipping value over denoising steps and repeat + t = (denoising_inds.float() / (self.ft_denoising_steps - 1)).to(self.device) + t = t[ + -B: + ] # take the last B elements (pretend we're at the last denoising step) + if self.ft_denoising_steps > 1: + clip_ploss_coef = self.clip_ploss_coef_base + ( + self.clip_ploss_coef - self.clip_ploss_coef_base + ) * (torch.exp(self.clip_ploss_coef_rate * t) - 1) / ( + math.exp(self.clip_ploss_coef_rate) - 1 + ) + else: + clip_ploss_coef = t + + # get kl difference and whether value clipped + with torch.no_grad(): + # old_approx_kl: the approximate Kullback–Leibler divergence, measured by (-logratio).mean(), which corresponds to the k1 estimator in John Schulman’s blog post on approximating KL http://joschu.net/blog/kl-approx.html + # approx_kl: better alternative to old_approx_kl measured by (logratio.exp() - 1) - logratio, which corresponds to the k3 estimator in approximating KL http://joschu.net/blog/kl-approx.html + # old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clipfrac = ((ratio - 1.0).abs() > clip_ploss_coef).float().mean().item() + + # Policy loss with clipping + pg_loss1 = -advantages * ratio + pg_loss2 = -advantages * torch.clamp( + ratio, 1 - clip_ploss_coef, 1 + clip_ploss_coef + ) + pg_loss = torch.max(pg_loss1, pg_loss2).mean() + + # Value loss optionally with clipping + newvalues = self.critic(obs).view(-1) + if self.clip_vloss_coef is not None: + v_loss_unclipped = (newvalues - returns) ** 2 + v_clipped = oldvalues + torch.clamp( + newvalues - oldvalues, + -self.clip_vloss_coef, + self.clip_vloss_coef, + ) + v_loss_clipped = (v_clipped - returns) ** 2 + v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) + v_loss = 0.5 * v_loss_max.mean() + else: + v_loss = 0.5 * ((newvalues - returns) ** 2).mean() + return ( + pg_loss, + entropy_loss, + v_loss, + clipfrac, + approx_kl.item(), + ratio.mean().item(), + bc_loss, + eta.mean().item(), + ) From 2938fdd5757dca28312e280b37722bbf448d594e Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Fri, 22 Nov 2024 19:52:28 -0500 Subject: [PATCH 7/7] minor --- model/diffusion/diffusion_ppo_sumlikelihood.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/model/diffusion/diffusion_ppo_sumlikelihood.py b/model/diffusion/diffusion_ppo_sumlikelihood.py index c9f4b952..1cd331de 100644 --- a/model/diffusion/diffusion_ppo_sumlikelihood.py +++ b/model/diffusion/diffusion_ppo_sumlikelihood.py @@ -1,5 +1,5 @@ """ -DPPO: Diffusion Policy Policy Optimization. +DPPO: Diffusion Policy Policy Optimization with summed likelihood. K: number of denoising steps To: observation sequence length @@ -81,7 +81,6 @@ def loss( use_bc_loss: whether to add BC regularization loss reward_horizon: action horizon that backpropagates gradient """ - # Get new logprobs for denoising steps from T-1 to 0 - entropy is fixed fod diffusion # repeat the obs for each denoising step B = chains_next.shape[0] obs_repeat = {