From a99b828bfa9b48501c0eef135b7d846dcd5c3d7b Mon Sep 17 00:00:00 2001 From: "Justin M. Lidard" Date: Thu, 14 Nov 2024 21:01:37 -0500 Subject: [PATCH 1/7] add diffusion versions of rlpd/ibrl/cal-ql --- agent/finetune/train_calql_diffusion_agent.py | 478 ++++++++++++++++++ agent/finetune/train_ibrl_diffusion_agent.py | 354 +++++++++++++ agent/finetune/train_rlpd_diffusion_agent.py | 381 ++++++++++++++ .../can/calql_diffusion_mlp_online.yaml | 121 +++++ .../finetune/can/ibrl_diffusion_mlp.yaml | 119 +++++ .../square/calql_diffusion_mlp_online.yaml | 122 +++++ .../finetune/square/ibrl_diffusion_mlp.yaml | 120 +++++ .../can/calql_diffusion_mlp_offline.yaml | 117 +++++ .../square/calql_diffusion_mlp_offline.yaml | 118 +++++ .../scratch/can/rlpd_diffusion_mlp.yaml | 116 +++++ .../scratch/square/rlpd_diffusion_mlp.yaml | 117 +++++ model/diffusion/diffusion_calql.py | 223 ++++++++ model/diffusion/diffusion_ibrl.py | 286 +++++++++++ model/diffusion/diffusion_rlpd.py | 152 ++++++ 14 files changed, 2824 insertions(+) create mode 100644 agent/finetune/train_calql_diffusion_agent.py create mode 100644 agent/finetune/train_ibrl_diffusion_agent.py create mode 100644 agent/finetune/train_rlpd_diffusion_agent.py create mode 100644 cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml create mode 100644 cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml create mode 100644 cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml create mode 100644 cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml create mode 100644 cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml create mode 100644 cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml create mode 100644 cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml create mode 100644 cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml create mode 100644 model/diffusion/diffusion_calql.py create mode 100644 model/diffusion/diffusion_ibrl.py create mode 100644 model/diffusion/diffusion_rlpd.py diff --git a/agent/finetune/train_calql_diffusion_agent.py b/agent/finetune/train_calql_diffusion_agent.py new file mode 100644 index 00000000..78a364ff --- /dev/null +++ b/agent/finetune/train_calql_diffusion_agent.py @@ -0,0 +1,478 @@ +""" +Reinforcement Learning with Prior Data (RLPD) agent training script. + +Does not support image observations right now. 
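+
+This agent implements Cal-QL (Calibrated Q-Learning) with a diffusion policy: the
+offline phase calibrates the conservative critic loss with Monte-Carlo reward-to-go,
+and the online phase mixes offline and online replay samples 50/50 in each batch.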
+""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainCalQLDiffusionAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + assert self.n_envs == 1, "Cal-QL only supports single env for now" + + # Train mode (offline or online) + self.train_online = cfg.train.train_online + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.critic.parameters(), + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Number of random actions to sample for Cal-QL + self.n_random_actions = cfg.train.n_random_actions + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic updates + self.num_update = cfg.train.num_update + + # Buffer size + self.buffer_size = cfg.train.buffer_size + + # Online only configs + if self.train_online: + # number of episode to colect per epoch for training + self.n_episode_per_epoch = cfg.train.n_episode_per_epoch + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + reward_to_go_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) + if not self.train_online: + obs_array = np.array(obs_buffer) + next_obs_array = np.array(next_obs_buffer) + actions_array = np.array(action_buffer) + rewards_array = np.array(reward_buffer) + reward_to_go_array = np.array(reward_to_go_buffer) + terminated_array = np.array(terminated_buffer) + + # load offline dataset into replay buffer + dataloader_offline = torch.utils.data.DataLoader( + self.dataset_offline, + batch_size=len(self.dataset_offline), + drop_last=False, + ) + for batch in dataloader_offline: + actions, states_and_next, rewards, terminated, reward_to_go = batch + states = states_and_next["state"] + next_states = 
states_and_next["next_state"] + obs_buffer_off = states.cpu().numpy() + next_obs_buffer_off = next_states.cpu().numpy() + action_buffer_off = actions.cpu().numpy() + reward_buffer_off = rewards.cpu().numpy().flatten() + reward_to_go_buffer_off = reward_to_go.cpu().numpy().flatten() + terminated_buffer_off = terminated.cpu().numpy().flatten() + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr >= self.n_explore_steps + and not self.force_train + ) + # during eval, we collect a fixed number of episodes, so we set n_steps to a large value + if eval_mode: + n_steps = int(1e5) + elif not self.train_online: + n_steps = 0 + else: + n_steps = int(1e5) # use episodes + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((n_steps, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for step in range(n_steps): + if step % 100 == 0: + print(f"Completed environment step {step}") + + # Select action + if self.itr < self.n_explore_steps: + action_venv = self.venv.action_space.sample() + else: + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv + + # add to buffer in train mode + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + if truncated_venv[i]: + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: # first obs in new episode + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= 
self.n_eval_episode: + break + if not eval_mode and cnt_episode >= self.n_episode_per_epoch: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. + episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + + # compute episode returns + returns_trajs_split = [ + np.zeros_like(reward_trajs) for reward_trajs in reward_trajs_split + ] + for traj_rewards, traj_returns in zip( + reward_trajs_split, returns_trajs_split + ): + prev_return = 0 + for t in range(len(traj_rewards)): + traj_returns[-t - 1] = ( + traj_rewards[-t - 1] + self.gamma * prev_return + ) + prev_return = traj_returns[-t - 1] + + # flatten (note: only works for single env!) + returns_trajs_split = np.concatenate(returns_trajs_split) + + # extend buffer + reward_to_go_buffer.extend(returns_trajs_split) + + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if not eval_mode and self.itr >= self.n_explore_steps: + # TODO: is this slow in online? 
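+                # Snapshot the online FIFO deques as numpy arrays once per iteration so
+                # the update loop below can index them with np.random.choice.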
+ if self.train_online: + obs_array = np.array(obs_buffer) + next_obs_array = np.array(next_obs_buffer) + actions_array = np.array(action_buffer) + rewards_array = np.array(reward_buffer) + reward_to_go_array = np.array(reward_to_go_buffer) + terminated_array = np.array(terminated_buffer) + + # override num_update + if self.train_online: + # the amount of new transitions(single env) + num_update = len(reward_trajs_split[0]) + else: + num_update = self.num_update + for _ in range(num_update): + # Sample from OFFLINE buffer + inds = np.random.choice( + len(obs_buffer_off), + self.batch_size // 2 if self.train_online else self.batch_size, + ) + obs_b = ( + torch.from_numpy(obs_buffer_off[inds]).float().to(self.device) + ) + next_obs_b = ( + torch.from_numpy(next_obs_buffer_off[inds]) + .float() + .to(self.device) + ) + actions_b = ( + torch.from_numpy(action_buffer_off[inds]) + .float() + .to(self.device) + ) + rewards_b = ( + torch.from_numpy(reward_buffer_off[inds]) + .float() + .to(self.device) + ) + terminated_b = ( + torch.from_numpy(terminated_buffer_off[inds]) + .float() + .to(self.device) + ) + reward_to_go_b = ( + torch.from_numpy(reward_to_go_buffer_off[inds]) + .float() + .to(self.device) + ) + + # Sample from ONLINE buffer + if self.train_online: + inds = np.random.choice(len(obs_buffer), self.batch_size // 2) + obs_b_on = ( + torch.from_numpy(obs_array[inds]).float().to(self.device) + ) + next_obs_b_on = ( + torch.from_numpy(next_obs_array[inds]) + .float() + .to(self.device) + ) + actions_b_on = ( + torch.from_numpy(actions_array[inds]) + .float() + .to(self.device) + ) + rewards_b_on = ( + torch.from_numpy(rewards_array[inds]) + .float() + .to(self.device) + ) + terminated_b_on = ( + torch.from_numpy(terminated_array[inds]) + .float() + .to(self.device) + ) + reward_to_go_b_on = ( + torch.from_numpy(reward_to_go_array[inds]) + .float() + .to(self.device) + ) + + # merge offline and online data + obs_b = torch.cat([obs_b, obs_b_on], dim=0) + next_obs_b = torch.cat([next_obs_b, next_obs_b_on], dim=0) + actions_b = torch.cat([actions_b, actions_b_on], dim=0) + rewards_b = torch.cat([rewards_b, rewards_b_on], dim=0) + terminated_b = torch.cat([terminated_b, terminated_b_on], dim=0) + reward_to_go_b = torch.cat( + [reward_to_go_b, reward_to_go_b_on], dim=0 + ) + + # Get a random action for Cal-QL + random_actions = ( + torch.rand( + ( + self.batch_size, + self.n_random_actions, + self.horizon_steps, + self.action_dim, + ) + ).to(self.device) + * 2 + - 1 + ) # scale to [-1, 1] + + # Update critic + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + random_actions, + rewards_b, + reward_to_go_b, + terminated_b, + self.gamma, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic + self.model.update_target_critic(self.target_ema_rate) + + # Update actor + loss_actor = self.model.loss_actor( + {"state": obs_b}, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr >= self.n_explore_steps: + time = timer() + run_results[-1]["time"] = time + if eval_mode: + log.info( + f"eval: success 
rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/agent/finetune/train_ibrl_diffusion_agent.py b/agent/finetune/train_ibrl_diffusion_agent.py new file mode 100644 index 00000000..27e4580b --- /dev/null +++ b/agent/finetune/train_ibrl_diffusion_agent.py @@ -0,0 +1,354 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) agent training script. + +Does not support image observations right now. +""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainIBRLDiffusionAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.ensemble_params.values(), # https://github.com/pytorch/pytorch/issues/120581 + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic updates + self.critic_num_update = cfg.train.critic_num_update + + # Update frequency + self.update_freq = cfg.train.update_freq + + # Buffer size + self.buffer_size = 
cfg.train.buffer_size + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) + + # load offline dataset into replay buffer + dataloader_offline = torch.utils.data.DataLoader( + self.dataset_offline, + batch_size=len(self.dataset_offline), + drop_last=False, + ) + for batch in dataloader_offline: + actions, states_and_next, rewards, terminated = batch + states = states_and_next["state"] + next_states = states_and_next["next_state"] + obs_buffer.extend(states.cpu().numpy()) + next_obs_buffer.extend(next_states.cpu().numpy()) + action_buffer.extend(actions.cpu().numpy()) + reward_buffer.extend(rewards.cpu().numpy().flatten()) + terminated_buffer.extend(terminated.cpu().numpy().flatten()) + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr > self.n_explore_steps + and not self.force_train + ) + n_steps = ( + self.n_steps if not eval_mode else int(1e5) + ) # large number for eval mode + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((n_steps, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for step in range(n_steps): + # Select action + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv + + # add to buffer in train mode + if not eval_mode: + for i in range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + if "final_obs" in info_venv[i]: # truncated + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: # first obs in new 
episode + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= self.n_eval_episode: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. + episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if ( + not eval_mode + and self.itr > self.n_explore_steps + and self.itr % self.update_freq == 0 + ): + # Update critic more frequently + for _ in range(self.critic_num_update): + # Sample from online buffer + inds = np.random.choice(len(obs_buffer), self.batch_size) + obs_b = ( + torch.from_numpy(np.array([obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + next_obs_b = ( + torch.from_numpy(np.array([next_obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + actions_b = ( + torch.from_numpy(np.array([action_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + rewards_b = ( + torch.from_numpy(np.array([reward_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + terminated_b = ( + torch.from_numpy(np.array([terminated_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + rewards_b, + terminated_b, + self.gamma, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic every critic update + self.model.update_target_critic(self.target_ema_rate) + + # Update actor once with the final batch + loss_actor = self.model.loss_actor( + {"state": obs_b}, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + self.actor_optimizer.step() + + # Update target actor + self.model.update_target_actor(self.target_ema_rate) + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": 
cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps: + time = timer() + run_results[-1]["time"] = time + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/agent/finetune/train_rlpd_diffusion_agent.py b/agent/finetune/train_rlpd_diffusion_agent.py new file mode 100644 index 00000000..31587d76 --- /dev/null +++ b/agent/finetune/train_rlpd_diffusion_agent.py @@ -0,0 +1,381 @@ +""" +Reinforcement Learning with Prior Data (RLPD) agent training script. + +Does not support image observations right now. +""" + +import os +import pickle +import numpy as np +import torch +import logging +import wandb +import hydra +from collections import deque + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_agent import TrainAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainRLPDDiffusionAgent(TrainAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Build dataset + self.dataset_offline = hydra.utils.instantiate(cfg.offline_dataset) + + # note the discount factor gamma here is applied to reward every act_steps, instead of every env step + self.gamma = cfg.train.gamma + + # Optimizer + self.actor_optimizer = torch.optim.AdamW( + self.model.network.parameters(), + lr=cfg.train.actor_lr, + weight_decay=cfg.train.actor_weight_decay, + ) + self.actor_lr_scheduler = CosineAnnealingWarmupRestarts( + self.actor_optimizer, + first_cycle_steps=cfg.train.actor_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.actor_lr, + min_lr=cfg.train.actor_lr_scheduler.min_lr, + warmup_steps=cfg.train.actor_lr_scheduler.warmup_steps, + gamma=1.0, + ) + self.critic_optimizer = torch.optim.AdamW( + self.model.ensemble_params.values(), # https://github.com/pytorch/pytorch/issues/120581 + lr=cfg.train.critic_lr, + weight_decay=cfg.train.critic_weight_decay, + ) + self.critic_lr_scheduler = CosineAnnealingWarmupRestarts( + self.critic_optimizer, + first_cycle_steps=cfg.train.critic_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.critic_lr, + min_lr=cfg.train.critic_lr_scheduler.min_lr, + warmup_steps=cfg.train.critic_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + # Perturbation scale + self.target_ema_rate = cfg.train.target_ema_rate + + # Reward scale + self.scale_reward_factor = cfg.train.scale_reward_factor + + # Number of critic 
updates + self.critic_num_update = cfg.train.critic_num_update + + # Buffer size + self.buffer_size = cfg.train.buffer_size + + # Eval episodes + self.n_eval_episode = cfg.train.n_eval_episode + + # Exploration steps at the beginning - using randomly sampled action + self.n_explore_steps = cfg.train.n_explore_steps + + def run(self): + # make a FIFO replay buffer for obs, action, and reward + obs_buffer = deque(maxlen=self.buffer_size) + next_obs_buffer = deque(maxlen=self.buffer_size) + action_buffer = deque(maxlen=self.buffer_size) + reward_buffer = deque(maxlen=self.buffer_size) + terminated_buffer = deque(maxlen=self.buffer_size) + + # load offline dataset into replay buffer + dataloader_offline = torch.utils.data.DataLoader( + self.dataset_offline, + batch_size=len(self.dataset_offline), + drop_last=False, + ) + for batch in dataloader_offline: + actions, states_and_next, rewards, terminated = batch + states = states_and_next["state"] + next_states = states_and_next["next_state"] + obs_buffer_off = states.cpu().numpy() + next_obs_buffer_off = next_states.cpu().numpy() + action_buffer_off = actions.cpu().numpy() + reward_buffer_off = rewards.cpu().numpy().flatten() + terminated_buffer_off = terminated.cpu().numpy().flatten() + + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + if self.itr % 1000 == 0: + print(f"Finished training iteration {self.itr} of {self.n_train_itr}") + + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = ( + self.itr % self.val_freq == 0 + and self.itr >= self.n_explore_steps + and not self.force_train + ) + n_steps = ( + self.n_steps if not eval_mode else int(1e5) + ) # large number for eval mode + self.model.eval() if eval_mode else self.model.train() + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) at the beginning + firsts_trajs = np.zeros((n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or self.itr == 0: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, then the envs are just reset + firsts_trajs[0] = done_venv + reward_trajs = np.zeros((n_steps, self.n_envs)) + + # Collect a set of trajectories from env + cnt_episode = 0 + for step in range(n_steps): + # Select action + if self.itr < self.n_explore_steps: + action_venv = self.venv.action_space.sample() + else: + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = ( + self.model( + cond=cond, + deterministic=eval_mode, + ) + .cpu() + .numpy() + ) # n_env x horizon x act + action_venv = samples[:, : self.act_steps] + + # Apply multi-step action + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) + done_venv = terminated_venv | truncated_venv + reward_trajs[step] = reward_venv + firsts_trajs[step + 1] = done_venv + + # add to buffer in train mode + if not eval_mode: + for i in 
range(self.n_envs): + obs_buffer.append(prev_obs_venv["state"][i]) + if truncated_venv[i]: + next_obs_buffer.append(info_venv[i]["final_obs"]["state"]) + else: + next_obs_buffer.append(obs_venv["state"][i]) + action_buffer.append(action_venv[i]) + reward_buffer.extend( + (reward_venv * self.scale_reward_factor).tolist() + ) + terminated_buffer.extend(terminated_venv.tolist()) + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # check if enough eval episodes are done + cnt_episode += np.sum(done_venv) + if eval_mode and cnt_episode >= self.n_eval_episode: + break + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. + episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + + # Update models + if not eval_mode and self.itr >= self.n_explore_steps: + # Update critic more frequently + for _ in range(self.critic_num_update): + # Sample from OFFLINE buffer + inds = np.random.choice(len(obs_buffer_off), self.batch_size // 2) + obs_b_off = ( + torch.from_numpy(obs_buffer_off[inds]).float().to(self.device) + ) + next_obs_b_off = ( + torch.from_numpy(next_obs_buffer_off[inds]) + .float() + .to(self.device) + ) + actions_b_off = ( + torch.from_numpy(action_buffer_off[inds]) + .float() + .to(self.device) + ) + rewards_b_off = ( + torch.from_numpy(reward_buffer_off[inds]) + .float() + .to(self.device) + ) + terminated_b_off = ( + torch.from_numpy(terminated_buffer_off[inds]) + .float() + .to(self.device) + ) + + # Sample from ONLINE buffer + inds = np.random.choice(len(obs_buffer), self.batch_size // 2) + obs_b_on = ( + torch.from_numpy(np.array([obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + next_obs_b_on = ( + torch.from_numpy(np.array([next_obs_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + actions_b_on = ( + torch.from_numpy(np.array([action_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + rewards_b_on = ( + torch.from_numpy(np.array([reward_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + terminated_b_on = ( + torch.from_numpy(np.array([terminated_buffer[i] for i in inds])) + .float() + .to(self.device) + ) + + # merge offline and online data + obs_b = torch.cat([obs_b_off, obs_b_on], dim=0) + next_obs_b = torch.cat([next_obs_b_off, next_obs_b_on], dim=0) + actions_b = torch.cat([actions_b_off, 
actions_b_on], dim=0) + rewards_b = torch.cat([rewards_b_off, rewards_b_on], dim=0) + terminated_b = torch.cat([terminated_b_off, terminated_b_on], dim=0) + + # Update critic + loss_critic = self.model.loss_critic( + {"state": obs_b}, + {"state": next_obs_b}, + actions_b, + rewards_b, + terminated_b, + self.gamma, + ) + self.critic_optimizer.zero_grad() + loss_critic.backward() + self.critic_optimizer.step() + + # Update target critic every critic update + self.model.update_target_critic(self.target_ema_rate) + + # Update actor once with the final batch + loss_actor = self.model.loss_actor( + {"state": obs_b}, + ) + self.actor_optimizer.zero_grad() + loss_actor.backward() + + # Update lr + self.actor_lr_scheduler.step() + self.critic_lr_scheduler.step() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.itr % self.log_freq == 0 and self.itr > self.n_explore_steps: + time = timer() + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss actor {loss_actor:8.4f} | loss critic {loss_critic:8.4f} | reward {avg_episode_reward:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss - actor": loss_actor, + "loss - critic": loss_critic, + "entropy coeff": 0, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml new file mode 100644 index 00000000..bf35e0a6 --- /dev/null +++ b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml @@ -0,0 +1,121 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 
'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 3 # 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml new file mode 100644 index 00000000..85d603ae --- /dev/null +++ b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml @@ -0,0 +1,119 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_diffusion_agent.TrainIBRLDiffusionAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + 
reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ibrl-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.diffusion.diffusion_ibrl.IBRL_Diffusion + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml new file mode 100644 index 00000000..ad5dce08 --- /dev/null +++ b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml @@ -0,0 +1,122 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 
+ actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 3 # 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml new file mode 100644 index 00000000..82043b23 --- /dev/null +++ b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml @@ -0,0 +1,120 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_diffusion_agent.TrainIBRLDiffusionAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ibrl-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + 
first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.diffusion.diffusion_ibrl.IBRL_Diffusion + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml new file mode 100644 index 00000000..4afaf5d7 --- /dev/null +++ b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + 
scale_reward_factor: 1.0 + num_update: 3 # 1000 + buffer_size: 1000000 + n_eval_episode: 3 # 10 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml new file mode 100644 index 00000000..653cba85 --- /dev/null +++ b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml @@ -0,0 +1,118 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 3 # 1000 + buffer_size: 1000000 + n_eval_episode: 3 # 10 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: 
model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml new file mode 100644 index 00000000..6dc91eca --- /dev/null +++ b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml @@ -0,0 +1,116 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-rlpd-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 0 + +model: + _target_: model.diffusion.diffusion_rlpd.RLPD_Diffusion + randn_clip_value: 10 + backup_entropy: False + n_critics: 5 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + 
_target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml new file mode 100644 index 00000000..7567a57a --- /dev/null +++ b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-rlpd-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 3 # 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 0 + +model: + _target_: model.diffusion.diffusion_rlpd.RLPD_Diffusion + randn_clip_value: 10 + backup_entropy: False + n_critics: 5 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + 
horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/model/diffusion/diffusion_calql.py b/model/diffusion/diffusion_calql.py new file mode 100644 index 00000000..31b486d1 --- /dev/null +++ b/model/diffusion/diffusion_calql.py @@ -0,0 +1,223 @@ +""" +Calibrated Conservative Q-Learning (CalQL) for Gaussian policy. + +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy +import numpy as np +import einops + +from model.diffusion.diffusion_rwr import RWRDiffusion +from model.diffusion.sampling import make_timesteps + +log = logging.getLogger(__name__) + + +class CalQL_Diffusion(RWRDiffusion): + def __init__( + self, + actor, + critic, + network_path=None, + cql_clip_diff_min=-np.inf, + cql_clip_diff_max=np.inf, + cql_min_q_weight=5.0, + cql_n_actions=10, + **kwargs, + ): + super().__init__(network=actor, network_path=None, **kwargs) + self.cql_clip_diff_min = cql_clip_diff_min + self.cql_clip_diff_max = cql_clip_diff_max + self.cql_min_q_weight = cql_min_q_weight + self.cql_n_actions = cql_n_actions + + # initialize critic networks + self.critic = critic.to(self.device) + self.target_critic = deepcopy(critic).to(self.device) + + # Load pre-trained checkpoint - note we are also loading the pre-trained critic here + if network_path is not None: + checkpoint = torch.load( + network_path, + map_location=self.device, + weights_only=True, + ) + self.load_state_dict( + checkpoint["model"], + strict=True, + ) + log.info("Loaded actor from %s", network_path) + log.info( + f"Number of network parameters: {sum(p.numel() for p in self.parameters())}" + ) + + def loss_critic( + self, + obs, + next_obs, + actions, + random_actions, + rewards, + returns, + terminated, + gamma, + ): + B = len(actions) + + # Get initial TD loss + q_data1, q_data2 = self.critic(obs, actions) + with torch.no_grad(): + # repeat for action samples + next_obs_repeated = { + "state": next_obs["state"].repeat_interleave(self.cql_n_actions, dim=0) + } + + # Get the next actions + next_actions = self.forward( + next_obs_repeated, + deterministic=False, + ) + next_q1, next_q2 = self.target_critic(next_obs_repeated, next_actions) + next_q = torch.min(next_q1, next_q2) + + # Reshape the next_q to match the number of samples + next_q = next_q.view(B, self.cql_n_actions) # (B, n_sample) + + # Get the max indices over the samples, and index into the next_q + max_idx = torch.argmax(next_q, dim=1) + next_q = next_q[torch.arange(B), max_idx] + + # Get the target Q values + target_q = rewards + gamma * (1 - terminated) * next_q + + # TD loss + td_loss_1 = nn.functional.mse_loss(q_data1, target_q) + td_loss_2 = nn.functional.mse_loss(q_data2, target_q) + + # Get actions + pi_actions = self.forward( + obs, + deterministic=False, + ) # no gradient + pi_next_actions = self.forward( + next_obs, + deterministic=False, + ) # no gradient + + # Random action Q values + n_random_actions = random_actions.shape[1] + obs_sample_state = { + "state": obs["state"].repeat_interleave(n_random_actions, dim=0) + } + random_actions = einops.rearrange(random_actions, "B N H A -> (B N) H A") + + # Get the random action Q-values + q_rand_1, q_rand_2 = self.critic(obs_sample_state, random_actions) + 
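        # CQL conservative term: the Q-values of these uniformly sampled random
        # actions (reshaped below to (B, n_random_actions)) are concatenated with
        # the Q-values of policy actions at s and s', and a logsumexp over that set
        # forms a soft maximum over out-of-distribution actions, which is penalized
        # relative to the Q-values of the dataset actions (q_data1, q_data2).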
q_rand_1 = q_rand_1 + q_rand_2 = q_rand_2 + + # Reshape the random action Q values to match the number of samples + q_rand_1 = q_rand_1.view(B, n_random_actions) # (n_sample, B) + q_rand_2 = q_rand_2.view(B, n_random_actions) + + # Policy action Q values + q_pi_1, q_pi_2 = self.critic(obs, pi_actions) + q_pi_next_1, q_pi_next_2 = self.critic(next_obs, pi_next_actions) + + # Ensure calibration w.r.t. value function estimate + q_pi_1 = torch.max(q_pi_1, returns)[:, None] # (B, 1) + q_pi_2 = torch.max(q_pi_2, returns)[:, None] # (B, 1) + q_pi_next_1 = torch.max(q_pi_next_1, returns)[:, None] # (B, 1) + q_pi_next_2 = torch.max(q_pi_next_2, returns)[:, None] # (B, 1) + + # cql_importance_sample + q_pi_1 = q_pi_1 + q_pi_2 = q_pi_2 + q_pi_next_1 = q_pi_next_1 + q_pi_next_2 = q_pi_next_2 + cat_q_1 = torch.cat( + [q_rand_1, q_pi_1, q_pi_next_1], dim=-1 + ) # (B, num_samples+1) + cql_qf1_ood = torch.logsumexp(cat_q_1, dim=-1) # max over num_samples + cat_q_2 = torch.cat( + [q_rand_2, q_pi_2, q_pi_next_2], dim=-1 + ) # (B, num_samples+1) + cql_qf2_ood = torch.logsumexp(cat_q_2, dim=-1) # sum over num_samples + + # skip cal_lagrange since the paper shows cql_target_action_gap not used in kitchen + + # Subtract the log likelihood of the data + cql_qf1_diff = torch.clamp( + cql_qf1_ood - q_data1, + min=self.cql_clip_diff_min, + max=self.cql_clip_diff_max, + ).mean() + cql_qf2_diff = torch.clamp( + cql_qf2_ood - q_data2, + min=self.cql_clip_diff_min, + max=self.cql_clip_diff_max, + ).mean() + cql_min_qf1_loss = cql_qf1_diff * self.cql_min_q_weight + cql_min_qf2_loss = cql_qf2_diff * self.cql_min_q_weight + + # Sum the two losses + critic_loss = td_loss_1 + td_loss_2 + cql_min_qf1_loss + cql_min_qf2_loss + return critic_loss + + def loss_actor(self, obs): + action = self.forward_train( + obs, + deterministic=False, + ) + q1, q2 = self.critic(obs, action) + actor_loss = -torch.min(q1, q2) + return actor_loss.mean() + + def update_target_critic(self, tau): + for target_param, param in zip( + self.target_critic.parameters(), self.critic.parameters() + ): + target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) + + def forward_train( + self, + cond, + deterministic=False, + ): + """ + Differentiable forward pass used in actor training. + """ + device = self.betas.device + B = len(cond["state"]) + + # Loop + x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) + t_all = list(reversed(range(self.denoising_steps))) + for i, t in enumerate(t_all): + t_b = make_timesteps(B, t, device) + mean, logvar = self.p_mean_var( + x=x, + t=t_b, + cond=cond, + ) + std = torch.exp(0.5 * logvar) + + # Determine the noise level + if deterministic and t == 0: + std = torch.zeros_like(std) + elif deterministic: # For DDPM, sample with noise + std = torch.clip(std, min=1e-3) + else: + std = torch.clip(std, min=self.min_sampling_denoising_std) + noise = torch.randn_like(x).clamp_( + -self.randn_clip_value, self.randn_clip_value + ) + x = mean + std * noise + + # clamp action at final step + if self.final_action_clip_value and i == len(t_all) - 1: + x = torch.clamp(x, -1, 1) + return x diff --git a/model/diffusion/diffusion_ibrl.py b/model/diffusion/diffusion_ibrl.py new file mode 100644 index 00000000..51d545b8 --- /dev/null +++ b/model/diffusion/diffusion_ibrl.py @@ -0,0 +1,286 @@ +""" +Imitation Bootstrapped Reinforcement Learning (IBRL) for Diffusion policy. 
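Keeps a frozen copy of the pre-trained (BC) diffusion policy next to the online RL
policy. At action-selection time both policies propose an action and the one with the
higher ensemble-critic Q-value is executed (greedy, or soft-sampled with probability
increasing in soft_action_sample_beta * Q); the critic target likewise bootstraps from
the better of the two proposals.

A minimal usage sketch (an assumption for illustration: the model block of
cfg/robomimic/finetune/*/ibrl_diffusion_mlp.yaml is built with hydra.utils.instantiate,
and obs holds the stacked low-dim observations, e.g. (B, cond_steps, obs_dim)):

    model = hydra.utils.instantiate(cfg.model)
    action = model(cond={"state": obs}, deterministic=False)  # (B, horizon_steps, action_dim)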
+ +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy + +from model.diffusion.diffusion_rwr import RWRDiffusion +from model.diffusion.sampling import make_timesteps + +log = logging.getLogger(__name__) + + +class IBRL_Diffusion(RWRDiffusion): + def __init__( + self, + actor, + critic, + n_critics, + soft_action_sample=False, + soft_action_sample_beta=10, + **kwargs, + ): + super().__init__(network=actor, **kwargs) + self.soft_action_sample = soft_action_sample + self.soft_action_sample_beta = soft_action_sample_beta + + # Set up target actor + self.target_actor = deepcopy(actor) + + # Frozen pre-trained policy + self.bc_policy = deepcopy(actor) + for param in self.bc_policy.parameters(): + param.requires_grad = False + + # initialize critic networks + self.critic_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.critic_networks = nn.ModuleList(self.critic_networks) + + # initialize target networks + self.target_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.target_networks = nn.ModuleList(self.target_networks) + + # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage. + base_model = deepcopy(self.critic_networks[0]) + self.base_model = base_model.to("meta") + self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( + self.critic_networks + ) + + def critic_wrapper(self, params, buffers, data): + """for vmap""" + return torch.func.functional_call(self.base_model, (params, buffers), data) + + def get_random_indices(self, sz=None, num_ind=2): + """get num_ind random indices from a set of size sz (used for getting critic targets)""" + if sz is None: + sz = len(self.critic_networks) + perm = torch.randperm(sz) + ind = perm[:num_ind].to(self.device) + return ind + + def loss_critic( + self, + obs, + next_obs, + actions, + rewards, + terminated, + gamma, + ): + # get random critic index + q1_ind, q2_ind = self.get_random_indices() + with torch.no_grad(): + next_actions_bc = self.forward_sample( + cond=next_obs, + network_override=self.bc_policy, + ) + next_actions_rl = self.forward_sample( + cond=next_obs, + deterministic=False, + network_override=self.target_actor, + ) + + # get the BC Q value + next_q1_bc = self.target_networks[q1_ind](next_obs, next_actions_bc) + next_q2_bc = self.target_networks[q2_ind](next_obs, next_actions_bc) + next_q_bc = torch.min(next_q1_bc, next_q2_bc) + + # get the RL Q value + next_q1_rl = self.target_networks[q1_ind](next_obs, next_actions_rl) + next_q2_rl = self.target_networks[q2_ind](next_obs, next_actions_rl) + next_q_rl = torch.min(next_q1_rl, next_q2_rl) + + # take the max Q value + next_q = torch.where(next_q_bc > next_q_rl, next_q_bc, next_q_rl) + + # target value + target_q = rewards + gamma * (1 - terminated) * next_q # (B,) + + # run all critics in batch + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, actions) + ) # (n_critics, B) + loss_critic = torch.mean((current_q - target_q[None]) ** 2) + return loss_critic + + def loss_actor(self, obs): + action = self.forward_train( + obs, + deterministic=False, + ) # use online policy only, also IBRL does not use tanh squashing + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, action) + ) # (n_critics, B) + current_q = current_q.min( + dim=0 + ).values 
# unlike RLPD, IBRL uses the min Q value for actor update + loss_actor = -torch.mean(current_q) + return loss_actor + + def update_target_critic(self, tau): + """need to use ensemble_params instead of critic_networks""" + for target_ind, target_critic in enumerate(self.target_networks): + for target_param_name, target_param in target_critic.named_parameters(): + source_param = self.ensemble_params[target_param_name][target_ind] + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def update_target_actor(self, tau): + for target_param, source_param in zip( + self.target_actor.parameters(), self.network.parameters() + ): + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + # ---------- Sampling ----------# + + def forward( + self, + cond, + deterministic=False, + reparameterize=False, + ): + """use both pre-trained and online policies""" + q1_ind, q2_ind = self.get_random_indices() + + # sample an action from the BC policy + bc_action = self.forward_sample( + cond=cond, + deterministic=True, + network_override=self.bc_policy, + ) + + # sample an action from the RL policy + rl_action = super().forward( + cond=cond, + deterministic=deterministic, + ) + + # compute Q value of BC policy + q_bc_1 = self.critic_networks[q1_ind](cond, bc_action) # (B,) + q_bc_2 = self.critic_networks[q2_ind](cond, bc_action) + q_bc = torch.min(q_bc_1, q_bc_2) + + # compute Q value of RL policy + q_rl_1 = self.critic_networks[q1_ind](cond, rl_action) + q_rl_2 = self.critic_networks[q2_ind](cond, rl_action) + q_rl = torch.min(q_rl_1, q_rl_2) + + # soft sample or greedy + if deterministic or not self.soft_action_sample: + action = torch.where( + (q_bc > q_rl)[:, None, None], + bc_action, + rl_action, + ) + else: + # compute the Q weights with probability proportional to exp(\beta * Q(a)) + qw_bc = torch.exp(q_bc * self.soft_action_sample_beta) + qw_rl = torch.exp(q_rl * self.soft_action_sample_beta) + q_weights = torch.softmax( + torch.stack([qw_bc, qw_rl], dim=-1), + dim=-1, + ) + + # sample according to the weights + q_indices = torch.multinomial(q_weights, 1) + action = torch.where( + (q_indices == 0)[:, None], + bc_action, + rl_action, + ) + return action + + # override + @torch.no_grad() + def forward_sample( + self, + cond, + deterministic=False, + network_override=None, + ): + device = cond["state"].device + B = len(cond["state"]) + + # Loop + x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) + t_all = list(reversed(range(self.denoising_steps))) + for i, t in enumerate(t_all): + t_b = make_timesteps(B, t, device) + mean, logvar = self.p_mean_var( + x=x, + t=t_b, + cond=cond, + network_override=network_override, + ) + std = torch.exp(0.5 * logvar) + + # Determine the noise level + if deterministic and t == 0: + std = torch.zeros_like(std) + elif deterministic: + std = torch.clip(std, min=1e-3) + else: + std = torch.clip(std, min=self.min_sampling_denoising_std) + noise = torch.randn_like(x).clamp_( + -self.randn_clip_value, self.randn_clip_value + ) + x = mean + std * noise + + # clamp action at final step + if self.final_action_clip_value is not None and i == len(t_all) - 1: + x = torch.clamp( + x, -self.final_action_clip_value, self.final_action_clip_value + ) + return x + + def forward_train( + self, + cond, + deterministic=False, + ): + """ + Differentiable forward pass used in actor training. 
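        Runs the full reverse-diffusion (DDPM) loop without torch.no_grad so that
        critic gradients can flow through the sampled action back to the actor
        network; the sampling std is clipped from below, and the action is clamped
        to [-1, 1] at the final denoising step when final_action_clip_value is set.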
+ """ + device = self.betas.device + B = len(cond["state"]) + + # Loop + x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) + t_all = list(reversed(range(self.denoising_steps))) + for i, t in enumerate(t_all): + t_b = make_timesteps(B, t, device) + mean, logvar = self.p_mean_var( + x=x, + t=t_b, + cond=cond, + ) + std = torch.exp(0.5 * logvar) + + # Determine the noise level + if deterministic and t == 0: + std = torch.zeros_like(std) + elif deterministic: # For DDPM, sample with noise + std = torch.clip(std, min=1e-3) + else: + std = torch.clip(std, min=self.min_sampling_denoising_std) + noise = torch.randn_like(x).clamp_( + -self.randn_clip_value, self.randn_clip_value + ) + x = mean + std * noise + + # clamp action at final step + if self.final_action_clip_value and i == len(t_all) - 1: + x = torch.clamp(x, -1, 1) + return x diff --git a/model/diffusion/diffusion_rlpd.py b/model/diffusion/diffusion_rlpd.py new file mode 100644 index 00000000..8126b445 --- /dev/null +++ b/model/diffusion/diffusion_rlpd.py @@ -0,0 +1,152 @@ +""" +Reinforcement learning with prior data (RLPD) for Diffusion policy. + +Use ensemble of critics. + +""" + +import torch +import torch.nn as nn +import logging +from copy import deepcopy + +from model.diffusion.diffusion_rwr import RWRDiffusion +from model.diffusion.sampling import make_timesteps + +log = logging.getLogger(__name__) + + +class RLPD_Diffusion(RWRDiffusion): + def __init__( + self, + actor, + critic, + n_critics, + backup_entropy=False, + **kwargs, + ): + super().__init__(network=actor, **kwargs) + self.n_critics = n_critics + self.backup_entropy = backup_entropy + + # initialize critic networks + self.critic_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.critic_networks = nn.ModuleList(self.critic_networks) + + # initialize target networks + self.target_networks = [ + deepcopy(critic).to(self.device) for _ in range(n_critics) + ] + self.target_networks = nn.ModuleList(self.target_networks) + + # Construct a "stateless" version of one of the models. It is "stateless" in the sense that the parameters are meta Tensors and do not have storage. 
+ base_model = deepcopy(self.critic_networks[0]) + self.base_model = base_model.to("meta") + self.ensemble_params, self.ensemble_buffers = torch.func.stack_module_state( + self.critic_networks + ) + + def critic_wrapper(self, params, buffers, data): + """for vmap""" + return torch.func.functional_call(self.base_model, (params, buffers), data) + + def get_random_indices(self, sz=None, num_ind=2): + """get num_ind random indices from a set of size sz (used for getting critic targets)""" + if sz is None: + sz = len(self.critic_networks) + perm = torch.randperm(sz) + ind = perm[:num_ind].to(self.device) + return ind + + def loss_critic( + self, + obs, + next_obs, + actions, + rewards, + terminated, + gamma, + ): + # get random critic index + q1_ind, q2_ind = self.get_random_indices() + with torch.no_grad(): + next_actions = self.forward( + cond=next_obs, + deterministic=False, + ) + next_q1 = self.target_networks[q1_ind](next_obs, next_actions) + next_q2 = self.target_networks[q2_ind](next_obs, next_actions) + next_q = torch.min(next_q1, next_q2) + + # target value + target_q = rewards + gamma * (1 - terminated) * next_q # (B,) + + # run all critics in batch + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, actions) + ) # (n_critics, B) + loss_critic = torch.mean((current_q - target_q[None]) ** 2) + return loss_critic + + def loss_actor(self, obs): + action = self.forward_train( + obs, + deterministic=False, + ) + current_q = torch.vmap(self.critic_wrapper, in_dims=(0, 0, None))( + self.ensemble_params, self.ensemble_buffers, (obs, action) + ) # (n_critics, B) + current_q = current_q.mean(dim=0) + loss_actor = -torch.mean(current_q) + return loss_actor + + def update_target_critic(self, tau): + """need to use ensemble_params instead of critic_networks""" + for target_ind, target_critic in enumerate(self.target_networks): + for target_param_name, target_param in target_critic.named_parameters(): + source_param = self.ensemble_params[target_param_name][target_ind] + target_param.data.copy_( + target_param.data * (1.0 - tau) + source_param.data * tau + ) + + def forward_train( + self, + cond, + deterministic=False, + ): + """ + Differentiable forward pass used in actor training. + """ + device = self.betas.device + B = len(cond["state"]) + + # Loop + x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) + t_all = list(reversed(range(self.denoising_steps))) + for i, t in enumerate(t_all): + t_b = make_timesteps(B, t, device) + mean, logvar = self.p_mean_var( + x=x, + t=t_b, + cond=cond, + ) + std = torch.exp(0.5 * logvar) + + # Determine the noise level + if deterministic and t == 0: + std = torch.zeros_like(std) + elif deterministic: # For DDPM, sample with noise + std = torch.clip(std, min=1e-3) + else: + std = torch.clip(std, min=self.min_sampling_denoising_std) + noise = torch.randn_like(x).clamp_( + -self.randn_clip_value, self.randn_clip_value + ) + x = mean + std * noise + + # clamp action at final step + if self.final_action_clip_value and i == len(t_all) - 1: + x = torch.clamp(x, -1, 1) + return x From 4888a1752271810a124c2fb42eeef38313d9f5b0 Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Thu, 14 Nov 2024 21:03:27 -0500 Subject: [PATCH 2/7] fix typo: --- model/diffusion/diffusion_calql.py | 2 +- model/diffusion/diffusion_ibrl.py | 2 +- model/diffusion/diffusion_rlpd.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/model/diffusion/diffusion_calql.py b/model/diffusion/diffusion_calql.py index 31b486d1..02c7dd50 100644 --- a/model/diffusion/diffusion_calql.py +++ b/model/diffusion/diffusion_calql.py @@ -1,5 +1,5 @@ """ -Calibrated Conservative Q-Learning (CalQL) for Gaussian policy. +Calibrated Conservative Q-Learning (CalQL) for Diffusion Policy. """ diff --git a/model/diffusion/diffusion_ibrl.py b/model/diffusion/diffusion_ibrl.py index 51d545b8..245ed76c 100644 --- a/model/diffusion/diffusion_ibrl.py +++ b/model/diffusion/diffusion_ibrl.py @@ -1,5 +1,5 @@ """ -Imitation Bootstrapped Reinforcement Learning (IBRL) for Diffusion policy. +Imitation Bootstrapped Reinforcement Learning (IBRL) for Diffusion Policy. """ diff --git a/model/diffusion/diffusion_rlpd.py b/model/diffusion/diffusion_rlpd.py index 8126b445..7a49d81f 100644 --- a/model/diffusion/diffusion_rlpd.py +++ b/model/diffusion/diffusion_rlpd.py @@ -1,5 +1,5 @@ """ -Reinforcement learning with prior data (RLPD) for Diffusion policy. +Reinforcement learning with prior data (RLPD) for Diffusion Policy. Use ensemble of critics. From 92d2dac13af312e84dc61b3bcc8093e91e50bd2c Mon Sep 17 00:00:00 2001 From: "Justin M. Lidard" Date: Thu, 14 Nov 2024 21:21:21 -0500 Subject: [PATCH 3/7] config fix --- cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml | 4 ++-- cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml | 2 +- cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml | 4 ++-- cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml | 2 +- cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml | 4 ++-- .../pretrain/square/calql_diffusion_mlp_offline.yaml | 4 ++-- cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml | 2 +- cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml index bf35e0a6..88dc1c64 100644 --- a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml +++ b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml @@ -76,10 +76,10 @@ train: n_random_actions: 4 target_ema_rate: 0.005 scale_reward_factor: 1.0 - num_update: 3 # 1000 + num_update: 1000 buffer_size: 1000000 online_utd_ratio: 1 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 model: diff --git a/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml index 85d603ae..36c6de6d 100644 --- a/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp.yaml @@ -76,7 +76,7 @@ train: scale_reward_factor: 1 critic_num_update: 3 buffer_size: 400000 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 update_freq: 2 diff --git a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml index ad5dce08..29a40dde 100644 --- a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml +++ b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml @@ -76,10 +76,10 @@ train: n_random_actions: 4 target_ema_rate: 0.005 scale_reward_factor: 1.0 - num_update: 3 # 1000 + num_update: 1000 buffer_size: 1000000 online_utd_ratio: 1 - 
n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 model: diff --git a/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml index 82043b23..7486d16c 100644 --- a/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml +++ b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp.yaml @@ -76,7 +76,7 @@ train: scale_reward_factor: 1 critic_num_update: 3 buffer_size: 400000 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 update_freq: 2 diff --git a/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml index 4afaf5d7..eff67779 100644 --- a/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml +++ b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline.yaml @@ -74,9 +74,9 @@ train: n_random_actions: 10 target_ema_rate: 0.005 scale_reward_factor: 1.0 - num_update: 3 # 1000 + num_update: 1000 buffer_size: 1000000 - n_eval_episode: 3 # 10 + n_eval_episode: 40 n_explore_steps: 0 model: diff --git a/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml index 653cba85..fe5acb22 100644 --- a/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml +++ b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline.yaml @@ -74,9 +74,9 @@ train: n_random_actions: 10 target_ema_rate: 0.005 scale_reward_factor: 1.0 - num_update: 3 # 1000 + num_update: 1000 buffer_size: 1000000 - n_eval_episode: 3 # 10 + n_eval_episode: 40 n_explore_steps: 0 model: diff --git a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml index 6dc91eca..1ac0138d 100644 --- a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml +++ b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml @@ -74,7 +74,7 @@ train: scale_reward_factor: 1 critic_num_update: 3 buffer_size: 400000 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} init_temperature: 0 diff --git a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml index 7567a57a..1677d1c8 100644 --- a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml +++ b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml @@ -74,7 +74,7 @@ train: scale_reward_factor: 1 critic_num_update: 3 buffer_size: 400000 - n_eval_episode: 3 # 40 + n_eval_episode: 40 n_explore_steps: 0 target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} init_temperature: 0 From b0caba9f8a4d0610cd013113bdcaf6caf3879a2d Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Thu, 14 Nov 2024 21:38:47 -0500 Subject: [PATCH 4/7] make forward pass differentiable --- model/diffusion/diffusion_ibrl.py | 46 ++----------------------------- 1 file changed, 2 insertions(+), 44 deletions(-) diff --git a/model/diffusion/diffusion_ibrl.py b/model/diffusion/diffusion_ibrl.py index 245ed76c..bcbd4cfe 100644 --- a/model/diffusion/diffusion_ibrl.py +++ b/model/diffusion/diffusion_ibrl.py @@ -113,7 +113,7 @@ def loss_critic( return loss_critic def loss_actor(self, obs): - action = self.forward_train( + action = self.forward( obs, deterministic=False, ) # use online policy only, also IBRL does not use tanh squashing @@ -149,7 +149,6 @@ def forward( self, cond, deterministic=False, - reparameterize=False, ): """use both pre-trained and online policies""" q1_ind, q2_ind = self.get_random_indices() @@ -162,7 +161,7 @@ def forward( ) # sample an action from the RL policy - rl_action = super().forward( + rl_action = self.forward_sample( cond=cond, deterministic=deterministic, ) @@ -203,7 +202,6 @@ def forward( return action # override - @torch.no_grad() def forward_sample( self, cond, @@ -244,43 +242,3 @@ def forward_sample( x, -self.final_action_clip_value, self.final_action_clip_value ) return x - - def forward_train( - self, - cond, - deterministic=False, - ): - """ - Differentiable forward pass used in actor training. - """ - device = self.betas.device - B = len(cond["state"]) - - # Loop - x = torch.randn((B, self.horizon_steps, self.action_dim), device=device) - t_all = list(reversed(range(self.denoising_steps))) - for i, t in enumerate(t_all): - t_b = make_timesteps(B, t, device) - mean, logvar = self.p_mean_var( - x=x, - t=t_b, - cond=cond, - ) - std = torch.exp(0.5 * logvar) - - # Determine the noise level - if deterministic and t == 0: - std = torch.zeros_like(std) - elif deterministic: # For DDPM, sample with noise - std = torch.clip(std, min=1e-3) - else: - std = torch.clip(std, min=self.min_sampling_denoising_std) - noise = torch.randn_like(x).clamp_( - -self.randn_clip_value, self.randn_clip_value - ) - x = mean + std * noise - - # clamp action at final step - if self.final_action_clip_value and i == len(t_all) - 1: - x = torch.clamp(x, -1, 1) - return x From c367feb7fdcacc070b6aa48017fef206e94ea4e4 Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Fri, 22 Nov 2024 18:39:04 -0500 Subject: [PATCH 5/7] update configs --- .../can/calql_diffusion_mlp_online.yaml | 2 +- .../can/calql_diffusion_mlp_online_ph.yaml | 121 +++++++++++++++++ .../finetune/can/ibrl_diffusion_mlp_ph.yaml | 119 +++++++++++++++++ .../square/calql_diffusion_mlp_online.yaml | 2 +- .../square/calql_diffusion_mlp_online_ph.yaml | 122 ++++++++++++++++++ .../square/ibrl_diffusion_mlp_ph.yaml | 120 +++++++++++++++++ .../can/calql_diffusion_mlp_offline_ph.yaml | 117 +++++++++++++++++ .../pretrain/can/pre_diffusion_mlp_ph.yaml | 63 +++++++++ .../calql_diffusion_mlp_offline_ph.yaml | 118 +++++++++++++++++ .../pretrain/square/pre_diffusion_mlp_ph.yaml | 64 +++++++++ .../scratch/can/rlpd_diffusion_mlp.yaml | 4 +- .../scratch/can/rlpd_diffusion_mlp_ph.yaml | 116 +++++++++++++++++ .../scratch/square/rlpd_diffusion_mlp.yaml | 4 +- .../scratch/square/rlpd_diffusion_mlp_ph.yaml | 117 +++++++++++++++++ 14 files changed, 1083 insertions(+), 6 deletions(-) create mode 100644 cfg/robomimic/finetune/can/calql_diffusion_mlp_online_ph.yaml create mode 100644 cfg/robomimic/finetune/can/ibrl_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/finetune/square/calql_diffusion_mlp_online_ph.yaml create mode 100644 cfg/robomimic/finetune/square/ibrl_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline_ph.yaml create mode 100644 cfg/robomimic/pretrain/can/pre_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline_ph.yaml create mode 100644 cfg/robomimic/pretrain/square/pre_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/scratch/can/rlpd_diffusion_mlp_ph.yaml create mode 100644 cfg/robomimic/scratch/square/rlpd_diffusion_mlp_ph.yaml diff --git a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml index 88dc1c64..fbbe2a51 100644 --- a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml +++ b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online.yaml @@ -48,7 +48,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 10000 n_steps: 1 # not used n_episode_per_epoch: 1 gamma: 0.99 diff --git a/cfg/robomimic/finetune/can/calql_diffusion_mlp_online_ph.yaml b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online_ph.yaml new file mode 100644 index 00000000..4448bf60 --- /dev/null +++ b/cfg/robomimic/finetune/can/calql_diffusion_mlp_online_ph.yaml @@ -0,0 +1,121 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: 
+ n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/can/ibrl_diffusion_mlp_ph.yaml b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..13080491 --- /dev/null +++ b/cfg/robomimic/finetune/can/ibrl_diffusion_mlp_ph.yaml @@ -0,0 +1,119 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_diffusion_agent.TrainIBRLDiffusionAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: 
diffusion-ibrl-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.diffusion.diffusion_ibrl.IBRL_Diffusion + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml index 29a40dde..1987bacd 100644 --- a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml +++ b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online.yaml @@ -48,7 +48,7 @@ wandb: run: ${now:%H-%M-%S}_${name} train: - n_train_itr: 1000 + n_train_itr: 10000 n_steps: 1 # not used n_episode_per_epoch: 1 gamma: 0.99 diff --git a/cfg/robomimic/finetune/square/calql_diffusion_mlp_online_ph.yaml b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online_ph.yaml new file mode 100644 index 00000000..c3cf5285 --- /dev/null +++ b/cfg/robomimic/finetune/square/calql_diffusion_mlp_online_ph.yaml @@ -0,0 +1,122 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of 
preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-calql-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 10000 + n_steps: 1 # not used + n_episode_per_epoch: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: True + batch_size: 256 + n_random_actions: 4 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + online_utd_ratio: 1 + n_eval_episode: 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/finetune/square/ibrl_diffusion_mlp_ph.yaml b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..f2ffe2cc --- /dev/null +++ b/cfg/robomimic/finetune/square/ibrl_diffusion_mlp_ph.yaml @@ -0,0 +1,120 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ibrl_diffusion_agent.TrainIBRLDiffusionAgent + +name: ${env_name}_ibrl_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +base_policy_path: + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 250 # IBRL uses 200 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + 
reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ibrl-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # IBRL specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + update_freq: 2 + +model: + _target_: model.diffusion.diffusion_ibrl.IBRL_Diffusion + randn_clip_value: 3 + n_critics: 5 + soft_action_sample: True + soft_action_sample_beta: 10 + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + max_n_episodes: 100 \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline_ph.yaml new file mode 100644 index 00000000..35fed253 --- /dev/null +++ b/cfg/robomimic/pretrain/can/calql_diffusion_mlp_offline_ph.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ph-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 
1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ph.yaml b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..6607a971 --- /dev/null +++ b/cfg/robomimic/pretrain/can/pre_diffusion_mlp_ph.yaml @@ -0,0 +1,63 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: ${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz + +seed: 42 +device: cuda:0 +env: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-ph-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 10000 + warmup_steps: 100 + min_lr: 1e-5 + save_model_freq: 500 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline_ph.yaml b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline_ph.yaml new 
file mode 100644 index 00000000..521f8c4f --- /dev/null +++ b/cfg/robomimic/pretrain/square/calql_diffusion_mlp_offline_ph.yaml @@ -0,0 +1,118 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_calql_diffusion_agent.TrainCalQLDiffusionAgent + +name: ${env_name}_calql_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-ph-calql-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 3e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 3e-4 + save_model_freq: 100 + val_freq: 20 + render: + freq: 1 + num: 0 + log_freq: 1 + # CalQL specific + train_online: False + batch_size: 256 + n_random_actions: 10 + target_ema_rate: 0.005 + scale_reward_factor: 1.0 + num_update: 1000 + buffer_size: 1000000 + n_eval_episode: 40 + n_explore_steps: 0 + +model: + _target_: model.diffusion.diffusion_calql.CalQL_Diffusion + randn_clip_value: 3 + cql_min_q_weight: 5.0 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} + discount_factor: ${train.gamma} + get_mc_return: True \ No newline at end of file diff --git a/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ph.yaml b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..60f138d7 --- /dev/null +++ b/cfg/robomimic/pretrain/square/pre_diffusion_mlp_ph.yaml @@ -0,0 +1,64 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.pretrain.train_diffusion_agent.TrainDiffusionAgent + +name: 
${env}_pre_diffusion_mlp_ta${horizon_steps}_td${denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +train_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env}-ph/train.npz + +seed: 42 +device: cuda:0 +env: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +horizon_steps: 4 +cond_steps: 1 + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: robomimic-ph-${env}-pretrain + run: ${now:%H-%M-%S}_${name} + +train: + n_epochs: 8000 + batch_size: 256 + learning_rate: 1e-4 + weight_decay: 1e-6 + lr_scheduler: + first_cycle_steps: 10000 + warmup_steps: 100 + min_lr: 1e-5 + save_model_freq: 500 + +model: + _target_: model.diffusion.diffusion.DiffusionModel + predict_epsilon: True + denoised_clip_value: 1.0 + network: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +ema: + decay: 0.995 + +train_dataset: + _target_: agent.dataset.sequence.StitchedSequenceDataset + dataset_path: ${train_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml index 1ac0138d..ac240a5b 100644 --- a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml +++ b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp.yaml @@ -8,8 +8,8 @@ _target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent name: ${env_name}_rlpd_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json -normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz -offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz seed: 42 device: cuda:0 diff --git a/cfg/robomimic/scratch/can/rlpd_diffusion_mlp_ph.yaml b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..10921f4d --- /dev/null +++ b/cfg/robomimic/scratch/can/rlpd_diffusion_mlp_ph.yaml @@ -0,0 +1,116 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 
'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-rlpd-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 0 + +model: + _target_: model.diffusion.diffusion_rlpd.RLPD_Diffusion + randn_clip_value: 10 + backup_entropy: False + n_critics: 5 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [256, 256, 256] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file diff --git a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml index 1677d1c8..b7e916b1 100644 --- a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml +++ b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp.yaml @@ -8,8 +8,8 @@ _target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent name: ${env_name}_rlpd_mlp_ta${horizon_steps} logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json -normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz -offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/train.npz seed: 42 device: cuda:0 diff --git a/cfg/robomimic/scratch/square/rlpd_diffusion_mlp_ph.yaml b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp_ph.yaml new file mode 100644 index 00000000..0b1228c7 --- /dev/null +++ b/cfg/robomimic/scratch/square/rlpd_diffusion_mlp_ph.yaml @@ -0,0 +1,117 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_rlpd_diffusion_agent.TrainRLPDDiffusionAgent + +name: ${env_name}_rlpd_mlp_ta${horizon_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} 
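The RLPD-style configs keep a static offline_dataset alongside an online replay buffer of size buffer_size that the training agent fills. The RLPD recipe draws each update batch half from the prior (offline) data and half from online experience, so-called symmetric sampling. A minimal sketch of that sampling step, assuming both sources are dicts of aligned numpy arrays with matching keys; the function name and keys are illustrative, not the repository's API.

import numpy as np

def sample_symmetric(offline, online, batch_size, rng=None):
    # Draw 50% of the batch from the offline (prior) data and 50% from the online buffer.
    if rng is None:
        rng = np.random.default_rng()
    n_off = batch_size // 2
    idx_off = rng.integers(0, len(offline["obs"]), size=n_off)
    idx_on = rng.integers(0, len(online["obs"]), size=batch_size - n_off)
    return {
        key: np.concatenate([offline[key][idx_off], online[key][idx_on]], axis=0)
        for key in offline
    }

With batch_size: 256 this yields 128 offline and 128 online transitions per critic update. Note also that the critic here is an ensemble (double_q: False with n_critics: 5), which the RLPD recipe typically pairs with a min over a small random subset of target critics when forming the TD target.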
+robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/normalization.npz +offline_dataset_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}-ph/train.npz + +seed: 42 +device: cuda:0 +env_name: square +obs_dim: 23 +action_dim: 7 +denoising_steps: 10 +cond_steps: 1 +horizon_steps: 1 +act_steps: 1 + +env: + n_envs: 1 + name: ${env_name} + max_episode_steps: 300 + reset_at_iteration: False + save_video: False + best_reward_threshold_for_success: 1 + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: diffusion-rlpd-ph-${env_name}-act-${act_steps} + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 1000000 + n_steps: 1 + gamma: 0.99 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-4 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + save_model_freq: 50000 + val_freq: 10000 + render: + freq: 10000 + num: 0 + log_freq: 200 + # RLPD specific + batch_size: 256 + target_ema_rate: 0.01 + scale_reward_factor: 1 + critic_num_update: 3 + buffer_size: 400000 + n_eval_episode: 40 + n_explore_steps: 0 + target_entropy: ${eval:'- ${action_dim} * ${act_steps}'} + init_temperature: 0 + +model: + _target_: model.diffusion.diffusion_rlpd.RLPD_Diffusion + randn_clip_value: 10 + backup_entropy: False + n_critics: 5 + tanh_output: True + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 32 + mlp_dims: [1024, 1024, 1024] + cond_mlp_dims: [512, 64] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObsAct + mlp_dims: [1024, 1024, 1024] + activation_type: ReLU + use_layernorm: True + double_q: False # use ensemble + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + action_dim: ${action_dim} + action_steps: ${act_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} + +offline_dataset: + _target_: agent.dataset.sequence.StitchedSequenceQLearningDataset + dataset_path: ${offline_dataset_path} + horizon_steps: ${horizon_steps} + cond_steps: ${cond_steps} + device: ${device} \ No newline at end of file From 583e1916836abde389550f5fd4828261fcfed107 Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Fri, 22 Nov 2024 19:50:23 -0500 Subject: [PATCH 6/7] add ppo with summed likelihood --- ...train_ppo_diffusion_agent_sumlikelihood.py | 484 ++++++++++++++++++ .../ft_ppo_diffusion_mlp_sumlikelihood.yaml | 111 ++++ .../diffusion/diffusion_ppo_sumlikelihood.py | 221 ++++++++ 3 files changed, 816 insertions(+) create mode 100644 agent/finetune/train_ppo_diffusion_agent_sumlikelihood.py create mode 100644 cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_sumlikelihood.yaml create mode 100644 model/diffusion/diffusion_ppo_sumlikelihood.py diff --git a/agent/finetune/train_ppo_diffusion_agent_sumlikelihood.py b/agent/finetune/train_ppo_diffusion_agent_sumlikelihood.py new file mode 100644 index 00000000..5f7005af --- /dev/null +++ b/agent/finetune/train_ppo_diffusion_agent_sumlikelihood.py @@ -0,0 +1,484 @@ +""" +DPPO fine-tuning. + +""" + +import os +import pickle +import einops +import numpy as np +import torch +import logging +import wandb +import math + +log = logging.getLogger(__name__) +from util.timer import Timer +from agent.finetune.train_ppo_agent import TrainPPOAgent +from util.scheduler import CosineAnnealingWarmupRestarts + + +class TrainPPODiffusionAgentSumLikelihood(TrainPPOAgent): + def __init__(self, cfg): + super().__init__(cfg) + + # Reward horizon --- always set to act_steps for now + self.reward_horizon = cfg.get("reward_horizon", self.act_steps) + + # Eta - between DDIM (=0 for eval) and DDPM (=1 for training) + self.learn_eta = self.model.learn_eta + if self.learn_eta: + self.eta_update_interval = cfg.train.eta_update_interval + self.eta_optimizer = torch.optim.AdamW( + self.model.eta.parameters(), + lr=cfg.train.eta_lr, + weight_decay=cfg.train.eta_weight_decay, + ) + self.eta_lr_scheduler = CosineAnnealingWarmupRestarts( + self.eta_optimizer, + first_cycle_steps=cfg.train.eta_lr_scheduler.first_cycle_steps, + cycle_mult=1.0, + max_lr=cfg.train.eta_lr, + min_lr=cfg.train.eta_lr_scheduler.min_lr, + warmup_steps=cfg.train.eta_lr_scheduler.warmup_steps, + gamma=1.0, + ) + + def run(self): + # Start training loop + timer = Timer() + run_results = [] + cnt_train_step = 0 + last_itr_eval = False + done_venv = np.zeros((1, self.n_envs)) + while self.itr < self.n_train_itr: + # Prepare video paths for each envs --- only applies for the first set of episodes if allowing reset within iteration and each iteration has multiple episodes from one env + options_venv = [{} for _ in range(self.n_envs)] + if self.itr % self.render_freq == 0 and self.render_video: + for env_ind in range(self.n_render): + options_venv[env_ind]["video_path"] = os.path.join( + self.render_dir, f"itr-{self.itr}_trial-{env_ind}.mp4" + ) + + # Define train or eval - all envs restart + eval_mode = self.itr % self.val_freq == 0 and not self.force_train + self.model.eval() if eval_mode else self.model.train() + last_itr_eval = eval_mode + + # Reset env before iteration starts (1) if specified, (2) at eval mode, or (3) right after eval mode + firsts_trajs = np.zeros((self.n_steps + 1, self.n_envs)) + if self.reset_at_iteration or eval_mode or last_itr_eval: + prev_obs_venv = self.reset_env_all(options_venv=options_venv) + firsts_trajs[0] = 1 + else: + # if done at the end of last iteration, the envs are just reset + firsts_trajs[0] = done_venv + + # Holder + obs_trajs = { + "state": np.zeros( + (self.n_steps, self.n_envs, self.n_cond_step, self.obs_dim) + ) + } + chains_trajs = np.zeros( + ( + self.n_steps, + self.n_envs, + self.model.ft_denoising_steps + 1, + self.horizon_steps, + self.action_dim, + ) + 
) + terminated_trajs = np.zeros((self.n_steps, self.n_envs)) + reward_trajs = np.zeros((self.n_steps, self.n_envs)) + if self.save_full_observations: # state-only + obs_full_trajs = np.empty((0, self.n_envs, self.obs_dim)) + obs_full_trajs = np.vstack( + (obs_full_trajs, prev_obs_venv["state"][:, -1][None]) + ) + + # Collect a set of trajectories from env + for step in range(self.n_steps): + if step % 10 == 0: + print(f"Processed step {step} of {self.n_steps}") + + # Select action + with torch.no_grad(): + cond = { + "state": torch.from_numpy(prev_obs_venv["state"]) + .float() + .to(self.device) + } + samples = self.model( + cond=cond, + deterministic=eval_mode, + return_chain=True, + ) + output_venv = ( + samples.trajectories.cpu().numpy() + ) # n_env x horizon x act + chains_venv = ( + samples.chains.cpu().numpy() + ) # n_env x denoising x horizon x act + action_venv = output_venv[:, : self.act_steps] + + # Apply multi-step action + ( + obs_venv, + reward_venv, + terminated_venv, + truncated_venv, + info_venv, + ) = self.venv.step(action_venv) + done_venv = terminated_venv | truncated_venv + if self.save_full_observations: # state-only + obs_full_venv = np.array( + [info["full_obs"]["state"] for info in info_venv] + ) # n_envs x act_steps x obs_dim + obs_full_trajs = np.vstack( + (obs_full_trajs, obs_full_venv.transpose(1, 0, 2)) + ) + obs_trajs["state"][step] = prev_obs_venv["state"] + chains_trajs[step] = chains_venv + reward_trajs[step] = reward_venv + terminated_trajs[step] = terminated_venv + firsts_trajs[step + 1] = done_venv + + # update for next step + prev_obs_venv = obs_venv + + # count steps --- not acounting for done within action chunk + cnt_train_step += self.n_envs * self.act_steps if not eval_mode else 0 + + # Summarize episode reward --- this needs to be handled differently depending on whether the environment is reset after each iteration. Only count episodes that finish within the iteration. 
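            # For a single env, firsts_trajs[:, env] = [1, 0, 0, 1, 0, 0, 0, 1] gives flag
            # indices [0, 3, 7], so two finished episodes are extracted (steps 0-2 and 3-6);
            # a trailing segment that never sees another flag is dropped, which is why the
            # warning below is logged when no episode completes within the iteration.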
+ episodes_start_end = [] + for env_ind in range(self.n_envs): + env_steps = np.where(firsts_trajs[:, env_ind] == 1)[0] + for i in range(len(env_steps) - 1): + start = env_steps[i] + end = env_steps[i + 1] + if end - start > 1: + episodes_start_end.append((env_ind, start, end - 1)) + if len(episodes_start_end) > 0: + reward_trajs_split = [ + reward_trajs[start : end + 1, env_ind] + for env_ind, start, end in episodes_start_end + ] + num_episode_finished = len(reward_trajs_split) + episode_reward = np.array( + [np.sum(reward_traj) for reward_traj in reward_trajs_split] + ) + if ( + self.furniture_sparse_reward + ): # only for furniture tasks, where reward only occurs in one env step + episode_best_reward = episode_reward + else: + episode_best_reward = np.array( + [ + np.max(reward_traj) / self.act_steps + for reward_traj in reward_trajs_split + ] + ) + avg_episode_reward = np.mean(episode_reward) + avg_best_reward = np.mean(episode_best_reward) + success_rate = np.mean( + episode_best_reward >= self.best_reward_threshold_for_success + ) + else: + episode_reward = np.array([]) + num_episode_finished = 0 + avg_episode_reward = 0 + avg_best_reward = 0 + success_rate = 0 + log.info("[WARNING] No episode completed within the iteration!") + + # Update models + if not eval_mode: + with torch.no_grad(): + obs_trajs["state"] = ( + torch.from_numpy(obs_trajs["state"]).float().to(self.device) + ) + + # Calculate value and logprobs - split into batches to prevent out of memory + num_split = math.ceil( + self.n_envs * self.n_steps / self.logprob_batch_size + ) + obs_ts = [{} for _ in range(num_split)] + obs_k = einops.rearrange( + obs_trajs["state"], + "s e ... -> (s e) ...", + ) + obs_ts_k = torch.split(obs_k, self.logprob_batch_size, dim=0) + for i, obs_t in enumerate(obs_ts_k): + obs_ts[i]["state"] = obs_t + values_trajs = np.empty((0, self.n_envs)) + for obs in obs_ts: + values = self.model.critic(obs).cpu().numpy().flatten() + values_trajs = np.vstack( + (values_trajs, values.reshape(-1, self.n_envs)) + ) + chains_t = einops.rearrange( + torch.from_numpy(chains_trajs).float().to(self.device), + "s e t h d -> (s e) t h d", + ) + chains_ts = torch.split(chains_t, self.logprob_batch_size, dim=0) + logprobs_trajs = np.empty( + ( + 0, + self.model.ft_denoising_steps, + self.horizon_steps, + self.action_dim, + ) + ) + for obs, chains in zip(obs_ts, chains_ts): + logprobs = self.model.get_logprobs(obs, chains).cpu().numpy() + logprobs_trajs = np.vstack( + ( + logprobs_trajs, + logprobs.reshape(-1, *logprobs_trajs.shape[1:]), + ) + ) + + # normalize reward with running variance if specified + if self.reward_scale_running: + reward_trajs_transpose = self.running_reward_scaler( + reward=reward_trajs.T, first=firsts_trajs[:-1].T + ) + reward_trajs = reward_trajs_transpose.T + + # bootstrap value with GAE if not terminal - apply reward scaling with constant if specified + obs_venv_ts = { + "state": torch.from_numpy(obs_venv["state"]) + .float() + .to(self.device) + } + advantages_trajs = np.zeros_like(reward_trajs) + lastgaelam = 0 + for t in reversed(range(self.n_steps)): + if t == self.n_steps - 1: + nextvalues = ( + self.model.critic(obs_venv_ts) + .reshape(1, -1) + .cpu() + .numpy() + ) + else: + nextvalues = values_trajs[t + 1] + nonterminal = 1.0 - terminated_trajs[t] + # delta = r + gamma*V(st+1) - V(st) + delta = ( + reward_trajs[t] * self.reward_scale_const + + self.gamma * nextvalues * nonterminal + - values_trajs[t] + ) + # A = delta_t + gamma*lamdba*delta_{t+1} + ... 
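                    # Generalized Advantage Estimation, computed backward in time:
                    #   A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
                    # and the value targets used below are returns_t = A_t + V(s_t)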
+ advantages_trajs[t] = lastgaelam = ( + delta + + self.gamma * self.gae_lambda * nonterminal * lastgaelam + ) + returns_trajs = advantages_trajs + values_trajs + + # k for environment step + obs_k = { + "state": einops.rearrange( + obs_trajs["state"], + "s e ... -> (s e) ...", + ) + } + chains_k = einops.rearrange( + torch.tensor(chains_trajs, device=self.device).float(), + "s e t h d -> (s e) t h d", + ) + returns_k = ( + torch.tensor(returns_trajs, device=self.device).float().reshape(-1) + ) + values_k = ( + torch.tensor(values_trajs, device=self.device).float().reshape(-1) + ) + advantages_k = ( + torch.tensor(advantages_trajs, device=self.device) + .float() + .reshape(-1) + ) + logprobs_k = torch.tensor(logprobs_trajs, device=self.device).float() + + # Update policy and critic + total_steps = self.n_steps * self.n_envs + clipfracs = [] + for update_epoch in range(self.update_epochs): + # for each epoch, go through all data in batches + flag_break = False + inds_k = torch.randperm(total_steps, device=self.device) + num_batch = max(1, total_steps // self.batch_size) # skip last ones + + # get the full range of denoising indices + denoising_inds_b = torch.arange(self.model.ft_denoising_steps) + + for batch in range(num_batch): + start = batch * self.batch_size + end = start + self.batch_size + batch_inds_b = inds_k[start:end] # b for batch + + obs_b = {"state": obs_k["state"][batch_inds_b]} + chains_prev_b = chains_k[batch_inds_b, :-1] + chains_next_b = chains_k[batch_inds_b, 1:] + returns_b = returns_k[batch_inds_b] + values_b = values_k[batch_inds_b] + advantages_b = advantages_k[batch_inds_b] + logprobs_b = logprobs_k[batch_inds_b, :].sum(1) + + # get loss + ( + pg_loss, + entropy_loss, + v_loss, + clipfrac, + approx_kl, + ratio, + bc_loss, + eta, + ) = self.model.loss( + obs_b, + chains_prev_b, + chains_next_b, + denoising_inds_b, + returns_b, + values_b, + advantages_b, + logprobs_b, + use_bc_loss=self.use_bc_loss, + reward_horizon=self.reward_horizon, + ) + loss = ( + pg_loss + + entropy_loss * self.ent_coef + + v_loss * self.vf_coef + + bc_loss * self.bc_loss_coeff + ) + clipfracs += [clipfrac] + + # update policy and critic + self.actor_optimizer.zero_grad() + self.critic_optimizer.zero_grad() + if self.learn_eta: + self.eta_optimizer.zero_grad() + loss.backward() + if self.itr >= self.n_critic_warmup_itr: + if self.max_grad_norm is not None: + torch.nn.utils.clip_grad_norm_( + self.model.actor_ft.parameters(), self.max_grad_norm + ) + self.actor_optimizer.step() + if self.learn_eta and batch % self.eta_update_interval == 0: + self.eta_optimizer.step() + self.critic_optimizer.step() + log.info( + f"approx_kl: {approx_kl}, update_epoch: {update_epoch}, num_batch: {num_batch}" + ) + + # Stop gradient update if KL difference reaches target + if self.target_kl is not None and approx_kl > self.target_kl: + flag_break = True + break + if flag_break: + break + + # Explained variation of future rewards using value function + y_pred, y_true = values_k.cpu().numpy(), returns_k.cpu().numpy() + var_y = np.var(y_true) + explained_var = ( + np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y + ) + + # Plot state trajectories (only in D3IL) + if ( + self.itr % self.render_freq == 0 + and self.n_render > 0 + and self.traj_plotter is not None + ): + self.traj_plotter( + obs_full_trajs=obs_full_trajs, + n_render=self.n_render, + max_episode_steps=self.max_episode_steps, + render_dir=self.render_dir, + itr=self.itr, + ) + + # Update lr, min_sampling_std + if self.itr >= 
self.n_critic_warmup_itr: + self.actor_lr_scheduler.step() + if self.learn_eta: + self.eta_lr_scheduler.step() + self.critic_lr_scheduler.step() + self.model.step() + diffusion_min_sampling_std = self.model.get_min_sampling_denoising_std() + + # Save model + if self.itr % self.save_model_freq == 0 or self.itr == self.n_train_itr - 1: + self.save_model() + + # Log loss and save metrics + run_results.append( + { + "itr": self.itr, + "step": cnt_train_step, + } + ) + if self.save_trajs: + run_results[-1]["obs_full_trajs"] = obs_full_trajs + run_results[-1]["obs_trajs"] = obs_trajs + run_results[-1]["chains_trajs"] = chains_trajs + run_results[-1]["reward_trajs"] = reward_trajs + if self.itr % self.log_freq == 0: + time = timer() + run_results[-1]["time"] = time + if eval_mode: + log.info( + f"eval: success rate {success_rate:8.4f} | avg episode reward {avg_episode_reward:8.4f} | avg best reward {avg_best_reward:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "success rate - eval": success_rate, + "avg episode reward - eval": avg_episode_reward, + "avg best reward - eval": avg_best_reward, + "num episode - eval": num_episode_finished, + }, + step=self.itr, + commit=False, + ) + run_results[-1]["eval_success_rate"] = success_rate + run_results[-1]["eval_episode_reward"] = avg_episode_reward + run_results[-1]["eval_best_reward"] = avg_best_reward + else: + log.info( + f"{self.itr}: step {cnt_train_step:8d} | loss {loss:8.4f} | pg loss {pg_loss:8.4f} | value loss {v_loss:8.4f} | bc loss {bc_loss:8.4f} | reward {avg_episode_reward:8.4f} | eta {eta:8.4f} | t:{time:8.4f}" + ) + if self.use_wandb: + wandb.log( + { + "total env step": cnt_train_step, + "loss": loss, + "pg loss": pg_loss, + "value loss": v_loss, + "bc loss": bc_loss, + "eta": eta, + "approx kl": approx_kl, + "ratio": ratio, + "clipfrac": np.mean(clipfracs), + "explained variance": explained_var, + "avg episode reward - train": avg_episode_reward, + "num episode - train": num_episode_finished, + "diffusion - min sampling std": diffusion_min_sampling_std, + "actor lr": self.actor_optimizer.param_groups[0]["lr"], + "critic lr": self.critic_optimizer.param_groups[0][ + "lr" + ], + }, + step=self.itr, + commit=True, + ) + run_results[-1]["train_episode_reward"] = avg_episode_reward + with open(self.result_path, "wb") as f: + pickle.dump(run_results, f) + self.itr += 1 diff --git a/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_sumlikelihood.yaml b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_sumlikelihood.yaml new file mode 100644 index 00000000..df7288e4 --- /dev/null +++ b/cfg/robomimic/finetune/can/ft_ppo_diffusion_mlp_sumlikelihood.yaml @@ -0,0 +1,111 @@ +defaults: + - _self_ +hydra: + run: + dir: ${logdir} +_target_: agent.finetune.train_ppo_diffusion_agent_sumlikelihood.TrainPPODiffusionAgentSumLikelihood + +name: ${env_name}_ft_diffusion_sumlikelihood_mlp_ta${horizon_steps}_td${denoising_steps}_tdf${ft_denoising_steps} +logdir: ${oc.env:DPPO_LOG_DIR}/robomimic-finetune/${name}/${now:%Y-%m-%d}_${now:%H-%M-%S}_${seed} +base_policy_path: ${oc.env:DPPO_LOG_DIR}/robomimic-pretrain/can/can_pre_diffusion_mlp_ta4_td20/2024-06-28_13-29-54/checkpoint/state_5000.pt # use 8000 for comparing policy parameterizations +robomimic_env_cfg_path: cfg/robomimic/env_meta/${env_name}.json +normalization_path: ${oc.env:DPPO_DATA_DIR}/robomimic/${env_name}/normalization.npz + +seed: 42 +device: cuda:0 +env_name: can +obs_dim: 23 +action_dim: 7 +denoising_steps: 20 +ft_denoising_steps: 10 +cond_steps: 1 +horizon_steps: 4 +act_steps: 4 + +env: + 
n_envs: 5 + name: ${env_name} + best_reward_threshold_for_success: 1 + max_episode_steps: 300 + save_video: False + wrappers: + robomimic_lowdim: + normalization_path: ${normalization_path} + low_dim_keys: ['robot0_eef_pos', + 'robot0_eef_quat', + 'robot0_gripper_qpos', + 'object'] # same order of preprocessed observations + multi_step: + n_obs_steps: ${cond_steps} + n_action_steps: ${act_steps} + max_episode_steps: ${env.max_episode_steps} + reset_within_step: True + +wandb: + entity: ${oc.env:DPPO_WANDB_ENTITY} + project: dppo-sumlikelihood-robomimic-${env_name}-finetune + run: ${now:%H-%M-%S}_${name} + +train: + n_train_itr: 151 + n_critic_warmup_itr: 2 + n_steps: 10 + gamma: 0.999 + actor_lr: 1e-4 + actor_weight_decay: 0 + actor_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-4 + critic_lr: 1e-3 + critic_weight_decay: 0 + critic_lr_scheduler: + first_cycle_steps: 1000 + warmup_steps: 10 + min_lr: 1e-3 + save_model_freq: 100 + val_freq: 10 + render: + freq: 1 + num: 0 + # PPO specific + reward_scale_running: True + reward_scale_const: 1.0 + gae_lambda: 0.95 + batch_size: 7500 + update_epochs: 10 + vf_coef: 0.5 + target_kl: 1 + +model: + _target_: model.diffusion.diffusion_ppo_sumlikelihood.PPODiffusionSumLikelihood + # HP to tune + gamma_denoising: 0.99 + clip_ploss_coef: 0.01 + clip_ploss_coef_base: 0.001 + clip_ploss_coef_rate: 3 + randn_clip_value: 3 + min_sampling_denoising_std: 0.1 + min_logprob_denoising_std: 0.1 + # + network_path: ${base_policy_path} + actor: + _target_: model.diffusion.mlp_diffusion.DiffusionMLP + time_dim: 16 + mlp_dims: [512, 512, 512] + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + horizon_steps: ${horizon_steps} + action_dim: ${action_dim} + critic: + _target_: model.common.critic.CriticObs + mlp_dims: [256, 256, 256] + activation_type: Mish + residual_style: True + cond_dim: ${eval:'${obs_dim} * ${cond_steps}'} + ft_denoising_steps: ${ft_denoising_steps} + horizon_steps: ${horizon_steps} + obs_dim: ${obs_dim} + action_dim: ${action_dim} + denoising_steps: ${denoising_steps} + device: ${device} \ No newline at end of file diff --git a/model/diffusion/diffusion_ppo_sumlikelihood.py b/model/diffusion/diffusion_ppo_sumlikelihood.py new file mode 100644 index 00000000..c9f4b952 --- /dev/null +++ b/model/diffusion/diffusion_ppo_sumlikelihood.py @@ -0,0 +1,221 @@ +""" +DPPO: Diffusion Policy Policy Optimization. 
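Summed-likelihood variant: the log-probability of a sampled action chunk is taken as the sum of the per-step denoising log-likelihoods over the fine-tuned steps, log pi(a | s) = sum_k log p(x^{k-1} | x^k, s), so the PPO importance ratio is formed once per environment action rather than once per denoising step.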
+ +K: number of denoising steps +To: observation sequence length +Ta: action chunk size +Do: observation dimension +Da: action dimension + +C: image channels +H, W: image height and width + +""" + +from typing import Optional +import torch +import logging +import math + +log = logging.getLogger(__name__) +from model.diffusion.diffusion_vpg import VPGDiffusion + + +class PPODiffusionSumLikelihood(VPGDiffusion): + def __init__( + self, + gamma_denoising: float, + clip_ploss_coef: float, + clip_ploss_coef_base: float = 1e-3, + clip_ploss_coef_rate: float = 3, + clip_vloss_coef: Optional[float] = None, + clip_advantage_lower_quantile: float = 0, + clip_advantage_upper_quantile: float = 1, + norm_adv: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + + # Whether to normalize advantages within batch + self.norm_adv = norm_adv + + # Clipping value for policy loss + self.clip_ploss_coef = clip_ploss_coef + self.clip_ploss_coef_base = clip_ploss_coef_base + self.clip_ploss_coef_rate = clip_ploss_coef_rate + + # Clipping value for value loss + self.clip_vloss_coef = clip_vloss_coef + + # Discount factor for diffusion MDP + self.gamma_denoising = gamma_denoising + + # Quantiles for clipping advantages + self.clip_advantage_lower_quantile = clip_advantage_lower_quantile + self.clip_advantage_upper_quantile = clip_advantage_upper_quantile + + def loss( + self, + obs, + chains_prev, + chains_next, + denoising_inds, + returns, + oldvalues, + advantages, + oldlogprobs, + use_bc_loss=False, + reward_horizon=4, + ): + """ + PPO loss + + obs: dict with key state/rgb; more recent obs at the end + state: (B, To, Do) + rgb: (B, To, C, H, W) + chains: (B, K+1, Ta, Da) + returns: (B, ) + values: (B, ) + advantages: (B,) + oldlogprobs: (B, K, Ta, Da) + use_bc_loss: whether to add BC regularization loss + reward_horizon: action horizon that backpropagates gradient + """ + # Get new logprobs for denoising steps from T-1 to 0 - entropy is fixed fod diffusion + # repeat the obs for each denoising step + B = chains_next.shape[0] + obs_repeat = { + "state": obs["state"].repeat_interleave(self.ft_denoising_steps, dim=0) + } + denoising_inds = denoising_inds.repeat_interleave(chains_prev.shape[0]) + + # flatten the chains along the first and second dim + chains_prev = chains_prev.view(-1, *chains_prev.shape[2:]) + chains_next = chains_next.view(-1, *chains_next.shape[2:]) + + # get the logprobs for all denosing steps + newlogprobs, eta = self.get_logprobs_subsample( + obs_repeat, + chains_prev, + chains_next, + denoising_inds, + get_ent=True, + ) + + entropy_loss = -eta.mean() + newlogprobs = newlogprobs.clamp(min=-5, max=2) + oldlogprobs = oldlogprobs.clamp(min=-5, max=2) + + # expand newlogprobs to shape (B, K, Ta, Da) and sum along K + newlogprobs = newlogprobs.view( + -1, self.ft_denoising_steps, *newlogprobs.shape[1:] + ) + newlogprobs = newlogprobs.sum(dim=1) + + # only backpropagate through the earlier steps (e.g., ones actually executed in the environment) + newlogprobs = newlogprobs[:, :reward_horizon, :] + oldlogprobs = oldlogprobs[:, :reward_horizon, :] + + # Get the logprobs - batch over B and denoising steps + newlogprobs = newlogprobs.mean(dim=(-1, -2)).view(-1) + oldlogprobs = oldlogprobs.mean(dim=(-1, -2)).view(-1) + + bc_loss = 0 + if use_bc_loss: + # See Eqn. 2 of https://arxiv.org/pdf/2403.03949.pdf + # Give a reward for maximizing probability of teacher policy's action with current policy. + # Actions are chosen along trajectory induced by current policy. 
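            # That is: sample denoising chains from the frozen base (teacher) policy for the
            # same observations, evaluate their log-probability under the current fine-tuned
            # policy, and penalize -E[log pi_theta(a_teacher | s)] as a regularizer
            # (weighted by bc_loss_coeff in the training agent).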
+ + # Get counterfactual teacher actions + samples = self.forward( + cond=obs, + deterministic=False, + return_chain=True, + use_base_policy=True, + ) + # Get logprobs of teacher actions under this policy + bc_logprobs = self.get_logprobs( + obs, + samples.chains, + get_ent=False, + use_base_policy=False, + ) + bc_logprobs = bc_logprobs.clamp(min=-5, max=2) + bc_logprobs = bc_logprobs.mean(dim=(-1, -2)).view(-1) + bc_loss = -bc_logprobs.mean() + + # normalize advantages + if self.norm_adv: + advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) + + # Clip advantages by 5th and 95th percentile + advantage_min = torch.quantile(advantages, self.clip_advantage_lower_quantile) + advantage_max = torch.quantile(advantages, self.clip_advantage_upper_quantile) + advantages = advantages.clamp(min=advantage_min, max=advantage_max) + + # denoising discount + # discount = torch.tensor( + # [ + # self.gamma_denoising ** (self.ft_denoising_steps - i - 1) + # for i in denoising_inds + # ] + # ).to(self.device) + # advantages *= discount + + # get ratio + logratio = newlogprobs - oldlogprobs + ratio = logratio.exp() + + # exponentially interpolate between the base and the current clipping value over denoising steps and repeat + t = (denoising_inds.float() / (self.ft_denoising_steps - 1)).to(self.device) + t = t[ + -B: + ] # take the last B elements (pretend we're at the last denoising step) + if self.ft_denoising_steps > 1: + clip_ploss_coef = self.clip_ploss_coef_base + ( + self.clip_ploss_coef - self.clip_ploss_coef_base + ) * (torch.exp(self.clip_ploss_coef_rate * t) - 1) / ( + math.exp(self.clip_ploss_coef_rate) - 1 + ) + else: + clip_ploss_coef = t + + # get kl difference and whether value clipped + with torch.no_grad(): + # old_approx_kl: the approximate Kullback–Leibler divergence, measured by (-logratio).mean(), which corresponds to the k1 estimator in John Schulman’s blog post on approximating KL http://joschu.net/blog/kl-approx.html + # approx_kl: better alternative to old_approx_kl measured by (logratio.exp() - 1) - logratio, which corresponds to the k3 estimator in approximating KL http://joschu.net/blog/kl-approx.html + # old_approx_kl = (-logratio).mean() + approx_kl = ((ratio - 1) - logratio).mean() + clipfrac = ((ratio - 1.0).abs() > clip_ploss_coef).float().mean().item() + + # Policy loss with clipping + pg_loss1 = -advantages * ratio + pg_loss2 = -advantages * torch.clamp( + ratio, 1 - clip_ploss_coef, 1 + clip_ploss_coef + ) + pg_loss = torch.max(pg_loss1, pg_loss2).mean() + + # Value loss optionally with clipping + newvalues = self.critic(obs).view(-1) + if self.clip_vloss_coef is not None: + v_loss_unclipped = (newvalues - returns) ** 2 + v_clipped = oldvalues + torch.clamp( + newvalues - oldvalues, + -self.clip_vloss_coef, + self.clip_vloss_coef, + ) + v_loss_clipped = (v_clipped - returns) ** 2 + v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped) + v_loss = 0.5 * v_loss_max.mean() + else: + v_loss = 0.5 * ((newvalues - returns) ** 2).mean() + return ( + pg_loss, + entropy_loss, + v_loss, + clipfrac, + approx_kl.item(), + ratio.mean().item(), + bc_loss, + eta.mean().item(), + ) From 2938fdd5757dca28312e280b37722bbf448d594e Mon Sep 17 00:00:00 2001 From: "Justin M. 
Lidard" Date: Fri, 22 Nov 2024 19:52:28 -0500 Subject: [PATCH 7/7] minor --- model/diffusion/diffusion_ppo_sumlikelihood.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/model/diffusion/diffusion_ppo_sumlikelihood.py b/model/diffusion/diffusion_ppo_sumlikelihood.py index c9f4b952..1cd331de 100644 --- a/model/diffusion/diffusion_ppo_sumlikelihood.py +++ b/model/diffusion/diffusion_ppo_sumlikelihood.py @@ -1,5 +1,5 @@ """ -DPPO: Diffusion Policy Policy Optimization. +DPPO: Diffusion Policy Policy Optimization with summed likelihood. K: number of denoising steps To: observation sequence length @@ -81,7 +81,6 @@ def loss( use_bc_loss: whether to add BC regularization loss reward_horizon: action horizon that backpropagates gradient """ - # Get new logprobs for denoising steps from T-1 to 0 - entropy is fixed fod diffusion # repeat the obs for each denoising step B = chains_next.shape[0] obs_repeat = {