From e6a995caf90e5477d7b4d76f8b5c390cecb3f723 Mon Sep 17 00:00:00 2001
From: Sami Jaghouar <sami.jaghouar@hotmail.fr>
Date: Tue, 24 Sep 2024 01:19:27 +0000
Subject: [PATCH] add smth special maybe ?

---
 open_diloco/hivemind_diloco.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/open_diloco/hivemind_diloco.py b/open_diloco/hivemind_diloco.py
index 308608b..2dec08e 100644
--- a/open_diloco/hivemind_diloco.py
+++ b/open_diloco/hivemind_diloco.py
@@ -164,6 +164,8 @@ def compute_and_load_pseudo_grad_into_averager(self):
                 # opt_param is the param that will be all_reduce, it is suppose to be on cpu
                 # main_param is the param that has been updated by the inner optimizer, it is suppose to be on gpu
                 grad = opt_param.data - main_param.detach().to(opt_param.device)
+                mask = torch.rand_like(grad) > 0.95
+                grad *= mask
                 averaged_grad.copy_(grad, non_blocking=True)
 
     def notify_used_averaged_gradients(self):