From e6a995caf90e5477d7b4d76f8b5c390cecb3f723 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Tue, 24 Sep 2024 01:19:27 +0000 Subject: [PATCH] add smth special maybe ? --- open_diloco/hivemind_diloco.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/open_diloco/hivemind_diloco.py b/open_diloco/hivemind_diloco.py index 308608b..2dec08e 100644 --- a/open_diloco/hivemind_diloco.py +++ b/open_diloco/hivemind_diloco.py @@ -164,6 +164,8 @@ def compute_and_load_pseudo_grad_into_averager(self): # opt_param is the param that will be all_reduce, it is suppose to be on cpu # main_param is the param that has been updated by the inner optimizer, it is suppose to be on gpu grad = opt_param.data - main_param.detach().to(opt_param.device) + mask = torch.rand_like(grad) > 0.95 + grad *= mask averaged_grad.copy_(grad, non_blocking=True) def notify_used_averaged_gradients(self):