-
Notifications
You must be signed in to change notification settings - Fork 11
Open
Description
When trying to run training, I encounter the following CUDA out-of-memory error. The program tried to allocate 152638.31 GiB of memory, which is quite weird.
Traceback (most recent call last):
File "/home/luy1syv/yiren_workspace/GaussianLSS-main/scripts/train.py", line 87, in main
trainer.fit(model_module, datamodule=data_module, ckpt_path=ckpt_path)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 545, in fit
call._call_and_handle_interrupt(
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 102, in launch
return function(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 581, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 990, in _run
results = self._run_stage()
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1036, in _run_stage
self.fit_loop.run()
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 202, in run
self.advance()
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 359, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 136, in run
self.advance(data_fetcher)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 240, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 187, in run
self._optimizer_step(batch_idx, closure)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 265, in _optimizer_step
call._call_lightning_module_hook(
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 157, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/core/module.py", line 1282, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 151, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 263, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 230, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 117, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/optim/optimizer.py", line 373, in wrapper
out = func(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
ret = func(self, *args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/optim/adamw.py", line 161, in step
loss = closure()
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 104, in _wrap_closure
closure_result = closure()
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 140, in __call__
self._result = self.closure(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 126, in closure
step_output = self._step_fn()
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 315, in _training_step
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 309, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 381, in training_step
return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 628, in __call__
wrapper_output = wrapper_module(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1519, in forward
else self._run_ddp_forward(*inputs, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1355, in _run_ddp_forward
return self.module(*inputs, **kwargs) # type: ignore[index]
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 621, in wrapped_forward
out = method(*_args, **_kwargs)
File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/model_module.py", line 55, in training_step
return self.shared_step(batch, 'train', on_step = True,
File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/model_module.py", line 27, in shared_step
pred = self(batch)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/model_module.py", line 24, in forward
return self.backbone(batch)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/GaussianLSS.py", line 104, in forward
x, num_gaussians = self.gs_render(features, means3D, cov3D, opacities)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/GaussianLSS.py", line 175, in forward
rendered_bev, _ = self.rasterizer(
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/diff-gaussian-rasterization/diff_gaussian_rasterization/__init__.py", line 211, in forward
return rasterize_gaussians(
File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/diff-gaussian-rasterization/diff_gaussian_rasterization/__init__.py", line 32, in rasterize_gaussians
return _RasterizeGaussians.apply(
File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/autograd/function.py", line 539, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/diff-gaussian-rasterization/diff_gaussian_rasterization/__init__.py", line 92, in forward
num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 152638.31 GiB. GPU 0 has a total capacty of 79.25 GiB of which 67.39 GiB is free. Including non-PyTorch memory, this process has 11.85 GiB memory in use. Of the allocated memory 11.03 GiB is allocated by PyTorch, and 220.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Metadata
Metadata
Assignees
Labels
No labels