Skip to content

CUDA out of memory #10

@luyr

Description

@luyr

When trying to run training, I hit the following CUDA out-of-memory error. The program tried to allocate 152638.31 GiB of memory, which is quite weird.

Traceback (most recent call last):
  File "/home/luy1syv/yiren_workspace/GaussianLSS-main/scripts/train.py", line 87, in main
    trainer.fit(model_module, datamodule=data_module, ckpt_path=ckpt_path)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 545, in fit
    call._call_and_handle_interrupt(
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 102, in launch
    return function(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 581, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 990, in _run
    results = self._run_stage()
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1036, in _run_stage
    self.fit_loop.run()
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 202, in run
    self.advance()
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 359, in advance
    self.epoch_loop.run(self._data_fetcher)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 136, in run
    self.advance(data_fetcher)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 240, in advance
    batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 187, in run
    self._optimizer_step(batch_idx, closure)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 265, in _optimizer_step
    call._call_lightning_module_hook(
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 157, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/core/module.py", line 1282, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/core/optimizer.py", line 151, in step
    step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/ddp.py", line 263, in optimizer_step
    optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 230, in optimizer_step
    return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 117, in optimizer_step
    return optimizer.step(closure=closure, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
    return wrapped(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/optim/optimizer.py", line 373, in wrapper
    out = func(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
    ret = func(self, *args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/optim/adamw.py", line 161, in step
    loss = closure()
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 104, in _wrap_closure
    closure_result = closure()
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 140, in __call__
    self._result = self.closure(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 126, in closure
    step_output = self._step_fn()
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 315, in _training_step
    training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/trainer/call.py", line 309, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 381, in training_step
    return self._forward_redirection(self.model, self.lightning_module, "training_step", *args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 628, in __call__
    wrapper_output = wrapper_module(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1519, in forward
    else self._run_ddp_forward(*inputs, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1355, in _run_ddp_forward
    return self.module(*inputs, **kwargs)  # type: ignore[index]
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py", line 621, in wrapped_forward
    out = method(*_args, **_kwargs)
  File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/model_module.py", line 55, in training_step
    return self.shared_step(batch, 'train', on_step = True,
  File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/model_module.py", line 27, in shared_step
    pred = self(batch)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/model_module.py", line 24, in forward
    return self.backbone(batch)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/GaussianLSS.py", line 104, in forward
    x, num_gaussians = self.gs_render(features, means3D, cov3D, opacities)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/GaussianLSS.py", line 175, in forward
    rendered_bev, _ = self.rasterizer(
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/diff-gaussian-rasterization/diff_gaussian_rasterization/__init__.py", line 211, in forward
    return rasterize_gaussians(
  File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/diff-gaussian-rasterization/diff_gaussian_rasterization/__init__.py", line 32, in rasterize_gaussians
    return _RasterizeGaussians.apply(
  File "/home/luy1syv/.conda/envs/GaussianLSS/lib/python3.9/site-packages/torch/autograd/function.py", line 539, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/luy1syv/yiren_workspace/GaussianLSS-main/GaussianLSS/model/diff-gaussian-rasterization/diff_gaussian_rasterization/__init__.py", line 92, in forward
    num_rendered, color, radii, geomBuffer, binningBuffer, imgBuffer = _C.rasterize_gaussians(*args)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 152638.31 GiB. GPU 0 has a total capacty of 79.25 GiB of which 67.39 GiB is free. Including non-PyTorch memory, this process has 11.85 GiB memory in use. Of the allocated memory 11.03 GiB is allocated by PyTorch, and 220.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions