-
Notifications
You must be signed in to change notification settings - Fork 129
Open
Description
I have a Standard_NC48ads_A100_v4 cluster (same runtime as the sample code: 13.2 GPU ML Runtime) with 2 A100-80G GPUs. I copied the sample code and changed device_map="cuda:0" to device_map="auto" to enable Naive Pipeline Parallelism, as suggested by huggingface/accelerate#1523. However, I still get the error "You can't train a model that has been loaded in 8-bit precision on multiple devices". Am I missing anything?
The full error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File <command-1253638663037442>:1
----> 1 trainer.train()
File /databricks/python/lib/python3.10/site-packages/mlflow/utils/autologging_utils/safety.py:434, in safe_patch.<locals>.safe_patch_function(*args, **kwargs)
419 if (
420 active_session_failed
421 or autologging_is_disabled(autologging_integration)
(...)
428 # warning behavior during original function execution, since autologging is being
429 # skipped
430 with set_non_mlflow_warnings_behavior_for_current_thread(
431 disable_warnings=False,
432 reroute_warnings=False,
433 ):
--> 434 return original(*args, **kwargs)
436 # Whether or not the original / underlying function has been called during the
437 # execution of patched code
438 original_has_been_called = False
File /databricks/python_shell/dbruntime/huggingface_patches/transformers.py:54, in _create_patch_function.<locals>.patched_fit_function(self, *args, **kwargs)
52 call_succeeded = False
53 try:
---> 54 model = original_method(self, *args, **kwargs)
55 call_succeeded = True
56 return model
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/transformers/trainer.py:1539, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1534 self.model_wrapped = self.model
1536 inner_training_loop = find_executable_batch_size(
1537 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1538 )
-> 1539 return inner_training_loop(
1540 args=args,
1541 resume_from_checkpoint=resume_from_checkpoint,
1542 trial=trial,
1543 ignore_keys_for_eval=ignore_keys_for_eval,
1544 )
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/transformers/trainer.py:1656, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1654 model = self.accelerator.prepare(self.model)
1655 else:
-> 1656 model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
1657 else:
1658 # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
1659 model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
1660 self.model, self.optimizer, self.lr_scheduler
1661 )
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/accelerate/accelerator.py:1202, in Accelerator.prepare(self, device_placement, *args)
1200 result = self._prepare_megatron_lm(*args)
1201 else:
-> 1202 result = tuple(
1203 self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
1204 )
1205 result = tuple(self._prepare_one(obj, device_placement=d) for obj, d in zip(result, device_placement))
1207 if tpu_should_fix_optimizer or self.mixed_precision == "fp8":
1208 # 2. grabbing new model parameters
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/accelerate/accelerator.py:1203, in <genexpr>(.0)
1200 result = self._prepare_megatron_lm(*args)
1201 else:
1202 result = tuple(
-> 1203 self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
1204 )
1205 result = tuple(self._prepare_one(obj, device_placement=d) for obj, d in zip(result, device_placement))
1207 if tpu_should_fix_optimizer or self.mixed_precision == "fp8":
1208 # 2. grabbing new model parameters
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/accelerate/accelerator.py:1030, in Accelerator._prepare_one(self, obj, first_pass, device_placement)
1028 return self.prepare_data_loader(obj, device_placement=device_placement)
1029 elif isinstance(obj, torch.nn.Module):
-> 1030 return self.prepare_model(obj, device_placement=device_placement)
1031 elif isinstance(obj, torch.optim.Optimizer):
1032 optimizer = self.prepare_optimizer(obj, device_placement=device_placement)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/accelerate/accelerator.py:1270, in Accelerator.prepare_model(self, model, device_placement, evaluation_mode)
1268 model_devices = set(model.hf_device_map.values())
1269 if len(model_devices) > 1 and self.distributed_type != DistributedType.NO:
-> 1270 raise ValueError(
1271 "You can't train a model that has been loaded in 8-bit precision on multiple devices in any distributed mode."
1272 " In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
1273 " Therefore you should not specify that you are under any distributed regime in your accelerate config."
1274 )
1275 current_device = list(model_devices)[0]
1276 current_device_index = current_device.index if isinstance(current_device, torch.device) else current_device
ValueError: You can't train a model that has been loaded in 8-bit precision on multiple devices in any distributed mode. In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism. Therefore you should not specify that you are under any distributed regime in your accelerate config.
The dependent libraries and versions are:
%pip install git+https://github.com/huggingface/peft.git
%pip install datasets==2.12.0 bitsandbytes==0.40.1 einops==0.6.1 trl==0.4.7
%pip install torch==2.0.1 accelerate==0.21.0 transformers==4.31.0
Tagging @younesbelkada since you are an expert on this issue :)
Metadata
Assignees
Labels
No labels