-
Notifications
You must be signed in to change notification settings - Fork 129
Open
Description
I have a Standard_NC48ads_A100_v4 cluster (same runtime as the sample code: 13.2 GPU ML Runtime) with 2 A100-80G GPUs. I copied the sample code and changed device_map="cuda:0" to device_map="auto" to enable Naive Pipeline Parallelism, as suggested by huggingface/accelerate#1523. However, I still get the error "You can't train a model that has been loaded in 8-bit precision on multiple devices". Am I missing anything?
The full error message:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File <command-1253638663037442>:1
----> 1 trainer.train()
File /databricks/python/lib/python3.10/site-packages/mlflow/utils/autologging_utils/safety.py:434, in safe_patch.<locals>.safe_patch_function(*args, **kwargs)
419 if (
420 active_session_failed
421 or autologging_is_disabled(autologging_integration)
(...)
428 # warning behavior during original function execution, since autologging is being
429 # skipped
430 with set_non_mlflow_warnings_behavior_for_current_thread(
431 disable_warnings=False,
432 reroute_warnings=False,
433 ):
--> 434 return original(*args, **kwargs)
436 # Whether or not the original / underlying function has been called during the
437 # execution of patched code
438 original_has_been_called = False
File /databricks/python_shell/dbruntime/huggingface_patches/transformers.py:54, in _create_patch_function.<locals>.patched_fit_function(self, *args, **kwargs)
52 call_succeeded = False
53 try:
---> 54 model = original_method(self, *args, **kwargs)
55 call_succeeded = True
56 return model
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/transformers/trainer.py:1539, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1534 self.model_wrapped = self.model
1536 inner_training_loop = find_executable_batch_size(
1537 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1538 )
-> 1539 return inner_training_loop(
1540 args=args,
1541 resume_from_checkpoint=resume_from_checkpoint,
1542 trial=trial,
1543 ignore_keys_for_eval=ignore_keys_for_eval,
1544 )
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/transformers/trainer.py:1656, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1654 model = self.accelerator.prepare(self.model)
1655 else:
-> 1656 model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
1657 else:
1658 # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
1659 model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
1660 self.model, self.optimizer, self.lr_scheduler
1661 )
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/accelerate/accelerator.py:1202, in Accelerator.prepare(self, device_placement, *args)
1200 result = self._prepare_megatron_lm(*args)
1201 else:
-> 1202 result = tuple(
1203 self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
1204 )
1205 result = tuple(self._prepare_one(obj, device_placement=d) for obj, d in zip(result, device_placement))
1207 if tpu_should_fix_optimizer or self.mixed_precision == "fp8":
1208 # 2. grabbing new model parameters
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/accelerate/accelerator.py:1203, in <genexpr>(.0)
1200 result = self._prepare_megatron_lm(*args)
1201 else:
1202 result = tuple(
-> 1203 self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
1204 )
1205 result = tuple(self._prepare_one(obj, device_placement=d) for obj, d in zip(result, device_placement))
1207 if tpu_should_fix_optimizer or self.mixed_precision == "fp8":
1208 # 2. grabbing new model parameters
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/accelerate/accelerator.py:1030, in Accelerator._prepare_one(self, obj, first_pass, device_placement)
1028 return self.prepare_data_loader(obj, device_placement=device_placement)
1029 elif isinstance(obj, torch.nn.Module):
-> 1030 return self.prepare_model(obj, device_placement=device_placement)
1031 elif isinstance(obj, torch.optim.Optimizer):
1032 optimizer = self.prepare_optimizer(obj, device_placement=device_placement)
File /local_disk0/.ephemeral_nfs/envs/pythonEnv-178813df-08aa-41ab-aed7-c8a469946d46/lib/python3.10/site-packages/accelerate/accelerator.py:1270, in Accelerator.prepare_model(self, model, device_placement, evaluation_mode)
1268 model_devices = set(model.hf_device_map.values())
1269 if len(model_devices) > 1 and self.distributed_type != DistributedType.NO:
-> 1270 raise ValueError(
1271 "You can't train a model that has been loaded in 8-bit precision on multiple devices in any distributed mode."
1272 " In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism."
1273 " Therefore you should not specify that you are under any distributed regime in your accelerate config."
1274 )
1275 current_device = list(model_devices)[0]
1276 current_device_index = current_device.index if isinstance(current_device, torch.device) else current_device
ValueError: You can't train a model that has been loaded in 8-bit precision on multiple devices in any distributed mode. In order to use 8-bit models that have been loaded across multiple GPUs the solution is to use Naive Pipeline Parallelism. Therefore you should not specify that you are under any distributed regime in your accelerate config.
The dependent libraries and versions are:
%pip install git+https://github.com/huggingface/peft.git
%pip install datasets==2.12.0 bitsandbytes==0.40.1 einops==0.6.1 trl==0.4.7
%pip install torch==2.0.1 accelerate==0.21.0 transformers==4.31.0
Tagging @younesbelkada since you are an expert on this issue :)
Metadata
Assignees
Labels
No labels