---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[7], line 24
      2 training_args = GaudiTrainingArguments(
      3     output_dir="./gemma-finetuned",
      4     num_train_epochs=1,
   (...)
     20     report_to = [],
     21 )
     23 # Initialize Trainer
---> 24 trainer = GaudiTrainer(
     25     model=model,
     26     args=training_args,
     27     train_dataset=tokenized_train_dataset,
     28     eval_dataset=tokenized_eval_dataset,
     29     data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
     30 )

File ~/.local/lib/python3.10/site-packages/optimum/habana/transformers/trainer.py:216, in GaudiTrainer.__init__(self, model, gaudi_config, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics)
    212 # Workaround to not set amp backend again when calling super().__init__(...)
    213 # args.bf16 is not used after the __init__ anyway
    214 args.bf16 = False
--> 216 super().__init__(
    217     model,
    218     args,
    219     data_collator,
    220     train_dataset,
    221     eval_dataset,
    222     tokenizer,
    223     model_init,
    224     compute_metrics,
    225     callbacks,
    226     optimizers,
    227     preprocess_logits_for_metrics,
    228 )
    230 if gaudi_config is None:
    231     self.gaudi_config = GaudiConfig.from_pretrained(args.gaudi_config_name)

File ~/.local/lib/python3.10/site-packages/transformers/trainer.py:535, in Trainer.__init__(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics)
    530 # Bnb Quantized models doesn't support `.to` operation.
    531 if (
    532     self.place_model_on_device
    533     and not getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES
    534 ):
--> 535     self._move_model_to_device(model, args.device)
    537 # Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs
    538 if self.is_model_parallel:

File ~/.local/lib/python3.10/site-packages/optimum/habana/transformers/trainer.py:299, in GaudiTrainer._move_model_to_device(self, model, device)
    298 def _move_model_to_device(self, model, device):
--> 299     model = model.to(device)
    300     # Moving a model to HPU disconnects the tied weights, so we have to retie them.
    301     if self.args.use_habana and hasattr(model, "tie_weights"):

File /usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py:173, in wrapped_to(self, *args, **kwargs)
    170 shared_parameters = rearrange_shared_parameters(shared_parameters)
    172 # Call original model.to
--> 173 result = self.original_to(*args, **kwargs)
    175 # Collect all new parameters
    176 for_all_parameters_in_submodules(collect_parameters)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1155, in Module.to(self, *args, **kwargs)
   1151     return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
   1152                 non_blocking, memory_format=convert_to_format)
   1153     return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
-> 1155 return self._apply(convert)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:802, in Module._apply(self, fn, recurse)
    800 if recurse:
    801     for module in self.children():
--> 802         module._apply(fn)

    [... skipping similar frames: Module._apply at line 802 ...]

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:825, in Module._apply(self, fn, recurse)
    821 # Tensors stored in modules are graph leaves, and we don't want to
    822 # track autograd history of `param_applied`, so we have to use
    823 # `with torch.no_grad():`
    824 with torch.no_grad():
--> 825     param_applied = fn(param)
    826 should_use_set_data = compute_should_use_set_data(param, param_applied)
    827 if should_use_set_data:

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1153, in Module.to.<locals>.convert(t)
   1150 if convert_to_format is not None and t.dim() in (4, 5):
   1151     return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
   1152                 non_blocking, memory_format=convert_to_format)
-> 1153 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)

File /usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py:53, in HabanaParameterWrapper.__torch_function__(cls, func, types, args, kwargs)
     51 new_args[0].change_device_placement(new_args[1].device)
     52     return
---> 53 return super().__torch_function__(func, types, new_args, kwargs)

RuntimeError: synStatus=8 [Device not found] Device acquire failed.
When I check using hl-smi, as you can see in the attached image, something is wrong with the Gaudi device allocated to me in the training area.
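For reference, a minimal sketch of how the same failure can be confirmed from Python before constructing the trainer (assuming the standard Habana PyTorch bridge is installed; the helper names below come from habana_frameworks, not from this thread):

import habana_frameworks.torch.hpu as hthpu

# If the Synapse runtime cannot acquire a device, these report False / 0
# instead of raising later inside model.to(device).
print("HPU available:", hthpu.is_available())
print("HPU count:", hthpu.device_count())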
Hi aza,
Thank you for reaching out to us.
Please provide us with the following information:
- System ID/Resource ID:
- Cloud Account ID:
- Region (us-region-1/us-region-2/us-region-3):
- Account tier (Standard/Premium/Enterprise):
Regards,
Zul
- System ID/Resource ID: training
- Cloud Account ID: 216804434122
- Region: us-region-1
- Account tier: Standard
This error occurred during Gaudi training, for a particular container.
Hi aza,
We are investigating this issue and will get back to you soon.
Regards,
Zul
Hello Yash,
Please let us know the IP address of the failing machine. Is it part of the pdx04-k03-hv Harvester cluster?
I am sorry, but I am no longer able to connect to that specific cluster.
If I ever encounter a faulty Gaudi device again, what steps should I take to record all the needed information?
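One possible approach (a sketch only, not official guidance from this thread) is to capture the hl-smi output to a timestamped log file at the moment of failure, so it can be attached to a support ticket. hl-smi is the tool already used above; the wrapper script and file name are just an illustration:

import datetime
import subprocess

# Record when the failure was observed and what hl-smi reported at that time.
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
result = subprocess.run(["hl-smi"], capture_output=True, text=True)

with open(f"gaudi-diagnostics-{stamp}.log", "w") as f:
    f.write(f"Captured at {stamp}\n\n")
    f.write(result.stdout)
    f.write(result.stderr)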