---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[7], line 24
      2 training_args = GaudiTrainingArguments(
      3     output_dir="./gemma-finetuned",
      4     num_train_epochs=1,
   (...)
     20     report_to = [],
     21 )
     23 # Initialize Trainer
---> 24 trainer = GaudiTrainer(
     25     model=model,
     26     args=training_args,
     27     train_dataset=tokenized_train_dataset,
     28     eval_dataset=tokenized_eval_dataset,
     29     data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
     30 )

File ~/.local/lib/python3.10/site-packages/optimum/habana/transformers/trainer.py:216, in GaudiTrainer.__init__(self, model, gaudi_config, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics)
    212 # Workaround to not set amp backend again when calling super().__init__(...)
    213 # args.bf16 is not used after the __init__ anyway
    214 args.bf16 = False
--> 216 super().__init__(
    217     model,
    218     args,
    219     data_collator,
    220     train_dataset,
    221     eval_dataset,
    222     tokenizer,
    223     model_init,
    224     compute_metrics,
    225     callbacks,
    226     optimizers,
    227     preprocess_logits_for_metrics,
    228 )
    230 if gaudi_config is None:
    231     self.gaudi_config = GaudiConfig.from_pretrained(args.gaudi_config_name)

File ~/.local/lib/python3.10/site-packages/transformers/trainer.py:535, in Trainer.__init__(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics)
    530 # Bnb Quantized models doesn't support `.to` operation.
    531 if (
    532     self.place_model_on_device
    533     and not getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES
    534 ):
--> 535     self._move_model_to_device(model, args.device)
    537 # Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs
    538 if self.is_model_parallel:

File ~/.local/lib/python3.10/site-packages/optimum/habana/transformers/trainer.py:299, in GaudiTrainer._move_model_to_device(self, model, device)
    298 def _move_model_to_device(self, model, device):
--> 299     model = model.to(device)
    300     # Moving a model to HPU disconnects the tied weights, so we have to retie them.
    301     if self.args.use_habana and hasattr(model, "tie_weights"):

File /usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py:173, in wrapped_to(self, *args, **kwargs)
    170 shared_parameters = rearrange_shared_parameters(shared_parameters)
    172 # Call original model.to
--> 173 result = self.original_to(*args, **kwargs)
    175 # Collect all new parameters
    176 for_all_parameters_in_submodules(collect_parameters)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1155, in Module.to(self, *args, **kwargs)
   1151     return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
   1152                 non_blocking, memory_format=convert_to_format)
   1153     return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
-> 1155 return self._apply(convert)

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:802, in Module._apply(self, fn, recurse)
    800 if recurse:
    801     for module in self.children():
--> 802         module._apply(fn)

    [... skipping similar frames: Module._apply at line 802 ...]

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:825, in Module._apply(self, fn, recurse)
    821 # Tensors stored in modules are graph leaves, and we don't want to
    822 # track autograd history of `param_applied`, so we have to use
    823 # `with torch.no_grad():`
    824 with torch.no_grad():
--> 825     param_applied = fn(param)
    826 should_use_set_data = compute_should_use_set_data(param, param_applied)
    827 if should_use_set_data:

File /usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1153, in Module.to.<locals>.convert(t)
   1150 if convert_to_format is not None and t.dim() in (4, 5):
   1151     return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None,
   1152                 non_blocking, memory_format=convert_to_format)
-> 1153 return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)

File /usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/core/weight_sharing.py:53, in HabanaParameterWrapper.__torch_function__(cls, func, types, args, kwargs)
     51 new_args[0].change_device_placement(new_args[1].device)
     52     return
---> 53 return super().__torch_function__(func, types, new_args, kwargs)

RuntimeError: synStatus=8 [Device not found] Device acquire failed.
When I check using hl-smi, as you can see in the attached image, something is wrong with the Gaudi device allocated to me in the training area.
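For reference, a minimal sketch of how the same failure can be confirmed from Python before constructing the trainer (assuming the standard Habana PyTorch bridge is installed; the helper names below come from habana_frameworks, not from this thread):

import habana_frameworks.torch.hpu as hthpu

# If the Synapse runtime cannot acquire a device, these report False / 0
# instead of raising later inside model.to(device).
print("HPU available:", hthpu.is_available())
print("HPU count:", hthpu.device_count())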
Hi aza,
Thank you for reaching out to us.
Please provide us with the following information:
- System ID/Resource ID:
- Cloud Account ID:
- Region (us-region-1/us-region-2/us-region-3):
- Account tier (Standard/Premium/Enterprise):
Regards,
Zul
- System ID/Resource ID: training
- Cloud Account ID: 216804434122
- Region: us-region-1
- Account tier: Standard
This error occurred during Gaudi training, for a particular container.
Hi aza,
We are investigating this issue and will get back to you soon.
Regards,
Zul
Hello Yash,
Please let us know the IP address of the failing machine. Is it part of the pdx04-k03-hv Harvester cluster?
I am sorry, but I am no longer able to connect to that specific cluster.
If I ever encounter a faulty Gaudi device again, what steps should I take to record all the needed information?
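One possible approach (a sketch only, not official guidance from this thread) is to capture the hl-smi output to a timestamped log file at the moment of failure, so it can be attached to a support ticket. hl-smi is the tool already used above; the wrapper script and file name are just an illustration:

import datetime
import subprocess

# Record when the failure was observed and what hl-smi reported at that time.
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
result = subprocess.run(["hl-smi"], capture_output=True, text=True)

with open(f"gaudi-diagnostics-{stamp}.log", "w") as f:
    f.write(f"Captured at {stamp}\n\n")
    f.write(result.stdout)
    f.write(result.stderr)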