RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

#3
by Minami-su - opened

Hardware: RTX 3090 GPU x2

from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model = "falcon40binstruction"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
sequences = pipeline(
    "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

The model 'RWForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBertForCausalLM', 'RoFormerForCausalLM', 'RwkvForCausalLM', 'Speech2Text2ForCausalLM', 'TransfoXLLMHeadModel', 'TrOCRForCausalLM', 'XGLMForCausalLM', 'XLMWithLMHeadModel', 'XLMProphetNetForCausalLM', 'XLMRobertaForCausalLM', 'XLMRobertaXLForCausalLM', 'XLNetLMHeadModel', 'XmodForCausalLM'].
Setting pad_token_id to eos_token_id:11 for open-end generation.

RuntimeError Traceback (most recent call last)
Cell In[1], line 16
7 tokenizer = AutoTokenizer.from_pretrained(model)
8 pipeline = transformers.pipeline(
9 "text-generation",
10 model=model,
(...)
14 device_map="auto",
15 )
---> 16 sequences = pipeline(
17 "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
18 max_length=200,
19 do_sample=True,
20 top_k=10,
21 num_return_sequences=1,
22 eos_token_id=tokenizer.eos_token_id,
23 )
24 for seq in sequences:
25 print(f"Result: {seq['generated_text']}")

File ~/autodl-tmp/transformers/pipelines/text_generation.py:201, in TextGenerationPipeline.__call__(self, text_inputs, **kwargs)
160 def __call__(self, text_inputs, **kwargs):
161 """
162 Complete the prompt(s) given as inputs.
163
(...)
199 ids of the generated text.
200 """
--> 201 return super().__call__(text_inputs, **kwargs)

File ~/autodl-tmp/transformers/pipelines/base.py:1118, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1110 return next(
1111 iter(
1112 self.get_iterator(
(...)
1115 )
1116 )
1117 else:
-> 1118 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)

File ~/autodl-tmp/transformers/pipelines/base.py:1125, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1123 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
1124 model_inputs = self.preprocess(inputs, **preprocess_params)
-> 1125 model_outputs = self.forward(model_inputs, **forward_params)
1126 outputs = self.postprocess(model_outputs, **postprocess_params)
1127 return outputs

File ~/autodl-tmp/transformers/pipelines/base.py:1024, in Pipeline.forward(self, model_inputs, **forward_params)
1022 with inference_context():
1023 model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
-> 1024 model_outputs = self._forward(model_inputs, **forward_params)
1025 model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
1026 else:

File ~/autodl-tmp/transformers/pipelines/text_generation.py:263, in TextGenerationPipeline._forward(self, model_inputs, **generate_kwargs)
260 generate_kwargs["min_length"] += prefix_length
262 # BS x SL
--> 263 generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
264 out_b = generated_sequence.shape[0]
265 if self.framework == "pt":

File ~/miniconda3/lib/python3.8/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)

File ~/autodl-tmp/transformers/generation/utils.py:1568, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
1560 input_ids, model_kwargs = self._expand_inputs_for_generation(
1561 input_ids=input_ids,
1562 expand_size=generation_config.num_return_sequences,
1563 is_encoder_decoder=self.config.is_encoder_decoder,
1564 **model_kwargs,
1565 )
1567 # 13. run sample
-> 1568 return self.sample(
1569 input_ids,
1570 logits_processor=logits_processor,
1571 logits_warper=logits_warper,
1572 stopping_criteria=stopping_criteria,
1573 pad_token_id=generation_config.pad_token_id,
1574 eos_token_id=generation_config.eos_token_id,
1575 output_scores=generation_config.output_scores,
1576 return_dict_in_generate=generation_config.return_dict_in_generate,
1577 synced_gpus=synced_gpus,
1578 streamer=streamer,
1579 **model_kwargs,
1580 )
1582 elif is_beam_gen_mode:
1583 if generation_config.num_return_sequences > generation_config.num_beams:

File ~/autodl-tmp/transformers/generation/utils.py:2615, in GenerationMixin.sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2612 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
2614 # forward pass to get next token
-> 2615 outputs = self(
2616 **model_inputs,
2617 return_dict=True,
2618 output_attentions=output_attentions,
2619 output_hidden_states=output_hidden_states,
2620 )
2622 if synced_gpus and this_peer_finished:
2623 continue # don't waste resources running the code we don't need

File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/miniconda3/lib/python3.8/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/falcon40b/modelling_RW.py:759, in RWForCausalLM.forward(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments)
755 raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
757 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--> 759 transformer_outputs = self.transformer(
760 input_ids,
761 past_key_values=past_key_values,
762 attention_mask=attention_mask,
763 head_mask=head_mask,
764 inputs_embeds=inputs_embeds,
765 use_cache=use_cache,
766 output_attentions=output_attentions,
767 output_hidden_states=output_hidden_states,
768 return_dict=return_dict,
769 )
770 hidden_states = transformer_outputs[0]
772 lm_logits = self.lm_head(hidden_states)

File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/.cache/huggingface/modules/transformers_modules/falcon40b/modelling_RW.py:654, in RWModel.forward(self, input_ids, past_key_values, attention_mask, head_mask, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, **deprecated_arguments)
646 outputs = torch.utils.checkpoint.checkpoint(
647 create_custom_forward(block),
648 hidden_states,
(...)
651 head_mask[i],
652 )
653 else:
--> 654 outputs = block(
655 hidden_states,
656 layer_past=layer_past,
657 attention_mask=causal_mask,
658 head_mask=head_mask[i],
659 use_cache=use_cache,
660 output_attentions=output_attentions,
661 alibi=alibi,
662 )
664 hidden_states = outputs[0]
665 if use_cache is True:

File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/miniconda3/lib/python3.8/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/falcon40b/modelling_RW.py:396, in DecoderLayer.forward(self, hidden_states, alibi, attention_mask, layer_past, head_mask, use_cache, output_attentions)
393 residual = hidden_states
395 # Self attention.
--> 396 attn_outputs = self.self_attention(
397 ln_attn,
398 layer_past=layer_past,
399 attention_mask=attention_mask,
400 alibi=alibi,
401 head_mask=head_mask,
402 use_cache=use_cache,
403 output_attentions=output_attentions,
404 )
406 attention_output = attn_outputs[0]
408 outputs = attn_outputs[1:]

File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/miniconda3/lib/python3.8/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/falcon40b/modelling_RW.py:252, in Attention.forward(self, hidden_states, alibi, attention_mask, layer_past, head_mask, use_cache, output_attentions)
242 def forward(
243 self,
244 hidden_states: torch.Tensor,
(...)
250 output_attentions: bool = False,
251 ):
--> 252 fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size]
254 # 3 x [batch_size, seq_length, num_heads, head_dim]
255 (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)

File ~/miniconda3/lib/python3.8/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/miniconda3/lib/python3.8/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
163 output = old_forward(*args, **kwargs)
164 else:
--> 165 output = old_forward(*args, **kwargs)
166 return module._hf_hook.post_forward(module, output)

File ~/.cache/huggingface/modules/transformers_modules/falcon40b/modelling_RW.py:32, in Linear.forward(self, input)
31 def forward(self, input: torch.Tensor) -> torch.Tensor:
---> 32 ret = input @ self.weight.T
33 if self.bias is None:
34 return ret

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling cublasCreate(handle)

Problem solved: switching from 2x RTX 3090 to 2x A40 fixed it.
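
For context: CUBLAS_STATUS_NOT_INITIALIZED raised from cublasCreate(handle) is frequently a symptom of the GPU running out of free memory when cuBLAS tries to allocate its handle and workspace, rather than of a broken CUDA installation. Falcon-40B in bfloat16 needs roughly 80 GB for the weights alone, which does not fit in the 48 GB offered by two RTX 3090s but does fit on two A40s (96 GB), consistent with the fix above. The "RWForCausalLM is not supported for text-generation" message is only a warning triggered by the trust_remote_code model class and is unrelated to the crash. For anyone who cannot change hardware, below is a minimal, untested sketch of loading the model 4-bit quantized with bitsandbytes so that it may fit on 2x 24 GB cards; it assumes bitsandbytes and accelerate are installed and reuses the local model path from the original post.

# Untested sketch: 4-bit loading via bitsandbytes to reduce memory pressure.
# Assumes `pip install bitsandbytes accelerate` and a recent transformers release.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
import torch

model_path = "falcon40binstruction"  # local path used in the original post

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit NF4
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # do the matmuls in bf16
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",  # shard the quantized weights across both GPUs
)

pipe = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer)

Expect some quality loss from 4-bit quantization. If generation still fails, checking free memory per device with torch.cuda.mem_get_info() before and after loading helps confirm whether memory exhaustion is the culprit.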

Minami-su changed discussion status to closed
