I can successfully run inference with Llama 3 8B Instruct (quantized) and SBERT locally on a single NVIDIA 4070 Ti GPU (12 GB), but after submitting via GitLab, the evaluation always fails with "CUDA out of memory" (even though the setting provides 4 GPUs). This doesn't seem reasonable. Are there any possible reasons?
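For context, the model is loaded through the standard transformers text-generation pipeline, roughly like this (a minimal sketch, assuming 4-bit bitsandbytes quantization; `MODEL_PATH` is a placeholder and the exact arguments in my code may differ):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

MODEL_PATH = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder; a local path in practice

# 4-bit quantization keeps the 8B weights around ~6 GB, which is why the
# model fits on a single 12 GB card locally.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    quantization_config=bnb_config,
    device_map="auto",  # place/shard the weights over whatever GPUs are visible
)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
```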
What’s in the debug log?
Hi,
Thanks for the reply. The debug log is shown below. The exception itself looks quite common, but what I don't understand is that I can complete the task locally with just one 4070 Ti (12 GB), yet on 4 GPUs it fails with "CUDA out of memory".
Traceback (most recent call last):
  File "run.py", line 3, in <module>
    start_test_client()
  File "/aicrowd_source/client_launcher.py", line 20, in start_test_client
    client.run_agent()
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 193, in run_agent
    raw_response, status, message = self.process_request(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 99, in process_request
    "data": self.route_agent_request(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 129, in route_agent_request
    return self.execute(target_attribute, *args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 142, in execute
    return method(*args, **kwargs)
  File "/aicrowd_source/aicrowd_wrapper.py", line 62, in generate_answer
    return self.agent.generate_answer(
  File "/aicrowd_source/models/rag_llama_model.py", line 172, in generate_answer
    outputs = self.__pipeline(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/text_generation.py", line 240, in __call__
    return super().__call__(text_inputs, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/base.py", line 1242, in __call__
    return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/base.py", line 1249, in run_single
    model_outputs = self.forward(model_inputs, **forward_params)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/base.py", line 1149, in forward
    model_outputs = self._forward(model_inputs, **forward_params)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/text_generation.py", line 327, in _forward
    generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 1622, in generate
    result = self._sample(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 2791, in _sample
    outputs = self(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 1211, in forward
    outputs = self.model(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 1018, in forward
    layer_outputs = decoder_layer(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 756, in forward
    hidden_states = self.mlp(hidden_states)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 240, in forward
    down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 856.00 MiB. GPU
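One possibility worth ruling out: extra GPUs only help if the weights are actually sharded across them. If the model lands entirely on one device (no device_map, or device_map={"": 0}), the other three GPUs sit idle, and the activations plus the KV cache that grow during generation can push that single device over its limit; the failure above is exactly such a runtime allocation inside the Llama MLP, not a failure to load the weights. Here is a minimal diagnostic sketch to run inside the evaluation container (`model` stands for the loaded transformers model; `hf_device_map` only exists when the model was loaded with a device_map):

```python
import torch

def report_gpu_memory():
    """Print free/total memory for every CUDA device the process can see."""
    print("visible GPUs:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        free, total = torch.cuda.mem_get_info(i)
        print(f"  cuda:{i}: {free / 2**30:.1f} GiB free / {total / 2**30:.1f} GiB total")

report_gpu_memory()

# Shows which layers ended up on which device when a device_map was used;
# if every entry says 0, the model is not sharded at all.
# print(model.hf_device_map)
```

If everything turns out to be on cuda:0, loading with device_map="auto" (optionally with an explicit max_memory budget per GPU) usually helps; if the model is already sharded, the next suspects are the prompt length (the retrieved RAG context) and max_new_tokens, both of which inflate the KV cache.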