CUDA out of memory issue

I can successfully run inference with Llama 3 8B Instruct (quantized) and SBERT locally on an NVIDIA 3070 Ti GPU (12 GB), but after pushing to GitLab, the submission always fails with the error "CUDA out of memory" (even though I have 4 GPUs in the settings). This doesn't seem reasonable. Are there any possible reasons?
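A quick sanity check is to log what the evaluation container actually sees before loading any model; if fewer GPUs (or less memory) are exposed than expected, the mismatch shows up immediately. This is a minimal sketch using only standard PyTorch calls, nothing specific to the submission setup:

```python
# Print the devices and memory the runtime actually exposes.
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Visible GPUs:   {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    props = torch.cuda.get_device_properties(i)
    print(f"  cuda:{i}: {props.name}, {props.total_memory / 1024**3:.1f} GiB")
```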

What’s in the debug log?

Hi,

Thanks for the reply. The debug log is shown below. The exception itself looks quite common, but what I don't understand is that I can complete the task locally with just one 4070 Ti (12 GB), yet on 4 GPUs it fails with "CUDA out of memory".

```
Traceback (most recent call last):
  File "run.py", line 3, in <module>
    start_test_client()
  File "/aicrowd_source/client_launcher.py", line 20, in start_test_client
    client.run_agent()
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 193, in run_agent
    raw_response, status, message = self.process_request(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 99, in process_request
    "data": self.route_agent_request(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 129, in route_agent_request
    return self.execute(target_attribute, *args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 142, in execute
    return method(*args, **kwargs)
  File "/aicrowd_source/aicrowd_wrapper.py", line 62, in generate_answer
    return self.agent.generate_answer(
  File "/aicrowd_source/models/rag_llama_model.py", line 172, in generate_answer
    outputs = self.__pipeline(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/text_generation.py", line 240, in __call__
    return super().__call__(text_inputs, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/base.py", line 1242, in __call__
    return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/base.py", line 1249, in run_single
    model_outputs = self.forward(model_inputs, **forward_params)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/base.py", line 1149, in forward
    model_outputs = self._forward(model_inputs, **forward_params)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/pipelines/text_generation.py", line 327, in _forward
    generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 1622, in generate
    result = self._sample(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/generation/utils.py", line 2791, in _sample
    outputs = self(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 1211, in forward
    outputs = self.model(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 1018, in forward
    layer_outputs = decoder_layer(
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 756, in forward
    hidden_states = self.mlp(hidden_states)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/aicrowd/.conda/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 240, in forward
    down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 856.00 MiB. GPU
```
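The failing frame is the Llama MLP inside `model.generate`, so it is the activations and growing KV cache during generation, not just the model weights, that exceed the limit. One common cause of "runs on one local GPU but OOMs on four remote GPUs" is loading the model onto a single device (e.g. `device=0` on the pipeline), which leaves the other three GPUs idle. Below is a minimal sketch of sharding across all visible GPUs via `device_map="auto"`, assuming the model is loaded through transformers with accelerate installed (`MODEL_PATH` is a placeholder, not taken from the original code):

```python
# Hedged sketch: shard the model across every visible GPU instead of
# placing the whole thing on cuda:0. Requires the `accelerate` package.
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

MODEL_PATH = "meta-llama/Meta-Llama-3-8B-Instruct"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",      # split layers across all visible GPUs
    torch_dtype="auto",
    low_cpu_mem_usage=True,
)

# No `device=...` argument here: that would conflict with sharded placement.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
outputs = generator("Hello", max_new_tokens=64)  # cap generation length
```

If the model is already sharded, capping the prompt length and `max_new_tokens` bounds the KV cache, which is exactly what grows during the `_sample` call shown in the traceback.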