Why did submission #284598 fail?

I submitted this code. Validation succeeded, but Generate Prediction failed.
I can't find the problem. Why did this submission fail? Could you give me the failed logs?

Hi Tereka,

Honestly, I am not entirely sure what the error is. It looks like a model loading timeout error.

The full logs are given below.

2025-05-14 13:38:12.828
[rank0]: Traceback (most recent call last):
[rank0]:   File "/aicrowd-source/launcher.py", line 213, in <module>
[rank0]:     raise exc
[rank0]:   File "/aicrowd-source/launcher.py", line 198, in <module>
[rank0]:     main()
[rank0]:   File "/aicrowd-source/launcher.py", line 191, in main
[rank0]:     serve()
[rank0]:   File "/aicrowd-source/launcher.py", line 178, in serve
[rank0]:     oracle_client.run_agent()
[rank0]:   File "/usr/local/lib/python3.10/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 195, in run_agent
[rank0]:     raw_response, status, message = self.process_request(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 101, in process_request
[rank0]:     "data": self.route_agent_request(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 131, in route_agent_request
[rank0]:     return self.execute(target_attribute, *args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 144, in execute
[rank0]:     return method(*args, **kwargs)
[rank0]:   File "/aicrowd-source/launcher.py", line 99, in init_agent
[rank0]:     self.agent = run_with_timeout(
[rank0]:   File "/aicrowd-source/launcher.py", line 161, in run_with_timeout
[rank0]:     return fn(*args, **kwargs)
[rank0]:   File "/aicrowd-source/agents/rag_agent.py", line 91, in __init__
[rank0]:     self.initialize_models()
[rank0]:   File "/aicrowd-source/agents/rag_agent.py", line 109, in initialize_models
[rank0]:     self.llm = vllm.LLM(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/utils.py", line 1161, in inner
[rank0]:     return fn(*args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 247, in __init__
[rank0]:     self.llm_engine = LLMEngine.from_engine_args(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 510, in from_engine_args
[rank0]:     return engine_cls.from_vllm_config(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 486, in from_vllm_config
[rank0]:     return cls(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 275, in __init__
[rank0]:     self.model_executor = executor_class(vllm_config=vllm_config)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in __init__
[rank0]:     self._init_executor()
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
[rank0]:     self.collective_rpc("load_model")
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
[rank0]:     answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/utils.py", line 2456, in run_method
[rank0]:     return func(*args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/worker/worker.py", line 203, in load_model
[rank0]:     self.model_runner.load_model()
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1111, in load_model
[rank0]:     self.model = get_model(vllm_config=self.vllm_config)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
[rank0]:     return loader.load_model(vllm_config=vllm_config)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 455, in load_model
[rank0]:     loaded_weights = model.load_weights(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/model_executor/models/mllama.py", line 1514, in load_weights
[rank0]:     weight_loader(param, loaded_weight, shard_id)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py", line 705, in weight_loader
[rank0]:     param_data.copy_(loaded_weight)
[rank0]:   File "/aicrowd-source/launcher.py", line 149, in _timeout_handler
[rank0]:     raise TimeoutError("Operation timed out")
[rank0]: TimeoutError: Operation timed out
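
The bottom of the trace shows that the agent was still copying model weights onto the GPU (param_data.copy_(loaded_weight) in vLLM's weight loader) when _timeout_handler in /aicrowd-source/launcher.py fired and raised the TimeoutError. I haven't looked at the launcher source, but a signal-based wrapper roughly like the sketch below would produce exactly this kind of trace; treat it as an illustration of the mechanism, not the actual implementation.

import signal

def _timeout_handler(signum, frame):
    # The alarm interrupts whichever line happens to be executing at the
    # deadline, which is why the weight-copy frame sits just above this
    # handler in the traceback.
    raise TimeoutError("Operation timed out")

def run_with_timeout(fn, timeout_seconds, *args, **kwargs):
    # Sketch of a SIGALRM-based timeout wrapper (assumed, not the real launcher code).
    old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
    signal.alarm(timeout_seconds)      # schedule the deadline
    try:
        return fn(*args, **kwargs)     # e.g. the agent's __init__ / initialize_models()
    finally:
        signal.alarm(0)                # cancel any pending alarm
        signal.signal(signal.SIGALRM, old_handler)

If that is roughly what the launcher does, the practical reading is that your model loading simply exceeded the initialization time budget, rather than the weights themselves being broken.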

@yilun_jin8
Thank you for sharing. Hmm, that's strange.
Since validation succeeded, the initialization step has already been tested, yet the Generate Prediction step failed. I would expect the two environments to run under the same conditions.

Is there any difference between the validation and Generate Prediction steps?

I don't think so. In any case, model loading should always be the same.

Maybe I can ask the AIcrowd folks to trigger a re-evaluation and see whether this is a rare occurrence rather than a real error.

@yilun_jin8

I don't think so. In any case, model loading should always be the same.

I think so too, as far as the checking procedure is concerned.
I have tried the same evaluation a few times, but it kept failing at the Generate Prediction step…

Could you retry this submission and check the metrics and resources? I suspect that resources are still being held by zombie processes.
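
For what it's worth, I could add something like the snippet below at the top of my agent's initialize_models to record free GPU memory before vLLM starts loading; the log_gpu_memory helper is just my own diagnostic sketch (it uses torch.cuda.mem_get_info), not part of the starter kit.

import torch

def log_gpu_memory(tag: str) -> None:
    # Print free/total memory for every visible GPU so the evaluation logs
    # show whether another process is still holding memory at startup.
    for idx in range(torch.cuda.device_count()):
        free_bytes, total_bytes = torch.cuda.mem_get_info(idx)
        print(f"[{tag}] cuda:{idx} free={free_bytes / 1e9:.1f} GB "
              f"/ total={total_bytes / 1e9:.1f} GB")

# e.g. log_gpu_memory("before vllm.LLM") right before constructing the LLM

If the logs show plenty of free memory right before the timeout, that would point to slow loading rather than held resources.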

@yilun_jin8
Hi, could you retry my submission and investigate?

Hi Tereka,

Last evening, the AIcrowd folks triggered a re-evaluation of submission 284598, and the error is the same. (You can see that the timestamps are different, so it is indeed a re-evaluation.)

2025-05-14 22:17:50.462
[rank0]: Traceback (most recent call last):
[rank0]:   File "/aicrowd-source/launcher.py", line 213, in <module>
[rank0]:     raise exc
[rank0]:   File "/aicrowd-source/launcher.py", line 198, in <module>
[rank0]:     main()
[rank0]:   File "/aicrowd-source/launcher.py", line 191, in main
[rank0]:     serve()
[rank0]:   File "/aicrowd-source/launcher.py", line 178, in serve
[rank0]:     oracle_client.run_agent()
[rank0]:   File "/usr/local/lib/python3.10/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 195, in run_agent
[rank0]:     raw_response, status, message = self.process_request(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 101, in process_request
[rank0]:     "data": self.route_agent_request(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 131, in route_agent_request
[rank0]:     return self.execute(target_attribute, *args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/aicrowd_gym/clients/base_oracle_client.py", line 144, in execute
[rank0]:     return method(*args, **kwargs)
[rank0]:   File "/aicrowd-source/launcher.py", line 99, in init_agent
[rank0]:     self.agent = run_with_timeout(
[rank0]:   File "/aicrowd-source/launcher.py", line 161, in run_with_timeout
[rank0]:     return fn(*args, **kwargs)
[rank0]:   File "/aicrowd-source/agents/rag_agent.py", line 91, in __init__
[rank0]:     self.initialize_models()
[rank0]:   File "/aicrowd-source/agents/rag_agent.py", line 109, in initialize_models
[rank0]:     self.llm = vllm.LLM(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/utils.py", line 1161, in inner
[rank0]:     return fn(*args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 247, in __init__
[rank0]:     self.llm_engine = LLMEngine.from_engine_args(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 510, in from_engine_args
[rank0]:     return engine_cls.from_vllm_config(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 486, in from_vllm_config
[rank0]:     return cls(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 275, in __init__
[rank0]:     self.model_executor = executor_class(vllm_config=vllm_config)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in __init__
[rank0]:     self._init_executor()
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
[rank0]:     self.collective_rpc("load_model")
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
[rank0]:     answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/utils.py", line 2456, in run_method
[rank0]:     return func(*args, **kwargs)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/worker/worker.py", line 203, in load_model
[rank0]:     self.model_runner.load_model()
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/worker/model_runner.py", line 1111, in load_model
[rank0]:     self.model = get_model(vllm_config=self.vllm_config)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
[rank0]:     return loader.load_model(vllm_config=vllm_config)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/model_executor/model_loader/loader.py", line 455, in load_model
[rank0]:     loaded_weights = model.load_weights(
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/model_executor/models/mllama.py", line 1514, in load_weights
[rank0]:     weight_loader(param, loaded_weight, shard_id)
[rank0]:   File "/usr/local/lib/python3.10/site-packages/vllm/model_executor/layers/linear.py", line 705, in weight_loader
[rank0]:     param_data.copy_(loaded_weight)
[rank0]:   File "/aicrowd-source/launcher.py", line 149, in _timeout_handler
[rank0]:     raise TimeoutError("Operation timed out")
[rank0]: TimeoutError: Operation timed out

I have asked the technical folks at AIcrowd to look into what may be going on.