Created
June 17, 2025 23:29
-
-
Save crypdick/1c154b82a8c53fc5f5f2549456a30a7a to your computer and use it in GitHub Desktop.
stack trace for flaky ray data llm workload
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2025-06-17 16:26:19,406 DEBUG streaming_executor.py:546 -- 9: - MapBatches(vLLMEngineStageUDF): Tasks: 24; Actors: 3; Queued blocks: 13; Resources: 0.0 CPU, 3.0 GPU, 768.0MB object store; [8/24 objects local], Blocks Outputted: 0/None | |
2025-06-17 16:26:19,406 DEBUG streaming_executor.py:546 -- 10: - MapBatches(DetokenizeUDF): Tasks: 0; Actors: 1; Queued blocks: 0; Resources: 1.0 CPU, 0.0B object store; [all objects local], Blocks Outputted: 0/None | |
2025-06-17 16:26:19,406 DEBUG streaming_executor.py:546 -- 11: - Map(_postprocess)->Filter(NoneType)->Write: Tasks: 0; Actors: 0; Queued blocks: 0; Resources: 0.0 CPU, 0.0B object store, Blocks Outputted: 0/None | |
2025-06-17 16:26:27,977 ERROR streaming_executor_state.py:519 -- An exception was raised from a task of operator "MapBatches(vLLMEngineStageUDF)". Dataset execution will now abort. To ignore this exception and continue, set DataContext.max_errored_blocks. | |
Traceback (most recent call last): | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/streaming_executor_state.py", line 487, in process_completed_tasks | |
bytes_read = task.on_data_ready( | |
^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py", line 156, in on_data_ready | |
raise ex from None | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/interfaces/physical_operator.py", line 152, in on_data_ready | |
ray.get(block_ref) | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper | |
return fn(*args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper | |
return func(*args, **kwargs) | |
^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/worker.py", line 2851, in get | |
values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/worker.py", line 951, in get_objects | |
raise value.as_instanceof_cause() | |
ray.exceptions.RayTaskError(EngineDeadError): ray::MapBatches(vLLMEngineStageUDF).submit() (pid=3006, ip=10.0.190.249, actor_id=b69436f602ee168b444a002f02000000, repr=MapWorker(MapBatches(vLLMEngineStageUDF))) | |
yield from _map_task( | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 557, in _map_task | |
for b_out in map_transformer.apply_transform(iter(blocks), ctx): | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 532, in __call__ | |
for data in iter: | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 211, in _udf_timed_iter | |
output = next(input) | |
^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 327, in __call__ | |
yield from self._batch_fn(input, ctx) | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 485, in transform_fn | |
raise out_item | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 317, in generate_async | |
output = await self._generate_async(request) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 399, in generate_async_v1 | |
async for request_output in stream: | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 315, in generate | |
out = q.get_nowait() or await q.get() | |
^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/output_processor.py", line 51, in get | |
raise output | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 317, in generate_async | |
output = await self._generate_async(request) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 399, in generate_async_v1 | |
async for request_output in stream: | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 315, in generate | |
out = q.get_nowait() or await q.get() | |
^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/output_processor.py", line 51, in get | |
raise output | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 317, in generate_async | |
output = await self._generate_async(request) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 399, in generate_async_v1 | |
async for request_output in stream: | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 315, in generate | |
out = q.get_nowait() or await q.get() | |
^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/output_processor.py", line 51, in get | |
raise output | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 366, in output_handler | |
outputs = await engine_core.get_output_async() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 806, in get_output_async | |
raise self._format_exception(outputs) from None | |
vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause. | |
2025-06-17 16:26:27,985 DEBUG streaming_executor.py:189 -- Shutting down executor for dataset dataset_14_0 (failed with ray::MapBatches(vLLMEngineStageUDF).submit() (pid=3006, ip=10.0.190.249, actor_id=b69436f602ee168b444a002f02000000, repr=MapWorker(MapBatches(vLLMEngineStageUDF))) | |
yield from _map_task( | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_operator.py", line 557, in _map_task | |
for b_out in map_transformer.apply_transform(iter(blocks), ctx): | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 532, in __call__ | |
for data in iter: | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 211, in _udf_timed_iter | |
output = next(input) | |
^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/execution/operators/map_transformer.py", line 327, in __call__ | |
yield from self._batch_fn(input, ctx) | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/data/_internal/planner/plan_udf_map_op.py", line 485, in transform_fn | |
raise out_item | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 317, in generate_async | |
output = await self._generate_async(request) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 399, in generate_async_v1 | |
async for request_output in stream: | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 315, in generate | |
out = q.get_nowait() or await q.get() | |
^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/output_processor.py", line 51, in get | |
raise output | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 317, in generate_async | |
output = await self._generate_async(request) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 399, in generate_async_v1 | |
async for request_output in stream: | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 315, in generate | |
out = q.get_nowait() or await q.get() | |
^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/output_processor.py", line 51, in get | |
raise output | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 317, in generate_async | |
output = await self._generate_async(request) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/llm/_internal/batch/stages/vllm_engine_stage.py", line 399, in generate_async_v1 | |
async for request_output in stream: | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 315, in generate | |
out = q.get_nowait() or await q.get() | |
^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/output_processor.py", line 51, in get | |
raise output | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 366, in output_handler | |
outputs = await engine_core.get_output_async() | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 806, in get_output_async | |
raise self._format_exception(outputs) from None | |
vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.) | |
2025-06-17 16:26:27,989 INFO streaming_executor.py:228 -- ⚠️ Dataset dataset_14_0 execution failed | |
2025-06-17 16:26:28,006 DEBUG streaming_executor.py:241 -- Shut down operator hierarchy for dataset dataset_14_0 (min/max/total=0.0/0.004/0.005s) | |
2025-06-17 16:26:28,007 DEBUG streaming_executor.py:257 -- Shut down executor for dataset dataset_14_0 (took 0.022s) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment