@samos123
Created September 29, 2024 02:48
WARNING 09-28 19:47:27 preprocess.py:86] Falling back on <BOS> for decoder start token id because decoder start token id is not available.
ERROR 09-28 19:47:28 async_llm_engine.py:61] Engine background task failed
ERROR 09-28 19:47:28 async_llm_engine.py:61] Traceback (most recent call last):
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 51, in _log_task_completion
ERROR 09-28 19:47:28 async_llm_engine.py:61] return_value = task.result()
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 755, in run_engine_loop
ERROR 09-28 19:47:28 async_llm_engine.py:61] result = task.result()
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 678, in engine_step
ERROR 09-28 19:47:28 async_llm_engine.py:61] request_outputs = await self.engine.step_async(virtual_engine)
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 343, in step_async
ERROR 09-28 19:47:28 async_llm_engine.py:61] outputs = await self.model_executor.execute_model_async(
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/gpu_executor.py", line 185, in execute_model_async
ERROR 09-28 19:47:28 async_llm_engine.py:61] output = await make_async(self.driver_worker.execute_model
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/lib/python3.12/concurrent/futures/thread.py", line 58, in run
ERROR 09-28 19:47:28 async_llm_engine.py:61] result = self.fn(*self.args, **self.kwargs)
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 327, in execute_model
ERROR 09-28 19:47:28 async_llm_engine.py:61] output = self.model_runner.execute_model(
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
ERROR 09-28 19:47:28 async_llm_engine.py:61] return func(*args, **kwargs)
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/enc_dec_model_runner.py", line 223, in execute_model
ERROR 09-28 19:47:28 async_llm_engine.py:61] output: SamplerOutput = self.model.sample(
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mllama.py", line 940, in sample
ERROR 09-28 19:47:28 async_llm_engine.py:61] next_tokens = self.sampler(logits, sampling_metadata)
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
ERROR 09-28 19:47:28 async_llm_engine.py:61] return self._call_impl(*args, **kwargs)
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1562, in _call_impl
ERROR 09-28 19:47:28 async_llm_engine.py:61] return forward_call(*args, **kwargs)
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/sampler.py", line 231, in forward
ERROR 09-28 19:47:28 async_llm_engine.py:61] self._init_sampling_tensors(logits, sampling_metadata)
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/sampler.py", line 195, in _init_sampling_tensors
ERROR 09-28 19:47:28 async_llm_engine.py:61] do_min_p) = SamplingTensors.from_sampling_metadata(
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/sampling_metadata.py", line 470, in from_sampling_metadata
ERROR 09-28 19:47:28 async_llm_engine.py:61] sampling_tensors = SamplingTensors.from_lists(
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/sampling_metadata.py", line 528, in from_lists
ERROR 09-28 19:47:28 async_llm_engine.py:61] temperatures_t = torch.tensor(
ERROR 09-28 19:47:28 async_llm_engine.py:61] ^^^^^^^^^^^^^
ERROR 09-28 19:47:28 async_llm_engine.py:61] RuntimeError: CUDA error: an illegal memory access was encountered
ERROR 09-28 19:47:28 async_llm_engine.py:61] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
ERROR 09-28 19:47:28 async_llm_engine.py:61] For debugging consider passing CUDA_LAUNCH_BLOCKING=1
ERROR 09-28 19:47:28 async_llm_engine.py:61] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
ERROR 09-28 19:47:28 async_llm_engine.py:61]
Exception in callback functools.partial(<function _log_task_completion at 0x7dbfb52dae80>, error_callback=<bound method AsyncLLMEngine._error_callback of <vllm.engine.async_llm_engine.AsyncLLMEngine object at 0x7dbfb1737170>>)
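The error message above suggests rerunning with CUDA_LAUNCH_BLOCKING=1, which forces CUDA calls to execute synchronously so the illegal memory access is reported at the kernel that actually faulted, rather than at a later unrelated call such as the torch.tensor construction in the traceback. Below is a minimal, hedged repro sketch: the model name is an assumption inferred from mllama.py appearing in the trace, and the prompt is a placeholder, since the original request is not shown in this log.

import os

# Must be set before any CUDA context is created (i.e. before torch/vllm
# touch the GPU), otherwise it has no effect.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from vllm import LLM, SamplingParams

# Hypothetical repro setup; substitute whichever model and prompt
# actually triggered the crash.
llm = LLM(model="meta-llama/Llama-3.2-11B-Vision-Instruct")
outputs = llm.generate(
    ["Describe the image."],  # placeholder prompt
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)

With launch blocking enabled, the resulting stack trace should point much closer to the kernel that performed the illegal access, which makes it easier to tell whether the fault originates in the mllama sampling path or earlier in model execution.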