nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:1.0.1: speculative decoding crash with multiple MPI processes

/workspace$ cat > config.yaml << 'EOF'
speculative_config:
  decoding_type: Eagle3
  max_draft_len: 4
  speculative_model: yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
kv_cache_config:
  free_gpu_memory_fraction: 0.70
  dtype: fp8
  enable_block_reuse: false
trust_remote_code: true
EOF
python3 -m dynamo.trtllm \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --tensor-parallel-size 2 \
  --extra-engine-args config.yaml
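
The crash (full log and traceback below) happens inside `LLM(**engine_args)` during engine construction, before any request is served, so it should be reproducible without the Dynamo wrapper. The following is a minimal sketch of the same constructor call, not a verified repro: the import location of `Eagle3DecodingConfig` is an assumption inferred from the `llm_args.py` frames in the traceback, and the `KvCacheConfig` fields are taken from the engine-args dump in the log.

```python
# Hypothetical standalone repro, bypassing dynamo.trtllm entirely.
# Import paths are assumptions based on the traceback and engine-args
# dump in this log; adjust to the installed tensorrt_llm 1.3.0rc5.post1.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.llm_args import Eagle3DecodingConfig  # assumed location

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    tensor_parallel_size=2,  # spawns multiple MPI ranks, as in the crash
    kv_cache_config=KvCacheConfig(
        free_gpu_memory_fraction=0.70,
        dtype="fp8",
        enable_block_reuse=False,
    ),
    speculative_config=Eagle3DecodingConfig(
        max_draft_len=4,
        speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
    ),
    trust_remote_code=True,
)
```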
/opt/dynamo/venv/lib/python3.12/site-packages/torch/library.py:356: UserWarning: Warning only once for all operators, other operators may also be overridden.
Overriding a previously registered kernel for the same operator and the same dispatch key
operator: flash_attn::_flash_attn_backward(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor(a6!)? dq, Tensor(a7!)? dk, Tensor(a8!)? dv, float dropout_p, float softmax_scale, bool causal, SymInt window_size_left, SymInt window_size_right, float softcap, Tensor? alibi_slopes, bool deterministic, Tensor? rng_state=None) -> Tensor
registered at /opt/dynamo/venv/lib/python3.12/site-packages/torch/_library/custom_ops.py:922
dispatch key: ADInplaceOrView
previous kernel: no debug info
new kernel: registered at /opt/dynamo/venv/lib/python3.12/site-packages/torch/_library/custom_ops.py:922 (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/core/dispatch/OperatorEntry.cpp:208.)
self.m.impl(
/opt/dynamo/venv/lib/python3.12/site-packages/modelopt/torch/__init__.py:36: UserWarning: transformers version 4.57.1 is incompatible with nvidia-modelopt and may cause issues. Please install recommended version with `pip install nvidia-modelopt[hf]` if working with HF models.
_warnings.warn(
[TensorRT-LLM] TensorRT LLM version: 1.3.0rc5.post1
[2026-03-16 22:08:58] INFO __init__.py:53: dynamo.nixl_connect: Utilizing CuPy to enable GPU acceleration.
2026-03-16T22:08:58.249038Z INFO dynamo_runtime::distributed: Initializing KV store discovery backend: Etcd(http://localhost:2379)
2026-03-16T22:08:58.252803Z INFO dynamo_runtime::pipeline::network::manager: Initializing NetworkManager with TCP request plane mode=tcp host=10.42.133.200 port=OS-assigned
2026-03-16T22:08:58.253568Z INFO graceful_shutdown.install_signal_handlers: Signal handlers set up for graceful shutdown (discovery unregister + grace period)
2026-03-16T22:08:58.253670Z INFO main.worker: Initializing the worker with config: Config(allowed_local_media_path='', attn_backend='VANILLA', component='tensorrt_llm', connector=[], custom_jinja_template=None, default_guidance_scale=5.0, default_height=480, default_num_frames=81, default_num_inference_steps=50, default_width=832, disable_request_abort=True, disable_torch_compile=False, disaggregation_mode=<DisaggregationMode.AGGREGATED: 'prefill_and_decode'>, discovery_backend='etcd', dit_cfg_size=1, dit_dp_size=1, dit_fsdp_size=1, dit_ring_size=1, dit_tp_size=1, dit_ulysses_size=1, dump_config_to=None, durable_kv_events=False, dyn_reasoning_parser=None, dyn_tool_call_parser=None, enable_async_cpu_offload=False, enable_attention_dp=False, enable_cuda_graph=False, enable_fullgraph=False, enable_layerwise_nvtx_marker=False, enable_local_indexer=True, enable_teacache=False, encode_endpoint='', endpoint='generate', endpoint_types='chat,completions', event_plane='nats', expert_parallel_size=None, extra_engine_args='config.yaml', free_gpu_memory_fraction=0.9, fuse_qkv=True, gpus_per_node=None, guided_decoding_backend=None, kv_block_size=32, max_batch_size=2048, max_beam_width=1, max_file_size_mb=50, max_num_tokens=8192, max_seq_len=None, media_output_fs_url='file:///tmp/dynamo_media', media_output_http_url=None, modality=<Modality.TEXT: 'text'>, model='meta-llama/Llama-3.1-8B-Instruct', multimodal_embedding_cache_capacity_gb=0, namespace='dynamo', output_modalities=['text'], override_engine_args='', pipeline_parallel_size=1, publish_events_and_metrics=False, quant_algo=None, quant_dynamic=True, request_plane='tcp', revision=None, served_model_name=None, skip_components='', teacache_thresh=0.2, teacache_use_ret_steps=True, tensor_parallel_size=2, torch_compile_mode='default', torch_dtype='bfloat16', use_kv_events=False, warmup_steps=1)
2026-03-16T22:08:58.253709Z INFO __init__.init_worker: Initializing worker with modality=Modality.TEXT
2026-03-16T22:08:58.254727Z INFO __init__: Loaded nixl API module: <module 'nixl_cu13._api' from '/opt/dynamo/venv/lib/python3.12/site-packages/nixl_cu13/_api.py'>
2026-03-16T22:08:58.294925Z WARN consolidator_config.should_enable_consolidator: KV Event Consolidator is not enabled: kv_connector_config is not a dict
2026-03-16T22:08:58.295034Z INFO llm_worker.init_llm_worker: TensorRT-LLM engine args: {'model': 'meta-llama/Llama-3.1-8B-Instruct', 'scheduler_config': SchedulerConfig(capacity_scheduler_policy=<CapacitySchedulerPolicy.GUARANTEED_NO_EVICT: 'GUARANTEED_NO_EVICT'>, context_chunking_policy=None, dynamic_batch_config=DynamicBatchConfig(enable_batch_size_tuning=True, enable_max_num_tokens_tuning=False, dynamic_batch_moving_average_window=128), waiting_queue_policy=<WaitingQueuePolicy.FCFS: 'fcfs'>), 'tensor_parallel_size': 2, 'pipeline_parallel_size': 1, 'moe_expert_parallel_size': None, 'enable_attention_dp': False, 'backend': <Backend.PYTORCH: 'pytorch'>, 'kv_cache_config': KvCacheConfig(enable_block_reuse=False, max_tokens=None, max_attention_window=None, sink_token_length=None, free_gpu_memory_fraction=0.7, host_cache_size=None, onboard_blocks=True, cross_kv_cache_fraction=None, secondary_offload_min_priority=None, event_buffer_max_size=0, attention_dp_events_gather_period_ms=5, enable_partial_reuse=True, copy_on_partial_reuse=True, use_uvm=False, max_gpu_total_bytes=0, dtype='fp8', mamba_ssm_cache_dtype='auto', tokens_per_block=32, use_kv_cache_manager_v2=False, max_util_for_resume=0.95), 'gpus_per_node': 2, 'max_num_tokens': 8192, 'max_seq_len': None, 'max_beam_width': 1, 'max_batch_size': 2048, 'return_perf_metrics': False, 'enable_iter_perf_stats': False, 'kv_connector_config': None, 'speculative_config': Eagle3DecodingConfig(max_draft_len=4, max_total_draft_tokens=4, speculative_model='yuhuili/EAGLE3-LLaMA3.1-Instruct-8B', max_concurrency=None, draft_len_schedule=None, load_format=None, acceptance_window=None, acceptance_length_threshold=None, allow_advanced_sampling=False, eagle_choices=None, greedy_sampling=True, posterior_threshold=None, use_dynamic_tree=False, dynamic_tree_max_topK=None, num_eagle_layers=4, max_non_leaves_per_layer=None, eagle3_one_model=True, eagle3_layers_to_capture=None, eagle3_model_arch='llama3'), 'trust_remote_code': True}
2026-03-16T22:08:58.747407Z INFO llm_worker.init_llm_worker: Registering model with endpoint types: chat,completions
2026-03-16T22:08:58.747454Z INFO llm_worker.init_llm_worker: Initializing NIXL Connect.
[03/16/2026-22:08:59] [TRT-LLM] [I] Using LLM with PyTorch backend
[03/16/2026-22:08:59] [TRT-LLM] [I] neither checkpoint_format nor checkpoint_loader were provided, checkpoint_format will be set to HF.
/opt/dynamo/venv/lib/python3.12/site-packages/torch/library.py:356: UserWarning: Warning only once for all operators, other operators may also be overridden.
Overriding a previously registered kernel for the same operator and the same dispatch key
operator: flash_attn::_flash_attn_backward(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor(a6!)? dq, Tensor(a7!)? dk, Tensor(a8!)? dv, float dropout_p, float softmax_scale, bool causal, SymInt window_size_left, SymInt window_size_right, float softcap, Tensor? alibi_slopes, bool deterministic, Tensor? rng_state=None) -> Tensor
registered at /opt/dynamo/venv/lib/python3.12/site-packages/torch/_library/custom_ops.py:922
dispatch key: ADInplaceOrView
previous kernel: no debug info
new kernel: registered at /opt/dynamo/venv/lib/python3.12/site-packages/torch/_library/custom_ops.py:922 (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/core/dispatch/OperatorEntry.cpp:208.)
self.m.impl(
/opt/dynamo/venv/lib/python3.12/site-packages/torch/library.py:356: UserWarning: Warning only once for all operators, other operators may also be overridden.
Overriding a previously registered kernel for the same operator and the same dispatch key
operator: flash_attn::_flash_attn_backward(Tensor dout, Tensor q, Tensor k, Tensor v, Tensor out, Tensor softmax_lse, Tensor(a6!)? dq, Tensor(a7!)? dk, Tensor(a8!)? dv, float dropout_p, float softmax_scale, bool causal, SymInt window_size_left, SymInt window_size_right, float softcap, Tensor? alibi_slopes, bool deterministic, Tensor? rng_state=None) -> Tensor
registered at /opt/dynamo/venv/lib/python3.12/site-packages/torch/_library/custom_ops.py:922
dispatch key: ADInplaceOrView
previous kernel: no debug info
new kernel: registered at /opt/dynamo/venv/lib/python3.12/site-packages/torch/_library/custom_ops.py:922 (Triggered internally at /opt/pytorch/pytorch/aten/src/ATen/core/dispatch/OperatorEntry.cpp:208.)
self.m.impl(
Multiple distributions found for package optimum. Picked distribution: optimum
Multiple distributions found for package optimum. Picked distribution: optimum
Multiple distributions found for package modelopt. Picked distribution: nvidia-modelopt
Multiple distributions found for package modelopt. Picked distribution: nvidia-modelopt
/opt/dynamo/venv/lib/python3.12/site-packages/modelopt/torch/__init__.py:36: UserWarning: transformers version 4.57.1 is incompatible with nvidia-modelopt and may cause issues. Please install recommended version with `pip install nvidia-modelopt[hf]` if working with HF models.
_warnings.warn(
/opt/dynamo/venv/lib/python3.12/site-packages/modelopt/torch/__init__.py:36: UserWarning: transformers version 4.57.1 is incompatible with nvidia-modelopt and may cause issues. Please install recommended version with `pip install nvidia-modelopt[hf]` if working with HF models.
_warnings.warn(
[TensorRT-LLM] TensorRT LLM version: 1.3.0rc5.post1
[TensorRT-LLM] TensorRT LLM version: 1.3.0rc5.post1
[2026-03-16 22:09:08] INFO __init__.py:53: dynamo.nixl_connect: Utilizing CuPy to enable GPU acceleration.
[2026-03-16 22:09:08] INFO __init__.py:53: dynamo.nixl_connect: Utilizing CuPy to enable GPU acceleration.
2026-03-16T22:09:09.157183Z ERROR engine.get_llm_engine: Error in engine context: argument should be a str or an os.PathLike object where __fspath__ returns a str, not 'NoneType'
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/trtllm/__main__.py", line 12, in <module>
    main()
  File "/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/trtllm/main.py", line 37, in main
    uvloop.run(worker())
  File "/opt/dynamo/venv/lib/python3.12/site-packages/uvloop/__init__.py", line 109, in run
    return __asyncio.run(
           ^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/asyncio/runners.py", line 194, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
  File "/opt/dynamo/venv/lib/python3.12/site-packages/uvloop/__init__.py", line 61, in wrapper
    return await main
           ^^^^^^^^^^
  File "/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/trtllm/main.py", line 33, in worker
    await init_worker(runtime, config, shutdown_event, shutdown_endpoints)
  File "/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/trtllm/workers/__init__.py", line 64, in init_worker
    await init_llm_worker(runtime, config, shutdown_event, shutdown_endpoints)
  File "/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/trtllm/workers/llm_worker.py", line 343, in init_llm_worker
    async with get_llm_engine(
  File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
    return await anext(self.gen)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/trtllm/engine.py", line 185, in get_llm_engine
    await engine.initialize()
  File "/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/trtllm/engine.py", line 94, in initialize
    self._llm = self._llm_cls(**self.engine_args)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/dynamo/venv/lib/python3.12/site-packages/tensorrt_llm/llmapi/llm.py", line 1286, in __init__
    super().__init__(model, tokenizer, tokenizer_mode, skip_tokenizer_init,
  File "/opt/dynamo/venv/lib/python3.12/site-packages/tensorrt_llm/llmapi/llm.py", line 1172, in __init__
    super().__init__(model,
  File "/opt/dynamo/venv/lib/python3.12/site-packages/tensorrt_llm/llmapi/llm.py", line 252, in __init__
    self._build_model()
  File "/opt/dynamo/venv/lib/python3.12/site-packages/tensorrt_llm/llmapi/llm.py", line 1213, in _build_model
    super()._build_model()
  File "/opt/dynamo/venv/lib/python3.12/site-packages/tensorrt_llm/llmapi/llm.py", line 852, in _build_model
    self._engine_dir, self._hf_model_dir = model_loader()
                                           ^^^^^^^^^^^^^^
  File "/opt/dynamo/venv/lib/python3.12/site-packages/tensorrt_llm/llmapi/llm_utils.py", line 728, in __call__
    self._hf_model_dir = self._download_hf_model_if_needed(
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/dynamo/venv/lib/python3.12/site-packages/tensorrt_llm/llmapi/llm_utils.py", line 698, in _download_hf_model_if_needed
    model_obj.model_dir = model_dir
    ^^^^^^^^^^^^^^^^^^^
  File "/opt/dynamo/venv/lib/python3.12/site-packages/tensorrt_llm/llmapi/llm_args.py", line 1971, in model_dir
    model_dir = Path(model_dir)
                ^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/pathlib.py", line 1164, in __init__
    super().__init__(*args)
  File "/usr/lib/python3.12/pathlib.py", line 373, in __init__
    raise TypeError(
TypeError: argument should be a str or an os.PathLike object where __fspath__ returns a str, not 'NoneType'
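
The root cause visible in the traceback: `_download_hf_model_if_needed` assigns `model_obj.model_dir = model_dir` while `model_dir` is still `None`, so the `model_dir` property setter's `Path(model_dir)` call raises the `TypeError`. The title suggests this only surfaces once tensor parallelism spawns multiple MPI processes, which points at the Hugging Face hub-ID resolution step in the child ranks. A possible workaround under that assumption (untested in this container) is to pre-download both the target and the EAGLE3 draft model and pass concrete local paths, so no rank needs to resolve a hub ID at engine-build time:

```python
# Possible workaround sketch (untested here): pre-fetch both models with
# huggingface_hub so every MPI rank receives a local directory path
# instead of a hub ID that must be resolved during engine build.
from huggingface_hub import snapshot_download

target_dir = snapshot_download("meta-llama/Llama-3.1-8B-Instruct")
draft_dir = snapshot_download("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B")

print(target_dir)  # pass to --model
print(draft_dir)   # pass as speculative_config.speculative_model in config.yaml
```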