{ "cells": [ { "cell_type": "code", "execution_count": 8, "id": "405bc169-e0b7-48e6-84b8-4e4a791cf61a", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO 06-09 04:41:03 [__init__.py:243] Automatically detected platform cuda.\n", "INFO 06-09 04:41:06 [__init__.py:31] Available plugins for group vllm.general_plugins:\n", "INFO 06-09 04:41:06 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver\n", "INFO 06-09 04:41:06 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.\n", "INFO 06-09 04:41:07 [api_server.py:1289] vLLM API server version 0.9.0.1\n", "INFO 06-09 04:41:08 [cli_args.py:300] non-default args: {'host': '0.0.0.0', 'task': 'embed', 'trust_remote_code': True, 'enforce_eager': True, 'served_model_name': ['local'], 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.4}\n", "WARNING 06-09 04:41:08 [config.py:3096] Your Quadro RTX 8000 device (with compute capability 7.5) doesn't support torch.bfloat16. Falling back to torch.float16 for compatibility.\n", "WARNING 06-09 04:41:08 [config.py:3135] Casting torch.bfloat16 to torch.float16.\n", "INFO 06-09 04:41:17 [config.py:473] Found sentence-transformers modules configuration.\n", "INFO 06-09 04:41:17 [config.py:493] Found pooling configuration.\n", "WARNING 06-09 04:41:17 [arg_utils.py:1583] Compute Capability < 8.0 is not supported by the V1 Engine. Falling back to V0. \n", "WARNING 06-09 04:41:17 [arg_utils.py:1431] The model has a long context length (40960). This may causeOOM during the initial memory profiling phase, or result in low performance due to small KV cache size. Consider setting --max-model-len to a smaller value.\n", "INFO 06-09 04:41:17 [config.py:1875] Defaulting to use mp for distributed inference\n", "WARNING 06-09 04:41:17 [cuda.py:87] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used\n", "INFO 06-09 04:41:17 [api_server.py:257] Started engine process with PID 84927\n", "INFO 06-09 04:41:21 [__init__.py:243] Automatically detected platform cuda.\n", "INFO 06-09 04:41:24 [__init__.py:31] Available plugins for group vllm.general_plugins:\n", "INFO 06-09 04:41:24 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver\n", "INFO 06-09 04:41:24 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.\n", "INFO 06-09 04:41:24 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.0.1) with config: model='Qwen/Qwen3-Embedding-4B', speculative_config=None, tokenizer='Qwen/Qwen3-Embedding-4B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=40960, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=local, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, pooler_config=PoolerConfig(pooling_type='LAST', normalize=True, softmax=None, step_tag_id=None, returned_token_ids=None), compilation_config={\"compile_sizes\": [], \"inductor_compile_config\": {\"enable_auto_functionalized_v2\": false}, \"cudagraph_capture_sizes\": [], \"max_capture_size\": 0}, use_cached_outputs=True, \n", "WARNING 06-09 04:41:25 [multiproc_worker_utils.py:306] Reducing Torch parallelism from 64 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.\n", "INFO 06-09 04:41:25 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n", "INFO 06-09 04:41:25 [cuda.py:289] Using XFormers backend.\n", "INFO 06-09 04:41:29 [__init__.py:243] Automatically detected platform cuda.\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:32 [multiproc_worker_utils.py:225] Worker ready; awaiting tasks\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:32 [__init__.py:31] Available plugins for group vllm.general_plugins:\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:32 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:32 [__init__.py:36] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:32 [cuda.py:240] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:32 [cuda.py:289] Using XFormers backend.\n", "INFO 06-09 04:41:33 [utils.py:1077] Found nccl from library libnccl.so.2\n", "INFO 06-09 04:41:33 [pynccl.py:69] vLLM is using nccl==2.26.2\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:33 [utils.py:1077] Found nccl from library libnccl.so.2\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:33 [pynccl.py:69] vLLM is using nccl==2.26.2\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:34 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/jovyan/.cache/vllm/gpu_p2p_access_cache_for_0,1.json\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m WARNING 06-09 04:41:34 [custom_all_reduce.py:146] Custom allreduce is disabled because your platform lacks GPU P2P capability or P2P test failed. To silence this warning, specify disable_custom_all_reduce=True explicitly.\n", "INFO 06-09 04:41:34 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/jovyan/.cache/vllm/gpu_p2p_access_cache_for_0,1.json\n", "WARNING 06-09 04:41:34 [custom_all_reduce.py:146] Custom allreduce is disabled because your platform lacks GPU P2P capability or P2P test failed. To silence this warning, specify disable_custom_all_reduce=True explicitly.\n", "INFO 06-09 04:41:34 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_523dee68'), local_subscribe_addr='ipc:///tmp/4d2d0127-8b88-42ce-ba52-5c7e4aac03b6', remote_subscribe_addr=None, remote_addr_ipv6=False)\n", "INFO 06-09 04:41:34 [parallel_state.py:1064] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:34 [parallel_state.py:1064] rank 1 in world size 2 is assigned as DP rank 0, PP rank 0, TP rank 1, EP rank 1\n", "INFO 06-09 04:41:34 [model_runner.py:1170] Starting to load model Qwen/Qwen3-Embedding-4B...\n", "\u001b[1;36m(VllmWorkerProcess pid=85107)\u001b[0;0m INFO 06-09 04:41:34 [model_runner.py:1170] Starting to load model Qwen/Qwen3-Embedding-4B...\n", "INFO 06-09 04:41:34 [weight_utils.py:291] Using model weights format ['*.safetensors']\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00