Model hallucinates when running parallel requests in vLLM

#12
by vladciocan88 - opened

Hello, and thank you for the awesome work.
I'm running the Instruct model on 2× H200 GPUs in vLLM, mostly for document data extraction.
vLLM config:

Environment=CUDA_VISIBLE_DEVICES=0,1
ExecStart=/home/rocky/miniconda3/envs/vllm/bin/vllm serve \
  Qwen/Qwen3-Omni-30B-A3B-Instruct \
  --port 8001 \
  --tensor-parallel-size 2 \
  --max-model-len 32768 \
  --dtype bfloat16 \
  --allowed-local-media-path / \
  --gpu-memory-utilization 0.90

Request payload parameters:

{
 "temperature":0.0,
 "repetition_penalty":1.2,
 "response_format":"

{
                    "type": "json_schema",
                    "json_schema": {
                        "name": "doc_array",
                        "strict": true,
                        "schema": {
                            "type": "array",
                            "minItems": 1,
                            "maxItems": 5,
                            "items": {
                                "type": "object",
                                "additionalProperties": false,
                                "properties": {
                                    "beneficiary_name": {
                                        "type": "string",
                                        "minLength": 2,
                                        "maxLength":255
                                    },
                                    "supplier_name": {
                                        "type": "string",
                                        "minLength": 2,
                                        "maxLength":255
                                    },
                                    "doc_type": {
                                        "type": "string",
                                        "enum": [
                                            "DOC",
                                            "EXT"
                                        ]
                                    },
                                    "doc_serial_number": {
                                        "type": "string",
                                        "minLength": 3
                                    },
                                    "document_date": {
                                        "type": "string",
                                        "minLength": 4
                                    },
                                    "document_date_iso": {
                                        "type": [
                                            "string",
                                            "null"
                                        ],
                                        "pattern": "^\\d{4}-\\d{2}-\\d{2}$"
                                    },
                                    "total_value": {
                                        "type": [
                                            "number",
                                            "null"
                                        ]
                                    },
                                    "summary": {
                                        "type": "string",
                                        "minLength": 50
                                    },
                                    "documente_referentiate": {
                                        "type": "array",
                                        "items": {
                                            "type": "string"
                                        }
                                    },
                                    "pagini": {
                                        "type": "array",
                                        "minItems": 1,
                                        "items": {
                                            "type": "integer",
                                            "minimum": 1
                                        }
                                    }
                                },
                                "required": [
                                    "beneficiary_name",
                                    "supplier_name",
                                    "doc_type",
                                    "doc_serial_number",
                                    "document_date",
                                    "document_date_iso",
                                    "valoare_totala",
                                    "rezumat",
                                    "documente_referentiate",
                                    "pagini"
                                ]
                            }
                        }
                    }
                }

and sometimes I get something like this:

[{"beneficiary_name": "Beneficiaryname", "supplier_name": "Supplier name", "doc_type": "Act Aditional", "doc_serial_number": "09-24-00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000... [the zeros continue until the context length is exhausted]

Any advice?

Also, I'm getting about 50 tokens/s; is that normal?

Sign up or log in to comment