# RTX4090 Laptop Operation Log
```shell
# Switched from the Tsinghua open-source mirror (it returned HTTP 403) to the USTC mirror; the config lives here
cat /etc/apt/sources.list
# Install openssh-server; keep the default port 22 unchanged
sudo apt install openssh-server -y
sudo systemctl enable ssh
sudo systemctl start ssh
# Install the NVIDIA GPU driver and CUDA toolkit via the official repo keyring
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-8
sudo apt-get install -y cuda-drivers
nvidia-smi
# Install nvidia-cuda-toolkit so that nvcc is available
apt install nvidia-cuda-toolkit
nvcc -V
# Create a new directory for the models and other files vLLM will use
mkdir /home/ss/vllm-py12 && cd /home/ss/vllm-py12
# Create a new conda environment; every pip install below runs inside it
conda create -n vllm-py12 python=3.12 -y
conda activate vllm-py12
# Install vLLM
pip install vllm -i http://mirrors.cloud.tencent.com/pypi/simple --extra-index-url https://download.pytorch.org/whl/cu128
# Install modelscope
pip install modelscope -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
# Download the gpt-oss-20b model
modelscope download --model openai-mirror/gpt-oss-20b --local_dir /home/ss/vllm-py12/gpt-oss-20b
# Serving gpt-oss-20b failed: the mobile RTX4090 has only 16 GB of VRAM, but the model needs roughly 16-24 GB
vllm serve \
    /home/ss/vllm-py12/gpt-oss-20b \
    --port 18777 \
    --api-key token_lcfc \
    --served-model-name gpt-oss-20b \
    --gpu-memory-utilization 0.95 \
    --tool-call-parser openai \
    --enable-auto-tool-choice
# Qwen3-8B would also need 16-24 GB of VRAM, so download Qwen3-0.6B instead
modelscope download --model Qwen/Qwen3-0.6B --local_dir /home/ss/vllm-py12/qwen3-06b
# Serve Qwen3-0.6B
vllm serve /home/ss/vllm-py12/qwen3-06b \
    --host 0.0.0.0 \
    --port 8000 \
    --served-model-name Qwen3-0.6B \
    --tensor-parallel-size 1 \
    --dtype auto \
    --gpu-memory-utilization 0.9 \
    --max-model-len 32768 \
    --trust-remote-code
```
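
Once the Qwen3-0.6B server above is up on port 8000, a quick sanity check can go through the OpenAI-compatible API. A minimal sketch with the `openai` Python client; the base URL and the `"EMPTY"` placeholder key are assumptions based on the serve flags above (no `--api-key` was set for this server):

```python
from openai import OpenAI

# Point the client at the vLLM OpenAI-compatible endpoint started above.
client = OpenAI(
    base_url="http://localhost:8000/v1",  # host/port from the serve command
    api_key="EMPTY",                      # server was started without --api-key
)

resp = client.chat.completions.create(
    model="Qwen3-0.6B",  # must match --served-model-name
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
    max_tokens=64,
)
print(resp.choices[0].message.content)
```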
#### Created a new script to test the structured-output bug
```shell
vim /home/ss/vllm-py12/vllm-crash-test.py
```
```python
from enum import Enum

from pydantic import BaseModel

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams


# Define the structured-output schema
class CarType(str, Enum):
    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"


class CarDescription(BaseModel):
    brand: str
    model: str
    car_type: CarType


# Get the JSON schema (computed here but deliberately not used below)
json_schema = CarDescription.model_json_schema()

# Set the prompt
prompt = (
    "Generate a JSON with the brand, model and car_type of "
    "the most iconic car from the 90's"
)


def format_output(title: str, output: str):
    print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}")


def main():
    # 1. Initialize the local LLM and load the local model files
    llm = LLM(
        model="/home/ss/vllm-py12/qwen3-06b",  # path to your local model
        max_model_len=1024,
        enable_prefix_caching=True,
        gpu_memory_utilization=0.9,
    )
    # 2. Build an invalid guided_decoding with no valid field set;
    #    this makes get_structured_output_key() raise ValueError
    guided_decoding_invalid = GuidedDecodingParams(
        json=None,
        json_object=False,
        regex=None,
        choice=None,
        grammar=None,
        structural_tag=None,
    )
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=512,
        guided_decoding=guided_decoding_invalid,  # passed in, but no valid field
    )
    # 3. Generate output (expected to trigger the ValueError)
    try:
        outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
        for output in outputs:
            generated_text = output.outputs[0].text
            format_output("Output", generated_text)
    except Exception as e:
        print(f"Caught expected error: {e}")


if __name__ == "__main__":
    main()
```
#### Reproduction
```shell
python /home/ss/vllm-py12/vllm-crash-test.py
```
```text
/home/ss/vllm-py12/vllm-crash-test.py:50: DeprecationWarning: guided_decoding is deprecated. This will be removed in v0.12.0 or v1.0.0, which ever is soonest. Please use structured_outputs instead.
sampling_params = SamplingParams(
Adding requests: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 93.46it/s]
Processed prompts: 0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s](EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] EngineCore encountered a fatal error.
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] Traceback (most recent call last):
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 701, in run_engine_core
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] engine_core.run_busy_loop()
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 728, in run_busy_loop
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] self._process_engine_step()
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 754, in _process_engine_step
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] outputs, model_executed = self.step_fn()
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 283, in step
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] scheduler_output = self.scheduler.schedule()
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/core/sched/scheduler.py", line 359, in schedule
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] if structured_output_req and structured_output_req.grammar:
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/structured_output/request.py", line 45, in grammar
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] completed = self._check_grammar_completion()
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/structured_output/request.py", line 33, in _check_grammar_completion
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] self._grammar = self._grammar.result(timeout=0.0001)
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/concurrent/futures/_base.py", line 449, in result
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] return self.__get_result()
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] raise self._exception
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/concurrent/futures/thread.py", line 59, in run
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] result = self.fn(*self.args, **self.kwargs)
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/structured_output/__init__.py", line 128, in _async_create_grammar
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] key = request.structured_output_request.structured_output_key # type: ignore[union-attr]
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/functools.py", line 998, in __get__
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] val = self.func(instance)
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/structured_output/request.py", line 58, in structured_output_key
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] return get_structured_output_key(self.sampling_params)
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] File "/root/miniconda3/envs/vllm-py12/lib/python3.12/site-packages/vllm/v1/structured_output/request.py", line 86, in get_structured_output_key
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] raise ValueError("No valid structured output parameter found")
(EngineCore_DP0 pid=190093) ERROR 10-15 10:50:58 [core.py:710] ValueError: No valid structured output parameter found
Caught expected error: EngineCore encountered an issue. See stack trace (above) for the root cause.
Processed prompts: 0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
(vllm-py12) root@ss-IdeaPad-PC:/home/ss/vllm-py12#
```
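
Reading the traceback bottom-up: the grammar future raises `ValueError("No valid structured output parameter found")` inside `get_structured_output_key()`, the scheduler re-raises it while checking `structured_output_req.grammar`, and the whole EngineCore dies instead of just that one request failing. For contrast, a sketch of the valid path: replace step 2 of the script above with a `GuidedDecodingParams` that actually carries the `json_schema` the script already computes (same `llm` and `prompt` as in the script; note the log's DeprecationWarning says `structured_outputs` replaces `guided_decoding` in newer vLLM versions):

```python
# Non-crashing variant of step 2 in vllm-crash-test.py: set exactly one
# valid field so get_structured_output_key() can build a grammar key.
guided_decoding_valid = GuidedDecodingParams(json=json_schema)

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=512,
    guided_decoding=guided_decoding_valid,  # json field set -> no ValueError
)
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)  # expected: JSON matching CarDescription
```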