2025-10-15 10:25:35 +08:00
|
|
|
|
# RTX4090笔电操作记录
|
|
|
|
|
|
|
2025-10-13 17:22:20 +08:00
|
|
|
|
```shell
|
2025-10-14 11:22:34 +08:00
|
|
|
|
# 因清华大学开源镜像站 HTTP/403 换了中科大的镜像站,配置信息存放在这里
|
2025-10-14 11:22:00 +08:00
|
|
|
|
cat /etc/apt/sources.list
|
2025-10-14 15:29:01 +08:00
|
|
|
|
|
|
|
|
|
|
# 安装 openssh 端口号是默认的 22 没有修改
|
|
|
|
|
|
sudo apt install openssh-server -y
|
|
|
|
|
|
sudo systemctl enable ssh
|
|
|
|
|
|
sudo systemctl start ssh
|
|
|
|
|
|
|
2025-10-14 15:49:42 +08:00
|
|
|
|
# 安装 NVIDIA 显卡驱动和 CUDA Toolkit 12.8
|
2025-10-13 17:59:23 +08:00
|
|
|
|
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
|
|
|
|
|
|
sudo dpkg -i cuda-keyring_1.1-1_all.deb
|
|
|
|
|
|
sudo apt-get update
|
|
|
|
|
|
sudo apt-get -y install cuda-toolkit-12-8
|
|
|
|
|
|
sudo apt-get install -y cuda-drivers
|
|
|
|
|
|
nvidia-smi
|
2025-10-14 15:49:42 +08:00
|
|
|
|
|
|
|
|
|
|
# 安装 nvidia-cuda-toolkit
|
2025-10-14 11:10:26 +08:00
|
|
|
|
apt install nvidia-cuda-toolkit
|
2025-10-13 17:59:23 +08:00
|
|
|
|
nvcc -V
|
2025-10-14 15:29:01 +08:00
|
|
|
|
|
2025-10-14 15:46:59 +08:00
|
|
|
|
# 创建了一个新的目录,用于存储 vllm 使用的模型或其他文件
|
|
|
|
|
|
mkdir /home/ss/vllm-py12 && cd /home/ss/vllm-py12
|
2025-10-14 15:29:01 +08:00
|
|
|
|
|
2025-10-14 15:46:59 +08:00
|
|
|
|
# 用 conda 建了个新环境,以下 pip install 都是在该环境执行的
|
|
|
|
|
|
conda create -n vllm-py12 python=3.12 -y
|
2025-10-14 11:25:52 +08:00
|
|
|
|
conda activate vllm-py12
|
2025-10-14 15:29:01 +08:00
|
|
|
|
|
|
|
|
|
|
# 安装 vllm
|
2025-10-14 14:28:01 +08:00
|
|
|
|
pip install vllm -i http://mirrors.cloud.tencent.com/pypi/simple --extra-index-url https://download.pytorch.org/whl/cu128
|
2025-10-14 15:29:01 +08:00
|
|
|
|
|
|
|
|
|
|
# 安装 modelscope
|
|
|
|
|
|
pip install modelscope -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
|
|
|
|
|
|
|
2025-10-14 15:40:03 +08:00
|
|
|
|
# 拉取 gpt-oss-20b 模型
|
2025-10-14 15:39:47 +08:00
|
|
|
|
modelscope download --model openai-mirror/gpt-oss-20b --local_dir /home/ss/vllm-py12/gpt-oss-20b
|
2025-10-15 09:46:19 +08:00
|
|
|
|
|
2025-10-15 10:09:20 +08:00
|
|
|
|
# 运行 gpt-oss-20b 模型失败:移动端的 RTX4090 只有 16GB 显存,而该模型需要约 16~24GB 显存
|
2025-10-15 09:46:19 +08:00
|
|
|
|
# Launch the vLLM server for the local gpt-oss-20b checkpoint on port 18777,
# with automatic tool choice enabled and the "openai" tool-call parser.
vllm serve \
  /home/ss/vllm-py12/gpt-oss-20b \
  --port 18777 \
  --api-key token_lcfc \
  --served-model-name gpt-oss-20b \
  --gpu-memory-utilization 0.95 \
  --tool-call-parser openai \
  --enable-auto-tool-choice
|
|
|
|
|
|
|
|
|
|
|
|
# Qwen3-8b 也需要 16~24GB显存,所以下载了 Qwen3-0.6B
|
|
|
|
|
|
modelscope download --model Qwen/Qwen3-0.6B --local_dir /home/ss/vllm-py12/qwen3-06b
|
|
|
|
|
|
|
|
|
|
|
|
# 运行 Qwen3-0.6B(Qwen3-8b 显存不足,改用 0.6B)
|
2025-10-15 10:30:12 +08:00
|
|
|
|
# Launch the vLLM server for the local qwen3-06b checkpoint, listening on
# all interfaces (0.0.0.0) at port 8000 and exposed as "Qwen3-0.6B".
vllm serve /home/ss/vllm-py12/qwen3-06b \
  --host 0.0.0.0 \
  --port 8000 \
  --served-model-name Qwen3-0.6B \
  --tensor-parallel-size 1 \
  --dtype auto \
  --gpu-memory-utilization 0.9 \
  --max-model-len 32768 \
  --trust-remote-code
|
|
|
|
|
|
|
2025-10-15 10:32:37 +08:00
|
|
|
|
```
|
|
|
|
|
|
#### 新建了一个脚本去测试结构化输出函数的bug
|
|
|
|
|
|
|
|
|
|
|
|
```shell
|
|
|
|
|
|
vim /home/ss/vllm-py12/vllm-crash-test.py
|
|
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
```python
|
|
|
|
|
|
from enum import Enum
|
|
|
|
|
|
from pydantic import BaseModel
|
|
|
|
|
|
from vllm import LLM, SamplingParams
|
|
|
|
|
|
from vllm.sampling_params import GuidedDecodingParams
|
|
|
|
|
|
|
2025-10-15 10:53:17 +08:00
|
|
|
|
# 定义结构化输出 schema
|
2025-10-15 10:32:37 +08:00
|
|
|
|
class CarType(str, Enum):
    """Closed set of car body styles allowed in the structured output.

    Mixing in ``str`` makes every member a string equal to its value.
    """

    sedan = "sedan"
    suv = "SUV"
    truck = "Truck"
    coupe = "Coupe"
|
|
|
|
|
|
|
|
|
|
|
|
class CarDescription(BaseModel):
    """Pydantic model describing one car: brand, model name and body style."""

    brand: str
    model: str
    car_type: CarType
|
|
|
|
|
|
|
2025-10-15 10:53:17 +08:00
|
|
|
|
# Derive the JSON schema of the structured-output model.
# NOTE(review): main() as visible in this script never reads json_schema;
# confirm whether it is still needed.
json_schema = CarDescription.model_json_schema()

# Prompt sent to the model.
prompt = (
    "Generate a JSON with the brand, model and car_type of "
    "the most iconic car from the 90's"
)
|
|
|
|
|
|
|
|
|
|
|
|
def format_output(title: str, output: str) -> None:
    """Print ``title: output`` framed above and below by a 50-dash rule."""
    print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}")
|
|
|
|
|
|
|
|
|
|
|
|
def main() -> None:
    """Try to reproduce the structured-output failure with an empty config.

    Loads the local model, builds a ``GuidedDecodingParams`` in which no
    constraint field is set, and runs one generation with it. Any exception
    raised by the generation is caught and printed instead of propagating.
    """
    # 1. Initialise the local LLM from the model files on disk.
    llm = LLM(
        model="/home/ss/vllm-py12/qwen3-06b",  # path to the local model
        max_model_len=1024,
        enable_prefix_caching=True,
        gpu_memory_utilization=0.9,
    )

    # 2. Build an invalid guided_decoding config: no field carries a value.
    #    Per the original author's note, this is expected to make
    #    get_structured_output_key() raise ValueError.
    guided_decoding_invalid = GuidedDecodingParams(
        json=None,
        json_object=False,
        regex=None,
        choice=None,
        grammar=None,
        structural_tag=None,
    )

    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=512,
        guided_decoding=guided_decoding_invalid,  # passed, but empty
    )

    # 3. Generate output (expected to trigger the ValueError).
    #    The broad except is deliberate: this is a reproduction script and
    #    only reports whatever error the engine raises.
    try:
        outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
        for output in outputs:
            generated_text = output.outputs[0].text
            format_output("Output", generated_text)
    except Exception as e:
        print(f"Caught expected error: {e}")
|
2025-10-15 10:32:37 +08:00
|
|
|
|
|
|
|
|
|
|
# Run the reproduction only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
```
|