From b79bd8bbc452a861e3460657aab78eb267cb52cc Mon Sep 17 00:00:00 2001 From: 8ga Date: Fri, 17 Oct 2025 11:14:26 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=2020251011.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 20251011.md | 285 ++-------------------------------------------------- 1 file changed, 7 insertions(+), 278 deletions(-) diff --git a/20251011.md b/20251011.md index 21f16be..3960ba1 100644 --- a/20251011.md +++ b/20251011.md @@ -2,292 +2,21 @@ ## 问题原因 -vllm在处理请求时,调用了 get_structured_output_key 函数,在处理 structured_outputs 参数时,由于不满足所有的情况,导致了抛出**No valid structured output parameter found**异常,该异常未被 EngineCore 捕获导致引擎崩溃,APIServer 发现引擎死了,自己也退出了进程。 - -## 使用抓包工具 tshark 监控 http 请求 - -### 在合适的目录下执行下面的命令,tshark 需要 root 权限,请使用 root 账号执行 - -##### 安装 tshark(已执行完毕) -``` -sudo apt-get install tshark -``` - -##### 创建一个 shell 脚本 -``` -vim hook_vllm_gpt-oss-120b.sh -``` -脚本内容如下: -``` -#!/bin/bash - -# ======================================== -# 监控本地 v1/chat/completions 接口的 HTTP 请求与响应 -# 仅保留最近 20 条日志(含流式响应,如 SSE) -# 使用 tshark + TCP 流跟踪 -# ======================================== - -# 配置 -INTERFACE="lo" # 本地回环接口 -PORT="8077" -ENDPOINT="/v1/chat/completions" -LOG_FILE="/hook/chat_completions.log" -TEMP_LOG="/hook/chat_completions.tmp" -PID_FILE="/hook/hook_vllm_gpt-oss-120b.pid" - -# 检查是否已运行 -if [ -f "$PID_FILE" ]; then - if ps -p $(cat "$PID_FILE") > /dev/null 2>&1; then - echo "【错误】监控脚本已在运行 (PID: $(cat $PID_FILE))" - exit 1 - else - rm -f "$PID_FILE" - fi -fi - -# 记录 PID -echo $$ > "$PID_FILE" - -# 清空日志 -> "$LOG_FILE" - -echo "✅ 开始监控 http://127.0.0.1:$PORT$ENDPOINT" -echo "📝 日志将保存到: $LOG_FILE" -echo "⏳ 仅保留最近 20 条,按 Ctrl+C 停止" - -# 信号处理:清理 PID 文件 -trap 'rm -f "$PID_FILE"; echo "⏹️ 监控已停止"; exit 0' SIGINT SIGTERM - -# 使用 tshark 跟踪 TCP 流 -sudo tshark \ - -i "$INTERFACE" \ - -f "tcp port $PORT and host 127.0.0.1" \ - -q \ - -z "follow,tcp,ascii" \ - 2>/dev/null | \ -stdbuf -oL awk -v endpoint="$ENDPOINT" -v log="$LOG_FILE" 
-v temp="$TEMP" ' -BEGIN { - RS = "\n\n" - ORS = "" - in_request = 0 - buffer = "" - count = 0 -} - -# 分割流,识别每条 TCP 流 -{ - if (match($0, /GET|POST|PUT|DELETE|HTTP/) && index($0, endpoint)) { - # 提取时间戳(tshark 输出第一行包含时间) - if (match($0, /Following.*on port [0-9]+$/)) { - ts = substr($0, RSTART, RLENGTH) - gsub(/.*on/, "on", ts) - } else { - ts = "unknown time" - } - - # 提取请求行和头 - split($0, lines, /\n/) - for (i=1; i<=length(lines); i++) { - if (lines[i] ~ /(GET|POST|PUT|DELETE)/ && index(lines[i], endpoint)) { - request_line = lines[i] - } - if (lines[i] ~ /Content-Type:/ || lines[i] ~ /Authorization:/ || lines[i] ~ /User-Agent:/) { - headers = headers "\n " lines[i] - } - } - - # 提取请求体(通常在空行后) - body = "" - in_body = 0 - for (i=1; i<=length(lines); i++) { - if (lines[i] == "" || lines[i] ~ /HTTP\/[0-9.]/) { - in_body = 1 - continue - } - if (in_body && lines[i] !~ /(No response found|Following)/) { - body = body lines[i] "\n" - } - } - - # 提取响应部分(HTTP/ 开头) - response = "" - for (i=1; i<=length(lines); i++) { - if (lines[i] ~ /^HTTP\// && i > 1) { - for (j=i; j<=length(lines); j++) { - if (lines[j] !~ /Following/) { - response = response lines[j] "\n" - } - } - break - } - } - - # 构造日志条目 - entry = "========================================\n" - entry = entry "🕒 " ts "\n" - entry = entry "📤 请求: " request_line "\n" - if (headers != "") { - entry = entry "📎 头部:\n" headers "\n" - } - if (body != "") { - entry = entry "📦 请求体:\n" body "\n" - } - if (response != "") { - entry = entry "📥 响应:\n" response "\n" - } - entry = entry "========================================\n\n" - - # 写入日志并保留最近 20 条 - cmd = "echo -e \"" entry "\" >> " log " && tail -n 200 " log " > " temp " && mv " temp " " log - system(cmd) - - # 重置 - headers = "" - body = "" - response = "" - } -} -' - -# 正常退出时清理 -rm -f "$PID_FILE" -``` - -##### 赋予执行权限 -``` -chmod +x hook_vllm_gpt-oss-120b.sh -``` - -##### 后台运行脚本 -``` -nohup /hook/hook_vllm_gpt-oss-120b.sh > /dev/null 2>&1 & -``` - -##### 查看请求日志 - -``` -tail -f 
/hook/chat_completions.log -``` - -##### 停止脚本 - -``` -pkill -f hook_vllm_gpt-oss-120b.sh -``` - -### 函数 get_structured_output_key 实现 - -#### 分支:release/v0.11.0 +外部调用 vllm 的 OpenAI API 服务时,传入的请求参数让 vllm 调用了**get_structured_output_key**函数。这些参数在该函数里无法被正确处理,抛出了**No valid structured output parameter found**异常,该异常导致了 vllm 的 EngineCore 和 APIServer 进程死亡。 +## 源码出处 https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/v1/structured_output/request.py -```python -def get_structured_output_key(sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.structured_outputs - assert params is not None, "params can't be None." - if params.json is not None: - if not isinstance(params.json, str): - json_str = json.dumps(params.json) - else: - json_str = params.json - return (StructuredOutputOptions.JSON, json_str) - elif params.json_object: - return (StructuredOutputOptions.JSON_OBJECT, "") - elif params.regex is not None: - return (StructuredOutputOptions.REGEX, params.regex) - elif params.choice is not None: - if not isinstance(params.choice, str): - json_str = json.dumps(params.choice) - else: - json_str = params.choice - return (StructuredOutputOptions.CHOICE, json_str) - elif params.grammar is not None: - return (StructuredOutputOptions.GRAMMAR, params.grammar) - elif params.structural_tag is not None: - return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag) - else: - raise ValueError("No valid structured output parameter found") -``` -#### 分支:release/v0.10.2 +## 问题追踪 -https://github.com/vllm-project/vllm/blob/releases/v0.10.2/vllm/v1/structured_output/request.py +由于 vllm 没有提供 HTTP 请求参数的日志打印,也没有集成监控 HTTP 请求的第三方工具,所以在 Ubuntu 上安装了 tshark 抓包工具,通过 Java 脚本启动 tshark 命令,并将 tshark 抓包到的日志内容写入到磁盘文件。下一次 vllm 崩溃时,根据磁盘文件存储的日志内容分析是什么参数导致了**get_structured_output_key**的异常。 -```python -def get_structured_output_key( - sampling_params: SamplingParams) -> StructuredOutputKey: - params = sampling_params.guided_decoding - assert params is not None, 
"params can't be None." - if params.json is not None: - if not isinstance(params.json, str): - json_str = json.dumps(params.json) - else: - json_str = params.json - return (StructuredOutputOptions.JSON, json_str) - elif params.json_object: - return (StructuredOutputOptions.JSON_OBJECT, "") - elif params.regex is not None: - return (StructuredOutputOptions.REGEX, params.regex) - elif params.choice is not None: - if not isinstance(params.choice, str): - json_str = json.dumps(params.choice) - else: - json_str = params.choice - return (StructuredOutputOptions.CHOICE, json_str) - elif params.grammar is not None: - return (StructuredOutputOptions.GRAMMAR, params.grammar) - elif params.structural_tag is not None: - return (StructuredOutputOptions.STRUCTURAL_TAG, params.structural_tag) - else: - raise ValueError("No valid structured output parameter found") -``` -> 2个版本的唯一区别,仅仅是提取`SamplingParams`的属性不一样,其它判断逻辑都是一致的 +> 使用 Java 执行 tshark 是为了抹掉 Linux 和 Windows 的平台差异,不用修改代码和命令即可直接运行。 -### SamplingParams +> Java 脚本内容: -https://github.com/vllm-project/vllm/blob/releases/v0.11.0/vllm/sampling_params.py - -```python -# 其它参数已省略 -class SamplingParams( - 💡初始化逻辑是没问题的,默认值是None - structured_outputs: Optional[StructuredOutputsParams] = None - guided_decoding: Optional[GuidedDecodingParams] = None - - @staticmethod - def from_optional( - 💡默认值也是None - structured_outputs: Optional[StructuredOutputsParams] = None, - ) -> "SamplingParams": - if guided_decoding is not None: - warnings.warn( - "guided_decoding is deprecated. This will be removed in " - 💡官方将在 v0.12.0 废弃 guided_decoding 参数,使用 structured_outputs 参数替代,在 v0.11.0 版本做了兼容, - "v0.12.0 or v1.0.0, which ever is soonest. 
Please use " - "structured_outputs instead.", - DeprecationWarning, - stacklevel=2) - structured_outputs = guided_decoding - guided_decoding = None - return SamplingParams( - structured_outputs=structured_outputs, - ) -``` - -### StructuredOutputOptions - -```python -class StructuredOutputOptions(enum.Enum): - JSON = enum.auto() - JSON_OBJECT = enum.auto() - REGEX = enum.auto() - GRAMMAR = enum.auto() - CHOICE = enum.auto() - STRUCTURAL_TAG = enum.auto() -``` -> 💡只支持这6种类型,每个类型都对应 structured_outputs 下面的一个不同的参数。 - -## 崩溃日志片段 +## 崩溃日志 ```text (EngineCore_DP0 pid=2738693) ERROR 10-10 10:43:10 [core.py:710] EngineCore encountered a fatal error.