From 63963eba2990beb6ce40ab4560016a730e3e824a Mon Sep 17 00:00:00 2001 From: yuyr Date: Tue, 6 Jan 2026 12:13:41 +0800 Subject: [PATCH] =?UTF-8?q?V3.7=20=E6=8E=A8=E7=90=86=E5=BC=95=E6=93=8E?= =?UTF-8?q?=E4=BB=8Esglang=20=E5=85=A8=E9=87=8F=E5=88=87=E6=8D=A2=E4=B8=BA?= =?UTF-8?q?vllm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- specs/mvp/v3.7/v3.7_design.md | 215 +++++++++++++++++++++++ specs/mvp/v3.7/v3.7_dev_plan.md | 122 +++++++++++++ specs/mvp/v3.7/v3.7_summary.md | 121 +++++++++++++ src/mvp/configs/dev.yaml | 2 + src/mvp/configs/dev_v30.yaml | 2 + src/mvp/docker-compose.yaml | 11 +- src/mvp/images/argus-ray-node/Dockerfile | 2 +- src/mvp/py/argus/ray/builders.py | 2 +- src/mvp/py/argus/service/ui.py | 2 +- src/mvp/py/tests/test_builders.py | 3 + src/mvp/py/tests/test_ui.py | 2 + src/mvp/scripts/run_all_v30_api.sh | 14 +- 12 files changed, 484 insertions(+), 14 deletions(-) create mode 100644 specs/mvp/v3.7/v3.7_design.md create mode 100644 specs/mvp/v3.7/v3.7_dev_plan.md create mode 100644 specs/mvp/v3.7/v3.7_summary.md diff --git a/specs/mvp/v3.7/v3.7_design.md b/specs/mvp/v3.7/v3.7_design.md new file mode 100644 index 0000000..436aed9 --- /dev/null +++ b/specs/mvp/v3.7/v3.7_design.md @@ -0,0 +1,215 @@ +# MVP v3.7 设计方案:切换 `verlai/verl:vllm011.latest` + 默认 rollout=vllm + +## 0. 背景与目标 + +当前 dev/h1 环境的 Ray 节点镜像基于 `verlai/verl:sgl055.latest`,并且平台内置 PPO/GRPO 的默认参数中写死了: + +- `actor_rollout_ref.rollout.name=sglang` + +v3.7 的目标是: + +1. **Ray 节点镜像切换到 vLLM 版本** + - 基础镜像改为 `verlai/verl:vllm011.latest` + - 构建并打标:`argus/argus-ray-node:vllm011.latest` + - 构建在远端 `argus@h1` 上完成(本地没有 verlai 基础镜像) +2. **端到端跑通 v3.0 API 流程** + - 通过 `src/mvp/scripts/run_all_v30_api.sh` 完整 E2E +3. **内置训练任务默认使用 vLLM rollout** + - 提交 VERL 训练任务时将 `actor_rollout_ref.rollout.name` 从 `sglang` 改为 `vllm` + +> 备注:本迭代是“替换默认 backend”而非“新增能力”,尽量保持对 v3.6 功能兼容(W&B、SFTPGo、Advanced TaskSpec、stateless pool 等不改协议)。 + +--- + +## 1. 
现状梳理(源码定位) + +### 1.1 Ray 节点镜像与 compose + +- Dockerfile:`src/mvp/images/argus-ray-node/Dockerfile` + - 当前 `ARG BASE_IMAGE=verlai/verl:sgl055.latest` +- Compose:`src/mvp/docker-compose.yaml` + - `ray_head.build.args.BASE_IMAGE: verlai/verl:sgl055.latest` + - `ray_head.image / worker.image: argus/argus-ray-node:v2.5` + +### 1.2 默认 rollout.name=sglang 的位置 + +平台内置 PPO/GRPO 参数由 Ray job 入口构建器生成: + +- `src/mvp/py/argus/ray/builders.py` + - `build_training_argv()` 中写死了: + - `actor_rollout_ref.rollout.name=sglang` + +WebUI 的 Advanced 示例也包含 rollout.name(用于指导用户): + +- `src/mvp/py/argus/service/ui.py` + - Advanced example 中当前为 `actor_rollout_ref.rollout.name=sglang`(需要同步改成 vllm,避免用户 copy/paste 走错) + +### 1.3 `run_all_v30_api.sh` 依赖默认参数 + +`src/mvp/scripts/run_all_v30_api.sh` 提交 PPO/GRPO/SFT 的 TaskSpec(YAML)时 **不会显式携带 rollout.name**,因此是否能切到 vllm,依赖平台默认值(builders)是否变更。 + +--- + +## 2. 方案设计 + +### 2.0 已确认决策(来自评审) + +1) **compose 移除 build**:允许移除 `ray_head.build`,强制使用远端已构建镜像。 +2) **全量切换 vllm**:不保留 sglang 作为可选项(v3.7 默认全部切到 vllm)。 +3) **backend 名称**:确认 VERL backend 名为 `vllm`(即 `actor_rollout_ref.rollout.name=vllm`)。 + +### 2.1 镜像策略(vllm011) + +#### 2.1.1 Dockerfile 修改 + +目标: +- 默认基础镜像改为 `verlai/verl:vllm011.latest` + +改动点: +- `src/mvp/images/argus-ray-node/Dockerfile` + - `ARG BASE_IMAGE=verlai/verl:vllm011.latest` + +说明: +- 仍保留 `BASE_IMAGE` build arg,便于未来热切换不同基础镜像(而不是把镜像写死在 compose)。 + +#### 2.1.2 镜像 tag + +构建产物镜像: +- `argus/argus-ray-node:vllm011.latest` + +> 注意:该 tag 用于表达“运行时依赖的 vllm 版本线”,而不是 MVP 功能版本(v3.7)。 + +#### 2.1.3 compose 复用新镜像(避免每次重建) + +目标:E2E 时尽量避免每次 `docker compose up` 都 build。 + +建议修改 `src/mvp/docker-compose.yaml`: +- `ray_head.image: argus/argus-ray-node:vllm011.latest` +- `ray_worker_0.image: argus/argus-ray-node:vllm011.latest` +- `ray_worker_1.image: argus/argus-ray-node:vllm011.latest` + +并采用:**移除 `ray_head.build`**(强制使用已构建镜像),避免每次 `docker compose up` 触发 build。 + +--- + +### 2.2 训练默认参数切换到 vllm + +目标:平台内置 PPO/GRPO 的默认 rollout backend 从 sglang 切到 vllm。 + +改动点: +- 
`src/mvp/py/argus/ray/builders.py` + - 将 `actor_rollout_ref.rollout.name=sglang` 替换为 `actor_rollout_ref.rollout.name=vllm` + +影响范围: +- PPO、GRPO(两者都走 `verl.trainer.main_ppo`) +- 对 SFT 不影响(SFT 走 `verl.trainer.sft_trainer_ray`) + +兼容性评估: +- `run_all_v30_api.sh` 会受益:无需修改 TaskSpec,即可自动切换。 +- 若未来仍需支持 sglang,可考虑在 v3.7 之后引入“配置驱动”的默认值(见 §2.4 可选增强)。 + +--- + +### 2.3 WebUI/模板同步(避免误导用户) + +目标:New Task 页面的 Advanced example 也应默认 vllm,避免用户 copy 后手工改参数。 + +改动点: +- `src/mvp/py/argus/service/ui.py` + - Advanced example 中 `actor_rollout_ref.rollout.name=vllm` + +> 注意:该模板仅用于 UX 指导;实际生效仍由用户提交的 command 决定。 + +--- + +### 2.4 可选增强(不强制,供评审) + +为避免后续再硬编码切换,可引入“平台训练默认值”配置(可选): + +- 在 `configs/dev.yaml` 增加: + ```yaml + verl_defaults: + rollout_backend: "vllm" # 或 "sglang" + ``` +- `builders.py` 从配置读取默认值,而非写死。 + +本次 v3.7 的最低交付可以先不做该增强,只做硬替换;若你希望后续支持 A/B 切换,再纳入。 + +--- + +## 3. 远端部署/迁移步骤(argus@h1) + +> 本节是“计划步骤”,评审通过后再执行。 + +### 3.1 同步代码到远端目录 + +远端目录约定: +- `argus@h1:/home2/argus/infra/mvp/src/mvp`(compose 与 scripts) + +将本地变更 rsync 到远端后再进行构建/拉起。 + +### 3.2 在远端构建镜像(只在 h1) + +在 `argus@h1` 执行(示例命令): + +```bash +cd /home2/argus/infra/mvp/src/mvp +docker build \ + -f images/argus-ray-node/Dockerfile \ + --build-arg BASE_IMAGE=verlai/verl:vllm011.latest \ + -t argus/argus-ray-node:vllm011.latest \ + . +``` + +### 3.3 清理旧环境并用新镜像拉起 + +```bash +cd /home2/argus/infra/mvp/src/mvp +docker compose down +docker compose up -d +``` + +验证: +- `docker ps` 中 `argus-ray-head/worker` 的 image 为 `argus/argus-ray-node:vllm011.latest` +- Ray dashboard 可访问:`http://<h1-host>:8265` + +### 3.4 E2E:跑 `run_all_v30_api.sh` + +```bash +cd /home2/argus/infra/mvp/src/mvp +MVP_INTERNAL_TOKEN=my-dev-token \ +WANDB_API_KEY=... \ +./scripts/run_all_v30_api.sh +``` + +验收关键点: +- PPO/GRPO/SFT 全部成功(或至少 PPO/GRPO 不卡在 rollout backend 初始化阶段) +- 任一 PPO/GRPO 的 driver logs / hydra overrides 中能看到: + - `actor_rollout_ref.rollout.name=vllm` + +--- + +## 4. 
风险与排查要点 + +### 4.1 vLLM backend 在 VERL 的参数兼容性 + +平台默认传入的这些参数当前是为 sglang 写的: +- `actor_rollout_ref.rollout.tensor_model_parallel_size=1` +- `actor_rollout_ref.rollout.gpu_memory_utilization=0.4` + +vLLM rollout 是否接受/需要额外参数(例如 tokenizer、engine 配置),需要在 E2E 中观察: +- 如果 vLLM rollout 初始化报错,可能需要补充 vllm 特定 overrides(属于 v3.7 的后续修复项)。 + +### 4.2 镜像依赖差异 + +更换 base image 可能带来: +- Python/Ray/依赖版本差异 +- CUDA/NCCL 依赖差异 + +建议: +- 在 v3.7 评审通过后,优先跑最小 PPO(epochs=1、steps=10)验证 vllm backend 能启动并完成。 + +--- + +## 5. 待确认问题(请你评审时确认) +已完成评审确认(见 §2.0),无额外待确认项。 diff --git a/specs/mvp/v3.7/v3.7_dev_plan.md b/specs/mvp/v3.7/v3.7_dev_plan.md new file mode 100644 index 0000000..61b0f58 --- /dev/null +++ b/specs/mvp/v3.7/v3.7_dev_plan.md @@ -0,0 +1,122 @@ +# MVP v3.7 开发计划(TDD) + +> 目标:切换 Ray 节点基础镜像到 `verlai/verl:vllm011.latest`,并将平台内置 PPO/GRPO 默认 rollout backend 全量切到 `vllm`,最后在远端 `argus@h1` 通过 `run_all_v30_api.sh` 跑通端到端。 + +## M0 - 基线确认(不改行为) + +**目的**:确认当前 v3.6 baseline 可跑(避免把历史问题混入 v3.7)。 + +- [ ] 本地单测全绿:`.venv/bin/python -m pytest` +- [ ] 远端 h1 当前环境可跑(可选):`./scripts/run_all_v30_api.sh`(或至少能启动 Ray+API) + +**验收**: +- 单测通过,coverage ≥ 90%(现有门槛) + +--- + +## M1 - 训练默认参数切换到 vllm(TDD) + +**目的**:在不碰镜像/compose 的前提下,先把“默认 rollout=sglang”替换为 vllm,并用单测锁定行为。 + +### 1.1 新增/更新单测(先写测试) + +- [ ] `src/mvp/py/tests/test_builders.py` + - 新增断言:PPO/GRPO 的 argv 中包含 `actor_rollout_ref.rollout.name=vllm` + - 且不再包含 `actor_rollout_ref.rollout.name=sglang` + +- [ ] `src/mvp/py/tests/test_ui.py` + - New Task Advanced example 模板包含 `actor_rollout_ref.rollout.name=vllm`(避免用户 copy/paste 走错默认) + +> 这两条测试先写出来,预期先失败(red)。 + +### 1.2 实现改动(让测试变绿) + +- [ ] `src/mvp/py/argus/ray/builders.py` + - 将 `actor_rollout_ref.rollout.name=sglang` 改为 `...=vllm` + +- [ ] `src/mvp/py/argus/service/ui.py` + - Advanced example 中同样改为 `...=vllm` + +### 1.3 回归测试 + +- [ ] `.venv/bin/python -m pytest` + +**验收**: +- 单测全绿(coverage ≥ 90%) +- 平台内置 PPO/GRPO 构建出的 command/overrides 默认 rollout backend 为 vllm + +--- + +## M2 - 镜像与 compose 切换(远端构建为主) + 
+**目的**:完成镜像切换与环境拉起,确保 Ray stateless pool 正常工作。 + +### 2.1 Dockerfile 默认 base image 切换 + +- [ ] `src/mvp/images/argus-ray-node/Dockerfile` + - `ARG BASE_IMAGE=verlai/verl:vllm011.latest` + +### 2.2 docker-compose 强制使用新镜像(移除 build) + +- [ ] `src/mvp/docker-compose.yaml` + - 移除 `ray_head.build` 段(强制走 `image:`) + - `ray_head.image / ray_worker_0.image / ray_worker_1.image` 统一改为: + - `argus/argus-ray-node:vllm011.latest` + +### 2.3 远端构建镜像(h1) + +在 `argus@h1:/home2/argus/infra/mvp/src/mvp`: + +- [ ] `docker build -f images/argus-ray-node/Dockerfile -t argus/argus-ray-node:vllm011.latest .` + +### 2.4 清理旧 compose 并拉起 + +- [ ] `docker compose down` +- [ ] `docker compose up -d` +- [ ] 验证: + - `docker ps` 看到 `argus-ray-head/worker` 正常运行 + - Ray dashboard:`http://<h1-host>:8265` 可访问,节点数 1 head + 2 worker + +**验收**: +- h1 环境成功使用新镜像拉起 Ray 集群(head 无 GPU、worker 各 4 GPU 的配置仍保持) + +--- + +## M3 - 端到端验证(run_all_v30_api.sh) + +**目的**:验证在新镜像 + 默认 vllm rollout 下,API 提交的训练任务能跑通闭环。 + +### 3.1 同步代码到远端 + +- [ ] rsync `src/mvp` 到 `argus@h1:/home2/argus/infra/mvp/src/mvp` + +### 3.2 执行 E2E + +在 h1: + +- [ ] `./scripts/run_all_v30_api.sh`(确保环境变量按脚本要求设置:`MVP_INTERNAL_TOKEN`、可选 `WANDB_API_KEY` 等) + +### 3.3 核心检查点 + +- [ ] PPO/GRPO/SFT 任务整体流程可执行(至少 PPO/GRPO 不因 rollout backend 初始化失败) +- [ ] 任一 PPO/GRPO 的 Ray job logs / submit payload / hydra overrides 中可确认: + - `actor_rollout_ref.rollout.name=vllm` + +**验收**: +- `run_all_v30_api.sh` 端到端成功(或若 PPO/GRPO 因 vllm 参数差异失败,需在本 milestone 内补齐必要 overrides 并重新跑通) + +--- + +## 风险与回滚策略 + +### 风险 + +- vLLM rollout 可能对部分参数(如 batch/并发/显存利用率)有不同约束,导致训练启动失败。 +- base image 切换导致 ray/依赖版本差异。 + +### 回滚 + +回滚到 v3.6 / sglang 的最小动作: +- `docker-compose.yaml` 恢复旧镜像 tag +- `builders.py` 恢复 rollout.name=sglang + diff --git a/specs/mvp/v3.7/v3.7_summary.md b/specs/mvp/v3.7/v3.7_summary.md new file mode 100644 index 0000000..c00bf26 --- /dev/null +++ b/specs/mvp/v3.7/v3.7_summary.md @@ -0,0 +1,121 @@ +# MVP v3.7 迭代总结:切换 vLLM rollout + `verlai/verl:vllm011.latest` + +> 基线版本:v3.6(W&B + 
SFTPGo + WebUI/API + Ray stateless pool + Advanced TaskSpec) +> 验证环境:`argus@h1:/home2/argus/infra/mvp` + +## 1. 目标与结果 + +### 1.1 本次目标 + +1) Ray 节点镜像切换到 vLLM 版本: +- base image:`verlai/verl:vllm011.latest` +- 构建镜像 tag:`argus/argus-ray-node:vllm011.latest` + +2) 平台内置 PPO/GRPO 默认 rollout backend 全量切换: +- `actor_rollout_ref.rollout.name=sglang` → `actor_rollout_ref.rollout.name=vllm` + +3) 端到端验证: +- 使用 `src/mvp/scripts/run_all_v30_api.sh` 在 h1 上跑通 E2E(通过 API 提交 PPO/GRPO/SFT) + +### 1.2 实际结果(验收) + +- h1 上已成功构建并使用新镜像拉起(head + 2 worker): + - `docker ps` 显示 `argus-ray-head/worker-*` 使用 `argus/argus-ray-node:vllm011.latest` +- `run_all_v30_api.sh` 端到端跑通: + - PPO/GRPO/SFT 任务均 `SUCCEEDED` +- 在 job submit payload 中验证关键点: + - `actor_rollout_ref.rollout.name=vllm` + - `HF_HUB_OFFLINE=1`(见 §3.2) + +--- + +## 2. 代码与配置改动点 + +### 2.1 训练默认参数(sglang → vllm) + +- `src/mvp/py/argus/ray/builders.py` + - 将 PPO/GRPO 默认参数中的 `actor_rollout_ref.rollout.name` 固定为 `vllm` +- `src/mvp/py/argus/service/ui.py` + - New Task → Advanced example 同步改为 `actor_rollout_ref.rollout.name=vllm`(避免用户 copy/paste 走错) + +并用单测锁定行为(TDD): +- `src/mvp/py/tests/test_builders.py` +- `src/mvp/py/tests/test_ui.py` + +### 2.2 镜像与 compose(强制用预构建镜像) + +- `src/mvp/images/argus-ray-node/Dockerfile` + - 默认 `ARG BASE_IMAGE=verlai/verl:vllm011.latest` +- `src/mvp/docker-compose.yaml` + - 移除 `ray_head.build`(避免每次 `docker compose up` 触发 build) + - head/worker 统一使用 `image: argus/argus-ray-node:vllm011.latest` + +--- + +## 3. 
E2E 遇到的问题与修复 + +### 3.1 问题:vLLM 初始化触发 HF mirror 429 + +在切换到 vLLM rollout 后,PPO/GRPO 任务启动阶段出现: +- `huggingface_hub.errors.HfHubHTTPError: 429 Too Many Requests` +- 请求来源:`https://hf-mirror.com/api/models/<repo_id>/tree/main?...` + +原因要点: +- 传入模型为 repo id(`Qwen/Qwen2.5-0.5B-Instruct`)时,vLLM 会调用 HF API 获取 repo tree/file list; +- 多进程/多 replica 并发会瞬间放大请求,导致 mirror 限流; +- 即便本地 cache 已存在,repo id 路径仍可能触发远端检查。 + +### 3.2 修复:禁用 HF Hub 联网 + 使用本地 snapshot path + +1) 在 Ray job runtime_env 注入离线开关: +- `src/mvp/configs/dev.yaml` +- `src/mvp/configs/dev_v30.yaml` + +新增: +```yaml +HF_HUB_OFFLINE: "1" +``` + +2) E2E 脚本提交任务时,`model_id` 改为本地 snapshot 目录,避免 repo id: +- `src/mvp/scripts/run_all_v30_api.sh` + - 在 head 容器内用 `snapshot_download(..., local_files_only=True)` 解析本地路径 + - 用该路径作为 `model_id:` 提交 PPO/GRPO/SFT + +> 结果:E2E 任务不再触发 HF mirror 429,PPO/GRPO/SFT 全部跑通。 + +--- + +## 4. 远端部署/操作记录(h1) + +### 4.1 构建镜像(h1 上执行) + +在 `argus@h1:/home2/argus/infra/mvp/src/mvp`: + +```bash +docker build -f images/argus-ray-node/Dockerfile \ + --build-arg BASE_IMAGE=verlai/verl:vllm011.latest \ + -t argus/argus-ray-node:vllm011.latest . +``` + +### 4.2 拉起环境(compose) + +```bash +docker compose down +docker compose up -d +``` + +### 4.3 E2E + +```bash +export MVP_INTERNAL_TOKEN=my-dev-token +export SFTPGO_ADMIN_PASSWORD=my-dev-sftpgo-admin +./scripts/run_all_v30_api.sh +``` + +--- + +## 5. 已知影响与注意事项 + +1) **vLLM rollout 更敏感于模型加载路径与联网行为**:建议默认离线(`HF_HUB_OFFLINE=1`)并优先使用本地 snapshot path。 +2) **镜像切换可能带来依赖差异**:后续若遇到 rollout 相关参数兼容问题,应以 vLLM 的配置要求为准逐项调整(保持小步快跑)。 + diff --git a/src/mvp/configs/dev.yaml b/src/mvp/configs/dev.yaml index de11c65..41e5475 100644 --- a/src/mvp/configs/dev.yaml +++ b/src/mvp/configs/dev.yaml @@ -15,6 +15,8 @@ ray: env_vars: HF_ENDPOINT: "https://hf-mirror.com" PYTHONUNBUFFERED: "1" + # v3.7: forbid HuggingFace Hub network access from Ray jobs (use cached snapshots). 
+ HF_HUB_OFFLINE: "1" # 用户自定义代码目录(可被 PYTHONPATH 注入) user_code_path: "/private/user/code" diff --git a/src/mvp/configs/dev_v30.yaml b/src/mvp/configs/dev_v30.yaml index 131dd7c..4c0ac9f 100644 --- a/src/mvp/configs/dev_v30.yaml +++ b/src/mvp/configs/dev_v30.yaml @@ -15,6 +15,8 @@ ray: env_vars: HF_ENDPOINT: "https://hf-mirror.com" PYTHONUNBUFFERED: "1" + # v3.7: forbid HuggingFace Hub network access from Ray jobs (use cached snapshots). + HF_HUB_OFFLINE: "1" # v3.0 先不支持 user code 执行 user_code_path: "/private/user/code" diff --git a/src/mvp/docker-compose.yaml b/src/mvp/docker-compose.yaml index 1b8c86a..16e73d8 100644 --- a/src/mvp/docker-compose.yaml +++ b/src/mvp/docker-compose.yaml @@ -1,11 +1,6 @@ services: ray_head: - build: - context: . - dockerfile: images/argus-ray-node/Dockerfile - args: - BASE_IMAGE: verlai/verl:sgl055.latest - image: argus/argus-ray-node:v2.5 + image: argus/argus-ray-node:vllm011.latest container_name: argus-ray-head ports: - "8265:8265" @@ -96,7 +91,7 @@ services: - argus-wandb ray_worker_0: - image: argus/argus-ray-node:v2.5 + image: argus/argus-ray-node:vllm011.latest container_name: argus-ray-worker-0 volumes: - ../../verl:/workspace/verl @@ -128,7 +123,7 @@ services: PYTHONUNBUFFERED: "1" ray_worker_1: - image: argus/argus-ray-node:v2.5 + image: argus/argus-ray-node:vllm011.latest container_name: argus-ray-worker-1 volumes: - ../../verl:/workspace/verl diff --git a/src/mvp/images/argus-ray-node/Dockerfile b/src/mvp/images/argus-ray-node/Dockerfile index f90eb30..06423f1 100644 --- a/src/mvp/images/argus-ray-node/Dockerfile +++ b/src/mvp/images/argus-ray-node/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_IMAGE=verlai/verl:sgl055.latest +ARG BASE_IMAGE=verlai/verl:vllm011.latest FROM ${BASE_IMAGE} SHELL ["/bin/bash", "-lc"] diff --git a/src/mvp/py/argus/ray/builders.py b/src/mvp/py/argus/ray/builders.py index 9f03dd9..f1e18c3 100644 --- a/src/mvp/py/argus/ray/builders.py +++ b/src/mvp/py/argus/ray/builders.py @@ -38,7 +38,7 @@ def 
build_training_argv(spec: JobSpec, submission_id: str, job_dir: str) -> Buil "actor_rollout_ref.actor.optim.lr=1e-6", "actor_rollout_ref.actor.ppo_mini_batch_size=64", "actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4", - "actor_rollout_ref.rollout.name=sglang", + "actor_rollout_ref.rollout.name=vllm", "actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8", "actor_rollout_ref.rollout.tensor_model_parallel_size=1", "actor_rollout_ref.rollout.gpu_memory_utilization=0.4", diff --git a/src/mvp/py/argus/service/ui.py b/src/mvp/py/argus/service/ui.py index 5da48d7..b5a0822 100644 --- a/src/mvp/py/argus/service/ui.py +++ b/src/mvp/py/argus/service/ui.py @@ -416,7 +416,7 @@ command: | actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.actor.ppo_mini_batch_size=64 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ diff --git a/src/mvp/py/tests/test_builders.py b/src/mvp/py/tests/test_builders.py index 74f48d7..a835051 100644 --- a/src/mvp/py/tests/test_builders.py +++ b/src/mvp/py/tests/test_builders.py @@ -29,6 +29,9 @@ def test_build_training_argv_ppo_smoke(): built = build_training_argv(spec, submission_id="sid", job_dir="/job") assert built.argv[:3] == ["python3", "-m", "verl.trainer.main_ppo"] assert "data.val_files=val.jsonl" in built.argv + # v3.7: default rollout backend switches from sglang -> vllm. 
+ assert "actor_rollout_ref.rollout.name=vllm" in built.argv + assert "actor_rollout_ref.rollout.name=sglang" not in built.argv assert "trainer.test_freq=-1" in built.argv diff --git a/src/mvp/py/tests/test_ui.py b/src/mvp/py/tests/test_ui.py index d110c9e..7b86cf7 100644 --- a/src/mvp/py/tests/test_ui.py +++ b/src/mvp/py/tests/test_ui.py @@ -93,6 +93,8 @@ def test_ui_new_task_contains_advanced_example_snippet(tmp_path, monkeypatch): # workload is not needed for advanced in v3.5. assert "# workload:" not in r.text assert "actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu" in r.text + # v3.7: default rollout backend switches from sglang -> vllm. + assert "actor_rollout_ref.rollout.name=vllm" in r.text # v3.6: Advanced example uses platform-injected env vars so users don't need to edit W&B project/run. assert "trainer.logger=${MVP_TRAINER_LOGGER}" in r.text assert "trainer.project_name=${MVP_WANDB_PROJECT}" in r.text diff --git a/src/mvp/scripts/run_all_v30_api.sh b/src/mvp/scripts/run_all_v30_api.sh index 78b502b..f2e9917 100755 --- a/src/mvp/scripts/run_all_v30_api.sh +++ b/src/mvp/scripts/run_all_v30_api.sh @@ -224,10 +224,18 @@ dexec "${HEAD_CONTAINER}" bash -lc "set -euo pipefail; \ (cp -f /private/common/datasets/gsm8k_sft/train.parquet '/private/users/${USER_ID}/datasets/gsm8k_sft/train.parquet' 2>/dev/null || cp -f /private/datasets/gsm8k_sft/train.parquet '/private/users/${USER_ID}/datasets/gsm8k_sft/train.parquet' 2>/dev/null || true); \ (cp -f /private/common/datasets/gsm8k_sft/test.parquet '/private/users/${USER_ID}/datasets/gsm8k_sft/test.parquet' 2>/dev/null || cp -f /private/datasets/gsm8k_sft/test.parquet '/private/users/${USER_ID}/datasets/gsm8k_sft/test.parquet' 2>/dev/null || true)" +echo "[host] resolve local model snapshot path (avoid HF mirror 429 for vllm rollout)" +LOCAL_MODEL_PATH="$(dexec "${HEAD_CONTAINER}" bash -lc "python3 - <<'PY'\nimport os\nfrom huggingface_hub import 
snapshot_download\nmodel_id=os.environ.get('MODEL_ID','Qwen/Qwen2.5-0.5B-Instruct')\nos.environ.setdefault('HF_HOME','/private/hf')\ntry:\n p=snapshot_download(repo_id=model_id, local_files_only=True)\n print(p)\nexcept Exception as e:\n raise SystemExit(f'ERROR: model snapshot not in cache; run 30_prepare_data_and_model.sh first. {e!r}')\nPY\n" MODEL_ID='Qwen/Qwen2.5-0.5B-Instruct' | tail -n 1)" +if [[ -z "${LOCAL_MODEL_PATH}" || "${LOCAL_MODEL_PATH}" != /* ]]; then + echo "ERROR: failed to resolve LOCAL_MODEL_PATH: ${LOCAL_MODEL_PATH}" >&2 + exit 1 +fi +echo "[host] local_model_path: ${LOCAL_MODEL_PATH}" + echo "[host] submit PPO/GRPO/SFT via API using user dataset paths" -PPO_TASK_ID="$(submit_taskspec_inline "${USER_TOKEN}" $'workload: ppo\nnnodes: 2\nn_gpus_per_node: 4\ncode_path: /private/common/code/verl/verl_repo\ntrain_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/train.parquet\nval_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/test.parquet\nmodel_id: Qwen/Qwen2.5-0.5B-Instruct\ntotal_epochs: 1\ntotal_training_steps: 10\nsave_freq: 10\n')" -GRPO_TASK_ID="$(submit_taskspec_inline "${USER_TOKEN}" $'workload: grpo\nnnodes: 2\nn_gpus_per_node: 4\ncode_path: /private/common/code/verl/verl_repo\ntrain_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/train.parquet\nval_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/test.parquet\nmodel_id: Qwen/Qwen2.5-0.5B-Instruct\ntotal_epochs: 1\ntotal_training_steps: 10\nsave_freq: 10\n')" -SFT_TASK_ID="$(submit_taskspec_inline "${USER_TOKEN}" $'workload: sft\nnnodes: 1\nn_gpus_per_node: 1\ncode_path: /private/common/code/verl/verl_repo\ntrain_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k_sft/train.parquet\nval_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k_sft/test.parquet\nmodel_id: Qwen/Qwen2.5-0.5B-Instruct\ntotal_epochs: 1\ntotal_training_steps: 10\nsave_freq: 10\n')" +PPO_TASK_ID="$(submit_taskspec_inline "${USER_TOKEN}" $'workload: ppo\nnnodes: 2\nn_gpus_per_node: 4\ncode_path: 
/private/common/code/verl/verl_repo\ntrain_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/train.parquet\nval_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/test.parquet\nmodel_id: '"${LOCAL_MODEL_PATH}"$'\ntotal_epochs: 1\ntotal_training_steps: 10\nsave_freq: 10\n')" +GRPO_TASK_ID="$(submit_taskspec_inline "${USER_TOKEN}" $'workload: grpo\nnnodes: 2\nn_gpus_per_node: 4\ncode_path: /private/common/code/verl/verl_repo\ntrain_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/train.parquet\nval_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/test.parquet\nmodel_id: '"${LOCAL_MODEL_PATH}"$'\ntotal_epochs: 1\ntotal_training_steps: 10\nsave_freq: 10\n')" +SFT_TASK_ID="$(submit_taskspec_inline "${USER_TOKEN}" $'workload: sft\nnnodes: 1\nn_gpus_per_node: 1\ncode_path: /private/common/code/verl/verl_repo\ntrain_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k_sft/train.parquet\nval_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k_sft/test.parquet\nmodel_id: '"${LOCAL_MODEL_PATH}"$'\ntotal_epochs: 1\ntotal_training_steps: 10\nsave_freq: 10\n')" echo "[host] submitted task ids:" echo " ppo=${PPO_TASK_ID}"