From c405adc4fc1e727de23c67cb7acee4311406db90 Mon Sep 17 00:00:00 2001 From: yuyr Date: Tue, 23 Dec 2025 14:22:15 +0800 Subject: [PATCH] mvp v1.1 save --- .gitignore | 3 +- specs/mvp/milestones.md | 34 + specs/mvp/mvp_roadmap.md | 348 +++ specs/mvp/v1.1/mvp_plan.md | 169 ++ specs/mvp/v1.1/sdk_submit_refactor.md | 148 ++ specs/mvp/v1.1/v1.1_action.md | 333 +++ src/mvp/v1.1/README.md | 61 + src/mvp/v1.1/docker-compose.yaml | 89 + src/mvp/v1.1/job_spec.schema.json | 33 + src/mvp/v1.1/py/configs/dev.yaml | 20 + src/mvp/v1.1/py/jobspecs/grpo.yaml | 20 + src/mvp/v1.1/py/jobspecs/ppo.yaml | 22 + src/mvp/v1.1/py/jobspecs/sft.yaml | 22 + src/mvp/v1.1/py/mvp_v11/__init__.py | 1 + src/mvp/v1.1/py/mvp_v11/builders.py | 96 + src/mvp/v1.1/py/mvp_v11/driver_entrypoint.py | 63 + src/mvp/v1.1/py/mvp_v11/models.py | 121 ++ src/mvp/v1.1/py/mvp_v11/ray_job_tool.py | 171 ++ src/mvp/v1.1/py/mvp_v11/yaml_io.py | 21 + src/mvp/v1.1/py/requirements.txt | 2 + src/mvp/v1.1/py/run.py | 69 + src/mvp/v1.1/py/sitecustomize.py | 57 + src/mvp/v1.1/scripts/00_prereq_check.sh | 42 + src/mvp/v1.1/scripts/01_up.sh | 16 + src/mvp/v1.1/scripts/02_down.sh | 12 + src/mvp/v1.1/scripts/03_cleanup_v1_legacy.sh | 16 + src/mvp/v1.1/scripts/05_ensure_verl_repo.sh | 23 + src/mvp/v1.1/scripts/12_install_py_deps.sh | 10 + src/mvp/v1.1/scripts/20_start_head.sh | 18 + src/mvp/v1.1/scripts/21_start_workers.sh | 26 + .../v1.1/scripts/30_prepare_data_and_model.sh | 86 + src/mvp/v1.1/scripts/31_snapshot_verl_code.sh | 42 + src/mvp/v1.1/scripts/32_clone_verl_tags.sh | 39 + src/mvp/v1.1/scripts/40_submit_ppo_epoch1.sh | 72 + src/mvp/v1.1/scripts/41_submit_grpo_epoch1.sh | 73 + src/mvp/v1.1/scripts/42_submit_sft_minimal.sh | 62 + src/mvp/v1.1/scripts/43_submit_jobspec.sh | 17 + src/mvp/v1.1/scripts/44_submit_sdk.sh | 19 + .../scripts/46_submit_ppo_two_verl_tags.sh | 86 + src/mvp/v1.1/scripts/50_status.sh | 13 + src/mvp/v1.1/scripts/lib.sh | 52 + src/mvp/v1.1/scripts/run_all.sh | 15 + src/mvp/v1.1/submit_job.py | 282 +++ src/mvp/v1.1/templates/grpo.json | 31 + src/mvp/v1.1/templates/ppo.json | 31 + src/mvp/v1.1/templates/sft.json | 32 + src/mvp/v1/arch.excalidraw | 1877 +++++++++++++++++ 47 files changed, 4894 insertions(+), 1 deletion(-) create mode 100644 specs/mvp/milestones.md create mode 100644 specs/mvp/mvp_roadmap.md create mode 100644 specs/mvp/v1.1/mvp_plan.md create mode 100644 specs/mvp/v1.1/sdk_submit_refactor.md create mode 100644 specs/mvp/v1.1/v1.1_action.md create mode 100644 src/mvp/v1.1/README.md create mode 100644 src/mvp/v1.1/docker-compose.yaml create mode 100644 src/mvp/v1.1/job_spec.schema.json create mode 100644 src/mvp/v1.1/py/configs/dev.yaml create mode 100644 src/mvp/v1.1/py/jobspecs/grpo.yaml create mode 100644 src/mvp/v1.1/py/jobspecs/ppo.yaml create mode 100644 src/mvp/v1.1/py/jobspecs/sft.yaml create mode 100644 src/mvp/v1.1/py/mvp_v11/__init__.py create mode 100644 src/mvp/v1.1/py/mvp_v11/builders.py create mode 100644 src/mvp/v1.1/py/mvp_v11/driver_entrypoint.py create mode 100644 src/mvp/v1.1/py/mvp_v11/models.py create mode 100644 src/mvp/v1.1/py/mvp_v11/ray_job_tool.py create mode 100644 src/mvp/v1.1/py/mvp_v11/yaml_io.py create mode 100644 src/mvp/v1.1/py/requirements.txt create mode 100644 src/mvp/v1.1/py/run.py create mode 100644 src/mvp/v1.1/py/sitecustomize.py create mode 100644 src/mvp/v1.1/scripts/00_prereq_check.sh create mode 100644 src/mvp/v1.1/scripts/01_up.sh create mode 100644 src/mvp/v1.1/scripts/02_down.sh create mode 100644 src/mvp/v1.1/scripts/03_cleanup_v1_legacy.sh create mode 100644 
src/mvp/v1.1/scripts/05_ensure_verl_repo.sh create mode 100644 src/mvp/v1.1/scripts/12_install_py_deps.sh create mode 100644 src/mvp/v1.1/scripts/20_start_head.sh create mode 100644 src/mvp/v1.1/scripts/21_start_workers.sh create mode 100644 src/mvp/v1.1/scripts/30_prepare_data_and_model.sh create mode 100644 src/mvp/v1.1/scripts/31_snapshot_verl_code.sh create mode 100644 src/mvp/v1.1/scripts/32_clone_verl_tags.sh create mode 100644 src/mvp/v1.1/scripts/40_submit_ppo_epoch1.sh create mode 100644 src/mvp/v1.1/scripts/41_submit_grpo_epoch1.sh create mode 100644 src/mvp/v1.1/scripts/42_submit_sft_minimal.sh create mode 100644 src/mvp/v1.1/scripts/43_submit_jobspec.sh create mode 100644 src/mvp/v1.1/scripts/44_submit_sdk.sh create mode 100644 src/mvp/v1.1/scripts/46_submit_ppo_two_verl_tags.sh create mode 100644 src/mvp/v1.1/scripts/50_status.sh create mode 100644 src/mvp/v1.1/scripts/lib.sh create mode 100644 src/mvp/v1.1/scripts/run_all.sh create mode 100644 src/mvp/v1.1/submit_job.py create mode 100644 src/mvp/v1.1/templates/grpo.json create mode 100644 src/mvp/v1.1/templates/ppo.json create mode 100644 src/mvp/v1.1/templates/sft.json create mode 100644 src/mvp/v1/arch.excalidraw diff --git a/.gitignore b/.gitignore index c72c0e1..ce0d616 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ verl/ skypilot-ssh-test/ -ray_in_docker/ \ No newline at end of file +ray_in_docker/ +__pycache__/ diff --git a/specs/mvp/milestones.md b/specs/mvp/milestones.md new file mode 100644 index 0000000..c983f1d --- /dev/null +++ b/specs/mvp/milestones.md @@ -0,0 +1,34 @@ + +# milestones + +通过以下几个里程碑来梳理和分析确认可行性,最终目标是产出一套基于Native Ray集群(无k8s底座)的verl 训练平台,支持多用户,运行各类verl任务,提高整体集群的资源利用效率,并且能够通过监测系统进行观察和资源统计,监控报警。未来形成运维SOP后,接入运维智能体,执行自动化运维。 +- Workload + - ppo on ray + - grpo on ray + - sft on ray 可行性 + - model serving on ray + - customize code 自定义代码,任意verl example 提交代码 + - 自定义reward function + - 同时多verl版本支持,同时跑不同的ray任务,但是使用不同版本的verl,甚至是用户魔改版本 +- Ray Job管理 + - 通过python api提交,而不是通过ray cli提交 + - 任务排队机制。无优先级,多个pending job谁先满足资源就谁先执行。 + - 【确认支持】gang scheduling (all or nothing), 指定好trainer.nnodes和trainer.n_gpus_per_node参数,不满足就pending。 + - 无配额管理、公平调度等特性。 + - Ray本身不支持任务超时参数,需要单独job监控,发现超时才停止。 + - Pipeline管理【高级, 暂不实现】 + - 提供对Ray Job进一步封装,串联多个Ray Job,自动完成训练,模型合并等job串联 +- 可观测性 Observability + - 测试本地部署 weight and bias server 可行性,如何集成现有job流程 + - 测试部署 prometheus & grafana,对ray节点进行监测 + - job监控,哪些job使用了多少资源,跑了多长时间,资源利用率是否充分,是否空占着GPU +- 数据、模型存储管理 + - shared dataset管理:所有用户共享的hf数据集 + - hf 模型管理:所有用户共享的hf 基座模型库 + - user dataset 管理: 用户独自的数据集管理 + - user 模型管理:用户独自的模型管理,保存训练好的模型 + - job 作业数据管理,作业产出物,临时目录数据 + - user management:用户可以通过统一界面来管理自己是user dataset/model space和自己运行的job的临时目录,从而灵活组织任务流水线,提供灵活的文件查看方式 +- 网络 + - 确认是否支持IB(H100环境),以及RoCEv2(H20环境),需要怎样配置 + diff --git a/specs/mvp/mvp_roadmap.md b/specs/mvp/mvp_roadmap.md new file mode 100644 index 0000000..716d002 --- /dev/null +++ b/specs/mvp/mvp_roadmap.md @@ -0,0 +1,348 @@ +# MVP Roadmap(V1 → V2 → … → 训练平台) + +本文档在 `specs/mvp/milestones.md` 的草稿基础上做**扩展与细化**:把目标拆成可迭代的版本(MVP v1/v2/…),保证每个版本都能**独立运行、可验证验收**,并且在上一版本基础上演进。 + +> 总目标(North Star):产出一套**基于 Native Ray 集群(无 K8s 底座)**的训练平台,面向多用户,支持 `verl` 各类训练/评测/Serving 工作负载,提升集群利用率,并通过可观测系统实现资源统计、监控告警,最终形成运维 SOP 并可接入运维智能体做自动化运维。 + +--- + +## 0. 
关键原则(贯穿所有版本) + +1) **版本可独立运行**:每个版本都能从“空环境”按文档跑起来(不依赖未来能力)。 +2) **验收可客观验证**:每个里程碑必须有明确的 DoD(Definition of Done)与可复现步骤。 +3) **强制产物落盘**:模型/数据/日志/ckpt 必须可追踪、可复用、可审计(基于共享存储/NFS)。 +4) **Head 不参与计算**:Head 只承担控制面(GCS/Dashboard/Job server),避免训练抢占控制面资源。 +5) **按 submission id 组织作业**:作业输出目录与 Ray submission id 绑定,方便检索、回收、归档。 +6) **“先把 RL 跑稳”,再扩 workload**:先 PPO(已验证),再 GRPO/SFT/Serving。 + +--- + +## 0.1 里程碑总览(建议交付顺序) + +| 版本 | 定位 | 关键交付 | 核心验收点 | +|---|---|---|---| +| v1 | 可复现实验闭环 | Ray 集群 + PPO 跑通 + 持久化 | driver 不在 head;产物落盘 | +| v1.1 | 实验工程化 | JobSpec 模板 + 新增 1 个 workload | 可回归、可定位、可扩展 | +| v2.0 | 服务化入口 | API + Ray Jobs SDK | API 提交/查询/停止可用 | +| v2.1 | 节点纳管 | SSH 注入 + 资源池/标签 | 节点上线/下线、gang 约束 | +| v3.0 | 平台雏形 | 队列 + 超时 + 最小多用户 | pending→running 自动调度 | +| v3.1 | 可扩展平台 | 自定义代码/reward + 多版本 | 多版本并存、插件可用 | +| v4.0 | 可运营平台 | Prom/Grafana + W&B | 资源核算/告警/归档 | +| v4.1 | 可交接平台 | SOP + 自动化运维接口 | 非开发可按 SOP 运维 | +| v5.0 | 长期形态 | Serving + Pipeline | 训练→发布推理闭环 | + +## 1. 当前基线:MVP v1(已完成/已验证) + +### 1.1 目标 + +在单机(或同一宿主机)用 3 个容器跑通: + +- Ray head(无 GPU,CPU=0/GPU=0) +- 2 个 Ray worker(每个 4 GPU) +- 通过 **head 上的 `ray job submit`** 提交 `verl` PPO(`total_epochs=1`) +- 通过 **entrypoint 自定义资源**强制 driver 在 worker 上 +- 数据/模型/日志/ckpt 全部持久化 + +### 1.2 交付物(repo 中已存在) + +- 脚本与 compose:`src/mvp/v1/` +- 行动与验收文档:`specs/mvp/v1/v1_action.md` +- 共享目录约定:`shared/datasets`、`shared/hf`、`shared/jobs` 等(与 NFS 对齐) + +### 1.3 验收口径(摘要) + +- `ray job list` 的 `driver_info.node_ip_address` ∈ worker IP,且 ≠ head IP +- 训练输出落在 `/mnt/shared/jobs//...` +- checkpoint 按 `save_freq` 产生(避免爆磁盘) + +--- + +## 2. MVP v1.1(Hardening + 多 workload 可行性验证) + +> 目标:把 v1 从“实验脚本”升级成“可长期回归的最小系统”,并验证更多 workload 的可行性边界。 + +### 2.1 主要能力 + +- Workload 扩展(可选顺序): + - PPO(回归金标) + - GRPO on Ray(可运行验证) + - SFT on Ray(可运行验证:`llamafactory` 或 `verl` 相关 SFT 路径) +- 作业模板化(最小实现): + - 统一 JobSpec(YAML/JSON)描述:workload 类型、资源(nnodes/n_gpus_per_node)、数据、模型、输出目录、超时 + - 仍然用 `ray job submit`,但把 entrypoint 组装逻辑标准化 +- checkpoint 策略与磁盘保护: + - 默认 `save_freq` ≥ 10(或按训练总 steps 的比例) + - 明确保留策略(至少提供“保留最后 N 个 ckpt”的配置建议/脚本) +- “失败可定位”: + - 统一收敛日志入口(Ray job logs + hydra 日志目录 + 关键参数快照) + - 失败时能定位:是资源不足 / NCCL / 数据 / 模型 / 配置错误 + +### 2.2 验收(DoD) + +- 同一套脚本在同一台机器能连续跑 3 次 PPO 回归,产物目录不互相覆盖 +- 至少新增 1 个 workload(GRPO 或 SFT)可以跑通 “启动→训练→落盘” 闭环 +- 作业目录内包含: + - `config/submit_cmd.txt`(或 job spec 快照) + - `logs/`(可追踪) + - `checkpoints/`(按策略生成) + +--- + +## 3. MVP v2.0(Control Plane 服务化:API + Ray Jobs SDK) + +> 目标:从“人跑脚本”升级为“服务提交任务”。依然是 Native Ray 集群,但引入一个最小控制平面服务。 + +### 3.1 系统形态 + +- Control Plane(建议部署在 head/CPU 机器): + - FastAPI 服务(REST) + - Job 管理:用 Ray Jobs **Python SDK** 提交/查询/停止(不再依赖 CLI 文本解析) + - 节点视图:读取 Ray state(nodes, actors, placement groups) +- Data Plane: + - 仍然是预先启动的 worker 节点加入集群(先不做 SSH 动态纳管也可) + +### 3.2 API(MVP 级别) + +- `POST /v1/jobs`:提交 JobSpec(ppo/grpo/sft) +- `GET /v1/jobs`:列表(含状态、资源、开始/结束时间) +- `GET /v1/jobs/{id}`:详情(含输出目录、driver node) +- `POST /v1/jobs/{id}:stop`:停止作业 + +### 3.3 验收(DoD) + +- API 提交 PPO,返回 submission id;输出目录为 `/mnt/shared/jobs//...` +- API 查询 job 状态与 driver node(必须是 worker) +- 停止 job 后,资源释放、状态可见 + +--- + +## 4. 
MVP v2.1(SSH 纳管 + 资源池 + Gang 约束) + +> 目标:对齐你草稿里“SSH 纳管”的约束与需求:控制面能纳管 GPU 节点,形成可运营的资源池。 + +### 4.1 节点纳管(SSH Provisioner) + +- 控制面保存 NodeSpec(ip/user/port/labels/gpu_count) +- 通过 SSH 执行: + - `ray start --address=:6379 --resources=...` + - `ray stop`(drain/下线) +- 维护节点状态机:`pending → online → draining → offline` + +### 4.2 资源池与 gang(All-or-nothing) + +- 资源池最小模型: + - pool 标签(如 `pool_a`、`h20`、`ib_domain_1`) + - 提交 job 时指定 pool 约束 +- Gang 约束(MVP 实现方式): + - job spec 明确 `trainer.nnodes` + `trainer.n_gpus_per_node` + - 提交前检查 Ray 可用资源是否满足,不满足则进入 pending 队列(见 v3.0) + +### 4.3 验收(DoD) + +- 通过 API 注册 2 个 worker(SSH 注入 ray start)后,`ray status` 可见节点上线 +- 通过 API 下线节点,节点被标记不可调度且不再分配新 job +- gang 不满足时 job 不提交(或提交后一直 pending),满足后可运行 + +--- + +## 5. MVP v3.0(调度与多用户:队列 + 超时 + 最小权限) + +> 目标:平台开始“像个平台”:多用户、队列、超时、审计。仍然不做复杂配额/公平调度。 + +### 5.1 作业队列(简单但可用) + +- FIFO 队列:无优先级 +- “资源满足就调度”:谁先满足谁先跑(可接受非严格 FIFO) +- job 超时:Ray 原生不支持统一 timeout(草稿已指出),因此控制面需: + - 记录 start_time + - 定期扫描超时 job → `stop` + +### 5.2 多用户最小闭环 + +- 认证(MVP):token 或 basic auth(先不做复杂 RBAC) +- 归属与隔离(文件层): + - `/mnt/shared/users//datasets/` + - `/mnt/shared/users//models/` + - `/mnt/shared/jobs//` 记录 user/metadata + +### 5.3 验收(DoD) + +- 2 个用户可各自提交 job,能看到自己的 job 列表与输出目录 +- 超时策略可触发(模拟短 timeout),job 被停止且状态标记为 timeout +- 队列在资源不足时保持 pending,资源释放后自动运行 + +--- + +## 6. MVP v3.1(可扩展性:自定义代码/Reward、多版本 VERL) + +> 目标:把“平台内置 workload”升级成“用户可提交自定义代码与 reward”,并支持多版本并存。 + +### 6.1 自定义代码提交(最小实现) + +两种方式二选一(建议先做 A): + +- A:`working_dir` 指向 NFS 上的代码快照目录(用户自己准备/上传) +- B:上传 zip(控制面落到 NFS 并解压为 code snapshot) + +### 6.2 多版本 VERL 并存 + +约束前提:**基础镜像保持同一个**(生产环境容器由算力平台创建时已固定镜像标签)。 + +目标:在同一 Ray 集群内,不同 job 可以使用不同版本的 `verl`(例如不同分支/commit 或用户魔改版)。 + +已确认优先方案(A):**必须通过 Ray Job 的 `runtime_env.env_vars` 透传 `PYTHONPATH`**,让 job 粒度优先 import 指定代码快照。 + +建议方案(以 NFS 为中心,最小可行实现): + +- 在共享存储上以“不可变快照”的方式存放代码版本(推荐 commit hash 命名): + - `${SHARED_ROOT}/common/code/verl//...` + - `${SHARED_ROOT}/users//code/verl//...`(用户魔改版) +- JobSpec 增加 `code_path`(指向上述目录),控制面在提交 job 时注入(必须走 runtime_env): + - `runtime_env.env_vars.PYTHONPATH = ":$PYTHONPATH"`(把 code_path 放最前面,确保 import 优先级) + +示例(概念性,实际以 `${SHARED_ROOT}` 为准): + +```bash +CODE_PATH="${SHARED_ROOT}/common/code/verl/" + +ray job submit \ + --address="http://127.0.0.1:8265" \ + --submission-id="" \ + --runtime-env-json='{"env_vars": {"PYTHONPATH": "'"${CODE_PATH}"':$PYTHONPATH"}}' \ + -- \ + python3 -m verl.trainer.main_ppo ... +``` + +需要验证的关键点(作为 v3.1 的 DoD 之一): + +- 同时运行两个 job: + - jobA 使用 ``,jobB 使用 `` + - 互不影响,且各自训练/日志/ckpt 正常 +- job 粒度是否能做到“依赖隔离”(至少做到 `verl` 版本隔离;第三方依赖冲突可先假设镜像内一致) + +> 备注:当前 v1 的做法是容器内全局 `pip install -e /workspace/verl`,这会让所有 job 默认使用同一份 `verl`。要实现多版本并存,必须让 job 的 import 优先使用 `code_path`(或为每个 job 单独创建 venv/安装 wheel;后者更重,建议后置)。 + +### 6.3 自定义 reward function + +- JobSpec 支持 `reward_fn_path`(Python 模块路径) +- `reward_fn_path` 可指向共享存储中用户自定义代码目录(例如 `${SHARED_ROOT}/users//code/...`) + - 约束:代码必须在 job runtime 中可 import(由 `working_dir`/`PYTHONPATH` 或 runtime_env 保障) +- 控制面校验模块可导入(basic lint/安全白名单可后置) + +### 6.4 验收(DoD) + +- 同时运行两个 job:使用不同的 `verl` 代码版本(或用户魔改版本),互不影响 +- 用户可在 JobSpec 中替换 reward function 并跑通一个最小训练闭环 + +--- + +## 7. 
MVP v4.0(可观测性:Prometheus/Grafana + W&B 集成) + +> 目标:平台可运营:能回答“谁在用多少资源、跑了多久、利用率如何、是否空占 GPU”。 + +### 7.1 指标与监控 + +- Ray 指标接入 Prometheus(节点/任务/actor) +- GPU 指标:nvidia exporter 或 DCGM exporter +- Dashboard:Grafana(至少 3 张核心面板) + - 集群总 GPU/CPU 使用率、空闲率 + - 每 job 的 GPU 时间、峰值显存、运行时长 + - 节点健康(心跳/掉线)与告警 + +### 7.2 W&B(或等价)集成验证 + +- 最小可行:单机 self-host W&B server 可用性验证 +- JobSpec 支持启用/关闭 W&B,并传入 project/run name + +### 7.3 验收(DoD) + +- Grafana 上能看到集群与 job 资源视图 +- 某个 job GPU 利用率异常(模拟)能触发告警规则(邮件/IM/日志即可) +- W&B 指标能按 job 维度归档(至少 PPO 能上报) + +--- + +## 8. MVP v4.1(运维化:SOP + 自动化运维接口) + +> 目标:把平台变成“可交接”的系统:运维动作标准化,并为智能体留出接口。 + +### 8.1 SOP 与自动化入口 + +- SOP 文档: + - 节点上线/下线 + - 故障定位(Ray session、Ray job、NCCL、OOM) + - 资源回收(停止 job、清理 ckpt) +- 自动化接口(最小): + - `/v1/ops/drain_node` + - `/v1/ops/restart_ray_head`(谨慎:需要保护与权限) + - `/v1/ops/cleanup_job_artifacts` + +### 8.2 验收(DoD) + +- 按 SOP,非开发人员可完成一次“节点上线→跑任务→下线→清理” +- 自动化接口至少能完成 1 个高频动作(如清理/停止/下线) + +--- + +## 9. MVP v5.0(Serving 与 Pipeline,偏长期) + +> 目标:训练-部署一体化:支持 model serving,并在平台内串联训练→评测→发布。 + +### 9.1 Serving + +- Ray Serve(或等价)部署模型推理服务 +- Serving 与训练共用模型库与权限(按 user/project) + +### 9.2 Pipeline(草稿里标为高级) + +- Pipeline 是对多个 job 的封装(训练→merge→eval→publish) +- 可先实现最小 DAG(两步串联)作为验证 + +### 9.3 验收(DoD) + +- 训练产物一键发布为一个可访问的推理 endpoint +- Pipeline 能自动串联并产出最终 artifact(可回滚/可追踪) + +--- + +## 10. 并行技术验证(建议尽早做) + +这些属于“跨版本”风险项,建议在 v1.1 ~ v2.0 期间尽早做: + +### 10.1 网络(IB / RoCEv2) + +- 确认环境是否支持 IB(H100)或 RoCEv2(H20) +- 跑最小 NCCL 通信验证(all-reduce / bandwidth) +- 将必要的 NCCL 环境变量注入到 job runtime_env + +### 10.2 Ray + 多节点容器约束 + +- 多容器同宿主机时的 Ray node_ip/临时目录冲突规律(已踩坑,需固化规范) +- 端口范围与防火墙策略(Ray worker 端口、dashboard、metrics) + +--- + +## 11. 已确认的约束与假设(来自讨论结论) + +这些会直接影响 v2.1(SSH 纳管)与后续多用户/存储设计: + +1) **最终形态仍以“每节点容器”运行**(不是裸机 systemd)。 + - H20 开发环境:我们可在宿主机用 `docker compose` 自建容器,并通过 SSH 进入容器调试/纳管。 + - H100 生产环境:容器由算力平台创建/回收;平台侧控制面只能 **SSH 进入这些容器** 做纳管(执行 `ray start/stop`、注入 env 等)。 +2) **认证**:内部 token 即可(MVP 阶段不对接 SSO)。 +3) **存储**:只考虑 NFS。 + - 开发环境:NFS/共享目录可通过宿主机 bind mount 提供给容器。 + - 生产环境:所有容器挂载相同 NFS,容器内共享根路径为 `/private/`(需要在实现时把“共享根路径”做成可配置项,而不是写死 `/mnt/shared`)。 +4) **网络拓扑约束**:暂不做按 IB 域/机架/拓扑的强约束调度(第 10.1 仍需验证 IB/RoCE 是否可用与配置方式,但调度不引入拓扑维度)。 +5) **共享目录分层**:在 `users//...` 之外增加一个可读写的 `common/` 目录用于共享数据/模型/代码: + - `${SHARED_ROOT}/common/datasets/` + - `${SHARED_ROOT}/common/models/` + - `${SHARED_ROOT}/common/code/` + - 权限(MVP):先默认“所有内部 token 用户可读写”,后续再细化只读/受控写。 + +--- + +## 12. 仍需你确认/讨论的问题(剩余不确定项) + +1) `runtime_env.env_vars` 注入对“子进程/训练框架内部启动进程”的覆盖范围是否足够? + - 需要确认 `verl`/`sglang` 等子进程是否继承 driver 的环境变量(通常会继承,但建议在 v3.1 验收时明确验证)。 diff --git a/specs/mvp/v1.1/mvp_plan.md b/specs/mvp/v1.1/mvp_plan.md new file mode 100644 index 0000000..99dbbc9 --- /dev/null +++ b/specs/mvp/v1.1/mvp_plan.md @@ -0,0 +1,169 @@ +# MVP v1.1 计划(Hardening + 多 Workload 可行性验证) + +本目录是 `specs/mvp/v1/` 的下一步迭代:在 v1 已经跑通(Ray head + 2 worker,PPO on Ray,持久化落盘)的基础上,把它升级为**可长期回归**的最小系统,并扩展至少一个新 workload 的可行性闭环。 + +> v1.1 的目标不是做平台服务化(API/队列/多用户)——那是 v2/v3 的工作;v1.1 聚焦“工程化 + 可行性边界验证 + 可观测/可排障基础”。 + +--- + +## 1. v1 基线回顾(已完成) + +- 拓扑:1 head(无 GPU,CPU/GPU=0)+ 2 worker(各 4 GPU) +- 提交方式:必须用 head 上的 `ray job submit` +- driver 调度:通过 `worker_node` 自定义资源 + `--entrypoint-resources` 强制 driver 在 worker +- 输出:按 `submission_id` 组织到共享目录(NFS) + +相关实现参考: + +- 脚本:`src/mvp/v1/` +- 验收动作:`specs/mvp/v1/v1_action.md` +- Roadmap:`specs/mvp/mvp_roadmap.md` + +--- + +## 2. 
v1.1 目标(必须达成) + +### 2.1 工程化(Hardening) + +1) **JobSpec 标准化(最小)** + - 把“提交 job 需要的参数”收敛成结构化文件: + - Ray 基础配置(YAML):cluster 地址、entrypoint 资源约束、runtime_env 等 + - 训练 JobSpec(YAML):workload 语义与训练参数 + - 至少覆盖:`submission_id`、workload 类型、资源需求、共享根路径、模型/数据路径、输出目录、超时、环境变量注入。 + - v1.1 实现落点(已在 repo 里提供,SDK 方式): + - RayConfig 示例:`src/mvp/v1.1/py/configs/dev.yaml` + - JobSpec 示例:`src/mvp/v1.1/py/jobspecs/{ppo,grpo,sft}.yaml` + - 提交入口:`src/mvp/v1.1/py/run.py`(在 head 容器内执行,使用 Ray Python SDK 提交) + - 设计文档:`specs/mvp/v1.1/sdk_submit_refactor.md` + +2) **共享根路径抽象(dev/prod 一致)** + - 引入 `SHARED_ROOT` 作为唯一共享根路径: + - dev:建议也用 `/private`(docker compose 把宿主机 shared 挂到容器内 `/private`,模拟生产) + - prod:固定 `/private`(算力平台容器内 NFS) + - 任何代码/脚本不得写死 `/mnt/shared`(允许兼容旧路径但不得作为主路径)。 + +3) **共享目录分层(新增 `common/` 与 `user/`)** + - 在 `datasets/hf/jobs/outputs` 之外,新增一个所有用户可读写的共享区: + - `${SHARED_ROOT}/common/`:共享模型/数据/代码快照(多版本 verl / 公共数据) + - `${SHARED_ROOT}/user/`:用户自定义代码(例如 `reward_fn_path` 指向这里) + - v1.1 默认策略:先假设“所有用户可写”(后续 v3 再做权限与隔离)。 + +4) **可排障基础** + - 每个 job 目录必须有: + - `config/`:提交命令、JobSpec 快照、关键 env_vars + - `logs/`:Ray job logs + hydra logs(如有) + - `checkpoints/`:按 `save_freq` 控制频率(默认每 10 step) + - 提供“失败快照”能力:收集 `ray status` / `ray job list` / `ray list nodes` / `ray list actors`(最少其中 2 项)写入 job 目录。 + - v1.1 submitter 默认落盘: + - `${SHARED_ROOT}/jobs//config/job_spec.json` + - `${SHARED_ROOT}/jobs//config/runtime_env.json` + - `${SHARED_ROOT}/jobs//config/submit_cmd.txt` + - `${SHARED_ROOT}/jobs//logs/ray_job_submit.out` + - `${SHARED_ROOT}/jobs//debug/ray_status_{pre,post}.txt` + - `${SHARED_ROOT}/jobs//debug/ray_job_list_post.txt` + +### 2.2 Workload 扩展(至少新增 1 个) + +v1.1 需要新增并验收通过两个 workload(都要跑通闭环): + +- **GRPO on Ray**(推荐优先,复用 PPO 入口,通过算法配置切换) + - 基于 `python -m verl.trainer.main_ppo` + - 通过配置覆盖:`algorithm.adv_estimator=grpo`(以及必要的 rollout 参数) + + - **SFT on Ray(Ray-native)** + - 入口:`python -m verl.trainer.sft_trainer_ray` + - 参考实现:`verl/verl/trainer/sft_trainer_ray.py`(内部会 `ray.init()`) + - 需要确保 `ray.init()` 连接已有集群: + - 优先:`runtime_env.env_vars.RAY_ADDRESS=auto`(配合 `ray job submit`) + - 兜底:在 v1.1 的 launcher 脚本里显式 `ray.init(address="auto")` 再调用 trainer(避免依赖 Ray 的 env var 行为差异) + - 重要细节:Ray Job 的 entrypoint(driver)默认不分配 GPU,因此 SFT driver 侧不要强依赖 CUDA: + - 推荐:`trainer.device=cpu`(driver 只做 orchestration;训练由 Ray workers 占 GPU) + +--- + +## 3. v1.1 关键设计点 + +### 3.1 多版本代码与自定义逻辑(为 v3.1 铺路,但 v1.1 先做最小验证) + +已确定优先方案(A):通过 **Ray Job 的 `runtime_env.env_vars`** 注入 `PYTHONPATH`。 + +- `code_path`(例如 `${SHARED_ROOT}/common/code/verl/`) +- 提交 job 时设置: + - `runtime_env.env_vars.PYTHONPATH = ":$PYTHONPATH"` + +并约定: + +- `reward_fn_path` 可指向 `${SHARED_ROOT}/user/code/...` 下用户自定义代码 +- 与 `code_path` 一样,必须通过 `runtime_env.env_vars` 确保该路径可被 import(例如把 `${SHARED_ROOT}/user/code` 也加入 `PYTHONPATH`) + +v1.1 中至少做一次“代码覆盖验证”: + +- 在 code_path 下放一个可识别的 `verl` 版本标识(例如 `verl.__version__` 打印差异) +- 提交 job 并在日志中确认 import 的是 code_path 的版本(而不是镜像内默认安装) + +v1.1 的最小落地方式(已实现): + +- 提供代码快照脚本:`src/mvp/v1.1/scripts/31_snapshot_verl_code.sh` + - 会把 `/workspace/verl`(挂载的 repo)复制到 `${SHARED_ROOT}/common/code/verl//` + - 并写入 `${code_path}/mvp_marker.py`,用于在 Ray job logs 中验证“选用的是哪份 code_path” +- submitter 会在 entrypoint 前运行 preflight: + - 打印 `verl.__file__` 与 `mvp_marker.MARKER` + - 由此确认 job 粒度的 PYTHONPATH 生效,且不同 job 可指向不同 `code_path`(多版本共存) + +### 3.2 Checkpoint 策略(磁盘保护) + +- 默认:`save_freq=10`(每 10 step 保存一次) +- 对于 step 数已知的短任务(例如 29 steps),可以通过配置把 `save_freq` 调整为 10/15/29(按需求权衡) +- 作业目录按 `submission_id` 隔离,方便清理与归档 + +--- + +## 4. 
v1.1 交付物清单(代码 + 文档) + +### 4.1 代码(建议落点) + +在 `src/mvp/` 下新增 v1.1 级别的提交器与模板(或在 `src/mvp/v1` 原地演进但要保持 v1 可回归): + +- `src/mvp/v1.1/` + - `docker-compose.yaml`(与 v1 互不干扰的容器名/网络名) + - `scripts/`(Ray 启动/prepare 保留 bash;submit 通过 SDK 工具执行) + - `py/`(工程化提交层:YAML + Ray Python SDK) + - `py/configs/`(Ray 基础配置) + - `py/jobspecs/`(训练 JobSpec) + - `py/run.py`(入口) + +此外,为了对齐 dev 环境约束(远程机固定目录): + +- 远程机目录必须新增:`argus@h1:/home2/argus/infra/mvp/v1.1/` +- 该目录内需包含 v1.1 的全部内容(compose + scripts + README),可由本 repo 的 `src/mvp/v1.1/` 同步过去 + +### 4.2 文档 + +- `specs/mvp/v1.1/v1.1_action.md`:开发、部署、测试、验收流程(可复现) +- 更新 `specs/mvp/mvp_roadmap.md`:保持路线图与落地一致(按需) + +--- + +## 5. v1.1 验收标准(DoD) + +### 5.1 Hardening DoD + +- [ ] 所有提交均由 head 执行 `ray job submit`,且显式 `--submission-id=` +- [ ] 共享根路径由 `SHARED_ROOT` 控制(dev/prod 可切换),脚本无硬编码 +- [ ] 每个 job 的输出目录为:`${SHARED_ROOT}/jobs//` +- [ ] checkpoint 不会“每 step 保存”导致爆盘:默认 `save_freq=10` +- [ ] job 失败时,`${SHARED_ROOT}/jobs//config/` 中有足够信息定位(命令、env、ray 状态快照) + - [ ] v1.1 测试前会清理 v1 的遗留容器/进程(避免端口、容器名、Ray session 干扰) + +### 5.2 Workload DoD(GRPO + SFT 都必须) + +GRPO(必须): + +- [ ] `algorithm.adv_estimator=grpo` 的 job 可提交并进入 RUNNING +- [ ] job 能跑完最小训练步数(可设 `total_epochs=1` 或 `total_training_steps`) +- [ ] 输出目录内有日志与至少 1 次 checkpoint(或明确不保存并说明原因) + +SFT(必须): + +- [ ] `sft_trainer_ray` 可连接集群并跑到至少 1 个 step(推荐最小训练步数/epoch) +- [ ] 输出目录与 checkpoint 策略同 v1.1 规范(落盘到 `${SHARED_ROOT}/jobs//...`) diff --git a/specs/mvp/v1.1/sdk_submit_refactor.md b/specs/mvp/v1.1/sdk_submit_refactor.md new file mode 100644 index 0000000..9c6412b --- /dev/null +++ b/specs/mvp/v1.1/sdk_submit_refactor.md @@ -0,0 +1,148 @@ +# MVP v1.1 工程化重构方案:Ray Python SDK 提交层(YAML Config + YAML JobSpec) + +本文档把 v1.1 的“代码工程化”目标落到一个明确的设计:**保留现有 scripts**(Ray 集群构建、数据准备、模型准备、代码快照),将“任务提交机制”重构为 **Ray Python SDK**(`ray.job_submission.JobSubmissionClient`)驱动的 Python 工具层。 + +> 约束(已确认) +> 1) 基础配置用 YAML,JobSpec 也用 YAML。 +> 2) 工具必须在 **head 容器**执行(从 head 发起提交,满足“在 head 提交”的要求)。 +> 3) 训练参数组织保持与现在一致:仍然使用 **Hydra overrides** 方式构造 entrypoint。 +> 4) 不使用 `requests` 直连 HTTP API(只用 Ray SDK)。 + +--- + +## 1. 当前 Ray SDK 能力验证(关键前提) + +在 head 容器(`mvp11-ray-head`)中验证: + +- Ray 版本:`2.51.1` +- `JobSubmissionClient.submit_job` 支持以下关键字段: + - `submission_id` + - `runtime_env` + - `entrypoint_num_cpus` + - `entrypoint_num_gpus` + - `entrypoint_resources`(用于强制 driver 落 worker) + +因此 v1.1 可以“纯 SDK”完成提交,不需要 `requests` fallback。 + +--- + +## 2. 系统分层(不动 scripts,只重构提交层) + +### 2.1 scripts(保留) + +`src/mvp/v1.1/scripts/` 继续负责: + +- 容器生命周期:`01_up.sh` / `02_down.sh` +- Ray 启动:`20_start_head.sh` / `21_start_workers.sh` +- 数据/模型准备:`30_prepare_data_and_model.sh` +- 代码快照:`31_snapshot_verl_code.sh`(生成 `${SHARED_ROOT}/common/code/verl//`) + +scripts 可以新增一个“薄封装”脚本,负责 `docker exec` 进 head 容器并运行 Python 提交器,但 scripts 不再拼 `ray job submit ...` CLI 字符串。 + +### 2.2 Python 工具层(新增) + +在 `src/mvp/v1.1/py/` 新增提交工具层: + +- 读取 Ray 基础配置(YAML) +- 读取训练 JobSpec(YAML) +- 用 Ray Python SDK 提交/查询/停止/拉日志 +- 将 job 级别产物落盘到:`${SHARED_ROOT}/jobs//...` + +--- + +## 3. 
输入定义:两份 YAML + +### 3.1 Ray 基础配置(RayConfig YAML) + +这份配置是“稳定可复用”的,描述 cluster 与 driver placement 等通用信息。 + +字段建议: + +- `address`: `http://127.0.0.1:8265`(从 head 容器内部视角) +- `shared_root`: `/private` +- `entrypoint_num_cpus`: `1` +- `entrypoint_resources`: `{"worker_node": 1}`(强制 driver 使用 worker 才有的资源) +- `runtime_env.env_vars`: HF cache / endpoint 等通用环境变量 +- `user_code_path`: `${shared_root}/user/code`(可选,默认值也可) + +### 3.2 训练 JobSpec(JobSpec YAML) + +这份配置是“一次训练”语义,描述 workload + 训练参数 + code_path 多版本等。 + +字段建议: + +- `workload`: `ppo|grpo|sft` +- `submission_id`: 可选(不填则生成;但最终必须显式传给 SDK) +- `code_path`: `${shared_root}/common/code/verl/`(多版本关键字段) +- `model_id` +- 数据路径:`train_file` / `val_file`(按 workload) +- 训练参数:`nnodes` / `n_gpus_per_node` / `total_training_steps` / `save_freq` / `test_freq` + +注意(SFT 的 driver 设备选择): + +- Ray job 的 entrypoint(driver)默认不分配 GPU(我们通常不设置 `entrypoint_num_gpus`)。 +- `sft_trainer_ray.py` 的 driver 会用 `trainer.device` 做张量统计;若设置为 `cuda` 且 driver 无 GPU,会报: + - `RuntimeError: No CUDA GPUs are available` +- 因此 v1.1 的 SFT JobSpec 默认应设置:`trainer.device=cpu`(训练 workers 仍会占用 GPU)。 + +--- + +## 4. Python 提交器的职责(tool class) + +建议实现 `RayJobTool`(或类似命名),能力: + +### 4.1 submit(核心) + +输入:`RayConfig + JobSpec` +输出:`submission_id` + +实现要点: + +- `client = JobSubmissionClient(address)` +- 生成/确定 `submission_id` +- `runtime_env` 合并逻辑: + - 合并 config 与 jobspec 的 `env_vars` + - 强制注入多版本: + - `PYTHONPATH = "::$PYTHONPATH"` +- 构造 entrypoint(保持 hydra overrides 风格): + - PPO/GRPO:`python3 -m verl.trainer.main_ppo ...` + - SFT:`python3 -m verl.trainer.sft_trainer_ray ...` +- 强制 driver 落 worker: + - `entrypoint_resources=config.entrypoint_resources` + - `entrypoint_num_cpus=config.entrypoint_num_cpus` +- 落盘产物: + - `${shared_root}/jobs//config/{ray_config.yaml,jobspec.yaml,submit_payload.json}` + - `${shared_root}/jobs//logs/submit.out` + - `${shared_root}/jobs//debug/{ray_status_pre,ray_job_list_post}.txt`(可用 SDK 或 `ray status` 采集) + +### 4.2 status / stop / logs / list + +- `status(submission_id)` +- `stop(submission_id)` +- `logs(submission_id)`(可支持 tail) +- `list()` + +--- + +## 5. `run.py` 入口(必须在 head 容器执行) + +建议入口: + +- `python3 /workspace/mvp/v1.1/py/run.py --config --jobspec --action submit` +- `--action` 支持:`submit|status|stop|logs|list` + +host 侧执行方式(由 scripts 薄封装): + +- `docker exec mvp11-ray-head python3 /workspace/mvp/v1.1/py/run.py ...` + +--- + +## 6. 验收口径(工程化部分) + +1) **SDK 提交**:不使用 `ray job submit` CLI,改用 `JobSubmissionClient.submit_job`。 +2) **driver 仍强制在 worker**:SDK 提交时 `entrypoint_resources={"worker_node":1}` 生效。 +3) **多版本共存验证**: + - 通过 `31_snapshot_verl_code.sh` 生成 `codeA/codeB` 两份 code_path + - 通过两份 JobSpec 分别指向不同 `code_path` + - 在 job logs 中看到不同的 marker(例如 `mvp_marker.MARKER`) + diff --git a/specs/mvp/v1.1/v1.1_action.md b/specs/mvp/v1.1/v1.1_action.md new file mode 100644 index 0000000..cd72a7f --- /dev/null +++ b/specs/mvp/v1.1/v1.1_action.md @@ -0,0 +1,333 @@ +# MVP v1.1 行动文档(实施方案 / 部署测试 / 验收口径) + +本文档面向“把 v1 跑通的实验脚本,升级为可长期回归的 v1.1 最小系统”,并给出**开发改造 → 部署测试 → 验收**的可复现流程。 + +> v1.1 的核心约束(来自讨论结论) +> - 仍然必须通过 **head 节点执行 `ray job submit`** 提交任务。 +> - 训练/driver **必须落在 worker**(head 不跑训练)。 +> - 多版本 `verl` 共存:同一镜像不变,必须通过 **Ray Job `runtime_env.env_vars` 注入 `PYTHONPATH`** 让 job 粒度选择代码版本。 +> - 存储只考虑 NFS:dev 环境我们自己 mount;生产环境容器内统一看到 `/private/`。 + +--- + +## 1. 
目标与非目标 + +### 1.1 目标(v1.1 必须做到) + +1) **可回归**:同一环境连续跑多次 PPO 回归,不互相覆盖,输出按 submission id 归档。 +2) **可扩展**:新增并验收通过 2 个 workload(**GRPO + SFT**)并跑通闭环。 +3) **可排障**:每个 job 目录包含完整的提交快照、关键 env、Ray 状态快照与日志入口。 +4) **可多版本共存**:同一 Ray 集群内,不同 job 通过 `PYTHONPATH` 选择不同 `verl` 代码快照。 + +### 1.2 非目标(v1.1 不做) + +- 不做平台 API/队列/多租户/RBAC(这是 v2/v3)。 +- 不做复杂调度(拓扑、IB 域、NUMA、Gang 等自动化策略)。 + +--- + +## 2. 运行环境约定(dev / prod 一致抽象) + +### 2.1 拓扑(单机 3 容器) + +- `mvp-ray-head`:无 GPU,`ray start --head --num-cpus=0 --num-gpus=0`(控制面 only) +- `mvp-ray-worker-0`:4 GPU +- `mvp-ray-worker-1`:4 GPU + +### 2.2 “head 不跑训练”的硬约束实现(必须) + +1) **head CPU=0**:从资源层面阻断默认 task/driver 落到 head。 +2) **worker 自定义资源标签**:worker 启动时带 `--resources='{"worker_node": 100}'`。 +3) **ray job submit 强制 entrypoint 落 worker**:提交时必须带: + - `--entrypoint-resources='{"worker_node": 1}'` + - `--entrypoint-num-cpus=1`(显式声明 driver 需要的 CPU) + +> 验证口径:`ray job list` 的 `driver_info.node_ip_address` 必须是 worker 的 IP,而不是 head IP。 + +### 2.3 共享存储(NFS)与路径(关键) + +- 生产环境:容器内共享根路径固定为 `/private/`(算力平台统一挂载 NFS)。 +- 开发环境:docker compose 也应把宿主机共享目录挂载到容器内的 `/private/`,从而做到 dev/prod 一致。 + +统一约定(容器内视角): + +- `SHARED_ROOT=/private` +- Job 输出:`${SHARED_ROOT}/jobs//` + +建议的共享目录结构(v1.1 新增 `common/` 与 `user/`): + +- `${SHARED_ROOT}/datasets/`:通用数据(例如 gsm8k parquet) +- `${SHARED_ROOT}/hf/`:HuggingFace cache(模型/分词器/权重) +- `${SHARED_ROOT}/jobs/`:按 submission id 归档的作业目录(强制) +- `${SHARED_ROOT}/outputs/`:临时/非强约束输出(不建议长期依赖) +- `${SHARED_ROOT}/ray/`:Ray 调试痕迹(可选,通常 Ray 默认写 `/tmp/ray`) +- `${SHARED_ROOT}/common/`:所有用户可读写共享区(模型/数据/代码快照) + - `${SHARED_ROOT}/common/models/`:可复用基础模型(可用软链指向 hf cache 或 snapshot) + - `${SHARED_ROOT}/common/datasets/`:共享数据(或与 `datasets/` 统一规划) + - `${SHARED_ROOT}/common/code/`:代码快照(多版本 `verl` / 自定义 reward) +- `${SHARED_ROOT}/user/`:用户自定义内容(默认所有用户可写) + - `${SHARED_ROOT}/user/code/`:reward_fn 等自定义 Python 代码 + +--- + +## 3. 
开发实施方案(代码改造清单) + +> v1.1 建议新增 `src/mvp/v1.1/`(保持 v1 可回归不被破坏)。 + +### 3.1 JobSpec(最小标准化) + +v1.1 的工程化目标是把“提交机制”迁移到 Ray Python SDK,因此输入拆为两份 YAML: + +1) Ray 基础配置(YAML):address / entrypoint resources / runtime_env 等 +2) 训练 JobSpec(YAML):workload 语义与训练参数(仍由 Hydra overrides 组织) + +训练 JobSpec(YAML)至少包含: + +- `submission_id`:可空;为空时由 submitter 生成(但最终必须显式传给 `ray job submit --submission-id`) +- `workload`:`ppo` / `grpo` / `sft`(v1.1 必须 `ppo` + `grpo` + `sft`) +- `shared_root`:默认 `/private`(容器内路径) +- `code_path`:`verl` 代码快照目录(用于多版本共存) +- `reward_fn_path`(可选):指向 `${shared_root}/user/code/...` 下的 Python 文件或模块入口 +- `model` / `dataset`:必须指向共享存储的持久化路径(避免每次下载/生成) +- `ray`:`address=http://127.0.0.1:8265`(从 head 容器内部视角) +- `resources`: + - `entrypoint_resources={"worker_node":1}` + - `entrypoint_num_cpus=1` +- `trainer_overrides`:训练参数覆盖(v1.1 默认 `total_epochs=1`、`save_freq=10`) +- `env_vars`:会被透传到 `runtime_env.env_vars`(必须包含 `PYTHONPATH` 注入) + +交付物(v1.1 SDK 方式): + +- `src/mvp/v1.1/py/configs/dev.yaml`(Ray 基础配置示例) +- `src/mvp/v1.1/py/jobspecs/{ppo,grpo,sft}.yaml`(训练 JobSpec 示例) +- `src/mvp/v1.1/py/run.py`(入口:使用 Ray Python SDK 提交/查询/停止/拉日志) +- 设计文档:`specs/mvp/v1.1/sdk_submit_refactor.md` + +### 3.2 多版本 `verl` 共存(必须) + +原则:**镜像固定不变**;job 粒度通过 `PYTHONPATH` 选择 `verl` 代码快照。 + +提交时必须注入(runtime_env): + +- `PYTHONPATH=":$PYTHONPATH"`(`CODE_PATH` 放最前面) + +并要求 job 在日志中打印一行确认 import 来源,例如: + +- `python -c "import verl,inspect; print(verl.__file__)"`(或训练入口启动时打印) + +v1.1 具体实现(可复现): + +- 先用 `src/mvp/v1.1/scripts/31_snapshot_verl_code.sh` 生成代码快照目录 `${SHARED_ROOT}/common/code/verl//` + - 该目录里会包含一个 `mvp_marker.py`(`MARKER=`) +- 提交 job 时让 `code_path` 指向该快照目录;submitter 会在 entrypoint 前打印: + - `MVP_PRECHECK_VERL_FILE`(验证 import 来源) + - `MVP_PRECHECK_MARKER`(验证选择的 code_path) + +### 3.3 `submit_job` 工具(组装 ray job submit) + +新增一个提交器(建议 Python,避免复杂 bash quoting): + +- 输入:JobSpec JSON +- 产物: + - 生成/确定 `submission_id` + - 创建 `${SHARED_ROOT}/jobs//config/`、`logs/`、`checkpoints/` + - 写入 `config/job_spec.json`(原样快照) + - 写入 `config/runtime_env.json`(最终用于 submit 的 JSON) + - 写入 `config/submit_cmd.txt`(最终命令行) +- 执行:在 **head 容器内**运行 `ray job submit ...` + +### 3.4 可排障:debug bundle(强制落盘) + +在 job 生命周期的关键节点收集并落盘(至少 2 类): + +- `ray status` +- `ray job list` +- `ray list nodes` +- `ray list actors` + +建议落盘到: + +- `${SHARED_ROOT}/jobs//debug/`(每次收集带时间戳文件名) + +### 3.5 Workload 扩展:GRPO(v1.1 新增闭环) + +优先用与 PPO 相同入口 `python -m verl.trainer.main_ppo`,仅通过配置切换算法: + +- `algorithm.adv_estimator=grpo` +- 其余保持最小可跑:`total_epochs=1`、`save_freq=10` + +### 3.6 Workload 扩展:SFT on Ray(v1.1 必须新增闭环) + +#### 3.6.1 入口与参考实现 + +- 入口:`python -m verl.trainer.sft_trainer_ray` +- 参考代码:`verl/verl/trainer/sft_trainer.py`(非 Ray 版本)与 `verl/verl/trainer/sft_trainer_ray.py`(Ray 版本) + +> v1.1 要验收的是 “SFT on Ray”,因此默认使用 `sft_trainer_ray.py`。 + +#### 3.6.2 连接已有 Ray 集群(必须) + +`sft_trainer_ray.py` 内部直接调用 `ray.init()`,为了确保它连接到**已有集群**(head+workers),v1.1 约定: + +- 提交 job 时通过 `runtime_env.env_vars` 注入:`RAY_ADDRESS=auto` + +如果发现 `ray.init()` 未按预期读取 `RAY_ADDRESS`(Ray 版本差异风险),v1.1 需要提供一个 launcher 兜底: + +- 由 launcher 先显式 `ray.init(address="auto")`,再调用 SFT trainer 逻辑 + +#### 3.6.3 SFT 数据格式(parquet schema) + +`sft_trainer_ray` 默认使用 `MultiTurnSFTDataset`,parquet 中至少需要: + +- `messages` 列:list[dict],dict 至少含 `role`/`content` + +v1.1 的 `prepare` 阶段需要生成并持久化 SFT 数据,例如: + +- `${SHARED_ROOT}/datasets/gsm8k_sft/train.parquet` +- `${SHARED_ROOT}/datasets/gsm8k_sft/val.parquet`(可选) + +单条样本的 `messages` 形态示例: + +- `[{ "role": "user", "content": "" }, { "role": "assistant", "content": "" }]` + +> 注意:SFT parquet 
不能直接复用 PPO/RL 的 parquet(schema 不同)。 + +#### 3.6.4 重要细节:SFT Ray Driver 不应依赖 GPU + +在 `ray job submit` 模式下,我们的 entrypoint(driver)默认 **不会分配 GPU**(我们只指定了 `--entrypoint-num-cpus=1`,没有指定 `--entrypoint-num-gpus`)。 + +而 `verl/verl/trainer/sft_trainer_ray.py` 的 driver 逻辑里会用 `trainer.device` 来创建 `torch.tensor(..., device=...)` 做统计,如果设置为 `cuda` 且 driver 没有 GPU,会触发: + +- `RuntimeError: No CUDA GPUs are available` + +因此 v1.1 的 SFT on Ray 验收默认要求: + +- `trainer.device=cpu`(driver 只做 orchestration;真正训练仍由 Ray 的 TrainingWorker/资源池占用 GPU) + +### 3.7 v1.1 脚本化交付(必须独立完整) + +`src/mvp/v1.1/` 需要像 v1 一样提供一套完整脚本,确保 v1.1 可独立运行、可回归: + +- `src/mvp/v1.1/docker-compose.yaml`(容器名建议与 v1 区分,避免冲突) +- `src/mvp/v1.1/scripts/00_prereq_check.sh`(含 GPU/目录/NFS/verl 代码检查) +- `src/mvp/v1.1/scripts/01_up.sh` / `02_down.sh`(起停) +- `src/mvp/v1.1/scripts/20_start_head.sh` / `21_start_workers.sh` +- `src/mvp/v1.1/scripts/30_prepare_data_and_model.sh`(包含 PPO 数据 + SFT 数据) +- `src/mvp/v1.1/scripts/40_submit_ppo_epoch1.sh` +- `src/mvp/v1.1/scripts/41_submit_grpo_epoch1.sh` +- `src/mvp/v1.1/scripts/42_submit_sft_minimal.sh` +- `src/mvp/v1.1/scripts/50_status.sh` +- `src/mvp/v1.1/scripts/31_snapshot_verl_code.sh`(多版本 code snapshot) +- `src/mvp/v1.1/scripts/43_submit_jobspec.sh`(通过 JobSpec 提交) +- `src/mvp/v1.1/scripts/12_install_py_deps.sh`(安装 PyYAML 等依赖) +- `src/mvp/v1.1/scripts/44_submit_sdk.sh`(通过 Ray Python SDK + YAML 提交) + +--- + +## 4. 部署与测试流程(dev 环境) + +> dev 环境以远程机目录为例:`argus@h1:/home2/argus/infra/mvp`。v1.1 的所有内容要求放在: +> +> - `argus@h1:/home2/argus/infra/mvp/v1.1/` +> +> 并在该目录中通过脚本使用 `docker exec` 协调容器。 + +### 4.0 清理 v1 环境(必须先做) + +v1 已在 `argus@h1` 部署过容器与 Ray。为保证 v1.1 的可重复测试,开始 v1.1 前必须清理 v1: + +1) 停止并删除 v1 容器(推荐用 v1 的 down 脚本) +2) 确认 `docker ps` 中不再有 v1 的 `mvp-ray-head/mvp-ray-worker-*` + +v1.1 的脚本里也提供了一个 best-effort 清理脚本:`src/mvp/v1.1/scripts/03_cleanup_v1_legacy.sh`(远程目录中同名脚本)。 + +### 4.1 环境准备(一次性 / 幂等) + +1) 目录检查(远程机): + - `${WORKDIR}/shared/` 存在并具备上述子目录(含 `common/`、`user/`) +2) `verl` 代码目录检查: + - `${WORKDIR}/verl` 不存在则执行 `git clone https://github.com/volcengine/verl.git` +3) GPU 可用性检查: + - 设备存在(例如 0-7 可见),并按 worker 容器分配(每个 worker 4 GPU) +4) 模型与数据持久化路径: + - 模型与数据必须落在 `${SHARED_ROOT}` 下;若已存在则跳过下载/生成 + - SFT parquet 同样必须落在 `${SHARED_ROOT}` 下;若已存在则跳过生成 + +### 4.2 启动 Ray 集群(每次测试) + +1) `docker compose up -d` +2) head:`ray start --head --num-cpus=0 --num-gpus=0 ...` +3) workers:`ray start --address=:6379 --resources='{"worker_node":100}' ...` +4) 验证:`ray status` 显示 1 head + 2 worker,且 head `CPU:0 GPU:0` + +### 4.3 提交 PPO 回归(必须跑 2 次) + +1) 生成 JobSpec(可用模板 + 覆盖项) +2) 在 head 容器内执行 submitter(或直接 `ray job submit`) +3) 验证要点: + - `ray job list`:driver node 是 worker + - `${SHARED_ROOT}/jobs//` 下存在 `config/`、`logs/`、`checkpoints/` + - checkpoint 每 10 step 产生(例如 `global_step_10`) + +### 4.4 提交 GRPO(新增 workload 验收) + +同 PPO,但覆盖 `algorithm.adv_estimator=grpo`,确保能进入 RUNNING 并完成最小步数。 + +### 4.5 提交 SFT on Ray(新增 workload 验收,必须) + +1) 确认 `${SHARED_ROOT}/datasets/gsm8k_sft/train.parquet` 已存在(由 v1.1 prepare 生成)。 +2) 通过 head 容器执行 `ray job submit` 提交 `python -m verl.trainer.sft_trainer_ray`。 +3) 关键约束: + - `runtime_env.env_vars.RAY_ADDRESS=auto`(连接已有集群) + - `--entrypoint-resources='{"worker_node": 1}'`(driver 落 worker) + - `PYTHONPATH=:$PYTHONPATH`(多版本 verl) +4) 最小化训练配置建议(避免 OOM/耗时过长): + - `trainer.total_epochs=1` + - `trainer.total_training_steps=10~30` + - `trainer.save_freq=10` + - `trainer.nnodes=2`、`trainer.n_gpus_per_node=4`(用满 8 卡做一次最小分布式验证) + - `data.train_files=${SHARED_ROOT}/datasets/gsm8k_sft/train.parquet` + - 
`trainer.default_local_dir=${SHARED_ROOT}/jobs//checkpoints` + +### 4.6 工程化验证:JobSpec + 多版本共存(v1.1 必须) + +1) 生成两个 code snapshot(不同 `CODE_ID`): + - `CODE_ID=codeA ./scripts/31_snapshot_verl_code.sh` + - `CODE_ID=codeB ./scripts/31_snapshot_verl_code.sh` +2) 分别修改/复制 JobSpec 模板,使 `code_path` 指向不同 snapshot: + - `${SHARED_ROOT}/common/code/verl/codeA` + - `${SHARED_ROOT}/common/code/verl/codeB` +3) 用 JobSpec 提交(必须从 head): + - `./scripts/43_submit_jobspec.sh /workspace/mvp/v1.1/templates/ppo.json`(示例) +4) 在 Ray job logs 中验证: + - `MVP_PRECHECK_MARKER` 打印为对应的 `codeA`/`codeB` + - `MVP_PRECHECK_VERL_FILE` 指向 `${SHARED_ROOT}/common/code/verl/...` 而不是镜像内 site-packages + +--- + +## 5. 验收标准(Definition of Done) + +### 5.1 Hardening DoD(全部必选) + +- [ ] 提交必须来自 head:能在 head 容器内看到 `ray job submit ...` 的提交记录 +- [ ] driver 不在 head:`ray job list` 的 `driver_info.node_ip_address` ∈ worker IP,且 ≠ head IP +- [ ] 输出目录按 submission id 隔离:`${SHARED_ROOT}/jobs//` 不复用、不覆盖 +- [ ] 数据/模型持久化:再次提交时不重复下载/生成(有 “skip if exists” 的日志) +- [ ] checkpoint 策略有效:默认 `save_freq=10`,不会每 step 保存爆盘 +- [ ] debug bundle 落盘:`${SHARED_ROOT}/jobs//debug/` 至少包含 2 类 Ray 状态快照 +- [ ] 多版本共存验证通过:日志中能确认 `verl` import 来源来自 JobSpec 指定的 `code_path` + +### 5.2 Workload DoD(GRPO + SFT 都必须) + +- [ ] GRPO job 能提交、RUNNING、完成最小训练步数 +- [ ] GRPO job 产物目录满足与 PPO 相同的目录规范与 debug 规范 +- [ ] SFT job 能提交、连接已有集群并跑到至少 1 个 step(建议最小步数/epoch) +- [ ] SFT job 产物目录满足与 PPO 相同的目录规范与 debug 规范 + +--- + +## 6. 生产环境部署注意事项(v1.1 需要考虑但不强制在 dev 全量模拟) + +- 容器由算力平台创建:我们只负责 SSH 进去纳管(启动 ray / 提交 job / 收集产物)。 +- 容器内共享路径为 `/private`:所有脚本必须以 `SHARED_ROOT=/private` 工作,不得写死 `/mnt/shared`。 +- 认证仅内部 token:在 submitter 中把 token 作为 env var 透传(不写入日志明文)。 diff --git a/src/mvp/v1.1/README.md b/src/mvp/v1.1/README.md new file mode 100644 index 0000000..ffbbe67 --- /dev/null +++ b/src/mvp/v1.1/README.md @@ -0,0 +1,61 @@ +# MVP v1.1(GRPO + SFT on Ray)运行说明 + +本目录是一套**独立可运行**的 v1.1 交付:使用 1 个 Ray head(不跑训练)+ 2 个 Ray worker(各 4 GPU)在同一宿主机通过 `docker exec` 协调容器,并通过 **head 上的 `ray job submit`** 提交作业,同时强制 driver 落到 worker。 + +> 远程 dev 环境推荐目录布局: +> +> - `/home2/argus/infra/mvp/` +> - `shared/`(持久化:datasets/hf/jobs/...) 
+> - `verl/`(代码仓库,用于 prepare / snapshot) +> - `v1.1/`(本目录内容:compose + scripts) + +--- + +## 快速开始(远程机 argus@h1) + +在 `/home2/argus/infra/mvp/v1.1/` 下执行: + +```bash +./scripts/00_prereq_check.sh +./scripts/01_up.sh +./scripts/20_start_head.sh +./scripts/21_start_workers.sh +./scripts/30_prepare_data_and_model.sh +./scripts/12_install_py_deps.sh +./scripts/44_submit_sdk.sh /workspace/mvp/v1.1/py/configs/dev.yaml /workspace/mvp/v1.1/py/jobspecs/ppo.yaml +./scripts/44_submit_sdk.sh /workspace/mvp/v1.1/py/configs/dev.yaml /workspace/mvp/v1.1/py/jobspecs/grpo.yaml +./scripts/44_submit_sdk.sh /workspace/mvp/v1.1/py/configs/dev.yaml /workspace/mvp/v1.1/py/jobspecs/sft.yaml +./scripts/40_submit_ppo_epoch1.sh +./scripts/41_submit_grpo_epoch1.sh +./scripts/42_submit_sft_minimal.sh +./scripts/50_status.sh +``` + +说明: + +- `scripts/40/41/42` 是历史的 “CLI 提交脚本”(仍可用),但 v1.1 的工程化目标是把提交机制迁移到 `scripts/44_submit_sdk.sh`(Ray Python SDK + YAML 配置)。 + +停止并清理: + +```bash +./scripts/02_down.sh +``` + +--- + +## 关键约束(必须满足) + +- **必须通过 head 执行 `ray job submit`** 提交任务(满足“从 head 提交”要求)。 +- **head 不跑训练**:head 以 `--num-cpus=0 --num-gpus=0` 启动;worker 具备自定义资源 `worker_node`;提交时 `--entrypoint-resources='{"worker_node": 1}'` 强制 driver 落 worker。 +- **共享路径统一为 `/private`(容器内)**:compose 将宿主机 `../shared` 挂载到容器内 `/private`,对齐生产环境。 +- **多版本 verl**:通过 Ray Job `runtime_env.env_vars.PYTHONPATH` 注入 `${SHARED_ROOT}/common/code/verl/...`,job 粒度选择代码快照。 + +--- + +## 共享目录(容器内 /private) + +- `/private/datasets/`:数据(PPO 的 gsm8k RL parquet、SFT parquet) +- `/private/hf/`:HF 缓存(模型持久化,避免重复下载) +- `/private/jobs//`:每个 Ray Job 的输出目录(logs/config/debug/checkpoints) +- `/private/common/`:共享区(模型/数据/代码快照) +- `/private/user/`:用户自定义代码(例如 reward_fn) diff --git a/src/mvp/v1.1/docker-compose.yaml b/src/mvp/v1.1/docker-compose.yaml new file mode 100644 index 0000000..a885956 --- /dev/null +++ b/src/mvp/v1.1/docker-compose.yaml @@ -0,0 +1,89 @@ +version: "3.8" + +services: + ray_head: + image: verlai/verl:sgl055.latest + container_name: mvp11-ray-head + command: sleep infinity + ports: + - "8265:8265" + volumes: + - ../verl:/workspace/verl + - ../shared:/private + - .:/workspace/mvp/v1.1 + shm_size: "10g" + ulimits: + nofile: + soft: 65536 + hard: 65536 + cap_add: + - SYS_ADMIN + - SYS_PTRACE + networks: + - mvp11-ray-net + environment: + HF_HOME: "/private/hf" + HUGGINGFACE_HUB_CACHE: "/private/hf/hub" + TRANSFORMERS_CACHE: "/private/hf/transformers" + HF_ENDPOINT: "https://hf-mirror.com" + PYTHONUNBUFFERED: "1" + + ray_worker_0: + image: verlai/verl:sgl055.latest + container_name: mvp11-ray-worker-0 + command: sleep infinity + volumes: + - ../verl:/workspace/verl + - ../shared:/private + - .:/workspace/mvp/v1.1 + shm_size: "10g" + ulimits: + nofile: + soft: 65536 + hard: 65536 + cap_add: + - SYS_ADMIN + - SYS_PTRACE + networks: + - mvp11-ray-net + runtime: nvidia + environment: + NVIDIA_VISIBLE_DEVICES: "0,1,2,3" + NVIDIA_DRIVER_CAPABILITIES: "all" + HF_HOME: "/private/hf" + HUGGINGFACE_HUB_CACHE: "/private/hf/hub" + TRANSFORMERS_CACHE: "/private/hf/transformers" + HF_ENDPOINT: "https://hf-mirror.com" + PYTHONUNBUFFERED: "1" + + ray_worker_1: + image: verlai/verl:sgl055.latest + container_name: mvp11-ray-worker-1 + command: sleep infinity + volumes: + - ../verl:/workspace/verl + - ../shared:/private + - .:/workspace/mvp/v1.1 + shm_size: "10g" + ulimits: + nofile: + soft: 65536 + hard: 65536 + cap_add: + - SYS_ADMIN + - SYS_PTRACE + networks: + - mvp11-ray-net + runtime: nvidia + environment: + NVIDIA_VISIBLE_DEVICES: "4,5,6,7" + NVIDIA_DRIVER_CAPABILITIES: "all" 
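+      # HF caches point at the shared /private mount so model/tokenizer downloads are reused across containers and jobs.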
+ HF_HOME: "/private/hf" + HUGGINGFACE_HUB_CACHE: "/private/hf/hub" + TRANSFORMERS_CACHE: "/private/hf/transformers" + HF_ENDPOINT: "https://hf-mirror.com" + PYTHONUNBUFFERED: "1" + +networks: + mvp11-ray-net: + driver: bridge diff --git a/src/mvp/v1.1/job_spec.schema.json b/src/mvp/v1.1/job_spec.schema.json new file mode 100644 index 0000000..cb5524f --- /dev/null +++ b/src/mvp/v1.1/job_spec.schema.json @@ -0,0 +1,33 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "mvp-v1.1-job-spec", + "type": "object", + "required": ["workload", "shared_root", "code_path", "model_id", "ray", "runtime_env"], + "properties": { + "submission_id": { "type": "string" }, + "workload": { "type": "string", "enum": ["ppo", "grpo", "sft"] }, + "shared_root": { "type": "string" }, + "code_path": { "type": "string" }, + "model_id": { "type": "string" }, + "ppo": { "type": "object" }, + "grpo": { "type": "object" }, + "sft": { "type": "object" }, + "ray": { + "type": "object", + "required": ["address", "entrypoint_num_cpus", "entrypoint_resources"], + "properties": { + "address": { "type": "string" }, + "entrypoint_num_cpus": { "type": "number" }, + "entrypoint_resources": { "type": "object" } + } + }, + "runtime_env": { + "type": "object", + "required": ["env_vars"], + "properties": { + "env_vars": { "type": "object" } + } + } + } +} + diff --git a/src/mvp/v1.1/py/configs/dev.yaml b/src/mvp/v1.1/py/configs/dev.yaml new file mode 100644 index 0000000..2b7c830 --- /dev/null +++ b/src/mvp/v1.1/py/configs/dev.yaml @@ -0,0 +1,20 @@ +# Ray 基础配置(dev 环境 / head 容器内视角) +address: "http://127.0.0.1:8265" + +# 容器内共享根路径(对齐生产 /private) +shared_root: "/private" + +# 强制 driver 落 worker(head 不跑训练) +entrypoint_num_cpus: 1 +entrypoint_resources: + worker_node: 1 + +# 运行时环境变量(所有 job 通用) +runtime_env: + env_vars: + HF_ENDPOINT: "https://hf-mirror.com" + PYTHONUNBUFFERED: "1" + +# 用户自定义代码目录(可被 PYTHONPATH 注入) +user_code_path: "/private/user/code" + diff --git a/src/mvp/v1.1/py/jobspecs/grpo.yaml b/src/mvp/v1.1/py/jobspecs/grpo.yaml new file mode 100644 index 0000000..83eba66 --- /dev/null +++ b/src/mvp/v1.1/py/jobspecs/grpo.yaml @@ -0,0 +1,20 @@ +workload: "grpo" + +submission_id: "" + +code_path: "/private/common/code/verl/verl_repo" + +model_id: "Qwen/Qwen2.5-0.5B-Instruct" + +train_file: "/private/datasets/gsm8k/train.parquet" +val_file: "/private/datasets/gsm8k/test.parquet" + +nnodes: 2 +n_gpus_per_node: 4 + +total_epochs: 1 +total_training_steps: 10 + +save_freq: 10 +test_freq: -1 + diff --git a/src/mvp/v1.1/py/jobspecs/ppo.yaml b/src/mvp/v1.1/py/jobspecs/ppo.yaml new file mode 100644 index 0000000..05bc7f9 --- /dev/null +++ b/src/mvp/v1.1/py/jobspecs/ppo.yaml @@ -0,0 +1,22 @@ +workload: "ppo" + +# 可选:不填则 submitter 自动生成 +submission_id: "" + +# 多版本:指向 code snapshot(由 scripts/31_snapshot_verl_code.sh 生成) +code_path: "/private/common/code/verl/verl_repo" + +model_id: "Qwen/Qwen2.5-0.5B-Instruct" + +train_file: "/private/datasets/gsm8k/train.parquet" +val_file: "/private/datasets/gsm8k/test.parquet" + +nnodes: 2 +n_gpus_per_node: 4 + +total_epochs: 1 +total_training_steps: 10 + +save_freq: 10 +test_freq: -1 + diff --git a/src/mvp/v1.1/py/jobspecs/sft.yaml b/src/mvp/v1.1/py/jobspecs/sft.yaml new file mode 100644 index 0000000..67637b6 --- /dev/null +++ b/src/mvp/v1.1/py/jobspecs/sft.yaml @@ -0,0 +1,22 @@ +workload: "sft" + +submission_id: "" + +code_path: "/private/common/code/verl/verl_repo" + +model_id: "Qwen/Qwen2.5-0.5B-Instruct" + +train_file: "/private/datasets/gsm8k_sft/train.parquet" +val_file: null 
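+# train_file must be the messages-schema SFT parquet produced by the prepare step;
+# the PPO/GRPO RL parquet uses a different schema and cannot be reused here.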
+ +nnodes: 2 +n_gpus_per_node: 4 + +total_epochs: 1 +total_training_steps: 10 + +save_freq: 10 + +# SFT driver 默认不分配 GPU(ray job entrypoint 不指定 entrypoint_num_gpus),因此 driver 侧不要依赖 CUDA +trainer_device: "cpu" + diff --git a/src/mvp/v1.1/py/mvp_v11/__init__.py b/src/mvp/v1.1/py/mvp_v11/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/mvp/v1.1/py/mvp_v11/__init__.py @@ -0,0 +1 @@ + diff --git a/src/mvp/v1.1/py/mvp_v11/builders.py b/src/mvp/v1.1/py/mvp_v11/builders.py new file mode 100644 index 0000000..d4b9786 --- /dev/null +++ b/src/mvp/v1.1/py/mvp_v11/builders.py @@ -0,0 +1,96 @@ +from __future__ import annotations + +from dataclasses import dataclass + +from .models import JobSpec + + +@dataclass(frozen=True) +class BuiltCommand: + argv: list[str] + + +def build_training_argv(spec: JobSpec, submission_id: str, job_dir: str) -> BuiltCommand: + """ + Returns argv for the actual training process (Hydra overrides preserved). + This argv is executed by a lightweight Python driver entrypoint. + """ + if spec.workload in ("ppo", "grpo"): + algo_overrides: list[str] = [] + if spec.workload == "grpo": + algo_overrides.append("algorithm.adv_estimator=grpo") + + test_freq = spec.test_freq if spec.test_freq is not None else -1 + val_file = spec.val_file if spec.val_file is not None else "null" + + argv = [ + "python3", + "-m", + "verl.trainer.main_ppo", + f"data.train_files={spec.train_file}", + f"data.val_files={val_file}", + "data.train_batch_size=256", + "data.max_prompt_length=512", + "data.max_response_length=512", + f"actor_rollout_ref.model.path={spec.model_id}", + "actor_rollout_ref.actor.optim.lr=1e-6", + "actor_rollout_ref.actor.ppo_mini_batch_size=64", + "actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4", + "actor_rollout_ref.rollout.name=sglang", + "actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8", + "actor_rollout_ref.rollout.tensor_model_parallel_size=1", + "actor_rollout_ref.rollout.gpu_memory_utilization=0.4", + "actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4", + "critic.optim.lr=1e-5", + f"critic.model.path={spec.model_id}", + "critic.ppo_micro_batch_size_per_gpu=4", + "algorithm.kl_ctrl.kl_coef=0.001", + *algo_overrides, + "trainer.logger=console", + "trainer.val_before_train=False", + f"trainer.n_gpus_per_node={spec.n_gpus_per_node}", + f"trainer.nnodes={spec.nnodes}", + f"trainer.save_freq={spec.save_freq}", + f"trainer.test_freq={test_freq}", + f"trainer.total_epochs={spec.total_epochs}", + f"trainer.total_training_steps={spec.total_training_steps}", + "trainer.resume_mode=disable", + f"trainer.default_local_dir={job_dir}/checkpoints", + "+ray_kwargs.ray_init.address=auto", + f"hydra.run.dir={job_dir}/logs/hydra", + ] + return BuiltCommand(argv=argv) + + if spec.workload == "sft": + val_override = "null" if spec.val_file is None else spec.val_file + trainer_device = spec.trainer_device or "cpu" + + argv = [ + "python3", + "-m", + "verl.trainer.sft_trainer_ray", + f"model.path={spec.model_id}", + f"data.train_files={spec.train_file}", + f"data.val_files={val_override}", + "data.train_batch_size=64", + "data.micro_batch_size_per_gpu=1", + "data.max_token_len_per_gpu=2048", + "data.max_length=1024", + "trainer.logger=console", + "trainer.project_name=mvp11-sft", + f"trainer.experiment_name={submission_id}", + f"trainer.total_epochs={spec.total_epochs}", + f"trainer.total_training_steps={spec.total_training_steps}", + f"trainer.save_freq={spec.save_freq}", + "trainer.test_freq=-1", + "trainer.resume_mode=disable", + 
f"trainer.device={trainer_device}", + f"trainer.default_local_dir={job_dir}/checkpoints", + f"trainer.nnodes={spec.nnodes}", + f"trainer.n_gpus_per_node={spec.n_gpus_per_node}", + f"hydra.run.dir={job_dir}/logs/hydra", + ] + return BuiltCommand(argv=argv) + + raise ValueError(f"unsupported workload: {spec.workload}") + diff --git a/src/mvp/v1.1/py/mvp_v11/driver_entrypoint.py b/src/mvp/v1.1/py/mvp_v11/driver_entrypoint.py new file mode 100644 index 0000000..9f99c58 --- /dev/null +++ b/src/mvp/v1.1/py/mvp_v11/driver_entrypoint.py @@ -0,0 +1,63 @@ +from __future__ import annotations + +import argparse +import os +import shlex +import subprocess +import sys +from pathlib import Path + + +def _preflight() -> None: + print("MVP_PRECHECK_PYTHON:", sys.executable, flush=True) + print("MVP_PRECHECK_PYTHONPATH:", os.environ.get("PYTHONPATH"), flush=True) + print("MVP_PRECHECK_MVP_CODE_PATH:", os.environ.get("MVP_CODE_PATH"), flush=True) + try: + import verl # type: ignore + + print("MVP_PRECHECK_VERL_FILE:", getattr(verl, "__file__", None), flush=True) + except Exception as e: + print("MVP_PRECHECK_VERL_IMPORT_ERROR:", repr(e), flush=True) + + try: + import mvp_marker # type: ignore + + print("MVP_PRECHECK_MARKER:", getattr(mvp_marker, "MARKER", None), flush=True) + except Exception as e: + print("MVP_PRECHECK_MARKER_MISSING:", repr(e), flush=True) + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--job-dir", required=True) + parser.add_argument("cmd", nargs=argparse.REMAINDER) + args = parser.parse_args() + + job_dir = Path(args.job_dir) + job_dir.mkdir(parents=True, exist_ok=True) + + _preflight() + + if not args.cmd: + print("no command provided", file=sys.stderr) + return 2 + + # argparse includes the leading "--" if the caller uses it; strip it. + cmd = list(args.cmd) + if cmd and cmd[0] == "--": + cmd = cmd[1:] + if not cmd: + print("no command provided", file=sys.stderr) + return 2 + + # Execute training command as a subprocess so that logs are captured by Ray job logs. 
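+    # The subprocess inherits this driver's environment (PYTHONPATH, HF_*, RAY_ADDRESS),
+    # which is how the runtime_env injection reaches the training process.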
+ cmd_str = " ".join(shlex.quote(x) for x in cmd) + print("MVP_DRIVER_EXEC:", cmd_str, flush=True) + + proc = subprocess.run(cmd, check=False) + print("MVP_DRIVER_EXIT_CODE:", proc.returncode, flush=True) + return proc.returncode + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/mvp/v1.1/py/mvp_v11/models.py b/src/mvp/v1.1/py/mvp_v11/models.py new file mode 100644 index 0000000..41f8804 --- /dev/null +++ b/src/mvp/v1.1/py/mvp_v11/models.py @@ -0,0 +1,121 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + + +def _require(d: dict[str, Any], key: str) -> Any: + if key not in d or d[key] in (None, ""): + raise ValueError(f"missing required field: {key}") + return d[key] + + +@dataclass(frozen=True) +class RayConfig: + address: str + shared_root: str + entrypoint_num_cpus: float + entrypoint_resources: dict[str, float] + runtime_env_env_vars: dict[str, str] + user_code_path: str + + @staticmethod + def from_dict(d: dict[str, Any]) -> "RayConfig": + runtime_env = d.get("runtime_env") or {} + env_vars = (runtime_env.get("env_vars") or {}) if isinstance(runtime_env, dict) else {} + if not isinstance(env_vars, dict): + raise ValueError("runtime_env.env_vars must be a mapping") + + entrypoint_resources = d.get("entrypoint_resources") or {} + if not isinstance(entrypoint_resources, dict): + raise ValueError("entrypoint_resources must be a mapping") + + return RayConfig( + address=str(_require(d, "address")), + shared_root=str(_require(d, "shared_root")), + entrypoint_num_cpus=float(d.get("entrypoint_num_cpus", 1)), + entrypoint_resources={str(k): float(v) for k, v in entrypoint_resources.items()}, + runtime_env_env_vars={str(k): str(v) for k, v in env_vars.items()}, + user_code_path=str(d.get("user_code_path", f"{_require(d, 'shared_root')}/user/code")), + ) + + def to_public_dict(self) -> dict[str, Any]: + return { + "address": self.address, + "shared_root": self.shared_root, + "entrypoint_num_cpus": self.entrypoint_num_cpus, + "entrypoint_resources": self.entrypoint_resources, + "runtime_env": {"env_vars": self.runtime_env_env_vars}, + "user_code_path": self.user_code_path, + } + + +@dataclass(frozen=True) +class JobSpec: + workload: str # ppo|grpo|sft + submission_id: str | None + code_path: str + model_id: str + + train_file: str + val_file: str | None + + nnodes: int + n_gpus_per_node: int + + total_epochs: int + total_training_steps: int + + save_freq: int + test_freq: int | None + + trainer_device: str | None # only for sft (driver-side device) + + @staticmethod + def from_dict(d: dict[str, Any]) -> "JobSpec": + workload = str(_require(d, "workload")) + if workload not in ("ppo", "grpo", "sft"): + raise ValueError(f"unsupported workload: {workload}") + + val_file = d.get("val_file", None) + if val_file in ("", "null"): + val_file = None + + test_freq = d.get("test_freq", None) + if test_freq in ("", "null"): + test_freq = None + + return JobSpec( + workload=workload, + submission_id=(str(d["submission_id"]) if d.get("submission_id") else None), + code_path=str(_require(d, "code_path")), + model_id=str(_require(d, "model_id")), + train_file=str(_require(d, "train_file")), + val_file=(str(val_file) if val_file is not None else None), + nnodes=int(d.get("nnodes", 2)), + n_gpus_per_node=int(d.get("n_gpus_per_node", 4)), + total_epochs=int(d.get("total_epochs", 1)), + total_training_steps=int(d.get("total_training_steps", 10)), + save_freq=int(d.get("save_freq", 10)), + test_freq=(int(test_freq) if test_freq is not None 
else None), + trainer_device=(str(d.get("trainer_device")) if d.get("trainer_device") else None), + ) + + def to_public_dict(self) -> dict[str, Any]: + out: dict[str, Any] = { + "workload": self.workload, + "submission_id": self.submission_id or "", + "code_path": self.code_path, + "model_id": self.model_id, + "train_file": self.train_file, + "val_file": self.val_file, + "nnodes": self.nnodes, + "n_gpus_per_node": self.n_gpus_per_node, + "total_epochs": self.total_epochs, + "total_training_steps": self.total_training_steps, + "save_freq": self.save_freq, + "test_freq": self.test_freq, + } + if self.workload == "sft": + out["trainer_device"] = self.trainer_device or "cpu" + return out diff --git a/src/mvp/v1.1/py/mvp_v11/ray_job_tool.py b/src/mvp/v1.1/py/mvp_v11/ray_job_tool.py new file mode 100644 index 0000000..01b899a --- /dev/null +++ b/src/mvp/v1.1/py/mvp_v11/ray_job_tool.py @@ -0,0 +1,171 @@ +from __future__ import annotations + +import json +import os +import shlex +from datetime import datetime +from pathlib import Path +from typing import Any + +import ray +from ray.job_submission import JobSubmissionClient + +from .builders import build_training_argv +from .models import JobSpec, RayConfig +from .yaml_io import dump_yaml + + +def _ts() -> str: + return datetime.now().strftime("%Y%m%d_%H%M%S") + + +def _mkdir(p: Path) -> None: + p.mkdir(parents=True, exist_ok=True) + + +def _write_text(p: Path, content: str) -> None: + _mkdir(p.parent) + p.write_text(content, encoding="utf-8") + + +def _write_json(p: Path, obj: Any) -> None: + _write_text(p, json.dumps(obj, indent=2, ensure_ascii=False) + "\n") + + +def _safe_basename(path: str) -> str: + return path.rstrip("/").split("/")[-1] + + +class RayJobTool: + def __init__(self, cfg: RayConfig): + self.cfg = cfg + self.client = JobSubmissionClient(cfg.address) + + def _job_dir(self, submission_id: str) -> str: + return f"{self.cfg.shared_root}/jobs/{submission_id}" + + def _runtime_env(self, spec: JobSpec) -> dict[str, Any]: + env_vars = dict(self.cfg.runtime_env_env_vars) + + # Default HF cache + env_vars.setdefault("HF_HOME", f"{self.cfg.shared_root}/hf") + env_vars.setdefault("HUGGINGFACE_HUB_CACHE", f"{self.cfg.shared_root}/hf/hub") + env_vars.setdefault("TRANSFORMERS_CACHE", f"{self.cfg.shared_root}/hf/transformers") + env_vars.setdefault("PYTHONUNBUFFERED", "1") + + # Tool code path must be importable on workers (compose mounts v1.1 into all containers). + # Place it before verl code to avoid interfering with verl import priority. 
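+        # Resulting import order: tool code (mvp_v11 / sitecustomize), then the per-job verl
+        # snapshot, then user code, then any PYTHONPATH already set in the Ray config, e.g.
+        #   /workspace/mvp/v1.1/py:/private/common/code/verl/<code_id>:/private/user/code
+        # (dev-default paths; <code_id> is illustrative).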
+ tool_code_path = os.environ.get("MVP_TOOL_CODE_PATH", "/workspace/mvp/v1.1/py") + + user_code_path = self.cfg.user_code_path + code_path = spec.code_path + + existing = env_vars.get("PYTHONPATH", "") + prefix = f"{tool_code_path}:{code_path}:{user_code_path}" + env_vars["PYTHONPATH"] = f"{prefix}:{existing}" if existing else prefix + + # For debugging / log visibility + env_vars["MVP_CODE_PATH"] = code_path + + # SFT: ensure ray.init() connects to the cluster + if spec.workload == "sft": + env_vars.setdefault("RAY_ADDRESS", "auto") + + return {"env_vars": env_vars} + + def submit(self, spec: JobSpec, no_wait: bool) -> str: + submission_id = spec.submission_id or f"mvp11_{spec.workload}_{_ts()}_{os.getpid()}" + job_dir = self._job_dir(submission_id) + + built = build_training_argv(spec, submission_id=submission_id, job_dir=job_dir) + entrypoint_argv = [ + "python3", + "-m", + "mvp_v11.driver_entrypoint", + "--job-dir", + job_dir, + *built.argv, + ] + entrypoint = " ".join(shlex.quote(x) for x in entrypoint_argv) + + runtime_env = self._runtime_env(spec) + + # Prepare job artifacts directory + job_root = Path(job_dir) + _mkdir(job_root / "config") + _mkdir(job_root / "logs") + _mkdir(job_root / "debug") + _mkdir(job_root / "checkpoints") + + _write_text(job_root / "config" / "ray_config.yaml", dump_yaml(self.cfg.to_public_dict())) + _write_text(job_root / "config" / "jobspec.yaml", dump_yaml(spec.to_public_dict())) + _write_json(job_root / "config" / "submit_payload.json", { + "submission_id": submission_id, + "address": self.cfg.address, + "entrypoint": entrypoint, + "entrypoint_num_cpus": self.cfg.entrypoint_num_cpus, + "entrypoint_resources": self.cfg.entrypoint_resources, + "runtime_env": runtime_env, + }) + + # Pre-submit debug snapshot (ray cluster resources via ray.init) + try: + ray.init(address="auto", ignore_reinit_error=True, log_to_driver=False) + _write_json(job_root / "debug" / "ray_cluster_resources_pre.json", ray.cluster_resources()) + _write_json(job_root / "debug" / "ray_available_resources_pre.json", ray.available_resources()) + except Exception as e: + _write_text(job_root / "debug" / "ray_resources_pre.error.txt", repr(e) + "\n") + + try: + submitted = self.client.submit_job( + entrypoint=entrypoint, + submission_id=submission_id, + runtime_env=runtime_env, + entrypoint_num_cpus=self.cfg.entrypoint_num_cpus, + entrypoint_resources=self.cfg.entrypoint_resources, + ) + except Exception as e: + _write_text(job_root / "logs" / "submit.error.txt", repr(e) + "\n") + raise + + _write_text(job_root / "config" / "ray_submission_id.txt", submitted + "\n") + + # Post-submit debug snapshot via SDK + try: + jobs = self.client.list_jobs() + _write_text( + job_root / "debug" / "ray_job_list_post.json", + json.dumps([_job_details_to_dict(j) for j in jobs], indent=2) + "\n", + ) + except Exception as e: + _write_text(job_root / "debug" / "ray_job_list_post.error.txt", repr(e) + "\n") + + if not no_wait: + # caller can separately wait; keep submit non-blocking by default in scripts + pass + + return submitted + + def status(self, submission_id: str) -> str: + return str(self.client.get_job_status(submission_id)) + + def stop(self, submission_id: str) -> bool: + return bool(self.client.stop_job(submission_id)) + + def logs(self, submission_id: str) -> str: + return self.client.get_job_logs(submission_id) + + def list(self) -> list[dict[str, Any]]: + return [_job_details_to_dict(j) for j in self.client.list_jobs()] + + +def _job_details_to_dict(obj: Any) -> dict[str, Any]: + # Ray uses 
pydantic models internally, but depending on bundled pydantic version + # we might get `.model_dump()` (v2) or `.dict()` (v1). + if hasattr(obj, "model_dump"): + return obj.model_dump() # type: ignore[no-any-return] + if hasattr(obj, "dict"): + return obj.dict() # type: ignore[no-any-return] + if hasattr(obj, "__dict__"): + return dict(obj.__dict__) + return {"repr": repr(obj)} diff --git a/src/mvp/v1.1/py/mvp_v11/yaml_io.py b/src/mvp/v1.1/py/mvp_v11/yaml_io.py new file mode 100644 index 0000000..c321688 --- /dev/null +++ b/src/mvp/v1.1/py/mvp_v11/yaml_io.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import yaml + + +def load_yaml(path: str) -> dict[str, Any]: + p = Path(path) + data = yaml.safe_load(p.read_text(encoding="utf-8")) + if data is None: + return {} + if not isinstance(data, dict): + raise ValueError(f"yaml root must be a mapping: {path}") + return data + + +def dump_yaml(data: dict[str, Any]) -> str: + return yaml.safe_dump(data, sort_keys=False, allow_unicode=True) + diff --git a/src/mvp/v1.1/py/requirements.txt b/src/mvp/v1.1/py/requirements.txt new file mode 100644 index 0000000..e3af026 --- /dev/null +++ b/src/mvp/v1.1/py/requirements.txt @@ -0,0 +1,2 @@ +PyYAML>=6.0.1 + diff --git a/src/mvp/v1.1/py/run.py b/src/mvp/v1.1/py/run.py new file mode 100644 index 0000000..67fe975 --- /dev/null +++ b/src/mvp/v1.1/py/run.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import json +import os +import sys + + +def _ensure_import_path() -> None: + # Allow `python3 /workspace/.../py/run.py` to import `mvp_v11.*` + here = os.path.dirname(os.path.abspath(__file__)) + if here not in sys.path: + sys.path.insert(0, here) + + +def main() -> int: + _ensure_import_path() + + from mvp_v11.models import JobSpec, RayConfig + from mvp_v11.ray_job_tool import RayJobTool + from mvp_v11.yaml_io import load_yaml + + parser = argparse.ArgumentParser() + parser.add_argument("--config", required=True, help="Ray base config yaml") + parser.add_argument("--jobspec", help="Training jobspec yaml (required for submit)") + parser.add_argument("--action", required=True, choices=["submit", "status", "stop", "logs", "list"]) + parser.add_argument("--submission-id", help="For status/stop/logs") + parser.add_argument("--no-wait", action="store_true", help="Submit and return immediately") + parser.add_argument("--tail", type=int, default=0, help="Tail N lines for logs") + args = parser.parse_args() + + cfg = RayConfig.from_dict(load_yaml(args.config)) + tool = RayJobTool(cfg) + + if args.action == "submit": + if not args.jobspec: + raise SystemExit("--jobspec is required for submit") + spec = JobSpec.from_dict(load_yaml(args.jobspec)) + submitted = tool.submit(spec, no_wait=args.no_wait) + print(submitted) + return 0 + + if args.action in ("status", "stop", "logs"): + sid = args.submission_id or "" + if not sid: + raise SystemExit("--submission-id is required for status/stop/logs") + if args.action == "status": + print(tool.status(sid)) + return 0 + if args.action == "stop": + print(tool.stop(sid)) + return 0 + logs = tool.logs(sid) + if args.tail and args.tail > 0: + lines = logs.splitlines() + logs = "\n".join(lines[-args.tail :]) + ("\n" if lines else "") + print(logs, end="") + return 0 + + if args.action == "list": + print(json.dumps(tool.list(), indent=2)) + return 0 + + raise SystemExit(f"unknown action: {args.action}") + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git 
a/src/mvp/v1.1/py/sitecustomize.py b/src/mvp/v1.1/py/sitecustomize.py new file mode 100644 index 0000000..478b284 --- /dev/null +++ b/src/mvp/v1.1/py/sitecustomize.py @@ -0,0 +1,57 @@ +""" +Job-scoped compatibility shims loaded automatically by Python at startup. + +This is intentionally lightweight and safe-by-default: +- Only patches missing symbols. +- Never raises (best-effort). + +Primary use case in MVP v1.1: +- Allow multiple `verl` versions (e.g. v0.6.0 vs v0.6.1) to run on the same + base image where `sglang` APIs may differ slightly. +""" + +from __future__ import annotations + + +def _patch_sglang_get_ip() -> None: + try: + import sglang.srt.utils as srt_utils # type: ignore + except Exception: + return + + if hasattr(srt_utils, "get_ip"): + return + + def get_ip() -> str: + # Best-effort local IP without external dependency. + try: + import socket + + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + # Doesn't send packets; used to pick the default route/interface. + s.connect(("8.8.8.8", 80)) + return str(s.getsockname()[0]) + finally: + s.close() + except Exception: + # Fallback: hostname resolution + try: + import socket + + return str(socket.gethostbyname(socket.gethostname())) + except Exception: + return "127.0.0.1" + + try: + setattr(srt_utils, "get_ip", get_ip) + except Exception: + return + + +try: + _patch_sglang_get_ip() +except Exception: + # Never block interpreter startup. + pass + diff --git a/src/mvp/v1.1/scripts/00_prereq_check.sh b/src/mvp/v1.1/scripts/00_prereq_check.sh new file mode 100644 index 0000000..e6479f8 --- /dev/null +++ b/src/mvp/v1.1/scripts/00_prereq_check.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +echo "[host] prereq check" + +require_cmd docker +require_cmd bash + +if ! docker info >/dev/null 2>&1; then + echo "docker is not available (docker info failed)" >&2 + exit 1 +fi + +if ! docker compose version >/dev/null 2>&1; then + echo "docker compose is not available" >&2 + exit 1 +fi + +if ! command -v nvidia-smi >/dev/null 2>&1; then + echo "WARN: nvidia-smi not found on host; GPU validation skipped" +else + echo "[host] GPU summary" + nvidia-smi -L || true +fi + +echo "[host] ensure shared dirs exist under ../shared" +mkdir -p "${ROOT_DIR}/../shared"/{datasets,hf,jobs,outputs,ray,common,user} +mkdir -p "${ROOT_DIR}/../shared/common"/{code,datasets,models} +mkdir -p "${ROOT_DIR}/../shared/user"/{code} + +echo "[host] ensure verl repo exists under ../verl (required by prepare scripts)" +if [[ ! -d "${ROOT_DIR}/../verl" ]]; then + echo "missing ../verl. On remote, ensure /home2/argus/infra/mvp/verl exists (git clone volcengine/verl)." 
>&2 + exit 1 +fi + +echo "ok" + diff --git a/src/mvp/v1.1/scripts/01_up.sh b/src/mvp/v1.1/scripts/01_up.sh new file mode 100644 index 0000000..5dffc6c --- /dev/null +++ b/src/mvp/v1.1/scripts/01_up.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +if [[ "${SKIP_CLEANUP_V1:-0}" != "1" ]]; then + "${SCRIPT_DIR}/03_cleanup_v1_legacy.sh" || true +fi + +echo "[host] docker compose up -d (v1.1)" +dc up -d + +echo "[host] containers:" +docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Ports}}' | (head -n 1 && grep -E "mvp11-ray-") || true diff --git a/src/mvp/v1.1/scripts/02_down.sh b/src/mvp/v1.1/scripts/02_down.sh new file mode 100644 index 0000000..a0d6eb0 --- /dev/null +++ b/src/mvp/v1.1/scripts/02_down.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +echo "[host] docker compose down (v1.1)" +dc down -v || true + +echo "[host] done" + diff --git a/src/mvp/v1.1/scripts/03_cleanup_v1_legacy.sh b/src/mvp/v1.1/scripts/03_cleanup_v1_legacy.sh new file mode 100644 index 0000000..0f28585 --- /dev/null +++ b/src/mvp/v1.1/scripts/03_cleanup_v1_legacy.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "[host] cleanup v1 legacy containers (best-effort)" + +LEGACY=(mvp-ray-head mvp-ray-worker-0 mvp-ray-worker-1) + +for c in "${LEGACY[@]}"; do + if docker ps -a --format '{{.Names}}' | grep -qx "${c}"; then + echo "[host] removing legacy container: ${c}" + docker rm -f "${c}" >/dev/null 2>&1 || true + fi +done + +echo "[host] legacy cleanup done" + diff --git a/src/mvp/v1.1/scripts/05_ensure_verl_repo.sh b/src/mvp/v1.1/scripts/05_ensure_verl_repo.sh new file mode 100644 index 0000000..3a6da0d --- /dev/null +++ b/src/mvp/v1.1/scripts/05_ensure_verl_repo.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +VERL_DIR="${ROOT_DIR}/../verl" + +echo "[host] ensure verl repo exists at: ${VERL_DIR}" +if [[ -d "${VERL_DIR}/.git" ]]; then + echo "verl_repo_exists: skip" + exit 0 +fi + +if [[ -d "${VERL_DIR}" && ! -d "${VERL_DIR}/.git" ]]; then + echo "ERROR: ${VERL_DIR} exists but is not a git repo; please fix manually." 
>&2 + exit 1 +fi + +echo "cloning volcengine/verl -> ${VERL_DIR}" +git clone https://github.com/volcengine/verl.git "${VERL_DIR}" + diff --git a/src/mvp/v1.1/scripts/12_install_py_deps.sh b/src/mvp/v1.1/scripts/12_install_py_deps.sh new file mode 100644 index 0000000..da1649c --- /dev/null +++ b/src/mvp/v1.1/scripts/12_install_py_deps.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +echo "[head] install python deps for v1.1 SDK submitter (PyYAML)" +dexec "${HEAD_CONTAINER}" bash -lc "pip install -r /workspace/mvp/v1.1/py/requirements.txt" + diff --git a/src/mvp/v1.1/scripts/20_start_head.sh b/src/mvp/v1.1/scripts/20_start_head.sh new file mode 100644 index 0000000..fb3b7b7 --- /dev/null +++ b/src/mvp/v1.1/scripts/20_start_head.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +HEAD_IP="$(container_ip "${HEAD_CONTAINER}")" + +echo "[head] ray stop (best-effort)" +dexec "${HEAD_CONTAINER}" bash -lc "ray stop --force || true" + +echo "[head] start ray head (CPU=0 GPU=0): ${HEAD_IP}" +dexec "${HEAD_CONTAINER}" bash -lc "ray start --head --node-ip-address='${HEAD_IP}' --port=6379 --dashboard-host=0.0.0.0 --dashboard-port=8265 --num-cpus=0 --num-gpus=0 --disable-usage-stats" + +echo "[head] ray status" +dexec "${HEAD_CONTAINER}" bash -lc "ray status || true" + diff --git a/src/mvp/v1.1/scripts/21_start_workers.sh b/src/mvp/v1.1/scripts/21_start_workers.sh new file mode 100644 index 0000000..2f55a95 --- /dev/null +++ b/src/mvp/v1.1/scripts/21_start_workers.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +HEAD_IP="$(container_ip "${HEAD_CONTAINER}")" +HEAD_ADDR="${HEAD_IP}:6379" + +start_one() { + local worker="$1" + local ip + ip="$(container_ip "${worker}")" + echo "[${worker}] ray stop (best-effort)" + dexec "${worker}" bash -lc "ray stop --force || true" + echo "[${worker}] start ray worker -> head ${HEAD_ADDR}" + dexec "${worker}" bash -lc "ray start --address='${HEAD_ADDR}' --node-ip-address='${ip}' --resources='{\"worker_node\": 100}' --disable-usage-stats" +} + +start_one "${WORKER0_CONTAINER}" +start_one "${WORKER1_CONTAINER}" + +echo "[head] ray status" +dexec "${HEAD_CONTAINER}" bash -lc "ray status || true" + diff --git a/src/mvp/v1.1/scripts/30_prepare_data_and_model.sh b/src/mvp/v1.1/scripts/30_prepare_data_and_model.sh new file mode 100644 index 0000000..157e743 --- /dev/null +++ b/src/mvp/v1.1/scripts/30_prepare_data_and_model.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +MODEL_ID="${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}" + +PPO_DATA_DIR="${SHARED_ROOT}/datasets/gsm8k" +SFT_DATA_DIR="${SHARED_ROOT}/datasets/gsm8k_sft" + +CODE_SNAPSHOT_DIR="${SHARED_ROOT}/common/code/verl/verl_repo" + +echo "[head] ensure dataset dirs exist" +dexec "${HEAD_CONTAINER}" bash -lc "mkdir -p '${PPO_DATA_DIR}' '${SFT_DATA_DIR}'" + +echo "[head] prepare PPO dataset (gsm8k RL parquet) -> ${PPO_DATA_DIR}" +dexec "${HEAD_CONTAINER}" bash -lc "if [[ -f '${PPO_DATA_DIR}/train.parquet' && -f '${PPO_DATA_DIR}/test.parquet' ]]; then echo 'ppo_dataset_exists: skip'; else python3 
/workspace/verl/examples/data_preprocess/gsm8k.py --local_save_dir '${PPO_DATA_DIR}'; fi" + +echo "[head] prepare SFT dataset (gsm8k messages parquet) -> ${SFT_DATA_DIR}" +if dexec "${HEAD_CONTAINER}" bash -lc "test -f '${SFT_DATA_DIR}/train.parquet'"; then + echo "[head] sft_dataset_exists: skip" +else + SFT_PY_CODE="$(cat <<'PY' +import os + +import pandas as pd +from datasets import load_dataset + +out_dir = os.environ["SFT_DATA_DIR"] +os.makedirs(out_dir, exist_ok=True) + +ds = load_dataset("openai/gsm8k", "main") + +instruction = "Let's think step by step and output the final answer after \"####\"." + +def to_messages(example): + q = example["question"].strip() + " " + instruction + a = example["answer"] + return { + "messages": [ + {"role": "user", "content": q}, + {"role": "assistant", "content": a}, + ] + } + +train = ds["train"].map(to_messages, remove_columns=ds["train"].column_names) +test = ds["test"].map(to_messages, remove_columns=ds["test"].column_names) + +pd.DataFrame(train).to_parquet(os.path.join(out_dir, "train.parquet"), index=False) +pd.DataFrame(test).to_parquet(os.path.join(out_dir, "test.parquet"), index=False) + +print("sft_dataset_written_ok:", out_dir) +PY +)" + printf "%s\n" "${SFT_PY_CODE}" | dexec "${HEAD_CONTAINER}" bash -lc "SFT_DATA_DIR='${SFT_DATA_DIR}' python3 -" +fi + +echo "[head] ensure model cached to persistent HF_HOME (idempotent) -> ${MODEL_ID}" +PY_CODE="$(cat <<'PY' +import os + +model_id = os.environ["MODEL_ID"] + +hf_home = os.environ.get("HF_HOME", "/private/hf") +os.environ.setdefault("HF_HOME", hf_home) +os.environ.setdefault("HUGGINGFACE_HUB_CACHE", os.path.join(hf_home, "hub")) +os.environ.setdefault("TRANSFORMERS_CACHE", os.path.join(hf_home, "transformers")) + +from huggingface_hub import snapshot_download + +try: + snapshot_download(repo_id=model_id, local_files_only=True) + print("model_cache_exists: skip", model_id) +except Exception: + print("model_cache_missing: downloading", model_id) + snapshot_download(repo_id=model_id) + print("model_cached_ok:", model_id) +PY +)" + +printf "%s\n" "${PY_CODE}" | dexec "${HEAD_CONTAINER}" bash -lc "MODEL_ID='${MODEL_ID}' python3 -" + +echo "[head] snapshot verl repo into shared common code path (idempotent best-effort) -> ${CODE_SNAPSHOT_DIR}" +dexec "${HEAD_CONTAINER}" bash -lc "mkdir -p '${CODE_SNAPSHOT_DIR}' && if command -v rsync >/dev/null 2>&1; then rsync -a --delete /workspace/verl/ '${CODE_SNAPSHOT_DIR}/'; else rm -rf '${CODE_SNAPSHOT_DIR:?}/'* && cp -a /workspace/verl/. '${CODE_SNAPSHOT_DIR}/'; fi && echo 'code_snapshot_ok'" diff --git a/src/mvp/v1.1/scripts/31_snapshot_verl_code.sh b/src/mvp/v1.1/scripts/31_snapshot_verl_code.sh new file mode 100644 index 0000000..0399e3e --- /dev/null +++ b/src/mvp/v1.1/scripts/31_snapshot_verl_code.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +# Create an immutable-ish code snapshot under: +# ${SHARED_ROOT}/common/code/verl/ +# +# By default, code_id is the git commit hash of /workspace/verl (mounted from ../verl). +# +# This enables job-level multi-version coexistence via runtime_env PYTHONPATH injection. 
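+#
+# Example layout after one snapshot (the commit hash shown is illustrative):
+#   ${SHARED_ROOT}/common/code/verl/0a1b2c3d.../verl/
+#   ${SHARED_ROOT}/common/code/verl/0a1b2c3d.../mvp_marker.py
+# A jobspec can then select this snapshot via, e.g.:
+#   code_path: "/private/common/code/verl/0a1b2c3d..."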
+ +CODE_ID="${CODE_ID:-}" + +if [[ -z "${CODE_ID}" ]]; then + CODE_ID="$(dexec "${HEAD_CONTAINER}" bash -lc "git config --global --add safe.directory /workspace/verl >/dev/null 2>&1 || true; git -C /workspace/verl rev-parse HEAD")" +fi + +DEST_DIR="${SHARED_ROOT}/common/code/verl/${CODE_ID}" + +echo "[head] snapshot verl repo -> ${DEST_DIR}" + +if dexec "${HEAD_CONTAINER}" bash -lc "test -d '${DEST_DIR}'"; then + echo "[head] code_snapshot_exists: skip" + exit 0 +fi + +dexec "${HEAD_CONTAINER}" bash -lc "mkdir -p '${DEST_DIR}'" + +# Copy code (no .git needed for runtime) +if dexec "${HEAD_CONTAINER}" bash -lc "command -v rsync >/dev/null 2>&1"; then + dexec "${HEAD_CONTAINER}" bash -lc "rsync -a --delete --exclude='.git' /workspace/verl/ '${DEST_DIR}/'" +else + dexec "${HEAD_CONTAINER}" bash -lc "tar -C /workspace/verl -cf - --exclude='.git' . | tar -C '${DEST_DIR}' -xf -" +fi + +# Add a tiny marker module for multi-version validation in Ray job logs. +dexec "${HEAD_CONTAINER}" bash -lc "printf \"%s\\n\" \"MARKER = '${CODE_ID}'\" > '${DEST_DIR}/mvp_marker.py'" + +echo "[head] code_snapshot_ok: ${CODE_ID}" diff --git a/src/mvp/v1.1/scripts/32_clone_verl_tags.sh b/src/mvp/v1.1/scripts/32_clone_verl_tags.sh new file mode 100644 index 0000000..dfcc470 --- /dev/null +++ b/src/mvp/v1.1/scripts/32_clone_verl_tags.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +VERL_REPO_URL="${VERL_REPO_URL:-https://github.com/volcengine/verl.git}" +DEST_BASE="${SHARED_ROOT}/common/code/verl" + +TAGS=("v0.6.0" "v0.6.1") + +echo "[head] ensure base dir: ${DEST_BASE}" +dexec "${HEAD_CONTAINER}" bash -lc "mkdir -p '${DEST_BASE}'" + +for tag in "${TAGS[@]}"; do + dest="${DEST_BASE}/verl_${tag}" + echo "[head] prepare verl tag ${tag} -> ${dest}" + + verify_repo_cmd="test -d '${dest}/.git' && git -C '${dest}' rev-parse --is-inside-work-tree >/dev/null 2>&1" + if dexec "${HEAD_CONTAINER}" bash -lc "${verify_repo_cmd}"; then + echo "[head] exists: verified git repo: ${dest}" + else + echo "[head] cloning ${tag} (retry with HTTP/1.1 if needed)" + dexec "${HEAD_CONTAINER}" bash -lc "rm -rf '${dest}'" + # Retry a few times because GitHub over HTTP/2 can occasionally fail with curl framing errors/timeouts. 
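+    # The loop below forces HTTP/1.1 per invocation via `git -c http.version=HTTP/1.1`;
+    # a persistent alternative (not used here) would be, e.g.:
+    #   dexec "${HEAD_CONTAINER}" bash -lc "git config --global http.version HTTP/1.1"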
+ dexec "${HEAD_CONTAINER}" bash -lc "set -euo pipefail; for i in 1 2 3; do echo \"clone_attempt=\\$i\"; if git -c http.version=HTTP/1.1 clone --filter=blob:none --single-branch --branch '${tag}' --depth 1 '${VERL_REPO_URL}' '${dest}'; then exit 0; fi; rm -rf '${dest}'; sleep 3; done; exit 1" + dexec "${HEAD_CONTAINER}" bash -lc "${verify_repo_cmd}" || { echo \"[head] clone failed or repo invalid: ${dest}\" >&2; exit 1; } + fi + + # Avoid git safe.directory issues when reading repo state + dexec "${HEAD_CONTAINER}" bash -lc "git config --global --add safe.directory '${dest}' >/dev/null 2>&1 || true" + dexec "${HEAD_CONTAINER}" bash -lc "printf 'tag='; git -C '${dest}' describe --tags --exact-match 2>/dev/null || true; printf '\\nhead='; git -C '${dest}' rev-parse HEAD; printf '\\n'" + + # Add marker for multi-version verification in Ray job logs + dexec "${HEAD_CONTAINER}" bash -lc "printf \"%s\\n\" \"MARKER = '${tag}'\" > '${dest}/mvp_marker.py'" +done + +echo "[head] done" diff --git a/src/mvp/v1.1/scripts/40_submit_ppo_epoch1.sh b/src/mvp/v1.1/scripts/40_submit_ppo_epoch1.sh new file mode 100644 index 0000000..ac11ab9 --- /dev/null +++ b/src/mvp/v1.1/scripts/40_submit_ppo_epoch1.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +SUBMISSION_ID="${SUBMISSION_ID:-mvp11_ppo_$(timestamp)_$RANDOM}" +JOB_DIR="${SHARED_ROOT}/jobs/${SUBMISSION_ID}" + +MODEL_ID="${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}" +TRAIN_FILE="${SHARED_ROOT}/datasets/gsm8k/train.parquet" +VAL_FILE="${SHARED_ROOT}/datasets/gsm8k/test.parquet" + +CODE_PATH="${CODE_PATH:-${SHARED_ROOT}/common/code/verl/verl_repo}" +TOTAL_TRAINING_STEPS="${TOTAL_TRAINING_STEPS:-10}" +SAVE_FREQ="${SAVE_FREQ:-10}" +TEST_FREQ="${TEST_FREQ:--1}" + +echo "[head] create job dir: ${JOB_DIR}" +dexec "${HEAD_CONTAINER}" bash -lc "mkdir -p '${JOB_DIR}'/{logs,checkpoints,config,debug}" + +SUBMIT_CMD="python3 -m verl.trainer.main_ppo \ +data.train_files=${TRAIN_FILE} \ +data.val_files=${VAL_FILE} \ +data.train_batch_size=256 \ +data.max_prompt_length=512 \ +data.max_response_length=512 \ +actor_rollout_ref.model.path=${MODEL_ID} \ +actor_rollout_ref.actor.optim.lr=1e-6 \ +actor_rollout_ref.actor.ppo_mini_batch_size=64 \ +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ +actor_rollout_ref.rollout.name=sglang \ +actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ +actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ +actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ +actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ +critic.optim.lr=1e-5 \ +critic.model.path=${MODEL_ID} \ +critic.ppo_micro_batch_size_per_gpu=4 \ +algorithm.kl_ctrl.kl_coef=0.001 \ +trainer.logger=console \ +trainer.val_before_train=False \ +trainer.n_gpus_per_node=4 \ +trainer.nnodes=2 \ +trainer.save_freq=${SAVE_FREQ} \ +trainer.test_freq=${TEST_FREQ} \ +trainer.total_epochs=1 \ +trainer.total_training_steps=${TOTAL_TRAINING_STEPS} \ +trainer.resume_mode=disable \ +trainer.default_local_dir=${JOB_DIR}/checkpoints \ ++ray_kwargs.ray_init.address=auto \ +hydra.run.dir=${JOB_DIR}/logs/hydra" + +printf "%s\n" "${SUBMIT_CMD}" | dexec "${HEAD_CONTAINER}" bash -lc "cat > '${JOB_DIR}/config/submit_cmd.txt'" + +echo "[head] debug snapshot (pre-submit)" +dexec "${HEAD_CONTAINER}" bash -lc "ray status >'${JOB_DIR}/debug/ray_status_pre.txt' 2>&1 || true" + +echo "[head] submit PPO via ray job submit (driver forced on worker)" 
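+# "Driver forced on worker": the entrypoint requests 1 unit of the custom `worker_node`
+# resource that 21_start_workers.sh registers on the workers (the head starts with
+# --num-cpus=0), so the Ray job driver lands on a worker container, not on the head.
+# After submission the job can be followed from the head container, e.g.:
+#   ray job status --address="${RAY_DASHBOARD_ADDR}" "${SUBMISSION_ID}"
+#   ray job logs --address="${RAY_DASHBOARD_ADDR}" --follow "${SUBMISSION_ID}"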
+SUBMIT_OUT="$(dexec "${HEAD_CONTAINER}" bash -lc "ray job submit --address='${RAY_DASHBOARD_ADDR}' --submission-id='${SUBMISSION_ID}' --entrypoint-num-cpus=1 --entrypoint-resources='{\"worker_node\": 1}' --runtime-env-json='{\"env_vars\":{\"HF_HOME\":\"${SHARED_ROOT}/hf\",\"HUGGINGFACE_HUB_CACHE\":\"${SHARED_ROOT}/hf/hub\",\"TRANSFORMERS_CACHE\":\"${SHARED_ROOT}/hf/transformers\",\"HF_ENDPOINT\":\"https://hf-mirror.com\",\"PYTHONUNBUFFERED\":\"1\",\"PYTHONPATH\":\"${CODE_PATH}:${SHARED_ROOT}/user/code\"}}' --no-wait -- ${SUBMIT_CMD}")" + +printf "%s\n" "${SUBMIT_OUT}" +printf "%s\n" "${SUBMIT_OUT}" | dexec "${HEAD_CONTAINER}" bash -lc "cat > '${JOB_DIR}/logs/ray_job_submit.out'" +echo "${SUBMISSION_ID}" | dexec "${HEAD_CONTAINER}" bash -lc "cat > '${JOB_DIR}/config/ray_submission_id.txt'" + +echo "[head] debug snapshot (post-submit)" +dexec "${HEAD_CONTAINER}" bash -lc "ray job list >'${JOB_DIR}/debug/ray_job_list_post.txt' 2>&1 || true" +dexec "${HEAD_CONTAINER}" bash -lc "ray status >'${JOB_DIR}/debug/ray_status_post.txt' 2>&1 || true" + +echo "submitted: ${SUBMISSION_ID}" +echo "job dir: ${JOB_DIR}" diff --git a/src/mvp/v1.1/scripts/41_submit_grpo_epoch1.sh b/src/mvp/v1.1/scripts/41_submit_grpo_epoch1.sh new file mode 100644 index 0000000..835a6da --- /dev/null +++ b/src/mvp/v1.1/scripts/41_submit_grpo_epoch1.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +SUBMISSION_ID="${SUBMISSION_ID:-mvp11_grpo_$(timestamp)_$RANDOM}" +JOB_DIR="${SHARED_ROOT}/jobs/${SUBMISSION_ID}" + +MODEL_ID="${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}" +TRAIN_FILE="${SHARED_ROOT}/datasets/gsm8k/train.parquet" +VAL_FILE="${SHARED_ROOT}/datasets/gsm8k/test.parquet" + +CODE_PATH="${CODE_PATH:-${SHARED_ROOT}/common/code/verl/verl_repo}" +TOTAL_TRAINING_STEPS="${TOTAL_TRAINING_STEPS:-10}" +SAVE_FREQ="${SAVE_FREQ:-10}" +TEST_FREQ="${TEST_FREQ:--1}" + +echo "[head] create job dir: ${JOB_DIR}" +dexec "${HEAD_CONTAINER}" bash -lc "mkdir -p '${JOB_DIR}'/{logs,checkpoints,config,debug}" + +SUBMIT_CMD="python3 -m verl.trainer.main_ppo \ +data.train_files=${TRAIN_FILE} \ +data.val_files=${VAL_FILE} \ +data.train_batch_size=256 \ +data.max_prompt_length=512 \ +data.max_response_length=512 \ +actor_rollout_ref.model.path=${MODEL_ID} \ +actor_rollout_ref.actor.optim.lr=1e-6 \ +actor_rollout_ref.actor.ppo_mini_batch_size=64 \ +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ +actor_rollout_ref.rollout.name=sglang \ +actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ +actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ +actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ +actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ +critic.optim.lr=1e-5 \ +critic.model.path=${MODEL_ID} \ +critic.ppo_micro_batch_size_per_gpu=4 \ +algorithm.adv_estimator=grpo \ +algorithm.kl_ctrl.kl_coef=0.001 \ +trainer.logger=console \ +trainer.val_before_train=False \ +trainer.n_gpus_per_node=4 \ +trainer.nnodes=2 \ +trainer.save_freq=${SAVE_FREQ} \ +trainer.test_freq=${TEST_FREQ} \ +trainer.total_epochs=1 \ +trainer.total_training_steps=${TOTAL_TRAINING_STEPS} \ +trainer.resume_mode=disable \ +trainer.default_local_dir=${JOB_DIR}/checkpoints \ ++ray_kwargs.ray_init.address=auto \ +hydra.run.dir=${JOB_DIR}/logs/hydra" + +printf "%s\n" "${SUBMIT_CMD}" | dexec "${HEAD_CONTAINER}" bash -lc "cat > '${JOB_DIR}/config/submit_cmd.txt'" + +echo "[head] debug snapshot (pre-submit)" +dexec 
"${HEAD_CONTAINER}" bash -lc "ray status >'${JOB_DIR}/debug/ray_status_pre.txt' 2>&1 || true" + +echo "[head] submit GRPO via ray job submit (driver forced on worker)" +SUBMIT_OUT="$(dexec "${HEAD_CONTAINER}" bash -lc "ray job submit --address='${RAY_DASHBOARD_ADDR}' --submission-id='${SUBMISSION_ID}' --entrypoint-num-cpus=1 --entrypoint-resources='{\"worker_node\": 1}' --runtime-env-json='{\"env_vars\":{\"HF_HOME\":\"${SHARED_ROOT}/hf\",\"HUGGINGFACE_HUB_CACHE\":\"${SHARED_ROOT}/hf/hub\",\"TRANSFORMERS_CACHE\":\"${SHARED_ROOT}/hf/transformers\",\"HF_ENDPOINT\":\"https://hf-mirror.com\",\"PYTHONUNBUFFERED\":\"1\",\"PYTHONPATH\":\"${CODE_PATH}:${SHARED_ROOT}/user/code\"}}' --no-wait -- ${SUBMIT_CMD}")" + +printf "%s\n" "${SUBMIT_OUT}" +printf "%s\n" "${SUBMIT_OUT}" | dexec "${HEAD_CONTAINER}" bash -lc "cat > '${JOB_DIR}/logs/ray_job_submit.out'" +echo "${SUBMISSION_ID}" | dexec "${HEAD_CONTAINER}" bash -lc "cat > '${JOB_DIR}/config/ray_submission_id.txt'" + +echo "[head] debug snapshot (post-submit)" +dexec "${HEAD_CONTAINER}" bash -lc "ray job list >'${JOB_DIR}/debug/ray_job_list_post.txt' 2>&1 || true" +dexec "${HEAD_CONTAINER}" bash -lc "ray status >'${JOB_DIR}/debug/ray_status_post.txt' 2>&1 || true" + +echo "submitted: ${SUBMISSION_ID}" +echo "job dir: ${JOB_DIR}" diff --git a/src/mvp/v1.1/scripts/42_submit_sft_minimal.sh b/src/mvp/v1.1/scripts/42_submit_sft_minimal.sh new file mode 100644 index 0000000..d30c7f4 --- /dev/null +++ b/src/mvp/v1.1/scripts/42_submit_sft_minimal.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +SUBMISSION_ID="${SUBMISSION_ID:-mvp11_sft_$(timestamp)_$RANDOM}" +JOB_DIR="${SHARED_ROOT}/jobs/${SUBMISSION_ID}" + +MODEL_ID="${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}" +TRAIN_FILE="${SHARED_ROOT}/datasets/gsm8k_sft/train.parquet" +VAL_FILE="${SHARED_ROOT}/datasets/gsm8k_sft/test.parquet" + +CODE_PATH="${CODE_PATH:-${SHARED_ROOT}/common/code/verl/verl_repo}" +TOTAL_TRAINING_STEPS="${TOTAL_TRAINING_STEPS:-10}" +SAVE_FREQ="${SAVE_FREQ:-10}" +SFT_DRIVER_DEVICE="${SFT_DRIVER_DEVICE:-cpu}" + +echo "[head] create job dir: ${JOB_DIR}" +dexec "${HEAD_CONTAINER}" bash -lc "mkdir -p '${JOB_DIR}'/{logs,checkpoints,config,debug}" + +SUBMIT_CMD="python3 -m verl.trainer.sft_trainer_ray \ +model.path=${MODEL_ID} \ +data.train_files=${TRAIN_FILE} \ +data.val_files=null \ +data.train_batch_size=64 \ +data.micro_batch_size_per_gpu=1 \ +data.max_token_len_per_gpu=2048 \ +data.max_length=1024 \ +trainer.logger=console \ +trainer.project_name=mvp11-sft \ +trainer.experiment_name=${SUBMISSION_ID} \ +trainer.total_epochs=1 \ +trainer.total_training_steps=${TOTAL_TRAINING_STEPS} \ +trainer.save_freq=${SAVE_FREQ} \ +trainer.test_freq=-1 \ +trainer.resume_mode=disable \ +trainer.device=${SFT_DRIVER_DEVICE} \ +trainer.default_local_dir=${JOB_DIR}/checkpoints \ +trainer.nnodes=2 \ +trainer.n_gpus_per_node=4 \ +hydra.run.dir=${JOB_DIR}/logs/hydra" + +printf "%s\n" "${SUBMIT_CMD}" | dexec "${HEAD_CONTAINER}" bash -lc "cat > '${JOB_DIR}/config/submit_cmd.txt'" + +echo "[head] debug snapshot (pre-submit)" +dexec "${HEAD_CONTAINER}" bash -lc "ray status >'${JOB_DIR}/debug/ray_status_pre.txt' 2>&1 || true" + +echo "[head] submit SFT via ray job submit (driver forced on worker)" +SUBMIT_OUT="$(dexec "${HEAD_CONTAINER}" bash -lc "ray job submit --address='${RAY_DASHBOARD_ADDR}' --submission-id='${SUBMISSION_ID}' --entrypoint-num-cpus=1 
--entrypoint-resources='{\"worker_node\": 1}' --runtime-env-json='{\"env_vars\":{\"HF_HOME\":\"${SHARED_ROOT}/hf\",\"HUGGINGFACE_HUB_CACHE\":\"${SHARED_ROOT}/hf/hub\",\"TRANSFORMERS_CACHE\":\"${SHARED_ROOT}/hf/transformers\",\"HF_ENDPOINT\":\"https://hf-mirror.com\",\"PYTHONUNBUFFERED\":\"1\",\"RAY_ADDRESS\":\"auto\",\"PYTHONPATH\":\"${CODE_PATH}:${SHARED_ROOT}/user/code\"}}' --no-wait -- ${SUBMIT_CMD}")" + +printf "%s\n" "${SUBMIT_OUT}" +printf "%s\n" "${SUBMIT_OUT}" | dexec "${HEAD_CONTAINER}" bash -lc "cat > '${JOB_DIR}/logs/ray_job_submit.out'" +echo "${SUBMISSION_ID}" | dexec "${HEAD_CONTAINER}" bash -lc "cat > '${JOB_DIR}/config/ray_submission_id.txt'" + +echo "[head] debug snapshot (post-submit)" +dexec "${HEAD_CONTAINER}" bash -lc "ray job list >'${JOB_DIR}/debug/ray_job_list_post.txt' 2>&1 || true" +dexec "${HEAD_CONTAINER}" bash -lc "ray status >'${JOB_DIR}/debug/ray_status_post.txt' 2>&1 || true" + +echo "submitted: ${SUBMISSION_ID}" +echo "job dir: ${JOB_DIR}" diff --git a/src/mvp/v1.1/scripts/43_submit_jobspec.sh b/src/mvp/v1.1/scripts/43_submit_jobspec.sh new file mode 100644 index 0000000..c4a1e19 --- /dev/null +++ b/src/mvp/v1.1/scripts/43_submit_jobspec.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +SPEC_PATH="${1:-}" +if [[ -z "${SPEC_PATH}" ]]; then + echo "usage: $0 " >&2 + echo "example: $0 /workspace/mvp/v1.1/templates/ppo.json" >&2 + exit 1 +fi + +# Submit from head container (required), but with driver forced onto worker via entrypoint resources in spec. +dexec "${HEAD_CONTAINER}" bash -lc "SHARED_ROOT='${SHARED_ROOT}' python3 /workspace/mvp/v1.1/submit_job.py --spec '${SPEC_PATH}' --no-wait" + diff --git a/src/mvp/v1.1/scripts/44_submit_sdk.sh b/src/mvp/v1.1/scripts/44_submit_sdk.sh new file mode 100644 index 0000000..d7bf010 --- /dev/null +++ b/src/mvp/v1.1/scripts/44_submit_sdk.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +CONFIG_PATH="${1:-/workspace/mvp/v1.1/py/configs/dev.yaml}" +JOBSPEC_PATH="${2:-}" + +if [[ -z "${JOBSPEC_PATH}" ]]; then + echo "usage: $0 " >&2 + echo "example: $0 /workspace/mvp/v1.1/py/configs/dev.yaml /workspace/mvp/v1.1/py/jobspecs/ppo.yaml" >&2 + exit 1 +fi + +echo "[head] submit via Ray Python SDK" +dexec "${HEAD_CONTAINER}" bash -lc "python3 /workspace/mvp/v1.1/py/run.py --config '${CONFIG_PATH}' --jobspec '${JOBSPEC_PATH}' --action submit --no-wait" + diff --git a/src/mvp/v1.1/scripts/46_submit_ppo_two_verl_tags.sh b/src/mvp/v1.1/scripts/46_submit_ppo_two_verl_tags.sh new file mode 100644 index 0000000..9ccf831 --- /dev/null +++ b/src/mvp/v1.1/scripts/46_submit_ppo_two_verl_tags.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +CONFIG_PATH="${1:-/workspace/mvp/v1.1/py/configs/dev.yaml}" + +TS="$(timestamp)" +BASE="/workspace/mvp/v1.1/py/jobspecs" + +NNODES="${NNODES:-1}" +N_GPUS_PER_NODE="${N_GPUS_PER_NODE:-1}" + +wait_job() { + local sid="$1" + echo "[head] wait: ${sid}" + while true; do + # Ray returns one of: PENDING/RUNNING/SUCCEEDED/FAILED/STOPPED + st="$(dexec "${HEAD_CONTAINER}" bash -lc "python3 /workspace/mvp/v1.1/py/run.py --config '${CONFIG_PATH}' --action status --submission-id '${sid}'" | tr -d '\r' | tail -n 1)" + echo 
"[head] status: ${sid} -> ${st}" + case "${st}" in + *SUCCEEDED*) + return 0 + ;; + *FAILED*|*STOPPED*) + echo "[head] job not successful: ${sid} (${st})" >&2 + echo "[head] last logs:" + dexec "${HEAD_CONTAINER}" bash -lc "python3 /workspace/mvp/v1.1/py/run.py --config '${CONFIG_PATH}' --action logs --submission-id '${sid}' --tail 200" || true + return 1 + ;; + *) + sleep 10 + ;; + esac + done +} + +show_precheck() { + local sid="$1" + echo "[head] verify precheck: ${sid}" + dexec "${HEAD_CONTAINER}" bash -lc "python3 /workspace/mvp/v1.1/py/run.py --config '${CONFIG_PATH}' --action logs --submission-id '${sid}' --tail 2000 | egrep 'MVP_PRECHECK_VERL_FILE|MVP_PRECHECK_MARKER' || true" +} + +make_spec() { + local tag="$1" + local code_path="$2" + local out_path="$3" + local sid="mvp11_ppo_${tag//./_}_${TS}" + dexec "${HEAD_CONTAINER}" bash -lc "cat > '${out_path}' <<'YAML' +workload: \"ppo\" +submission_id: \"${sid}\" +code_path: \"${code_path}\" +model_id: \"Qwen/Qwen2.5-0.5B-Instruct\" +train_file: \"/private/datasets/gsm8k/train.parquet\" +val_file: \"/private/datasets/gsm8k/test.parquet\" +nnodes: ${NNODES} +n_gpus_per_node: ${N_GPUS_PER_NODE} +total_epochs: 1 +total_training_steps: 10 +save_freq: 10 +test_freq: -1 +YAML" + echo "${sid}" +} + +echo "[head] submit PPO sequentially with verl v0.6.0 then v0.6.1" +echo "[head] resources: nnodes=${NNODES} n_gpus_per_node=${N_GPUS_PER_NODE}" + +sid0="$(make_spec "v0.6.0" "/private/common/code/verl/verl_v0.6.0" "${BASE}/tmp_ppo_verl_v0.6.0_${TS}.yaml")" +echo "[head] submit: ${sid0}" +dexec "${HEAD_CONTAINER}" bash -lc "python3 /workspace/mvp/v1.1/py/run.py --config '${CONFIG_PATH}' --jobspec '${BASE}/tmp_ppo_verl_v0.6.0_${TS}.yaml' --action submit --no-wait" +wait_job "${sid0}" +show_precheck "${sid0}" + +sid1="$(make_spec "v0.6.1" "/private/common/code/verl/verl_v0.6.1" "${BASE}/tmp_ppo_verl_v0.6.1_${TS}.yaml")" +echo "[head] submit: ${sid1}" +dexec "${HEAD_CONTAINER}" bash -lc "python3 /workspace/mvp/v1.1/py/run.py --config '${CONFIG_PATH}' --jobspec '${BASE}/tmp_ppo_verl_v0.6.1_${TS}.yaml' --action submit --no-wait" +wait_job "${sid1}" +show_precheck "${sid1}" + +echo "[head] done" +echo "submitted:" +echo " ${sid0} (verl v0.6.0)" +echo " ${sid1} (verl v0.6.1)" diff --git a/src/mvp/v1.1/scripts/50_status.sh b/src/mvp/v1.1/scripts/50_status.sh new file mode 100644 index 0000000..03847a0 --- /dev/null +++ b/src/mvp/v1.1/scripts/50_status.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib.sh +source "${SCRIPT_DIR}/lib.sh" + +echo "[head] ray status" +dexec "${HEAD_CONTAINER}" bash -lc "ray status || true" + +echo "[head] ray job list" +dexec "${HEAD_CONTAINER}" bash -lc "ray job list || true" + diff --git a/src/mvp/v1.1/scripts/lib.sh b/src/mvp/v1.1/scripts/lib.sh new file mode 100644 index 0000000..23e0997 --- /dev/null +++ b/src/mvp/v1.1/scripts/lib.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT_DIR="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" + +COMPOSE_FILE="${ROOT_DIR}/docker-compose.yaml" + +HEAD_CONTAINER="mvp11-ray-head" +WORKER0_CONTAINER="mvp11-ray-worker-0" +WORKER1_CONTAINER="mvp11-ray-worker-1" + +SHARED_ROOT="${SHARED_ROOT:-/private}" +RAY_DASHBOARD_ADDR="${RAY_DASHBOARD_ADDR:-http://127.0.0.1:8265}" + +dc() { + docker compose --project-directory "${ROOT_DIR}" -f "${COMPOSE_FILE}" "$@" +} + +require_cmd() { + local cmd="$1" + command -v "${cmd}" >/dev/null 2>&1 || { + echo "missing required command: ${cmd}" >&2 + exit 1 + } +} + +ensure_container_running() { + local name="$1" + if ! docker ps --format '{{.Names}}' | grep -qx "${name}"; then + echo "container not running: ${name}" >&2 + exit 1 + fi +} + +dexec() { + local name="$1" + shift + ensure_container_running "${name}" + docker exec -i "${name}" "$@" +} + +container_ip() { + local name="$1" + ensure_container_running "${name}" + docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' "${name}" +} + +timestamp() { + date +"%Y%m%d_%H%M%S" +} + diff --git a/src/mvp/v1.1/scripts/run_all.sh b/src/mvp/v1.1/scripts/run_all.sh new file mode 100644 index 0000000..cf84c01 --- /dev/null +++ b/src/mvp/v1.1/scripts/run_all.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +"${SCRIPT_DIR}/00_prereq_check.sh" +"${SCRIPT_DIR}/01_up.sh" +"${SCRIPT_DIR}/20_start_head.sh" +"${SCRIPT_DIR}/21_start_workers.sh" +"${SCRIPT_DIR}/30_prepare_data_and_model.sh" +"${SCRIPT_DIR}/40_submit_ppo_epoch1.sh" +"${SCRIPT_DIR}/41_submit_grpo_epoch1.sh" +"${SCRIPT_DIR}/42_submit_sft_minimal.sh" +"${SCRIPT_DIR}/50_status.sh" + diff --git a/src/mvp/v1.1/submit_job.py b/src/mvp/v1.1/submit_job.py new file mode 100644 index 0000000..9e8f54c --- /dev/null +++ b/src/mvp/v1.1/submit_job.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +import argparse +import json +import os +import shlex +import subprocess +from datetime import datetime +from pathlib import Path + + +def _ts(): + return datetime.now().strftime("%Y%m%d_%H%M%S") + + +def _expand(value: str) -> str: + return os.path.expandvars(value) + + +def _mkdir(path: Path) -> None: + path.mkdir(parents=True, exist_ok=True) + + +def _write_text(path: Path, content: str) -> None: + _mkdir(path.parent) + path.write_text(content, encoding="utf-8") + + +def _write_json(path: Path, obj) -> None: + _mkdir(path.parent) + path.write_text(json.dumps(obj, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + + +def _require(spec: dict, key: str): + if key not in spec: + raise SystemExit(f"missing required key in spec: {key}") + return spec[key] + + +def _default_submission_id(workload: str) -> str: + return f"mvp11_{workload}_{_ts()}_{os.getpid()}" + + +def _runtime_env(spec: dict) -> dict: + shared_root = _expand(_require(spec, "shared_root")) + code_path = _expand(_require(spec, "code_path")) + env_vars = dict(spec.get("runtime_env", {}).get("env_vars", {})) + + env_vars.setdefault("HF_HOME", f"{shared_root}/hf") + env_vars.setdefault("HUGGINGFACE_HUB_CACHE", f"{shared_root}/hf/hub") + env_vars.setdefault("TRANSFORMERS_CACHE", f"{shared_root}/hf/transformers") + env_vars.setdefault("PYTHONUNBUFFERED", "1") + + user_code = f"{shared_root}/user/code" + existing = env_vars.get("PYTHONPATH", "") + prefix = f"{code_path}:{user_code}" + env_vars["PYTHONPATH"] = f"{prefix}:{existing}" if existing else prefix + + # Helpful marker for logs/debugging + env_vars.setdefault("MVP_CODE_PATH", code_path) + + return {"env_vars": env_vars} + + +def _preflight_shell() -> str: 
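+    # Illustrative log lines this preflight produces (actual paths depend on code_path):
+    #   MVP_PRECHECK_VERL_FILE: /private/common/code/verl/verl_v0.6.0/verl/__init__.py
+    #   MVP_PRECHECK_MARKER: v0.6.0
+    # 46_submit_ppo_two_verl_tags.sh greps these lines to confirm which verl snapshot ran.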
+ # Make multi-version/debugging observable in Ray job logs. + # `mvp_marker.py` is written by our snapshot script (optional); if missing, ignore. + py = r""" +import os +import sys +print("MVP_PRECHECK_PYTHON:", sys.executable) +print("MVP_PRECHECK_PYTHONPATH:", os.environ.get("PYTHONPATH")) +try: + import verl + print("MVP_PRECHECK_VERL_FILE:", getattr(verl, "__file__", None)) +except Exception as e: + print("MVP_PRECHECK_VERL_IMPORT_ERROR:", repr(e)) +try: + import mvp_marker + print("MVP_PRECHECK_MARKER:", getattr(mvp_marker, "MARKER", None)) +except Exception as e: + print("MVP_PRECHECK_MARKER_MISSING:", repr(e)) +""" + return f"python3 - <<'PY'\n{py.strip()}\nPY" + + +def _build_entrypoint(spec: dict, submission_id: str, job_dir: str) -> str: + workload = _require(spec, "workload") + model_id = _expand(_require(spec, "model_id")) + shared_root = _expand(_require(spec, "shared_root")) + + if workload in ("ppo", "grpo"): + cfg = spec.get(workload, {}) + train_file = _expand(cfg.get("train_file", f"{shared_root}/datasets/gsm8k/train.parquet")) + val_file = _expand(cfg.get("val_file", f"{shared_root}/datasets/gsm8k/test.parquet")) + nnodes = int(cfg.get("nnodes", 2)) + gpus_per_node = int(cfg.get("n_gpus_per_node", 4)) + total_epochs = int(cfg.get("total_epochs", 1)) + total_steps = int(cfg.get("total_training_steps", 10)) + save_freq = int(cfg.get("save_freq", 10)) + test_freq = int(cfg.get("test_freq", -1)) + + algo_overrides = "" + if workload == "grpo": + algo_overrides = "algorithm.adv_estimator=grpo" + + cmd = f"""python3 -m verl.trainer.main_ppo \ +data.train_files={train_file} \ +data.val_files={val_file} \ +data.train_batch_size=256 \ +data.max_prompt_length=512 \ +data.max_response_length=512 \ +actor_rollout_ref.model.path={model_id} \ +actor_rollout_ref.actor.optim.lr=1e-6 \ +actor_rollout_ref.actor.ppo_mini_batch_size=64 \ +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ +actor_rollout_ref.rollout.name=sglang \ +actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ +actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ +actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ +actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ +critic.optim.lr=1e-5 \ +critic.model.path={model_id} \ +critic.ppo_micro_batch_size_per_gpu=4 \ +algorithm.kl_ctrl.kl_coef=0.001 \ +{algo_overrides} \ +trainer.logger=console \ +trainer.val_before_train=False \ +trainer.n_gpus_per_node={gpus_per_node} \ +trainer.nnodes={nnodes} \ +trainer.save_freq={save_freq} \ +trainer.test_freq={test_freq} \ +trainer.total_epochs={total_epochs} \ +trainer.total_training_steps={total_steps} \ +trainer.resume_mode=disable \ +trainer.default_local_dir={job_dir}/checkpoints \ ++ray_kwargs.ray_init.address=auto \ +hydra.run.dir={job_dir}/logs/hydra""" + return "\n".join([_preflight_shell(), "exec " + cmd]) + + if workload == "sft": + cfg = spec.get("sft", {}) + train_file = _expand(cfg.get("train_file", f"{shared_root}/datasets/gsm8k_sft/train.parquet")) + val_file = cfg.get("val_file", None) + nnodes = int(cfg.get("nnodes", 2)) + gpus_per_node = int(cfg.get("n_gpus_per_node", 4)) + total_epochs = int(cfg.get("total_epochs", 1)) + total_steps = int(cfg.get("total_training_steps", 10)) + save_freq = int(cfg.get("save_freq", 10)) + device = cfg.get("device", "cpu") + + val_override = "data.val_files=null" if val_file is None else f"data.val_files={_expand(val_file)}" + + # Note: driver should not require CUDA under ray job submit (no entrypoint GPUs by default). 
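+        # Sketch of the rendered entrypoint with the templates/sft.json defaults
+        # (illustrative only; real values come from the spec):
+        #   python3 -m verl.trainer.sft_trainer_ray model.path=Qwen/Qwen2.5-0.5B-Instruct \
+        #     trainer.device=cpu trainer.nnodes=2 trainer.n_gpus_per_node=4 ...
+        # so the submission-side driver stays on CPU while the Ray workers claim the GPUs.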
+ cmd = f"""python3 -m verl.trainer.sft_trainer_ray \ +model.path={model_id} \ +data.train_files={train_file} \ +{val_override} \ +data.train_batch_size=64 \ +data.micro_batch_size_per_gpu=1 \ +data.max_token_len_per_gpu=2048 \ +data.max_length=1024 \ +trainer.logger=console \ +trainer.project_name=mvp11-sft \ +trainer.experiment_name={submission_id} \ +trainer.total_epochs={total_epochs} \ +trainer.total_training_steps={total_steps} \ +trainer.save_freq={save_freq} \ +trainer.test_freq=-1 \ +trainer.resume_mode=disable \ +trainer.device={device} \ +trainer.default_local_dir={job_dir}/checkpoints \ +trainer.nnodes={nnodes} \ +trainer.n_gpus_per_node={gpus_per_node} \ +hydra.run.dir={job_dir}/logs/hydra""" + return "\n".join([_preflight_shell(), "exec " + cmd]) + + raise SystemExit(f"unsupported workload: {workload}") + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--spec", required=True, help="Path to JobSpec json (inside this container)") + parser.add_argument("--no-wait", action="store_true", help="Submit and return immediately") + parser.add_argument("--dry-run", action="store_true", help="Only print the submit command") + args = parser.parse_args() + + spec_path = Path(args.spec) + spec = json.loads(spec_path.read_text(encoding="utf-8")) + + shared_root = _expand(_require(spec, "shared_root")) + workload = _require(spec, "workload") + + submission_id = spec.get("submission_id") or _default_submission_id(workload) + job_dir = f"{shared_root}/jobs/{submission_id}" + + ray_cfg = _require(spec, "ray") + ray_addr = ray_cfg.get("address", "http://127.0.0.1:8265") + entrypoint_num_cpus = ray_cfg.get("entrypoint_num_cpus", 1) + entrypoint_resources = ray_cfg.get("entrypoint_resources", {"worker_node": 1}) + + runtime_env = _runtime_env(spec) + entrypoint = _build_entrypoint(spec, submission_id=submission_id, job_dir=job_dir) + + # Prepare job dir + job_root = Path(job_dir) + _mkdir(job_root / "config") + _mkdir(job_root / "logs") + _mkdir(job_root / "checkpoints") + _mkdir(job_root / "debug") + + # Snapshot config for audit/debug + _write_json(job_root / "config" / "job_spec.json", spec) + _write_json(job_root / "config" / "runtime_env.json", runtime_env) + _write_text(job_root / "config" / "ray_submission_id.txt", submission_id + "\n") + + submit_cmd_txt = "\n".join( + [ + "ray job submit", + f" --address={ray_addr}", + f" --submission-id={submission_id}", + f" --entrypoint-num-cpus={entrypoint_num_cpus}", + f" --entrypoint-resources={json.dumps(entrypoint_resources)}", + f" --runtime-env-json=", + f" {'--no-wait' if args.no_wait else ''}", + " -- bash -lc ''", + ] + ) + _write_text(job_root / "config" / "submit_cmd.txt", submit_cmd_txt + "\n") + + # Debug snapshot (pre-submit) + try: + pre = subprocess.run(["ray", "status"], capture_output=True, text=True, check=False) + _write_text(job_root / "debug" / "ray_status_pre.txt", (pre.stdout or "") + (pre.stderr or "")) + except FileNotFoundError: + _write_text(job_root / "debug" / "ray_status_pre.txt", "ray cli not found\n") + + submit_args = [ + "ray", + "job", + "submit", + "--address", + ray_addr, + "--submission-id", + submission_id, + "--entrypoint-num-cpus", + str(entrypoint_num_cpus), + "--entrypoint-resources", + json.dumps(entrypoint_resources), + "--runtime-env-json", + json.dumps(runtime_env), + ] + if args.no_wait: + submit_args.append("--no-wait") + submit_args += ["--", "bash", "-lc", entrypoint] + + if args.dry_run: + print(" ".join(shlex.quote(x) for x in submit_args)) + return 0 + + proc = 
subprocess.run(submit_args, capture_output=True, text=True, check=False) + _write_text(job_root / "logs" / "ray_job_submit.out", (proc.stdout or "") + (proc.stderr or "")) + print(proc.stdout, end="") + if proc.returncode != 0: + print(proc.stderr, end="", file=os.sys.stderr) + return proc.returncode + + # Debug snapshot (post-submit) + try: + post = subprocess.run(["ray", "job", "list", "--log-style=record", "-v"], capture_output=True, text=True, check=False) + _write_text(job_root / "debug" / "ray_job_list_post.txt", (post.stdout or "") + (post.stderr or "")) + post2 = subprocess.run(["ray", "status"], capture_output=True, text=True, check=False) + _write_text(job_root / "debug" / "ray_status_post.txt", (post2.stdout or "") + (post2.stderr or "")) + except FileNotFoundError: + pass + + print(f"job_dir: {job_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/src/mvp/v1.1/templates/grpo.json b/src/mvp/v1.1/templates/grpo.json new file mode 100644 index 0000000..cd98915 --- /dev/null +++ b/src/mvp/v1.1/templates/grpo.json @@ -0,0 +1,31 @@ +{ + "submission_id": "", + "workload": "grpo", + "shared_root": "${SHARED_ROOT}", + "code_path": "${SHARED_ROOT}/common/code/verl/verl_repo", + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "grpo": { + "train_file": "${SHARED_ROOT}/datasets/gsm8k/train.parquet", + "val_file": "${SHARED_ROOT}/datasets/gsm8k/test.parquet", + "nnodes": 2, + "n_gpus_per_node": 4, + "total_epochs": 1, + "total_training_steps": 10, + "save_freq": 10, + "test_freq": -1 + }, + "ray": { + "address": "http://127.0.0.1:8265", + "entrypoint_num_cpus": 1, + "entrypoint_resources": { + "worker_node": 1 + } + }, + "runtime_env": { + "env_vars": { + "HF_ENDPOINT": "https://hf-mirror.com", + "PYTHONUNBUFFERED": "1" + } + } +} + diff --git a/src/mvp/v1.1/templates/ppo.json b/src/mvp/v1.1/templates/ppo.json new file mode 100644 index 0000000..89241a4 --- /dev/null +++ b/src/mvp/v1.1/templates/ppo.json @@ -0,0 +1,31 @@ +{ + "submission_id": "", + "workload": "ppo", + "shared_root": "${SHARED_ROOT}", + "code_path": "${SHARED_ROOT}/common/code/verl/verl_repo", + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "ppo": { + "train_file": "${SHARED_ROOT}/datasets/gsm8k/train.parquet", + "val_file": "${SHARED_ROOT}/datasets/gsm8k/test.parquet", + "nnodes": 2, + "n_gpus_per_node": 4, + "total_epochs": 1, + "total_training_steps": 10, + "save_freq": 10, + "test_freq": -1 + }, + "ray": { + "address": "http://127.0.0.1:8265", + "entrypoint_num_cpus": 1, + "entrypoint_resources": { + "worker_node": 1 + } + }, + "runtime_env": { + "env_vars": { + "HF_ENDPOINT": "https://hf-mirror.com", + "PYTHONUNBUFFERED": "1" + } + } +} + diff --git a/src/mvp/v1.1/templates/sft.json b/src/mvp/v1.1/templates/sft.json new file mode 100644 index 0000000..9d4f8af --- /dev/null +++ b/src/mvp/v1.1/templates/sft.json @@ -0,0 +1,32 @@ +{ + "submission_id": "", + "workload": "sft", + "shared_root": "${SHARED_ROOT}", + "code_path": "${SHARED_ROOT}/common/code/verl/verl_repo", + "model_id": "Qwen/Qwen2.5-0.5B-Instruct", + "sft": { + "train_file": "${SHARED_ROOT}/datasets/gsm8k_sft/train.parquet", + "val_file": null, + "nnodes": 2, + "n_gpus_per_node": 4, + "total_epochs": 1, + "total_training_steps": 10, + "save_freq": 10, + "device": "cpu" + }, + "ray": { + "address": "http://127.0.0.1:8265", + "entrypoint_num_cpus": 1, + "entrypoint_resources": { + "worker_node": 1 + } + }, + "runtime_env": { + "env_vars": { + "HF_ENDPOINT": "https://hf-mirror.com", + "PYTHONUNBUFFERED": "1", + "RAY_ADDRESS": 
"auto" + } + } +} + diff --git a/src/mvp/v1/arch.excalidraw b/src/mvp/v1/arch.excalidraw new file mode 100644 index 0000000..b9e50ce --- /dev/null +++ b/src/mvp/v1/arch.excalidraw @@ -0,0 +1,1877 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "9Ql3xet2wP4Oy9RJ4s-8H", + "type": "rectangle", + "x": 165.33361053466797, + "y": 124.66671752929688, + "width": 201.66666412353516, + "height": 144.66665649414062, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a0", + "roundness": { + "type": 3 + }, + "seed": 1490759950, + "version": 404, + "versionNonce": 1106431423, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "Yx_fL1nYHbxINMBHYImpi" + }, + { + "id": "bAFwE3wo46b9BFt_2Gnm2", + "type": "arrow" + }, + { + "id": "Qzy3gfKzdZyrTwzOpuh86", + "type": "arrow" + }, + { + "id": "zj5n4D3014Kl-BdrNuRZp", + "type": "arrow" + }, + { + "id": "cqQ2Ij98wYdbKqXQcPhxe", + "type": "arrow" + } + ], + "updated": 1766373694946, + "link": null, + "locked": false + }, + { + "id": "Yx_fL1nYHbxINMBHYImpi", + "type": "text", + "x": 232.60698318481445, + "y": 184.5000457763672, + "width": 67.11991882324219, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a1", + "roundness": null, + "seed": 122704078, + "version": 259, + "versionNonce": 104768575, + "isDeleted": false, + "boundElements": [], + "updated": 1766372702157, + "link": null, + "locked": false, + "text": "scripts", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "9Ql3xet2wP4Oy9RJ4s-8H", + "originalText": "scripts", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "1BW7RJyVw0yjg93vAIlWT", + "type": "rectangle", + "x": 813.8334465026855, + "y": 81.00010681152344, + "width": 159.9999771118164, + "height": 88.66665649414062, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 929976718, + "version": 789, + "versionNonce": 761000657, + "isDeleted": false, + "boundElements": [ + { + "id": "qcyVAzD12V9EAxE15fwAq", + "type": "arrow" + }, + { + "id": "Qzy3gfKzdZyrTwzOpuh86", + "type": "arrow" + } + ], + "updated": 1766373762851, + "link": null, + "locked": false + }, + { + "id": "nIO4qa9Aj4LF_WlHt2BNO", + "type": "rectangle", + "x": 455.8334159851074, + "y": 125.66667175292969, + "width": 201.66666412353516, + "height": 144.66665649414062, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 2137052434, + "version": 311, + "versionNonce": 1823398289, + "isDeleted": false, + "boundElements": [ + { + "id": "bAFwE3wo46b9BFt_2Gnm2", + "type": "arrow" + }, + { + "id": "QgI8Gn67BTTI1MdOhOjjn", + "type": "arrow" + }, + { + "id": 
"qcyVAzD12V9EAxE15fwAq", + "type": "arrow" + } + ], + "updated": 1766373745383, + "link": null, + "locked": false + }, + { + "id": "bAFwE3wo46b9BFt_2Gnm2", + "type": "arrow", + "x": 372.0002746582031, + "y": 196.9000457763672, + "width": 252.06685104370115, + "height": 90.56665649414063, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a6", + "roundness": null, + "seed": 1158070290, + "version": 571, + "versionNonce": 231145393, + "isDeleted": false, + "boundElements": [], + "updated": 1766373712586, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 114.08342552185059, + 0 + ], + [ + 114.08342552185059, + -90.56665649414063 + ], + [ + 252.06685104370115, + -90.56665649414063 + ], + [ + 252.06685104370115, + -55.56665649414063 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "9Ql3xet2wP4Oy9RJ4s-8H", + "fixedPoint": [ + 1.0247933887424108, + 0.4993087557117625 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "e58le7CfnSkWVb-LoMmj4", + "fixedPoint": [ + 0.49736842105263096, + -0.13157894736842105 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "aBQ9npStDj077n0B7_JZ3", + "type": "rectangle", + "x": 814.000072479248, + "y": 227.6667022705078, + "width": 159.9999771118164, + "height": 88.66665649414062, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a7", + "roundness": { + "type": 3 + }, + "seed": 1963206674, + "version": 814, + "versionNonce": 371608031, + "isDeleted": false, + "boundElements": [ + { + "id": "QgI8Gn67BTTI1MdOhOjjn", + "type": "arrow" + }, + { + "id": "zj5n4D3014Kl-BdrNuRZp", + "type": "arrow" + } + ], + "updated": 1766373791812, + "link": null, + "locked": false + }, + { + "id": "QgI8Gn67BTTI1MdOhOjjn", + "type": "arrow", + "x": 648.1671257019043, + "y": 165.23338928222657, + "width": 145.8328742980957, + "height": 121.4333282470703, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aC", + "roundness": null, + "seed": 990553042, + "version": 371, + "versionNonce": 1356695409, + "isDeleted": false, + "boundElements": [], + "updated": 1766373712586, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 80.41647338867188, + 0 + ], + [ + 80.41647338867188, + 121.4333282470703 + ], + [ + 145.8328742980957, + 121.4333282470703 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "e58le7CfnSkWVb-LoMmj4", + "fixedPoint": [ + 1.131578947368421, + 0.49736842105263174 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "aBQ9npStDj077n0B7_JZ3", + "fixedPoint": [ + -0.12500047087676108, + 0.66541378226761 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "atIqduRWNX97Sp6yKe4hj", + "type": "rectangle", + "x": 431.1368826709601, + "y": 
26.199444213977472, + "width": 611.63434968004, + "height": 645.8789403469411, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aw", + "roundness": { + "type": 3 + }, + "seed": 311282494, + "version": 376, + "versionNonce": 190431967, + "isDeleted": false, + "boundElements": [], + "updated": 1766373009377, + "link": null, + "locked": false + }, + { + "id": "k6SXmpRXbeQSbRrXG_o6a", + "type": "text", + "x": 656.1800675420305, + "y": -12.200757945542989, + "width": 199.19985961914062, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "dotted", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "ax", + "roundness": null, + "seed": 570087970, + "version": 192, + "versionNonce": 158307807, + "isDeleted": false, + "boundElements": [], + "updated": 1766373821110, + "link": null, + "locked": false, + "text": "a single H20 machine", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "a single H20 machine", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "qcyVAzD12V9EAxE15fwAq", + "type": "arrow", + "x": 648.1671257019043, + "y": 165.23338928222657, + "width": 160.66632080078125, + "height": 39.99995422363281, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b03", + "roundness": null, + "seed": 696982833, + "version": 370, + "versionNonce": 1628299665, + "isDeleted": false, + "boundElements": null, + "updated": 1766373712586, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 80.33316040039062, + 0 + ], + [ + 80.33316040039062, + -39.99995422363281 + ], + [ + 160.66632080078125, + -39.99995422363281 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "e58le7CfnSkWVb-LoMmj4", + "fixedPoint": [ + 1.131578947368421, + 0.49736842105263174 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "1BW7RJyVw0yjg93vAIlWT", + "fixedPoint": [ + -0.031250004470349, + 0.4988721803217357 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "Qzy3gfKzdZyrTwzOpuh86", + "type": "arrow", + "x": 371.8835678755345, + "y": 173.74970208077258, + "width": 520.5167182267604, + "height": 132.74959526924914, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b04", + "roundness": null, + "seed": 626515153, + "version": 428, + "versionNonce": 451690431, + "isDeleted": false, + "boundElements": null, + "updated": 1766372702159, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 21.28340523847919, + 0 + ], + [ + 21.28340523847919, + -132.74959526924914 + ], + [ + 520.5167182267604, + -132.74959526924914 + ], + [ + 520.5167182267604, + -111.41628228096789 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": 
"9Ql3xet2wP4Oy9RJ4s-8H", + "fixedPoint": [ + 1.024214677416095, + 0.3392833272086004 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "eeIv7mr_n3Lg6B3DlXJHR", + "fixedPoint": [ + 0.49736842105263096, + -0.13157894736842105 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true, + "fixedSegments": [ + { + "index": 2, + "start": [ + 21.28340523847919, + 0 + ], + "end": [ + 21.28340523847919, + -132.74959526924914 + ] + } + ], + "startIsSpecial": false, + "endIsSpecial": false + }, + { + "id": "zj5n4D3014Kl-BdrNuRZp", + "type": "arrow", + "x": 371.87357276423296, + "y": 221.26066759479318, + "width": 525.8600263263431, + "height": 135.07269116985526, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b05", + "roundness": null, + "seed": 2137319889, + "version": 378, + "versionNonce": 1170250751, + "isDeleted": false, + "boundElements": null, + "updated": 1766372702159, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 35.29340034978071, + 0 + ], + [ + 35.29340034978071, + 135.07269116985526 + ], + [ + 525.8600263263431, + 135.07269116985526 + ], + [ + 525.8600263263431, + 116.40612622844901 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "9Ql3xet2wP4Oy9RJ4s-8H", + "fixedPoint": [ + 1.0241651148800903, + 0.6677001626107852 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "c4sciwjA9ZH_GjmspcBan", + "fixedPoint": [ + 0.49736842105262796, + 1.131578947368421 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true, + "fixedSegments": [ + { + "index": 2, + "start": [ + 35.29340034978071, + 0 + ], + "end": [ + 35.29340034978071, + 135.07269116985526 + ] + } + ], + "startIsSpecial": false, + "endIsSpecial": false + }, + { + "id": "bzQAQDSv4fPQAKJkqw9an", + "type": "arrow", + "x": 287.8335380554199, + "y": -76.33323669433594, + "width": 70.00006103515625, + "height": 26, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b06", + "roundness": null, + "seed": 301869471, + "version": 103, + "versionNonce": 2132861809, + "isDeleted": false, + "boundElements": null, + "updated": 1766372404307, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 35.000030517578125, + 0 + ], + [ + 35.000030517578125, + -26 + ], + [ + 70.00006103515625, + -26 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "JoAr6EMunnvVMp7Uo00g5", + "type": "text", + "x": 379.1668510437012, + "y": -102.99992370605469, + "width": 165.33987426757812, + "height": 25, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b07", + "roundness": null, + "seed": 1372433471, + "version": 19, + "versionNonce": 1494393073, + "isDeleted": false, + "boundElements": null, + "updated": 1766372417246, + "link": null, + 
"locked": false, + "text": "start ray cluster", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "start ray cluster", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "mWLybQDTFo_Um3eT0cEDa", + "type": "arrow", + "x": 577.8335380554199, + "y": -84.33323669433594, + "width": 59.33331298828125, + "height": 24.66668701171875, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b08", + "roundness": null, + "seed": 2218161, + "version": 62, + "versionNonce": 49511103, + "isDeleted": false, + "boundElements": null, + "updated": 1766372425873, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 29.666656494140625, + 0 + ], + [ + 29.666656494140625, + -24.66668701171875 + ], + [ + 59.33331298828125, + -24.66668701171875 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": null, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "sy0AQFPiKMUpC_fNA1u5Y", + "type": "text", + "x": 653.8335380554199, + "y": -102.99992370605469, + "width": 312.499755859375, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b09", + "roundness": null, + "seed": 516560529, + "version": 57, + "versionNonce": 1162137087, + "isDeleted": false, + "boundElements": null, + "updated": 1766372458567, + "link": null, + "locked": false, + "text": "ray job submit / drivier - worker", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "ray job submit / drivier - worker", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "e58le7CfnSkWVb-LoMmj4", + "type": "ellipse", + "x": 605.1671257019043, + "y": 146.33338928222656, + "width": 38, + "height": 38, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#99e9f2", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0C", + "roundness": { + "type": 2 + }, + "seed": 915719889, + "version": 349, + "versionNonce": 1238458833, + "isDeleted": false, + "boundElements": [ + { + "id": "bAFwE3wo46b9BFt_2Gnm2", + "type": "arrow" + }, + { + "id": "qcyVAzD12V9EAxE15fwAq", + "type": "arrow" + }, + { + "id": "QgI8Gn67BTTI1MdOhOjjn", + "type": "arrow" + }, + { + "id": "cqQ2Ij98wYdbKqXQcPhxe", + "type": "arrow" + } + ], + "updated": 1766373712585, + "link": null, + "locked": false + }, + { + "id": "eeIv7mr_n3Lg6B3DlXJHR", + "type": "ellipse", + "x": 873.5002861022949, + "y": 67.33341979980469, + "width": 38, + "height": 38, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0D", + "roundness": { + "type": 2 + }, + "seed": 1674226303, + "version": 428, + "versionNonce": 1555106719, + "isDeleted": false, + "boundElements": [ + { + "id": "Qzy3gfKzdZyrTwzOpuh86", + "type": "arrow" + } + ], + "updated": 1766372702159, + 
"link": null, + "locked": false + }, + { + "id": "c4sciwjA9ZH_GjmspcBan", + "type": "ellipse", + "x": 878.8335990905762, + "y": 294.6667938232422, + "width": 38, + "height": 38, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0E", + "roundness": { + "type": 2 + }, + "seed": 2058462079, + "version": 476, + "versionNonce": 803991519, + "isDeleted": false, + "boundElements": [ + { + "id": "zj5n4D3014Kl-BdrNuRZp", + "type": "arrow" + } + ], + "updated": 1766372702159, + "link": null, + "locked": false + }, + { + "id": "QQdAB0FVqHTGhVCKstHrm", + "type": "ellipse", + "x": 308.1669120788574, + "y": -185.9999237060547, + "width": 38, + "height": 38, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#99e9f2", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0F", + "roundness": { + "type": 2 + }, + "seed": 156527057, + "version": 305, + "versionNonce": 1422415839, + "isDeleted": false, + "boundElements": [], + "updated": 1766372581044, + "link": null, + "locked": false + }, + { + "id": "37ojV_5RQu-H5_oLEw_CC", + "type": "text", + "x": 375.1668510437012, + "y": -176.9999237060547, + "width": 136.61990356445312, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0G", + "roundness": null, + "seed": 122498079, + "version": 15, + "versionNonce": 121837457, + "isDeleted": false, + "boundElements": null, + "updated": 1766372589741, + "link": null, + "locked": false, + "text": "ray head node", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "ray head node", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "ZyQitTq4F7KArc1LsSU0L", + "type": "ellipse", + "x": 578.1668510437012, + "y": -185.33323669433594, + "width": 38, + "height": 38, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0H", + "roundness": { + "type": 2 + }, + "seed": 666768689, + "version": 440, + "versionNonce": 1110150815, + "isDeleted": false, + "boundElements": [], + "updated": 1766372594958, + "link": null, + "locked": false + }, + { + "id": "B_AKAU5jOYUjwmhK54Ddr", + "type": "text", + "x": 657.5235862731934, + "y": -178.16656494140625, + "width": 154.65988159179688, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "#ffc9c9", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0I", + "roundness": null, + "seed": 1500425873, + "version": 72, + "versionNonce": 150248991, + "isDeleted": false, + "boundElements": [], + "updated": 1766372606781, + "link": null, + "locked": false, + "text": "ray worker node", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "ray worker node", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "WPOvDTFRcRTqysrgfilw_", + "type": "rectangle", + 
"x": 453.1667900085449, + "y": 395.0000762939453, + "width": 572.6666870117189, + "height": 251.3333740234375, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0K", + "roundness": null, + "seed": 62506143, + "version": 121, + "versionNonce": 1380555377, + "isDeleted": false, + "boundElements": null, + "updated": 1766373299556, + "link": null, + "locked": false + }, + { + "id": "icDpH3wy-c2_99TXtmru5", + "type": "rectangle", + "x": 469.1667900085449, + "y": 410.87457554413584, + "width": 83.33331298828125, + "height": 138.7922487966845, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0L", + "roundness": null, + "seed": 292536607, + "version": 130, + "versionNonce": 398901169, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "oYnPwXC5ulNTf8ayVoUHX" + } + ], + "updated": 1766376371670, + "link": null, + "locked": false + }, + { + "id": "oYnPwXC5ulNTf8ayVoUHX", + "type": "text", + "x": 476.93347549438477, + "y": 455.2706999424781, + "width": 67.79994201660156, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0M", + "roundness": null, + "seed": 964724095, + "version": 75, + "versionNonce": 1019940831, + "isDeleted": false, + "boundElements": null, + "updated": 1766376371670, + "link": null, + "locked": false, + "text": "datase\nts", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "icDpH3wy-c2_99TXtmru5", + "originalText": "datasets", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "s3rGKdwzC87Z1v9gN0N-5", + "type": "rectangle", + "x": 582.166820526123, + "y": 410.4639351318977, + "width": 83.33331298828125, + "height": 138.7922487966845, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0N", + "roundness": null, + "seed": 401691633, + "version": 190, + "versionNonce": 700169215, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "D2W7CAX99N7IpvqJQP6eX" + } + ], + "updated": 1766376371670, + "link": null, + "locked": false + }, + { + "id": "D2W7CAX99N7IpvqJQP6eX", + "type": "text", + "x": 613.1934852600098, + "y": 467.36005953023994, + "width": 21.279983520507812, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0O", + "roundness": null, + "seed": 325299665, + "version": 138, + "versionNonce": 770525041, + "isDeleted": false, + "boundElements": [], + "updated": 1766376371670, + "link": null, + "locked": false, + "text": "hf", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "s3rGKdwzC87Z1v9gN0N-5", + "originalText": "hf", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": 
"EGfEHSzmWYNx5ec9bP-9v", + "type": "rectangle", + "x": 701.5001945495605, + "y": 409.6427294956321, + "width": 83.33331298828125, + "height": 138.7922487966845, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0P", + "roundness": null, + "seed": 974114385, + "version": 214, + "versionNonce": 2090152273, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "XIfw_ABOr1dlUCUXEBJqA" + } + ], + "updated": 1766376371670, + "link": null, + "locked": false + }, + { + "id": "XIfw_ABOr1dlUCUXEBJqA", + "type": "text", + "x": 722.9068641662598, + "y": 466.53885389397436, + "width": 40.51997375488281, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0Q", + "roundness": null, + "seed": 881056817, + "version": 164, + "versionNonce": 663619647, + "isDeleted": false, + "boundElements": [], + "updated": 1766376371670, + "link": null, + "locked": false, + "text": "jobs", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "EGfEHSzmWYNx5ec9bP-9v", + "originalText": "jobs", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "Fe9YUb1NNu32IhvievWnr", + "type": "rectangle", + "x": 812.8334465026855, + "y": 408.8214486711559, + "width": 83.33331298828125, + "height": 138.7922487966845, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0R", + "roundness": null, + "seed": 66661759, + "version": 268, + "versionNonce": 1178642527, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "UZhPJS_QBKW_LMYwu1enR" + } + ], + "updated": 1766376371670, + "link": null, + "locked": false + }, + { + "id": "UZhPJS_QBKW_LMYwu1enR", + "type": "text", + "x": 821.1101188659668, + "y": 465.71757306949814, + "width": 66.77996826171875, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0S", + "roundness": null, + "seed": 1284842911, + "version": 227, + "versionNonce": 969134353, + "isDeleted": false, + "boundElements": [], + "updated": 1766376371670, + "link": null, + "locked": false, + "text": "output", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "Fe9YUb1NNu32IhvievWnr", + "originalText": "output", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "r8maqUMI8CEoTqgUHwtnZ", + "type": "rectangle", + "x": 924.8335075378418, + "y": 408.0001678466797, + "width": 83.33331298828125, + "height": 138.7922487966845, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0T", + "roundness": null, + "seed": 1203268831, + "version": 320, + "versionNonce": 204048113, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": 
"nL0ZOKRltZyanG1aa7hOW" + } + ], + "updated": 1766376371670, + "link": null, + "locked": false + }, + { + "id": "nL0ZOKRltZyanG1aa7hOW", + "type": "text", + "x": 951.7201805114746, + "y": 464.8962922450219, + "width": 29.559967041015625, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0U", + "roundness": null, + "seed": 150305023, + "version": 273, + "versionNonce": 1246452895, + "isDeleted": false, + "boundElements": [], + "updated": 1766376371670, + "link": null, + "locked": false, + "text": "ray", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "r8maqUMI8CEoTqgUHwtnZ", + "originalText": "ray", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "OmfvpsuW0dhubnI5NH1Ni", + "type": "text", + "x": 675.8334770202637, + "y": 593.0000762939453, + "width": 121.67990112304688, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0V", + "roundness": null, + "seed": 1166095967, + "version": 60, + "versionNonce": 643125617, + "isDeleted": false, + "boundElements": null, + "updated": 1766373325167, + "link": null, + "locked": false, + "text": "/mnt/shared", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "/mnt/shared", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "cqQ2Ij98wYdbKqXQcPhxe", + "type": "arrow", + "x": 371.9714008624129, + "y": 208.42023305299114, + "width": 252.09572483949137, + "height": 50.3333740234375, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0W", + "roundness": null, + "seed": 1948514207, + "version": 127, + "versionNonce": 977608497, + "isDeleted": false, + "boundElements": null, + "updated": 1766373713002, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 96.77889477662518, + 0 + ], + [ + 96.77889477662518, + 31.246530252672926 + ], + [ + 252.09572483949137, + 31.246530252672926 + ], + [ + 252.09572483949137, + -19.086843770764574 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "9Ql3xet2wP4Oy9RJ4s-8H", + "fixedPoint": [ + 1.0246502128937116, + 0.5789413922556957 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "e58le7CfnSkWVb-LoMmj4", + "fixedPoint": [ + 0.49736842105263096, + 1.131578947368421 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "arrow", + "elbowed": true, + "fixedSegments": [ + { + "index": 3, + "start": [ + 96.77889477662518, + 31.246530252672926 + ], + "end": [ + 252.09572483949137, + 31.246530252672926 + ] + } + ], + "startIsSpecial": false, + "endIsSpecial": false + }, + { + "id": "xn_M6D4MoCOvg2I2eSA3o", + "type": "text", + "x": 491.16682052612305, + "y": 237.66676330566406, + "width": 137.33990478515625, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + 
"frameId": null, + "index": "b0X", + "roundness": null, + "seed": 485981873, + "version": 51, + "versionNonce": 224706367, + "isDeleted": false, + "boundElements": null, + "updated": 1766373736215, + "link": null, + "locked": false, + "text": "ray job submit", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "ray job submit", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "n0XX1HzCpzHfeo4hTRson", + "type": "text", + "x": 456.5001640319824, + "y": 281.00013732910156, + "width": 196.61985778808594, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0Y", + "roundness": null, + "seed": 960522577, + "version": 46, + "versionNonce": 1758456319, + "isDeleted": false, + "boundElements": null, + "updated": 1766373758962, + "link": null, + "locked": false, + "text": "head node container", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "head node container", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "-rnnyMPU-tQs8UVNBCb7C", + "type": "text", + "x": 792.5001945495605, + "y": 176.33338928222656, + "width": 225.5198211669922, + "height": 50, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0Z", + "roundness": null, + "seed": 501756561, + "version": 78, + "versionNonce": 1893991729, + "isDeleted": false, + "boundElements": null, + "updated": 1766373794871, + "link": null, + "locked": false, + "text": "worker node containers\n 4 x H20", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "worker node containers\n 4 x H20", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "qWTdVy6o3A4fkZsGRMCKB", + "type": "text", + "x": 483.4334098815918, + "y": 557.5000762939453, + "width": 40.1335388183593, + "height": 16.722307840983046, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0a", + "roundness": null, + "seed": 438165041, + "version": 42, + "versionNonce": 1297545873, + "isDeleted": false, + "boundElements": null, + "updated": 1766376434059, + "link": null, + "locked": false, + "text": "数据集", + "fontSize": 13.377846272786437, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "数据集", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "Zr04JN1SwCMGrPB7Otn5n", + "type": "text", + "x": 599.7667686462403, + "y": 556.6389223734539, + "width": 53.47998046875, + "height": 16.722307840983046, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0b", + "roundness": null, + "seed": 1427393105, + "version": 108, + "versionNonce": 20002801, + "isDeleted": false, + "boundElements": [], + "updated": 1766376447293, + "link": null, + "locked": false, + 
"text": "基座模型", + "fontSize": 13.377846272786437, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "基座模型", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "U5mN9zuDxufwaTsXAez1v", + "type": "text", + "x": 683.1001121520997, + "y": 553.9721743265789, + "width": 116.947265625, + "height": 33.44461568196609, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0c", + "roundness": null, + "seed": 1016566161, + "version": 228, + "versionNonce": 460195775, + "isDeleted": false, + "boundElements": [], + "updated": 1766376548390, + "link": null, + "locked": false, + "text": "job级别\nckpt, config, log等", + "fontSize": 13.377846272786437, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "top", + "containerId": null, + "originalText": "job级别\nckpt, config, log等", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "kyEqz0rZKz404SUTYZRai", + "type": "text", + "x": 931.766707611084, + "y": 557.3056093851726, + "width": 74.01625061035156, + "height": 33.44461568196609, + "angle": 0, + "strokeColor": "#e03131", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "b0d", + "roundness": null, + "seed": 1742834719, + "version": 152, + "versionNonce": 2136750623, + "isDeleted": false, + "boundElements": [], + "updated": 1766376569303, + "link": null, + "locked": false, + "text": "系统级\nsession日志", + "fontSize": 13.377846272786437, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "top", + "containerId": null, + "originalText": "系统级\nsession日志", + "autoResize": true, + "lineHeight": 1.25 + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file