From ac9c80ed8cab0aca3044d66d7684d851ae6ed78f Mon Sep 17 00:00:00 2001 From: yuyr Date: Mon, 12 Jan 2026 17:24:34 +0800 Subject: [PATCH] =?UTF-8?q?V3.9=20=E9=87=8D=E6=9E=84ui.py=EF=BC=8C?= =?UTF-8?q?=E6=AF=8F=E4=B8=AA=E7=BD=91=E9=A1=B5=E6=8B=86=E5=88=86=E7=8B=AC?= =?UTF-8?q?=E7=AB=8B=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 + README.md | 165 ++ specs/mvp/v3.9/ui_refactor_plan.md | 108 ++ src/mvp/configs/dev.yaml | 4 + src/mvp/configs/dev_v30.yaml | 4 + src/mvp/docker-compose.yaml | 7 + src/mvp/py/argus/service/ui.py | 1441 +----------------- src/mvp/py/argus/ui/__init__.py | 6 + src/mvp/py/argus/ui/assets/__init__.py | 2 + src/mvp/py/argus/ui/assets/base_css.py | 33 + src/mvp/py/argus/ui/assets/base_js.py | 70 + src/mvp/py/argus/ui/layout/__init__.py | 2 + src/mvp/py/argus/ui/layout/nav.py | 24 + src/mvp/py/argus/ui/layout/page.py | 36 + src/mvp/py/argus/ui/pages/__init__.py | 2 + src/mvp/py/argus/ui/pages/admin.py | 124 ++ src/mvp/py/argus/ui/pages/data.py | 93 ++ src/mvp/py/argus/ui/pages/login.py | 88 ++ src/mvp/py/argus/ui/pages/serving.py | 261 ++++ src/mvp/py/argus/ui/pages/task_detail.py | 88 ++ src/mvp/py/argus/ui/pages/task_logs.py | 48 + src/mvp/py/argus/ui/pages/task_new.py | 538 +++++++ src/mvp/py/argus/ui/pages/tasks.py | 101 ++ src/mvp/py/argus/ui/routes.py | 22 + src/mvp/scripts/30_prepare_data_and_model.sh | 26 +- src/mvp/scripts/run_all_v30_api.sh | 31 +- walkthrough.md | 27 + 27 files changed, 1896 insertions(+), 1456 deletions(-) create mode 100644 README.md create mode 100644 specs/mvp/v3.9/ui_refactor_plan.md create mode 100644 src/mvp/py/argus/ui/__init__.py create mode 100644 src/mvp/py/argus/ui/assets/__init__.py create mode 100644 src/mvp/py/argus/ui/assets/base_css.py create mode 100644 src/mvp/py/argus/ui/assets/base_js.py create mode 100644 src/mvp/py/argus/ui/layout/__init__.py create mode 100644 src/mvp/py/argus/ui/layout/nav.py create mode 100644 src/mvp/py/argus/ui/layout/page.py create mode 100644 src/mvp/py/argus/ui/pages/__init__.py create mode 100644 src/mvp/py/argus/ui/pages/admin.py create mode 100644 src/mvp/py/argus/ui/pages/data.py create mode 100644 src/mvp/py/argus/ui/pages/login.py create mode 100644 src/mvp/py/argus/ui/pages/serving.py create mode 100644 src/mvp/py/argus/ui/pages/task_detail.py create mode 100644 src/mvp/py/argus/ui/pages/task_logs.py create mode 100644 src/mvp/py/argus/ui/pages/task_new.py create mode 100644 src/mvp/py/argus/ui/pages/tasks.py create mode 100644 src/mvp/py/argus/ui/routes.py create mode 100644 walkthrough.md diff --git a/.gitignore b/.gitignore index a7f185a..f9a0a18 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ __pycache__/ .pytest_cache/ .coverage htmlcov/ +AGENTS.md \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..45ab2cd --- /dev/null +++ b/README.md @@ -0,0 +1,165 @@ +# Argus Cluster + +本仓库主要代码位于 `src/mvp/py/`,单元测试位于 `src/mvp/py/tests/`(`pytest.ini` 已配置 `testpaths`)。 + +## 使用 uv 创建/激活虚拟环境并安装依赖 + +前置条件:已安装 `uv`(Astral 的 Python 包/虚拟环境工具)。 + +1) 在仓库根目录创建虚拟环境: + +```bash +uv venv .venv +``` + +2) 激活虚拟环境: + +```bash +source .venv/bin/activate +``` + +3) 在虚拟环境中安装依赖(运行与测试依赖): + +```bash +uv pip install -r src/mvp/py/requirements.txt -r src/mvp/py/requirements-dev.txt +``` + +## 运行单元测试 + +在仓库根目录执行: + +```bash +pytest +``` + +如需显式使用 Python 模块方式: + +```bash +python -m pytest +``` + +## 远端开发(h1):同步代码、构建镜像、初始化共享目录、拉起 Ray 集群 + +`src/mvp/docker-compose.yaml` 以 `src/mvp/` 为工作目录,并挂载 `../../shared` 与 `../../verl`,因此推荐远端目录结构如下: + +- `/home2/argus/infra/mvp/src/mvp/`:本仓库的 `src/mvp/` 内容 +- `/home2/argus/infra/mvp/shared/`:共享目录(模拟/对齐 NFS) +- `/home2/argus/infra/mvp/verl/`:训练代码仓库(脚本会检查该目录存在) + +1) 同步代码到远端(只同步 `src/mvp/`,确保相对路径挂载正确): + +```bash +ssh argus@h1 "mkdir -p /home2/argus/infra/mvp/src/mvp" +rsync -av --delete src/mvp/ argus@h1:/home2/argus/infra/mvp/src/mvp/ +``` + +2) 在远端准备 `verl` 仓库(若已存在可跳过): + +```bash +ssh argus@h1 "mkdir -p /home2/argus/infra/mvp && test -d /home2/argus/infra/mvp/verl || echo '缺少 /home2/argus/infra/mvp/verl(请先 git clone)'" + +# 下载verl +ssh argus@h1 "cd /home2/argus/infra/mvp && git clone https://github.com/volcengine/verl.git" +``` + +3) 登录远端并初始化共享目录环境(会创建 `../../shared/...` 等目录): + +```bash +ssh argus@h1 +cd /home2/argus/infra/mvp/src/mvp +./scripts/00_prereq_check.sh +``` + +4) 构建 Ray 节点镜像并拉起集群(首次或镜像不存在时建议强制构建): + +```bash +cd /home2/argus/infra/mvp/src/mvp +BUILD=1 ./scripts/01_up.sh +``` + +5) 验证集群状态: + +```bash +./scripts/50_status.sh +curl -sS http://127.0.0.1:8265/api/version | head +docker ps --format 'table {{.Names}}\t{{.Status}}\t{{.Ports}}' | grep -E 'argus-ray-|sftpgo|wandb' || true +``` + +6) 关闭集群: + +```bash +./scripts/02_down.sh +``` + +## 远端(h1):端口映射、创建 W&B user用户 +使用以下命令将h1的端口映射到本机 +``` + ssh -p 12022 -L 8265:127.0.0.1:8265 -L 8080:127.0.0.1:8080 -L 8081:127.0.0.1:8081 -L 2022:127.0.0.1:2022 -L 8090:127.0.0.1:8090 -L 8000:127.0.0.1:8000 -o ProxyJump=ssh@jump.nasp.fit:36022 nasp@192.168.20.121 +``` +其中: +- 8265: ray dashboard +- 8080: webui & API 服务端口 +- 8081: sftpgo web client +- 2022: sftpgo sftp协议端口 +- 8090: weight & bias 网站端口 +- 8000: model serving openai服务端口 + + +## 远端(h1):预先准备数据/模型,启动与关闭 API Server + +下面命令均在远端执行: + +```bash +ssh argus@h1 +cd /home2/argus/infra/mvp/src/mvp +``` + +1) 预先下载数据集与模型(写入共享目录,幂等可重复执行): + +```bash +# 可选:指定要缓存的模型(默认 Qwen/Qwen2.5-0.5B-Instruct) +MODEL_ID="Qwen/Qwen2.5-0.5B-Instruct" ./scripts/30_prepare_data_and_model.sh +``` + +2) 安装 API 依赖(在 head 容器内 best-effort 安装 `fastapi/uvicorn/yaml` 等): + +```bash +./scripts/12_install_api_deps.sh +``` + +3) 启动 API Server(监听 `:8080`,需要设置鉴权 token): + +```bash +export MVP_INTERNAL_TOKEN="your-dev-token" +./scripts/60_start_api.sh +curl -sS http://127.0.0.1:8080/docs | head +``` + +查看 API 进程状态(基于 pid 文件): + +```bash +./scripts/62_status_api.sh +``` + +4) 关闭 API Server: + +```bash +./scripts/61_stop_api.sh +``` + +## 远端(h1):W&B Local(wandb)License 激活常见问题(端口转发) + +如果你通过 SSH 端口转发访问 W&B UI,但在 “Add license” 页面点击 “Update” 时提示: +`Unable to reach the backend api`,通常是因为 W&B 前端使用了“绝对 URL”去请求后端 API, +而该 URL 的 host/port 与你当前端口转发的地址不一致(例如 UI 打开在 `http://localhost:8090`, +但前端去请求 `http://localhost:8080`)。 + +建议二选一: + +- 方式 A(推荐,最简单):把本地 `8080` 转发到远端 `8090`,然后用 `http://localhost:8080` 打开 UI: + - `ssh -L 8080:127.0.0.1:8090 argus@h1` +- 方式 B:在远端启动容器前设置 `WANDB_HOST`(与浏览器地址一致),并重建 `wandb` 容器: + - `export WANDB_HOST=http://localhost:8090` + - `docker compose -f /home2/argus/infra/mvp/src/mvp/docker-compose.yaml up -d --force-recreate wandb` + +也可以通过设置 `WANDB_LICENSE=...` 在容器启动时注入 license,绕过 UI 更新步骤。 diff --git a/specs/mvp/v3.9/ui_refactor_plan.md b/specs/mvp/v3.9/ui_refactor_plan.md new file mode 100644 index 0000000..2dc3522 --- /dev/null +++ b/specs/mvp/v3.9/ui_refactor_plan.md @@ -0,0 +1,108 @@ +# v3.9 UI 重构方案(保持功能不变) + +## 背景与问题 + +当前 `src/mvp/py/argus/service/ui.py` 单文件约 1400+ 行,包含: + +- 全局 CSS/JS(长字符串) +- 布局渲染(nav/page 拼接) +- 11 个页面的 HTML + 大段内嵌 JS(包含 TaskSpec 模板与表单逻辑) + +导致:变更难定位、合并冲突多、缺少模块边界、复用困难、测试覆盖薄弱。 + +## 目标(功能不变) + +- **路由与页面行为完全不变**:URL、返回内容、按钮/表单行为、localStorage key(`mvp_token`/`mvp_sftp_password`)、API 调用路径保持不变。 +- **不引入前端构建链/新依赖**(仍然用纯字符串/轻量模板函数)。 +- 将 UI 拆分为可维护的多个文件(放到 `src/mvp/py/argus/ui/`)。 +- 增加最小的单测(确保路由可访问、关键 DOM 标识存在)。 + +## 非目标 + +- 不重做 UI 样式/交互;不引入 React/Vue;不改后端 API。 +- 不新增鉴权逻辑(仍然是浏览器 localStorage + Bearer token)。 + +## 拆分后的目录结构(建议) + +新增包:`src/mvp/py/argus/ui/` + +``` +argus/ui/ + __init__.py # register_ui_routes(app) 统一入口 + assets/ + base_css.py # BASE_CSS 常量 + base_js.py # BASE_JS 常量(apiFetch/apiJson 等通用函数) + layout/ + nav.py # nav(active) + 链接配置 + page.py # page(title, active, body, script, extra_head=...) + pages/ + login.py # /ui/login + tasks.py # /ui/tasks + task_new.py # /ui/tasks/new(模板常量 + 表单 JS) + task_detail.py # /ui/tasks/{task_id} + task_logs.py # /ui/tasks/{task_id}/logs + serving.py # /ui/serving, /ui/serving/new, /ui/serving/{model_key} + data.py # /ui/data + admin.py # /ui/admin + routes.py # 将各 pages.register(app) 聚合注册 +``` + +兼容层(可选但推荐):保留 `src/mvp/py/argus/service/ui.py` 仅做转发: + +```py +from argus.ui import register_ui_routes +``` + +这样可以避免一次性改动 `service/app.py` 的 import 路径,减少风险。 + +## 页面拆分原则 + +每个 page 模块提供两个函数: + +- `render(...) -> HTMLResponse`:只负责拼接 body/script(不直接碰 FastAPI app)。 +- `register(app: FastAPI) -> None`:只负责挂载路由(`@app.get(...)`)。 + +通用能力下沉: + +- `_BASE_CSS`/`_BASE_JS` 移到 `assets/`。 +- `_nav()`、`_page()` 移到 `layout/`。 +- 大块常量(TaskSpec 模板、UI 文案)放在页面模块同文件顶部,避免散落在函数内部。 + +## 资源交付方式(两种可选) + +### 方案 A(最稳):继续内联 CSS/JS,但拆到不同 Python 文件 + +- `page()` 内继续 ``、``。 +- 只改变代码组织,不改变浏览器加载方式,风险最低。 + +### 方案 B(推荐中期):新增静态端点分发资源 + +新增: +- `GET /ui/assets/base.css` +- `GET /ui/assets/base.js` + +页面改为 `` + ``。 +优点:减少 HTML 体积、浏览器缓存更好;缺点:需要确认反向代理/中间件不拦截这些路由。 + +建议 v3.9 先落地方案 A,稳定后再做方案 B。 + +## 迁移步骤(建议分 3 次 PR) + +1) **抽公共层**:引入 `argus/ui/assets/*`、`argus/ui/layout/*`,保持 UI 输出完全一致;`service/ui.py` 仍在但内部改为调用新 layout(或先不动)。 +2) **按页面迁移**:逐个把 routes 迁移到 `argus/ui/pages/*`,每迁一个页面就加一个最小测试用例(200 + 关键文本存在)。 +3) **清理与稳定**:`service/ui.py` 变为兼容转发;可选引入 `/ui/assets/*` 静态端点(方案 B)。 + +## 测试策略(最小但有效) + +新增 `src/mvp/py/tests/test_ui_pages.py`: + +- 创建 FastAPI app(复用现有测试的 app 初始化方式) +- 请求下列页面,断言 `status_code == 200`: + - `/ui/login`, `/ui/tasks`, `/ui/tasks/new`, `/ui/serving`, `/ui/data`, `/ui/admin` +- 断言响应包含稳定锚点文本(例如 `Argus MVP`, `New Task`, `Tasks`),避免脆弱的全量快照。 + +## 验收标准(Definition of Done) + +- 11 个 `/ui/*` 路由行为与输出不变(人工 smoke + 自动化最小测试)。 +- `src/mvp/py/argus/service/ui.py` 不再包含大段 HTML/JS(仅兼容转发或极薄封装)。 +- 新增/修改 UI 页面不需要触碰 1000+ 行单文件;每页的改动范围限定在对应模块。 diff --git a/src/mvp/configs/dev.yaml b/src/mvp/configs/dev.yaml index 4c41416..eb25620 100644 --- a/src/mvp/configs/dev.yaml +++ b/src/mvp/configs/dev.yaml @@ -17,6 +17,10 @@ ray: PYTHONUNBUFFERED: "1" # v3.7: forbid HuggingFace Hub network access from Ray jobs (use cached snapshots). HF_HUB_OFFLINE: "1" + # Unify cache dirs so `from_pretrained("org/name")` resolves from the same on-disk cache in offline mode. + HF_HOME: "/private/hf" + HUGGINGFACE_HUB_CACHE: "/private/hf/hub" + TRANSFORMERS_CACHE: "/private/hf/hub" # 用户自定义代码目录(可被 PYTHONPATH 注入) user_code_path: "/private/user/code" diff --git a/src/mvp/configs/dev_v30.yaml b/src/mvp/configs/dev_v30.yaml index 4c0ac9f..8be5b4e 100644 --- a/src/mvp/configs/dev_v30.yaml +++ b/src/mvp/configs/dev_v30.yaml @@ -17,6 +17,10 @@ ray: PYTHONUNBUFFERED: "1" # v3.7: forbid HuggingFace Hub network access from Ray jobs (use cached snapshots). HF_HUB_OFFLINE: "1" + # Unify cache dirs so `from_pretrained("org/name")` resolves from the same on-disk cache in offline mode. + HF_HOME: "/private/hf" + HUGGINGFACE_HUB_CACHE: "/private/hf/hub" + TRANSFORMERS_CACHE: "/private/hf/hub" # v3.0 先不支持 user code 执行 user_code_path: "/private/user/code" diff --git a/src/mvp/docker-compose.yaml b/src/mvp/docker-compose.yaml index 0bb57c7..af8662c 100644 --- a/src/mvp/docker-compose.yaml +++ b/src/mvp/docker-compose.yaml @@ -95,6 +95,13 @@ services: aliases: - wandb - argus-wandb + environment: + # W&B Local uses this as its externally-reachable base URL for the web app/backend. + # If you access via SSH port-forwarding, set WANDB_HOST to the exact URL you open in the browser + # (e.g. http://localhost:8090 or http://localhost:8080). + HOST: "${WANDB_HOST:-http://localhost:8090}" + # Optional: provide license at container start to avoid UI activation flakiness. + LICENSE: "${WANDB_LICENSE:-}" ray_worker_0: image: argus/argus-ray-node:vllm011.latest diff --git a/src/mvp/py/argus/service/ui.py b/src/mvp/py/argus/service/ui.py index d46be5b..010d887 100644 --- a/src/mvp/py/argus/service/ui.py +++ b/src/mvp/py/argus/service/ui.py @@ -1,1442 +1,11 @@ from __future__ import annotations -import html -import json - -from fastapi import FastAPI -from fastapi.responses import HTMLResponse, RedirectResponse - - -_BASE_CSS = """ -:root { --bg:#0b1020; --panel:#111a33; --muted:#95a3c6; --fg:#e8eeff; --accent:#7aa2ff; --danger:#ff6b6b; --ok:#3ddc97; } -* { box-sizing: border-box; } -body { margin:0; font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial; background:var(--bg); color:var(--fg); } -a { color:var(--accent); text-decoration:none; } -.layout { display:flex; min-height:100vh; } -.nav { width: 240px; padding:16px; background: linear-gradient(180deg, #0e1630, #0b1020); border-right: 1px solid rgba(255,255,255,0.06); } -.brand { font-weight: 700; letter-spacing: .2px; margin-bottom: 12px; } -.nav a { display:block; padding:10px 10px; border-radius:10px; color: var(--fg); opacity: .9; } -.nav a.active { background: rgba(122,162,255,0.14); border: 1px solid rgba(122,162,255,0.22); } -.nav a:hover { background: rgba(255,255,255,0.06); } -.main { flex:1; padding: 20px 24px; } -.card { background: rgba(255,255,255,0.04); border: 1px solid rgba(255,255,255,0.08); border-radius: 14px; padding: 16px; } -.row { display:flex; gap: 12px; align-items:center; flex-wrap: wrap; } -.muted { color: var(--muted); } -.btn { border: 1px solid rgba(255,255,255,0.16); background: rgba(255,255,255,0.06); color: var(--fg); padding: 10px 12px; border-radius: 10px; cursor: pointer; } -.btn:hover { background: rgba(255,255,255,0.10); } -.btn.active { background: rgba(122,162,255,0.14); border-color: rgba(122,162,255,0.22); } -.btn.danger { border-color: rgba(255,107,107,0.35); background: rgba(255,107,107,0.10); } -.pill { display:inline-block; padding: 2px 10px; border-radius: 999px; border: 1px solid rgba(255,255,255,0.16); font-size: 12px; } -.pill.ok { border-color: rgba(61,220,151,0.35); background: rgba(61,220,151,0.12); } -.pill.bad { border-color: rgba(255,107,107,0.35); background: rgba(255,107,107,0.12); } -textarea, input { width: 100%; color: var(--fg); background: rgba(255,255,255,0.06); border: 1px solid rgba(255,255,255,0.12); border-radius: 12px; padding: 10px 12px; outline: none; } -select { color: var(--fg); background: rgba(255,255,255,0.06); border: 1px solid rgba(255,255,255,0.12); border-radius: 12px; padding: 10px 12px; outline: none; } -select option { color: var(--fg); background: #0e1630; } -button:disabled { opacity: .45; cursor: not-allowed; } -pre { white-space: pre-wrap; word-break: break-word; } -table { width:100%; border-collapse: collapse; } -th, td { padding: 10px 8px; border-bottom: 1px solid rgba(255,255,255,0.08); text-align:left; } -""".strip() - - -_BASE_JS = """ -function mvpTokenGet() { - return (localStorage.getItem("mvp_token") || "").trim(); -} -function mvpTokenSet(v) { - localStorage.setItem("mvp_token", (v || "").trim()); -} -function mvpSftpPasswordGet() { - return (localStorage.getItem("mvp_sftp_password") || "").trim(); -} -function mvpSftpPasswordSet(v) { - localStorage.setItem("mvp_sftp_password", (v || "").trim()); -} -async function apiFetch(path, opts) { - opts = opts || {}; - opts.headers = opts.headers || {}; - const tok = mvpTokenGet(); - if (tok) opts.headers["Authorization"] = "Bearer " + tok; - return fetch(path, opts); -} -async function apiJson(path, opts) { - const resp = await apiFetch(path, opts); - const text = await resp.text(); - if (!resp.ok) { - const err = new Error("HTTP " + resp.status); - err.status = resp.status; - err.body = text; - throw err; - } - return JSON.parse(text); -} -function fmtJson(obj) { - try { return JSON.stringify(obj, null, 2); } catch (e) { return String(obj); } -} -function curOriginWithPort(port) { - const proto = window.location.protocol; - const host = window.location.hostname; - return proto + "//" + host + ":" + port; -} -async function copyText(v) { - if (!v) return false; - try { - await navigator.clipboard.writeText(v); - return true; - } catch (e) { - // Fallback for non-secure contexts (http) or older browsers. - try { - const ta = document.createElement("textarea"); - ta.value = v; - ta.style.position = "fixed"; - ta.style.opacity = "0"; - document.body.appendChild(ta); - ta.focus(); - ta.select(); - const ok = document.execCommand("copy"); - document.body.removeChild(ta); - return ok; - } catch (e2) { - return false; - } - } -} -document.addEventListener("DOMContentLoaded", () => { - const el = document.getElementById("nav-ray-dashboard"); - if (el) el.href = curOriginWithPort(8265); -}); -""".strip() - - -def _nav(active: str) -> str: - links = [ - ("login", "/ui/login", "Login"), - ("tasks", "/ui/tasks", "Tasks"), - ("serving", "/ui/serving", "Serving"), - ("new", "/ui/tasks/new", "New Task"), - ("data", "/ui/data", "Data"), - ("admin", "/ui/admin", "Admin"), - ("ray", "#", "Ray Dashboard"), - ] - items = [] - for key, href, label in links: - cls = "active" if key == active else "" - extra = "" - if key == "ray": - extra = ' id="nav-ray-dashboard" target="_blank" rel="noopener"' - items.append(f'{html.escape(label)}') - return "\n".join(items) - - -def _page(title: str, active: str, body: str, script: str = "") -> str: - return f""" - - - - - {html.escape(title)} - - - -
- -
- {body} -
-
- - - -""" - - -def register_ui_routes(app: FastAPI) -> None: - @app.get("/ui") - async def ui_root() -> RedirectResponse: - return RedirectResponse(url="/ui/tasks") - - @app.get("/ui/login") - async def ui_login() -> HTMLResponse: - body = """ -

Login

-
-
Paste your API token (without the Bearer prefix).
-
- -
-
- - - Go to Tasks -
-
-
-
-
-
-

User Info

-
Shown after login via /api/v2/me.
-
-
- -
-
-
(not loaded)
-
-
-
-

W&B

-
Weights & Biases local server (v3.6). Metrics are written by training jobs; this UI is for viewing.
-
-
- Open W&B (:8090) - -
-
-
project: (unknown)
-
base_url (job runtime): (unknown)
-
-""".strip() - script = """ -const tokEl = document.getElementById("tok"); -const msg = document.getElementById("msg"); -const me = document.getElementById("me"); -const wandbOpen = document.getElementById("wandb-open"); -const wandbProject = document.getElementById("wandb-project"); -const wandbBaseUrl = document.getElementById("wandb-base-url"); -document.getElementById("wandb-copy-project").onclick = async () => { await copyText(wandbProject.textContent || ""); }; -wandbOpen.href = curOriginWithPort(8090); -tokEl.value = mvpTokenGet(); - -async function refreshMe() { - me.textContent = "Loading..."; - try { - const obj = await apiJson("/api/v2/me"); - me.textContent = fmtJson(obj); - if (obj.wandb && obj.wandb.enabled) { - wandbProject.textContent = obj.wandb.project_name || "(unknown)"; - wandbBaseUrl.textContent = obj.wandb.base_url || "(unknown)"; - } else { - wandbProject.textContent = "(disabled)"; - wandbBaseUrl.textContent = "(disabled)"; - } - } catch (e) { - me.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); - wandbProject.textContent = "(error)"; - wandbBaseUrl.textContent = "(error)"; - } -} - -document.getElementById("me-refresh").onclick = refreshMe; -document.getElementById("save").onclick = () => { mvpTokenSet(tokEl.value); msg.textContent = "Saved."; refreshMe(); }; -document.getElementById("clear").onclick = () => { mvpTokenSet(""); tokEl.value = ""; msg.textContent = "Cleared."; me.textContent = "(not loaded)"; }; -if (mvpTokenGet()) refreshMe(); -""".strip() - return HTMLResponse(content=_page("Login", "login", body, script)) - - @app.get("/ui/tasks") - async def ui_tasks() -> HTMLResponse: - body = """ -

Tasks

-
-
- - New Task -
-
-
Loading...
-
-""".strip() - script = """ -const out = document.getElementById("out"); -async function refresh() { - out.textContent = "Loading..."; - try { - const q = await apiJson("/api/v2/queue"); - const completedLimit = 25; - const completedOffset = Number(localStorage.getItem("mvp_completed_offset") || "0") || 0; - const done = await apiJson("/api/v2/tasks?limit=" + completedLimit + "&offset=" + completedOffset + "&states=SUCCEEDED,FAILED,CANCELED"); - - function pill(state) { - const s = String(state || ""); - if (s === "SUCCEEDED") return `${s}`; - if (s === "FAILED") return `${s}`; - if (s === "CANCELED") return `${s}`; - if (s === "RUNNING") return `${s}`; - if (s === "QUEUED" || s === "PENDING_RESOURCES" || s === "SUBMITTING" || s === "SUBMITTED") return `${s}`; - return `${s}`; - } - function row(t) { - const id = t.task_id; - return ` - ${id} - ${t.workload} - ${pill(t.state)} - ${t.nnodes} x ${t.n_gpus_per_node} GPU - ${t.updated_at || ""} - `; - } - - const running = (q.running || []).map(row).join(""); - const pending = (q.pending || []).map(row).join(""); - const doneRows = (done.tasks || []).map(row).join(""); - const pageNo = Math.floor(completedOffset / completedLimit) + 1; - const prevDisabled = completedOffset <= 0; - const nextDisabled = !done.has_more; - out.innerHTML = ` -
Tip: configure token in Login.
-
-

Running

- ${running || ""}
TaskWorkloadStateResourcesUpdated
(none)
-
-

Pending

- ${pending || ""}
TaskWorkloadStateResourcesUpdated
(none)
-
-

Completed

-
-
Page ${pageNo}
-
- - -
-
- ${doneRows || ""}
TaskWorkloadStateResourcesUpdated
(none)
- `; - - const prevBtn = document.getElementById("done-prev"); - const nextBtn = document.getElementById("done-next"); - if (prevBtn) prevBtn.onclick = () => { - const cur = Number(localStorage.getItem("mvp_completed_offset") || "0") || 0; - const next = Math.max(0, cur - completedLimit); - localStorage.setItem("mvp_completed_offset", String(next)); - refresh(); - }; - if (nextBtn) nextBtn.onclick = () => { - const cur = Number(localStorage.getItem("mvp_completed_offset") || "0") || 0; - const next = cur + completedLimit; - localStorage.setItem("mvp_completed_offset", String(next)); - refresh(); - }; - } catch (e) { - out.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); - } -} -document.getElementById("refresh").onclick = refresh; -refresh(); -""".strip() - return HTMLResponse(content=_page("Tasks", "tasks", body, script)) - - @app.get("/ui/tasks/new") - async def ui_new_task() -> HTMLResponse: - ppo = """# PPO TaskSpec (YAML) -workload: ppo # 任务类型(必填):ppo|grpo|sft -code_path: /private/common/code/verl/verl_repo # 代码路径(必填):v3.0 固定使用 common 下的 verl 快照(不支持用户自定义代码) -model_id: Qwen/Qwen2.5-0.5B-Instruct # 基础模型(必填):HuggingFace 模型 ID 或 /private/... 本地模型路径 -train_file: /private/common/datasets/gsm8k/train.parquet # 训练数据(必填):parquet 文件路径(支持 /private/common/datasets 或 /private/users//datasets) -val_file: /private/common/datasets/gsm8k/test.parquet # 验证数据(必填):parquet 文件路径(VERL 侧会用来构建 val dataset,不能为 null) - -# nnodes: 2 # 训练节点数(可选,默认:2) -# n_gpus_per_node: 4 # 每节点 GPU 数(可选,默认:4) - -# total_epochs: 1 # 总训练 epoch(可选,默认:1) -# total_training_steps: null # 总训练 step(可选,默认:null;不传则让 VERL 按 epochs 和数据长度自动推导) -# save_freq: 10 # checkpoint 保存频率(step)(可选,默认:10) -# test_freq: null # 验证频率(step)(可选,默认:null;训练端会当成 -1=不验证) - -# submission_id: "" # Ray submission_id(可选,默认空;通常由服务自动生成,无需填写) -""".strip() - grpo = """# GRPO TaskSpec (YAML) -workload: grpo # 任务类型(必填):ppo|grpo|sft(grpo 会自动启用对应的算法配置) -code_path: /private/common/code/verl/verl_repo # 代码路径(必填):v3.0 固定使用 common 下的 verl 快照(不支持用户自定义代码) -model_id: Qwen/Qwen2.5-0.5B-Instruct # 基础模型(必填):HuggingFace 模型 ID 或 /private/... 本地模型路径 -train_file: /private/common/datasets/gsm8k/train.parquet # 训练数据(必填):parquet 文件路径(支持 /private/common/datasets 或 /private/users//datasets) -val_file: /private/common/datasets/gsm8k/test.parquet # 验证数据(必填):parquet 文件路径(VERL 侧会用来构建 val dataset,不能为 null) - -# nnodes: 2 # 训练节点数(可选,默认:2) -# n_gpus_per_node: 4 # 每节点 GPU 数(可选,默认:4) - -# total_epochs: 1 # 总训练 epoch(可选,默认:1) -# total_training_steps: null # 总训练 step(可选,默认:null;不传则让 VERL 按 epochs 和数据长度自动推导) -# save_freq: 10 # checkpoint 保存频率(step)(可选,默认:10) -# test_freq: null # 验证频率(step)(可选,默认:null;训练端会当成 -1=不验证) - -# submission_id: "" # Ray submission_id(可选,默认空;通常由服务自动生成,无需填写) -""".strip() - sft = """# SFT TaskSpec (YAML) -workload: sft # 任务类型(必填):ppo|grpo|sft -code_path: /private/common/code/verl/verl_repo # 代码路径(必填):v3.0 固定使用 common 下的 verl 快照(不支持用户自定义代码) -model_id: Qwen/Qwen2.5-0.5B-Instruct # 基础模型(必填):HuggingFace 模型 ID 或 /private/... 本地模型路径 -train_file: /private/common/datasets/gsm8k_sft/train.parquet # 训练数据(必填):parquet 文件路径(支持 /private/common/datasets 或 /private/users//datasets) -val_file: /private/common/datasets/gsm8k_sft/test.parquet # 验证数据(必填):parquet 文件路径(VERL 侧会用来构建 val dataset,不能为 null) - -# nnodes: 2 # 训练节点数(可选,默认:2;单机可设 1) -# n_gpus_per_node: 4 # 每节点 GPU 数(可选,默认:4;单卡可设 1) - -# total_epochs: 1 # 总训练 epoch(可选,默认:1) -# total_training_steps: null # 总训练 step(可选,默认:null;不传则让 VERL 按 epochs 和数据长度自动推导) -# save_freq: 10 # checkpoint 保存频率(step)(可选,默认:10) -# test_freq: null # 验证频率(step)(可选,默认:null;训练端会当成 -1=不验证) - -# trainer_device: cpu # 仅 SFT 生效:driver 侧 device(可选,默认:cpu) -# submission_id: "" # Ray submission_id(可选,默认空;通常由服务自动生成,无需填写) -""".strip() - adv = """# Advanced TaskSpec (YAML) - v3.6 -kind: advanced # 任务类型(必填):advanced(自定义 command) -# 说明:平台统一按 "advanced" 做任务分类与 task_id 命名(不按 ppo/grpo/sft 细分)。 +# Backward-compatible import path. # -# 自定义训练命令:平台会做 $HOME 宏替换: -# - $HOME -> /private/users/ -# - $HOME/common/datasets -> /private/datasets(共享只读数据) -# - $HOME/common/hf -> /private/hf(共享只读 HF cache) -# -# W&B(v3.6): -# - 平台会在 runtime_env 注入 WANDB_BASE_URL/WANDB_API_KEY/WANDB_DIR -# - 以及注入以下 env,供 Advanced command 使用(无需用户手动修改 project): -# - MVP_TRAINER_LOGGER: "console" 或 "[console,wandb]" -# - MVP_WANDB_PROJECT: "_project" -# - MVP_WANDB_RUN: "" -# -nnodes: 2 # 训练节点数(必填):用于平台队列调度与资源预检查 -n_gpus_per_node: 4 # 每节点 GPU 数(必填):用于平台队列调度与资源预检查 +# The UI implementation lives under `argus.ui` (split into multiple modules for maintainability), +# while `argus.service.app` continues to import `argus.service.ui.register_ui_routes`. -command: | - PYTHONUNBUFFERED=1 \ - python3 -m verl.trainer.main_ppo \ - data.train_files=$HOME/common/datasets/gsm8k/train.parquet \ - data.val_files=$HOME/common/datasets/gsm8k/test.parquet \ - data.train_batch_size=256 \ - data.max_prompt_length=512 \ - data.max_response_length=512 \ - actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - critic.optim.lr=1e-5 \ - critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ - critic.ppo_micro_batch_size_per_gpu=4 \ - algorithm.kl_ctrl.kl_coef=0.001 \ - trainer.logger=${MVP_TRAINER_LOGGER} \ - trainer.project_name=${MVP_WANDB_PROJECT} \ - trainer.experiment_name=${MVP_WANDB_RUN} \ - trainer.val_before_train=False \ - trainer.nnodes=2 \ - trainer.n_gpus_per_node=4 \ - trainer.total_epochs=1 \ - trainer.total_training_steps=10 \ - trainer.save_freq=10 \ - trainer.test_freq=-1 \ - trainer.resume_mode=disable \ - trainer.default_local_dir=checkpoints \ - +ray_kwargs.ray_init.address=auto \ - hydra.run.dir=logs/hydra +from argus.ui import register_ui_routes -# 可选:自定义 reward(方式 A:直接写在 command 里) -# command 里增加如下 overrides: -# custom_reward_function.path=$HOME/code/reward.py -# custom_reward_function.name=compute_score -""".strip() - merge = """# Model Merge (YAML) - v3.5 (Advanced command) -# 用途:将 VERL 训练产生的 FSDP 分片 checkpoint 合并为 HuggingFace 格式目录。 -# -# 你需要把 替换成真实值: -# - submission id:在 Tasks 详情页里看到的 `ray_submission_id`(如 mvp2-...--a01) -# - global_step:对应 checkpoints 下的 global_step_xxx 目录(如 global_step_10) -# -# 注意:这里不需要 GPU,建议把 n_gpus_per_node 设为 0,这样不受训练任务占用 GPU 的影响。 -kind: advanced -nnodes: 1 -n_gpus_per_node: 0 +__all__ = ["register_ui_routes"] -command: | - python3 -m verl.model_merger merge \ - --backend fsdp \ - --local_dir $HOME/jobs//checkpoints//actor \ - --target_dir $HOME/jobs//checkpoints//actor/huggingface -""".strip() - evaluation = """# Evaluation (YAML) - v3.6 (Advanced command) -# 用途:对“已生成的结果 parquet”做离线评估(基于 custom reward function),示例使用 VERL 内置 main_eval。 -# -# 说明: -# - 你需要准备一个包含生成 responses 的 parquet,并替换 (示例放在某个 job 目录下)。 -# - main_eval 会在 Ray 上并发计算,因此这里也用 +ray_kwargs 连接已有 Ray 集群。 -# -kind: advanced -nnodes: 1 -n_gpus_per_node: 0 - -command: | - PYTHONUNBUFFERED=1 \ - python3 -m verl.trainer.main_eval \ - data.path=$HOME/jobs//outputs/.parquet \ - +ray_kwargs.ray_init.address=auto - -# 可选:指定 parquet schema(按你的 parquet 列名调整) -# data.response_key=responses -# data.data_source_key=data_source -# data.reward_model_key=reward_model -# -# 可选:自定义 reward(方式 A:用户自行提供 reward.py) -# custom_reward_function.path=$HOME/code/reward.py -# custom_reward_function.name=compute_score -""".strip() - body = f""" -

New Task

-
-
- Paste TaskSpec YAML and submit to API server. - Basic tasks require code_path; Advanced tasks use kind: advanced with a custom command. -
-
-
- - -
-
- -
-
- - - - - - -
-
- -
- - - -
-
- - Back -
-
-

-
-""".strip() - tpl_ppo = json.dumps(ppo) - tpl_grpo = json.dumps(grpo) - tpl_sft = json.dumps(sft) - tpl_adv = json.dumps(adv) - tpl_eval = json.dumps(evaluation) - tpl_merge = json.dumps(merge) - script = ( - """ - const msg = document.getElementById("msg"); - const yamlEl = document.getElementById("yaml"); - const TPL_PPO = __TPL_PPO__; - const TPL_GRPO = __TPL_GRPO__; - const TPL_SFT = __TPL_SFT__; - const TPL_ADV = __TPL_ADV__; - const TPL_EVAL = __TPL_EVAL__; - const TPL_MERGE = __TPL_MERGE__; - document.getElementById("tpl-ppo").onclick = () => { yamlEl.value = TPL_PPO; msg.textContent = ""; }; - document.getElementById("tpl-grpo").onclick = () => { yamlEl.value = TPL_GRPO; msg.textContent = ""; }; - document.getElementById("tpl-sft").onclick = () => { yamlEl.value = TPL_SFT; msg.textContent = ""; }; - document.getElementById("tpl-adv").onclick = () => { yamlEl.value = TPL_ADV; msg.textContent = ""; }; - document.getElementById("tpl-eval").onclick = () => { yamlEl.value = TPL_EVAL; msg.textContent = ""; }; - document.getElementById("tpl-merge").onclick = () => { yamlEl.value = TPL_MERGE; msg.textContent = ""; }; - -function yamlQuote(s) { - s = String(s ?? ""); - return '"' + s.replaceAll('\\\\', '\\\\\\\\').replaceAll('"', '\\\\"') + '"'; -} - -function buildYamlFromForm() { - const tpl = document.getElementById("f-tpl").value; - if (tpl === "advanced" || tpl === "merge") { - const nn = Number(document.getElementById("f-adv-nnodes").value || "1"); - const gpn = Number(document.getElementById("f-adv-gpn").value || "0"); - const cmd = String(document.getElementById("f-command").value || "").replace(/\\r/g, ""); - const lines = cmd.split("\\n"); - const indented = lines.map(l => " " + l).join("\\n"); - return `kind: advanced\\n` + `nnodes: ${nn}\\n` + `n_gpus_per_node: ${gpn}\\n\\n` + `command: |\\n${indented}\\n`; - } - - const workload = tpl; - const codePath = document.getElementById("f-code-path").value || ""; - const modelId = document.getElementById("f-model-id").value || ""; - const trainFile = document.getElementById("f-train-file").value || ""; - const valFile = document.getElementById("f-val-file").value || ""; - const nn = Number(document.getElementById("f-nnodes").value || "2"); - const gpn = Number(document.getElementById("f-gpn").value || "4"); - const epochs = Number(document.getElementById("f-epochs").value || "1"); - const stepsRaw = String(document.getElementById("f-steps").value || "").trim(); - const saveFreq = Number(document.getElementById("f-save-freq").value || "10"); - const testFreqRaw = String(document.getElementById("f-test-freq").value || "").trim(); - - let y = ""; - y += `workload: ${workload}\\n`; - y += `code_path: ${yamlQuote(codePath)}\\n`; - y += `model_id: ${yamlQuote(modelId)}\\n`; - y += `train_file: ${yamlQuote(trainFile)}\\n`; - y += `val_file: ${yamlQuote(valFile)}\\n`; - y += `nnodes: ${nn}\\n`; - y += `n_gpus_per_node: ${gpn}\\n`; - y += `total_epochs: ${epochs}\\n`; - if (stepsRaw) y += `total_training_steps: ${Number(stepsRaw)}\\n`; - y += `save_freq: ${saveFreq}\\n`; - if (testFreqRaw) y += `test_freq: ${Number(testFreqRaw)}\\n`; - if (tpl === "sft") { - const dev = document.getElementById("f-trainer-device").value || "cpu"; - y += `trainer_device: ${dev}\\n`; - } - return y; -} - -function updateFormVisibility() { - const tpl = document.getElementById("f-tpl").value; - const basic = document.getElementById("f-basic"); - const adv = document.getElementById("f-adv"); - const sftWrap = document.getElementById("f-sft-device-wrap"); - if (tpl === "advanced" || tpl === "merge") { - basic.style.display = "none"; - adv.style.display = "block"; - } else { - basic.style.display = "block"; - adv.style.display = "none"; - } - if (tpl === "sft") { - sftWrap.style.display = "block"; - } else { - sftWrap.style.display = "none"; - } -} - -function updatePreview() { - const text = buildYamlFromForm(); - document.getElementById("f-preview").textContent = text; - return text; -} - -function setBtnActive(id, on) { - const el = document.getElementById(id); - if (!el) return; - if (on) el.classList.add("active"); - else el.classList.remove("active"); -} - -function showMode(mode) { - const yamlPanel = document.getElementById("panel-yaml"); - const formPanel = document.getElementById("panel-form"); - if (mode === "form") { - yamlPanel.style.display = "none"; - formPanel.style.display = "block"; - setBtnActive("mode-yaml", false); - setBtnActive("mode-form", true); - updateFormVisibility(); - updatePreview(); - } else { - yamlPanel.style.display = "block"; - formPanel.style.display = "none"; - setBtnActive("mode-yaml", true); - setBtnActive("mode-form", false); - } -} - -function bindStepper(inputId, decId, incId, minV) { - const el = document.getElementById(inputId); - document.getElementById(decId).onclick = () => { - const v = Number(el.value || "0") - 1; - el.value = String(Math.max(minV, v)); - updatePreview(); - }; - document.getElementById(incId).onclick = () => { - const v = Number(el.value || "0") + 1; - el.value = String(Math.max(minV, v)); - updatePreview(); - }; -} - -// Init defaults (match examples) -document.getElementById("f-code-path").value = "/private/common/code/verl/verl_repo"; -document.getElementById("f-model-id").value = "Qwen/Qwen2.5-0.5B-Instruct"; -document.getElementById("f-train-file").value = "/private/common/datasets/gsm8k/train.parquet"; -document.getElementById("f-val-file").value = "/private/common/datasets/gsm8k/test.parquet"; -document.getElementById("f-nnodes").value = "2"; -document.getElementById("f-gpn").value = "4"; -document.getElementById("f-epochs").value = "1"; -document.getElementById("f-save-freq").value = "10"; -document.getElementById("f-trainer-device").value = "cpu"; - -document.getElementById("f-adv-nnodes").value = "2"; -document.getElementById("f-adv-gpn").value = "4"; -document.getElementById("f-command").value = `PYTHONUNBUFFERED=1 \\\npython3 -m verl.trainer.main_ppo \\\n data.train_files=$HOME/common/datasets/gsm8k/train.parquet \\\n data.val_files=$HOME/common/datasets/gsm8k/test.parquet \\\n +ray_kwargs.ray_init.address=auto`; - -bindStepper("f-nnodes", "f-nnodes-dec", "f-nnodes-inc", 1); -bindStepper("f-gpn", "f-gpn-dec", "f-gpn-inc", 0); -bindStepper("f-epochs", "f-epochs-dec", "f-epochs-inc", 1); -bindStepper("f-adv-nnodes", "f-adv-nnodes-dec", "f-adv-nnodes-inc", 1); -bindStepper("f-adv-gpn", "f-adv-gpn-dec", "f-adv-gpn-inc", 0); - -document.getElementById("f-tpl").onchange = () => { - const tpl = document.getElementById("f-tpl").value; - if (tpl === "ppo") { - document.getElementById("f-train-file").value = "/private/common/datasets/gsm8k/train.parquet"; - document.getElementById("f-val-file").value = "/private/common/datasets/gsm8k/test.parquet"; - } else if (tpl === "grpo") { - document.getElementById("f-train-file").value = "/private/common/datasets/gsm8k/train.parquet"; - document.getElementById("f-val-file").value = "/private/common/datasets/gsm8k/test.parquet"; - } else if (tpl === "sft") { - document.getElementById("f-train-file").value = "/private/common/datasets/gsm8k_sft/train.parquet"; - document.getElementById("f-val-file").value = "/private/common/datasets/gsm8k_sft/test.parquet"; - } else if (tpl === "advanced") { - document.getElementById("f-adv-nnodes").value = "2"; - document.getElementById("f-adv-gpn").value = "4"; - document.getElementById("f-command").value = TPL_ADV.split("command: |")[1]?.trimStart() || document.getElementById("f-command").value; - } else if (tpl === "merge") { - document.getElementById("f-adv-nnodes").value = "1"; - document.getElementById("f-adv-gpn").value = "0"; - document.getElementById("f-command").value = TPL_MERGE.split("command: |")[1]?.trimStart() || document.getElementById("f-command").value; - } - updateFormVisibility(); - updatePreview(); -}; - -for (const id of ["f-code-path","f-model-id","f-train-file","f-val-file","f-nnodes","f-gpn","f-epochs","f-steps","f-save-freq","f-test-freq","f-trainer-device","f-adv-nnodes","f-adv-gpn","f-command"]) { - const el = document.getElementById(id); - if (el) el.oninput = () => updatePreview(); -} - -document.getElementById("mode-yaml").onclick = () => { - // When switching to YAML, sync from form if form is visible. - const formVisible = document.getElementById("panel-form").style.display !== "none"; - if (formVisible) yamlEl.value = updatePreview(); - showMode("yaml"); -}; -document.getElementById("mode-form").onclick = () => { - showMode("form"); -}; - -showMode("yaml"); - -document.getElementById("submit").onclick = async () => { - msg.textContent = "Submitting..."; - const body = (document.getElementById("panel-form").style.display !== "none") ? updatePreview() : yamlEl.value; - const resp = await apiFetch("/api/v2/tasks", { method: "POST", headers: {"Content-Type":"text/plain"}, body }); - const text = await resp.text(); - if (!resp.ok) { msg.textContent = "Error: " + resp.status + "\\n" + text; return; } - const obj = JSON.parse(text); - msg.textContent = "OK: " + fmtJson(obj); - if (obj.task_id) window.location.href = "/ui/tasks/" + obj.task_id; -}; - """.strip() - .replace("__TPL_PPO__", tpl_ppo) - .replace("__TPL_GRPO__", tpl_grpo) - .replace("__TPL_SFT__", tpl_sft) - .replace("__TPL_ADV__", tpl_adv) - .replace("__TPL_EVAL__", tpl_eval) - .replace("__TPL_MERGE__", tpl_merge) - ) - return HTMLResponse(content=_page("New Task", "new", body, script)) - - @app.get("/ui/tasks/{task_id}") - async def ui_task_detail(task_id: str) -> HTMLResponse: - safe_id = html.escape(task_id) - body = f""" -

Task: {safe_id}

-
-
- Logs - - - Back -
-
-
Loading...
-
-
-
-

W&B

-
This is a best-effort hint. Run name maps to ray_submission_id (attempt).
-
-
- Open W&B (:8090) - -
-
-
project: (unknown)
-
run: (unknown)
-
-
-
-

TaskSpec (YAML)

-
Resolved TaskSpec (includes default values; submission_id reflects latest attempt when available).
-
-
Loading...
-
-""".strip() - script = f""" -document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); -document.getElementById("wandb-open").href = curOriginWithPort(8090); -document.getElementById("wandb-copy-run").onclick = async () => {{ await copyText((document.getElementById("wandb-run").textContent || \"\").trim()); }}; -const out = document.getElementById("out"); -const spec = document.getElementById("spec"); -async function refresh() {{ - out.textContent = "Loading..."; - spec.textContent = "Loading..."; - const resp = await apiFetch("/api/v2/tasks/{task_id}"); - const text = await resp.text(); - if (!resp.ok) {{ out.textContent = "Error: " + resp.status + "\\n" + text; return; }} - const obj = JSON.parse(text); - out.textContent = fmtJson(obj); - const p = document.getElementById("wandb-project"); - const r = document.getElementById("wandb-run"); - const w = (obj.latest_attempt && obj.latest_attempt.wandb) ? obj.latest_attempt.wandb : null; - if (w) {{ - p.textContent = w.project_name || "(unknown)"; - r.textContent = w.run_name || "(unknown)"; - }} else {{ - p.textContent = "(not available)"; - r.textContent = "(not available)"; - }} - - const resp2 = await apiFetch("/api/v2/tasks/{task_id}/spec"); - const text2 = await resp2.text(); - spec.textContent = resp2.ok ? text2 : ("Error: " + resp2.status + "\\n" + text2); -}} -document.getElementById("refresh").onclick = refresh; -document.getElementById("cancel").onclick = async () => {{ - if (!confirm("Cancel this task?")) return; - const resp = await apiFetch("/api/v2/tasks/{task_id}:cancel", {{ method: "POST" }}); - const text = await resp.text(); - out.textContent = (resp.ok ? "Canceled.\\n" : "Error: " + resp.status + "\\n") + text; - setTimeout(refresh, 800); -}}; -refresh(); -""".strip() - return HTMLResponse(content=_page(f"Task {task_id}", "tasks", body, script)) - - @app.get("/ui/tasks/{task_id}/logs") - async def ui_task_logs(task_id: str) -> HTMLResponse: - safe_id = html.escape(task_id) - body = f""" -

Logs: {safe_id}

-
-
- - - Back -
-
-
Loading...
-
-""".strip() - script = f""" -document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); -const out = document.getElementById("out"); -let timer = null; -async function refresh() {{ - const resp = await apiFetch("/api/v2/tasks/{task_id}/logs?tail=4000"); - const text = await resp.text(); - out.textContent = resp.ok ? text : ("Error: " + resp.status + "\\n" + text); -}} -document.getElementById("refresh").onclick = refresh; -document.getElementById("auto").onchange = (e) => {{ - if (e.target.checked) {{ - timer = setInterval(refresh, 2000); - }} else {{ - if (timer) clearInterval(timer); - timer = null; - }} -}}; -refresh(); -""".strip() - return HTMLResponse(content=_page(f"Logs {task_id}", "tasks", body, script)) - - @app.get("/ui/serving") - async def ui_serving() -> HTMLResponse: - body = """ -

Serving

-
-
- - New Model -
-
-
Loading...
-
-""".strip() - script = """ -document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); -const out = document.getElementById("out"); - -function pill(state) { - const s = String(state || ""); - if (s === "RUNNING") return `${s}`; - if (s === "FAILED") return `${s}`; - return `${s}`; -} - -async function refresh() { - out.textContent = "Loading..."; - try { - const lim = 50; - const off = Number(localStorage.getItem("mvp_serving_offset") || "0") || 0; - const resp = await apiJson("/api/v2/serve/models?limit=" + lim + "&offset=" + off + "&include_deleted=0"); - const items = resp.items || []; - const hasMore = !!resp.has_more; - const pageNo = Math.floor(off / lim) + 1; - const prevDisabled = off <= 0; - const nextDisabled = !hasMore; - - const baseHint = curOriginWithPort(8000) + "/serve//v1"; - function row(m) { - const endpoint = (m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/serve/" + encodeURIComponent(m.model_key) + "/v1"); - return ` - ${m.model_key} - ${m.model_id} - ${pill(m.state)} - ${m.num_replicas} × ${m.gpus_per_replica} GPU - ${endpoint} - ${m.updated_at || ""} - `; - } - const rows = items.map(row).join(""); - - out.innerHTML = ` -
-
Per-model OpenAI base: ${baseHint}
-
- Page ${pageNo} - - -
-
- - - ${rows || ""} -
Model KeyModel IDStateResourcesEndpointUpdated
(none)
- `; - - const prevBtn = document.getElementById("prev"); - const nextBtn = document.getElementById("next"); - if (prevBtn) prevBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(Math.max(0, off - lim))); refresh(); }; - if (nextBtn) nextBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(off + lim)); refresh(); }; - } catch (e) { - let text = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); - if (e.body && String(e.body).includes("serving is not enabled")) { - text = "Serving is not enabled in server config.\\nAsk admin to enable `serving:` in dev.yaml."; - } - out.textContent = text; - } -} - -document.getElementById("refresh").onclick = refresh; -refresh(); -""".strip() - return HTMLResponse(content=_page("Serving", "serving", body, script)) - - @app.get("/ui/serving/new") - async def ui_serving_new() -> HTMLResponse: - example = """# ServingSpec (YAML) -# 说明: -# - model_id: 这里是 suffix(平台会自动加前缀:--) -# - model_source: 本地模型路径(支持 $HOME 宏;推荐使用 $HOME/common/hf 指向共享 HF cache) -# -# 常用路径: -# - $HOME/common/hf -> /private/hf -# - $HOME -> /private/users/ -# -model_id: qwen-0.5b -model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/ -num_replicas: 1 -gpus_per_replica: 1 - -# engine_kwargs: # 可选:透传给 vLLM -# gpu_memory_utilization: 0.4 -""".strip() - body = f""" -

New Model

-
-
Paste ServingSpec YAML and submit to /api/v2/serve/models.
-
- -
-
- - Back -
-
-

-
-""".strip() - script = """ -document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); -const out = document.getElementById("out"); -document.getElementById("submit").onclick = async () => { - out.textContent = "Submitting..."; - const yaml = document.getElementById("yaml").value || ""; - try { - const resp = await apiJson("/api/v2/serve/models", { method: "POST", headers: { "Content-Type": "application/yaml" }, body: yaml }); - out.textContent = "Created: " + resp.model_key + "\\nState: " + resp.state; - if (resp.model_key) window.location.href = "/ui/serving/" + encodeURIComponent(resp.model_key); - } catch (e) { - out.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); - } -}; -""".strip() - return HTMLResponse(content=_page("New Model", "serving", body, script)) - - @app.get("/ui/serving/{model_key}") - async def ui_serving_detail(model_key: str) -> HTMLResponse: - body = f""" -

Model

-
-
-
model_key: {html.escape(model_key)}
- -
-
-
- - - - -
-
-
Loading...
-
-

Resolved Spec (YAML)

-
(loading)
-
-

Events

-
(loading)
-
-

OpenAI Example

-
(loading)
-
-""".strip() - script = f""" -document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); -const modelKey = {json.dumps(model_key)}; -document.getElementById("openai-models").href = curOriginWithPort(8000) + "/serve/" + encodeURIComponent(modelKey) + "/v1/models"; -const meta = document.getElementById("meta"); -const spec = document.getElementById("spec"); -const eventsEl = document.getElementById("events"); -const example = document.getElementById("example"); -const replicas = document.getElementById("replicas"); - -function pill(state) {{ - const s = String(state || ""); - if (s === "RUNNING") return `${{s}}`; - if (s === "FAILED") return `${{s}}`; - return `${{s}}`; -}} - -function renderEvents(events) {{ - if (!events || !events.length) return "
(none)
"; - const rows = events.map(e => {{ - const payload = (e.payload_json || ""); - const short = String(payload).length > 240 ? String(payload).slice(0, 240) + "..." : String(payload); - return `${{e.created_at || ""}}${{e.event_type}}
${{short}}
`; - }}).join(""); - return `${{rows}}
TimeTypePayload
`; -}} - -async function refresh() {{ - meta.textContent = "Loading..."; - spec.textContent = "(loading)"; - eventsEl.textContent = "(loading)"; - example.textContent = "(loading)"; - try {{ - const obj = await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey)); - const m = obj.model || {{}}; - replicas.value = String(m.num_replicas || 1); - const endpoint = (m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/serve/" + encodeURIComponent(modelKey) + "/v1"); - meta.innerHTML = ` -
-
state: ${{pill(m.state)}}
-
model_id: ${{m.model_id || ""}}
-
source: ${{m.model_source || ""}}
-
-
endpoint: ${{endpoint}}
- `; - spec.textContent = obj.resolved_spec_yaml || ""; - eventsEl.innerHTML = renderEvents(obj.events || []); - const base = endpoint; - const mid = m.model_id || ""; - example.textContent = `curl -sS -H 'Content-Type: application/json' \\\\\\n -X POST ${{base}}/chat/completions \\\\\\n --data-binary '{{\\"model\\":\\"${{mid}}\\",\\"messages\\":[{{\\"role\\":\\"user\\",\\"content\\":\\"hello\\"}}],\\"max_tokens\\":16,\\"stream\\":false}}' | python3 -m json.tool`; - }} catch (e) {{ - meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); - spec.textContent = ""; - eventsEl.textContent = ""; - example.textContent = ""; - }} -}} - -document.getElementById("scale").onclick = async () => {{ - const n = Number(replicas.value || "1"); - if (!Number.isFinite(n) || n < 1) return; - try {{ - await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "PATCH", headers: {{ "Content-Type": "application/json" }}, body: JSON.stringify({{ num_replicas: n }}) }}); - await refresh(); - }} catch (e) {{ - meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); - }} -}}; - -document.getElementById("delete").onclick = async () => {{ - if (!confirm("Delete this model?")) return; - try {{ - await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "DELETE" }}); - await refresh(); - }} catch (e) {{ - meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); - }} -}}; - -refresh(); -""".strip() - return HTMLResponse(content=_page("Model", "serving", body, script)) - - @app.get("/ui/data") - async def ui_data() -> HTMLResponse: - body = """ -

Data

-
-
User files live under your home directory. Keep long-term artifacts in models/ or datasets/.
-
- -
-
-
Username
-
-
- - -
-
-
-
SFTPGo password
-
-
- - - -
-
-
- -
- - -
-
- You can also use an SFTP client (e.g. FileZilla) with the same username/password. - Host: , Port: . -
- -
-
-
-""".strip() - script = """ -const msg = document.getElementById("msg"); -document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); -const u = document.getElementById("u"); -const p = document.getElementById("p"); -const sftpWeb = document.getElementById("sftp-web"); -const sftpHost = document.getElementById("sftp-host"); -const sftpPort = document.getElementById("sftp-port"); -document.getElementById("copy-u").onclick = async () => { await copyText(u.value || ""); }; -document.getElementById("copy-p").onclick = async () => { await copyText(p.value || ""); }; - -async function refresh() { - const resp = await apiFetch("/api/v2/me"); - const text = await resp.text(); - if (!resp.ok) { msg.textContent = "Error: " + resp.status + "\\n" + text; return; } - const obj = JSON.parse(text); - u.value = (obj.user_id || ""); - const cached = mvpSftpPasswordGet(); - if (cached) p.value = cached; - const host = curOriginWithPort(8081); - sftpWeb.href = host + "/web/client"; - sftpHost.textContent = (obj.sftp && obj.sftp.host) ? obj.sftp.host : window.location.hostname; - sftpPort.textContent = (obj.sftp && obj.sftp.port) ? String(obj.sftp.port) : "2022"; - msg.textContent = ""; -} -document.getElementById("reset-p").onclick = async () => { - p.value = ""; - mvpSftpPasswordSet(""); - msg.textContent = "Resetting..."; - const resp = await apiFetch("/api/v2/me/sftp:reset_password", { method: "POST" }); - const text = await resp.text(); - if (!resp.ok) { msg.textContent = "Error: " + resp.status + "\\n" + text; return; } - const obj = JSON.parse(text); - p.value = obj.password || ""; - mvpSftpPasswordSet(p.value); - msg.textContent = "SFTPGo password rotated."; -}; -refresh(); -""".strip() - return HTMLResponse(content=_page("Data", "data", body, script)) - - @app.get("/ui/admin") - async def ui_admin() -> HTMLResponse: - body = """ -

Admin

-
-
- This page requires the admin token (set it in Login). -
-
- -

Create user

-
- - - -
-
-

-
-  
-
- -
-
-
Loading...
-
-""".strip() - script = """ -const out = document.getElementById("out"); -const createMsg = document.getElementById("create-msg"); -const userIdEl = document.getElementById("new-user-id"); -const displayNameEl = document.getElementById("new-display-name"); - -function esc(s) { - s = String(s || ""); - return s.replaceAll("&","&").replaceAll("<","<").replaceAll(">",">"); -} - -async function refresh() { - out.textContent = "Loading..."; - try { - const obj = await apiJson("/api/v2/users?limit=200"); - const users = (obj.users || []); - function row(u) { - const uid = u.user_id; - const tok = u.token || ""; - const tokShort = tok ? (tok.length > 18 ? (tok.slice(0, 18) + "…") : tok) : ""; - const created = u.created_at || ""; - const updated = u.updated_at || ""; - const tCreated = u.token_created_at || ""; - const tUsed = u.token_last_used_at || ""; - return ` - ${esc(uid)} - ${esc(created)} - ${esc(updated)} - -
- ${esc(tokShort)} - - -
-
token_created_at: ${esc(tCreated)}; last_used_at: ${esc(tUsed)}
- - `; - } - out.innerHTML = ` - - - ${users.map(row).join("") || ""} -
UserCreatedUpdatedToken
(none)
- `; - for (const btn of out.querySelectorAll("button[data-copy]")) { - btn.onclick = async () => { await copyText(btn.getAttribute("data-copy") || ""); }; - } - for (const btn of out.querySelectorAll("button[data-issue]")) { - btn.onclick = async () => { - const uid = btn.getAttribute("data-issue"); - if (!uid) return; - try { - const r = await apiJson("/api/v2/users/" + encodeURIComponent(uid) + "/tokens", { method: "POST" }); - createMsg.textContent = "Issued token:\\n" + fmtJson(r); - await refresh(); - } catch (e) { - createMsg.textContent = "Error issuing token: " + (e.status || "") + "\\n" + (e.body || String(e)); - } - }; - } - } catch (e) { - out.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); - } -} - -document.getElementById("refresh").onclick = refresh; -document.getElementById("create-user").onclick = async () => { - createMsg.textContent = "Creating..."; - const user_id = (userIdEl.value || "").trim(); - const display_name = (displayNameEl.value || "").trim(); - if (!user_id) { createMsg.textContent = "user_id is required"; return; } - const payload = { user_id: user_id }; - if (display_name) payload.display_name = display_name; - try { - const r = await apiJson("/api/v2/users", { method: "POST", headers: {"Content-Type":"application/json"}, body: JSON.stringify(payload) }); - createMsg.textContent = "Created:\\n" + fmtJson(r); - userIdEl.value = ""; - displayNameEl.value = ""; - await refresh(); - } catch (e) { - createMsg.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); - } -}; - -refresh(); -""".strip() - return HTMLResponse(content=_page("Admin", "admin", body, script)) diff --git a/src/mvp/py/argus/ui/__init__.py b/src/mvp/py/argus/ui/__init__.py new file mode 100644 index 0000000..4ebedf8 --- /dev/null +++ b/src/mvp/py/argus/ui/__init__.py @@ -0,0 +1,6 @@ +from __future__ import annotations + +from .routes import register_ui_routes + +__all__ = ["register_ui_routes"] + diff --git a/src/mvp/py/argus/ui/assets/__init__.py b/src/mvp/py/argus/ui/assets/__init__.py new file mode 100644 index 0000000..3ce55a6 --- /dev/null +++ b/src/mvp/py/argus/ui/assets/__init__.py @@ -0,0 +1,2 @@ +from __future__ import annotations + diff --git a/src/mvp/py/argus/ui/assets/base_css.py b/src/mvp/py/argus/ui/assets/base_css.py new file mode 100644 index 0000000..0287424 --- /dev/null +++ b/src/mvp/py/argus/ui/assets/base_css.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +BASE_CSS = """ +:root { --bg:#0b1020; --panel:#111a33; --muted:#95a3c6; --fg:#e8eeff; --accent:#7aa2ff; --danger:#ff6b6b; --ok:#3ddc97; } +* { box-sizing: border-box; } +body { margin:0; font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial; background:var(--bg); color:var(--fg); } +a { color:var(--accent); text-decoration:none; } +.layout { display:flex; min-height:100vh; } +.nav { width: 240px; padding:16px; background: linear-gradient(180deg, #0e1630, #0b1020); border-right: 1px solid rgba(255,255,255,0.06); } +.brand { font-weight: 700; letter-spacing: .2px; margin-bottom: 12px; } +.nav a { display:block; padding:10px 10px; border-radius:10px; color: var(--fg); opacity: .9; } +.nav a.active { background: rgba(122,162,255,0.14); border: 1px solid rgba(122,162,255,0.22); } +.nav a:hover { background: rgba(255,255,255,0.06); } +.main { flex:1; padding: 20px 24px; } +.card { background: rgba(255,255,255,0.04); border: 1px solid rgba(255,255,255,0.08); border-radius: 14px; padding: 16px; } +.row { display:flex; gap: 12px; align-items:center; flex-wrap: wrap; } +.muted { color: var(--muted); } +.btn { border: 1px solid rgba(255,255,255,0.16); background: rgba(255,255,255,0.06); color: var(--fg); padding: 10px 12px; border-radius: 10px; cursor: pointer; } +.btn:hover { background: rgba(255,255,255,0.10); } +.btn.active { background: rgba(122,162,255,0.14); border-color: rgba(122,162,255,0.22); } +.btn.danger { border-color: rgba(255,107,107,0.35); background: rgba(255,107,107,0.10); } +.pill { display:inline-block; padding: 2px 10px; border-radius: 999px; border: 1px solid rgba(255,255,255,0.16); font-size: 12px; } +.pill.ok { border-color: rgba(61,220,151,0.35); background: rgba(61,220,151,0.12); } +.pill.bad { border-color: rgba(255,107,107,0.35); background: rgba(255,107,107,0.12); } +textarea, input { width: 100%; color: var(--fg); background: rgba(255,255,255,0.06); border: 1px solid rgba(255,255,255,0.12); border-radius: 12px; padding: 10px 12px; outline: none; } +select { color: var(--fg); background: rgba(255,255,255,0.06); border: 1px solid rgba(255,255,255,0.12); border-radius: 12px; padding: 10px 12px; outline: none; } +select option { color: var(--fg); background: #0e1630; } +button:disabled { opacity: .45; cursor: not-allowed; } +pre { white-space: pre-wrap; word-break: break-word; } +table { width:100%; border-collapse: collapse; } +th, td { padding: 10px 8px; border-bottom: 1px solid rgba(255,255,255,0.08); text-align:left; } +""".strip() + diff --git a/src/mvp/py/argus/ui/assets/base_js.py b/src/mvp/py/argus/ui/assets/base_js.py new file mode 100644 index 0000000..ea73324 --- /dev/null +++ b/src/mvp/py/argus/ui/assets/base_js.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +BASE_JS = """ +function mvpTokenGet() { + return (localStorage.getItem("mvp_token") || "").trim(); +} +function mvpTokenSet(v) { + localStorage.setItem("mvp_token", (v || "").trim()); +} +function mvpSftpPasswordGet() { + return (localStorage.getItem("mvp_sftp_password") || "").trim(); +} +function mvpSftpPasswordSet(v) { + localStorage.setItem("mvp_sftp_password", (v || "").trim()); +} +async function apiFetch(path, opts) { + opts = opts || {}; + opts.headers = opts.headers || {}; + const tok = mvpTokenGet(); + if (tok) opts.headers["Authorization"] = "Bearer " + tok; + return fetch(path, opts); +} +async function apiJson(path, opts) { + const resp = await apiFetch(path, opts); + const text = await resp.text(); + if (!resp.ok) { + const err = new Error("HTTP " + resp.status); + err.status = resp.status; + err.body = text; + throw err; + } + return JSON.parse(text); +} +function fmtJson(obj) { + try { return JSON.stringify(obj, null, 2); } catch (e) { return String(obj); } +} +function curOriginWithPort(port) { + const proto = window.location.protocol; + const host = window.location.hostname; + return proto + "//" + host + ":" + port; +} +async function copyText(v) { + if (!v) return false; + try { + await navigator.clipboard.writeText(v); + return true; + } catch (e) { + // Fallback for non-secure contexts (http) or older browsers. + try { + const ta = document.createElement("textarea"); + ta.value = v; + ta.style.position = "fixed"; + ta.style.opacity = "0"; + document.body.appendChild(ta); + ta.focus(); + ta.select(); + const ok = document.execCommand("copy"); + document.body.removeChild(ta); + return ok; + } catch (e2) { + return false; + } + } +} +document.addEventListener("DOMContentLoaded", () => { + const el = document.getElementById("nav-ray-dashboard"); + if (el) el.href = curOriginWithPort(8265); +}); +""".strip() + diff --git a/src/mvp/py/argus/ui/layout/__init__.py b/src/mvp/py/argus/ui/layout/__init__.py new file mode 100644 index 0000000..3ce55a6 --- /dev/null +++ b/src/mvp/py/argus/ui/layout/__init__.py @@ -0,0 +1,2 @@ +from __future__ import annotations + diff --git a/src/mvp/py/argus/ui/layout/nav.py b/src/mvp/py/argus/ui/layout/nav.py new file mode 100644 index 0000000..d783a7e --- /dev/null +++ b/src/mvp/py/argus/ui/layout/nav.py @@ -0,0 +1,24 @@ +from __future__ import annotations + +import html + + +def nav(active: str) -> str: + links = [ + ("login", "/ui/login", "Login"), + ("tasks", "/ui/tasks", "Tasks"), + ("serving", "/ui/serving", "Serving"), + ("new", "/ui/tasks/new", "New Task"), + ("data", "/ui/data", "Data"), + ("admin", "/ui/admin", "Admin"), + ("ray", "#", "Ray Dashboard"), + ] + items: list[str] = [] + for key, href, label in links: + cls = "active" if key == active else "" + extra = "" + if key == "ray": + extra = ' id="nav-ray-dashboard" target="_blank" rel="noopener"' + items.append(f'{html.escape(label)}') + return "\n".join(items) + diff --git a/src/mvp/py/argus/ui/layout/page.py b/src/mvp/py/argus/ui/layout/page.py new file mode 100644 index 0000000..b5f0264 --- /dev/null +++ b/src/mvp/py/argus/ui/layout/page.py @@ -0,0 +1,36 @@ +from __future__ import annotations + +import html + +from ..assets.base_css import BASE_CSS +from ..assets.base_js import BASE_JS +from .nav import nav + + +def page(title: str, active: str, body: str, script: str = "") -> str: + return f""" + + + + + {html.escape(title)} + + + +
+ +
+ {body} +
+
+ + + +""" + diff --git a/src/mvp/py/argus/ui/pages/__init__.py b/src/mvp/py/argus/ui/pages/__init__.py new file mode 100644 index 0000000..3ce55a6 --- /dev/null +++ b/src/mvp/py/argus/ui/pages/__init__.py @@ -0,0 +1,2 @@ +from __future__ import annotations + diff --git a/src/mvp/py/argus/ui/pages/admin.py b/src/mvp/py/argus/ui/pages/admin.py new file mode 100644 index 0000000..2e75ddc --- /dev/null +++ b/src/mvp/py/argus/ui/pages/admin.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +from fastapi import FastAPI +from fastapi.responses import HTMLResponse + +from ..layout.page import page + + +def register(app: FastAPI) -> None: + @app.get("/ui/admin") + async def ui_admin() -> HTMLResponse: + body = """ +

Admin

+
+
+ This page requires the admin token (set it in Login). +
+
+ +

Create user

+
+ + + +
+
+

+
+  
+
+ +
+
+
Loading...
+
+""".strip() + script = """ +const out = document.getElementById("out"); +const createMsg = document.getElementById("create-msg"); +const userIdEl = document.getElementById("new-user-id"); +const displayNameEl = document.getElementById("new-display-name"); + +function esc(s) { + s = String(s || ""); + return s.replaceAll("&","&").replaceAll("<","<").replaceAll(">",">"); +} + +async function refresh() { + out.textContent = "Loading..."; + try { + const obj = await apiJson("/api/v2/users?limit=200"); + const users = (obj.users || []); + function row(u) { + const uid = u.user_id; + const tok = u.token || ""; + const tokShort = tok ? (tok.length > 18 ? (tok.slice(0, 18) + "…") : tok) : ""; + const created = u.created_at || ""; + const updated = u.updated_at || ""; + const tCreated = u.token_created_at || ""; + const tUsed = u.token_last_used_at || ""; + return ` + ${esc(uid)} + ${esc(created)} + ${esc(updated)} + +
+ ${esc(tokShort)} + + +
+
token_created_at: ${esc(tCreated)}; last_used_at: ${esc(tUsed)}
+ + `; + } + out.innerHTML = ` + + + ${users.map(row).join("") || ""} +
UserCreatedUpdatedToken
(none)
+ `; + for (const btn of out.querySelectorAll("button[data-copy]")) { + btn.onclick = async () => { await copyText(btn.getAttribute("data-copy") || ""); }; + } + for (const btn of out.querySelectorAll("button[data-issue]")) { + btn.onclick = async () => { + const uid = btn.getAttribute("data-issue"); + if (!uid) return; + try { + const r = await apiJson("/api/v2/users/" + encodeURIComponent(uid) + "/tokens", { method: "POST" }); + createMsg.textContent = "Issued token:\\n" + fmtJson(r); + await refresh(); + } catch (e) { + createMsg.textContent = "Error issuing token: " + (e.status || "") + "\\n" + (e.body || String(e)); + } + }; + } + } catch (e) { + out.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); + } +} + +document.getElementById("refresh").onclick = refresh; +document.getElementById("create-user").onclick = async () => { + createMsg.textContent = "Creating..."; + const user_id = (userIdEl.value || "").trim(); + const display_name = (displayNameEl.value || "").trim(); + if (!user_id) { createMsg.textContent = "user_id is required"; return; } + const payload = { user_id: user_id }; + if (display_name) payload.display_name = display_name; + try { + const r = await apiJson("/api/v2/users", { method: "POST", headers: {"Content-Type":"application/json"}, body: JSON.stringify(payload) }); + createMsg.textContent = "Created:\\n" + fmtJson(r); + userIdEl.value = ""; + displayNameEl.value = ""; + await refresh(); + } catch (e) { + createMsg.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); + } +}; + +refresh(); +""".strip() + return HTMLResponse(content=page("Admin", "admin", body, script)) + diff --git a/src/mvp/py/argus/ui/pages/data.py b/src/mvp/py/argus/ui/pages/data.py new file mode 100644 index 0000000..120dbde --- /dev/null +++ b/src/mvp/py/argus/ui/pages/data.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +from fastapi import FastAPI +from fastapi.responses import HTMLResponse + +from ..layout.page import page + + +def register(app: FastAPI) -> None: + @app.get("/ui/data") + async def ui_data() -> HTMLResponse: + body = """ +

Data

+
+
User files live under your home directory. Keep long-term artifacts in models/ or datasets/.
+
+ +
+
+
Username
+
+
+ + +
+
+
+
SFTPGo password
+
+
+ + + +
+
+
+ +
+ + +
+
+ You can also use an SFTP client (e.g. FileZilla) with the same username/password. + Host: , Port: . +
+ +
+
+
+""".strip() + script = """ +const msg = document.getElementById("msg"); +document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); +const u = document.getElementById("u"); +const p = document.getElementById("p"); +const sftpWeb = document.getElementById("sftp-web"); +const sftpHost = document.getElementById("sftp-host"); +const sftpPort = document.getElementById("sftp-port"); +document.getElementById("copy-u").onclick = async () => { await copyText(u.value || ""); }; +document.getElementById("copy-p").onclick = async () => { await copyText(p.value || ""); }; + +async function refresh() { + const resp = await apiFetch("/api/v2/me"); + const text = await resp.text(); + if (!resp.ok) { msg.textContent = "Error: " + resp.status + "\\n" + text; return; } + const obj = JSON.parse(text); + u.value = (obj.user_id || ""); + const cached = mvpSftpPasswordGet(); + if (cached) p.value = cached; + const host = curOriginWithPort(8081); + sftpWeb.href = host + "/web/client"; + sftpHost.textContent = (obj.sftp && obj.sftp.host) ? obj.sftp.host : window.location.hostname; + sftpPort.textContent = (obj.sftp && obj.sftp.port) ? String(obj.sftp.port) : "2022"; + msg.textContent = ""; +} +document.getElementById("reset-p").onclick = async () => { + p.value = ""; + mvpSftpPasswordSet(""); + msg.textContent = "Resetting..."; + const resp = await apiFetch("/api/v2/me/sftp:reset_password", { method: "POST" }); + const text = await resp.text(); + if (!resp.ok) { msg.textContent = "Error: " + resp.status + "\\n" + text; return; } + const obj = JSON.parse(text); + p.value = obj.password || ""; + mvpSftpPasswordSet(p.value); + msg.textContent = "SFTPGo password rotated."; +}; +refresh(); +""".strip() + return HTMLResponse(content=page("Data", "data", body, script)) + diff --git a/src/mvp/py/argus/ui/pages/login.py b/src/mvp/py/argus/ui/pages/login.py new file mode 100644 index 0000000..cc6f930 --- /dev/null +++ b/src/mvp/py/argus/ui/pages/login.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from fastapi import FastAPI +from fastapi.responses import HTMLResponse + +from ..layout.page import page + + +def register(app: FastAPI) -> None: + @app.get("/ui/login") + async def ui_login() -> HTMLResponse: + body = """ +

Login

+
+
Paste your API token (without the Bearer prefix).
+
+ +
+
+ + + Go to Tasks +
+
+
+
+
+
+

User Info

+
Shown after login via /api/v2/me.
+
+
+ +
+
+
(not loaded)
+
+
+
+

W&B

+
Weights & Biases local server (v3.6). Metrics are written by training jobs; this UI is for viewing.
+
+
+ Open W&B (:8090) + +
+
+
project: (unknown)
+
base_url (job runtime): (unknown)
+
+""".strip() + script = """ +const tokEl = document.getElementById("tok"); +const msg = document.getElementById("msg"); +const me = document.getElementById("me"); +const wandbOpen = document.getElementById("wandb-open"); +const wandbProject = document.getElementById("wandb-project"); +const wandbBaseUrl = document.getElementById("wandb-base-url"); +document.getElementById("wandb-copy-project").onclick = async () => { await copyText(wandbProject.textContent || ""); }; +wandbOpen.href = curOriginWithPort(8090); +tokEl.value = mvpTokenGet(); + +async function refreshMe() { + me.textContent = "Loading..."; + try { + const obj = await apiJson("/api/v2/me"); + me.textContent = fmtJson(obj); + if (obj.wandb && obj.wandb.enabled) { + wandbProject.textContent = obj.wandb.project_name || "(unknown)"; + wandbBaseUrl.textContent = obj.wandb.base_url || "(unknown)"; + } else { + wandbProject.textContent = "(disabled)"; + wandbBaseUrl.textContent = "(disabled)"; + } + } catch (e) { + me.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); + wandbProject.textContent = "(error)"; + wandbBaseUrl.textContent = "(error)"; + } +} + +document.getElementById("me-refresh").onclick = refreshMe; +document.getElementById("save").onclick = () => { mvpTokenSet(tokEl.value); msg.textContent = "Saved."; refreshMe(); }; +document.getElementById("clear").onclick = () => { mvpTokenSet(""); tokEl.value = ""; msg.textContent = "Cleared."; me.textContent = "(not loaded)"; }; +if (mvpTokenGet()) refreshMe(); +""".strip() + return HTMLResponse(content=page("Login", "login", body, script)) + diff --git a/src/mvp/py/argus/ui/pages/serving.py b/src/mvp/py/argus/ui/pages/serving.py new file mode 100644 index 0000000..cb0c516 --- /dev/null +++ b/src/mvp/py/argus/ui/pages/serving.py @@ -0,0 +1,261 @@ +from __future__ import annotations + +import html +import json + +from fastapi import FastAPI +from fastapi.responses import HTMLResponse + +from ..layout.page import page + + +def register(app: FastAPI) -> None: + @app.get("/ui/serving") + async def ui_serving() -> HTMLResponse: + body = """ +

Serving

+
+
+ + New Model +
+
+
Loading...
+
+""".strip() + script = """ +document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); +const out = document.getElementById("out"); + +function pill(state) { + const s = String(state || ""); + if (s === "RUNNING") return `${s}`; + if (s === "FAILED") return `${s}`; + return `${s}`; +} + +async function refresh() { + out.textContent = "Loading..."; + try { + const lim = 50; + const off = Number(localStorage.getItem("mvp_serving_offset") || "0") || 0; + const resp = await apiJson("/api/v2/serve/models?limit=" + lim + "&offset=" + off + "&include_deleted=0"); + const items = resp.items || []; + const hasMore = !!resp.has_more; + const pageNo = Math.floor(off / lim) + 1; + const prevDisabled = off <= 0; + const nextDisabled = !hasMore; + + const baseHint = curOriginWithPort(8000) + "/serve//v1"; + function row(m) { + const endpoint = (m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/serve/" + encodeURIComponent(m.model_key) + "/v1"); + return ` + ${m.model_key} + ${m.model_id} + ${pill(m.state)} + ${m.num_replicas} × ${m.gpus_per_replica} GPU + ${endpoint} + ${m.updated_at || ""} + `; + } + const rows = items.map(row).join(""); + + out.innerHTML = ` +
+
Per-model OpenAI base: ${baseHint}
+
+ Page ${pageNo} + + +
+
+ + + ${rows || ""} +
Model KeyModel IDStateResourcesEndpointUpdated
(none)
+ `; + + const prevBtn = document.getElementById("prev"); + const nextBtn = document.getElementById("next"); + if (prevBtn) prevBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(Math.max(0, off - lim))); refresh(); }; + if (nextBtn) nextBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(off + lim)); refresh(); }; + } catch (e) { + let text = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); + if (e.body && String(e.body).includes("serving is not enabled")) { + text = "Serving is not enabled in server config.\\nAsk admin to enable `serving:` in dev.yaml."; + } + out.textContent = text; + } +} + +document.getElementById("refresh").onclick = refresh; +refresh(); +""".strip() + return HTMLResponse(content=page("Serving", "serving", body, script)) + + @app.get("/ui/serving/new") + async def ui_serving_new() -> HTMLResponse: + example = """# ServingSpec (YAML) +# 说明: +# - model_id: 这里是 suffix(平台会自动加前缀:--) +# - model_source: 本地模型路径(支持 $HOME 宏;推荐使用 $HOME/common/hf 指向共享 HF cache) +# +# 常用路径: +# - $HOME/common/hf -> /private/hf +# - $HOME -> /private/users/ +# +model_id: qwen-0.5b +model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/ +num_replicas: 1 +gpus_per_replica: 1 + +# engine_kwargs: # 可选:透传给 vLLM +# gpu_memory_utilization: 0.4 +""".strip() + body = f""" +

New Model

+
+
Paste ServingSpec YAML and submit to /api/v2/serve/models.
+
+ +
+
+ + Back +
+
+

+
+""".strip() + script = """ +document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); +const out = document.getElementById("out"); +document.getElementById("submit").onclick = async () => { + out.textContent = "Submitting..."; + const yaml = document.getElementById("yaml").value || ""; + try { + const resp = await apiJson("/api/v2/serve/models", { method: "POST", headers: { "Content-Type": "application/yaml" }, body: yaml }); + out.textContent = "Created: " + resp.model_key + "\\nState: " + resp.state; + if (resp.model_key) window.location.href = "/ui/serving/" + encodeURIComponent(resp.model_key); + } catch (e) { + out.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); + } +}; +""".strip() + return HTMLResponse(content=page("New Model", "serving", body, script)) + + @app.get("/ui/serving/{model_key}") + async def ui_serving_detail(model_key: str) -> HTMLResponse: + body = f""" +

Model

+
+
+
model_key: {html.escape(model_key)}
+ +
+
+
+ + + + +
+
+
Loading...
+
+

Resolved Spec (YAML)

+
(loading)
+
+

Events

+
(loading)
+
+

OpenAI Example

+
(loading)
+
+""".strip() + script = f""" +document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); +const modelKey = {json.dumps(model_key)}; +document.getElementById("openai-models").href = curOriginWithPort(8000) + "/serve/" + encodeURIComponent(modelKey) + "/v1/models"; +const meta = document.getElementById("meta"); +const spec = document.getElementById("spec"); +const eventsEl = document.getElementById("events"); +const example = document.getElementById("example"); +const replicas = document.getElementById("replicas"); + +function pill(state) {{ + const s = String(state || ""); + if (s === "RUNNING") return `${{s}}`; + if (s === "FAILED") return `${{s}}`; + return `${{s}}`; +}} + +function renderEvents(events) {{ + if (!events || !events.length) return "
(none)
"; + const rows = events.map(e => {{ + const payload = (e.payload_json || ""); + const short = String(payload).length > 240 ? String(payload).slice(0, 240) + "..." : String(payload); + return `${{e.created_at || ""}}${{e.event_type}}
${{short}}
`; + }}).join(""); + return `${{rows}}
TimeTypePayload
`; +}} + +async function refresh() {{ + meta.textContent = "Loading..."; + spec.textContent = "(loading)"; + eventsEl.textContent = "(loading)"; + example.textContent = "(loading)"; + try {{ + const obj = await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey)); + const m = obj.model || {{}}; + replicas.value = String(m.num_replicas || 1); + const endpoint = (m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/serve/" + encodeURIComponent(modelKey) + "/v1"); + meta.innerHTML = ` +
+
state: ${{pill(m.state)}}
+
model_id: ${{m.model_id || ""}}
+
source: ${{m.model_source || ""}}
+
+
endpoint: ${{endpoint}}
+ `; + spec.textContent = obj.resolved_spec_yaml || ""; + eventsEl.innerHTML = renderEvents(obj.events || []); + const base = endpoint; + const mid = m.model_id || ""; + example.textContent = `curl -sS -H 'Content-Type: application/json' \\\\\\n -X POST ${{base}}/chat/completions \\\\\\n --data-binary '{{\\"model\\":\\"${{mid}}\\",\\"messages\\":[{{\\"role\\":\\"user\\",\\"content\\":\\"hello\\"}}],\\"max_tokens\\":16,\\"stream\\":false}}' | python3 -m json.tool`; + }} catch (e) {{ + meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); + spec.textContent = ""; + eventsEl.textContent = ""; + example.textContent = ""; + }} +}} + +document.getElementById("scale").onclick = async () => {{ + const n = Number(replicas.value || "1"); + if (!Number.isFinite(n) || n < 1) return; + try {{ + await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "PATCH", headers: {{ "Content-Type": "application/json" }}, body: JSON.stringify({{ num_replicas: n }}) }}); + await refresh(); + }} catch (e) {{ + meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); + }} +}}; + +document.getElementById("delete").onclick = async () => {{ + if (!confirm("Delete this model?")) return; + try {{ + await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "DELETE" }}); + await refresh(); + }} catch (e) {{ + meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); + }} +}}; + +refresh(); +""".strip() + return HTMLResponse(content=page("Model", "serving", body, script)) + diff --git a/src/mvp/py/argus/ui/pages/task_detail.py b/src/mvp/py/argus/ui/pages/task_detail.py new file mode 100644 index 0000000..e70fdd6 --- /dev/null +++ b/src/mvp/py/argus/ui/pages/task_detail.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import html + +from fastapi import FastAPI +from fastapi.responses import HTMLResponse + +from ..layout.page import page + + +def register(app: FastAPI) -> None: + @app.get("/ui/tasks/{task_id}") + async def ui_task_detail(task_id: str) -> HTMLResponse: + safe_id = html.escape(task_id) + body = f""" +

Task: {safe_id}

+
+
+ Logs + + + Back +
+
+
Loading...
+
+
+
+

W&B

+
This is a best-effort hint. Run name maps to ray_submission_id (attempt).
+
+
+ Open W&B (:8090) + +
+
+
project: (unknown)
+
run: (unknown)
+
+
+
+

TaskSpec (YAML)

+
Resolved TaskSpec (includes default values; submission_id reflects latest attempt when available).
+
+
Loading...
+
+""".strip() + script = f""" +document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); +document.getElementById("wandb-open").href = curOriginWithPort(8090); +document.getElementById("wandb-copy-run").onclick = async () => {{ await copyText((document.getElementById("wandb-run").textContent || \"\").trim()); }}; +const out = document.getElementById("out"); +const spec = document.getElementById("spec"); +async function refresh() {{ + out.textContent = "Loading..."; + spec.textContent = "Loading..."; + const resp = await apiFetch("/api/v2/tasks/{task_id}"); + const text = await resp.text(); + if (!resp.ok) {{ out.textContent = "Error: " + resp.status + "\\n" + text; return; }} + const obj = JSON.parse(text); + out.textContent = fmtJson(obj); + const p = document.getElementById("wandb-project"); + const r = document.getElementById("wandb-run"); + const w = (obj.latest_attempt && obj.latest_attempt.wandb) ? obj.latest_attempt.wandb : null; + if (w) {{ + p.textContent = w.project_name || "(unknown)"; + r.textContent = w.run_name || "(unknown)"; + }} else {{ + p.textContent = "(not available)"; + r.textContent = "(not available)"; + }} + + const resp2 = await apiFetch("/api/v2/tasks/{task_id}/spec"); + const text2 = await resp2.text(); + spec.textContent = resp2.ok ? text2 : ("Error: " + resp2.status + "\\n" + text2); +}} +document.getElementById("refresh").onclick = refresh; +document.getElementById("cancel").onclick = async () => {{ + if (!confirm("Cancel this task?")) return; + const resp = await apiFetch("/api/v2/tasks/{task_id}:cancel", {{ method: "POST" }}); + const text = await resp.text(); + out.textContent = (resp.ok ? "Canceled.\\n" : "Error: " + resp.status + "\\n") + text; + setTimeout(refresh, 800); +}}; +refresh(); +""".strip() + return HTMLResponse(content=page(f"Task {task_id}", "tasks", body, script)) + diff --git a/src/mvp/py/argus/ui/pages/task_logs.py b/src/mvp/py/argus/ui/pages/task_logs.py new file mode 100644 index 0000000..a5fbb6f --- /dev/null +++ b/src/mvp/py/argus/ui/pages/task_logs.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +import html + +from fastapi import FastAPI +from fastapi.responses import HTMLResponse + +from ..layout.page import page + + +def register(app: FastAPI) -> None: + @app.get("/ui/tasks/{task_id}/logs") + async def ui_task_logs(task_id: str) -> HTMLResponse: + safe_id = html.escape(task_id) + body = f""" +

Logs: {safe_id}

+
+
+ + + Back +
+
+
Loading...
+
+""".strip() + script = f""" +document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); +const out = document.getElementById("out"); +let timer = null; +async function refresh() {{ + const resp = await apiFetch("/api/v2/tasks/{task_id}/logs?tail=4000"); + const text = await resp.text(); + out.textContent = resp.ok ? text : ("Error: " + resp.status + "\\n" + text); +}} +document.getElementById("refresh").onclick = refresh; +document.getElementById("auto").onchange = (e) => {{ + if (e.target.checked) {{ + timer = setInterval(refresh, 2000); + }} else {{ + if (timer) clearInterval(timer); + timer = null; + }} +}}; +refresh(); +""".strip() + return HTMLResponse(content=page(f"Logs {task_id}", "tasks", body, script)) + diff --git a/src/mvp/py/argus/ui/pages/task_new.py b/src/mvp/py/argus/ui/pages/task_new.py new file mode 100644 index 0000000..4453425 --- /dev/null +++ b/src/mvp/py/argus/ui/pages/task_new.py @@ -0,0 +1,538 @@ +from __future__ import annotations + +import html + +from fastapi import FastAPI +from fastapi.responses import HTMLResponse + +from ..layout.page import page + + +def register(app: FastAPI) -> None: + @app.get("/ui/tasks/new") + async def ui_new_task() -> HTMLResponse: + ppo = """# PPO TaskSpec (YAML) +workload: ppo # 任务类型(必填):ppo|grpo|sft +code_path: /private/common/code/verl/verl_repo # 代码路径(必填):v3.0 固定使用 common 下的 verl 快照(不支持用户自定义代码) +model_id: Qwen/Qwen2.5-0.5B-Instruct # 基础模型(必填):HuggingFace 模型 ID 或 /private/... 本地模型路径 +train_file: /private/common/datasets/gsm8k/train.parquet # 训练数据(必填):parquet 文件路径(支持 /private/common/datasets 或 /private/users//datasets) +val_file: /private/common/datasets/gsm8k/test.parquet # 验证数据(必填):parquet 文件路径(VERL 侧会用来构建 val dataset,不能为 null) + +# nnodes: 2 # 训练节点数(可选,默认:2) +# n_gpus_per_node: 4 # 每节点 GPU 数(可选,默认:4) + +# total_epochs: 1 # 总训练 epoch(可选,默认:1) +# total_training_steps: null # 总训练 step(可选,默认:null;不传则让 VERL 按 epochs 和数据长度自动推导) +# save_freq: 10 # checkpoint 保存频率(step)(可选,默认:10) +# test_freq: null # 验证频率(step)(可选,默认:null;训练端会当成 -1=不验证) + +# submission_id: "" # Ray submission_id(可选,默认空;通常由服务自动生成,无需填写) +""".strip() + grpo = """# GRPO TaskSpec (YAML) +workload: grpo # 任务类型(必填):ppo|grpo|sft(grpo 会自动启用对应的算法配置) +code_path: /private/common/code/verl/verl_repo # 代码路径(必填):v3.0 固定使用 common 下的 verl 快照(不支持用户自定义代码) +model_id: Qwen/Qwen2.5-0.5B-Instruct # 基础模型(必填):HuggingFace 模型 ID 或 /private/... 本地模型路径 +train_file: /private/common/datasets/gsm8k/train.parquet # 训练数据(必填):parquet 文件路径(支持 /private/common/datasets 或 /private/users//datasets) +val_file: /private/common/datasets/gsm8k/test.parquet # 验证数据(必填):parquet 文件路径(VERL 侧会用来构建 val dataset,不能为 null) + +# nnodes: 2 # 训练节点数(可选,默认:2) +# n_gpus_per_node: 4 # 每节点 GPU 数(可选,默认:4) + +# total_epochs: 1 # 总训练 epoch(可选,默认:1) +# total_training_steps: null # 总训练 step(可选,默认:null;不传则让 VERL 按 epochs 和数据长度自动推导) +# save_freq: 10 # checkpoint 保存频率(step)(可选,默认:10) +# test_freq: null # 验证频率(step)(可选,默认:null;训练端会当成 -1=不验证) + +# submission_id: "" # Ray submission_id(可选,默认空;通常由服务自动生成,无需填写) +""".strip() + sft = """# SFT TaskSpec (YAML) +workload: sft # 任务类型(必填):ppo|grpo|sft +code_path: /private/common/code/verl/verl_repo # 代码路径(必填):v3.0 固定使用 common 下的 verl 快照(不支持用户自定义代码) +model_id: Qwen/Qwen2.5-0.5B-Instruct # 基础模型(必填):HuggingFace 模型 ID 或 /private/... 本地模型路径 +train_file: /private/common/datasets/gsm8k_sft/train.parquet # 训练数据(必填):parquet 文件路径(支持 /private/common/datasets 或 /private/users//datasets) +val_file: /private/common/datasets/gsm8k_sft/test.parquet # 验证数据(必填):parquet 文件路径(VERL 侧会用来构建 val dataset,不能为 null) + +# nnodes: 2 # 训练节点数(可选,默认:2;单机可设 1) +# n_gpus_per_node: 4 # 每节点 GPU 数(可选,默认:4;单卡可设 1) + +# total_epochs: 1 # 总训练 epoch(可选,默认:1) +# total_training_steps: null # 总训练 step(可选,默认:null;不传则让 VERL 按 epochs 和数据长度自动推导) +# save_freq: 10 # checkpoint 保存频率(step)(可选,默认:10) +# test_freq: null # 验证频率(step)(可选,默认:null;训练端会当成 -1=不验证) + +# trainer_device: cpu # 仅 SFT 生效:driver 侧 device(可选,默认:cpu) +# submission_id: "" # Ray submission_id(可选,默认空;通常由服务自动生成,无需填写) +""".strip() + adv = """# Advanced TaskSpec (YAML) - v3.6 +kind: advanced # 任务类型(必填):advanced(自定义 command) +# 说明:平台统一按 "advanced" 做任务分类与 task_id 命名(不按 ppo/grpo/sft 细分)。 +# +# 自定义训练命令:平台会做 $HOME 宏替换: +# - $HOME -> /private/users/ +# - $HOME/common/datasets -> /private/datasets(共享只读数据) +# - $HOME/common/hf -> /private/hf(共享只读 HF cache) +# +# W&B(v3.6): +# - 平台会在 runtime_env 注入 WANDB_BASE_URL/WANDB_API_KEY/WANDB_DIR +# - 以及注入以下 env,供 Advanced command 使用(无需用户手动修改 project): +# - MVP_TRAINER_LOGGER: "console" 或 "[console,wandb]" +# - MVP_WANDB_PROJECT: "_project" +# - MVP_WANDB_RUN: "" +# +nnodes: 2 # 训练节点数(必填):用于平台队列调度与资源预检查 +n_gpus_per_node: 4 # 每节点 GPU 数(必填):用于平台队列调度与资源预检查 + +command: | + PYTHONUNBUFFERED=1 \ + python3 -m verl.trainer.main_ppo \ + data.train_files=$HOME/common/datasets/gsm8k/train.parquet \ + data.val_files=$HOME/common/datasets/gsm8k/test.parquet \ + data.train_batch_size=256 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + critic.optim.lr=1e-5 \ + critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.logger=${MVP_TRAINER_LOGGER} \ + trainer.project_name=${MVP_WANDB_PROJECT} \ + trainer.experiment_name=${MVP_WANDB_RUN} \ + trainer.val_before_train=False \ + trainer.nnodes=2 \ + trainer.n_gpus_per_node=4 \ + trainer.total_epochs=1 \ + trainer.total_training_steps=10 \ + trainer.save_freq=10 \ + trainer.test_freq=-1 \ + trainer.resume_mode=disable \ + trainer.default_local_dir=checkpoints \ + +ray_kwargs.ray_init.address=auto \ + hydra.run.dir=logs/hydra + +# 可选:自定义 reward(方式 A:直接写在 command 里) +# command 里增加如下 overrides: +# custom_reward_function.path=$HOME/code/reward.py +# custom_reward_function.name=compute_score +""".strip() + merge = """# Model Merge (YAML) - v3.5 (Advanced command) +# 用途:将 VERL 训练产生的 FSDP 分片 checkpoint 合并为 HuggingFace 格式目录。 +# +# 你需要把 替换成真实值: +# - submission id:在 Tasks 详情页里看到的 `ray_submission_id`(如 mvp2-...--a01) +# - global_step:对应 checkpoints 下的 global_step_xxx 目录(如 global_step_10) +# +# 注意:这里不需要 GPU,建议把 n_gpus_per_node 设为 0,这样不受训练任务占用 GPU 的影响。 +kind: advanced +nnodes: 1 +n_gpus_per_node: 0 + +command: | + python3 -m verl.model_merger merge \ + --backend fsdp \ + --local_dir $HOME/jobs//checkpoints//actor \ + --target_dir $HOME/jobs//checkpoints//actor/huggingface +""".strip() + evaluation = """# Evaluation (YAML) - v3.6 (Advanced command) +# 用途:对“已生成的结果 parquet”做离线评估(基于 custom reward function),示例使用 VERL 内置 main_eval。 +# +# 说明: +# - 你需要准备一个包含生成 responses 的 parquet,并替换 (示例放在某个 job 目录下)。 +# - main_eval 会在 Ray 上并发计算,因此这里也用 +ray_kwargs 连接已有 Ray 集群。 +# +kind: advanced +nnodes: 1 +n_gpus_per_node: 0 + +command: | + PYTHONUNBUFFERED=1 \ + python3 -m verl.trainer.main_eval \ + data.path=$HOME/jobs//outputs/.parquet \ + +ray_kwargs.ray_init.address=auto + +# 可选:指定 parquet schema(按你的 parquet 列名调整) +# data.response_key=responses +# data.data_source_key=data_source +# data.reward_model_key=reward_model +# +# 可选:自定义 reward(方式 A:用户自行提供 reward.py) +# custom_reward_function.path=$HOME/code/reward.py +# custom_reward_function.name=compute_score +""".strip() + body = f""" +

New Task

+
+
+ Paste TaskSpec YAML and submit to API server. + Basic tasks require code_path; Advanced tasks use kind: advanced with a custom command. +
+
+
+ + +
+
+ +
+
+ + + + + + +
+
+ +
+ + + +
+
+ + Back +
+
+

+
+""".strip() + + tpl_ppo = ppo.replace("\n", "\\n").replace('"', '\\"') + tpl_grpo = grpo.replace("\n", "\\n").replace('"', '\\"') + tpl_sft = sft.replace("\n", "\\n").replace('"', '\\"') + tpl_adv = adv.replace("\n", "\\n").replace('"', '\\"') + tpl_eval = evaluation.replace("\n", "\\n").replace('"', '\\"') + tpl_merge = merge.replace("\n", "\\n").replace('"', '\\"') + + script = ( + """ +document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265); +const yamlEl = document.getElementById("yaml"); +const msg = document.getElementById("msg"); + +const TPL_PPO = "__TPL_PPO__"; +const TPL_GRPO = "__TPL_GRPO__"; +const TPL_SFT = "__TPL_SFT__"; +const TPL_ADV = "__TPL_ADV__"; +const TPL_EVAL = "__TPL_EVAL__"; +const TPL_MERGE = "__TPL_MERGE__"; + +function showMode(mode) { + const a = document.getElementById("mode-yaml"); + const b = document.getElementById("mode-form"); + const py = document.getElementById("panel-yaml"); + const pf = document.getElementById("panel-form"); + if (mode === "yaml") { + a.classList.add("active"); b.classList.remove("active"); + py.style.display = ""; pf.style.display = "none"; + } else { + b.classList.add("active"); a.classList.remove("active"); + pf.style.display = ""; py.style.display = "none"; + } +} + +function yamlQuote(v) { + v = String(v || ""); + if (!v) return '""'; + if (/^[A-Za-z0-9_./:-]+$/.test(v) && !v.includes("#")) return v; + return '"' + v.replaceAll("\\\\","\\\\\\\\").replaceAll('"','\\\\"') + '"'; +} + +function updateFormVisibility() { + const tpl = document.getElementById("f-tpl").value; + const basic = document.getElementById("f-basic"); + const adv = document.getElementById("f-adv"); + const sftDev = document.getElementById("f-sft-device-wrap"); + if (tpl === "advanced" || tpl === "merge") { + basic.style.display = "none"; + adv.style.display = ""; + sftDev.style.display = "none"; + } else { + basic.style.display = ""; + adv.style.display = "none"; + sftDev.style.display = (tpl === "sft") ? "" : "none"; + } +} + +function updatePreview() { + const tpl = document.getElementById("f-tpl").value; + if (tpl === "advanced" || tpl === "merge") { + const nnodes = Number(document.getElementById("f-adv-nnodes").value || "1") || 1; + const gpn = Number(document.getElementById("f-adv-gpn").value || "0") || 0; + const cmd = (document.getElementById("f-command").value || "").trimEnd(); + const out = `kind: advanced\\n` + + `nnodes: ${nnodes}\\n` + + `n_gpus_per_node: ${gpn}\\n\\n` + + `command: |\\n` + + cmd.split("\\n").map(line => " " + line).join("\\n") + "\\n"; + return out; + } + + const workload = tpl; + const codePath = document.getElementById("f-code-path").value || ""; + const modelId = document.getElementById("f-model-id").value || ""; + const trainFile = document.getElementById("f-train-file").value || ""; + const valFile = document.getElementById("f-val-file").value || ""; + const nnodes = Number(document.getElementById("f-nnodes").value || "2") || 2; + const gpn = Number(document.getElementById("f-gpn").value || "4") || 4; + const epochs = Number(document.getElementById("f-epochs").value || "1") || 1; + const steps = (document.getElementById("f-steps").value || "").trim(); + const saveFreq = Number(document.getElementById("f-save-freq").value || "10") || 10; + const testFreq = (document.getElementById("f-test-freq").value || "").trim(); + const trainerDevice = document.getElementById("f-trainer-device").value || "cpu"; + + let y = ""; + y += `workload: ${workload}\\n`; + y += `code_path: ${yamlQuote(codePath)}\\n`; + y += `model_id: ${yamlQuote(modelId)}\\n`; + y += `train_file: ${yamlQuote(trainFile)}\\n`; + y += `val_file: ${yamlQuote(valFile)}\\n`; + y += `nnodes: ${nnodes}\\n`; + y += `n_gpus_per_node: ${gpn}\\n`; + y += `total_epochs: ${epochs}\\n`; + if (steps) y += `total_training_steps: ${Number(steps)}\\n`; + y += `save_freq: ${saveFreq}\\n`; + if (testFreq) y += `test_freq: ${Number(testFreq)}\\n`; + if (workload === "sft") y += `trainer_device: ${trainerDevice}\\n`; + return y; +} + +function bindStepper(inputId, decId, incId, min) { + const inp = document.getElementById(inputId); + document.getElementById(decId).onclick = () => { inp.value = String(Math.max(min, (Number(inp.value || "0") || 0) - 1)); inp.dispatchEvent(new Event("input")); }; + document.getElementById(incId).onclick = () => { inp.value = String((Number(inp.value || "0") || 0) + 1); inp.dispatchEvent(new Event("input")); }; +} + +document.getElementById("tpl-ppo").onclick = () => { yamlEl.value = TPL_PPO; }; +document.getElementById("tpl-grpo").onclick = () => { yamlEl.value = TPL_GRPO; }; +document.getElementById("tpl-sft").onclick = () => { yamlEl.value = TPL_SFT; }; +document.getElementById("tpl-adv").onclick = () => { yamlEl.value = TPL_ADV; }; +document.getElementById("tpl-eval").onclick = () => { yamlEl.value = TPL_EVAL; }; +document.getElementById("tpl-merge").onclick = () => { yamlEl.value = TPL_MERGE; }; + +document.getElementById("f-code-path").value = "/private/common/code/verl/verl_repo"; +document.getElementById("f-model-id").value = "Qwen/Qwen2.5-0.5B-Instruct"; +document.getElementById("f-train-file").value = "/private/common/datasets/gsm8k/train.parquet"; +document.getElementById("f-val-file").value = "/private/common/datasets/gsm8k/test.parquet"; +document.getElementById("f-nnodes").value = "2"; +document.getElementById("f-gpn").value = "4"; +document.getElementById("f-epochs").value = "1"; +document.getElementById("f-save-freq").value = "10"; +document.getElementById("f-trainer-device").value = "cpu"; + +document.getElementById("f-adv-nnodes").value = "2"; +document.getElementById("f-adv-gpn").value = "4"; +document.getElementById("f-command").value = `PYTHONUNBUFFERED=1 \\\\\\npython3 -m verl.trainer.main_ppo \\\\\\n data.train_files=$HOME/common/datasets/gsm8k/train.parquet \\\\\\n data.val_files=$HOME/common/datasets/gsm8k/test.parquet \\\\\\n +ray_kwargs.ray_init.address=auto`; + +bindStepper("f-nnodes", "f-nnodes-dec", "f-nnodes-inc", 1); +bindStepper("f-gpn", "f-gpn-dec", "f-gpn-inc", 0); +bindStepper("f-epochs", "f-epochs-dec", "f-epochs-inc", 1); +bindStepper("f-adv-nnodes", "f-adv-nnodes-dec", "f-adv-nnodes-inc", 1); +bindStepper("f-adv-gpn", "f-adv-gpn-dec", "f-adv-gpn-inc", 0); + +document.getElementById("f-tpl").onchange = () => { + const tpl = document.getElementById("f-tpl").value; + if (tpl === "ppo") { + document.getElementById("f-train-file").value = "/private/common/datasets/gsm8k/train.parquet"; + document.getElementById("f-val-file").value = "/private/common/datasets/gsm8k/test.parquet"; + } else if (tpl === "grpo") { + document.getElementById("f-train-file").value = "/private/common/datasets/gsm8k/train.parquet"; + document.getElementById("f-val-file").value = "/private/common/datasets/gsm8k/test.parquet"; + } else if (tpl === "sft") { + document.getElementById("f-train-file").value = "/private/common/datasets/gsm8k_sft/train.parquet"; + document.getElementById("f-val-file").value = "/private/common/datasets/gsm8k_sft/test.parquet"; + } else if (tpl === "advanced") { + document.getElementById("f-adv-nnodes").value = "2"; + document.getElementById("f-adv-gpn").value = "4"; + document.getElementById("f-command").value = TPL_ADV.split("command: |")[1]?.trimStart() || document.getElementById("f-command").value; + } else if (tpl === "merge") { + document.getElementById("f-adv-nnodes").value = "1"; + document.getElementById("f-adv-gpn").value = "0"; + document.getElementById("f-command").value = TPL_MERGE.split("command: |")[1]?.trimStart() || document.getElementById("f-command").value; + } + updateFormVisibility(); + updatePreview(); +}; + +for (const id of ["f-code-path","f-model-id","f-train-file","f-val-file","f-nnodes","f-gpn","f-epochs","f-steps","f-save-freq","f-test-freq","f-trainer-device","f-adv-nnodes","f-adv-gpn","f-command"]) { + const el = document.getElementById(id); + if (el) el.oninput = () => updatePreview(); +} + +document.getElementById("mode-yaml").onclick = () => { + // When switching to YAML, sync from form if form is visible. + const formVisible = document.getElementById("panel-form").style.display !== "none"; + if (formVisible) yamlEl.value = updatePreview(); + showMode("yaml"); +}; +document.getElementById("mode-form").onclick = () => { + showMode("form"); +}; + +showMode("yaml"); + +document.getElementById("submit").onclick = async () => { + msg.textContent = "Submitting..."; + const body = (document.getElementById("panel-form").style.display !== "none") ? updatePreview() : yamlEl.value; + const resp = await apiFetch("/api/v2/tasks", { method: "POST", headers: {"Content-Type":"text/plain"}, body }); + const text = await resp.text(); + if (!resp.ok) { msg.textContent = "Error: " + resp.status + "\\n" + text; return; } + const obj = JSON.parse(text); + msg.textContent = "OK: " + fmtJson(obj); + if (obj.task_id) window.location.href = "/ui/tasks/" + obj.task_id; +}; + """.strip() + .replace("__TPL_PPO__", tpl_ppo) + .replace("__TPL_GRPO__", tpl_grpo) + .replace("__TPL_SFT__", tpl_sft) + .replace("__TPL_ADV__", tpl_adv) + .replace("__TPL_EVAL__", tpl_eval) + .replace("__TPL_MERGE__", tpl_merge) + ) + return HTMLResponse(content=page("New Task", "new", body, script)) + diff --git a/src/mvp/py/argus/ui/pages/tasks.py b/src/mvp/py/argus/ui/pages/tasks.py new file mode 100644 index 0000000..24be064 --- /dev/null +++ b/src/mvp/py/argus/ui/pages/tasks.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from fastapi import FastAPI +from fastapi.responses import HTMLResponse + +from ..layout.page import page + + +def register(app: FastAPI) -> None: + @app.get("/ui/tasks") + async def ui_tasks() -> HTMLResponse: + body = """ +

Tasks

+
+
+ + New Task +
+
+
Loading...
+
+""".strip() + script = """ +const out = document.getElementById("out"); +async function refresh() { + out.textContent = "Loading..."; + try { + const q = await apiJson("/api/v2/queue"); + const completedLimit = 25; + const completedOffset = Number(localStorage.getItem("mvp_completed_offset") || "0") || 0; + const done = await apiJson("/api/v2/tasks?limit=" + completedLimit + "&offset=" + completedOffset + "&states=SUCCEEDED,FAILED,CANCELED"); + + function pill(state) { + const s = String(state || ""); + if (s === "SUCCEEDED") return `${s}`; + if (s === "FAILED") return `${s}`; + if (s === "CANCELED") return `${s}`; + if (s === "RUNNING") return `${s}`; + if (s === "QUEUED" || s === "PENDING_RESOURCES" || s === "SUBMITTING" || s === "SUBMITTED") return `${s}`; + return `${s}`; + } + function row(t) { + const id = t.task_id; + return ` + ${id} + ${t.workload} + ${pill(t.state)} + ${t.nnodes} x ${t.n_gpus_per_node} GPU + ${t.updated_at || ""} + `; + } + + const running = (q.running || []).map(row).join(""); + const pending = (q.pending || []).map(row).join(""); + const doneRows = (done.tasks || []).map(row).join(""); + const pageNo = Math.floor(completedOffset / completedLimit) + 1; + const prevDisabled = completedOffset <= 0; + const nextDisabled = !done.has_more; + out.innerHTML = ` +
Tip: configure token in Login.
+
+

Running

+ ${running || ""}
TaskWorkloadStateResourcesUpdated
(none)
+
+

Pending

+ ${pending || ""}
TaskWorkloadStateResourcesUpdated
(none)
+
+

Completed

+
+
Page ${pageNo}
+
+ + +
+
+ ${doneRows || ""}
TaskWorkloadStateResourcesUpdated
(none)
+ `; + + const prevBtn = document.getElementById("done-prev"); + const nextBtn = document.getElementById("done-next"); + if (prevBtn) prevBtn.onclick = () => { + const cur = Number(localStorage.getItem("mvp_completed_offset") || "0") || 0; + const next = Math.max(0, cur - completedLimit); + localStorage.setItem("mvp_completed_offset", String(next)); + refresh(); + }; + if (nextBtn) nextBtn.onclick = () => { + const cur = Number(localStorage.getItem("mvp_completed_offset") || "0") || 0; + const next = cur + completedLimit; + localStorage.setItem("mvp_completed_offset", String(next)); + refresh(); + }; + } catch (e) { + out.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e)); + } +} +document.getElementById("refresh").onclick = refresh; +refresh(); +""".strip() + return HTMLResponse(content=page("Tasks", "tasks", body, script)) + diff --git a/src/mvp/py/argus/ui/routes.py b/src/mvp/py/argus/ui/routes.py new file mode 100644 index 0000000..142ac67 --- /dev/null +++ b/src/mvp/py/argus/ui/routes.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from fastapi import FastAPI +from fastapi.responses import RedirectResponse + +from .pages import admin, data, login, serving, task_detail, task_logs, task_new, tasks + + +def register_ui_routes(app: FastAPI) -> None: + @app.get("/ui") + async def ui_root() -> RedirectResponse: + return RedirectResponse(url="/ui/tasks") + + login.register(app) + tasks.register(app) + task_new.register(app) + task_detail.register(app) + task_logs.register(app) + serving.register(app) + data.register(app) + admin.register(app) + diff --git a/src/mvp/scripts/30_prepare_data_and_model.sh b/src/mvp/scripts/30_prepare_data_and_model.sh index 4c4a79f..53dac92 100755 --- a/src/mvp/scripts/30_prepare_data_and_model.sh +++ b/src/mvp/scripts/30_prepare_data_and_model.sh @@ -36,7 +36,7 @@ dexec "${HEAD_CONTAINER}" bash -lc " " echo "[head] prepare PPO dataset (gsm8k RL parquet) -> ${PPO_DATA_DIR}" -dexec "${HEAD_CONTAINER}" bash -lc "if [[ -f '${PPO_DATA_DIR}/train.parquet' && -f '${PPO_DATA_DIR}/test.parquet' ]]; then echo 'ppo_dataset_exists: skip'; else python3 /workspace/verl/examples/data_preprocess/gsm8k.py --local_save_dir '${PPO_DATA_DIR}'; fi" +dexec "${HEAD_CONTAINER}" bash -lc "if [[ -f '${PPO_DATA_DIR}/train.parquet' && -f '${PPO_DATA_DIR}/test.parquet' ]]; then echo 'ppo_dataset_exists: skip'; else PYTHONPATH=\"/workspace/verl:${PYTHONPATH:-}\" python3 /workspace/verl/examples/data_preprocess/gsm8k.py --local_save_dir '${PPO_DATA_DIR}'; fi" echo "[head] prepare SFT dataset (gsm8k messages parquet) -> ${SFT_DATA_DIR}" if dexec "${HEAD_CONTAINER}" bash -lc "test -f '${SFT_DATA_DIR}/train.parquet'"; then @@ -82,6 +82,7 @@ PY_CODE="$(cat <<'PY' import os model_id = os.environ["MODEL_ID"] +link_name = model_id.replace("/", "--") hf_home = os.environ.get("HF_HOME", "/private/hf") os.environ.setdefault("HF_HOME", hf_home) @@ -91,12 +92,27 @@ os.environ.setdefault("TRANSFORMERS_CACHE", os.path.join(hf_home, "transformers" from huggingface_hub import snapshot_download try: - snapshot_download(repo_id=model_id, local_files_only=True) - print("model_cache_exists: skip", model_id) + path = snapshot_download(repo_id=model_id, local_files_only=True) + print("model_cache_exists: skip", model_id, path) except Exception: print("model_cache_missing: downloading", model_id) - snapshot_download(repo_id=model_id) - print("model_cached_ok:", model_id) + path = snapshot_download(repo_id=model_id) + print("model_cached_ok:", model_id, path) + +# v3.0 path policy: use a stable symlink under /private/common/models/... +common_models_dir = "/private/common/models" +os.makedirs(common_models_dir, exist_ok=True) +dst = os.path.join(common_models_dir, link_name) +try: + if os.path.islink(dst) or os.path.exists(dst): + os.unlink(dst) +except Exception: + pass +try: + os.symlink(path, dst) + print("model_common_link_ok:", dst) +except Exception as e: + print("WARN: model_common_link_failed:", dst, repr(e)) PY )" diff --git a/src/mvp/scripts/run_all_v30_api.sh b/src/mvp/scripts/run_all_v30_api.sh index f2e9917..4be775c 100755 --- a/src/mvp/scripts/run_all_v30_api.sh +++ b/src/mvp/scripts/run_all_v30_api.sh @@ -20,7 +20,7 @@ RESET_SFTPGO="${RESET_SFTPGO:-0}" EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers CLUSTER_NAME="${CLUSTER_NAME:-argus-ray}" -CONFIG_IN_CONTAINER="${CONFIG_IN_CONTAINER:-/workspace/mvp/configs/dev_v30.yaml}" +CONFIG_IN_CONTAINER="${CONFIG_IN_CONTAINER:-/workspace/mvp/configs/dev.yaml}" SFTPGO_ADMIN_PASSWORD="${SFTPGO_ADMIN_PASSWORD:-my-dev-sftpgo-admin}" export SFTPGO_ADMIN_PASSWORD @@ -105,7 +105,12 @@ submit_taskspec_inline() { local resp resp="$(curl -sS -H "Authorization: Bearer ${token}" -H "Content-Type: application/yaml" --data-binary "${yaml_body}" "${API_ADDR}/api/v2/tasks")" echo "[host] submit_resp: ${resp}" >&2 - printf '%s' "${resp}" | python3 -c 'import sys,json; print(json.load(sys.stdin)["task_id"])' + printf '%s' "${resp}" | python3 -c 'import sys,json; o=json.load(sys.stdin); tid=o.get("task_id",""); print(tid) if tid else sys.exit(42)' + local rc=$? + if [[ "${rc}" != "0" ]]; then + echo "ERROR: submit failed: ${resp}" >&2 + return 1 + fi } wait_task() { @@ -154,12 +159,15 @@ ray_wait_ready 60 echo "[host] wait sftpgo ready" sftpgo_wait_ready 60 "http://127.0.0.1:8081/api/v2/token" -echo "[host] render v3.0 config with SFTPGo container IP (work around docker DNS issues)" +echo "[host] render config with SFTPGo container IP (work around docker DNS issues)" SFTPGO_IP="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' argus-sftpgo)" -RENDERED_CFG_HOST_PATH="/tmp/dev_v30_rendered.yaml" -sed -E "s#^(\\s*admin_api_base:) .*#\\1 \"http://${SFTPGO_IP}:8080/api/v2\"#g" "${ROOT_DIR}/configs/dev_v30.yaml" >"${RENDERED_CFG_HOST_PATH}" -docker cp "${RENDERED_CFG_HOST_PATH}" "${HEAD_CONTAINER}:/tmp/dev_v30_rendered.yaml" -CONFIG_IN_CONTAINER="/tmp/dev_v30_rendered.yaml" +RENDERED_CFG_HOST_PATH="/tmp/dev_rendered.yaml" +sed -E \ + -e "s#^(\\s*admin_api_base:) .*#\\1 \"http://${SFTPGO_IP}:8080/api/v2\"#g" \ + -e "/^ sftpgo:/,/^ retention:/ s#^(\\s*host:) .*#\\1 \"127.0.0.1\"#g" \ + "${ROOT_DIR}/configs/dev.yaml" >"${RENDERED_CFG_HOST_PATH}" +docker cp "${RENDERED_CFG_HOST_PATH}" "${HEAD_CONTAINER}:/tmp/dev_rendered.yaml" +CONFIG_IN_CONTAINER="/tmp/dev_rendered.yaml" echo "[host] verify head discovery record (supervised in-container)" HEAD_IP_FILE="${SHARED_ROOT}/ray/discovery/${CLUSTER_NAME}/head.json" @@ -224,14 +232,7 @@ dexec "${HEAD_CONTAINER}" bash -lc "set -euo pipefail; \ (cp -f /private/common/datasets/gsm8k_sft/train.parquet '/private/users/${USER_ID}/datasets/gsm8k_sft/train.parquet' 2>/dev/null || cp -f /private/datasets/gsm8k_sft/train.parquet '/private/users/${USER_ID}/datasets/gsm8k_sft/train.parquet' 2>/dev/null || true); \ (cp -f /private/common/datasets/gsm8k_sft/test.parquet '/private/users/${USER_ID}/datasets/gsm8k_sft/test.parquet' 2>/dev/null || cp -f /private/datasets/gsm8k_sft/test.parquet '/private/users/${USER_ID}/datasets/gsm8k_sft/test.parquet' 2>/dev/null || true)" -echo "[host] resolve local model snapshot path (avoid HF mirror 429 for vllm rollout)" -LOCAL_MODEL_PATH="$(dexec "${HEAD_CONTAINER}" bash -lc "python3 - <<'PY'\nimport os\nfrom huggingface_hub import snapshot_download\nmodel_id=os.environ.get('MODEL_ID','Qwen/Qwen2.5-0.5B-Instruct')\nos.environ.setdefault('HF_HOME','/private/hf')\ntry:\n p=snapshot_download(repo_id=model_id, local_files_only=True)\n print(p)\nexcept Exception as e:\n raise SystemExit(f'ERROR: model snapshot not in cache; run 30_prepare_data_and_model.sh first. {e!r}')\nPY\n" MODEL_ID='Qwen/Qwen2.5-0.5B-Instruct' | tail -n 1)" -if [[ -z "${LOCAL_MODEL_PATH}" || "${LOCAL_MODEL_PATH}" != /* ]]; then - echo "ERROR: failed to resolve LOCAL_MODEL_PATH: ${LOCAL_MODEL_PATH}" >&2 - exit 1 -fi -echo "[host] local_model_path: ${LOCAL_MODEL_PATH}" - +LOCAL_MODEL_PATH="Qwen/Qwen2.5-0.5B-Instruct" echo "[host] submit PPO/GRPO/SFT via API using user dataset paths" PPO_TASK_ID="$(submit_taskspec_inline "${USER_TOKEN}" $'workload: ppo\nnnodes: 2\nn_gpus_per_node: 4\ncode_path: /private/common/code/verl/verl_repo\ntrain_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/train.parquet\nval_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/test.parquet\nmodel_id: '"${LOCAL_MODEL_PATH}"$'\ntotal_epochs: 1\ntotal_training_steps: 10\nsave_freq: 10\n')" GRPO_TASK_ID="$(submit_taskspec_inline "${USER_TOKEN}" $'workload: grpo\nnnodes: 2\nn_gpus_per_node: 4\ncode_path: /private/common/code/verl/verl_repo\ntrain_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/train.parquet\nval_file: /private/users/'"${USER_ID}"$'/datasets/gsm8k/test.parquet\nmodel_id: '"${LOCAL_MODEL_PATH}"$'\ntotal_epochs: 1\ntotal_training_steps: 10\nsave_freq: 10\n')" diff --git a/walkthrough.md b/walkthrough.md new file mode 100644 index 0000000..3427b9c --- /dev/null +++ b/walkthrough.md @@ -0,0 +1,27 @@ +# 源码走读 +## API Server +API Server是平台服务入口,提供API和web UI功能。 + +- 入口:src/mvp/py/server.py +- 配置文件:src/mvp/configs/dev.yaml,解析成V2Config数据结构 +- 核心模块: + - service:服务层,包括API服务,任务调度,模型调度等后台任务 + - core: 核心数据结构,任务ID,submission ID,model serving ID定义 + - ray: 对接ray cluster的client + +## service核心模块 +API Server核心模块包括: +- app.py: API 入口层,提供webui,将API请求转化为各类数据库操作,以及协调SFTPGo服务 +- scheduler.py:定时任务,调度训练任务队列,提交ray job,更新数据库中任务状态 +- serving_reconciler.py: 定时任务,模型推理调度,提交ray serve app,调整副本数,更新数据库中模型实例状态 +- db.py: 数据库操作基础工具类 +- janitor.py:定时清理训练任务中间过程数据 +- sftpgo.py: 对接SFTPGo的客户端,创建用户、重置密码、初始化用户目录等 + +## core核心模块 +- ray_job_tool.py:通过builder模板构建ray job命令,提交ray job到ray集群。 +- head_publisher.py: Ray head节点自动写head IP 文件到共享存储 +- worker_watchdog.py: 自动检查最新head IP文件,加入head的ray集群。 +- models: 定义RayConfig/JobSepc/AdvancedTaskSpec等核心数据结构 + +