v3.8 model serving deployed successfully

This commit is contained in:
yuyr 2026-01-06 22:43:29 +08:00
parent 63963eba29
commit 686739fea2
39 changed files with 6772 additions and 1830 deletions

File diff suppressed because it is too large

specs/mvp/v3.8/ray_serve.md (new file, 314 lines)

@ -0,0 +1,314 @@
API reference material
https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html
ray.serve.llm.LLMConfig
pydantic model ray.serve.llm.LLMConfig[source]
The configuration for starting an LLM deployment.
PublicAPI (alpha): This API is in alpha and may change before becoming stable.
field accelerator_type: str | None = None
The type of accelerator runs the model on. Only the following values are supported: [V100, P100, T4, P4, K80, A10G, L4, L40S, A100, H100, H200, H20, B200, Intel-GPU-Max-1550, Intel-GPU-Max-1100, Intel-GAUDI, AMD-Instinct-MI100, AMD-Instinct-MI250X, AMD-Instinct-MI250X-MI250, AMD-Instinct-MI210, AMD-Instinct-MI300A, AMD-Instinct-MI300X-OAM, AMD-Instinct-MI300X-HF, AMD-Instinct-MI308X, AMD-Instinct-MI325X-OAM, AMD-Instinct-MI350X-OAM, AMD-Instinct-MI355X-OAM, AMD-Radeon-R9-200-HD-7900, AMD-Radeon-HD-7900, aws-neuron-core, TPU-V2, TPU-V3, TPU-V4, TPU-V5P, TPU-V5LITEPOD, TPU-V6E, Ascend910B, Ascend910B4, MXC500, MXC550, A100-40G, A100-80G]
field callback_config: CallbackConfig [Optional]
Callback configuration to use for model initialization. Can be a string path to a class or a Callback subclass.
field deployment_config: Dict[str, Any] [Optional]
The Ray @serve.deployment options. Supported fields are: name, num_replicas, ray_actor_options, max_ongoing_requests, autoscaling_config, max_queued_requests, user_config, health_check_period_s, health_check_timeout_s, graceful_shutdown_wait_loop_s, graceful_shutdown_timeout_s, logging_config, request_router_config. For more details, see the Ray Serve Documentation.
field engine_kwargs: Dict[str, Any] = {}
Additional keyword arguments for the engine. In case of vLLM, this will include all the configuration knobs they provide out of the box, except for tensor-parallelism which is set automatically from Ray Serve configs.
field experimental_configs: Dict[str, Any] [Optional]
Experimental configurations for Ray Serve LLM. This is a dictionary of key-value pairs. Current supported keys are: - stream_batching_interval_ms: Ray Serve LLM batches streaming requests together. This config decides how long to wait for the batch before processing the requests. Defaults to 50.0. - num_ingress_replicas: The number of replicas for the router. Ray Serve will take the max amount all the replicas. Default would be 2 router replicas per model replica.
field llm_engine: str = 'vLLM'
The LLMEngine that should be used to run the model. Only the following values are supported: [vLLM]
field log_engine_metrics: bool | None = True
Enable additional engine metrics via Ray Prometheus port.
field lora_config: Dict[str, Any] | LoraConfig | None = None
Settings for LoRA adapter. Validated against LoraConfig.
field model_loading_config: Dict[str, Any] | ModelLoadingConfig [Required]
The settings for how to download and expose the model. Validated against ModelLoadingConfig.
field placement_group_config: Dict[str, Any] | None = None
Ray placement group configuration for scheduling vLLM engine workers. Defines resource bundles and placement strategy for multi-node deployments. Should contain bundles (list of resource dicts) and optionally strategy (defaults to PACK). Example: {bundles: [{GPU: 1, CPU: 2}], strategy: PACK}
field runtime_env: Dict[str, Any] | None = None
The runtime_env to use for the model deployment replica and the engine workers.
apply_checkpoint_info(model_id_or_path: str, trust_remote_code: bool = False) → None[source]
Apply the checkpoint info to the model config.
classmethod from_file(path: str, **kwargs) → ModelT
Load a model from a YAML file path.
get_engine_config() → None | VLLMEngineConfig[source]
Returns the engine config for the given LLM config.
LLMConfig not only has engine config but also deployment config, etc.
get_or_create_callback() → CallbackBase | None[source]
Get or create the callback instance for this process.
This ensures one callback instance per process (singleton pattern). The instance is cached so the same object is used across all hooks.
Returns: an instance of a class that implements Callback
multiplex_config() → ServeMultiplexConfig[source]
classmethod parse_yaml(file, **kwargs) → ModelT
setup_engine_backend()[source]
update_engine_kwargs(**kwargs: Any) → None[source]
Update the engine_kwargs and the engine_config engine_kwargs.
This is typically called during engine starts, when certain engine_kwargs (e.g., data_parallel_rank) become available.
validator validate_accelerator_type » accelerator_type[source]
validator validate_deployment_config » deployment_config[source]
Validates the deployment config dictionary.
validator validate_experimental_configs » experimental_configs[source]
Validates the experimental configs dictionary.
validator validate_llm_engine » llm_engine[source]
Validates the llm_engine string value.
validator validate_lora_config » lora_config[source]
Validates the lora config dictionary.
validator validate_model_loading_config » model_loading_config[source]
Validates the model loading config dictionary.
property input_modality: str
Returns the input modality of the model. There could be more types in the future. Right now it assumes that if the model doesn't support vision, it'll be text.
property max_request_context_length: int | None
property model_architecture: str
property model_id: str
property supports_vision: bool
# Python API
Ray Serve API
https://docs.ray.io/en/latest/serve/api/index.html#serve-api
Python API
Writing Applications
serve.Deployment
Class (or function) decorated with the @serve.deployment decorator.
serve.Application
One or more deployments bound with arguments that can be deployed together.
Deployment Decorators
serve.deployment
Decorator that converts a Python class to a Deployment.
serve.ingress
Wrap a deployment class with an ASGI application for HTTP request parsing.
serve.batch
Converts a function to asynchronously handle batches.
serve.multiplexed
Wrap a callable or method used to load multiplexed models in a replica.
Deployment Handles
Note
The deprecated RayServeHandle and RayServeSyncHandle APIs have been fully removed as of Ray 2.10. See the model composition guide for how to update code to use the DeploymentHandle API instead.
serve.handle.DeploymentHandle
A handle used to make requests to a deployment at runtime.
serve.handle.DeploymentResponse
A future-like object wrapping the result of a unary deployment handle call.
serve.handle.DeploymentResponseGenerator
A future-like object wrapping the result of a streaming deployment handle call.
Running Applications
serve.start
Start Serve on the cluster.
serve.run
Run an application and return a handle to its ingress deployment.
serve.delete
Delete an application by its name.
serve.status
Get the status of Serve on the cluster.
serve.shutdown
Completely shut down Serve on the cluster.
serve.shutdown_async
Completely shut down Serve on the cluster asynchronously.
Configurations
serve.config.ProxyLocation
Config for where to run proxies to receive ingress traffic to the cluster.
serve.config.gRPCOptions
gRPC options for the proxies.
serve.config.HTTPOptions
HTTP options for the proxies.
serve.config.AutoscalingConfig
Config for the Serve Autoscaler.
serve.config.AutoscalingPolicy
PublicAPI (alpha): This API is in alpha and may change before becoming stable.
serve.config.AutoscalingContext
Rich context provided to custom autoscaling policies.
serve.config.AggregationFunction
An enumeration.
serve.config.RequestRouterConfig
Config for the Serve request router.
Schemas
serve.schema.ServeActorDetails
Detailed info about a Ray Serve actor.
serve.schema.ProxyDetails
Detailed info about a Ray Serve ProxyActor.
serve.schema.ApplicationStatusOverview
Describes the status of an application and all its deployments.
serve.schema.ServeStatus
Describes the status of Serve.
serve.schema.DeploymentStatusOverview
Describes the status of a deployment.
serve.schema.EncodingType
Encoding type for the serve logs.
serve.schema.AutoscalingMetricsHealth
An enumeration.
serve.schema.AutoscalingStatus
An enumeration.
serve.schema.ScalingDecision
One autoscaling decision with minimal provenance.
serve.schema.DeploymentAutoscalingDetail
Deployment-level autoscaler observability.
serve.schema.ReplicaRank
Replica rank model.
Request Router
serve.request_router.ReplicaID
A unique identifier for a replica.
serve.request_router.PendingRequest
A request that is pending execution by a replica.
serve.request_router.RunningReplica
Contains info on a running replica.
serve.request_router.FIFOMixin
Mixin for FIFO routing.
serve.request_router.LocalityMixin
Mixin for locality routing.
serve.request_router.MultiplexMixin
Mixin for multiplex routing.
serve.request_router.RequestRouter
Abstract interface for a request router (how the router calls it).
Advanced APIs
serve.get_replica_context
Returns the deployment and replica tag from within a replica at runtime.
serve.context.ReplicaContext
Stores runtime context info for replicas.
serve.get_multiplexed_model_id
Get the multiplexed model ID for the current request.
serve.get_app_handle
Get a handle to the application's ingress deployment by name.
serve.get_deployment_handle
Get a handle to a deployment by name.
serve.grpc_util.RayServegRPCContext
Context manager to set and get gRPC context.
serve.exceptions.BackPressureError
Raised when max_queued_requests is exceeded on a DeploymentHandle.
serve.exceptions.RayServeException
serve.exceptions.RequestCancelledError
Raise when a Serve request is cancelled.
serve.exceptions.DeploymentUnavailableError
Raised when a Serve deployment is unavailable to receive requests.
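A minimal sketch tying several of the deployment and handle APIs above together (the deployment class and app name are illustrative; assumes a running local Ray instance):
```python
from ray import serve

@serve.deployment(num_replicas=1)
class Greeter:
    def __call__(self, name: str) -> str:
        return f"Hello, {name}!"

app = Greeter.bind()                      # serve.Application
serve.run(app, name="greeter", route_prefix=None)

handle = serve.get_app_handle("greeter")  # DeploymentHandle to the app's ingress deployment
print(handle.remote("Ray").result())      # DeploymentResponse -> "Hello, Ray!"

serve.delete("greeter")                   # tear the application down
```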


@ -0,0 +1,87 @@
Based on the provided sources, here are the principles and an operational plan for dynamically deploying a **medium-sized LLM** using the **Builder Pattern** with Ray Serve and vLLM.
### I. Core Principles
1. **What counts as a medium-sized LLM**: medium models (e.g., Llama-3.1-70B) typically have around 70B parameters. They usually run on a **single node** using **4 to 8 GPUs**.
2. **Builder Pattern mechanism**: the pattern offers a high-level abstraction through the `build_openai_app` function. Developers only define an `LLMConfig` object, and the underlying `LLMServer` and `OpenAiIngress` components are built and wired together automatically.
3. **High-performance backend (vLLM)**: Ray Serve LLM uses vLLM as the inference engine, providing high-throughput inference and GPU memory management.
4. **Dynamic scaling and resource scheduling**:
   * **Tensor parallelism**: `tensor_parallel_size` distributes the model weights evenly across all GPUs of a single node.
   * **Replica autoscaling**: `autoscaling_config` adjusts `min_replicas` and `max_replicas` dynamically, so the service can add or remove inference replicas based on live traffic.
---
### II. Operational Plan
#### 1. Environment setup
Make sure the required dependencies are installed and a Hugging Face access token is configured (needed for gated models such as Llama-3.1).
```bash
pip install "ray[serve,llm]"
export HF_TOKEN=<YOUR_HUGGINGFACE_TOKEN>
```
#### 2. Write the deployment script (`serve_medium_llm.py`)
Define the configuration with the **Builder Pattern** and build the application. The example below configures a typical 70B model deployment:
```python
# serve_medium_llm.py
from ray.serve.llm import LLMConfig, build_openai_app
import os
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="my-llama-3.1-70b",
        model_source="meta-llama/Llama-3.1-70B-Instruct",
    ),
    accelerator_type="A100-40G",  # or L40S
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,  # minimum number of replicas
            max_replicas=4,  # maximum number of replicas, enables dynamic scaling
        )
    ),
    runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}),
    engine_kwargs=dict(
        max_model_len=32768,  # context length
        tensor_parallel_size=8,  # split the weights across the 8 GPUs of a single node
    ),
)

# Build the application with the Builder Pattern
app = build_openai_app({"llm_configs": [llm_config]})
```
#### 3. Launch the deployment
Run the following command in a terminal to start the service:
```bash
serve run serve_medium_llm:app
```
Deployment usually takes a few minutes, covering cluster setup, starting the vLLM servers, and downloading the model weights.
#### 4. Send a test request
Once the service is up, it can be accessed through an OpenAI-compatible endpoint.
```python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="FAKE_KEY")
response = client.chat.completions.create(
    model="my-llama-3.1-70b",
    messages=[{"role": "user", "content": "Explain what quantum entanglement is."}],
    stream=True,
)
for chunk in response:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```
---
### III. Performance and Concurrency Tuning
* **Increase concurrency**: lowering `max_model_len` reduces the GPU memory needed for the KV cache and can significantly raise the maximum number of concurrent requests each replica can serve (see the sketch after this list).
* **Monitor metrics**: use the Ray Serve LLM dashboard to track **TTFT (time to first token)**, **TPOT (time per output token)**, and **token throughput** to evaluate service performance.
* **Precision trade-off**: in resource-constrained settings, a **quantized model** (e.g., FP8) reduces the model's memory footprint, leaving more room for the KV cache and thus allowing higher concurrency.
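A hedged sketch of how these knobs could be expressed via `engine_kwargs` (the values are illustrative; whether FP8 quantization is usable depends on the vLLM build and the hardware):
```python
# Illustrative engine_kwargs for higher concurrency; tune to your model and GPUs.
engine_kwargs = dict(
    max_model_len=16384,          # shorter context -> smaller KV cache per request
    gpu_memory_utilization=0.9,   # fraction of GPU memory vLLM may use
    # quantization="fp8",         # optional: shrink weight memory if supported
)
```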
**An analogy**:
Deploying a **medium-sized LLM** is like assembling a complex precision machine (the model weights) in a large workshop. The **Builder Pattern** is your fully automated assembly line: you only set the machine's parameters (the config), and the line fastens the parts and hooks up the power for you. **vLLM with tensor parallelism** is like having 8 skilled workers (the GPUs) lift the heavy machine together, each bearing only its own share of the weight, so the machine runs smoothly.


@ -0,0 +1,8 @@
1. Dynamically launch LLMs via Ray Serve (vLLM backend), supporting multi-model application deployment
2. By default each model has a single replica; users can configure more
3. Users can delete (undeploy) models
4. The number of GPUs per model can be specified
5. Configuration via the WebUI: view the list of currently deployed models and inspect their details
6. The model path can use the common path or a user-specified user path
7.

specs/mvp/v3.8/v3.8_api.md (new file, 224 lines)

@ -0,0 +1,224 @@
# MVP v3.8 API Reference (Serving)
> Note: this section covers the **Model Serving** APIs added in v3.8 (Ray Serve LLM / vLLM).
> Authentication: the Serving management APIs reuse the existing MVP API authentication (`Authorization: Bearer <user_token>`).
> Inference: the public OpenAI endpoint is **unauthenticated** (v3.8 convention).
## 0. Basics
### 0.1 Base URLs
- MVP API server: `http://<host>:8080`
- Ray Serve OpenAI ingress (fixed port 8000): `http://<host>:8000/v1`
### 0.2 Authentication
All `/api/v2/serve/*` endpoints require:
```
Authorization: Bearer <user_token>
```
The `user_token` is issued by an administrator via `/api/v2/users/<user_id>/tokens` (same mechanism as before).
### 0.3 Naming rule: `model_id = user_id-YYYYMMDDHHMM-<suffix>`
- The user fills in `model_id` (semantically a suffix) when submitting, e.g. `qwen-0.5b`
- The platform generates the prefix:
  - `prefix = "<user_id>-<YYYYMMDDHHMM>"`
- The OpenAI model name actually exposed by the platform is:
  - `model_id = "<prefix>-<suffix>"`
- Example: `alice-202601061235-qwen-0.5b`
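A minimal sketch of this naming rule (the helper name `make_model_id` is illustrative, not part of the platform API):
```python
from datetime import datetime, timezone

def make_model_id(user_id: str, suffix: str, now: datetime | None = None) -> str:
    # prefix = "<user_id>-<YYYYMMDDHHMM>", model_id = "<prefix>-<suffix>"
    now = now or datetime.now(timezone.utc)
    prefix = f"{user_id}-{now.strftime('%Y%m%d%H%M')}"
    return f"{prefix}-{suffix}"

# e.g. make_model_id("alice", "qwen-0.5b") -> "alice-202601061235-qwen-0.5b"
```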
## 1. Data Structures
### 1.1 ServingSpec (YAML)
Request bodies should use YAML (consistent with TaskSpec); example:
```yaml
model_id: qwen-0.5b              # required (suffix; the platform adds the user_id- prefix automatically)
model_source: $HOME/common/hf/.../<sha> # required: local path or repo id (the platform applies $HOME macro substitution and path validation)
num_replicas: 1                  # optional, default 1
gpus_per_replica: 1              # optional, default 1
# engine_kwargs:                 # optional: vLLM parameters passed through (allowlist/blocklist decided by the implementation)
# max_model_len: 8192
# gpu_memory_utilization: 0.9
```
Notes:
- `accelerator_type` is not exposed in the ServingSpec; the platform config (`serving.llm.accelerator_type` in `dev.yaml`) injects it uniformly into Ray Serve LLM's `LLMConfig.accelerator_type` (dev/h1: `H20`).
#### Macro substitution
- `$HOME` → `/private/users/<user_id>`
- `$HOME/common/hf` → `/private/hf`
- `$HOME/common/datasets` → `/private/datasets` (not strictly needed for serving, but kept for consistent semantics)
#### Path validation (v3.8 convention)
Allowed `model_source` values:
- `/private/hf/...` (common)
- `/private/users/<user_id>/...` (user)
Rejected:
- Other users' directories
- Paths outside `/private`
- Empty paths or suspicious paths containing `..`
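A sketch of the substitution and validation rules above (the function name is illustrative; the real implementation lives in `serving_spec.py`):
```python
def resolve_model_source(raw: str, user_id: str) -> str:
    # Macro substitution (most specific macros first).
    path = raw.replace("$HOME/common/hf", "/private/hf")
    path = path.replace("$HOME/common/datasets", "/private/datasets")
    path = path.replace("$HOME", f"/private/users/{user_id}")
    # Path validation per the v3.8 convention.
    if not path or ".." in path:
        raise ValueError("empty or suspicious model_source path")
    if not path.startswith(("/private/hf/", f"/private/users/{user_id}/")):
        raise PermissionError("model_source outside the allowed roots")
    return path
```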
### 1.2 ServingModel (response body, JSON)
```json
{
"model_key": "svc-alice-20260106-123000-abcd",
"user_id": "alice",
"model_id": "alice-202601061235-qwen-0.5b",
"model_id_suffix": "qwen-0.5b",
"model_id_prefix": "alice-202601061235",
"model_source": "/private/hf/hub/models--.../snapshots/<sha>",
"num_replicas": 1,
"gpus_per_replica": 1,
"total_gpus": 1,
"state": "RUNNING",
"endpoint": {
"openai_base_url": "http://<host>:8000/v1",
"model": "alice-202601061235-qwen-0.5b"
},
"error_summary": null,
"created_at": "2026-01-06T12:30:00Z",
"updated_at": "2026-01-06T12:31:02Z"
}
```
## 2. Management APIs (MVP API server)
### 2.1 Create / Upsert model
`POST /api/v2/serve/models`
#### Request
- Header: `Content-Type: application/yaml`
- Body: ServingSpec (YAML)
#### Response (202)
```json
{
"model_key": "svc-alice-20260106-123000-abcd",
"state": "QUEUED"
}
```
Semantics:
- Create a new model (if the suffix does not exist)
- Or update an existing model (if the same user already has that suffix): update replicas/GPU settings and enter `QUEUED`, waiting for the reconciler to apply
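A hedged client-side sketch of this call (uses the `requests` package; host, token, and model source are placeholders):
```python
import requests

spec_yaml = """\
model_id: qwen-0.5b
model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<sha>
num_replicas: 1
gpus_per_replica: 1
"""

resp = requests.post(
    "http://<host>:8080/api/v2/serve/models",
    headers={
        "Authorization": "Bearer <user_token>",
        "Content-Type": "application/yaml",
    },
    data=spec_yaml,
)
print(resp.status_code, resp.json())  # per this spec: 202 with {"model_key": ..., "state": "QUEUED"}
```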
### 2.2 List models (current user)
`GET /api/v2/serve/models`
#### Response (200)
```json
{
"items": [ ... ServingModel ... ],
"openai_base_url": "http://<host>:8000/v1"
}
```
### 2.3 Get model detail
`GET /api/v2/serve/models/{model_key}`
#### Response (200)
```json
{
"model": { ... ServingModel ... },
"resolved_spec_yaml": "model_id: ...\nmodel_source: ...\n",
"events": [
{ "event_type": "DEPLOY_REQUESTED", "created_at": "...", "payload": {...} }
],
"serve_status": {
"app_name": "argus_llm_app",
"app_status": "RUNNING"
}
}
```
### 2.4 Scale replicas (PATCH)
`PATCH /api/v2/serve/models/{model_key}`
#### Request (JSON)
```json
{ "num_replicas": 2 }
```
#### Response (200)
```json
{ "model_key": "...", "state": "QUEUED" }
```
> v3.8 only supports changing `num_replicas` (plus optional engine_kwargs); changing `gpus_per_replica` may trigger a redeployment.
### 2.5 Delete / Undeploy model
`DELETE /api/v2/serve/models/{model_key}`
#### Response (200)
```json
{ "model_key": "...", "state": "DELETING" }
```
Semantics: the model is removed from the declarative configuration; on the next tick the reconciler triggers `serve.run(...)` to update the app configuration and eventually makes the model invisible.
### 2.6 Admin: Serve cluster status (optional)
`GET /api/v2/serve/status`
#### Response (200)
Returns a summary of `serve.status()` (cluster level + app level).
> Accessible only with an admin token (reusing the v3.x admin gate)
## 3. Inference APIs (Ray Serve OpenAI ingress)
> No authentication in v3.8: no `Authorization` header is required.
### 3.1 List models
`GET http://<host>:8000/v1/models`
Returns the list of available models (including prefixed names such as `alice-202601061235-qwen-0.5b`).
### 3.2 Chat completions
`POST http://<host>:8000/v1/chat/completions`
```json
{
"model": "alice-202601061235-qwen-0.5b",
"messages": [{"role":"user","content":"Hello"}],
"stream": false
}
```
### 3.3 Completions / Embeddings
Provided according to what the Ray Serve LLM OpenAI ingress supports (v3.8 acceptance covers at least chat).
## 4. Error Code Conventions (MVP API server)
- `400 invalid yaml/spec`: YAML parse failure, missing fields, illegal values
- `403 forbidden`: path violation (model_source points into another user's directory)
- `409 conflict`: model_id_suffix conflict (when the same user creates a duplicate and overwriting is not allowed; not returned if upsert is chosen)
- `422 unprocessable`: illegal resource parameters (replica/gpu <= 0)
- `500 internal`: reconciler/serve call failure (details recorded in `serve_events` and written to `error_summary`)


@ -0,0 +1,371 @@
# MVP v3.8 Detailed Design: Dynamic Model Deployment and Management with Ray Serve (vLLM)
> Baseline: v3.7 capabilities are already in place (training platform + W&B + SFTPGo + WebUI/API + Ray stateless pool; training defaults to rollout=vllm).
> v3.8 goal: on the same Ray cluster, introduce model inference serving via **Ray Serve LLM (vLLM backend)** and manage the model lifecycle dynamically through the WebUI/API.
## 0. Scope of Requirements (from requirements.md)
1) Dynamically launch LLMs via Ray Serve (vLLM backend), supporting **multi-model application** deployment
2) One replica per model by default; users may configure more
3) Users can delete (undeploy) models
4) Users can specify how many GPUs a model uses
5) The WebUI can configure models, list deployed models, and show details
6) The model path may be a common path or a user path (local paths)
## 1. Overall Architecture
### 1.1 Component relationships
v3.8 adds a **Serving subsystem** on top of the existing training platform:
- **API server (existing)**
  - New Serving APIs (model deploy/delete/scale/status)
  - New Serving background thread (reconciler): periodically aligns the DB with the actual Ray Serve state
- **SQLite (existing)**
  - New tables such as `serve_models` and `serve_events`, storing declarative configuration and state
- **Ray cluster (existing stateless pool)**
  - Reuses the existing head/worker containers
  - Runs Ray Serve inside the cluster (controller + proxy + deployments)
- **Ray Serve LLM (new)**
  - Builds an OpenAI-compatible app via `ray.serve.llm.build_openai_app`
  - The app contains multiple `LLMConfig` objects (one per model)
### 1.2 Why a single multi-model application
Ray Serve supports multi-app deployments, but in the dev/docker scenario managing route_prefix for multiple apps is more complex, and the requirements call for multi-model application deployment, so v3.8 uses:
- One fixed app: `argus_llm_app` (name configurable)
- route_prefix fixed to `/` (exposing the `/v1/...` OpenAI endpoints)
- One `LLMConfig` per model, distinguished by `model_id` (i.e., the `model` field of the OpenAI API)
This is the most intuitive setup for users:
- The base_url is fixed: `http://<host>:8000/v1`
- Different models are selected with `model=` (`/v1/models` lists them automatically)
## 2. Ray Serve Deployment Strategy (dev/h1 constraints)
### 2.1 HTTP ingress port and docker compose
Ray Serve's default HTTP port is `8000`. v3.8 conventions:
- Map `8000:8000` on the **head container**
- The API server stays on `8080`
- The Ray Dashboard stays on `8265`
Rationale: in a single-host, multi-container docker environment, running a proxy on every node would make several containers try to bind the same host port (not feasible). Therefore v3.8 recommends:
- Set the Serve proxy location to **HeadOnly** (HTTP ingress only on the head)
- GPU replicas still run on the workers (the proxy only forwards traffic; it does not run inference)
> Caveats:
> - Serve's HTTP configuration (host/port/proxy_location) is a **cluster-global Ray setting** that cannot be changed dynamically after startup, so it should be set once at platform startup and persisted.
> - The proxy actor needs CPU resources: the head node's `num-cpus=0` policy may need a small adjustment in v3.8 (e.g., reserve a few CPUs on the head), while `entrypoint_resources` still keeps training drivers from being scheduled onto the head.
#### 2.1.1 Expected compose changes (to land during v3.8 implementation)
- `src/mvp/docker-compose.yaml` (ray_head) adds:
- `ports: - "8000:8000"`
> Worker containers do not expose 8000 (avoiding host port conflicts); the head proxy is the single external entry point.
### 2.2 Startup/configuration (Python SDK first)
v3.8 uses the Ray Serve Python SDK (a sketch follows this section):
- `ray.init(address="auto")`
- `serve.start(proxy_location="HeadOnly", http_options={"host":"0.0.0.0","port":8000})` (one-time global configuration)
- `serve.run(app, name=<app_name>, route_prefix="/")`
- `serve.delete(name=<app_name>)` (when necessary)
- `serve.status()` to query cluster/application status
Rationale:
- Avoids introducing an extra REST client dependency inside the platform (and reduces the risk of REST schema instability across versions)
- The API server itself runs inside the head container and can connect to the existing cluster directly with `ray.init(address="auto")`
> Alternative: the Ray Dashboard exposes a Serve REST API (`PUT /api/serve/applications/`, etc.) that could serve as a fallback, but v3.8 does not use it as the primary path.
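A minimal sketch of this SDK flow (assumes `ray[serve,llm]` is installed; the model and app values are illustrative):
```python
import ray
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

ray.init(address="auto", ignore_reinit_error=True)

# One-time, cluster-global HTTP configuration.
serve.start(proxy_location="HeadOnly", http_options={"host": "0.0.0.0", "port": 8000})

llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="alice-202601061235-qwen-0.5b",
        model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<sha>",
    ),
)
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, name="argus_llm_app", route_prefix="/")

print(serve.status())  # cluster/application status summary
```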
### 2.3 Dependencies and image assumptions
v3.8 depends on:
- `ray[serve]` (Serve controller/proxy)
- `ray[llm]` (the `ray.serve.llm` module of Ray Serve LLM)
- vLLM (inference engine)
Since v3.7 already switched to `verlai/verl:vllm011.latest`, the image is expected to contain vLLM; whether `ray.serve.llm` works out of the box must be confirmed during implementation.
If it is missing, v3.8 will add `pip install "ray[serve,llm]"` (or the officially recommended minimal dependencies) at `argus-ray-node` image build time, with the version pinned.
### 2.4 Serving configuration (dev.yaml)
v3.8 adds a serving config section containing at least:
```yaml
serving:
  serve:
    http_port: 8000          # fixed at 8000
    proxy_location: HeadOnly # recommended for dev/docker
  llm:
    accelerator_type: H20    # H20 in the dev environment; maps to ray.serve.llm.LLMConfig.accelerator_type
```
Notes:
- `accelerator_type` is Ray Serve LLM's `LLMConfig.accelerator_type` field, expressing which accelerator type the model runs on. In the dev/h1 environment it is fixed to `H20`.
- v3.8 does not expose `accelerator_type` for ordinary users to edit (to avoid misconfiguration); the deployment environment config decides it uniformly.
## 3. Model Configuration and Resource Mapping
### 3.1 Key configuration object: `ray.serve.llm.LLMConfig`
Each model deployment is described by an `LLMConfig`; key fields (the subset v3.8 uses):
- `model_loading_config`
  - `model_id`: the model name shown externally and used in requests (unique key)
  - `model_source`: HF repo id / S3 / **local path**
- `accelerator_type`
  - Read from `serving.llm.accelerator_type` in `dev.yaml` (dev/h1: `H20`)
- `deployment_config`
  - `num_replicas` or `autoscaling_config` (v3.8 uses a fixed `num_replicas` for now)
  - `ray_actor_options` (CPU/resource constraints)
- `engine_kwargs`
  - vLLM parameters (`max_model_len`, `gpu_memory_utilization`, etc.)
- `placement_group_config`
  - Controls the resource bundles used by the vLLM engine workers (for multi-GPU / cross-node)
- `runtime_env`
  - Injects environment variables such as the HF cache and the offline switch
### 3.2 How the GPU count (gpus_per_replica) maps to LLMConfig
v3.8 takes the user input:
- `gpus_per_replica = N`
and maps it to (see the sketch below):
- `engine_kwargs.tensor_parallel_size = N` (single-node/cross-node tensor parallelism, following the official Ray Serve LLM examples)
- `placement_group_config = {"bundles": [{"GPU": 1, "CPU": <cpu_per_gpu>}] * N, "strategy": "PACK"}`
Other vLLM parameters (`max_model_len`, `gpu_memory_utilization`, etc.) stay in `engine_kwargs`.
> Compatibility note: Ray Serve LLM is still evolving quickly; v3.8 will do minimal adaptation and regression testing against the Ray version actually in production.
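A sketch of this mapping as a pure function (the name is illustrative; the real implementation lives in `serve_llm_config.py`):
```python
from typing import Any

def map_gpus_per_replica(n: int, cpu_per_gpu: float = 1.0) -> dict[str, Any]:
    # gpus_per_replica = n -> tensor parallel degree and one single-GPU bundle per GPU.
    return {
        "engine_kwargs": {"tensor_parallel_size": n},
        "placement_group_config": {
            "bundles": [{"GPU": 1, "CPU": cpu_per_gpu} for _ in range(n)],
            "strategy": "PACK",
        },
    }

# map_gpus_per_replica(4) -> tensor_parallel_size=4 and four {GPU: 1, CPU: 1.0} bundles.
```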
### 3.2.1 Cross-node scenario (N > GPUs per node)
Ray Serve LLM uses the `PACK` strategy by default, preferring to place GPU workers on as few nodes as possible; if they do not fit on one node, they automatically spill to other nodes, which enables cross-node tensor-parallel (TP) deployments.
### 3.3 Replica count (num_replicas)
v3.8 default:
- `num_replicas = 1`
Users may set it to `>=1` in the UI;
multiple replicas consume GPUs linearly (`num_replicas * gpus_per_replica`), so a resource pre-check is required.
### 3.4 Model paths and macro substitution (common / user)
v3.8 supports two kinds of model sources:
1) **common**
  - Typically `/private/hf/...` (shared HF cache / snapshots)
2) **user**
  - `/private/users/<user_id>/models/...`
  - As well as user training outputs (e.g., `jobs/<sid>/checkpoints/.../huggingface`)
To keep the UI easy to use, the platform's existing macro semantics are reused:
- `$HOME` → `/private/users/<user_id>`
- `$HOME/common/hf` → `/private/hf`
Path validation is applied:
- Allowed prefixes: `/private/hf` and `/private/users/<user_id>/`
- Rejected: accessing other users' directories or sensitive system paths
### 3.5 Offline mode (avoiding HF mirror 429s)
The training side already confirmed in v3.7 that `HF_HUB_OFFLINE=1` is necessary. The v3.8 serving side injects the same defaults:
- `HF_HOME=/private/hf`
- `HUGGINGFACE_HUB_CACHE=/private/hf/hub`
- `TRANSFORMERS_CACHE=/private/hf/transformers`
- `HF_HUB_OFFLINE=1`
- `HF_ENDPOINT=https://hf-mirror.com` (may be kept, but offline mode should not hit the network)
Users are also encouraged to put a **local path** in `model_source` in the ServingSpec rather than a bare repo id.
## 4. Platform Data Model (SQLite)
Two new primary tables:
### 4.1 `serve_models`
Each row represents a declarative model deployment:
- `model_key` (platform-internal unique ID, convenient for renaming/deduplication)
- `user_id`
- `model_id` (external OpenAI model name, must be unique per app)
- `model_source` (local path or repo id; the resolved result is stored)
- `num_replicas`
- `gpus_per_replica`
- `engine_kwargs_json` (optional)
- `state`: `QUEUED | DEPLOYING | RUNNING | FAILED | DELETING | DELETED`
- `serve_app_name` (default `argus_llm_app`)
- `created_at / updated_at`
- `error_summary`
### 4.2 `serve_events`
Records key events and troubleshooting information (similar to task_events):
- `id`
- `model_key`
- `event_type`: DEPLOY_REQUESTED/DEPLOY_APPLIED/STATUS_SYNC/DELETE_REQUESTED/...
- `payload_json`
- `created_at`
## 5. API Design (new)
Under the existing `Authorization: Bearer <user_token>` authentication scheme, new Serving APIs are added (paths are illustrative; the implementation aligns them with the existing `api/v2`).
### 5.1 User endpoints
- `POST /api/v2/serve/models`
  - body: YAML or JSON (v3.8 starts with YAML, consistent with the existing TaskSpec)
  - Creates/updates (upsert) a model configuration, entering `QUEUED`
- `GET /api/v2/serve/models`
  - Lists the current user's models (with state, resources, endpoint)
- `GET /api/v2/serve/models/{model_key}`
  - Detail: full spec + recent events + Serve status summary
- `PATCH /api/v2/serve/models/{model_key}`
  - Modify `num_replicas`, or optionally engine_kwargs
- `DELETE /api/v2/serve/models/{model_key}`
  - Undeploy the model (enters `DELETING`)
### 5.2 System endpoints (admin)
- `GET /api/v2/serve/status` (admin)
  - Returns a summary of `serve.status()` (cluster level / app level)
### 5.3 External inference endpoint
Fixed and surfaced in the UI/API:
- `openai_base_url = http://<host>:8000/v1`
- Supported:
  - `/v1/chat/completions`
  - `/v1/completions`
  - `/v1/embeddings`
  - `/v1/models`
> v3.8 adds no extra gateway or authentication (consistent with the current dev environment); if needed later, token validation or a reverse proxy can be introduced in v3.9+.
### 5.4 `model_id` prefix policy (user_id-)
To avoid multi-user conflicts while staying readable:
v3.8 uses **user_id + date-hour-minute** as a stable prefix, reducing conflicts and making the creation time easy to spot:
- Users only fill in `model_id_suffix` in the UI/API (the field may still be named `model_id`, but semantically it is the suffix)
- The platform computes the actual external `model_id`:
  - `prefix = f"{user_id}-{YYYYMMDDHHMM}"`
  - `model_id = f"{prefix}-{model_id_suffix}"`
- The list/detail views show all of:
  - `model_id_suffix` (user input)
  - `model_id_prefix` (platform generated, e.g. `alice-202601061235`)
  - `model_id` (external OpenAI name)
## 6. Background Execution Model (Serving Reconciler)
Following the pattern of the task scheduler, v3.8 introduces a lightweight reconciler (a sketch follows this section):
- Tick period (e.g., 5s)
- Each tick:
  1) Fetch models in `QUEUED/DEPLOYING/RUNNING/DELETING` from the DB
  2) Call `serve.status()` to read the current app and deployment states
  3) If there are `QUEUED` models or models needing changes: build a new multi-model app (containing the configs of all `RUNNING/DEPLOYING/QUEUED` models) and `serve.run(...)`
  4) If there are `DELETING` models: remove them from the app configuration and apply the change with `serve.run(...)`
  5) Update each model's state (based on the Serve status)
Important behavioral note (the cost of a multi-model app):
- Every add/delete/replica change triggers a `serve.run(...)` update of the same app;
- Ray Serve tries to update incrementally, but under some versions/configurations this may briefly restart the ingress/router;
- v3.8 accepts this cost for now (closing the requirements loop comes first); if deleting one model must not affect the others later, this can evolve into one app per model with its own route_prefix.
Resource pre-check:
- Before applying, use `ray.available_resources()` for a coarse-grained GPU pre-check:
  - Required total GPUs = `sum(num_replicas * gpus_per_replica)` (more precise if computed only over the delta of new/scaled-up models)
- If insufficient:
  - The model stays `QUEUED` and a `PENDING_RESOURCES` event is recorded
  - The UI shows "insufficient resources, waiting for release"
> v3.8 introduces no more complex preemption/priority. Serving and training compete for GPUs; users must plan resources themselves (or a later version can introduce unified scheduling).
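A sketch of one reconciler tick described above. Method names follow the Db/ServeClient interfaces elsewhere in this spec; `build_app` is a hypothetical helper that turns the kept model rows into a Ray Serve LLM application:
```python
import ray

def tick(db, serve_client, build_app) -> None:
    active = db.list_all_serve_models(include_deleted=False)
    keep = [m for m in active if m["state"] in ("QUEUED", "DEPLOYING", "RUNNING")]
    deleting = [m for m in active if m["state"] == "DELETING"]
    queued = [m for m in keep if m["state"] == "QUEUED"]

    # Coarse GPU pre-check over the queued (not yet applied) models.
    needed = sum(m["num_replicas"] * m["gpus_per_replica"] for m in queued)
    if needed > ray.available_resources().get("GPU", 0.0):
        for m in queued:
            db.append_serve_event(model_key=m["model_key"], event_type="PENDING_RESOURCES")
        return

    # Rebuild the single multi-model app from all kept models and apply it.
    serve_client.apply_app(app=build_app(keep), app_name="argus_llm_app")
    for m in queued:
        db.set_serve_model_state(model_key=m["model_key"], state="DEPLOYING")
    for m in deleting:
        db.set_serve_model_state(model_key=m["model_key"], state="DELETED")
```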
## 7. WebUI Design (new Serving pages)
A new sidebar entry: **Serving**.
### 7.1 Serving list page
- Displayed fields:
  - model_id
  - user_id (admin only)
  - replicas / gpus_per_replica / total_gpus
  - state (RUNNING/DEPLOYING/QUEUED/FAILED)
- Actions: Scale (change replicas), Delete
### 7.2 Serving create/edit page
Two modes (similar to New Task; the YAML mode is enough to start with):
Example YAML (v3.8):
```yaml
model_id: qwen-0.5b
model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<sha>
num_replicas: 1
gpus_per_replica: 1
# engine_kwargs:
# max_model_len: 8192
# gpu_memory_utilization: 0.9
```
### 7.3 Serving detail page
- Full configuration (resolved spec)
- Serve status summary (deployment states, replica health)
- OpenAI usage example (Python openai client)
## 8. Acceptance Criteria (v3.8)
1) Deployment:
  - One-click deployment of a model (1 replica, 1 GPU) succeeds and the state becomes RUNNING
  - `/v1/models` lists the model
2) Scaling:
  - Changing `num_replicas` takes effect (the replica count change is visible in Serve status)
3) Multi-model:
  - Two models (different model_ids) can be deployed in the same app simultaneously
  - Requests with different `model=` values through the OpenAI endpoint get responses
4) Undeploy:
  - After deleting a model, it no longer appears in `/v1/models`
5) Model paths:
  - Both local path types are supported: `/private/hf/...` (common) and `/private/users/<user>/...` (user)
6) Explainable resource shortage:
  - When GPUs are insufficient, the model enters `QUEUED` and the UI/detail view shows an "insufficient resources" hint
## 9. Points to Confirm (please confirm during review)
Confirmed (from review):
1) The inference port is fixed at `8000` (Ray Serve's default port).
2) The public OpenAI endpoint is **not bound to the existing token system** (no inference-side authentication in v3.8).
3) `model_id` naming rule: the platform uniformly adds a `user_id + date-hour-minute` prefix; users only fill in the suffix in the UI.
> Note: this avoids cross-user model_id conflicts while keeping the OpenAI `model=` field naturally readable.


@ -0,0 +1,266 @@
# MVP v3.8 Development Plan (TDD, detailed)
> Goal: on top of v3.7, introduce dynamic model deployment and management with Ray Serve (vLLM) (multi-model, single app), plus a WebUI + API management loop.
> Constraints (confirmed):
> - Inference port fixed at `8000` (Serve HTTP).
> - The inference side does not use the existing token authentication (the public OpenAI endpoint is unauthenticated).
> - External `model_id` is uniformly prefixed: `<user_id>-<YYYYMMDDHHMM>-<suffix>` (users only fill in the suffix).
> - `LLMConfig.accelerator_type` is read from `dev.yaml` (dev/h1: `H20`).
The plan is broken down to a verifiable granularity in a test-first → implement → regress rhythm; each milestone can be accepted independently.
---
## M0 - Baseline and dependency probing (no behavior change)
**Purpose**: confirm the v3.7 baseline is stable and clarify whether the Ray Serve LLM dependencies are already available (otherwise later work gets stuck on images/dependencies).
### M0.1 Local regression
- [ ] `.venv/bin/python -m pytest` passes (coverage ≥ 90%)
### M0.2 Remote regression (h1)
- [ ] `src/mvp/scripts/run_all_v30_api.sh` runs end to end (confirms the training loop has not regressed)
### M0.3 Dependency probing inside the head container (record conclusions)
- [ ] `python3 -c "import ray; import ray.serve; print(ray.__version__)"`
- [ ] `python3 -c "from ray.serve.llm import LLMConfig, build_openai_app; print('serve_llm_ok')"`
- [ ] If it fails (e.g., missing `gymnasium`): record what is missing and resolve it in M6 by adding `ray[llm]`
### M0.4 Configuration probing
- [ ] `configs/dev.yaml` contains:
  - `serving.llm.accelerator_type: H20`
  - `serving.serve.http_port: 8000`
  - `serving.serve.proxy_location: HeadOnly`
**Acceptance**
- No baseline regression; dependency probing conclusions are clear (available/unavailable)
---
## M1 - ServingSpec (parsing/validation/macro substitution/path validation) (unit-test driven)
**Purpose**: lock down the input layer first (shared by API/UI) to avoid repeated schema churn later.
### M1.1 New/extended data models
- [ ] `ServingSpec` (input)
  - `model_id` (suffix)
  - `model_source` (supports the `$HOME` macro)
  - `num_replicas` (default=1)
  - `gpus_per_replica` (default=1)
  - `engine_kwargs` (optional dict; stored in the DB verbatim for now, allowlist/blocklist decided during implementation)
- [ ] `ResolvedServingSpec` (internal)
  - `model_id_suffix`
  - `model_id_prefix` (platform generated: `user_id-YYYYMMDDHHMM`)
  - `model_id` (external: `<prefix>-<suffix>`)
  - `model_source` (resolved path)
### M1.2 Rules (written as pure functions for easy testing; a sketch follows this list)
- [ ] `validate_model_id_suffix(suffix)`: length/charset limits (suggested: `[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}`)
- [ ] `$HOME` macro substitution: `$HOME`, `$HOME/common/hf`, `$HOME/common/datasets`
- [ ] Path validation (local paths enforced):
  - Allowed: `/private/hf/...`, `/private/users/<user_id>/...`
  - Rejected: `..`, empty, other users' paths, paths outside `/private`
- [ ] `make_model_id_prefix(user_id, now_utc)`: `YYYYMMDDHHMM` (UTC) + user_id
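A sketch of the suggested suffix rule above (the regex is the one suggested in M1.2; the real implementation lives in `serving_spec.py`):
```python
import re

_SUFFIX_RE = re.compile(r"[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}")

def validate_model_id_suffix(suffix: str) -> str:
    # Reject anything that is not a short, filesystem/URL-friendly identifier.
    if not _SUFFIX_RE.fullmatch(suffix):
        raise ValueError(f"invalid model_id suffix: {suffix!r}")
    return suffix
```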
### M1.3 Unit tests (write failing cases first, then implement)
- [ ] `test_serving_spec_validation.py`
  - Valid/invalid suffixes
  - replicas/gpus boundaries (0, negative, fractional, very large; whether to cap is an implementation decision)
- [ ] `test_serving_spec_paths.py`
  - `$HOME` substitution is correct
  - Out-of-scope paths return 403/ValueError (mapped at the API layer)
  - Both `/private/hf` and `/private/users/<user>` work
- [ ] `test_serving_model_id_prefix.py`
  - Fixed time input → consistent prefix output (avoids timezone/format issues)
**Acceptance**
- Input spec rules are stable; core validation/substitution is covered by unit tests
---
## M2 - SQLite schema and Db interface (unit-test driven)
**Purpose**: the declarative serving state must be persisted, auditable, and recoverable.
### M2.1 DB schema
- [ ] `serve_models`
  - Primary key: `model_key` (platform generated)
  - Unique: `(user_id, model_id_suffix)` (enables upsert)
  - Stores: the resolved spec (including prefix/full model_id) and the resolved model_source
  - State: `QUEUED/DEPLOYING/RUNNING/FAILED/DELETING/DELETED`
  - `error_summary`
- [ ] `serve_events` (append-only)
### M2.2 Db methods
- [ ] `upsert_serve_model(user_id, spec_yaml, now)` → (model_key, state)
- [ ] `list_serve_models(user_id, include_deleted=False, limit/offset?)`
- [ ] `get_serve_model(model_key)`
- [ ] `set_serve_model_state(model_key, state, error_summary=None)`
- [ ] `append_serve_event(model_key, event_type, payload_json=None)`
- [ ] `pick_next_runnable_serve_change()` (for the reconciler)
### M2.3 Unit tests
- [ ] `test_db_serving.py`
  - Upsert behavior (updating the same suffix either keeps the model_key or creates a new version; the policy must be decided before implementation)
  - State transitions + event recording
  - List filtering and ordering (by updated_at)
**Acceptance**
- DB behavior is predictable; upsert/unique semantics are settled and test-covered
---
## M3 - Serving management API (FastAPI) (unit-test driven)
**Purpose**: get the management API working first (no real Ray Serve yet; the reconciler is wired in later).
### M3.1 API routes (user)
- [ ] `POST /api/v2/serve/models` (Content-Type: application/yaml)
  - Input: ServingSpec YAML
  - Output: `{model_key,state}` (202)
- [ ] `GET /api/v2/serve/models`
  - Returns items + `openai_base_url=http://<host>:8000/v1`
- [ ] `GET /api/v2/serve/models/{model_key}`
  - Returns model + resolved_spec_yaml + events (pagination can come later) + serve_status (empty/placeholder at first)
- [ ] `PATCH /api/v2/serve/models/{model_key}` (JSON)
  - Supports `num_replicas` (minimal loop)
- [ ] `DELETE /api/v2/serve/models/{model_key}`
### M3.2 API routes (admin, optional)
- [ ] `GET /api/v2/serve/status` (admin token only)
### M3.3 Error mapping (must be tested)
- [ ] YAML parse failure: 400
- [ ] Spec validation failure: 422
- [ ] Out-of-scope path: 403
- [ ] Nonexistent model_key: 404
### M3.4 Unit tests
- [ ] `test_app_serving_api.py`
  - Happy path: create → list → get → patch → delete
  - Multi-user isolation: users only see their own models
  - Error code coverage: 400/403/404/422
**Acceptance**
- All management endpoints in the API reference (`v3.8_api.md`) return the expected structure (works even without Serve wired in)
---
## M4 - ServeClient abstraction + LLMConfig builder (unit-test driven)
**Purpose**: pin down how an LLMConfig is constructed from a ResolvedServingSpec, and isolate the Ray Serve dependency inside the client so it can be mocked.
### M4.1 `ServeClient` interface (mockable)
- [ ] `ensure_started(http_port=8000, proxy_location="HeadOnly")`
- [ ] `apply_app(app_name, llm_configs)` (multi-model)
- [ ] `get_status()` (serve.status summary)
### M4.2 `build_llm_config(resolved_spec, accelerator_type, runtime_env_defaults)` pure function
- [ ] Writes `LLMConfig.accelerator_type` (from dev.yaml: H20)
- [ ] `deployment_config.num_replicas`
- [ ] `engine_kwargs.tensor_parallel_size = gpus_per_replica`
- [ ] `placement_group_config` bundles generated according to the GPU count
- [ ] `runtime_env.env_vars` injection (at least the HF cache + `HF_HUB_OFFLINE=1`)
### M4.3 Unit tests
- [ ] `test_llm_config_builder.py`
  - gpus_per_replica=1/2/4 → tensor_parallel_size and bundle counts are correct
  - accelerator_type is injected correctly
  - runtime_env contains HF_HUB_OFFLINE and other key env vars
**Acceptance**
- The mapping from the platform spec to the Ray Serve LLMConfig is stable and locked in by unit tests
---
## M5 - Serving Reconciler (state machine + resource pre-check) (unit-test driven)
**Purpose**: implement declarative reconciliation (DB → Serve) while providing explainable QUEUED/FAILED states.
### M5.1 State machine (minimal loop)
- [ ] `QUEUED`: waiting for apply
- [ ] `DEPLOYING`: apply triggered, waiting for Serve running/healthy
- [ ] `RUNNING`: Serve status is running
- [ ] `FAILED`: apply or status failed (writes error_summary + event)
- [ ] `DELETING`: waiting for removal from the app
- [ ] `DELETED`: deletion complete (optionally keep the record)
### M5.2 Resource pre-check (a sketch follows this list)
- [ ] `needed_total_gpus = sum(num_replicas*gpus_per_replica)` (minimal viable pre-check)
- [ ] When `ray.available_resources()["GPU"]` (or a more robust per-node count) is insufficient:
  - Stay `QUEUED`
  - Record a `PENDING_RESOURCES` event
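A minimal sketch of this pre-check (assumes `ray.init()` has already connected to the cluster; the helper name is illustrative):
```python
import ray

def has_enough_gpus(models: list[dict]) -> bool:
    # Sum the desired GPUs over the models that still need to be applied.
    needed = sum(m["num_replicas"] * m["gpus_per_replica"] for m in models)
    return needed <= ray.available_resources().get("GPU", 0.0)
```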
### M5.3 Reconcile strategy (multi-model app)
- [ ] Each tick reads the active models and builds the full set of `llm_configs`
- [ ] Handle deleting: remove the model from the configs, then apply
### M5.4 Unit tests (mock ServeClient + mock ray resources)
- [ ] `test_serving_reconciler.py`
  - New model: apply_app is called, state enters DEPLOYING
  - Deleted model: the apply_app configs no longer contain that model
  - Insufficient GPUs: no apply, state stays QUEUED, event is written
  - apply raises: state FAILED, error_summary written
**Acceptance**
- Reconciler behavior is verifiable in a pure unit-test environment; failures are explainable
---
## M6 - Real integration (h1): Ray Serve startup + inference loop (E2E)
**Purpose**: really run end to end in the dev/h1 environment: deploy a model → visible in `/v1/models` → `chat/completions` succeeds → disappears after deletion.
### M6.1 compose/ports
- [ ] Add `8000:8000` to `ray_head` in `src/mvp/docker-compose.yaml`
### M6.2 Image dependencies (if M0 finds them missing)
- [ ] Add `ray[serve,llm]` to the `argus-ray-node` image (version aligned with the existing Ray to avoid an incompatible Ray upgrade)
  - Prefer adding `ray[llm]` first (includes the `ray.serve.llm` dependency closure, e.g., `gymnasium`), then `ray[serve]` as needed
  - Verification: `python3 -c "from ray.serve.llm import LLMConfig, build_openai_app; print('serve_llm_ok')"`
### M6.3 E2E script (idempotent)
- [ ] Add `scripts/run_all_v38_serving.sh`:
  - Bring up compose (ensure the Serve port is available)
  - Start the API
  - Create a user + token
  - `POST /api/v2/serve/models` creates a 1-GPU model
  - Poll the model state until RUNNING
  - `curl http://127.0.0.1:8000/v1/models` verifies it contains `<prefix>-<suffix>`
  - `curl http://127.0.0.1:8000/v1/chat/completions` does a minimal inference
  - `DELETE /api/v2/serve/models/{model_key}` undeploys it
  - Poll again and confirm `/v1/models` no longer contains it
**Acceptance**
- The E2E run is repeatable (at least two consecutive runs without manual cleanup)
---
## M7 - WebUI (Serving pages) (unit-test driven)
**Purpose**: give users a visual model management page (minimal necessary features).
### M7.1 Pages
- [ ] Add Serving to the sidebar
- [ ] `/ui/serving`: list + state + actions (delete/scale)
- [ ] `/ui/serving/new`: YAML input + submit
- [ ] `/ui/serving/{model_key}`: detail (resolved spec, events, OpenAI usage example)
### M7.2 Unit tests
- [ ] `test_ui_serving.py`: routes return 200, key links exist, includes openai_base_url on port 8000
**Acceptance**
- The WebUI covers the main create/list/detail/scale/delete path
---
## M8 - Documentation and acceptance cases (delivery)
**Purpose**: give users/operators a reusable way to run the system and a troubleshooting path.
- [ ] Update `specs/mvp/v3.8/v3.8_progress.md` (recorded per milestone)
- [ ] Extend the README (optional): port notes, warning that the inference API is unauthenticated, model path conventions
- [ ] Acceptance checklist:
  - Unit tests pass
  - h1 E2E passes
  - The main UI path is operable


@ -0,0 +1,48 @@
# MVP v3.8 Progress Log
## 2026-01-06
- Completed the v3.8 design doc: `specs/mvp/v3.8/v3.8_design.md`
- Completed the v3.8 Serving API reference: `specs/mvp/v3.8/v3.8_api.md`
- Completed the v3.8 TDD development plan: `specs/mvp/v3.8/v3.8_dev_plan.md`
- Completed M0: added the `serving` config to `configs/dev.yaml` (http_port=8000, proxy_location=HeadOnly, accelerator_type=H20)
- Completed M1: ServingSpec parsing/macro substitution/path validation + unit tests (`src/mvp/py/argus/service/serving_spec.py`)
- Completed M2: new SQLite tables `serve_models`/`serve_events` + Db API + unit tests (`src/mvp/py/argus/service/db.py`)
- Completed M3: FastAPI Serving management API + unit tests (`src/mvp/py/argus/service/app.py`)
- Completed M4: ServeClient abstraction + LLMConfig builder (dict form) + unit tests (`src/mvp/py/argus/service/serve_client.py`, `src/mvp/py/argus/service/serve_llm_config.py`)
- Completed M5: Serving reconciler (state machine + resource pre-check + mocked unit tests) (`src/mvp/py/argus/service/serving_reconciler.py`)
### M6 (real integration on h1)
- Added dependencies to the `argus-ray-node` image: `ray[serve,llm]` + `gymnasium` + `dm-tree` (avoids `ray.serve.llm` import failures)
- Fixed Ray 2.49.2 compatibility issues:
  - `LLMConfig` does not support `placement_group_config`; switched to `resources_per_bundle` (`src/mvp/py/argus/service/serve_llm_config.py`)
- Remote E2E:
  - `scripts/run_all_v38_serving.sh` runs end to end (create → RUNNING → `/v1/models` and `chat/completions` → delete → DELETED)
  - Fixed a bash heredoc quoting error in the script's `/v1/models` parsing (`src/mvp/scripts/run_all_v38_serving.sh`)
### M7 (WebUI - Serving)
- Added Serving pages to the WebUI:
  - List: `/ui/serving`
  - Create: `/ui/serving/new`
  - Detail/events/scale/delete: `/ui/serving/{model_key}`
- Unit test coverage:
  - `src/mvp/py/tests/test_ui_serving.py`
### M8 (docs/acceptance)
- Added v3.8 serving port and E2E script notes to `src/mvp/README.md`
### Environment probing (h1 / head container)
> Purpose: confirm whether the Ray Serve LLM dependencies work out of the box, so problems do not surface only during integration.
- `ray`: available, version `2.49.2`
- `ray.serve`: importable (base Serve works)
- `ray.serve.llm`: currently not importable
  - Error: `ModuleNotFoundError: No module named 'gymnasium'`
  - Cause: the `ray.serve.llm` import chain pulls in `ray.rllib`, and rllib depends on `gymnasium`
Conclusion:
- During implementation, v3.8 needs to add `ray[llm]` (recommended) or at least the necessary dependencies such as `gymnasium` to the `argus-ray-node` image, so that `from ray.serve.llm import ...` works.


@ -24,3 +24,9 @@ v3.0 access entry points (dev/h1):
- SFTPGo:
  - SFTP: `127.0.0.1:2022`
  - Admin API/UI: `http://127.0.0.1:8081` (8080 inside the container; mapped to host 8081 to avoid conflicting with the API server)
v3.8 (Ray Serve LLM / vLLM model serving):
- Inference port: `8000` (Ray Serve HTTP)
- OpenAI-compatible endpoint: `http://127.0.0.1:8000/v1`
- Note: the v3.8 inference endpoint is **unauthenticated**
- E2E script: `scripts/run_all_v38_serving.sh`


@ -69,3 +69,11 @@ data:
jobs_trash_after_days: 3
jobs_purge_after_days: 7
janitor_interval_s: 3600
# v3.8: model serving via Ray Serve LLM (vLLM backend)
serving:
  serve:
    http_port: 8000
    proxy_location: HeadOnly
  llm:
    accelerator_type: H20


@ -1,10 +1,16 @@
services:
ray_head:
image: argus/argus-ray-node:vllm011.latest
build:
context: .
dockerfile: images/argus-ray-node/Dockerfile
args:
BASE_IMAGE: verlai/verl:vllm011.latest
container_name: argus-ray-head
ports:
- "8265:8265"
- "8080:8080"
- "8000:8000"
volumes:
# NOTE: this compose file is intended for the dev env layout like:
# /home2/argus/infra/mvp/{shared,verl,src/mvp}
@ -92,6 +98,11 @@ services:
ray_worker_0:
image: argus/argus-ray-node:vllm011.latest
build:
context: .
dockerfile: images/argus-ray-node/Dockerfile
args:
BASE_IMAGE: verlai/verl:vllm011.latest
container_name: argus-ray-worker-0
volumes:
- ../../verl:/workspace/verl
@ -124,6 +135,11 @@ services:
ray_worker_1:
image: argus/argus-ray-node:vllm011.latest
build:
context: .
dockerfile: images/argus-ray-node/Dockerfile
args:
BASE_IMAGE: verlai/verl:vllm011.latest
container_name: argus-ray-worker-1
volumes:
- ../../verl:/workspace/verl


@ -6,6 +6,15 @@ SHELL ["/bin/bash", "-lc"]
# Install supervisord (prefer pip to avoid relying on distro package manager).
RUN python3 -m pip install --no-cache-dir supervisor
# v3.8: Ray Serve LLM deps (keep Ray version pinned to what's already in the base image).
# NOTE: base image already includes Ray; we only add extras.
RUN RAY_VER="$(python3 -c 'import ray; print(ray.__version__)')" && \
python3 -m pip install --no-cache-dir "ray[serve,llm]==${RAY_VER}"
# Ray Serve LLM's import chain currently pulls in ray.rllib which requires extra deps.
# Install them explicitly to make `from ray.serve.llm import ...` work reliably.
RUN python3 -m pip install --no-cache-dir gymnasium dm-tree && \
python3 -c "from ray.serve.llm import LLMConfig, build_openai_app; print('ray_serve_llm_ok')"
RUN mkdir -p /opt/argus/py/argus/ray
# Minimal embedded code for stateless pool (API code is intentionally excluded).


@ -16,9 +16,8 @@ exec ray start \
--port="${ray_port}" \
--dashboard-host=0.0.0.0 \
--dashboard-port="${dashboard_port}" \
--num-cpus=0 \
--num-cpus="${ARGUS_HEAD_NUM_CPUS:-1}" \
--num-gpus=0 \
--disable-usage-stats \
--block \
${ARGUS_RAY_EXTRA_ARGS:-}


@ -26,3 +26,19 @@ def new_task_id(workload: str, *, user_id: str | None = None) -> str:
def attempt_submission_id(task_id: str, attempt_no: int) -> str:
return f"{task_id}--a{attempt_no:02d}"
def new_model_key(*, user_id: str) -> str:
    """
    Internal identifier for a serving model record.
    Note:
    - model_id is the OpenAI-facing name (user_id + timestamp prefix + suffix).
    - model_key is used for stable DB identity and API resource path.
    """
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    suffix = secrets.token_hex(2)
    u = _normalize_user_id(user_id)
    if not u:
        raise ValueError("user_id is required")
    return f"mvp2-{u}-serve-{ts}-{suffix}"


@ -4,11 +4,13 @@ import os
import secrets
import threading
from typing import Any
import json
from dataclasses import asdict
import yaml
from fastapi import FastAPI, HTTPException, Request, Response
from argus.core.ids import new_task_id
from argus.core.ids import new_model_key, new_task_id
from argus.ray.models import AdvancedTaskSpec, JobSpec, RayConfig, parse_taskspec
from .advanced_command import expand_advanced_command, validate_advanced_command
@ -16,6 +18,7 @@ from .config import V2Config
from .db import Db
from .janitor import JobsJanitor
from .scheduler import Scheduler
from .serving_spec import ServingSpec, parse_serving_spec, resolve_serving_spec
from .sftpgo import SFTPGoAdminClient, SFTPGoError
from .ui import register_ui_routes
@ -85,6 +88,61 @@ def create_app(config_path: str) -> FastAPI:
common_root=f"{shared_root}/common",
)
def _serving_enabled() -> bool:
return bool(v2_cfg.serving.enabled)
def _openai_base_url(req: Request) -> str:
# Prefer forwarded headers if present; otherwise fall back to Host.
host = req.headers.get("x-forwarded-host") or req.headers.get("host") or req.url.hostname or "127.0.0.1"
# Strip port if present (common for Host header).
hostname = host
if hostname.startswith("[") and "]" in hostname:
# IPv6 like: [::1]:8080
hostname = hostname.split("]")[0] + "]"
else:
hostname = hostname.split(":")[0]
scheme = req.headers.get("x-forwarded-proto") or req.url.scheme or "http"
port = int(v2_cfg.serving.serve.http_port)
return f"{scheme}://{hostname}:{port}/v1"
def _dump_yaml(obj: Any) -> str:
return yaml.safe_dump(obj, sort_keys=False)
def _serving_spec_to_dict(spec: ServingSpec) -> dict[str, Any]:
return {
"model_id": spec.model_id,
"model_source": spec.model_source,
"num_replicas": int(spec.num_replicas),
"gpus_per_replica": int(spec.gpus_per_replica),
"engine_kwargs": spec.engine_kwargs,
}
def _serve_model_public(row: dict[str, Any], *, req: Request) -> dict[str, Any]:
num_replicas = int(row.get("num_replicas") or 0)
gpus_per_replica = int(row.get("gpus_per_replica") or 0)
total_gpus = num_replicas * gpus_per_replica
model_id = str(row.get("model_id") or "")
return {
"model_key": str(row.get("model_key") or ""),
"user_id": str(row.get("user_id") or ""),
"model_id": model_id,
"model_id_suffix": str(row.get("model_id_suffix") or ""),
"model_id_prefix": str(row.get("model_id_prefix") or ""),
"model_source": str(row.get("model_source") or ""),
"num_replicas": num_replicas,
"gpus_per_replica": gpus_per_replica,
"total_gpus": total_gpus,
"state": str(row.get("state") or ""),
"error_summary": row.get("error_summary"),
"created_at": str(row.get("created_at") or ""),
"updated_at": str(row.get("updated_at") or ""),
"deleted_at": row.get("deleted_at"),
"endpoint": {
"openai_base_url": _openai_base_url(req),
"model": model_id,
},
}
def _auth(req: Request) -> dict[str, Any]:
token_env = v2_cfg.auth.token_env
admin_token = os.environ.get(token_env, "")
@ -565,6 +623,162 @@ def create_app(config_path: str) -> FastAPI:
return db.list_queue()
return db.list_queue(user_id=str(subject["user_id"]))
# v3.8: Model serving (Ray Serve LLM) management APIs.
@app.post("/api/v2/serve/models")
async def create_serve_model(req: Request) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
body = (await req.body()).decode("utf-8")
try:
obj = yaml.safe_load(body) or {}
except Exception as e:
raise HTTPException(status_code=400, detail=f"invalid YAML: {e!r}")
if not isinstance(obj, dict):
raise HTTPException(status_code=400, detail="serving spec must be a YAML mapping")
user_id = str(subject["user_id"]).strip()
try:
spec = parse_serving_spec(obj)
resolved = resolve_serving_spec(spec=spec, user_id=user_id)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except ValueError as e:
msg = str(e)
code = 422 if ("num_replicas" in msg or "gpus_per_replica" in msg) else 400
raise HTTPException(status_code=code, detail=f"invalid serving spec: {e!r}")
model_key = new_model_key(user_id=user_id)
try:
engine_kwargs_json = json.dumps(resolved.engine_kwargs, sort_keys=True) if resolved.engine_kwargs is not None else None
except TypeError as e:
raise HTTPException(status_code=400, detail=f"engine_kwargs must be JSON-serializable: {e!r}")
spec_yaml = _dump_yaml(_serving_spec_to_dict(spec))
resolved_spec_yaml = _dump_yaml(asdict(resolved))
db.create_serve_model(
model_key=model_key,
user_id=user_id,
model_id_suffix=resolved.model_id_suffix,
model_id_prefix=resolved.model_id_prefix,
model_id=resolved.model_id,
model_source=resolved.model_source,
num_replicas=resolved.num_replicas,
gpus_per_replica=resolved.gpus_per_replica,
engine_kwargs_json=engine_kwargs_json,
spec_yaml=spec_yaml,
resolved_spec_yaml=resolved_spec_yaml,
)
return {"model_key": model_key, "state": "QUEUED"}
@app.get("/api/v2/serve/models")
async def list_serve_models(req: Request, limit: int = 200, offset: int = 0, include_deleted: int = 0) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
lim = max(1, min(int(limit), 1000))
off = max(0, int(offset))
inc = bool(int(include_deleted))
user_id = str(subject["user_id"])
items = db.list_serve_models(user_id=user_id, include_deleted=inc, limit=lim, offset=off)
out = [_serve_model_public(i, req=req) for i in items]
return {
"items": out,
"openai_base_url": _openai_base_url(req),
"limit": lim,
"offset": off,
"has_more": bool(len(items) == lim),
}
@app.get("/api/v2/serve/models/{model_key}")
async def get_serve_model(model_key: str, req: Request) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
row = db.get_serve_model(model_key)
if not row:
raise HTTPException(status_code=404, detail="model not found")
if not subject.get("is_admin"):
if str(row.get("user_id") or "") != str(subject["user_id"]):
raise HTTPException(status_code=404, detail="model not found")
events = db.list_serve_events(model_key, limit=200, offset=0)
ev_out = [
{
"id": int(e.get("id") or 0),
"model_key": str(e.get("model_key") or ""),
"created_at": str(e.get("ts") or ""),
"event_type": str(e.get("event_type") or ""),
"payload_json": e.get("payload_json"),
}
for e in events
]
return {
"model": _serve_model_public(row, req=req),
"resolved_spec_yaml": str(row.get("resolved_spec_yaml") or ""),
"events": ev_out,
"serve_status": None,
}
@app.patch("/api/v2/serve/models/{model_key}")
async def patch_serve_model(model_key: str, req: Request) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
row = db.get_serve_model(model_key)
if not row:
raise HTTPException(status_code=404, detail="model not found")
if not subject.get("is_admin"):
if str(row.get("user_id") or "") != str(subject["user_id"]):
raise HTTPException(status_code=404, detail="model not found")
obj = await req.json()
if not isinstance(obj, dict):
raise HTTPException(status_code=400, detail="body must be a JSON object")
if "num_replicas" not in obj:
raise HTTPException(status_code=400, detail="missing num_replicas")
num_replicas = obj.get("num_replicas")
if not isinstance(num_replicas, int) or int(num_replicas) < 1:
raise HTTPException(status_code=422, detail="num_replicas must be an integer >= 1")
db.update_serve_model_num_replicas(model_key=model_key, num_replicas=int(num_replicas))
return {"model_key": model_key, "state": "QUEUED"}
@app.delete("/api/v2/serve/models/{model_key}")
async def delete_serve_model(model_key: str, req: Request) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
row = db.get_serve_model(model_key)
if not row:
raise HTTPException(status_code=404, detail="model not found")
if not subject.get("is_admin"):
if str(row.get("user_id") or "") != str(subject["user_id"]):
raise HTTPException(status_code=404, detail="model not found")
db.set_serve_model_state(model_key=model_key, state="DELETING", event_type="SERVE_DELETE_REQUESTED")
return {"model_key": model_key, "state": "DELETING"}
@app.get("/api/v2/serve/status")
async def serve_status(req: Request) -> dict[str, Any]:
_require_admin(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
return {
"enabled": True,
"openai_base_url": _openai_base_url(req),
"http_port": int(v2_cfg.serving.serve.http_port),
"proxy_location": str(v2_cfg.serving.serve.proxy_location),
"accelerator_type": str(v2_cfg.serving.llm.accelerator_type),
}
# v3.0: minimal WebUI (no server-side session; token stored in browser localStorage).
register_ui_routes(app)


@ -57,6 +57,24 @@ class V2SFTPGoConfig:
admin_password_env: str = "SFTPGO_ADMIN_PASSWORD"
@dataclass(frozen=True)
class V2ServingServeConfig:
http_port: int = 8000
proxy_location: str = "HeadOnly"
@dataclass(frozen=True)
class V2ServingLLMConfig:
accelerator_type: str = ""
@dataclass(frozen=True)
class V2ServingConfig:
enabled: bool = False
serve: V2ServingServeConfig = V2ServingServeConfig()
llm: V2ServingLLMConfig = V2ServingLLMConfig()
@dataclass(frozen=True)
class V2DataConfig:
user_root: str
@ -72,6 +90,7 @@ class V2Config:
scheduler: V2SchedulerConfig
tracking: V2TrackingConfig
data: V2DataConfig
serving: V2ServingConfig
@staticmethod
def from_root_dict(root: dict[str, Any]) -> "V2Config":
@ -112,6 +131,15 @@ class V2Config:
if not isinstance(sftpgo, dict) or not isinstance(retention, dict):
raise ValueError("config.data.{sftpgo,retention} must be mappings")
serving = root.get("serving") or {}
if not isinstance(serving, dict):
raise ValueError("config.serving must be a mapping")
serving_enabled = bool(serving.get("enabled")) if "enabled" in serving else bool(serving)
serving_serve = serving.get("serve") or {}
serving_llm = serving.get("llm") or {}
if not isinstance(serving_serve, dict) or not isinstance(serving_llm, dict):
raise ValueError("config.serving.{serve,llm} must be mappings")
default_db_path = f"{shared_root}/common/db/mvp.sqlite3"
db_path = str(sqlite.get("db_path") or default_db_path)
@ -158,4 +186,14 @@ class V2Config:
janitor_interval_s=int(retention.get("janitor_interval_s") or 3600),
),
),
serving=V2ServingConfig(
enabled=serving_enabled,
serve=V2ServingServeConfig(
http_port=int(serving_serve.get("http_port") or 8000),
proxy_location=str(serving_serve.get("proxy_location") or "HeadOnly"),
),
llm=V2ServingLLMConfig(
accelerator_type=str(serving_llm.get("accelerator_type") or ""),
),
),
)


@ -117,6 +117,43 @@ class Db:
)
"""
)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS serve_models (
model_key TEXT PRIMARY KEY,
user_id TEXT NOT NULL,
model_id_suffix TEXT NOT NULL,
model_id_prefix TEXT NOT NULL,
model_id TEXT NOT NULL,
model_source TEXT NOT NULL,
num_replicas INTEGER NOT NULL,
gpus_per_replica INTEGER NOT NULL,
engine_kwargs_json TEXT,
state TEXT NOT NULL,
spec_yaml TEXT NOT NULL,
resolved_spec_yaml TEXT NOT NULL,
error_summary TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
deleted_at TEXT
)
"""
)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS serve_events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
model_key TEXT NOT NULL,
ts TEXT NOT NULL,
event_type TEXT NOT NULL,
payload_json TEXT,
FOREIGN KEY (model_key) REFERENCES serve_models(model_key) ON DELETE CASCADE
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_serve_models_user ON serve_models(user_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_serve_models_state ON serve_models(state)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_serve_events_model ON serve_events(model_key)")
@contextmanager
def tx(self) -> Iterator[sqlite3.Connection]:
@ -493,3 +530,239 @@ class Db:
(str(end_time_le), int(limit)),
).fetchall()
return [dict(r) for r in rows]
def create_serve_model(
self,
*,
model_key: str,
user_id: str,
model_id_suffix: str,
model_id_prefix: str,
model_id: str,
model_source: str,
num_replicas: int,
gpus_per_replica: int,
spec_yaml: str,
resolved_spec_yaml: str,
engine_kwargs_json: str | None = None,
) -> dict[str, Any]:
now = _utc_now_iso()
with self.tx() as conn:
conn.execute(
"""
INSERT INTO serve_models (
model_key,
user_id,
model_id_suffix,
model_id_prefix,
model_id,
model_source,
num_replicas,
gpus_per_replica,
engine_kwargs_json,
state,
spec_yaml,
resolved_spec_yaml,
created_at,
updated_at
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 'QUEUED', ?, ?, ?, ?)
""",
(
model_key,
user_id,
model_id_suffix,
model_id_prefix,
model_id,
model_source,
int(num_replicas),
int(gpus_per_replica),
engine_kwargs_json,
spec_yaml,
resolved_spec_yaml,
now,
now,
),
)
conn.execute(
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, 'SERVE_MODEL_CREATED', ?)",
(model_key, now, None),
)
row = conn.execute("SELECT * FROM serve_models WHERE model_key = ?", (model_key,)).fetchone()
return dict(row) if row else {}
def list_serve_models(
self,
*,
user_id: str,
include_deleted: bool = False,
limit: int = 200,
offset: int = 0,
) -> list[dict[str, Any]]:
with self._connect() as conn:
where_sql = "WHERE user_id = ?"
params: list[Any] = [user_id]
if not include_deleted:
where_sql += " AND deleted_at IS NULL"
params.append(int(limit))
params.append(max(0, int(offset)))
rows = conn.execute(
f"""
SELECT
model_key,
user_id,
model_id_suffix,
model_id_prefix,
model_id,
model_source,
num_replicas,
gpus_per_replica,
engine_kwargs_json,
state,
error_summary,
created_at,
updated_at,
deleted_at
FROM serve_models
{where_sql}
ORDER BY created_at DESC, model_key DESC
LIMIT ? OFFSET ?
""",
tuple(params),
).fetchall()
return [dict(r) for r in rows]
def list_all_serve_models(
self,
*,
include_deleted: bool = False,
limit: int = 2000,
offset: int = 0,
) -> list[dict[str, Any]]:
with self._connect() as conn:
where_sql = ""
if not include_deleted:
where_sql = "WHERE deleted_at IS NULL"
rows = conn.execute(
f"""
SELECT
model_key,
user_id,
model_id_suffix,
model_id_prefix,
model_id,
model_source,
num_replicas,
gpus_per_replica,
engine_kwargs_json,
state,
error_summary,
spec_yaml,
resolved_spec_yaml,
created_at,
updated_at,
deleted_at
FROM serve_models
{where_sql}
ORDER BY created_at ASC, model_key ASC
LIMIT ? OFFSET ?
""",
(int(limit), max(0, int(offset))),
).fetchall()
return [dict(r) for r in rows]
def get_serve_model(self, model_key: str) -> dict[str, Any] | None:
with self._connect() as conn:
row = conn.execute("SELECT * FROM serve_models WHERE model_key = ?", (model_key,)).fetchone()
return dict(row) if row else None
def list_serve_events(self, model_key: str, limit: int = 200, offset: int = 0) -> list[dict[str, Any]]:
with self._connect() as conn:
rows = conn.execute(
"""
SELECT id, model_key, ts, event_type, payload_json
FROM serve_events
WHERE model_key = ?
ORDER BY id DESC
LIMIT ? OFFSET ?
""",
(model_key, int(limit), max(0, int(offset))),
).fetchall()
return [dict(r) for r in rows]
def append_serve_event(self, *, model_key: str, event_type: str, payload_json: str | None = None) -> None:
now = _utc_now_iso()
with self.tx() as conn:
conn.execute(
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, ?, ?)",
(model_key, now, event_type, payload_json),
)
def set_serve_model_state(
self,
*,
model_key: str,
state: str,
error_summary: str | None = None,
event_type: str = "SERVE_STATE_UPDATE",
payload_json: str | None = None,
) -> None:
now = _utc_now_iso()
with self.tx() as conn:
sets = ["state = ?", "updated_at = ?"]
params: list[Any] = [state, now]
if error_summary is not None:
sets.append("error_summary = ?")
params.append(error_summary)
if state == "DELETED":
sets.append("deleted_at = ?")
params.append(now)
params.append(model_key)
conn.execute(f"UPDATE serve_models SET {', '.join(sets)} WHERE model_key = ?", tuple(params))
conn.execute(
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, ?, ?)",
(model_key, now, event_type, payload_json),
)
def update_serve_model_num_replicas(self, *, model_key: str, num_replicas: int) -> None:
if not isinstance(num_replicas, int) or num_replicas < 1:
raise ValueError("num_replicas must be an integer >= 1")
now = _utc_now_iso()
with self.tx() as conn:
conn.execute(
"""
UPDATE serve_models
SET num_replicas = ?, state = 'QUEUED', error_summary = NULL, updated_at = ?
WHERE model_key = ?
""",
(int(num_replicas), now, model_key),
)
conn.execute(
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, 'SERVE_PATCH_NUM_REPLICAS', ?)",
(model_key, now, str(num_replicas)),
)
def pick_next_runnable_serve_change(self) -> dict[str, Any] | None:
"""
Returns the next serve model that needs reconciliation.
Minimal state machine for now:
- QUEUED: needs (re)apply
- DELETING: needs removal
"""
with self._connect() as conn:
row = conn.execute(
"""
SELECT *
FROM serve_models
WHERE deleted_at IS NULL
AND state IN ('QUEUED','DELETING')
ORDER BY updated_at ASC
LIMIT 1
"""
).fetchone()
return dict(row) if row else None
# Backward compatible naming (v3.8 docs originally used "upsert").
def upsert_serve_model(self, **kwargs: Any) -> dict[str, Any]:
return self.create_serve_model(**kwargs)


@ -16,6 +16,8 @@ from argus.ray.ray_job_tool import RayJobTool
from .config import V2Config
from .db import Db
from .ray_resources import ensure_ray_connected, get_cluster_available
from .serve_client import RayServeClient
from .serving_reconciler import ServingReconciler
_INSUFFICIENT_RE = re.compile(r"Total available GPUs\\s+\\d+\\s+is less than total desired GPUs\\s+\\d+")
@ -37,6 +39,18 @@ class Scheduler:
def __post_init__(self) -> None:
self.tool = RayJobTool(self.ray_cfg)
self._serving: ServingReconciler | None = None
if bool(self.v2_cfg.serving.enabled):
self._serving = ServingReconciler(
db=self.db,
v2_cfg=self.v2_cfg,
ray_runtime_env_env_vars=self.ray_cfg.runtime_env_env_vars,
serve_client=RayServeClient(
http_port=int(self.v2_cfg.serving.serve.http_port),
proxy_location=str(self.v2_cfg.serving.serve.proxy_location),
ray_init_address="auto",
),
)
def _job_dir_for_task(self, *, user_id: str | None, ray_submission_id: str) -> str:
root = self.ray_cfg.shared_root.rstrip("/")
@ -251,6 +265,14 @@ class Scheduler:
def tick(self) -> None:
ensure_ray_connected()
# v3.8: reconcile serve_models (best-effort).
if self._serving is not None:
try:
self._serving.tick()
except Exception:
# Keep scheduler alive even if serving tick fails.
pass
# Sync active tasks
for row in self.db.list_active_tasks(limit=50):
self._sync_one_running(row)

View File

@ -0,0 +1,45 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
@dataclass(frozen=True)
class RayServeClient:
"""
Minimal Ray Serve client wrapper.
This is intentionally tiny and uses runtime imports so that:
- unit tests can stub `ray` modules without needing real Ray installed
- production can run with the real Ray Serve stack (v3.8+)
"""
http_port: int = 8000
proxy_location: str = "HeadOnly"
ray_init_address: str = "auto"
def ensure_started(self) -> None:
import ray # runtime import
# Scheduler already calls ray.init(); make this idempotent.
ray.init(address=self.ray_init_address, ignore_reinit_error=True, log_to_driver=False) # type: ignore[call-arg]
# Import serve lazily to allow tests to stub it.
from ray import serve # type: ignore
serve.start(proxy_location=self.proxy_location, http_options={"host": "0.0.0.0", "port": int(self.http_port)})
def apply_app(self, *, app: Any, app_name: str, route_prefix: str = "/") -> Any:
from ray import serve # type: ignore
# If Ray Serve LLM isn't available, callers may pass a plain dict placeholder.
# Running that through serve.run() results in a confusing TypeError; fail fast.
if isinstance(app, dict):
raise ValueError("invalid serve app object (Ray Serve LLM not available or build_openai_app failed)")
return serve.run(app, name=app_name, route_prefix=route_prefix)
def get_status(self) -> Any:
from ray import serve # type: ignore
return serve.status()
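Typical production use is three calls in order, mirroring what the reconciler below does: ensure_started(), apply_app(), get_status(). A minimal sketch, where `app_obj` stands in for whatever `build_openai_app` returned:

from argus.service.serve_client import RayServeClient


def deploy(app_obj) -> None:
    client = RayServeClient(http_port=8000, proxy_location="HeadOnly", ray_init_address="auto")
    client.ensure_started()  # idempotent ray.init() + serve.start()
    client.apply_app(app=app_obj, app_name="argus_llm_app", route_prefix="/")
    print(client.get_status())  # serve.status() snapshot, e.g. for logging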

View File

@ -0,0 +1,63 @@
from __future__ import annotations
from typing import Any
from .serving_spec import ResolvedServingSpec
def _ensure_hf_env_defaults(env: dict[str, str]) -> dict[str, str]:
out = dict(env or {})
# Prefer existing values if present, but always force offline mode in the platform.
out.setdefault("HF_HOME", "/private/hf")
out.setdefault("HUGGINGFACE_HUB_CACHE", "/private/hf/hub")
out.setdefault("TRANSFORMERS_CACHE", "/private/hf/transformers")
out["HF_HUB_OFFLINE"] = "1"
return out
def build_llm_config_dict(
resolved: ResolvedServingSpec,
*,
accelerator_type: str,
runtime_env_env_vars: dict[str, str] | None,
cpu_per_gpu: float = 1.0,
) -> dict[str, Any]:
"""
Pure builder: maps a platform ResolvedServingSpec to a Ray Serve LLM-like config.
We return a plain dict here to keep this layer unit-testable without depending on
a specific Ray Serve LLM version. The reconciler (later milestone) can choose to
instantiate `ray.serve.llm.LLMConfig` using this dict.
"""
if not accelerator_type:
raise ValueError("accelerator_type is required")
if resolved.num_replicas < 1:
raise ValueError("num_replicas must be >= 1")
if resolved.gpus_per_replica < 1:
raise ValueError("gpus_per_replica must be >= 1")
if cpu_per_gpu <= 0:
raise ValueError("cpu_per_gpu must be > 0")
engine_kwargs: dict[str, Any] = dict(resolved.engine_kwargs or {})
# Enforce tensor parallel mapping; user-provided value must not contradict requested GPUs.
engine_kwargs["tensor_parallel_size"] = int(resolved.gpus_per_replica)
# Ray Serve LLM (Ray 2.49.x) exposes `resources_per_bundle` instead of the older
# `placement_group_config`. Use a single bundle that reserves the full GPU set
# required by tensor-parallel execution.
resources_per_bundle = {
"GPU": float(resolved.gpus_per_replica),
"CPU": float(cpu_per_gpu) * float(resolved.gpus_per_replica),
}
env_vars = _ensure_hf_env_defaults(dict(runtime_env_env_vars or {}))
return {
# Ray Serve LLM expects `model_loading_config` with model_id/model_source.
"model_loading_config": {"model_id": resolved.model_id, "model_source": resolved.model_source},
"accelerator_type": accelerator_type,
"deployment_config": {"num_replicas": int(resolved.num_replicas)},
"engine_kwargs": engine_kwargs,
"resources_per_bundle": resources_per_bundle,
"runtime_env": {"env_vars": env_vars},
}
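The returned dict is either kept as-is (unit tests, environments without Ray Serve LLM) or unpacked into `ray.serve.llm.LLMConfig`, which is what the reconciler below does. A worked sketch with illustrative values; whether `LLMConfig` accepts every field (e.g. `resources_per_bundle` vs. `placement_group_config`) depends on the installed Ray version, which is why the real code wraps construction in try/except:

from argus.service.serve_llm_config import build_llm_config_dict
from argus.service.serving_spec import ResolvedServingSpec

resolved = ResolvedServingSpec(
    user_id="alice",
    model_id_suffix="qwen-0.5b",
    model_id_prefix="alice-202601061235",
    model_id="alice-202601061235-qwen-0.5b",
    model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/abc",
    num_replicas=1,
    gpus_per_replica=2,
    engine_kwargs={"gpu_memory_utilization": 0.9},
)
cfg_dict = build_llm_config_dict(
    resolved, accelerator_type="H20", runtime_env_env_vars={}, cpu_per_gpu=1.0
)
# cfg_dict["engine_kwargs"]["tensor_parallel_size"] == 2
# cfg_dict["resources_per_bundle"] == {"GPU": 2.0, "CPU": 2.0}

try:
    from ray.serve.llm import LLMConfig  # requires ray[llm]

    llm_config = LLMConfig(**cfg_dict)
except Exception:
    llm_config = cfg_dict  # fall back to the plain dict when Ray Serve LLM is unavailable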

View File

@ -0,0 +1,151 @@
from __future__ import annotations
import json
import traceback
from dataclasses import dataclass
from typing import Any, Protocol
from argus.service.ray_resources import ClusterAvailable, get_cluster_available
from .config import V2Config
from .db import Db
from .serve_llm_config import build_llm_config_dict
from .serving_spec import ResolvedServingSpec
class ServeClient(Protocol):
def ensure_started(self) -> None: ...
def apply_app(self, *, app: Any, app_name: str, route_prefix: str = "/") -> Any: ...
def get_status(self) -> Any: ...
def _parse_engine_kwargs(row: dict[str, Any]) -> dict[str, Any] | None:
raw = row.get("engine_kwargs_json")
if raw in (None, ""):
return None
try:
obj = json.loads(str(raw))
return obj if isinstance(obj, dict) else None
except Exception:
return None
def _row_to_resolved_spec(row: dict[str, Any]) -> ResolvedServingSpec:
return ResolvedServingSpec(
user_id=str(row["user_id"]),
model_id_suffix=str(row["model_id_suffix"]),
model_id_prefix=str(row["model_id_prefix"]),
model_id=str(row["model_id"]),
model_source=str(row["model_source"]),
num_replicas=int(row["num_replicas"]),
gpus_per_replica=int(row["gpus_per_replica"]),
engine_kwargs=_parse_engine_kwargs(row),
)
def _needed_total_gpus(rows: list[dict[str, Any]]) -> int:
total = 0
for r in rows:
total += int(r.get("num_replicas") or 0) * int(r.get("gpus_per_replica") or 0)
return total
@dataclass
class ServingReconciler:
"""
v3.8: reconcile declared serve_models (SQLite) into a multi-model Ray Serve app.
This reconciler is intentionally conservative:
- Only acts on models in states QUEUED/DELETING.
- Performs a minimal GPU precheck using ray available GPU totals.
- Writes events and state transitions for explainability.
"""
db: Db
v2_cfg: V2Config
ray_runtime_env_env_vars: dict[str, str]
serve_client: ServeClient
app_name: str = "argus_llm_app"
route_prefix: str = "/"
cpu_per_gpu: float = 1.0
get_available_fn: Any = get_cluster_available
def tick(self) -> None:
# Pick the next desired change.
change = self.db.pick_next_runnable_serve_change()
if not change:
return
model_key = str(change["model_key"])
state = str(change.get("state") or "")
# Ensure Ray (and Serve) can be started before doing anything else.
try:
self.serve_client.ensure_started()
except Exception as e:
self.db.append_serve_event(model_key=model_key, event_type="SERVE_START_ERROR", payload_json=repr(e))
return
# Desired set: all non-deleted models except those marked DELETING.
all_rows = self.db.list_all_serve_models(include_deleted=False, limit=5000, offset=0)
# FAILED models are not part of the desired running set. A user can PATCH to
# re-queue a failed model (e.g., after fixing env/deps) which will move it back to QUEUED.
desired_rows = [r for r in all_rows if str(r.get("state") or "") not in ("DELETING", "DELETED", "FAILED")]
# Precheck resources: multi-model app apply needs enough GPUs for the whole desired set.
needed = _needed_total_gpus(desired_rows)
avail: ClusterAvailable = self.get_available_fn()
if float(avail.total_available_gpus) < float(needed):
msg = f"Insufficient GPUs: need {needed}, available {avail.total_available_gpus}"
self.db.append_serve_event(model_key=model_key, event_type="SERVE_PENDING_RESOURCES", payload_json=msg)
return
# Build per-model LLM configs (dict form in M4).
llm_cfg_dicts: list[dict[str, Any]] = []
accelerator_type = str(self.v2_cfg.serving.llm.accelerator_type or "")
for r in desired_rows:
resolved = _row_to_resolved_spec(r)
llm_cfg_dicts.append(
build_llm_config_dict(
resolved,
accelerator_type=accelerator_type,
runtime_env_env_vars=self.ray_runtime_env_env_vars,
cpu_per_gpu=self.cpu_per_gpu,
)
)
# Build a Ray Serve OpenAI-compatible app if Ray Serve LLM is available.
# Fall back to a plain dict so unit tests can run without real Ray Serve.
app_obj: Any
try:
from ray.serve.llm import LLMConfig, build_openai_app # type: ignore
llm_cfgs = [LLMConfig(**d) for d in llm_cfg_dicts]
app_obj = build_openai_app({"llm_configs": llm_cfgs})
except Exception as e:
self.db.append_serve_event(model_key=model_key, event_type="SERVE_LLM_IMPORT_ERROR", payload_json=repr(e))
app_obj = {"llm_configs": llm_cfg_dicts}
try:
self.db.append_serve_event(model_key=model_key, event_type="SERVE_APPLY_REQUESTED", payload_json=str(len(llm_cfg_dicts)))
self.serve_client.apply_app(app=app_obj, app_name=self.app_name, route_prefix=self.route_prefix)
except Exception as e:
err = f"{type(e).__name__}: {e}"
tb = traceback.format_exc(limit=10)
self.db.set_serve_model_state(model_key=model_key, state="FAILED", error_summary=err, event_type="SERVE_APPLY_FAILED", payload_json=tb)
return
# Apply succeeded. Update the changing model's state.
if state == "DELETING":
self.db.set_serve_model_state(model_key=model_key, state="DELETED", event_type="SERVE_DELETE_APPLIED")
return
# Mark as deploying; best-effort status probe can promote to RUNNING.
self.db.set_serve_model_state(model_key=model_key, state="DEPLOYING", event_type="SERVE_DEPLOYING")
try:
_ = self.serve_client.get_status()
self.db.set_serve_model_state(model_key=model_key, state="RUNNING", event_type="SERVE_RUNNING")
except Exception as e:
self.db.append_serve_event(model_key=model_key, event_type="SERVE_STATUS_ERROR", payload_json=repr(e))

View File

@ -0,0 +1,144 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any
_MODEL_ID_SUFFIX_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$")
@dataclass(frozen=True)
class ServingSpec:
model_id: str
model_source: str
num_replicas: int = 1
gpus_per_replica: int = 1
engine_kwargs: dict[str, Any] | None = None
@dataclass(frozen=True)
class ResolvedServingSpec:
user_id: str
model_id_suffix: str
model_id_prefix: str
model_id: str
model_source: str
num_replicas: int
gpus_per_replica: int
engine_kwargs: dict[str, Any] | None
def validate_model_id_suffix(suffix: str) -> None:
if not isinstance(suffix, str):
raise ValueError("model_id must be a string")
s = suffix.strip()
if s != suffix:
raise ValueError("model_id must not contain leading/trailing whitespace")
if not s:
raise ValueError("model_id is required")
if not _MODEL_ID_SUFFIX_RE.match(s):
raise ValueError("model_id must match regex: ^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$")
if ".." in s:
raise ValueError("model_id must not contain '..'")
def make_model_id_prefix(*, user_id: str, now_utc: datetime | None = None) -> str:
if not user_id or not isinstance(user_id, str):
raise ValueError("user_id is required")
if "/" in user_id:
raise ValueError("user_id must not contain '/'")
dt = now_utc or datetime.now(timezone.utc)
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
dt = dt.astimezone(timezone.utc)
stamp = dt.strftime("%Y%m%d%H%M")
return f"{user_id}-{stamp}"
def expand_home_macros(*, user_id: str, text: str) -> str:
if not isinstance(text, str):
raise ValueError("model_source must be a string")
if not text:
raise ValueError("model_source is required")
out = text
out = out.replace("$HOME/common/hf", "/private/hf")
out = out.replace("$HOME/common/datasets", "/private/datasets")
out = out.replace("$HOME", f"/private/users/{user_id}")
return out
def validate_model_source_path(*, user_id: str, model_source: str) -> None:
if not isinstance(model_source, str):
raise ValueError("model_source must be a string")
if not model_source.startswith("/"):
raise ValueError("model_source must be an absolute path")
if not model_source.startswith("/private/"):
raise ValueError("model_source must be under /private")
if "\x00" in model_source:
raise ValueError("model_source contains null byte")
parts = [p for p in model_source.split("/") if p]
if any(p == ".." for p in parts):
raise ValueError("model_source must not contain '..'")
allowed_user_prefix = f"/private/users/{user_id}/"
allowed = model_source.startswith("/private/hf/") or model_source.startswith(allowed_user_prefix)
if not allowed:
raise PermissionError("model_source is not allowed (must be under /private/hf or your /private/users/<user_id>)")
def parse_serving_spec(obj: Any) -> ServingSpec:
if not isinstance(obj, dict):
raise ValueError("serving spec must be a mapping")
model_id = obj.get("model_id")
model_source = obj.get("model_source")
num_replicas = obj.get("num_replicas", 1)
gpus_per_replica = obj.get("gpus_per_replica", 1)
engine_kwargs = obj.get("engine_kwargs", None)
if not isinstance(model_id, str):
raise ValueError("missing required field: model_id")
validate_model_id_suffix(model_id)
if not isinstance(model_source, str) or not model_source:
raise ValueError("missing required field: model_source")
if not isinstance(num_replicas, int) or num_replicas < 1:
raise ValueError("num_replicas must be an integer >= 1")
if not isinstance(gpus_per_replica, int) or gpus_per_replica < 1:
raise ValueError("gpus_per_replica must be an integer >= 1")
if engine_kwargs is not None and not isinstance(engine_kwargs, dict):
raise ValueError("engine_kwargs must be a mapping when provided")
return ServingSpec(
model_id=model_id,
model_source=model_source,
num_replicas=num_replicas,
gpus_per_replica=gpus_per_replica,
engine_kwargs=engine_kwargs,
)
def resolve_serving_spec(*, spec: ServingSpec, user_id: str, now_utc: datetime | None = None) -> ResolvedServingSpec:
validate_model_id_suffix(spec.model_id)
prefix = make_model_id_prefix(user_id=user_id, now_utc=now_utc)
full_model_id = f"{prefix}-{spec.model_id}"
resolved_source = expand_home_macros(user_id=user_id, text=spec.model_source)
validate_model_source_path(user_id=user_id, model_source=resolved_source)
return ResolvedServingSpec(
user_id=user_id,
model_id_suffix=spec.model_id,
model_id_prefix=prefix,
model_id=full_model_id,
model_source=resolved_source,
num_replicas=spec.num_replicas,
gpus_per_replica=spec.gpus_per_replica,
engine_kwargs=spec.engine_kwargs,
)
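A worked example of the resolution pipeline (parse, prefix, macro expansion, path check), matching the unit tests further down; the snapshot hash is illustrative:

from datetime import datetime, timezone

from argus.service.serving_spec import parse_serving_spec, resolve_serving_spec

spec = parse_serving_spec(
    {
        "model_id": "qwen-0.5b",
        "model_source": "$HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/abc",
    }
)
resolved = resolve_serving_spec(
    spec=spec,
    user_id="alice",
    now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc),
)
assert resolved.model_id == "alice-202601061235-qwen-0.5b"
assert resolved.model_source.startswith("/private/hf/hub/")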

View File

@ -112,6 +112,7 @@ def _nav(active: str) -> str:
links = [
("login", "/ui/login", "Login"),
("tasks", "/ui/tasks", "Tasks"),
("serving", "/ui/serving", "Serving"),
("new", "/ui/tasks/new", "New Task"),
("data", "/ui/data", "Data"),
("admin", "/ui/admin", "Admin"),
@ -992,6 +993,253 @@ refresh();
""".strip()
return HTMLResponse(content=_page(f"Logs {task_id}", "tasks", body, script))
@app.get("/ui/serving")
async def ui_serving() -> HTMLResponse:
body = """
<h1>Serving</h1>
<div class="card">
<div class="row">
<button class="btn" id="refresh">Refresh</button>
<a class="btn" href="/ui/serving/new" style="display:inline-block">New Model</a>
<a class="btn" id="openai-models" target="_blank" rel="noopener" href="#">OpenAI /v1/models</a>
</div>
<div style="height:10px"></div>
<div id="out" class="muted">Loading...</div>
</div>
""".strip()
script = """
document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265);
document.getElementById("openai-models").href = curOriginWithPort(8000) + "/v1/models";
const out = document.getElementById("out");
function pill(state) {
const s = String(state || "");
if (s === "RUNNING") return `<span class="pill ok">${s}</span>`;
if (s === "FAILED") return `<span class="pill bad">${s}</span>`;
return `<span class="pill">${s}</span>`;
}
async function refresh() {
out.textContent = "Loading...";
try {
const lim = 50;
const off = Number(localStorage.getItem("mvp_serving_offset") || "0") || 0;
const resp = await apiJson("/api/v2/serve/models?limit=" + lim + "&offset=" + off + "&include_deleted=0");
const items = resp.items || [];
const hasMore = !!resp.has_more;
const pageNo = Math.floor(off / lim) + 1;
const prevDisabled = off <= 0;
const nextDisabled = !hasMore;
function row(m) {
return `<tr>
<td><a href="/ui/serving/${m.model_key}">${m.model_key}</a></td>
<td><code>${m.model_id}</code></td>
<td>${pill(m.state)}</td>
<td>${m.num_replicas} × ${m.gpus_per_replica} GPU</td>
<td>${m.updated_at || ""}</td>
</tr>`;
}
const rows = items.map(row).join("");
out.innerHTML = `
<div class="row" style="justify-content: space-between; margin-bottom: 8px;">
<div class="muted">OpenAI base: <code>${resp.openai_base_url || curOriginWithPort(8000) + "/v1"}</code></div>
<div class="row">
<span class="muted">Page ${pageNo}</span>
<button class="btn" id="prev" ${prevDisabled ? "disabled" : ""}>Prev</button>
<button class="btn" id="next" ${nextDisabled ? "disabled" : ""}>Next</button>
</div>
</div>
<table>
<thead><tr><th>Model Key</th><th>Model ID</th><th>State</th><th>Resources</th><th>Updated</th></tr></thead>
<tbody>${rows || "<tr><td colspan=5 class=muted>(none)</td></tr>"}</tbody>
</table>
`;
const prevBtn = document.getElementById("prev");
const nextBtn = document.getElementById("next");
if (prevBtn) prevBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(Math.max(0, off - lim))); refresh(); };
if (nextBtn) nextBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(off + lim)); refresh(); };
} catch (e) {
let text = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
if (e.body && String(e.body).includes("serving is not enabled")) {
text = "Serving is not enabled in server config.\\nAsk admin to enable `serving:` in dev.yaml.";
}
out.textContent = text;
}
}
document.getElementById("refresh").onclick = refresh;
refresh();
""".strip()
return HTMLResponse(content=_page("Serving", "serving", body, script))
@app.get("/ui/serving/new")
async def ui_serving_new() -> HTMLResponse:
example = """# ServingSpec (YAML)
# Notes:
# - model_id: this is the suffix; the platform automatically adds a prefix, producing <user_id>-<YYYYMMDDHHMM>-<suffix>
# - model_source: local model path (supports $HOME macros; $HOME/common/hf pointing at the shared HF cache is recommended)
#
# Common paths:
# - $HOME/common/hf -> /private/hf
# - $HOME -> /private/users/<user_id>
#
model_id: qwen-0.5b
model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<SNAPSHOT_HASH>
num_replicas: 1
gpus_per_replica: 1
# engine_kwargs: # optional: passed through to vLLM
# gpu_memory_utilization: 0.4
""".strip()
body = f"""
<h1>New Model</h1>
<div class="card">
<div class="muted">Paste ServingSpec YAML and submit to <code>/api/v2/serve/models</code>.</div>
<div style="height:10px"></div>
<textarea id="yaml" rows="14">{html.escape(example)}</textarea>
<div style="height:10px"></div>
<div class="row">
<button class="btn" id="submit">Submit</button>
<a class="btn" href="/ui/serving" style="display:inline-block">Back</a>
</div>
<div style="height:10px"></div>
<pre id="out" class="muted"></pre>
</div>
""".strip()
script = """
document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265);
const out = document.getElementById("out");
document.getElementById("submit").onclick = async () => {
out.textContent = "Submitting...";
const yaml = document.getElementById("yaml").value || "";
try {
const resp = await apiJson("/api/v2/serve/models", { method: "POST", headers: { "Content-Type": "application/yaml" }, body: yaml });
out.textContent = "Created: " + resp.model_key + "\\nState: " + resp.state;
if (resp.model_key) window.location.href = "/ui/serving/" + encodeURIComponent(resp.model_key);
} catch (e) {
out.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
}
};
""".strip()
return HTMLResponse(content=_page("New Model", "serving", body, script))
@app.get("/ui/serving/{model_key}")
async def ui_serving_detail(model_key: str) -> HTMLResponse:
body = f"""
<h1>Model</h1>
<div class="card">
<div class="row" style="justify-content: space-between;">
<div class="muted">model_key: <code>{html.escape(model_key)}</code></div>
<div class="row">
<a class="btn" href="/ui/serving" style="display:inline-block">Back</a>
<a class="btn" id="openai-models" target="_blank" rel="noopener" href="#">OpenAI /v1/models</a>
</div>
</div>
<div style="height:10px"></div>
<div class="row">
<label class="muted" style="min-width:120px">Scale replicas</label>
<input id="replicas" type="number" min="1" step="1" value="1" style="max-width: 180px" />
<button class="btn" id="scale">Apply</button>
<button class="btn danger" id="delete">Delete</button>
</div>
<div style="height:10px"></div>
<div id="meta" class="muted">Loading...</div>
<div style="height:12px"></div>
<h3 style="margin-top:0">Resolved Spec (YAML)</h3>
<pre id="spec" class="muted">(loading)</pre>
<div style="height:12px"></div>
<h3 style="margin-top:0">Events</h3>
<div id="events" class="muted">(loading)</div>
<div style="height:12px"></div>
<h3 style="margin-top:0">OpenAI Example</h3>
<pre id="example" class="muted">(loading)</pre>
</div>
""".strip()
script = f"""
document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265);
document.getElementById("openai-models").href = curOriginWithPort(8000) + "/v1/models";
const modelKey = {json.dumps(model_key)};
const meta = document.getElementById("meta");
const spec = document.getElementById("spec");
const eventsEl = document.getElementById("events");
const example = document.getElementById("example");
const replicas = document.getElementById("replicas");
function pill(state) {{
const s = String(state || "");
if (s === "RUNNING") return `<span class="pill ok">${{s}}</span>`;
if (s === "FAILED") return `<span class="pill bad">${{s}}</span>`;
return `<span class="pill">${{s}}</span>`;
}}
function renderEvents(events) {{
if (!events || !events.length) return "<div class=muted>(none)</div>";
const rows = events.map(e => {{
const payload = (e.payload_json || "");
const short = String(payload).length > 240 ? String(payload).slice(0, 240) + "..." : String(payload);
return `<tr><td>${{e.created_at || ""}}</td><td><code>${{e.event_type}}</code></td><td><pre class=muted style=\\"margin:0\\">${{short}}</pre></td></tr>`;
}}).join("");
return `<table><thead><tr><th>Time</th><th>Type</th><th>Payload</th></tr></thead><tbody>${{rows}}</tbody></table>`;
}}
async function refresh() {{
meta.textContent = "Loading...";
spec.textContent = "(loading)";
eventsEl.textContent = "(loading)";
example.textContent = "(loading)";
try {{
const obj = await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey));
const m = obj.model || {{}};
replicas.value = String(m.num_replicas || 1);
meta.innerHTML = `
<div class=row>
<div>state: ${{pill(m.state)}}</div>
<div class=muted>model_id: <code>${{m.model_id || ""}}</code></div>
<div class=muted>source: <code>${{m.model_source || ""}}</code></div>
</div>
<div class=muted>endpoint: <code>${{(m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/v1")}}</code></div>
`;
spec.textContent = obj.resolved_spec_yaml || "";
eventsEl.innerHTML = renderEvents(obj.events || []);
const base = (m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/v1");
const mid = m.model_id || "";
example.textContent = `curl -sS -H 'Content-Type: application/json' -H 'Authorization: Bearer FAKE_KEY' \\\\\\n -X POST ${{base}}/chat/completions \\\\\\n --data-binary '{{\\"model\\":\\"${{mid}}\\",\\"messages\\":[{{\\"role\\":\\"user\\",\\"content\\":\\"hello\\"}}],\\"max_tokens\\":16,\\"stream\\":false}}' | python3 -m json.tool`;
}} catch (e) {{
meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
spec.textContent = "";
eventsEl.textContent = "";
example.textContent = "";
}}
}}
document.getElementById("scale").onclick = async () => {{
const n = Number(replicas.value || "1");
if (!Number.isFinite(n) || n < 1) return;
try {{
await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "PATCH", headers: {{ "Content-Type": "application/json" }}, body: JSON.stringify({{ num_replicas: n }}) }});
await refresh();
}} catch (e) {{
meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
}}
}};
document.getElementById("delete").onclick = async () => {{
if (!confirm("Delete this model?")) return;
try {{
await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "DELETE" }});
await refresh();
}} catch (e) {{
meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
}}
}};
refresh();
""".strip()
return HTMLResponse(content=_page("Model", "serving", body, script))
@app.get("/ui/data")
async def ui_data() -> HTMLResponse:
body = """

View File

@ -0,0 +1,282 @@
from __future__ import annotations
from pathlib import Path
import yaml
from fastapi.testclient import TestClient
def _write_config(tmp_path: Path) -> Path:
cfg = {
"ray": {
"address": "http://127.0.0.1:8265",
"shared_root": "/private",
"entrypoint_resources": {"worker_node": 1},
"runtime_env": {"env_vars": {}},
},
"data": {
"user_root": str(tmp_path / "users"),
},
"service": {
"api": {"host": "127.0.0.1", "port": 0},
"auth": {"token_env": "MVP_INTERNAL_TOKEN"},
"sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")},
"scheduler": {"tick_s": 1, "retry_interval_s": 1, "max_running_tasks": 1},
},
"serving": {
"serve": {"http_port": 8000, "proxy_location": "HeadOnly"},
"llm": {"accelerator_type": "H20"},
},
}
p = tmp_path / "cfg.yaml"
p.write_text(yaml.safe_dump(cfg), encoding="utf-8")
return p
def test_serving_api_crud_flow(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
r = c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
assert r.status_code == 200
r2 = c.post("/api/v2/users/alice/tokens", headers=admin_headers)
assert r2.status_code == 200
user_token = r2.json()["token"]
headers = {"authorization": f"Bearer {user_token}"}
spec_yaml = (
"model_id: qwen-0.5b\n"
"model_source: $HOME/common/hf/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha\n"
"num_replicas: 1\n"
"gpus_per_replica: 1\n"
)
r3 = c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
assert r3.status_code == 200
assert r3.json()["model_key"] == "mk-alice"
assert r3.json()["state"] == "QUEUED"
r4 = c.get("/api/v2/serve/models?limit=10&offset=0", headers=headers)
assert r4.status_code == 200
obj = r4.json()
assert obj["openai_base_url"] == "http://testserver:8000/v1"
assert len(obj["items"]) == 1
assert obj["items"][0]["model_key"] == "mk-alice"
r5 = c.get("/api/v2/serve/models/mk-alice", headers=headers)
assert r5.status_code == 200
detail = r5.json()
assert detail["model"]["model_key"] == "mk-alice"
assert "model_id_prefix" in detail["model"]
assert "resolved_spec_yaml" in detail
assert isinstance(detail.get("events"), list)
r6 = c.patch("/api/v2/serve/models/mk-alice", headers=headers, json={"num_replicas": 2})
assert r6.status_code == 200
assert r6.json()["state"] == "QUEUED"
r7 = c.delete("/api/v2/serve/models/mk-alice", headers=headers)
assert r7.status_code == 200
assert r7.json()["state"] == "DELETING"
# Admin status endpoint
r8 = c.get("/api/v2/serve/status", headers=admin_headers)
assert r8.status_code == 200
assert r8.json()["http_port"] == 8000
def test_serving_api_rejects_path_outside_user_and_hf(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
r2 = c.post("/api/v2/users/alice/tokens", headers=admin_headers)
user_token = r2.json()["token"]
headers = {"authorization": f"Bearer {user_token}"}
spec_yaml = (
"model_id: x\n"
"model_source: /private/users/bob/models/evil\n"
"num_replicas: 1\n"
"gpus_per_replica: 1\n"
)
r3 = c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
assert r3.status_code == 403
def test_serving_api_invalid_yaml_and_non_mapping(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
app = app_mod.create_app(str(cfg_path))
with TestClient(app) as c:
# Create a user token
admin_headers = {"authorization": "Bearer admin-token"}
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
headers = {"authorization": f"Bearer {token}"}
r = c.post("/api/v2/serve/models", headers=headers, data=": bad\n")
assert r.status_code == 400
r2 = c.post("/api/v2/serve/models", headers=headers, data="- 1\n- 2\n")
assert r2.status_code == 400
def test_serving_api_engine_kwargs_binary_rejected(tmp_path: Path, monkeypatch):
"""
yaml !!binary is parsed as bytes, which is not JSON-serializable.
"""
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
headers = {"authorization": f"Bearer {token}"}
spec_yaml = (
"model_id: x\n"
"model_source: $HOME/common/hf/x\n"
"engine_kwargs:\n"
" blob: !!binary \"AQID\"\n"
)
r = c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
assert r.status_code == 400
def test_serving_api_list_include_deleted_and_forwarded_base_url(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
from argus.service.config import V2Config
from argus.service.db import Db
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
keys = iter(["mk-alice-1", "mk-alice-2"])
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: next(keys))
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
headers = {"authorization": f"Bearer {token}"}
spec_yaml = "model_id: x\nmodel_source: $HOME/common/hf/x\n"
c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
# Mark one model as DELETED directly in DB (sets deleted_at).
root = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
v2_cfg = V2Config.from_root_dict(root)
db = Db(v2_cfg.sqlite.db_path)
db.set_serve_model_state(model_key="mk-alice-2", state="DELETED")
r1 = c.get(
"/api/v2/serve/models?limit=10&offset=0&include_deleted=0",
headers={**headers, "x-forwarded-host": "example.com:8080", "x-forwarded-proto": "https"},
)
assert r1.status_code == 200
assert r1.json()["openai_base_url"] == "https://example.com:8000/v1"
assert {m["model_key"] for m in r1.json()["items"]} == {"mk-alice-1"}
r2 = c.get("/api/v2/serve/models?include_deleted=1", headers=headers)
assert r2.status_code == 200
assert {m["model_key"] for m in r2.json()["items"]} == {"mk-alice-1", "mk-alice-2"}
def test_serving_api_patch_invalid_num_replicas(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: "mk-alice")
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
headers = {"authorization": f"Bearer {token}"}
c.post("/api/v2/serve/models", headers=headers, data="model_id: x\nmodel_source: $HOME/common/hf/x\n")
r = c.patch("/api/v2/serve/models/mk-alice", headers=headers, json={"num_replicas": 0})
assert r.status_code == 422

View File

@ -0,0 +1,79 @@
from __future__ import annotations
import json
from pathlib import Path
def test_db_serving_model_crud_and_events(tmp_path: Path) -> None:
from argus.service.db import Db
db = Db(str(tmp_path / "mvp.sqlite3"))
db.init()
m1 = db.create_serve_model(
model_key="svc-001",
user_id="alice",
model_id_suffix="qwen-0.5b",
model_id_prefix="alice-202601061235",
model_id="alice-202601061235-qwen-0.5b",
model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha",
num_replicas=1,
gpus_per_replica=1,
engine_kwargs_json=json.dumps({"max_model_len": 8192}),
spec_yaml="model_id: qwen-0.5b\nmodel_source: $HOME/common/hf/...\n",
resolved_spec_yaml="model_id: alice-202601061235-qwen-0.5b\nmodel_source: /private/hf/...\n",
)
assert m1["model_key"] == "svc-001"
assert m1["state"] == "QUEUED"
# Same suffix may be created again; model_key is the identity.
m2 = db.create_serve_model(
model_key="svc-002",
user_id="alice",
model_id_suffix="qwen-0.5b",
model_id_prefix="alice-202601061236",
model_id="alice-202601061236-qwen-0.5b",
model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha",
num_replicas=1,
gpus_per_replica=2,
engine_kwargs_json=None,
spec_yaml="model_id: qwen-0.5b\nmodel_source: $HOME/common/hf/...\n",
resolved_spec_yaml="model_id: alice-202601061236-qwen-0.5b\nmodel_source: /private/hf/...\n",
)
assert m2["model_key"] == "svc-002"
assert m2["model_id"] != m1["model_id"]
got = db.get_serve_model("svc-001")
assert got is not None
assert got["gpus_per_replica"] == 1
items = db.list_serve_models(user_id="alice")
assert {i["model_key"] for i in items} == {"svc-001", "svc-002"}
# State transition writes a serve event.
db.set_serve_model_state(model_key="svc-001", state="DEPLOYING")
got2 = db.get_serve_model("svc-001")
assert got2 is not None
assert got2["state"] == "DEPLOYING"
events = db.list_serve_events("svc-001", limit=50)
assert len(events) >= 2
assert {e["event_type"] for e in events}.issuperset({"SERVE_MODEL_CREATED", "SERVE_STATE_UPDATE"})
# Reconciler pick: QUEUED/DELETING only.
picked = db.pick_next_runnable_serve_change()
assert picked is not None
assert picked["state"] == "QUEUED"
db.set_serve_model_state(model_key="svc-002", state="DELETING")
picked2 = db.pick_next_runnable_serve_change()
assert picked2 is not None
assert picked2["state"] in ("QUEUED", "DELETING")
# Deleted models are hidden unless include_deleted.
db.set_serve_model_state(model_key="svc-002", state="DELETED")
items2 = db.list_serve_models(user_id="alice", include_deleted=False)
assert {i["model_key"] for i in items2} == {"svc-001"}
items3 = db.list_serve_models(user_id="alice", include_deleted=True)
assert {i["model_key"] for i in items3} == {"svc-001", "svc-002"}

View File

@ -44,3 +44,32 @@ def test_attempt_submission_id_format():
assert attempt_submission_id("t", 1) == "t--a01"
assert attempt_submission_id("t", 12) == "t--a12"
def test_new_model_key_includes_user(monkeypatch):
import argus.core.ids as ids
class _FakeDatetime:
@staticmethod
def now():
class _DT:
def strftime(self, fmt: str) -> str:
assert fmt == "%Y%m%d-%H%M%S"
return "20250101-010203"
return _DT()
monkeypatch.setattr(ids, "datetime", _FakeDatetime)
monkeypatch.setattr(ids.secrets, "token_hex", lambda n: "abcd")
assert ids.new_model_key(user_id="Alice_01") == "mvp2-alice_01-serve-20250101-010203-abcd"
def test_new_model_key_requires_user_id():
from argus.core.ids import new_model_key
try:
new_model_key(user_id="")
assert False, "expected ValueError"
except ValueError as e:
assert "user_id is required" in str(e)

View File

@ -0,0 +1,78 @@
from __future__ import annotations
import pytest
def test_build_llm_config_dict_maps_tp_and_bundles():
from argus.service.serve_llm_config import build_llm_config_dict
from argus.service.serving_spec import ResolvedServingSpec
resolved = ResolvedServingSpec(
user_id="alice",
model_id_suffix="qwen-0.5b",
model_id_prefix="alice-202601061235",
model_id="alice-202601061235-qwen-0.5b",
model_source="/private/hf/x",
num_replicas=2,
gpus_per_replica=4,
engine_kwargs={"gpu_memory_utilization": 0.9},
)
cfg = build_llm_config_dict(
resolved,
accelerator_type="H20",
runtime_env_env_vars={"HF_ENDPOINT": "https://hf-mirror.com"},
cpu_per_gpu=2.0,
)
assert cfg["model_loading_config"]["model_id"] == "alice-202601061235-qwen-0.5b"
assert cfg["model_loading_config"]["model_source"] == "/private/hf/x"
assert cfg["accelerator_type"] == "H20"
assert cfg["deployment_config"]["num_replicas"] == 2
# gpus_per_replica -> tensor_parallel_size
assert cfg["engine_kwargs"]["tensor_parallel_size"] == 4
assert cfg["engine_kwargs"]["gpu_memory_utilization"] == 0.9
# resources_per_bundle reserves the full TP GPU set for each replica.
bundle = cfg["resources_per_bundle"]
assert bundle["GPU"] == 4.0
assert bundle["CPU"] == 8.0
def test_build_llm_config_dict_injects_hf_offline_defaults():
from argus.service.serve_llm_config import build_llm_config_dict
from argus.service.serving_spec import ResolvedServingSpec
resolved = ResolvedServingSpec(
user_id="alice",
model_id_suffix="x",
model_id_prefix="alice-202601061235",
model_id="alice-202601061235-x",
model_source="/private/users/alice/models/x",
num_replicas=1,
gpus_per_replica=1,
engine_kwargs=None,
)
cfg = build_llm_config_dict(resolved, accelerator_type="H20", runtime_env_env_vars={})
env = cfg["runtime_env"]["env_vars"]
assert env["HF_HUB_OFFLINE"] == "1"
assert env["HF_HOME"] == "/private/hf"
assert env["HUGGINGFACE_HUB_CACHE"].startswith("/private/hf/")
def test_build_llm_config_dict_requires_accelerator_type():
from argus.service.serve_llm_config import build_llm_config_dict
from argus.service.serving_spec import ResolvedServingSpec
resolved = ResolvedServingSpec(
user_id="alice",
model_id_suffix="x",
model_id_prefix="alice-202601061235",
model_id="alice-202601061235-x",
model_source="/private/hf/x",
num_replicas=1,
gpus_per_replica=1,
engine_kwargs=None,
)
with pytest.raises(ValueError, match="accelerator_type is required"):
build_llm_config_dict(resolved, accelerator_type="", runtime_env_env_vars={})

View File

@ -0,0 +1,55 @@
from __future__ import annotations
import sys
import types
def test_ray_serve_client_calls_start_run_status(monkeypatch):
import ray # provided by conftest stub
calls: list[tuple[str, object]] = []
def _init(*args, **kwargs):
calls.append(("ray.init", {"args": args, "kwargs": kwargs}))
monkeypatch.setattr(ray, "init", _init, raising=False)
serve = types.ModuleType("ray.serve")
def _start(**kwargs):
calls.append(("serve.start", kwargs))
return None
def _run(app, name=None, route_prefix=None):
calls.append(("serve.run", {"app": app, "name": name, "route_prefix": route_prefix}))
return {"deployed": True}
def _status():
calls.append(("serve.status", None))
return {"ok": True}
serve.start = _start # type: ignore[attr-defined]
serve.run = _run # type: ignore[attr-defined]
serve.status = _status # type: ignore[attr-defined]
sys.modules["ray.serve"] = serve
ray.serve = serve # type: ignore[attr-defined]
from argus.service.serve_client import RayServeClient
client = RayServeClient(http_port=8000, proxy_location="HeadOnly", ray_init_address="auto")
client.ensure_started()
out = client.apply_app(app="APP", app_name="argus_llm_app", route_prefix="/")
st = client.get_status()
assert out == {"deployed": True}
assert st == {"ok": True}
# Verify call order and key args.
assert calls[0][0] == "ray.init"
assert calls[0][1]["kwargs"].get("ignore_reinit_error") is True
assert calls[1][0] == "serve.start"
assert calls[1][1]["http_options"]["port"] == 8000
assert calls[2][0] == "serve.run"
assert calls[2][1]["name"] == "argus_llm_app"
assert calls[3][0] == "serve.status"

View File

@ -23,6 +23,7 @@ def test_v2_config_from_root_dict_new_format_defaults():
assert cfg.sqlite.db_path.endswith(".sqlite3")
assert cfg.scheduler.max_running_tasks == 3
assert cfg.tracking.wandb.enabled is False
assert cfg.serving.enabled is False
def test_v2_config_backward_compat_v2_section_and_default_db_path():
@ -57,6 +58,27 @@ def test_v2_config_requires_data_mappings():
V2Config.from_root_dict({**base, "data": {"sftpgo": ["x"], "retention": {}}})
def test_v2_config_requires_tracking_and_serving_mappings():
from argus.service.config import V2Config
base = {
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
}
with pytest.raises(ValueError, match="config\\.tracking must be a mapping"):
V2Config.from_root_dict({**base, "tracking": ["nope"]})
with pytest.raises(ValueError, match="config\\.tracking\\.wandb must be a mapping"):
V2Config.from_root_dict({**base, "tracking": {"wandb": ["nope"]}})
with pytest.raises(ValueError, match="config\\.serving must be a mapping"):
V2Config.from_root_dict({**base, "serving": ["nope"]})
with pytest.raises(ValueError, match="config\\.serving\\.\\{serve,llm\\} must be mappings"):
V2Config.from_root_dict({**base, "serving": {"serve": ["x"], "llm": {}}})
def test_tracking_wandb_defaults_disabled():
from argus.service.config import V2Config

View File

@ -0,0 +1,23 @@
from __future__ import annotations
from datetime import datetime, timezone
import pytest
from argus.service.serving_spec import make_model_id_prefix
def test_make_model_id_prefix_uses_utc_minutes():
dt = datetime(2026, 1, 6, 12, 35, 59, tzinfo=timezone.utc)
assert make_model_id_prefix(user_id="alice", now_utc=dt) == "alice-202601061235"
def test_make_model_id_prefix_rejects_empty_user_id():
with pytest.raises(ValueError, match="user_id is required"):
make_model_id_prefix(user_id="", now_utc=datetime.now(timezone.utc))
def test_make_model_id_prefix_rejects_slash():
with pytest.raises(ValueError, match="must not contain"):
make_model_id_prefix(user_id="bad/user", now_utc=datetime.now(timezone.utc))

View File

@ -0,0 +1,207 @@
from __future__ import annotations
import json
from pathlib import Path
class _FakeServeClient:
def __init__(self):
self.started = 0
self.applied = []
self.status_calls = 0
self.fail_apply = False
self.fail_status = False
def ensure_started(self) -> None:
self.started += 1
def apply_app(self, *, app, app_name: str, route_prefix: str = "/"):
if self.fail_apply:
raise RuntimeError("boom")
self.applied.append({"app": app, "app_name": app_name, "route_prefix": route_prefix})
return {"ok": True}
def get_status(self):
self.status_calls += 1
if self.fail_status:
raise RuntimeError("status boom")
return {"ok": True}
def _seed_model(db, *, model_key: str, user_id: str, state: str, num_replicas: int = 1, gpus_per_replica: int = 1):
spec_yaml = "model_id: x\nmodel_source: $HOME/common/hf/x\n"
resolved_yaml = f"user_id: {user_id}\nmodel_id: {user_id}-202601061235-x\n"
db.create_serve_model(
model_key=model_key,
user_id=user_id,
model_id_suffix="x",
model_id_prefix=f"{user_id}-202601061235",
model_id=f"{user_id}-202601061235-x",
model_source="/private/hf/x",
num_replicas=num_replicas,
gpus_per_replica=gpus_per_replica,
engine_kwargs_json=json.dumps({"gpu_memory_utilization": 0.9}),
spec_yaml=spec_yaml,
resolved_spec_yaml=resolved_yaml,
)
db.set_serve_model_state(model_key=model_key, state=state, event_type="TEST_SEED")
def test_reconciler_skips_when_no_changes(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
root = {
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
cfg = V2Config.from_root_dict(root)
db = Db(cfg.sqlite.db_path)
db.init()
client = _FakeServeClient()
rec = ServingReconciler(db=db, v2_cfg=cfg, ray_runtime_env_env_vars={}, serve_client=client, get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})())
rec.tick()
assert client.started == 0
assert client.applied == []
def test_reconciler_pending_resources_no_apply(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
cfg = V2Config.from_root_dict(
{
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
)
db = Db(cfg.sqlite.db_path)
db.init()
_seed_model(db, model_key="mk1", user_id="alice", state="QUEUED", num_replicas=2, gpus_per_replica=4)
client = _FakeServeClient()
rec = ServingReconciler(
db=db,
v2_cfg=cfg,
ray_runtime_env_env_vars={},
serve_client=client,
get_available_fn=lambda: type("A", (), {"total_available_gpus": 1, "total_available_npus": 0})(),
)
rec.tick()
# Serve may be started even when resources are insufficient, but apply should not happen.
assert client.started == 1
assert client.applied == []
# State remains QUEUED.
row = db.get_serve_model("mk1")
assert row and row["state"] == "QUEUED"
ev = db.list_serve_events("mk1", limit=50)
assert any(e["event_type"] == "SERVE_PENDING_RESOURCES" for e in ev)
def test_reconciler_apply_success_marks_running(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
cfg = V2Config.from_root_dict(
{
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
)
db = Db(cfg.sqlite.db_path)
db.init()
_seed_model(db, model_key="mk1", user_id="alice", state="QUEUED", num_replicas=1, gpus_per_replica=1)
client = _FakeServeClient()
rec = ServingReconciler(
db=db,
v2_cfg=cfg,
ray_runtime_env_env_vars={"HF_ENDPOINT": "https://hf-mirror.com"},
serve_client=client,
get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})(),
)
rec.tick()
assert client.started == 1
assert len(client.applied) == 1
applied = client.applied[0]["app"]["llm_configs"]
assert applied[0]["engine_kwargs"]["tensor_parallel_size"] == 1
assert applied[0]["runtime_env"]["env_vars"]["HF_HUB_OFFLINE"] == "1"
row = db.get_serve_model("mk1")
assert row and row["state"] == "RUNNING"
def test_reconciler_delete_removes_and_marks_deleted(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
cfg = V2Config.from_root_dict(
{
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
)
db = Db(cfg.sqlite.db_path)
db.init()
_seed_model(db, model_key="keep", user_id="alice", state="RUNNING", num_replicas=1, gpus_per_replica=1)
_seed_model(db, model_key="del", user_id="alice", state="DELETING", num_replicas=1, gpus_per_replica=1)
client = _FakeServeClient()
rec = ServingReconciler(
db=db,
v2_cfg=cfg,
ray_runtime_env_env_vars={},
serve_client=client,
get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})(),
)
rec.tick()
assert len(client.applied) == 1
cfgs = client.applied[0]["app"]["llm_configs"]
assert {c["model_loading_config"]["model_id"] for c in cfgs} == {"alice-202601061235-x"} # only keep remains
row = db.get_serve_model("del")
assert row and row["state"] == "DELETED"
assert row.get("deleted_at")
def test_reconciler_apply_failure_marks_failed(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
cfg = V2Config.from_root_dict(
{
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
)
db = Db(cfg.sqlite.db_path)
db.init()
_seed_model(db, model_key="mk1", user_id="alice", state="QUEUED")
client = _FakeServeClient()
client.fail_apply = True
rec = ServingReconciler(
db=db,
v2_cfg=cfg,
ray_runtime_env_env_vars={},
serve_client=client,
get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})(),
)
rec.tick()
row = db.get_serve_model("mk1")
assert row and row["state"] == "FAILED"
assert row.get("error_summary")

View File

@ -0,0 +1,47 @@
from __future__ import annotations
from datetime import datetime, timezone
import pytest
from argus.service.serving_spec import ServingSpec, resolve_serving_spec
def test_expand_home_macro_and_validate_user_path_ok():
spec = ServingSpec(
model_id="qwen-0.5b",
model_source="$HOME/models/my_model",
num_replicas=1,
gpus_per_replica=1,
)
r = resolve_serving_spec(spec=spec, user_id="alice", now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc))
assert r.model_source == "/private/users/alice/models/my_model"
assert r.model_id == "alice-202601061235-qwen-0.5b"
def test_expand_common_hf_macro_ok():
spec = ServingSpec(
model_id="qwen-0.5b",
model_source="$HOME/common/hf/hub/models--Qwen--Qwen2.5/snapshots/abc",
num_replicas=1,
gpus_per_replica=1,
)
r = resolve_serving_spec(spec=spec, user_id="alice", now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc))
assert r.model_source.startswith("/private/hf/")
@pytest.mark.parametrize(
"src",
[
"/etc/passwd",
"relative/path",
"/private/users/bob/models/x",
"/private/users/alice/../bob/x",
"/private/common/hf/x",
],
)
def test_model_source_path_rejected(src: str):
spec = ServingSpec(model_id="qwen-0.5b", model_source=src, num_replicas=1, gpus_per_replica=1)
with pytest.raises((ValueError, PermissionError)):
resolve_serving_spec(spec=spec, user_id="alice", now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc))

View File

@ -0,0 +1,72 @@
from __future__ import annotations
import pytest
from argus.service.serving_spec import ServingSpec, parse_serving_spec, validate_model_id_suffix
@pytest.mark.parametrize(
"suffix",
[
"a",
"qwen-0.5b",
"Qwen2.5-0.5B",
"a_b",
"a.b-c",
"a" * 64,
],
)
def test_validate_model_id_suffix_accepts(suffix: str):
validate_model_id_suffix(suffix)
@pytest.mark.parametrize(
"suffix",
[
"",
" a",
"a ",
"-bad",
".bad",
"bad/",
"bad..",
"bad\n",
"bad\t",
"a" * 65,
],
)
def test_validate_model_id_suffix_rejects(suffix: str):
with pytest.raises(ValueError):
validate_model_id_suffix(suffix)
def test_parse_serving_spec_smoke_defaults():
spec = parse_serving_spec(
{
"model_id": "qwen-0.5b",
"model_source": "/private/hf/x",
}
)
assert isinstance(spec, ServingSpec)
assert spec.num_replicas == 1
assert spec.gpus_per_replica == 1
assert spec.engine_kwargs is None
def test_parse_serving_spec_rejects_missing_fields():
with pytest.raises(ValueError, match="missing required field: model_id"):
parse_serving_spec({"model_source": "/private/hf/x"})
with pytest.raises(ValueError, match="missing required field: model_source"):
parse_serving_spec({"model_id": "x"})
def test_parse_serving_spec_rejects_bad_types():
with pytest.raises(ValueError, match="serving spec must be a mapping"):
parse_serving_spec(["nope"])
with pytest.raises(ValueError, match="num_replicas"):
parse_serving_spec({"model_id": "x", "model_source": "/private/hf/x", "num_replicas": 0})
with pytest.raises(ValueError, match="gpus_per_replica"):
parse_serving_spec({"model_id": "x", "model_source": "/private/hf/x", "gpus_per_replica": 0})
with pytest.raises(ValueError, match="engine_kwargs"):
parse_serving_spec({"model_id": "x", "model_source": "/private/hf/x", "engine_kwargs": "nope"})

View File

@ -42,10 +42,13 @@ def test_ui_routes_render_200(tmp_path, monkeypatch):
"/ui/login",
"/ui/tasks",
"/ui/tasks/new",
"/ui/serving",
"/ui/serving/new",
"/ui/data",
"/ui/admin",
"/ui/tasks/any-task-id",
"/ui/tasks/any-task-id/logs",
"/ui/serving/any-model-key",
):
r = c.get(path, allow_redirects=True)
assert r.status_code == 200
@ -60,7 +63,7 @@ def test_ui_contains_sidebar_links(tmp_path, monkeypatch):
r = c.get("/ui/tasks")
assert r.status_code == 200
for link in ("/ui/tasks", "/ui/tasks/new", "/ui/data", "/ui/login", "/ui/admin"):
for link in ("/ui/tasks", "/ui/tasks/new", "/ui/serving", "/ui/data", "/ui/login", "/ui/admin"):
assert link in r.text
assert "Ray Dashboard" in r.text

View File

@ -0,0 +1,56 @@
from __future__ import annotations
from pathlib import Path
from fastapi.testclient import TestClient
from argus.service.app import create_app
def _write_config(tmp_path: Path) -> Path:
p = tmp_path / "cfg.yaml"
p.write_text(
"""
ray:
address: "http://127.0.0.1:8265"
shared_root: "/private"
entrypoint_num_cpus: 1
entrypoint_resources: { worker_node: 1 }
runtime_env: { env_vars: { PYTHONUNBUFFERED: "1" } }
service:
api: { host: "127.0.0.1", port: 8080 }
auth: { token_env: "MVP_INTERNAL_TOKEN" }
sqlite: { db_path: "%(db)s" }
data:
user_root: "%(users)s"
sftpgo: { enabled: false }
retention: { jobs_trash_after_days: 3, jobs_purge_after_days: 7, janitor_interval_s: 3600 }
serving: {}
"""
% {"db": str(tmp_path / "mvp.sqlite3"), "users": str(tmp_path / "users")}
)
return p
def test_ui_serving_pages_render(tmp_path, monkeypatch):
cfg = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
app = create_app(str(cfg))
c = TestClient(app)
for path in ("/ui/serving", "/ui/serving/new", "/ui/serving/any-model-key"):
r = c.get(path)
assert r.status_code == 200
assert "<html" in r.text.lower()
def test_ui_serving_contains_openai_port_8000(tmp_path, monkeypatch):
cfg = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
app = create_app(str(cfg))
c = TestClient(app)
r = c.get("/ui/serving")
assert r.status_code == 200
assert "curOriginWithPort(8000)" in r.text
assert "/v1/models" in r.text

View File

@ -11,10 +11,11 @@ fi
echo "[host] docker compose up -d (mvp)"
BUILD="${BUILD:-0}"
RAY_NODE_IMAGE="${RAY_NODE_IMAGE:-argus/argus-ray-node:vllm011.latest}"
# If the image isn't present locally, force build once.
if [[ "${BUILD}" != "1" ]]; then
if ! docker image inspect argus/argus-ray-node:v2.5 >/dev/null 2>&1; then
if ! docker image inspect "${RAY_NODE_IMAGE}" >/dev/null 2>&1; then
BUILD="1"
fi
fi

View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
container="${MVP_HEAD_CONTAINER:-argus-ray-head}"
model_source="${MODEL_SOURCE:-}"
if [[ -n "${1:-}" ]]; then
model_source="$1"
fi
argv=(python3 /workspace/mvp/scripts/serve_llm_smoke.py)
if [[ -n "${model_source}" ]]; then
argv+=(--model-source "${model_source}")
fi
argv+=(--accelerator-type "${ARGUS_ACCELERATOR_TYPE:-H20}")
echo "[host] run Ray Serve LLM smoke test in container: ${container}" >&2
docker exec -it "${container}" bash -lc "$(printf '%q ' "${argv[@]}")"

View File

@ -0,0 +1,193 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=lib.sh
source "${SCRIPT_DIR}/lib.sh"
API_ADDR="${API_ADDR:-http://127.0.0.1:8080}"
OPENAI_BASE_URL="${OPENAI_BASE_URL:-http://127.0.0.1:8000/v1}"
ADMIN_TOKEN="${MVP_INTERNAL_TOKEN:-}"
USER_ID="${USER_ID:-alice}"
EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers
CONFIG_IN_CONTAINER="${CONFIG_IN_CONTAINER:-/workspace/mvp/configs/dev.yaml}"
SFTPGO_ADMIN_PASSWORD="${SFTPGO_ADMIN_PASSWORD:-my-dev-sftpgo-admin}"
export SFTPGO_ADMIN_PASSWORD
if [[ -z "${ADMIN_TOKEN}" ]]; then
echo "ERROR: MVP_INTERNAL_TOKEN must be set in host env (admin token)" >&2
exit 1
fi
api_curl_admin() {
curl -sS -H "Authorization: Bearer ${ADMIN_TOKEN}" "$@"
}
api_wait_ready() {
local tries="${1:-60}"
for i in $(seq 1 "${tries}"); do
if curl -sS -m 2 "${API_ADDR}/docs" >/dev/null 2>&1; then
echo "[host] api_ready: ${API_ADDR}"
return 0
fi
echo "[host] waiting api... (${i}/${tries})"
sleep 2
done
echo "ERROR: api not ready: ${API_ADDR}" >&2
return 1
}
ray_wait_ready() {
local tries="${1:-60}"
for i in $(seq 1 "${tries}"); do
if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}/api/version" >/dev/null 2>&1; then
echo "[host] ray_dashboard_ready: ${RAY_DASHBOARD_ADDR}"
return 0
fi
echo "[host] waiting ray dashboard... (${i}/${tries})"
sleep 2
done
echo "ERROR: ray dashboard not ready: ${RAY_DASHBOARD_ADDR}" >&2
return 1
}
# Wait until at least $want alive Ray nodes are visible from inside the head container.
ray_wait_nodes() {
local want="${1:-3}"
local tries="${2:-60}"
for i in $(seq 1 "${tries}"); do
local out n
out="$(docker exec -i "${HEAD_CONTAINER}" python3 -c "import ray; ray.init(address='auto', ignore_reinit_error=True, log_to_driver=False, logging_level='ERROR'); print(sum(1 for n in ray.nodes() if n.get('Alive')))" 2>/dev/null || true)"
n="$(printf '%s\n' "${out}" | tail -n 1 | tr -cd '0-9' || true)"
if [[ "${n}" =~ ^[0-9]+$ ]]; then
echo "[host] ray_nodes_alive=${n} (want>=${want})"
if [[ "${n}" -ge "${want}" ]]; then
return 0
fi
else
echo "[host] waiting ray nodes... (${i}/${tries})"
fi
sleep 2
done
echo "ERROR: ray nodes not ready (want>=${want})" >&2
docker exec -i "${HEAD_CONTAINER}" bash -lc "ray status || true" >&2 || true
return 1
}
openai_wait_ready() {
local tries="${1:-120}"
for i in $(seq 1 "${tries}"); do
if curl -sS -m 2 "${OPENAI_BASE_URL}/models" >/dev/null 2>&1; then
echo "[host] openai_ready: ${OPENAI_BASE_URL}"
return 0
fi
echo "[host] waiting openai... (${i}/${tries})"
sleep 2
done
echo "ERROR: openai not ready: ${OPENAI_BASE_URL}" >&2
return 1
}
# Poll the serving API until the model reaches the wanted state; bail out early if it reports FAILED.
wait_model_state() {
local token="$1"
local model_key="$2"
local want="$3"
local tries="${4:-120}"
for i in $(seq 1 "${tries}"); do
local body state
body="$(curl -sS -H "Authorization: Bearer ${token}" "${API_ADDR}/api/v2/serve/models/${model_key}")"
state="$(printf '%s' "${body}" | python3 -c 'import sys,json; print(json.load(sys.stdin)["model"]["state"])' 2>/dev/null || true)"
echo "[host] model ${model_key}: ${state}"
if [[ "${state}" == "${want}" ]]; then
return 0
fi
if [[ "${state}" == "FAILED" ]]; then
echo "[host] model failed; detail:" >&2
printf '%s\n' "${body}" | python3 -m json.tool >&2 || true
return 1
fi
sleep 2
done
echo "ERROR: model not in state ${want} after timeout" >&2
return 1
}
echo "[host] ===== run_all_v38_serving.sh begin ====="
"${SCRIPT_DIR}/00_prereq_check.sh"
"${SCRIPT_DIR}/03_cleanup_v1_legacy.sh"
"${SCRIPT_DIR}/04_cleanup_v2_legacy.sh"
echo "[host] bring down existing containers (best-effort)"
"${SCRIPT_DIR}/02_down.sh" || true
echo "[host] (re)create containers (Ray + SFTPGo + W&B)"
# For v3.8, we need the latest ray-node image (ray[llm] deps). Force build once.
BUILD="${BUILD:-1}" "${SCRIPT_DIR}/01_up.sh"
echo "[host] wait ray ready"
ray_wait_ready 60
ray_wait_nodes "${EXPECTED_RAY_NODES}" 120
echo "[host] prepare data/model (best-effort; uses shared caches)"
"${SCRIPT_DIR}/30_prepare_data_and_model.sh" || true
echo "[host] start api"
CONFIG_IN_CONTAINER="${CONFIG_IN_CONTAINER}" MVP_INTERNAL_TOKEN="${ADMIN_TOKEN}" "${SCRIPT_DIR}/60_start_api.sh"
api_wait_ready 60
echo "[host] create user (idempotent)"
api_curl_admin -X POST "${API_ADDR}/api/v2/users" -H "Content-Type: application/json" --data-binary "{\"user_id\":\"${USER_ID}\"}" >/dev/null || true
echo "[host] issue user token"
USER_TOKEN="$(api_curl_admin -X POST "${API_ADDR}/api/v2/users/${USER_ID}/tokens" | python3 -c 'import sys,json; print(json.load(sys.stdin)["token"])')"
echo "[host] resolve local model snapshot path (offline)"
LOCAL_MODEL_PATH="$(dexec "${HEAD_CONTAINER}" bash -lc "python3 -c \"import os; from huggingface_hub import snapshot_download; os.environ.setdefault('HF_HOME','/private/hf'); print(snapshot_download(repo_id='Qwen/Qwen2.5-0.5B-Instruct', local_files_only=True))\" " | tail -n 1)"
if [[ -z "${LOCAL_MODEL_PATH}" || "${LOCAL_MODEL_PATH}" != /* ]]; then
echo "ERROR: failed to resolve LOCAL_MODEL_PATH: ${LOCAL_MODEL_PATH}" >&2
exit 1
fi
echo "[host] local_model_path: ${LOCAL_MODEL_PATH}"
echo "[host] submit serving model via API"
SERVE_SPEC=$'model_id: qwen-0.5b\nmodel_source: '"${LOCAL_MODEL_PATH}"$'\nnum_replicas: 1\ngpus_per_replica: 1\n'
CREATE_RESP="$(curl -sS -H "Authorization: Bearer ${USER_TOKEN}" -H "Content-Type: application/yaml" --data-binary "${SERVE_SPEC}" "${API_ADDR}/api/v2/serve/models")"
echo "[host] create_model_resp: ${CREATE_RESP}"
MODEL_KEY="$(printf '%s' "${CREATE_RESP}" | python3 -c 'import sys,json; print(json.load(sys.stdin)["model_key"])')"
echo "[host] wait model RUNNING"
wait_model_state "${USER_TOKEN}" "${MODEL_KEY}" "RUNNING" 300
echo "[host] wait OpenAI ingress ready"
openai_wait_ready 120
echo "[host] verify /v1/models contains model"
MODEL_ID="$(
curl -sS "${OPENAI_BASE_URL}/models" \
| python3 -c 'import sys,json; obj=json.load(sys.stdin); print("\n".join([m.get("id","") for m in obj.get("data",[]) if isinstance(m,dict)]))' \
| grep -E "^${USER_ID}-[0-9]{12}-qwen-0\\.5b$" \
| head -n1 \
|| true
)"
if [[ -z "${MODEL_ID}" ]]; then
echo "ERROR: model id not found in /v1/models" >&2
curl -sS "${OPENAI_BASE_URL}/models" | python3 -m json.tool >&2 || true
exit 1
fi
echo "[host] model_id: ${MODEL_ID}"
echo "[host] chat completion (best-effort)"
CHAT_RESP="$(curl -sS -H "Content-Type: application/json" -H "Authorization: Bearer FAKE_KEY" -X POST "${OPENAI_BASE_URL}/chat/completions" --data-binary "{\"model\":\"${MODEL_ID}\",\"messages\":[{\"role\":\"user\",\"content\":\"hello\"}],\"max_tokens\":16,\"stream\":false}")"
printf '%s\n' "${CHAT_RESP}" | python3 -m json.tool >/dev/null 2>&1 || {
echo "ERROR: invalid chat response" >&2
printf '%s\n' "${CHAT_RESP}" >&2
exit 1
}
echo "[host] chat_ok"
echo "[host] delete model"
curl -sS -H "Authorization: Bearer ${USER_TOKEN}" -X DELETE "${API_ADDR}/api/v2/serve/models/${MODEL_KEY}" >/dev/null
wait_model_state "${USER_TOKEN}" "${MODEL_KEY}" "DELETED" 300
echo "[host] ===== run_all_v38_serving.sh done ====="

View File

@@ -0,0 +1,102 @@
from __future__ import annotations
import argparse
import json
import os
import time
import urllib.request
from pathlib import Path
from typing import Any
def _pick_qwen_snapshot() -> str | None:
    """Return a cached Qwen2.5-0.5B-Instruct snapshot dir from the shared HF cache (lexicographically last hash), or None."""
base = Path("/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots")
if not base.exists():
return None
snaps = sorted([p for p in base.iterdir() if p.is_dir()], reverse=True)
return str(snaps[0]) if snaps else None
def _http_get_json(url: str) -> Any:
with urllib.request.urlopen(url, timeout=10) as resp:
raw = resp.read().decode("utf-8")
return json.loads(raw)
def _wait_http_json(url: str, *, timeout_s: int) -> Any:
deadline = time.time() + float(timeout_s)
last_err: Exception | None = None
while time.time() < deadline:
try:
return _http_get_json(url)
except Exception as e:
last_err = e
time.sleep(2)
raise RuntimeError(f"timeout waiting for {url}: {last_err!r}")
def main(argv: list[str] | None = None) -> int:
ap = argparse.ArgumentParser(description="Ray Serve LLM smoke test (deploy + /v1/models probe).")
ap.add_argument("--ray-address", default="auto")
ap.add_argument("--http-port", type=int, default=8000)
ap.add_argument("--app-name", default="argus_llm_smoke")
ap.add_argument("--route-prefix", default="/")
ap.add_argument("--accelerator-type", default=os.environ.get("ARGUS_ACCELERATOR_TYPE") or "H20")
ap.add_argument("--model-id", default="smoke-qwen-0.5b")
ap.add_argument("--model-source", default=None, help="Local path or HF id. Default: cached Qwen snapshot under /private/hf.")
ap.add_argument("--tensor-parallel-size", type=int, default=1)
ap.add_argument("--num-replicas", type=int, default=1)
ap.add_argument("--wait-s", type=int, default=600)
args = ap.parse_args(argv)
model_source = str(args.model_source or _pick_qwen_snapshot() or "")
if not model_source:
raise SystemExit("missing --model-source and no cached Qwen snapshot found under /private/hf")
# Force offline HF behavior for the smoke test.
os.environ.setdefault("HF_HOME", "/private/hf")
os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/private/hf/hub")
os.environ.setdefault("TRANSFORMERS_CACHE", "/private/hf/transformers")
os.environ["HF_HUB_OFFLINE"] = "1"
import ray
ray.init(address=str(args.ray_address), ignore_reinit_error=True, log_to_driver=False)
from ray import serve
try:
serve.start(proxy_location="HeadOnly", http_options={"host": "0.0.0.0", "port": int(args.http_port)})
except Exception:
# Best-effort: Serve may already be running in the container (e.g., started by the MVP API scheduler).
pass
from ray.serve.llm import LLMConfig, build_openai_app
# Build a config dict and filter by the current Ray's LLMConfig schema, since fields
# may differ between Ray versions.
cfg_dict: dict[str, Any] = {
"model_loading_config": {"model_id": str(args.model_id), "model_source": model_source},
"accelerator_type": str(args.accelerator_type),
"deployment_config": {"num_replicas": int(args.num_replicas)},
"engine_kwargs": {"tensor_parallel_size": int(args.tensor_parallel_size)},
"runtime_env": {"env_vars": {"HF_HUB_OFFLINE": "1", "HF_HOME": "/private/hf"}},
}
allowed = set(getattr(LLMConfig, "model_fields", {}).keys())
if allowed:
cfg_dict = {k: v for k, v in cfg_dict.items() if k in allowed}
llm_cfg = LLMConfig(**cfg_dict)
app = build_openai_app({"llm_configs": [llm_cfg]})
serve.run(app, name=str(args.app_name), route_prefix=str(args.route_prefix))
models_url = f"http://127.0.0.1:{int(args.http_port)}/v1/models"
payload = _wait_http_json(models_url, timeout_s=int(args.wait_s))
print(json.dumps(payload, indent=2, sort_keys=True))
return 0
if __name__ == "__main__":
raise SystemExit(main())
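
The cfg_dict above deliberately sticks to the smallest useful set of LLMConfig fields and filters them against the installed Ray's schema. For reference, a hypothetical variant for a larger model (names and paths below are placeholders, not part of this commit) that replaces the fixed replica count with Serve autoscaling and uses two GPUs per replica could look like:

cfg_dict = {
    "model_loading_config": {
        "model_id": "qwen-7b",  # placeholder id
        "model_source": "/private/models/Qwen2.5-7B-Instruct",  # placeholder path
    },
    "accelerator_type": "H20",
    "deployment_config": {
        # autoscaling_config replaces num_replicas; both are plain Serve deployment options.
        "autoscaling_config": {"min_replicas": 1, "max_replicas": 2},
    },
    "engine_kwargs": {"tensor_parallel_size": 2},
    "runtime_env": {"env_vars": {"HF_HUB_OFFLINE": "1", "HF_HOME": "/private/hf"}},
}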