v3.8 model serving deployed successfully
parent 63963eba29
commit 686739fea2
specs/mvp/v3.8/ray_serve.md (new file, 314 lines)
@ -0,0 +1,314 @@
API reference material

https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html

ray.serve.llm.LLMConfig

pydantic model ray.serve.llm.LLMConfig[source]
|
||||
The configuration for starting an LLM deployment.
|
||||
|
||||
PublicAPI (alpha): This API is in alpha and may change before becoming stable.
|
||||
|
||||
field accelerator_type: str | None = None
|
||||
The type of accelerator runs the model on. Only the following values are supported: [‘V100’, ‘P100’, ‘T4’, ‘P4’, ‘K80’, ‘A10G’, ‘L4’, ‘L40S’, ‘A100’, ‘H100’, ‘H200’, ‘H20’, ‘B200’, ‘Intel-GPU-Max-1550’, ‘Intel-GPU-Max-1100’, ‘Intel-GAUDI’, ‘AMD-Instinct-MI100’, ‘AMD-Instinct-MI250X’, ‘AMD-Instinct-MI250X-MI250’, ‘AMD-Instinct-MI210’, ‘AMD-Instinct-MI300A’, ‘AMD-Instinct-MI300X-OAM’, ‘AMD-Instinct-MI300X-HF’, ‘AMD-Instinct-MI308X’, ‘AMD-Instinct-MI325X-OAM’, ‘AMD-Instinct-MI350X-OAM’, ‘AMD-Instinct-MI355X-OAM’, ‘AMD-Radeon-R9-200-HD-7900’, ‘AMD-Radeon-HD-7900’, ‘aws-neuron-core’, ‘TPU-V2’, ‘TPU-V3’, ‘TPU-V4’, ‘TPU-V5P’, ‘TPU-V5LITEPOD’, ‘TPU-V6E’, ‘Ascend910B’, ‘Ascend910B4’, ‘MXC500’, ‘MXC550’, ‘A100-40G’, ‘A100-80G’]
|
||||
|
||||
field callback_config: CallbackConfig [Optional]
|
||||
Callback configuration to use for model initialization. Can be a string path to a class or a Callback subclass.
|
||||
|
||||
field deployment_config: Dict[str, Any] [Optional]
|
||||
The Ray @serve.deployment options. Supported fields are: name, num_replicas, ray_actor_options, max_ongoing_requests, autoscaling_config, max_queued_requests, user_config, health_check_period_s, health_check_timeout_s, graceful_shutdown_wait_loop_s, graceful_shutdown_timeout_s, logging_config, request_router_config. For more details, see the Ray Serve Documentation.
|
||||
|
||||
field engine_kwargs: Dict[str, Any] = {}
|
||||
Additional keyword arguments for the engine. In case of vLLM, this will include all the configuration knobs they provide out of the box, except for tensor-parallelism which is set automatically from Ray Serve configs.
|
||||
|
||||
field experimental_configs: Dict[str, Any] [Optional]
|
||||
Experimental configurations for Ray Serve LLM. This is a dictionary of key-value pairs. Current supported keys are: - stream_batching_interval_ms: Ray Serve LLM batches streaming requests together. This config decides how long to wait for the batch before processing the requests. Defaults to 50.0. - num_ingress_replicas: The number of replicas for the router. Ray Serve will take the max amount all the replicas. Default would be 2 router replicas per model replica.
|
||||
|
||||
field llm_engine: str = 'vLLM'
|
||||
The LLMEngine that should be used to run the model. Only the following values are supported: [‘vLLM’]
|
||||
|
||||
field log_engine_metrics: bool | None = True
|
||||
Enable additional engine metrics via Ray Prometheus port.
|
||||
|
||||
field lora_config: Dict[str, Any] | LoraConfig | None = None
|
||||
Settings for LoRA adapter. Validated against LoraConfig.
|
||||
|
||||
field model_loading_config: Dict[str, Any] | ModelLoadingConfig [Required]
|
||||
The settings for how to download and expose the model. Validated against ModelLoadingConfig.
|
||||
|
||||
field placement_group_config: Dict[str, Any] | None = None
|
||||
Ray placement group configuration for scheduling vLLM engine workers. Defines resource bundles and placement strategy for multi-node deployments. Should contain ‘bundles’ (list of resource dicts) and optionally ‘strategy’ (defaults to ‘PACK’). Example: {‘bundles’: [{‘GPU’: 1, ‘CPU’: 2}], ‘strategy’: ‘PACK’}
|
||||
|
||||
field runtime_env: Dict[str, Any] | None = None
|
||||
The runtime_env to use for the model deployment replica and the engine workers.
|
||||
|
||||
apply_checkpoint_info(model_id_or_path: str, trust_remote_code: bool = False) → None[source]
|
||||
Apply the checkpoint info to the model config.
|
||||
|
||||
classmethod from_file(path: str, **kwargs) → ModelT
|
||||
Load a model from a YAML file path.
|
||||
|
||||
get_engine_config() → None | VLLMEngineConfig[source]
|
||||
Returns the engine config for the given LLM config.
|
||||
|
||||
LLMConfig not only has engine config but also deployment config, etc.
|
||||
|
||||
get_or_create_callback() → CallbackBase | None[source]
|
||||
Get or create the callback instance for this process.
|
||||
|
||||
This ensures one callback instance per process (singleton pattern). The instance is cached so the same object is used across all hooks.
|
||||
|
||||
Returns
|
||||
:
|
||||
Instance of class that implements Callback
|
||||
|
||||
multiplex_config() → ServeMultiplexConfig[source]
|
||||
classmethod parse_yaml(file, **kwargs) → ModelT
|
||||
setup_engine_backend()[source]
|
||||
update_engine_kwargs(**kwargs: Any) → None[source]
|
||||
Update the engine_kwargs and the engine_config engine_kwargs.
|
||||
|
||||
This is typically called during engine starts, when certain engine_kwargs (e.g., data_parallel_rank) become available.
|
||||
|
||||
validator validate_accelerator_type » accelerator_type[source]
|
||||
validator validate_deployment_config » deployment_config[source]
|
||||
Validates the deployment config dictionary.
|
||||
|
||||
validator validate_experimental_configs » experimental_configs[source]
|
||||
Validates the experimental configs dictionary.
|
||||
|
||||
validator validate_llm_engine » llm_engine[source]
|
||||
Validates the llm_engine string value.
|
||||
|
||||
validator validate_lora_config » lora_config[source]
|
||||
Validates the lora config dictionary.
|
||||
|
||||
validator validate_model_loading_config » model_loading_config[source]
|
||||
Validates the model loading config dictionary.
|
||||
|
||||
property input_modality: str
|
||||
Returns the input modality of the model. There could be more types in the future. Right now assumes if the model doesn't support vision, it'll be text.
|
||||
|
||||
property max_request_context_length: int | None
|
||||
property model_architecture: str
|
||||
property model_id: str
|
||||
property supports_vision: bool
|
||||
|
||||
# Python API

Ray Serve API reference: https://docs.ray.io/en/latest/serve/api/index.html#serve-api
|
||||
|
||||
|
||||
Python API
|
||||
Writing Applications
|
||||
serve.Deployment
|
||||
|
||||
Class (or function) decorated with the @serve.deployment decorator.
|
||||
|
||||
serve.Application
|
||||
|
||||
One or more deployments bound with arguments that can be deployed together.
|
||||
|
||||
Deployment Decorators
|
||||
serve.deployment
|
||||
|
||||
Decorator that converts a Python class to a Deployment.
|
||||
|
||||
serve.ingress
|
||||
|
||||
Wrap a deployment class with an ASGI application for HTTP request parsing.
|
||||
|
||||
serve.batch
|
||||
|
||||
Converts a function to asynchronously handle batches.
|
||||
|
||||
serve.multiplexed
|
||||
|
||||
Wrap a callable or method used to load multiplexed models in a replica.
|
||||
|
||||
Deployment Handles
|
||||
Note
|
||||
|
||||
The deprecated RayServeHandle and RayServeSyncHandle APIs have been fully removed as of Ray 2.10. See the model composition guide for how to update code to use the DeploymentHandle API instead.
|
||||
|
||||
serve.handle.DeploymentHandle
|
||||
|
||||
A handle used to make requests to a deployment at runtime.
|
||||
|
||||
serve.handle.DeploymentResponse
|
||||
|
||||
A future-like object wrapping the result of a unary deployment handle call.
|
||||
|
||||
serve.handle.DeploymentResponseGenerator
|
||||
|
||||
A future-like object wrapping the result of a streaming deployment handle call.
|
||||
|
||||
Running Applications
|
||||
serve.start
|
||||
|
||||
Start Serve on the cluster.
|
||||
|
||||
serve.run
|
||||
|
||||
Run an application and return a handle to its ingress deployment.
|
||||
|
||||
serve.delete
|
||||
|
||||
Delete an application by its name.
|
||||
|
||||
serve.status
|
||||
|
||||
Get the status of Serve on the cluster.
|
||||
|
||||
serve.shutdown
|
||||
|
||||
Completely shut down Serve on the cluster.
|
||||
|
||||
serve.shutdown_async
|
||||
|
||||
Completely shut down Serve on the cluster asynchronously.
|
||||
|
||||
Configurations
|
||||
serve.config.ProxyLocation
|
||||
|
||||
Config for where to run proxies to receive ingress traffic to the cluster.
|
||||
|
||||
serve.config.gRPCOptions
|
||||
|
||||
gRPC options for the proxies.
|
||||
|
||||
serve.config.HTTPOptions
|
||||
|
||||
HTTP options for the proxies.
|
||||
|
||||
serve.config.AutoscalingConfig
|
||||
|
||||
Config for the Serve Autoscaler.
|
||||
|
||||
serve.config.AutoscalingPolicy
|
||||
|
||||
PublicAPI (alpha): This API is in alpha and may change before becoming stable.
|
||||
|
||||
serve.config.AutoscalingContext
|
||||
|
||||
Rich context provided to custom autoscaling policies.
|
||||
|
||||
serve.config.AggregationFunction
|
||||
|
||||
An enumeration.
|
||||
|
||||
serve.config.RequestRouterConfig
|
||||
|
||||
Config for the Serve request router.
|
||||
|
||||
Schemas
|
||||
serve.schema.ServeActorDetails
|
||||
|
||||
Detailed info about a Ray Serve actor.
|
||||
|
||||
serve.schema.ProxyDetails
|
||||
|
||||
Detailed info about a Ray Serve ProxyActor.
|
||||
|
||||
serve.schema.ApplicationStatusOverview
|
||||
|
||||
Describes the status of an application and all its deployments.
|
||||
|
||||
serve.schema.ServeStatus
|
||||
|
||||
Describes the status of Serve.
|
||||
|
||||
serve.schema.DeploymentStatusOverview
|
||||
|
||||
Describes the status of a deployment.
|
||||
|
||||
serve.schema.EncodingType
|
||||
|
||||
Encoding type for the serve logs.
|
||||
|
||||
serve.schema.AutoscalingMetricsHealth
|
||||
|
||||
An enumeration.
|
||||
|
||||
serve.schema.AutoscalingStatus
|
||||
|
||||
An enumeration.
|
||||
|
||||
serve.schema.ScalingDecision
|
||||
|
||||
One autoscaling decision with minimal provenance.
|
||||
|
||||
serve.schema.DeploymentAutoscalingDetail
|
||||
|
||||
Deployment-level autoscaler observability.
|
||||
|
||||
serve.schema.ReplicaRank
|
||||
|
||||
Replica rank model.
|
||||
|
||||
Request Router
|
||||
serve.request_router.ReplicaID
|
||||
|
||||
A unique identifier for a replica.
|
||||
|
||||
serve.request_router.PendingRequest
|
||||
|
||||
A request that is pending execution by a replica.
|
||||
|
||||
serve.request_router.RunningReplica
|
||||
|
||||
Contains info on a running replica.
|
||||
|
||||
serve.request_router.FIFOMixin
|
||||
|
||||
Mixin for FIFO routing.
|
||||
|
||||
serve.request_router.LocalityMixin
|
||||
|
||||
Mixin for locality routing.
|
||||
|
||||
serve.request_router.MultiplexMixin
|
||||
|
||||
Mixin for multiplex routing.
|
||||
|
||||
serve.request_router.RequestRouter
|
||||
|
||||
Abstract interface for a request router (how the router calls it).
|
||||
|
||||
Advanced APIs
|
||||
serve.get_replica_context
|
||||
|
||||
Returns the deployment and replica tag from within a replica at runtime.
|
||||
|
||||
serve.context.ReplicaContext
|
||||
|
||||
Stores runtime context info for replicas.
|
||||
|
||||
serve.get_multiplexed_model_id
|
||||
|
||||
Get the multiplexed model ID for the current request.
|
||||
|
||||
serve.get_app_handle
|
||||
|
||||
Get a handle to the application's ingress deployment by name.
|
||||
|
||||
serve.get_deployment_handle
|
||||
|
||||
Get a handle to a deployment by name.
|
||||
|
||||
serve.grpc_util.RayServegRPCContext
|
||||
|
||||
Context manager to set and get gRPC context.
|
||||
|
||||
serve.exceptions.BackPressureError
|
||||
|
||||
Raised when max_queued_requests is exceeded on a DeploymentHandle.
|
||||
|
||||
serve.exceptions.RayServeException
|
||||
|
||||
serve.exceptions.RequestCancelledError
|
||||
|
||||
Raise when a Serve request is cancelled.
|
||||
|
||||
serve.exceptions.DeploymentUnavailableError
|
||||
|
||||
Raised when a Serve deployment is unavailable to receive requests.
|
||||
specs/mvp/v3.8/ray_serve_llm.md (new file, 87 lines)
@ -0,0 +1,87 @@
Based on the provided sources, the following explains the principles and operational steps for dynamically deploying a **medium-sized LLM** with the **Builder Pattern** on Ray Serve and vLLM.

### 1. Core Principles

1. **Definition of a medium-sized LLM**: medium models (e.g. Llama-3.1-70B) have roughly 70B parameters. They typically run on a **single node** using **4 to 8 GPUs**.
2. **Builder Pattern mechanism**: the pattern provides a high-level abstraction through the `build_openai_app` function. The developer only defines an `LLMConfig` object; the underlying `LLMServer` and `OpenAiIngress` components are built and wired automatically.
3. **High-performance backend (vLLM)**: Ray Serve LLM uses vLLM as the inference engine, providing high-performance inference and GPU memory management.
4. **Dynamic scaling and resource scheduling**:
   * **Tensor parallelism**: `tensor_parallel_size` distributes the model weights evenly across all GPUs of a single node.
   * **Replica autoscaling**: `autoscaling_config` adjusts `min_replicas` and `max_replicas` dynamically so the service scales inference replicas with live traffic.

---

### 2. Operational Steps

#### 2.1 Environment setup

Install the required packages and configure a Hugging Face access token (needed for gated models such as Llama-3.1).

```bash
pip install "ray[serve,llm]"
export HF_TOKEN=<YOUR_HUGGINGFACE_TOKEN>
```

#### 2.2 Write the deployment script (`serve_medium_llm.py`)

Define the configuration with the **Builder Pattern** and build the application. The example below configures a typical 70B model deployment:

```python
# serve_medium_llm.py
import os

from ray.serve.llm import LLMConfig, build_openai_app

llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="my-llama-3.1-70b",
        model_source="meta-llama/Llama-3.1-70B-Instruct",
    ),
    accelerator_type="A100-40G",  # or L40S
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,  # minimum number of replicas
            max_replicas=4,  # maximum number of replicas, enabling dynamic scale-out
        )
    ),
    runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}),
    engine_kwargs=dict(
        max_model_len=32768,      # context length
        tensor_parallel_size=8,   # split the weights across 8 GPUs on a single node
    ),
)

# Build the application with the Builder Pattern
app = build_openai_app({"llm_configs": [llm_config]})
```

#### 2.3 Launch the deployment

Run the following command in a terminal to start the service:

```bash
serve run serve_medium_llm:app
```

Deployment usually takes a few minutes, covering cluster setup, vLLM server startup, and downloading the model weights.

#### 2.4 Send a test request

Once the service is up, it can be accessed through the OpenAI-compatible endpoint.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="FAKE_KEY")
response = client.chat.completions.create(
    model="my-llama-3.1-70b",
    messages=[{"role": "user", "content": "Explain quantum entanglement."}],
    stream=True,
)
for chunk in response:
    # Each streamed chunk carries a delta; guard against empty chunks.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```

---

### 3. Performance and Concurrency Tuning

* **Increase concurrency**: lowering `max_model_len` reduces the GPU memory needed for the KV cache, which significantly raises the maximum number of concurrent requests each replica can serve (see the sketch after this list).
* **Monitoring**: use the Ray Serve LLM dashboard to track **TTFT (time to first token)**, **TPOT (time per output token)**, and **token throughput** to evaluate service performance.
* **Precision trade-off**: in resource-constrained scenarios, a **quantized model** (e.g. FP8) shrinks the model's memory footprint, leaving more room for the KV cache and therefore higher concurrency.
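
These knobs map directly onto `LLMConfig.engine_kwargs`. Below is a minimal, hedged sketch of a lower-memory configuration; the exact engine kwargs should be checked against the vLLM version installed in the image, and the FP8 checkpoint name is a placeholder, not a recommendation.

```python
# Sketch only: trades context length for concurrency, assuming these vLLM
# engine kwargs are supported by the installed vLLM version.
from ray.serve.llm import LLMConfig, build_openai_app

low_memory_config = LLMConfig(
    model_loading_config=dict(
        model_id="my-llama-3.1-70b-fp8",
        model_source="some-org/Llama-3.1-70B-Instruct-FP8",  # placeholder FP8 checkpoint
    ),
    accelerator_type="A100-40G",
    deployment_config=dict(autoscaling_config=dict(min_replicas=1, max_replicas=4)),
    engine_kwargs=dict(
        max_model_len=8192,           # shorter context -> smaller KV cache per request
        gpu_memory_utilization=0.90,  # fraction of GPU memory vLLM may use
        tensor_parallel_size=8,
    ),
)

app = build_openai_app({"llm_configs": [low_memory_config]})
```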

**An analogy**:

Deploying a **medium-sized LLM** is like assembling a complex precision machine (the model weights) in a large workshop. The **Builder Pattern** is your fully automated assembly line: you only set the machine's parameters (the config), and the line fastens the parts and hooks up the power for you. **vLLM with tensor parallelism** is like having 8 skilled workers (the GPUs) lift this heavy machine together, each carrying only their share of the load, so the machine runs smoothly.
specs/mvp/v3.8/requirements.md (new file, 8 lines)
@ -0,0 +1,8 @@
1. Dynamically launch LLMs via Ray Serve (vLLM backend), supporting multi-model application deployment.
2. By default each model has a single replica; users can configure more.
3. Users can delete (undeploy) models.
4. Users can specify how many GPUs a model uses.
5. Configuration is done through the WebUI: view the list of currently deployed models and inspect their details.
6. The model path may be a common path or a user-specified user path.
7.
specs/mvp/v3.8/v3.8_api.md (new file, 224 lines)
@ -0,0 +1,224 @@
# MVP v3.8 API Reference (Serving)

> Note: this section covers the **Model Serving** APIs added in v3.8 (Ray Serve LLM / vLLM).
> Auth: the Serving management APIs reuse the existing MVP API authentication (`Authorization: Bearer <user_token>`).
> Inference: the external OpenAI endpoint is **unauthenticated** (a v3.8 convention).

## 0. Basics

### 0.1 Base URLs

- MVP API server: `http://<host>:8080`
- Ray Serve OpenAI ingress (fixed port 8000): `http://<host>:8000/v1`

### 0.2 Authentication

All `/api/v2/serve/*` endpoints require:

```
Authorization: Bearer <user_token>
```

The `user_token` is issued by an administrator via `/api/v2/users/<user_id>/tokens` (the existing mechanism).

### 0.3 Naming rule: `model_id = user_id-YYYYMMDDHHMM-<suffix>`

- On submission the user fills in `model_id` (semantically the suffix, e.g. `qwen-0.5b`)
- The platform generates the prefix:
  - `prefix = "<user_id>-<YYYYMMDDHHMM>"`
- The OpenAI model name actually exposed by the platform is:
  - `model_id = "<prefix>-<suffix>"`
- Example: `alice-202601061235-qwen-0.5b`

## 1. Data Structures

### 1.1 ServingSpec (YAML)

The request body should preferably be YAML (consistent with TaskSpec). Example:

```yaml
model_id: qwen-0.5b                      # required: suffix (the platform prepends the user_id- prefix)
model_source: $HOME/common/hf/.../<sha>  # required: local path or repo id; the platform applies $HOME macro replacement and path validation
num_replicas: 1                          # optional, default 1
gpus_per_replica: 1                      # optional, default 1
# engine_kwargs:                         # optional: vLLM parameter passthrough (allow/deny list decided at implementation time)
#   max_model_len: 8192
#   gpu_memory_utilization: 0.9
```

Notes:
- `accelerator_type` is not exposed in ServingSpec; the platform config (`serving.llm.accelerator_type` in `dev.yaml`) injects it uniformly into Ray Serve LLM's `LLMConfig.accelerator_type` (dev/h1: `H20`).

#### Macro replacement

- `$HOME` → `/private/users/<user_id>`
- `$HOME/common/hf` → `/private/hf`
- `$HOME/common/datasets` → `/private/datasets` (not strictly needed by serving, kept for consistent semantics)

#### Path validation (v3.8 convention)

`model_source` allows:

- `/private/hf/...` (common)
- `/private/users/<user_id>/...` (user)

Rejects:

- other users' directories
- paths outside `/private`
- empty paths or suspicious paths containing `..`

### 1.2 ServingModel (response body, JSON)

```json
{
  "model_key": "svc-alice-20260106-123000-abcd",
  "user_id": "alice",
  "model_id": "alice-202601061235-qwen-0.5b",
  "model_id_suffix": "qwen-0.5b",
  "model_id_prefix": "alice-202601061235",
  "model_source": "/private/hf/hub/models--.../snapshots/<sha>",
  "num_replicas": 1,
  "gpus_per_replica": 1,
  "total_gpus": 1,
  "state": "RUNNING",
  "endpoint": {
    "openai_base_url": "http://<host>:8000/v1",
    "model": "alice-202601061235-qwen-0.5b"
  },
  "error_summary": null,
  "created_at": "2026-01-06T12:30:00Z",
  "updated_at": "2026-01-06T12:31:02Z"
}
```

## 2. Management APIs (MVP API server)

### 2.1 Create / Upsert model

`POST /api/v2/serve/models`

#### Request

- Header: `Content-Type: application/yaml`
- Body: ServingSpec (YAML)

#### Response (202)

```json
{
  "model_key": "svc-alice-20260106-123000-abcd",
  "state": "QUEUED"
}
```

Semantics:
- Creates a new model (if the suffix does not exist)
- Or updates an existing model (if the same user already has the same suffix): the replicas/GPU settings are updated and the model enters `QUEUED`, waiting for the reconciler to apply it

### 2.2 List models (current user)

`GET /api/v2/serve/models`

#### Response (200)

```json
{
  "items": [ ... ServingModel ... ],
  "openai_base_url": "http://<host>:8000/v1"
}
```

### 2.3 Get model detail

`GET /api/v2/serve/models/{model_key}`

#### Response (200)

```json
{
  "model": { ... ServingModel ... },
  "resolved_spec_yaml": "model_id: ...\nmodel_source: ...\n",
  "events": [
    { "event_type": "DEPLOY_REQUESTED", "created_at": "...", "payload": {...} }
  ],
  "serve_status": {
    "app_name": "argus_llm_app",
    "app_status": "RUNNING"
  }
}
```

### 2.4 Scale replicas (PATCH)

`PATCH /api/v2/serve/models/{model_key}`

#### Request (JSON)

```json
{ "num_replicas": 2 }
```

#### Response (200)

```json
{ "model_key": "...", "state": "QUEUED" }
```

> v3.8 only supports changing `num_replicas` (and optionally engine_kwargs); changing `gpus_per_replica` may trigger a redeployment.

### 2.5 Delete / Undeploy model

`DELETE /api/v2/serve/models/{model_key}`

#### Response (200)

```json
{ "model_key": "...", "state": "DELETING" }
```

Semantics: removes the model from the declarative configuration; on the next tick the reconciler triggers `serve.run(...)` to update the app configuration so the model eventually disappears.

### 2.6 Admin: Serve cluster status (optional)

`GET /api/v2/serve/status`

#### Response (200)

Returns a summary of `serve.status()` (cluster level + app level).

> Accessible with an admin token only (reusing the v3.x admin gate).

## 3. Inference APIs (Ray Serve OpenAI ingress)

> No authentication in v3.8: no `Authorization` header is required.

### 3.1 List models

`GET http://<host>:8000/v1/models`

Returns the list of available models (with prefixed names such as `alice-202601061235-qwen-0.5b`).

### 3.2 Chat completions

`POST http://<host>:8000/v1/chat/completions`

```json
{
  "model": "alice-202601061235-qwen-0.5b",
  "messages": [{"role":"user","content":"Hello"}],
  "stream": false
}
```
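
For a quick smoke test from Python, the same request can be issued with the standard OpenAI client. This is a minimal sketch, assuming a RUNNING model named as in the example above; `api_key` is an arbitrary placeholder because the v3.8 ingress does not authenticate.

```python
from openai import OpenAI

# No auth in v3.8; the client still requires some api_key string.
client = OpenAI(base_url="http://<host>:8000/v1", api_key="unused")

resp = client.chat.completions.create(
    model="alice-202601061235-qwen-0.5b",
    messages=[{"role": "user", "content": "Hello"}],
    stream=False,
)
print(resp.choices[0].message.content)
```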

### 3.3 Completions / Embeddings

Provided to the extent supported by the Ray Serve LLM OpenAI ingress (v3.8 acceptance covers at least chat).

## 4. Error Code Conventions (MVP API server)

- `400 invalid yaml/spec`: YAML parsing failure, missing fields, or illegal values
- `403 forbidden`: path violation (model_source accesses another user's directory)
- `409 conflict`: model_id_suffix conflict (when the same user re-creates a suffix and overwriting is not allowed; not returned if upsert semantics are chosen)
- `422 unprocessable`: illegal resource parameters (replica/gpu <= 0)
- `500 internal`: reconciler/serve call failure (details recorded in `serve_events` and written to `error_summary`)
specs/mvp/v3.8/v3.8_design.md (new file, 371 lines)
@ -0,0 +1,371 @@
# MVP v3.8 Detailed Design: Dynamic Deployment and Management of Models with Ray Serve (vLLM)

> Baseline: v3.7 capabilities are already in place (training platform + W&B + SFTPGo + WebUI/API + Ray stateless pool; the training side defaults to rollout=vllm).
> v3.8 goal: on the same Ray cluster, introduce **Ray Serve LLM (vLLM backend)** model inference serving and manage the model lifecycle dynamically through the WebUI/API.

## 0. Scope (from requirements.md)

1) Dynamically launch LLMs via Ray Serve (vLLM backend), supporting **multi-model application** deployment
2) One replica per model by default; users may configure more
3) Users can delete (undeploy) models
4) Users can specify how many GPUs a model uses
5) The WebUI supports configuration, listing deployed models, and viewing details
6) The model path may be a common path or a user path (local paths)

## 1. Overall Architecture

### 1.1 Component relationships

v3.8 adds a **Serving subsystem** on top of the existing training platform:

- **API server (existing)**
  - New Serving APIs (model deploy/delete/scale/status)
  - New Serving background thread (reconciler): periodically aligns the DB with the actual Ray Serve state
- **SQLite (existing)**
  - New tables such as `serve_models` and `serve_events`, storing declarative configuration and state
- **Ray cluster (existing stateless pool)**
  - Reuses the existing head/worker containers
  - Starts Ray Serve inside the cluster (controller + proxy + deployments)
- **Ray Serve LLM (new)**
  - Builds an OpenAI-compatible app via `ray.serve.llm.build_openai_app`
  - The app contains multiple `LLMConfig`s (one per model)

### 1.2 Why a single multi-model application

Ray Serve supports multi-app deployments, but managing route_prefixes across multiple apps is more complex in the dev/docker setting, and the requirements explicitly ask for multi-model application deployment. v3.8 therefore uses:

- One fixed app: `argus_llm_app` (name configurable)
- route_prefix fixed to `/` (exposing the `/v1/...` OpenAI endpoints)
- One `LLMConfig` per model, distinguished by `model_id` (the `model` field of the OpenAI API)

This is the most intuitive setup for users:

- A fixed base_url: `http://<host>:8000/v1`
- `model=` selects between models (`/v1/models` lists them automatically); a sketch of a two-model app follows this list
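
For illustration, a minimal sketch of the app the platform ultimately builds, assuming two resolved ServingSpecs; the model names, snapshot paths, and GPU counts here are illustrative, not actual deployments.

```python
# Sketch: one OpenAI-compatible app serving two models, assuming ray[serve,llm]
# is available in the image. Names and paths are illustrative.
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

llm_configs = [
    LLMConfig(
        model_loading_config=dict(
            model_id="alice-202601061235-qwen-0.5b",
            model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/abc123",
        ),
        accelerator_type="H20",
        deployment_config=dict(num_replicas=1),
        engine_kwargs=dict(tensor_parallel_size=1),
    ),
    LLMConfig(
        model_loading_config=dict(
            model_id="bob-202601061300-llama-8b",
            model_source="/private/users/bob/models/llama-3.1-8b",
        ),
        accelerator_type="H20",
        deployment_config=dict(num_replicas=1),
        engine_kwargs=dict(tensor_parallel_size=1),
    ),
]

app = build_openai_app({"llm_configs": llm_configs})
serve.run(app, name="argus_llm_app", route_prefix="/")
```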

## 2. Ray Serve Deployment Strategy (dev/h1 constraints)

### 2.1 HTTP ingress port and docker compose

Ray Serve's default HTTP port is `8000`. v3.8 conventions:

- Map `8000:8000` on the **head container**
- The API server stays on `8080`
- The Ray Dashboard stays on `8265`

Rationale: in a single-host, multi-container docker environment, running a proxy on every node would have multiple containers trying to bind the same host port, which is not feasible. v3.8 therefore recommends:

- Set the Serve proxy location to **HeadOnly** (HTTP ingress only on the head)
- GPU replicas still run on workers (the proxy only forwards; it does not run inference)

> Notes:
> - Serve's HTTP configuration (host/port/proxy_location) is a **cluster-global Ray setting** that cannot be changed after startup, so it should be set once at platform startup and persisted.
> - The proxy actor needs CPU; the head node's `num-cpus=0` policy may need a small adjustment in v3.8 (e.g. reserve a few CPUs on the head), while `entrypoint_resources` still keeps training drivers off the head.

#### 2.1.1 Expected compose changes (landed during v3.8 implementation)

- `src/mvp/docker-compose.yaml` (ray_head) adds:
  - `ports: - "8000:8000"`

> Worker containers do not expose 8000 (avoiding host port conflicts); the head proxy is the single external ingress.

### 2.2 Startup/configuration (Python SDK first)

v3.8 uses the Ray Serve Python SDK:

- `ray.init(address="auto")`
- `serve.start(proxy_location="HeadOnly", http_options={"host":"0.0.0.0","port":8000})` (one-time global configuration)
- `serve.run(app, name=<app_name>, route_prefix="/")`
- `serve.delete(name=<app_name>)` (when necessary)
- `serve.status()` to query cluster/application status (the calls are shown together in the sketch below)
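
A minimal sketch of this call sequence from inside the head container, under the conventions above; the commented-out `serve.run` call takes the multi-model app from section 1.2 and is kept commented so the snippet stands alone.

```python
# Sketch: one-time Serve bootstrap plus an app update, run from the head container.
import ray
from ray import serve

ray.init(address="auto")  # attach to the existing cluster

# One-time, cluster-global HTTP configuration (cannot be changed later).
serve.start(proxy_location="HeadOnly", http_options={"host": "0.0.0.0", "port": 8000})

# `app` is the multi-model app built via build_openai_app (see section 1.2).
# serve.run(app, name="argus_llm_app", route_prefix="/")

status = serve.status()          # cluster + per-application status summary
# serve.delete("argus_llm_app")  # remove the whole app when necessary
```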

Rationale:

- Avoids introducing an extra REST client dependency inside the platform (and reduces the risk of cross-version REST schema instability)
- The API server itself runs inside the head container, so it can connect to the existing cluster directly via `ray.init(address="auto")`

> Alternative: the Ray Dashboard exposes the Serve REST API (`PUT /api/serve/applications/` etc.) as a fallback, but v3.8 does not make it the primary path.

### 2.3 Dependencies and image assumptions

v3.8 depends on:

- `ray[serve]` (Serve controller/proxy)
- `ray[llm]` (the `ray.serve.llm` module of Ray Serve LLM)
- vLLM (the inference engine)

Since v3.7 already switched to `verlai/verl:vllm011.latest`, the image is expected to contain vLLM; whether `ray.serve.llm` works out of the box must be confirmed during implementation.
If it is missing, v3.8 will add `pip install "ray[serve,llm]"` (or the minimal dependencies recommended upstream) during the `argus-ray-node` image build and pin the version.

### 2.4 Serving configuration (dev.yaml)

v3.8 adds a serving configuration block containing at least:

```yaml
serving:
  serve:
    http_port: 8000            # fixed at 8000
    proxy_location: HeadOnly   # recommended for dev/docker
  llm:
    accelerator_type: H20      # H20 in the dev environment (maps to ray.serve.llm.LLMConfig.accelerator_type)
```

Notes:
- `accelerator_type` is Ray Serve LLM's `LLMConfig.accelerator_type` field, expressing which accelerator class the model runs on. In the dev/h1 environment we fix it to `H20`.
- v3.8 does not expose `accelerator_type` to regular users (to avoid misconfiguration); it is decided uniformly by the deployment environment configuration.

## 3. Model Configuration and Resource Mapping

### 3.1 Key configuration object: `ray.serve.llm.LLMConfig`

Each model deployment is described by one `LLMConfig`. Key fields (the subset used by v3.8):

- `model_loading_config`
  - `model_id`: the model name shown externally and used in requests (unique key)
  - `model_source`: HF repo id / S3 / **local path**
- `accelerator_type`
  - Read from `serving.llm.accelerator_type` in `dev.yaml` (dev/h1: `H20`)
- `deployment_config`
  - `num_replicas` or `autoscaling_config` (v3.8 starts with a fixed `num_replicas`)
  - `ray_actor_options` (CPU/resource constraints)
- `engine_kwargs`
  - vLLM parameters (`max_model_len`, `gpu_memory_utilization`, etc.)
- `placement_group_config`
  - Controls the resource bundles used by the vLLM engine workers (for multi-GPU / multi-node)
- `runtime_env`
  - Injects environment variables such as the HF cache and offline switches

### 3.2 How the GPU count (gpus_per_replica) maps onto LLMConfig

v3.8 takes the user input:

- `gpus_per_replica = N`

and maps it to:

- `engine_kwargs.tensor_parallel_size = N` (single-node/multi-node tensor parallelism, following the official Ray Serve LLM examples)
- `placement_group_config = {"bundles": [{"GPU": 1, "CPU": <cpu_per_gpu>}] * N, "strategy": "PACK"}`

Other vLLM parameters (`max_model_len`, `gpu_memory_utilization`, etc.) stay in `engine_kwargs`. A sketch of this mapping follows.
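
A hedged sketch of this mapping as a pure dict builder, in the spirit of the `build_llm_config` function in the dev plan; `cpu_per_gpu` and the runtime_env defaults are assumptions, and note that the progress log later replaces `placement_group_config` with `resources_per_bundle` for Ray 2.49.2.

```python
# Sketch: map a resolved ServingSpec onto an LLMConfig-shaped dict.
# Field names follow this section; cpu_per_gpu and env defaults are assumptions.
from typing import Any


def build_llm_config_dict(
    *,
    model_id: str,
    model_source: str,
    num_replicas: int,
    gpus_per_replica: int,
    accelerator_type: str = "H20",
    cpu_per_gpu: int = 2,
    engine_kwargs: dict[str, Any] | None = None,
) -> dict[str, Any]:
    kwargs = {"tensor_parallel_size": gpus_per_replica, **(engine_kwargs or {})}
    return {
        "model_loading_config": {"model_id": model_id, "model_source": model_source},
        "accelerator_type": accelerator_type,
        "deployment_config": {"num_replicas": num_replicas},
        "engine_kwargs": kwargs,
        "placement_group_config": {
            "bundles": [{"GPU": 1, "CPU": cpu_per_gpu}] * gpus_per_replica,
            "strategy": "PACK",
        },
        "runtime_env": {"env_vars": {"HF_HOME": "/private/hf", "HF_HUB_OFFLINE": "1"}},
    }
```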

> Compatibility note: Ray Serve LLM is still evolving quickly; v3.8 will do minimal adaptation and regression testing against the Ray version actually deployed.

### 3.2.1 Multi-node case (N > GPUs on a single node)

Ray Serve LLM uses the `PACK` strategy by default, preferring to place GPU workers on as few nodes as possible; if a single node cannot hold them, they spill to other nodes automatically, which supports cross-node tensor-parallel (TP) deployment.

### 3.3 Replica count (num_replicas)

v3.8 default:

- `num_replicas = 1`

Users may set it to `>= 1` in the UI.
Multiple replicas consume GPUs linearly (`num_replicas * gpus_per_replica`), so a resource pre-check is required.

### 3.4 Model paths and macro replacement (common / user)

v3.8 supports two kinds of model sources:

1) **common**
   - Typically `/private/hf/...` (shared HF cache / snapshots)

2) **user**
   - `/private/users/<user_id>/models/...`
   - As well as user training outputs (e.g. `jobs/<sid>/checkpoints/.../huggingface`)

To keep the UI simple, the platform's existing macro semantics are reused:

- `$HOME` → `/private/users/<user_id>`
- `$HOME/common/hf` → `/private/hf`

Path validation is applied:

- Allowed prefixes: `/private/hf`, `/private/users/<user_id>/`
- Rejected: access to other users' directories, or to sensitive system paths

### 3.5 Offline mode (avoiding HF mirror 429s)

The training side already confirmed in v3.7 that `HF_HUB_OFFLINE=1` is necessary. The v3.8 serving side injects the same defaults:

- `HF_HOME=/private/hf`
- `HUGGINGFACE_HUB_CACHE=/private/hf/hub`
- `TRANSFORMERS_CACHE=/private/hf/transformers`
- `HF_HUB_OFFLINE=1`
- `HF_ENDPOINT=https://hf-mirror.com` (may be kept, but offline mode should not hit the network)

Users are advised to put a **local path** in `model_source` rather than a bare repo id.

## 4. Platform Data Model (SQLite)

Two new primary tables:

### 4.1 `serve_models`

Each row represents one declarative model deployment:

- `model_key` (platform-internal unique ID, convenient for renaming/dedup)
- `user_id`
- `model_id` (the external OpenAI model name, unique per app)
- `model_source` (local path or repo id; the resolved value is stored)
- `num_replicas`
- `gpus_per_replica`
- `engine_kwargs_json` (optional)
- `state`: `QUEUED | DEPLOYING | RUNNING | FAILED | DELETING | DELETED`
- `serve_app_name` (default `argus_llm_app`)
- `created_at / updated_at`
- `error_summary`

### 4.2 `serve_events`

Records key events and troubleshooting information (similar to task_events):

- `id`
- `model_key`
- `event_type` (DEPLOY_REQUESTED/DEPLOY_APPLIED/STATUS_SYNC/DELETE_REQUESTED/...)
- `payload_json`
- `created_at`

## 5. API Design (new)

Serving APIs are added under the existing `Authorization: Bearer <user_token>` auth scheme (paths are indicative; the implementation aligns them with the existing `api/v2`).

### 5.1 User endpoints

- `POST /api/v2/serve/models`
  - body: YAML or JSON (v3.8 starts with YAML, consistent with the existing TaskSpec)
  - Creates/updates (upserts) a model configuration, which enters `QUEUED`
- `GET /api/v2/serve/models`
  - Lists the current user's models (with state, resources, endpoint)
- `GET /api/v2/serve/models/{model_key}`
  - Detail: full spec + recent events + Serve status summary
- `PATCH /api/v2/serve/models/{model_key}`
  - Changes `num_replicas`, or engine_kwargs (optional)
- `DELETE /api/v2/serve/models/{model_key}`
  - Undeploys the model (enters `DELETING`)

### 5.2 System endpoints (admin)

- `GET /api/v2/serve/status` (admin)
  - Returns a summary of `serve.status()` (cluster / app level)

### 5.3 External inference endpoint

Fixed values surfaced in the UI/API:

- `openai_base_url = http://<host>:8000/v1`
- Supported:
  - `/v1/chat/completions`
  - `/v1/completions`
  - `/v1/embeddings`
  - `/v1/models`

> v3.8 adds no extra gateway or auth (consistent with the current dev environment); if needed later, token validation / a reverse proxy can be introduced in v3.9+.

### 5.4 `model_id` prefix policy (user_id-)

To avoid multi-user conflicts while staying readable:

v3.8 uses **user_id + date-hour-minute** as a stable prefix, reducing conflicts and making the creation time easy to spot:

- The user only fills in `model_id_suffix` in the UI/API (or keeps the field name `model_id`, with suffix semantics)
- The platform computes the actual external `model_id`:
  - `prefix = f"{user_id}-{YYYYMMDDHHMM}"`
  - `model_id = f"{prefix}-{model_id_suffix}"`
- The list/detail views show all of:
  - `model_id_suffix` (user input)
  - `model_id_prefix` (platform generated, e.g. `alice-202601061235`)
  - `model_id` (the external OpenAI name)

## 6. Background Execution Model (Serving Reconciler)

Following the pattern of the task scheduler, v3.8 introduces a lightweight reconciler:

- A tick interval (e.g. 5s)
- On each tick:
  1) Load the models in `QUEUED/DEPLOYING/RUNNING/DELETING` from the DB
  2) Call `serve.status()` to read the current app and deployment statuses
  3) If there are `QUEUED` models or pending changes: build a new multi-model app (containing all `RUNNING/DEPLOYING/QUEUED` model configs) and `serve.run(...)`
  4) If there are `DELETING` models: remove them from the app configuration and apply the change with `serve.run(...)`
  5) Update each model's state (based on the Serve status)

Important behavior notes (the cost of a multi-model app):
- Every add/delete/replica change triggers a `serve.run(...)` update of the same app;
- Ray Serve tries to update incrementally, but with some versions/configurations the ingress/router may briefly restart;
- v3.8 accepts this cost for now (closing the requirements loop comes first); if "deleting one model must not affect the others" becomes necessary, the design can evolve to one app per model with its own route_prefix.

Resource pre-check:
- Before applying, use `ray.available_resources()` for a coarse GPU pre-check:
  - Total GPUs needed = `sum(num_replicas * gpus_per_replica)` (more precise if computed only over the added/scaled-up delta)
- If insufficient:
  - The model stays `QUEUED` and a `PENDING_RESOURCES` event is recorded
  - The UI shows "insufficient resources, waiting for release"

> v3.8 does not introduce more complex preemption or priorities. Serving and training compete for GPUs; users must plan resources themselves (or a later version introduces unified scheduling). A sketch of the tick loop follows.
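
A minimal, hedged sketch of one reconciler tick, assuming the `db` and `serve_client` interfaces from the dev plan (M2/M4) and the `build_llm_config_dict` helper sketched in section 3.2; real state handling (reading `serve.status()` before promoting to RUNNING, per-model deltas, etc.) is more nuanced than shown here.

```python
# Sketch of one reconciler tick; db / serve_client are assumed interfaces,
# and ray.init(address="auto") is assumed to have been called at startup.
import ray


def tick(db, serve_client, app_name: str = "argus_llm_app") -> None:
    models = db.list_active_serve_models()  # QUEUED/DEPLOYING/RUNNING/DELETING

    active = [m for m in models if m["state"] != "DELETING"]
    needed_gpus = sum(m["num_replicas"] * m["gpus_per_replica"] for m in active)
    available_gpus = ray.available_resources().get("GPU", 0)

    # Coarse pre-check over the full set (a real version would use the delta only).
    if needed_gpus > available_gpus:
        for m in active:
            if m["state"] == "QUEUED":
                db.append_serve_event(m["model_key"], "PENDING_RESOURCES")
        return  # keep QUEUED models queued; try again next tick

    try:
        llm_configs = [build_llm_config_dict(**m["resolved_spec"]) for m in active]
        serve_client.apply_app(app_name, llm_configs)  # wraps serve.run(...)
    except Exception as exc:  # noqa: BLE001 - sketch-level error handling
        for m in active:
            db.set_serve_model_state(m["model_key"], "FAILED", error_summary=str(exc))
        return

    # Simplification: promotion to RUNNING would normally follow serve.status().
    for m in models:
        new_state = "DELETED" if m["state"] == "DELETING" else "DEPLOYING"
        db.set_serve_model_state(m["model_key"], new_state)
```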

## 7. WebUI Design (new Serving pages)

A new sidebar entry: **Serving**

### 7.1 Serving list page

- Displayed fields:
  - model_id
  - user_id (admin only)
  - replicas / gpus_per_replica / total_gpus
  - state (RUNNING/DEPLOYING/QUEUED/FAILED)
- Actions: Scale (change replicas), Delete

### 7.2 Serving create/edit page

Two modes (similar to New Task; implementing YAML mode first is enough):

Example YAML (v3.8):

```yaml
model_id: qwen-0.5b
model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<sha>
num_replicas: 1
gpus_per_replica: 1
# engine_kwargs:
#   max_model_len: 8192
#   gpu_memory_utilization: 0.9
```

### 7.3 Serving detail page

- Full configuration (resolved spec)
- Serve status summary (deployment states, replica health)
- OpenAI call example (python openai client)

## 8. Acceptance Criteria (v3.8)

1) Deployment:
   - Deploying one model (1 replica, 1 GPU) succeeds in one step and the state becomes RUNNING
   - `/v1/models` lists the model

2) Scaling:
   - Changing `num_replicas` takes effect (the replica count change is visible in the Serve status)

3) Multi-model:
   - Two models (different model_ids) can be deployed in the same app at the same time
   - Requests through the OpenAI API with different `model=` values get responses

4) Undeploy:
   - After a model is deleted it no longer appears in `/v1/models`

5) Model paths:
   - Both `/private/hf/...` (common) and `/private/users/<user>/...` (user) local paths are supported

6) Resource shortage is explainable:
   - When GPUs are insufficient, the model enters `QUEUED` and the UI/detail view shows an "insufficient resources" hint

## 9. Open Points (to confirm during review)

Confirmed (from review):

1) The inference port is fixed at `8000` (the Ray Serve default).
2) The external OpenAI endpoint is **not tied to the existing token system** (no inference-side auth in v3.8).
3) `model_id` naming rule: the platform uniformly adds a `user_id + date-hour-minute` prefix; the user only fills in the suffix in the UI.

> Note: this avoids cross-user model_id conflicts while keeping the OpenAI `model=` field naturally readable.
specs/mvp/v3.8/v3.8_dev_plan.md (new file, 266 lines)
@ -0,0 +1,266 @@
# MVP v3.8 Development Plan (TDD, detailed)

> Goal: on top of v3.7, introduce dynamic model deployment and management with Ray Serve (vLLM) (multi-model, single app), with a WebUI + API management loop.
> Constraints (confirmed):
> - Inference port fixed at `8000` (Serve HTTP).
> - The inference side does not use the existing token auth (the external OpenAI endpoint is unauthenticated).
> - External `model_id`s get a uniform prefix: `<user_id>-<YYYYMMDDHHMM>-<suffix>` (users only fill in the suffix).
> - `LLMConfig.accelerator_type` is read from `dev.yaml` (dev/h1: `H20`).

The plan follows a "tests first → implement → regress" rhythm, broken down to a verifiable granularity; each milestone can be accepted independently.

---

## M0 - Baseline and dependency probing (no behavior changes)

**Purpose**: confirm the v3.7 baseline is stable and determine whether the Ray Serve LLM dependencies are already available (otherwise later work gets stuck on images/dependencies).

### M0.1 Local regression
- [ ] `.venv/bin/python -m pytest` passes (coverage ≥ 90%)

### M0.2 Remote regression (h1)
- [ ] `src/mvp/scripts/run_all_v30_api.sh` runs end to end (confirming the training loop has not regressed)

### M0.3 Dependency probing inside the head container (record the findings)
- [ ] `python3 -c "import ray; import ray.serve; print(ray.__version__)"`
- [ ] `python3 -c "from ray.serve.llm import LLMConfig, build_openai_app; print('serve_llm_ok')"`
- [ ] If it fails (e.g. missing `gymnasium`): record the missing pieces and resolve them in M6 by adding `ray[llm]`

### M0.4 Config probing
- [ ] `configs/dev.yaml` contains:
  - `serving.llm.accelerator_type: H20`
  - `serving.serve.http_port: 8000`
  - `serving.serve.proxy_location: HeadOnly`

**Acceptance**:
- No baseline regression; the dependency probe reaches a clear conclusion (usable / not usable)

---

## M1 - ServingSpec (parsing / validation / macro replacement / path validation) (unit-test driven)

**Purpose**: lock down the input layer first (shared by API/UI) to avoid repeated schema changes later.

### M1.1 New/extended data models
- [ ] `ServingSpec` (input)
  - `model_id` (suffix)
  - `model_source` (supports the `$HOME` macro)
  - `num_replicas` (default=1)
  - `gpus_per_replica` (default=1)
  - `engine_kwargs` (optional dict, stored in the DB as-is for now; allow/deny lists come at implementation time)
- [ ] `ResolvedServingSpec` (internal)
  - `model_id_suffix`
  - `model_id_prefix` (platform generated: `user_id-YYYYMMDDHHMM`)
  - `model_id` (external: `<prefix>-<suffix>`)
  - `model_source` (resolved path)

### M1.2 Rules (written as pure functions for testability; see the sketch after this list)
- [ ] `validate_model_id_suffix(suffix)`: length/character-set restrictions (suggested: `[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}`)
- [ ] `$HOME` macro replacement: `$HOME`, `$HOME/common/hf`, `$HOME/common/datasets`
- [ ] Path validation (local paths enforced):
  - Allowed: `/private/hf/...`, `/private/users/<user_id>/...`
  - Rejected: `..`, empty, other users' paths, paths outside `/private`
- [ ] `make_model_id_prefix(user_id, now_utc)`: `YYYYMMDDHHMM` (UTC) + user_id
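
A hedged sketch of these rules as pure functions, following the signatures in the checklist; the real implementation lives in `serving_spec.py` and may differ in detail.

```python
# Sketch of the M1.2 rules as pure functions; signatures follow the checklist above.
import re
from datetime import datetime

SUFFIX_RE = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}$")


def validate_model_id_suffix(suffix: str) -> str:
    if not SUFFIX_RE.match(suffix or ""):
        raise ValueError(f"invalid model_id suffix: {suffix!r}")
    return suffix


def expand_home_macros(path: str, user_id: str) -> str:
    # Order matters: the more specific macros must be replaced first.
    path = path.replace("$HOME/common/hf", "/private/hf")
    path = path.replace("$HOME/common/datasets", "/private/datasets")
    return path.replace("$HOME", f"/private/users/{user_id}")


def validate_model_source(path: str, user_id: str) -> str:
    if not path or ".." in path:
        raise ValueError("empty or suspicious model_source")
    allowed = ("/private/hf/", f"/private/users/{user_id}/")
    if not path.startswith(allowed):
        raise PermissionError(f"model_source not allowed: {path}")
    return path


def make_model_id_prefix(user_id: str, now_utc: datetime) -> str:
    return f"{user_id}-{now_utc.strftime('%Y%m%d%H%M')}"
```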

### M1.3 Unit tests (write the failing cases first, then implement)
- [ ] `test_serving_spec_validation.py`
  - valid/invalid suffixes
  - replicas/gpus boundaries: 0, negative, fractional, very large (whether to cap is an implementation decision)
- [ ] `test_serving_spec_paths.py`
  - `$HOME` replacement is correct
  - Out-of-scope paths yield 403/ValueError (mapped at the API layer)
  - Both `/private/hf` and `/private/users/<user>` are accepted
- [ ] `test_serving_model_id_prefix.py`
  - Fixed time input → consistent prefix output (avoiding timezone/format issues)

**Acceptance**:
- The input spec rules are stable; core validation/replacement is covered by unit tests

---

## M2 - SQLite schema and Db interface (unit-test driven)

**Purpose**: the declarative serving state must be persisted, auditable, and recoverable.

### M2.1 DB schema
- [ ] `serve_models`
  - Primary key: `model_key` (platform generated)
  - Unique: `(user_id, model_id_suffix)` (enables upsert)
  - Stores: the resolved spec (including prefix/full model_id and the resolved model_source)
  - State: `QUEUED/DEPLOYING/RUNNING/FAILED/DELETING/DELETED`
  - `error_summary`
- [ ] `serve_events` (append-only)

### M2.2 Db methods
- [ ] `upsert_serve_model(user_id, spec_yaml, now)` → (model_key, state)
- [ ] `list_serve_models(user_id, include_deleted=False, limit/offset?)`
- [ ] `get_serve_model(model_key)`
- [ ] `set_serve_model_state(model_key, state, error_summary=None)`
- [ ] `append_serve_event(model_key, event_type, payload_json=None)`
- [ ] `pick_next_runnable_serve_change()` (for the reconciler)

### M2.3 Unit tests
- [ ] `test_db_serving.py`
  - Upsert behavior (updating the same suffix either keeps the model_key or produces a new version; the policy must be fixed before implementation)
  - State transitions + event recording
  - List filtering and ordering (by updated_at)

**Acceptance**:
- DB behavior is predictable; upsert/unique semantics are settled and covered by tests

---

## M3 - Serving management API (FastAPI) (unit-test driven)

**Purpose**: get the management API working first; real Ray Serve wiring comes later (with the reconciler).

### M3.1 API routes (user)
- [ ] `POST /api/v2/serve/models` (Content-Type: application/yaml)
  - Input: ServingSpec YAML
  - Output: `{model_key,state}` (202)
- [ ] `GET /api/v2/serve/models`
  - Returns items + `openai_base_url=http://<host>:8000/v1`
- [ ] `GET /api/v2/serve/models/{model_key}`
  - Returns model + resolved_spec_yaml + events (pagination can come later) + serve_status (empty/placeholder for now)
- [ ] `PATCH /api/v2/serve/models/{model_key}` (JSON)
  - Supports `num_replicas` (minimal loop)
- [ ] `DELETE /api/v2/serve/models/{model_key}`

### M3.2 API routes (admin, optional)
- [ ] `GET /api/v2/serve/status` (admin token only)

### M3.3 Error mapping (must be tested)
- [ ] YAML parse failure: 400
- [ ] Spec validation failure: 422
- [ ] Out-of-scope path: 403
- [ ] Unknown model_key: 404

### M3.4 Unit tests
- [ ] `test_app_serving_api.py`
  - Happy path: create → list → get → patch → delete
  - Multi-user isolation: a user only sees their own models
  - Error code coverage: 400/403/404/422

**Acceptance**:
- Every management endpoint in the API reference (`v3.8_api.md`) returns the expected structure (working even before Serve is wired in)

---

## M4 - ServeClient abstraction + LLMConfig builder (unit-test driven)

**Purpose**: fix how an LLMConfig is built from a ResolvedServingSpec, and isolate the Ray Serve dependency inside a client so it can be mocked.

### M4.1 `ServeClient` interface (mockable)
- [ ] `ensure_started(http_port=8000, proxy_location="HeadOnly")`
- [ ] `apply_app(app_name, llm_configs)` (multi-model)
- [ ] `get_status()` (serve.status summary)

### M4.2 `build_llm_config(resolved_spec, accelerator_type, runtime_env_defaults)` pure function
- [ ] Writes `LLMConfig.accelerator_type` (from dev.yaml: H20)
- [ ] `deployment_config.num_replicas`
- [ ] `engine_kwargs.tensor_parallel_size = gpus_per_replica`
- [ ] `placement_group_config` bundles generated from the GPU count
- [ ] `runtime_env.env_vars` injection (at least the HF cache + `HF_HUB_OFFLINE=1`)

### M4.3 Unit tests
- [ ] `test_llm_config_builder.py`
  - gpus_per_replica=1/2/4 → tensor_parallel_size and bundle count are correct
  - accelerator_type injected correctly
  - runtime_env contains HF_HUB_OFFLINE and the other key env vars

**Acceptance**:
- The mapping from platform spec to Ray Serve LLMConfig is stable and locked by unit tests

---

## M5 - Serving Reconciler (state machine + resource pre-check) (unit-test driven)

**Purpose**: implement declarative alignment DB → Serve while providing explainable QUEUED/FAILED states.

### M5.1 State machine (minimal loop)
- [ ] `QUEUED`: waiting for apply
- [ ] `DEPLOYING`: apply triggered, waiting for Serve running/healthy
- [ ] `RUNNING`: Serve status is running
- [ ] `FAILED`: apply or status failed (error_summary + event written)
- [ ] `DELETING`: waiting for removal from the app
- [ ] `DELETED`: deletion complete (record optionally kept)

### M5.2 Resource pre-check
- [ ] `needed_total_gpus = sum(num_replicas*gpus_per_replica)` (minimal viable pre-check)
- [ ] When `ray.available_resources()["GPU"]` (or a more robust per-node count) is insufficient:
  - Keep `QUEUED`
  - Record a `PENDING_RESOURCES` event

### M5.3 Reconcile strategy (multi-model app)
- [ ] Each tick reads the active models and builds the full set of `llm_configs`
- [ ] Handle deleting: remove the model from the configs, then apply

### M5.4 Unit tests (mock ServeClient + mock ray resources)
- [ ] `test_serving_reconciler.py`
  - New model: apply_app is called; state moves to DEPLOYING
  - Deleted model: the apply_app configs no longer contain it
  - Insufficient GPUs: no apply; state stays QUEUED; event written
  - apply raises: state FAILED; error_summary written

**Acceptance**:
- Reconciler behavior is verifiable in a pure unit-test environment; failures are explainable

---

## M6 - Real integration (h1): Ray Serve startup + inference loop (E2E)

**Purpose**: run the full loop in the dev/h1 environment: deploy a model → it shows up in `/v1/models` → `chat/completions` succeeds → it disappears after deletion.

### M6.1 compose/ports
- [ ] `src/mvp/docker-compose.yaml`: add `8000:8000` to `ray_head`

### M6.2 Image dependencies (if M0 found gaps)
- [ ] Add `ray[serve,llm]` to the `argus-ray-node` image (version aligned with the existing Ray to avoid incompatibilities from upgrading Ray)
  - Prefer adding `ray[llm]` first (it pulls in the `ray.serve.llm` dependency closure, e.g. `gymnasium`), then `ray[serve]` as needed
  - Verification: `python3 -c "from ray.serve.llm import LLMConfig, build_openai_app; print('serve_llm_ok')"`

### M6.3 E2E script (idempotent; a polling sketch follows this list)
- [ ] Add `scripts/run_all_v38_serving.sh`:
  - Start compose (ensuring the Serve port is available)
  - Start the API
  - Create a user + token
  - `POST /api/v2/serve/models` to create a 1-GPU model
  - Poll the model state until RUNNING
  - `curl http://127.0.0.1:8000/v1/models` to verify it contains `<prefix>-<suffix>`
  - `curl http://127.0.0.1:8000/v1/chat/completions` for a minimal inference
  - `DELETE /api/v2/serve/models/{model_key}` to undeploy
  - Poll `/v1/models` again to confirm the model is gone
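
The script itself is bash, but the create-and-poll step can also be sketched in Python against the management API from `v3.8_api.md`; the host, token, and timeout values here are placeholders, and the `requests` package is assumed to be available.

```python
# Sketch: create a serving model and poll until RUNNING via the v3.8 management API.
# Host, token, and timeout values are illustrative; requires the 'requests' package.
import time

import requests

API = "http://127.0.0.1:8080"
HEADERS = {"Authorization": "Bearer <user_token>", "Content-Type": "application/yaml"}

spec = """\
model_id: qwen-0.5b
model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<sha>
num_replicas: 1
gpus_per_replica: 1
"""

resp = requests.post(f"{API}/api/v2/serve/models", data=spec, headers=HEADERS, timeout=30)
resp.raise_for_status()
model_key = resp.json()["model_key"]

state = "QUEUED"
deadline = time.time() + 15 * 60
while time.time() < deadline:
    detail = requests.get(
        f"{API}/api/v2/serve/models/{model_key}",
        headers={"Authorization": HEADERS["Authorization"]},
        timeout=30,
    ).json()
    state = detail["model"]["state"]
    if state in ("RUNNING", "FAILED"):
        break
    time.sleep(10)

print(model_key, state)
```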

**Acceptance**:
- The E2E run is repeatable (at least two consecutive runs without manual cleanup)

---

## M7 - WebUI (Serving pages) (unit-test driven)

**Purpose**: give users a visual model-management page (minimal necessary features).

### M7.1 Pages
- [ ] Add Serving to the sidebar
- [ ] `/ui/serving`: list + status + actions (delete/scale)
- [ ] `/ui/serving/new`: YAML input + submit
- [ ] `/ui/serving/{model_key}`: detail (resolved spec, events, OpenAI call example)

### M7.2 Unit tests
- [ ] `test_ui_serving.py`: routes return 200, the key links exist, and the openai_base_url with port 8000 is shown

**Acceptance**:
- The WebUI covers the main create/list/detail/scale/delete path

---

## M8 - Documentation and acceptance cases (delivery)

**Purpose**: give users/operators a reusable way to run the system and a troubleshooting path.

- [ ] Update `specs/mvp/v3.8/v3.8_progress.md` (recorded per milestone)
- [ ] Extend the README (optional): port notes, a warning that the inference API is unauthenticated, model path conventions
- [ ] Acceptance checklist:
  - Unit tests pass
  - The h1 E2E passes
  - The main UI path is operable
specs/mvp/v3.8/v3.8_progress.md (new file, 48 lines)
@ -0,0 +1,48 @@
# MVP v3.8 Progress Log

## 2026-01-06

- Completed the v3.8 design doc: `specs/mvp/v3.8/v3.8_design.md`
- Completed the v3.8 Serving API reference: `specs/mvp/v3.8/v3.8_api.md`
- Completed the v3.8 TDD development plan: `specs/mvp/v3.8/v3.8_dev_plan.md`
- Completed M0: added the `serving` config to `configs/dev.yaml` (http_port=8000, proxy_location=HeadOnly, accelerator_type=H20)
- Completed M1: ServingSpec parsing / macro replacement / path validation + unit tests (`src/mvp/py/argus/service/serving_spec.py`)
- Completed M2: new SQLite tables `serve_models`/`serve_events` + Db API + unit tests (`src/mvp/py/argus/service/db.py`)
- Completed M3: FastAPI Serving management API + unit tests (`src/mvp/py/argus/service/app.py`)
- Completed M4: ServeClient abstraction + LLMConfig builder (dict form) + unit tests (`src/mvp/py/argus/service/serve_client.py`, `src/mvp/py/argus/service/serve_llm_config.py`)
- Completed M5: Serving reconciler (state machine + resource pre-check + mocked unit tests) (`src/mvp/py/argus/service/serving_reconciler.py`)

### M6 (real integration on h1)

- Added the missing dependencies to the `argus-ray-node` image: `ray[serve,llm]` + `gymnasium` + `dm-tree` (avoiding `ray.serve.llm` import failures)
- Fixed Ray 2.49.2 compatibility issues:
  - `LLMConfig` does not support `placement_group_config`; switched to `resources_per_bundle` (`src/mvp/py/argus/service/serve_llm_config.py`)
- Remote E2E:
  - `scripts/run_all_v38_serving.sh` runs end to end: create → RUNNING → `/v1/models` → `chat/completions` → delete → DELETED
  - Fixed a bash heredoc quoting bug in the script's `/v1/models` parsing (`src/mvp/scripts/run_all_v38_serving.sh`)

### M7 (WebUI - Serving)

- Added Serving pages to the WebUI:
  - List: `/ui/serving`
  - Create: `/ui/serving/new`
  - Detail/events/scale/delete: `/ui/serving/{model_key}`
- Unit test coverage:
  - `src/mvp/py/tests/test_ui_serving.py`

### M8 (docs/acceptance)

- `src/mvp/README.md` now documents the v3.8 serving port and the E2E script

### Environment probe (h1 / head container)

> Purpose: confirm whether the Ray Serve LLM dependencies work out of the box, to avoid surprises only at the integration stage.

- `ray`: available, version `2.49.2`
- `ray.serve`: importable (basic Serve works)
- `ray.serve.llm`: currently not importable
  - Error: `ModuleNotFoundError: No module named 'gymnasium'`
  - Cause: the `ray.serve.llm` import chain pulls in `ray.rllib`, and rllib depends on `gymnasium`

Conclusion:
- During implementation, v3.8 must add `ray[llm]` (recommended) or at least the required dependencies such as `gymnasium` to the `argus-ray-node` image so that `from ray.serve.llm import ...` works.
@ -24,3 +24,9 @@ v3.0 access points (dev/h1):
- SFTPGo:
  - SFTP: `127.0.0.1:2022`
  - Admin API/UI: `http://127.0.0.1:8081` (8080 inside the container, mapped to host 8081 to avoid conflicting with the API server)

v3.8 (Ray Serve LLM / vLLM model serving):
- Inference port: `8000` (Ray Serve HTTP)
- OpenAI-compatible endpoint: `http://127.0.0.1:8000/v1`
- Note: the v3.8 inference endpoint is **unauthenticated**
- E2E script: `scripts/run_all_v38_serving.sh`
@ -69,3 +69,11 @@ data:
|
||||
jobs_trash_after_days: 3
|
||||
jobs_purge_after_days: 7
|
||||
janitor_interval_s: 3600
|
||||
|
||||
# v3.8: model serving via Ray Serve LLM (vLLM backend)
|
||||
serving:
|
||||
serve:
|
||||
http_port: 8000
|
||||
proxy_location: HeadOnly
|
||||
llm:
|
||||
accelerator_type: H20
|
||||
|
||||
@ -1,10 +1,16 @@
|
||||
services:
|
||||
ray_head:
|
||||
image: argus/argus-ray-node:vllm011.latest
|
||||
build:
|
||||
context: .
|
||||
dockerfile: images/argus-ray-node/Dockerfile
|
||||
args:
|
||||
BASE_IMAGE: verlai/verl:vllm011.latest
|
||||
container_name: argus-ray-head
|
||||
ports:
|
||||
- "8265:8265"
|
||||
- "8080:8080"
|
||||
- "8000:8000"
|
||||
volumes:
|
||||
# NOTE: this compose file is intended for the dev env layout like:
|
||||
# /home2/argus/infra/mvp/{shared,verl,src/mvp}
|
||||
@ -92,6 +98,11 @@ services:
|
||||
|
||||
ray_worker_0:
|
||||
image: argus/argus-ray-node:vllm011.latest
|
||||
build:
|
||||
context: .
|
||||
dockerfile: images/argus-ray-node/Dockerfile
|
||||
args:
|
||||
BASE_IMAGE: verlai/verl:vllm011.latest
|
||||
container_name: argus-ray-worker-0
|
||||
volumes:
|
||||
- ../../verl:/workspace/verl
|
||||
@ -124,6 +135,11 @@ services:
|
||||
|
||||
ray_worker_1:
|
||||
image: argus/argus-ray-node:vllm011.latest
|
||||
build:
|
||||
context: .
|
||||
dockerfile: images/argus-ray-node/Dockerfile
|
||||
args:
|
||||
BASE_IMAGE: verlai/verl:vllm011.latest
|
||||
container_name: argus-ray-worker-1
|
||||
volumes:
|
||||
- ../../verl:/workspace/verl
|
||||
|
||||
@ -6,6 +6,15 @@ SHELL ["/bin/bash", "-lc"]
|
||||
# Install supervisord (prefer pip to avoid relying on distro package manager).
|
||||
RUN python3 -m pip install --no-cache-dir supervisor
|
||||
|
||||
# v3.8: Ray Serve LLM deps (keep Ray version pinned to what's already in the base image).
|
||||
# NOTE: base image already includes Ray; we only add extras.
|
||||
RUN RAY_VER="$(python3 -c 'import ray; print(ray.__version__)')" && \
|
||||
python3 -m pip install --no-cache-dir "ray[serve,llm]==${RAY_VER}"
|
||||
# Ray Serve LLM's import chain currently pulls in ray.rllib which requires extra deps.
|
||||
# Install them explicitly to make `from ray.serve.llm import ...` work reliably.
|
||||
RUN python3 -m pip install --no-cache-dir gymnasium dm-tree && \
|
||||
python3 -c "from ray.serve.llm import LLMConfig, build_openai_app; print('ray_serve_llm_ok')"
|
||||
|
||||
RUN mkdir -p /opt/argus/py/argus/ray
|
||||
|
||||
# Minimal embedded code for stateless pool (API code is intentionally excluded).
|
||||
|
||||
@ -16,9 +16,8 @@ exec ray start \
|
||||
--port="${ray_port}" \
|
||||
--dashboard-host=0.0.0.0 \
|
||||
--dashboard-port="${dashboard_port}" \
|
||||
--num-cpus=0 \
|
||||
--num-cpus="${ARGUS_HEAD_NUM_CPUS:-1}" \
|
||||
--num-gpus=0 \
|
||||
--disable-usage-stats \
|
||||
--block \
|
||||
${ARGUS_RAY_EXTRA_ARGS:-}
|
||||
|
||||
|
||||
@ -26,3 +26,19 @@ def new_task_id(workload: str, *, user_id: str | None = None) -> str:
|
||||
|
||||
def attempt_submission_id(task_id: str, attempt_no: int) -> str:
|
||||
return f"{task_id}--a{attempt_no:02d}"
|
||||
|
||||
|
||||
def new_model_key(*, user_id: str) -> str:
|
||||
"""
|
||||
Internal identifier for a serving model record.
|
||||
|
||||
Note:
|
||||
- model_id is the OpenAI-facing name (user_id + timestamp prefix + suffix).
|
||||
- model_key is used for stable DB identity and API resource path.
|
||||
"""
|
||||
ts = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
suffix = secrets.token_hex(2)
|
||||
u = _normalize_user_id(user_id)
|
||||
if not u:
|
||||
raise ValueError("user_id is required")
|
||||
return f"mvp2-{u}-serve-{ts}-{suffix}"
|
||||
|
||||
@ -4,11 +4,13 @@ import os
|
||||
import secrets
|
||||
import threading
|
||||
from typing import Any
|
||||
import json
|
||||
from dataclasses import asdict
|
||||
|
||||
import yaml
|
||||
from fastapi import FastAPI, HTTPException, Request, Response
|
||||
|
||||
from argus.core.ids import new_task_id
|
||||
from argus.core.ids import new_model_key, new_task_id
|
||||
from argus.ray.models import AdvancedTaskSpec, JobSpec, RayConfig, parse_taskspec
|
||||
|
||||
from .advanced_command import expand_advanced_command, validate_advanced_command
|
||||
@ -16,6 +18,7 @@ from .config import V2Config
|
||||
from .db import Db
|
||||
from .janitor import JobsJanitor
|
||||
from .scheduler import Scheduler
|
||||
from .serving_spec import ServingSpec, parse_serving_spec, resolve_serving_spec
|
||||
from .sftpgo import SFTPGoAdminClient, SFTPGoError
|
||||
from .ui import register_ui_routes
|
||||
|
||||
@ -85,6 +88,61 @@ def create_app(config_path: str) -> FastAPI:
|
||||
common_root=f"{shared_root}/common",
|
||||
)
|
||||
|
||||
def _serving_enabled() -> bool:
|
||||
return bool(v2_cfg.serving.enabled)
|
||||
|
||||
def _openai_base_url(req: Request) -> str:
|
||||
# Prefer forwarded headers if present; otherwise fall back to Host.
|
||||
host = req.headers.get("x-forwarded-host") or req.headers.get("host") or req.url.hostname or "127.0.0.1"
|
||||
# Strip port if present (common for Host header).
|
||||
hostname = host
|
||||
if hostname.startswith("[") and "]" in hostname:
|
||||
# IPv6 like: [::1]:8080
|
||||
hostname = hostname.split("]")[0] + "]"
|
||||
else:
|
||||
hostname = hostname.split(":")[0]
|
||||
scheme = req.headers.get("x-forwarded-proto") or req.url.scheme or "http"
|
||||
port = int(v2_cfg.serving.serve.http_port)
|
||||
return f"{scheme}://{hostname}:{port}/v1"
|
||||
|
||||
def _dump_yaml(obj: Any) -> str:
|
||||
return yaml.safe_dump(obj, sort_keys=False)
|
||||
|
||||
def _serving_spec_to_dict(spec: ServingSpec) -> dict[str, Any]:
|
||||
return {
|
||||
"model_id": spec.model_id,
|
||||
"model_source": spec.model_source,
|
||||
"num_replicas": int(spec.num_replicas),
|
||||
"gpus_per_replica": int(spec.gpus_per_replica),
|
||||
"engine_kwargs": spec.engine_kwargs,
|
||||
}
|
||||
|
||||
def _serve_model_public(row: dict[str, Any], *, req: Request) -> dict[str, Any]:
|
||||
num_replicas = int(row.get("num_replicas") or 0)
|
||||
gpus_per_replica = int(row.get("gpus_per_replica") or 0)
|
||||
total_gpus = num_replicas * gpus_per_replica
|
||||
model_id = str(row.get("model_id") or "")
|
||||
return {
|
||||
"model_key": str(row.get("model_key") or ""),
|
||||
"user_id": str(row.get("user_id") or ""),
|
||||
"model_id": model_id,
|
||||
"model_id_suffix": str(row.get("model_id_suffix") or ""),
|
||||
"model_id_prefix": str(row.get("model_id_prefix") or ""),
|
||||
"model_source": str(row.get("model_source") or ""),
|
||||
"num_replicas": num_replicas,
|
||||
"gpus_per_replica": gpus_per_replica,
|
||||
"total_gpus": total_gpus,
|
||||
"state": str(row.get("state") or ""),
|
||||
"error_summary": row.get("error_summary"),
|
||||
"created_at": str(row.get("created_at") or ""),
|
||||
"updated_at": str(row.get("updated_at") or ""),
|
||||
"deleted_at": row.get("deleted_at"),
|
||||
"endpoint": {
|
||||
"openai_base_url": _openai_base_url(req),
|
||||
"model": model_id,
|
||||
},
|
||||
}
|
||||
|
||||
def _auth(req: Request) -> dict[str, Any]:
|
||||
token_env = v2_cfg.auth.token_env
|
||||
admin_token = os.environ.get(token_env, "")
|
||||
@ -565,6 +623,162 @@ def create_app(config_path: str) -> FastAPI:
|
||||
return db.list_queue()
|
||||
return db.list_queue(user_id=str(subject["user_id"]))
|
||||
|
||||
# v3.8: Model serving (Ray Serve LLM) management APIs.
|
||||
@app.post("/api/v2/serve/models")
|
||||
async def create_serve_model(req: Request) -> dict[str, Any]:
|
||||
subject = _auth(req)
|
||||
if not _serving_enabled():
|
||||
raise HTTPException(status_code=400, detail="serving is not enabled")
|
||||
|
||||
body = (await req.body()).decode("utf-8")
|
||||
try:
|
||||
obj = yaml.safe_load(body) or {}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=400, detail=f"invalid YAML: {e!r}")
|
||||
if not isinstance(obj, dict):
|
||||
raise HTTPException(status_code=400, detail="serving spec must be a YAML mapping")
|
||||
|
||||
user_id = str(subject["user_id"]).strip()
|
||||
try:
|
||||
spec = parse_serving_spec(obj)
|
||||
resolved = resolve_serving_spec(spec=spec, user_id=user_id)
|
||||
except PermissionError as e:
|
||||
raise HTTPException(status_code=403, detail=str(e))
|
||||
except ValueError as e:
|
||||
msg = str(e)
|
||||
code = 422 if ("num_replicas" in msg or "gpus_per_replica" in msg) else 400
|
||||
raise HTTPException(status_code=code, detail=f"invalid serving spec: {e!r}")
|
||||
|
||||
model_key = new_model_key(user_id=user_id)
|
||||
try:
|
||||
engine_kwargs_json = json.dumps(resolved.engine_kwargs, sort_keys=True) if resolved.engine_kwargs is not None else None
|
||||
except TypeError as e:
|
||||
raise HTTPException(status_code=400, detail=f"engine_kwargs must be JSON-serializable: {e!r}")
|
||||
|
||||
spec_yaml = _dump_yaml(_serving_spec_to_dict(spec))
|
||||
resolved_spec_yaml = _dump_yaml(asdict(resolved))
|
||||
|
||||
db.create_serve_model(
|
||||
model_key=model_key,
|
||||
user_id=user_id,
|
||||
model_id_suffix=resolved.model_id_suffix,
|
||||
model_id_prefix=resolved.model_id_prefix,
|
||||
model_id=resolved.model_id,
|
||||
model_source=resolved.model_source,
|
||||
num_replicas=resolved.num_replicas,
|
||||
gpus_per_replica=resolved.gpus_per_replica,
|
||||
engine_kwargs_json=engine_kwargs_json,
|
||||
spec_yaml=spec_yaml,
|
||||
resolved_spec_yaml=resolved_spec_yaml,
|
||||
)
|
||||
return {"model_key": model_key, "state": "QUEUED"}
|
||||
|
||||
@app.get("/api/v2/serve/models")
|
||||
async def list_serve_models(req: Request, limit: int = 200, offset: int = 0, include_deleted: int = 0) -> dict[str, Any]:
|
||||
subject = _auth(req)
|
||||
if not _serving_enabled():
|
||||
raise HTTPException(status_code=400, detail="serving is not enabled")
|
||||
|
||||
lim = max(1, min(int(limit), 1000))
|
||||
off = max(0, int(offset))
|
||||
inc = bool(int(include_deleted))
|
||||
user_id = str(subject["user_id"])
|
||||
|
||||
items = db.list_serve_models(user_id=user_id, include_deleted=inc, limit=lim, offset=off)
|
||||
out = [_serve_model_public(i, req=req) for i in items]
|
||||
return {
|
||||
"items": out,
|
||||
"openai_base_url": _openai_base_url(req),
|
||||
"limit": lim,
|
||||
"offset": off,
|
||||
"has_more": bool(len(items) == lim),
|
||||
}
|
||||
|
||||
@app.get("/api/v2/serve/models/{model_key}")
|
||||
async def get_serve_model(model_key: str, req: Request) -> dict[str, Any]:
|
||||
subject = _auth(req)
|
||||
if not _serving_enabled():
|
||||
raise HTTPException(status_code=400, detail="serving is not enabled")
|
||||
|
||||
row = db.get_serve_model(model_key)
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="model not found")
|
||||
if not subject.get("is_admin"):
|
||||
if str(row.get("user_id") or "") != str(subject["user_id"]):
|
||||
raise HTTPException(status_code=404, detail="model not found")
|
||||
|
||||
events = db.list_serve_events(model_key, limit=200, offset=0)
|
||||
ev_out = [
|
||||
{
|
||||
"id": int(e.get("id") or 0),
|
||||
"model_key": str(e.get("model_key") or ""),
|
||||
"created_at": str(e.get("ts") or ""),
|
||||
"event_type": str(e.get("event_type") or ""),
|
||||
"payload_json": e.get("payload_json"),
|
||||
}
|
||||
for e in events
|
||||
]
|
||||
return {
|
||||
"model": _serve_model_public(row, req=req),
|
||||
"resolved_spec_yaml": str(row.get("resolved_spec_yaml") or ""),
|
||||
"events": ev_out,
|
||||
"serve_status": None,
|
||||
}
|
||||
|
||||
@app.patch("/api/v2/serve/models/{model_key}")
|
||||
async def patch_serve_model(model_key: str, req: Request) -> dict[str, Any]:
|
||||
subject = _auth(req)
|
||||
if not _serving_enabled():
|
||||
raise HTTPException(status_code=400, detail="serving is not enabled")
|
||||
|
||||
row = db.get_serve_model(model_key)
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="model not found")
|
||||
if not subject.get("is_admin"):
|
||||
if str(row.get("user_id") or "") != str(subject["user_id"]):
|
||||
raise HTTPException(status_code=404, detail="model not found")
|
||||
|
||||
obj = await req.json()
|
||||
if not isinstance(obj, dict):
|
||||
raise HTTPException(status_code=400, detail="body must be a JSON object")
|
||||
if "num_replicas" not in obj:
|
||||
raise HTTPException(status_code=400, detail="missing num_replicas")
|
||||
num_replicas = obj.get("num_replicas")
|
||||
if not isinstance(num_replicas, int) or int(num_replicas) < 1:
|
||||
raise HTTPException(status_code=422, detail="num_replicas must be an integer >= 1")
|
||||
|
||||
db.update_serve_model_num_replicas(model_key=model_key, num_replicas=int(num_replicas))
|
||||
return {"model_key": model_key, "state": "QUEUED"}
|
||||
|
||||
@app.delete("/api/v2/serve/models/{model_key}")
|
||||
async def delete_serve_model(model_key: str, req: Request) -> dict[str, Any]:
|
||||
subject = _auth(req)
|
||||
if not _serving_enabled():
|
||||
raise HTTPException(status_code=400, detail="serving is not enabled")
|
||||
|
||||
row = db.get_serve_model(model_key)
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="model not found")
|
||||
if not subject.get("is_admin"):
|
||||
if str(row.get("user_id") or "") != str(subject["user_id"]):
|
||||
raise HTTPException(status_code=404, detail="model not found")
|
||||
|
||||
db.set_serve_model_state(model_key=model_key, state="DELETING", event_type="SERVE_DELETE_REQUESTED")
|
||||
return {"model_key": model_key, "state": "DELETING"}
|
||||
|
||||
@app.get("/api/v2/serve/status")
|
||||
async def serve_status(req: Request) -> dict[str, Any]:
|
||||
_require_admin(req)
|
||||
if not _serving_enabled():
|
||||
raise HTTPException(status_code=400, detail="serving is not enabled")
|
||||
return {
|
||||
"enabled": True,
|
||||
"openai_base_url": _openai_base_url(req),
|
||||
"http_port": int(v2_cfg.serving.serve.http_port),
|
||||
"proxy_location": str(v2_cfg.serving.serve.proxy_location),
|
||||
"accelerator_type": str(v2_cfg.serving.llm.accelerator_type),
|
||||
}
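
Taken together, these endpoints give a small declarative lifecycle: POST a ServingSpec, poll GET until the reconciler reports RUNNING, PATCH num_replicas to scale, DELETE to retire. A minimal client sketch follows (assumptions: the API listens on http://localhost:8080 and USER_TOKEN is a token minted via /api/v2/users/<user_id>/tokens; neither value is fixed by this change):

```python
# Hypothetical client-side sketch; base URL and token are assumptions.
import json
import urllib.request

BASE = "http://localhost:8080"   # assumed API address
TOKEN = "USER_TOKEN"             # assumed user token
HEADERS = {"Authorization": f"Bearer {TOKEN}"}


def _call(method: str, path: str, body: bytes | None = None, content_type: str | None = None) -> dict:
    req = urllib.request.Request(BASE + path, data=body, method=method, headers=dict(HEADERS))
    if content_type:
        req.add_header("Content-Type", content_type)
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))


# 1) Declare a model (YAML body, as the POST handler above expects).
spec = (
    "model_id: qwen-0.5b\n"
    "model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<SNAPSHOT_HASH>\n"
    "num_replicas: 1\n"
    "gpus_per_replica: 1\n"
)
created = _call("POST", "/api/v2/serve/models", spec.encode("utf-8"), "application/yaml")

# 2) Read back state and resolved model_id; poll until RUNNING or FAILED.
detail = _call("GET", f"/api/v2/serve/models/{created['model_key']}")
print(detail["model"]["state"], detail["model"]["model_id"])

# 3) Scale to 2 replicas, then retire the model.
_call("PATCH", f"/api/v2/serve/models/{created['model_key']}",
      json.dumps({"num_replicas": 2}).encode("utf-8"), "application/json")
_call("DELETE", f"/api/v2/serve/models/{created['model_key']}")
```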
|
||||
|
||||
# v3.0: minimal WebUI (no server-side session; token stored in browser localStorage).
|
||||
register_ui_routes(app)
|
||||
|
||||
|
||||
@@ -57,6 +57,24 @@ class V2SFTPGoConfig:
|
||||
admin_password_env: str = "SFTPGO_ADMIN_PASSWORD"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class V2ServingServeConfig:
|
||||
http_port: int = 8000
|
||||
proxy_location: str = "HeadOnly"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class V2ServingLLMConfig:
|
||||
accelerator_type: str = ""
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class V2ServingConfig:
|
||||
enabled: bool = False
|
||||
serve: V2ServingServeConfig = V2ServingServeConfig()
|
||||
llm: V2ServingLLMConfig = V2ServingLLMConfig()
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class V2DataConfig:
|
||||
user_root: str
|
||||
@@ -72,6 +90,7 @@ class V2Config:
|
||||
scheduler: V2SchedulerConfig
|
||||
tracking: V2TrackingConfig
|
||||
data: V2DataConfig
|
||||
serving: V2ServingConfig
|
||||
|
||||
@staticmethod
|
||||
def from_root_dict(root: dict[str, Any]) -> "V2Config":
|
||||
@@ -112,6 +131,15 @@ class V2Config:
|
||||
if not isinstance(sftpgo, dict) or not isinstance(retention, dict):
|
||||
raise ValueError("config.data.{sftpgo,retention} must be mappings")
|
||||
|
||||
serving = root.get("serving") or {}
|
||||
if not isinstance(serving, dict):
|
||||
raise ValueError("config.serving must be a mapping")
|
||||
serving_enabled = bool(serving.get("enabled")) if "enabled" in serving else bool(serving)
|
||||
serving_serve = serving.get("serve") or {}
|
||||
serving_llm = serving.get("llm") or {}
|
||||
if not isinstance(serving_serve, dict) or not isinstance(serving_llm, dict):
|
||||
raise ValueError("config.serving.{serve,llm} must be mappings")
|
||||
|
||||
default_db_path = f"{shared_root}/common/db/mvp.sqlite3"
|
||||
db_path = str(sqlite.get("db_path") or default_db_path)
|
||||
|
||||
@@ -158,4 +186,14 @@ class V2Config:
|
||||
janitor_interval_s=int(retention.get("janitor_interval_s") or 3600),
|
||||
),
|
||||
),
|
||||
serving=V2ServingConfig(
|
||||
enabled=serving_enabled,
|
||||
serve=V2ServingServeConfig(
|
||||
http_port=int(serving_serve.get("http_port") or 8000),
|
||||
proxy_location=str(serving_serve.get("proxy_location") or "HeadOnly"),
|
||||
),
|
||||
llm=V2ServingLLMConfig(
|
||||
accelerator_type=str(serving_llm.get("accelerator_type") or ""),
|
||||
),
|
||||
),
|
||||
)
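
For reference, a root config mapping that exercises the new `serving` section might look like the sketch below; it mirrors the test fixture later in this change, and the paths/ports are illustrative rather than canonical:

```python
# Illustrative only: keys follow V2Config.from_root_dict above; values are examples.
from argus.service.config import V2Config

root = {
    "ray": {
        "address": "http://127.0.0.1:8265",
        "shared_root": "/private",
        "entrypoint_resources": {"worker_node": 1},
        "runtime_env": {"env_vars": {}},
    },
    "data": {"user_root": "/private/users"},
    "service": {
        "api": {"host": "127.0.0.1", "port": 8080},
        "auth": {"token_env": "MVP_INTERNAL_TOKEN"},
        "sqlite": {"db_path": "/private/common/db/mvp.sqlite3"},
        "scheduler": {"tick_s": 1, "retry_interval_s": 1, "max_running_tasks": 1},
    },
    "serving": {
        "enabled": True,
        "serve": {"http_port": 8000, "proxy_location": "HeadOnly"},
        "llm": {"accelerator_type": "H20"},
    },
}

cfg = V2Config.from_root_dict(root)
assert cfg.serving.enabled
assert cfg.serving.serve.http_port == 8000
assert cfg.serving.llm.accelerator_type == "H20"
```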
|
||||
|
||||
@@ -117,6 +117,43 @@ class Db:
|
||||
)
|
||||
"""
|
||||
)
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS serve_models (
|
||||
model_key TEXT PRIMARY KEY,
|
||||
user_id TEXT NOT NULL,
|
||||
model_id_suffix TEXT NOT NULL,
|
||||
model_id_prefix TEXT NOT NULL,
|
||||
model_id TEXT NOT NULL,
|
||||
model_source TEXT NOT NULL,
|
||||
num_replicas INTEGER NOT NULL,
|
||||
gpus_per_replica INTEGER NOT NULL,
|
||||
engine_kwargs_json TEXT,
|
||||
state TEXT NOT NULL,
|
||||
spec_yaml TEXT NOT NULL,
|
||||
resolved_spec_yaml TEXT NOT NULL,
|
||||
error_summary TEXT,
|
||||
created_at TEXT NOT NULL,
|
||||
updated_at TEXT NOT NULL,
|
||||
deleted_at TEXT
|
||||
)
|
||||
"""
|
||||
)
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS serve_events (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
model_key TEXT NOT NULL,
|
||||
ts TEXT NOT NULL,
|
||||
event_type TEXT NOT NULL,
|
||||
payload_json TEXT,
|
||||
FOREIGN KEY (model_key) REFERENCES serve_models(model_key) ON DELETE CASCADE
|
||||
)
|
||||
"""
|
||||
)
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_serve_models_user ON serve_models(user_id)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_serve_models_state ON serve_models(state)")
|
||||
conn.execute("CREATE INDEX IF NOT EXISTS idx_serve_events_model ON serve_events(model_key)")
|
||||
|
||||
@contextmanager
|
||||
def tx(self) -> Iterator[sqlite3.Connection]:
|
||||
@@ -493,3 +530,239 @@ class Db:
|
||||
(str(end_time_le), int(limit)),
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
def create_serve_model(
|
||||
self,
|
||||
*,
|
||||
model_key: str,
|
||||
user_id: str,
|
||||
model_id_suffix: str,
|
||||
model_id_prefix: str,
|
||||
model_id: str,
|
||||
model_source: str,
|
||||
num_replicas: int,
|
||||
gpus_per_replica: int,
|
||||
spec_yaml: str,
|
||||
resolved_spec_yaml: str,
|
||||
engine_kwargs_json: str | None = None,
|
||||
) -> dict[str, Any]:
|
||||
now = _utc_now_iso()
|
||||
with self.tx() as conn:
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO serve_models (
|
||||
model_key,
|
||||
user_id,
|
||||
model_id_suffix,
|
||||
model_id_prefix,
|
||||
model_id,
|
||||
model_source,
|
||||
num_replicas,
|
||||
gpus_per_replica,
|
||||
engine_kwargs_json,
|
||||
state,
|
||||
spec_yaml,
|
||||
resolved_spec_yaml,
|
||||
created_at,
|
||||
updated_at
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 'QUEUED', ?, ?, ?, ?)
|
||||
""",
|
||||
(
|
||||
model_key,
|
||||
user_id,
|
||||
model_id_suffix,
|
||||
model_id_prefix,
|
||||
model_id,
|
||||
model_source,
|
||||
int(num_replicas),
|
||||
int(gpus_per_replica),
|
||||
engine_kwargs_json,
|
||||
spec_yaml,
|
||||
resolved_spec_yaml,
|
||||
now,
|
||||
now,
|
||||
),
|
||||
)
|
||||
conn.execute(
|
||||
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, 'SERVE_MODEL_CREATED', ?)",
|
||||
(model_key, now, None),
|
||||
)
|
||||
row = conn.execute("SELECT * FROM serve_models WHERE model_key = ?", (model_key,)).fetchone()
|
||||
return dict(row) if row else {}
|
||||
|
||||
def list_serve_models(
|
||||
self,
|
||||
*,
|
||||
user_id: str,
|
||||
include_deleted: bool = False,
|
||||
limit: int = 200,
|
||||
offset: int = 0,
|
||||
) -> list[dict[str, Any]]:
|
||||
with self._connect() as conn:
|
||||
where_sql = "WHERE user_id = ?"
|
||||
params: list[Any] = [user_id]
|
||||
if not include_deleted:
|
||||
where_sql += " AND deleted_at IS NULL"
|
||||
params.append(int(limit))
|
||||
params.append(max(0, int(offset)))
|
||||
rows = conn.execute(
|
||||
f"""
|
||||
SELECT
|
||||
model_key,
|
||||
user_id,
|
||||
model_id_suffix,
|
||||
model_id_prefix,
|
||||
model_id,
|
||||
model_source,
|
||||
num_replicas,
|
||||
gpus_per_replica,
|
||||
engine_kwargs_json,
|
||||
state,
|
||||
error_summary,
|
||||
created_at,
|
||||
updated_at,
|
||||
deleted_at
|
||||
FROM serve_models
|
||||
{where_sql}
|
||||
ORDER BY created_at DESC, model_key DESC
|
||||
LIMIT ? OFFSET ?
|
||||
""",
|
||||
tuple(params),
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
def list_all_serve_models(
|
||||
self,
|
||||
*,
|
||||
include_deleted: bool = False,
|
||||
limit: int = 2000,
|
||||
offset: int = 0,
|
||||
) -> list[dict[str, Any]]:
|
||||
with self._connect() as conn:
|
||||
where_sql = ""
|
||||
if not include_deleted:
|
||||
where_sql = "WHERE deleted_at IS NULL"
|
||||
rows = conn.execute(
|
||||
f"""
|
||||
SELECT
|
||||
model_key,
|
||||
user_id,
|
||||
model_id_suffix,
|
||||
model_id_prefix,
|
||||
model_id,
|
||||
model_source,
|
||||
num_replicas,
|
||||
gpus_per_replica,
|
||||
engine_kwargs_json,
|
||||
state,
|
||||
error_summary,
|
||||
spec_yaml,
|
||||
resolved_spec_yaml,
|
||||
created_at,
|
||||
updated_at,
|
||||
deleted_at
|
||||
FROM serve_models
|
||||
{where_sql}
|
||||
ORDER BY created_at ASC, model_key ASC
|
||||
LIMIT ? OFFSET ?
|
||||
""",
|
||||
(int(limit), max(0, int(offset))),
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
def get_serve_model(self, model_key: str) -> dict[str, Any] | None:
|
||||
with self._connect() as conn:
|
||||
row = conn.execute("SELECT * FROM serve_models WHERE model_key = ?", (model_key,)).fetchone()
|
||||
return dict(row) if row else None
|
||||
|
||||
def list_serve_events(self, model_key: str, limit: int = 200, offset: int = 0) -> list[dict[str, Any]]:
|
||||
with self._connect() as conn:
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT id, model_key, ts, event_type, payload_json
|
||||
FROM serve_events
|
||||
WHERE model_key = ?
|
||||
ORDER BY id DESC
|
||||
LIMIT ? OFFSET ?
|
||||
""",
|
||||
(model_key, int(limit), max(0, int(offset))),
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
def append_serve_event(self, *, model_key: str, event_type: str, payload_json: str | None = None) -> None:
|
||||
now = _utc_now_iso()
|
||||
with self.tx() as conn:
|
||||
conn.execute(
|
||||
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, ?, ?)",
|
||||
(model_key, now, event_type, payload_json),
|
||||
)
|
||||
|
||||
def set_serve_model_state(
|
||||
self,
|
||||
*,
|
||||
model_key: str,
|
||||
state: str,
|
||||
error_summary: str | None = None,
|
||||
event_type: str = "SERVE_STATE_UPDATE",
|
||||
payload_json: str | None = None,
|
||||
) -> None:
|
||||
now = _utc_now_iso()
|
||||
with self.tx() as conn:
|
||||
sets = ["state = ?", "updated_at = ?"]
|
||||
params: list[Any] = [state, now]
|
||||
if error_summary is not None:
|
||||
sets.append("error_summary = ?")
|
||||
params.append(error_summary)
|
||||
if state == "DELETED":
|
||||
sets.append("deleted_at = ?")
|
||||
params.append(now)
|
||||
params.append(model_key)
|
||||
conn.execute(f"UPDATE serve_models SET {', '.join(sets)} WHERE model_key = ?", tuple(params))
|
||||
conn.execute(
|
||||
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, ?, ?)",
|
||||
(model_key, now, event_type, payload_json),
|
||||
)
|
||||
|
||||
def update_serve_model_num_replicas(self, *, model_key: str, num_replicas: int) -> None:
|
||||
if not isinstance(num_replicas, int) or num_replicas < 1:
|
||||
raise ValueError("num_replicas must be an integer >= 1")
|
||||
now = _utc_now_iso()
|
||||
with self.tx() as conn:
|
||||
conn.execute(
|
||||
"""
|
||||
UPDATE serve_models
|
||||
SET num_replicas = ?, state = 'QUEUED', error_summary = NULL, updated_at = ?
|
||||
WHERE model_key = ?
|
||||
""",
|
||||
(int(num_replicas), now, model_key),
|
||||
)
|
||||
conn.execute(
|
||||
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, 'SERVE_PATCH_NUM_REPLICAS', ?)",
|
||||
(model_key, now, str(num_replicas)),
|
||||
)
|
||||
|
||||
def pick_next_runnable_serve_change(self) -> dict[str, Any] | None:
|
||||
"""
|
||||
Returns the next serve model that needs reconciliation.
|
||||
|
||||
Minimal state machine for now:
|
||||
- QUEUED: needs (re)apply
|
||||
- DELETING: needs removal
|
||||
"""
|
||||
with self._connect() as conn:
|
||||
row = conn.execute(
|
||||
"""
|
||||
SELECT *
|
||||
FROM serve_models
|
||||
WHERE deleted_at IS NULL
|
||||
AND state IN ('QUEUED','DELETING')
|
||||
ORDER BY updated_at ASC
|
||||
LIMIT 1
|
||||
"""
|
||||
).fetchone()
|
||||
return dict(row) if row else None
|
||||
|
||||
# Backward compatible naming (v3.8 docs originally used "upsert").
|
||||
def upsert_serve_model(self, **kwargs: Any) -> dict[str, Any]:
|
||||
return self.create_serve_model(**kwargs)
|
||||
|
||||
@@ -16,6 +16,8 @@ from argus.ray.ray_job_tool import RayJobTool
|
||||
from .config import V2Config
|
||||
from .db import Db
|
||||
from .ray_resources import ensure_ray_connected, get_cluster_available
|
||||
from .serve_client import RayServeClient
|
||||
from .serving_reconciler import ServingReconciler
|
||||
|
||||
|
||||
_INSUFFICIENT_RE = re.compile(r"Total available GPUs\s+\d+\s+is less than total desired GPUs\s+\d+")
|
||||
@@ -37,6 +39,18 @@ class Scheduler:
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
self.tool = RayJobTool(self.ray_cfg)
|
||||
self._serving: ServingReconciler | None = None
|
||||
if bool(self.v2_cfg.serving.enabled):
|
||||
self._serving = ServingReconciler(
|
||||
db=self.db,
|
||||
v2_cfg=self.v2_cfg,
|
||||
ray_runtime_env_env_vars=self.ray_cfg.runtime_env_env_vars,
|
||||
serve_client=RayServeClient(
|
||||
http_port=int(self.v2_cfg.serving.serve.http_port),
|
||||
proxy_location=str(self.v2_cfg.serving.serve.proxy_location),
|
||||
ray_init_address="auto",
|
||||
),
|
||||
)
|
||||
|
||||
def _job_dir_for_task(self, *, user_id: str | None, ray_submission_id: str) -> str:
|
||||
root = self.ray_cfg.shared_root.rstrip("/")
|
||||
@@ -251,6 +265,14 @@ class Scheduler:
|
||||
def tick(self) -> None:
|
||||
ensure_ray_connected()
|
||||
|
||||
# v3.8: reconcile serve_models (best-effort).
|
||||
if self._serving is not None:
|
||||
try:
|
||||
self._serving.tick()
|
||||
except Exception:
|
||||
# Keep scheduler alive even if serving tick fails.
|
||||
pass
|
||||
|
||||
# Sync active tasks
|
||||
for row in self.db.list_active_tasks(limit=50):
|
||||
self._sync_one_running(row)
|
||||
|
||||
45
src/mvp/py/argus/service/serve_client.py
Normal file
@@ -0,0 +1,45 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RayServeClient:
|
||||
"""
|
||||
Minimal Ray Serve client wrapper.
|
||||
|
||||
This is intentionally tiny and uses runtime imports so that:
|
||||
- unit tests can stub `ray` modules without needing real Ray installed
|
||||
- production can run with the real Ray Serve stack (v3.8+)
|
||||
"""
|
||||
|
||||
http_port: int = 8000
|
||||
proxy_location: str = "HeadOnly"
|
||||
ray_init_address: str = "auto"
|
||||
|
||||
def ensure_started(self) -> None:
|
||||
import ray # runtime import
|
||||
|
||||
# Scheduler already calls ray.init(); make this idempotent.
|
||||
ray.init(address=self.ray_init_address, ignore_reinit_error=True, log_to_driver=False) # type: ignore[call-arg]
|
||||
|
||||
# Import serve lazily to allow tests to stub it.
|
||||
from ray import serve # type: ignore
|
||||
|
||||
serve.start(proxy_location=self.proxy_location, http_options={"host": "0.0.0.0", "port": int(self.http_port)})
|
||||
|
||||
def apply_app(self, *, app: Any, app_name: str, route_prefix: str = "/") -> Any:
|
||||
from ray import serve # type: ignore
|
||||
|
||||
# If Ray Serve LLM isn't available, callers may pass a plain dict placeholder.
|
||||
# Running that through serve.run() results in a confusing TypeError; fail fast.
|
||||
if isinstance(app, dict):
|
||||
raise ValueError("invalid serve app object (Ray Serve LLM not available or build_openai_app failed)")
|
||||
|
||||
return serve.run(app, name=app_name, route_prefix=route_prefix)
|
||||
|
||||
def get_status(self) -> Any:
|
||||
from ray import serve # type: ignore
|
||||
|
||||
return serve.status()
|
||||
63
src/mvp/py/argus/service/serve_llm_config.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from .serving_spec import ResolvedServingSpec
|
||||
|
||||
|
||||
def _ensure_hf_env_defaults(env: dict[str, str]) -> dict[str, str]:
|
||||
out = dict(env or {})
|
||||
# Prefer existing values if present, but always force offline mode in the platform.
|
||||
out.setdefault("HF_HOME", "/private/hf")
|
||||
out.setdefault("HUGGINGFACE_HUB_CACHE", "/private/hf/hub")
|
||||
out.setdefault("TRANSFORMERS_CACHE", "/private/hf/transformers")
|
||||
out["HF_HUB_OFFLINE"] = "1"
|
||||
return out
|
||||
|
||||
|
||||
def build_llm_config_dict(
|
||||
resolved: ResolvedServingSpec,
|
||||
*,
|
||||
accelerator_type: str,
|
||||
runtime_env_env_vars: dict[str, str] | None,
|
||||
cpu_per_gpu: float = 1.0,
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Pure builder: maps a platform ResolvedServingSpec to a Ray Serve LLM-like config.
|
||||
|
||||
We return a plain dict here to keep this layer unit-testable without depending on
|
||||
a specific Ray Serve LLM version. The reconciler (later milestone) can choose to
|
||||
instantiate `ray.serve.llm.LLMConfig` using this dict.
|
||||
"""
|
||||
if not accelerator_type:
|
||||
raise ValueError("accelerator_type is required")
|
||||
if resolved.num_replicas < 1:
|
||||
raise ValueError("num_replicas must be >= 1")
|
||||
if resolved.gpus_per_replica < 1:
|
||||
raise ValueError("gpus_per_replica must be >= 1")
|
||||
if cpu_per_gpu <= 0:
|
||||
raise ValueError("cpu_per_gpu must be > 0")
|
||||
|
||||
engine_kwargs: dict[str, Any] = dict(resolved.engine_kwargs or {})
|
||||
# Enforce the tensor-parallel mapping: any user-provided tensor_parallel_size is overwritten with gpus_per_replica.
|
||||
engine_kwargs["tensor_parallel_size"] = int(resolved.gpus_per_replica)
|
||||
|
||||
# Ray Serve LLM (Ray 2.49.x) exposes `resources_per_bundle` instead of the older
|
||||
# `placement_group_config`. Use a single bundle that reserves the full GPU set
|
||||
# required by tensor-parallel execution.
|
||||
resources_per_bundle = {
|
||||
"GPU": float(resolved.gpus_per_replica),
|
||||
"CPU": float(cpu_per_gpu) * float(resolved.gpus_per_replica),
|
||||
}
|
||||
|
||||
env_vars = _ensure_hf_env_defaults(dict(runtime_env_env_vars or {}))
|
||||
|
||||
return {
|
||||
# Ray Serve LLM expects `model_loading_config` with model_id/model_source.
|
||||
"model_loading_config": {"model_id": resolved.model_id, "model_source": resolved.model_source},
|
||||
"accelerator_type": accelerator_type,
|
||||
"deployment_config": {"num_replicas": int(resolved.num_replicas)},
|
||||
"engine_kwargs": engine_kwargs,
|
||||
"resources_per_bundle": resources_per_bundle,
|
||||
"runtime_env": {"env_vars": env_vars},
|
||||
}
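
A quick illustration of the mapping, using made-up spec values (the accelerator type, snapshot path, and gpu_memory_utilization follow the examples used elsewhere in this change):

```python
# Hypothetical inputs; only the shape of the output dict matters here.
from argus.service.serve_llm_config import build_llm_config_dict
from argus.service.serving_spec import ResolvedServingSpec

resolved = ResolvedServingSpec(
    user_id="alice",
    model_id_suffix="qwen-0.5b",
    model_id_prefix="alice-202601061235",
    model_id="alice-202601061235-qwen-0.5b",
    model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha",
    num_replicas=1,
    gpus_per_replica=2,
    engine_kwargs={"gpu_memory_utilization": 0.4},
)

cfg = build_llm_config_dict(
    resolved,
    accelerator_type="H20",
    runtime_env_env_vars={"HF_HOME": "/private/hf"},
    cpu_per_gpu=2.0,
)

# tensor_parallel_size is forced to gpus_per_replica, the single bundle reserves
# 2 GPUs + 4 CPUs, and HF_HUB_OFFLINE is always injected into the runtime env.
assert cfg["engine_kwargs"]["tensor_parallel_size"] == 2
assert cfg["resources_per_bundle"] == {"GPU": 2.0, "CPU": 4.0}
assert cfg["runtime_env"]["env_vars"]["HF_HUB_OFFLINE"] == "1"
assert cfg["deployment_config"] == {"num_replicas": 1}
```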
|
||||
151
src/mvp/py/argus/service/serving_reconciler.py
Normal file
@@ -0,0 +1,151 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import traceback
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Protocol
|
||||
|
||||
from argus.service.ray_resources import ClusterAvailable, get_cluster_available
|
||||
|
||||
from .config import V2Config
|
||||
from .db import Db
|
||||
from .serve_llm_config import build_llm_config_dict
|
||||
from .serving_spec import ResolvedServingSpec
|
||||
|
||||
|
||||
class ServeClient(Protocol):
|
||||
def ensure_started(self) -> None: ...
|
||||
|
||||
def apply_app(self, *, app: Any, app_name: str, route_prefix: str = "/") -> Any: ...
|
||||
|
||||
def get_status(self) -> Any: ...
|
||||
|
||||
|
||||
def _parse_engine_kwargs(row: dict[str, Any]) -> dict[str, Any] | None:
|
||||
raw = row.get("engine_kwargs_json")
|
||||
if raw in (None, ""):
|
||||
return None
|
||||
try:
|
||||
obj = json.loads(str(raw))
|
||||
return obj if isinstance(obj, dict) else None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _row_to_resolved_spec(row: dict[str, Any]) -> ResolvedServingSpec:
|
||||
return ResolvedServingSpec(
|
||||
user_id=str(row["user_id"]),
|
||||
model_id_suffix=str(row["model_id_suffix"]),
|
||||
model_id_prefix=str(row["model_id_prefix"]),
|
||||
model_id=str(row["model_id"]),
|
||||
model_source=str(row["model_source"]),
|
||||
num_replicas=int(row["num_replicas"]),
|
||||
gpus_per_replica=int(row["gpus_per_replica"]),
|
||||
engine_kwargs=_parse_engine_kwargs(row),
|
||||
)
|
||||
|
||||
|
||||
def _needed_total_gpus(rows: list[dict[str, Any]]) -> int:
|
||||
total = 0
|
||||
for r in rows:
|
||||
total += int(r.get("num_replicas") or 0) * int(r.get("gpus_per_replica") or 0)
|
||||
return total
|
||||
|
||||
|
||||
@dataclass
|
||||
class ServingReconciler:
|
||||
"""
|
||||
v3.8: reconcile declared serve_models (SQLite) into a multi-model Ray Serve app.
|
||||
|
||||
This reconciler is intentionally conservative:
|
||||
- Only acts on models in states QUEUED/DELETING.
|
||||
- Performs a minimal GPU precheck using ray available GPU totals.
|
||||
- Writes events and state transitions for explainability.
|
||||
"""
|
||||
|
||||
db: Db
|
||||
v2_cfg: V2Config
|
||||
ray_runtime_env_env_vars: dict[str, str]
|
||||
serve_client: ServeClient
|
||||
app_name: str = "argus_llm_app"
|
||||
route_prefix: str = "/"
|
||||
cpu_per_gpu: float = 1.0
|
||||
get_available_fn: Any = get_cluster_available
|
||||
|
||||
def tick(self) -> None:
|
||||
# Pick the next desired change.
|
||||
change = self.db.pick_next_runnable_serve_change()
|
||||
if not change:
|
||||
return
|
||||
|
||||
model_key = str(change["model_key"])
|
||||
state = str(change.get("state") or "")
|
||||
|
||||
# Ensure Ray (and Serve) can be started before doing anything else.
|
||||
try:
|
||||
self.serve_client.ensure_started()
|
||||
except Exception as e:
|
||||
self.db.append_serve_event(model_key=model_key, event_type="SERVE_START_ERROR", payload_json=repr(e))
|
||||
return
|
||||
|
||||
# Desired set: all non-deleted models except those marked DELETING.
|
||||
all_rows = self.db.list_all_serve_models(include_deleted=False, limit=5000, offset=0)
|
||||
# FAILED models are not part of the desired running set. A user can PATCH to
|
||||
# re-queue a failed model (e.g., after fixing env/deps) which will move it back to QUEUED.
|
||||
desired_rows = [r for r in all_rows if str(r.get("state") or "") not in ("DELETING", "DELETED", "FAILED")]
|
||||
|
||||
# Precheck resources: multi-model app apply needs enough GPUs for the whole desired set.
|
||||
needed = _needed_total_gpus(desired_rows)
|
||||
avail: ClusterAvailable = self.get_available_fn()
|
||||
if float(avail.total_available_gpus) < float(needed):
|
||||
msg = f"Insufficient GPUs: need {needed}, available {avail.total_available_gpus}"
|
||||
self.db.append_serve_event(model_key=model_key, event_type="SERVE_PENDING_RESOURCES", payload_json=msg)
|
||||
return
|
||||
|
||||
# Build per-model LLM configs (dict form in M4).
|
||||
llm_cfg_dicts: list[dict[str, Any]] = []
|
||||
accelerator_type = str(self.v2_cfg.serving.llm.accelerator_type or "")
|
||||
for r in desired_rows:
|
||||
resolved = _row_to_resolved_spec(r)
|
||||
llm_cfg_dicts.append(
|
||||
build_llm_config_dict(
|
||||
resolved,
|
||||
accelerator_type=accelerator_type,
|
||||
runtime_env_env_vars=self.ray_runtime_env_env_vars,
|
||||
cpu_per_gpu=self.cpu_per_gpu,
|
||||
)
|
||||
)
|
||||
|
||||
# Build a Ray Serve OpenAI-compatible app if Ray Serve LLM is available.
|
||||
# Fall back to a plain dict so unit tests can run without real Ray Serve.
|
||||
app_obj: Any
|
||||
try:
|
||||
from ray.serve.llm import LLMConfig, build_openai_app # type: ignore
|
||||
|
||||
llm_cfgs = [LLMConfig(**d) for d in llm_cfg_dicts]
|
||||
app_obj = build_openai_app({"llm_configs": llm_cfgs})
|
||||
except Exception as e:
|
||||
self.db.append_serve_event(model_key=model_key, event_type="SERVE_LLM_IMPORT_ERROR", payload_json=repr(e))
|
||||
app_obj = {"llm_configs": llm_cfg_dicts}
|
||||
|
||||
try:
|
||||
self.db.append_serve_event(model_key=model_key, event_type="SERVE_APPLY_REQUESTED", payload_json=str(len(llm_cfg_dicts)))
|
||||
self.serve_client.apply_app(app=app_obj, app_name=self.app_name, route_prefix=self.route_prefix)
|
||||
except Exception as e:
|
||||
err = f"{type(e).__name__}: {e}"
|
||||
tb = traceback.format_exc(limit=10)
|
||||
self.db.set_serve_model_state(model_key=model_key, state="FAILED", error_summary=err, event_type="SERVE_APPLY_FAILED", payload_json=tb)
|
||||
return
|
||||
|
||||
# Apply succeeded. Update the changing model's state.
|
||||
if state == "DELETING":
|
||||
self.db.set_serve_model_state(model_key=model_key, state="DELETED", event_type="SERVE_DELETE_APPLIED")
|
||||
return
|
||||
|
||||
# Mark as deploying; best-effort status probe can promote to RUNNING.
|
||||
self.db.set_serve_model_state(model_key=model_key, state="DEPLOYING", event_type="SERVE_DEPLOYING")
|
||||
try:
|
||||
_ = self.serve_client.get_status()
|
||||
self.db.set_serve_model_state(model_key=model_key, state="RUNNING", event_type="SERVE_RUNNING")
|
||||
except Exception as e:
|
||||
self.db.append_serve_event(model_key=model_key, event_type="SERVE_STATUS_ERROR", payload_json=repr(e))
|
||||
144
src/mvp/py/argus/service/serving_spec.py
Normal file
@@ -0,0 +1,144 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
|
||||
_MODEL_ID_SUFFIX_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ServingSpec:
|
||||
model_id: str
|
||||
model_source: str
|
||||
num_replicas: int = 1
|
||||
gpus_per_replica: int = 1
|
||||
engine_kwargs: dict[str, Any] | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ResolvedServingSpec:
|
||||
user_id: str
|
||||
model_id_suffix: str
|
||||
model_id_prefix: str
|
||||
model_id: str
|
||||
model_source: str
|
||||
num_replicas: int
|
||||
gpus_per_replica: int
|
||||
engine_kwargs: dict[str, Any] | None
|
||||
|
||||
|
||||
def validate_model_id_suffix(suffix: str) -> None:
|
||||
if not isinstance(suffix, str):
|
||||
raise ValueError("model_id must be a string")
|
||||
s = suffix.strip()
|
||||
if s != suffix:
|
||||
raise ValueError("model_id must not contain leading/trailing whitespace")
|
||||
if not s:
|
||||
raise ValueError("model_id is required")
|
||||
if not _MODEL_ID_SUFFIX_RE.match(s):
|
||||
raise ValueError("model_id must match regex: ^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$")
|
||||
if ".." in s:
|
||||
raise ValueError("model_id must not contain '..'")
|
||||
|
||||
|
||||
def make_model_id_prefix(*, user_id: str, now_utc: datetime | None = None) -> str:
|
||||
if not user_id or not isinstance(user_id, str):
|
||||
raise ValueError("user_id is required")
|
||||
if "/" in user_id:
|
||||
raise ValueError("user_id must not contain '/'")
|
||||
|
||||
dt = now_utc or datetime.now(timezone.utc)
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
dt = dt.astimezone(timezone.utc)
|
||||
stamp = dt.strftime("%Y%m%d%H%M")
|
||||
return f"{user_id}-{stamp}"
|
||||
|
||||
|
||||
def expand_home_macros(*, user_id: str, text: str) -> str:
|
||||
if not isinstance(text, str):
|
||||
raise ValueError("model_source must be a string")
|
||||
if not text:
|
||||
raise ValueError("model_source is required")
|
||||
|
||||
out = text
|
||||
out = out.replace("$HOME/common/hf", "/private/hf")
|
||||
out = out.replace("$HOME/common/datasets", "/private/datasets")
|
||||
out = out.replace("$HOME", f"/private/users/{user_id}")
|
||||
return out
|
||||
|
||||
|
||||
def validate_model_source_path(*, user_id: str, model_source: str) -> None:
|
||||
if not isinstance(model_source, str):
|
||||
raise ValueError("model_source must be a string")
|
||||
if not model_source.startswith("/"):
|
||||
raise ValueError("model_source must be an absolute path")
|
||||
if not model_source.startswith("/private/"):
|
||||
raise ValueError("model_source must be under /private")
|
||||
if "\x00" in model_source:
|
||||
raise ValueError("model_source contains null byte")
|
||||
parts = [p for p in model_source.split("/") if p]
|
||||
if any(p == ".." for p in parts):
|
||||
raise ValueError("model_source must not contain '..'")
|
||||
|
||||
allowed_user_prefix = f"/private/users/{user_id}/"
|
||||
allowed = model_source.startswith("/private/hf/") or model_source.startswith(allowed_user_prefix)
|
||||
if not allowed:
|
||||
raise PermissionError("model_source is not allowed (must be under /private/hf or your /private/users/<user_id>)")
|
||||
|
||||
|
||||
def parse_serving_spec(obj: Any) -> ServingSpec:
|
||||
if not isinstance(obj, dict):
|
||||
raise ValueError("serving spec must be a mapping")
|
||||
|
||||
model_id = obj.get("model_id")
|
||||
model_source = obj.get("model_source")
|
||||
num_replicas = obj.get("num_replicas", 1)
|
||||
gpus_per_replica = obj.get("gpus_per_replica", 1)
|
||||
engine_kwargs = obj.get("engine_kwargs", None)
|
||||
|
||||
if not isinstance(model_id, str):
|
||||
raise ValueError("missing required field: model_id")
|
||||
validate_model_id_suffix(model_id)
|
||||
|
||||
if not isinstance(model_source, str) or not model_source:
|
||||
raise ValueError("missing required field: model_source")
|
||||
|
||||
if not isinstance(num_replicas, int) or num_replicas < 1:
|
||||
raise ValueError("num_replicas must be an integer >= 1")
|
||||
if not isinstance(gpus_per_replica, int) or gpus_per_replica < 1:
|
||||
raise ValueError("gpus_per_replica must be an integer >= 1")
|
||||
|
||||
if engine_kwargs is not None and not isinstance(engine_kwargs, dict):
|
||||
raise ValueError("engine_kwargs must be a mapping when provided")
|
||||
|
||||
return ServingSpec(
|
||||
model_id=model_id,
|
||||
model_source=model_source,
|
||||
num_replicas=num_replicas,
|
||||
gpus_per_replica=gpus_per_replica,
|
||||
engine_kwargs=engine_kwargs,
|
||||
)
|
||||
|
||||
|
||||
def resolve_serving_spec(*, spec: ServingSpec, user_id: str, now_utc: datetime | None = None) -> ResolvedServingSpec:
|
||||
validate_model_id_suffix(spec.model_id)
|
||||
prefix = make_model_id_prefix(user_id=user_id, now_utc=now_utc)
|
||||
full_model_id = f"{prefix}-{spec.model_id}"
|
||||
|
||||
resolved_source = expand_home_macros(user_id=user_id, text=spec.model_source)
|
||||
validate_model_source_path(user_id=user_id, model_source=resolved_source)
|
||||
|
||||
return ResolvedServingSpec(
|
||||
user_id=user_id,
|
||||
model_id_suffix=spec.model_id,
|
||||
model_id_prefix=prefix,
|
||||
model_id=full_model_id,
|
||||
model_source=resolved_source,
|
||||
num_replicas=spec.num_replicas,
|
||||
gpus_per_replica=spec.gpus_per_replica,
|
||||
engine_kwargs=spec.engine_kwargs,
|
||||
)
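
End to end, the parse/resolve pair behaves roughly as in the sketch below; the timestamp is pinned only to make the expected prefix deterministic, and the inputs mirror the WebUI example:

```python
# Sketch of the parse -> resolve flow; the fixed timestamp is an example value.
from datetime import datetime, timezone

from argus.service.serving_spec import parse_serving_spec, resolve_serving_spec

spec = parse_serving_spec({
    "model_id": "qwen-0.5b",
    "model_source": "$HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha",
    "num_replicas": 1,
    "gpus_per_replica": 1,
})
resolved = resolve_serving_spec(
    spec=spec,
    user_id="alice",
    now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc),
)
assert resolved.model_id == "alice-202601061235-qwen-0.5b"
assert resolved.model_source.startswith("/private/hf/")

# Paths outside /private/hf or the caller's own /private/users/<user_id> are rejected.
try:
    resolve_serving_spec(
        spec=parse_serving_spec({"model_id": "x", "model_source": "/private/users/bob/models/evil"}),
        user_id="alice",
    )
except PermissionError:
    pass  # expected
```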
|
||||
@@ -112,6 +112,7 @@ def _nav(active: str) -> str:
|
||||
links = [
|
||||
("login", "/ui/login", "Login"),
|
||||
("tasks", "/ui/tasks", "Tasks"),
|
||||
("serving", "/ui/serving", "Serving"),
|
||||
("new", "/ui/tasks/new", "New Task"),
|
||||
("data", "/ui/data", "Data"),
|
||||
("admin", "/ui/admin", "Admin"),
|
||||
@@ -992,6 +993,253 @@ refresh();
|
||||
""".strip()
|
||||
return HTMLResponse(content=_page(f"Logs {task_id}", "tasks", body, script))
|
||||
|
||||
@app.get("/ui/serving")
|
||||
async def ui_serving() -> HTMLResponse:
|
||||
body = """
|
||||
<h1>Serving</h1>
|
||||
<div class="card">
|
||||
<div class="row">
|
||||
<button class="btn" id="refresh">Refresh</button>
|
||||
<a class="btn" href="/ui/serving/new" style="display:inline-block">New Model</a>
|
||||
<a class="btn" id="openai-models" target="_blank" rel="noopener" href="#">OpenAI /v1/models</a>
|
||||
</div>
|
||||
<div style="height:10px"></div>
|
||||
<div id="out" class="muted">Loading...</div>
|
||||
</div>
|
||||
""".strip()
|
||||
script = """
|
||||
document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265);
|
||||
document.getElementById("openai-models").href = curOriginWithPort(8000) + "/v1/models";
|
||||
const out = document.getElementById("out");
|
||||
|
||||
function pill(state) {
|
||||
const s = String(state || "");
|
||||
if (s === "RUNNING") return `<span class="pill ok">${s}</span>`;
|
||||
if (s === "FAILED") return `<span class="pill bad">${s}</span>`;
|
||||
return `<span class="pill">${s}</span>`;
|
||||
}
|
||||
|
||||
async function refresh() {
|
||||
out.textContent = "Loading...";
|
||||
try {
|
||||
const lim = 50;
|
||||
const off = Number(localStorage.getItem("mvp_serving_offset") || "0") || 0;
|
||||
const resp = await apiJson("/api/v2/serve/models?limit=" + lim + "&offset=" + off + "&include_deleted=0");
|
||||
const items = resp.items || [];
|
||||
const hasMore = !!resp.has_more;
|
||||
const pageNo = Math.floor(off / lim) + 1;
|
||||
const prevDisabled = off <= 0;
|
||||
const nextDisabled = !hasMore;
|
||||
|
||||
function row(m) {
|
||||
return `<tr>
|
||||
<td><a href="/ui/serving/${m.model_key}">${m.model_key}</a></td>
|
||||
<td><code>${m.model_id}</code></td>
|
||||
<td>${pill(m.state)}</td>
|
||||
<td>${m.num_replicas} × ${m.gpus_per_replica} GPU</td>
|
||||
<td>${m.updated_at || ""}</td>
|
||||
</tr>`;
|
||||
}
|
||||
const rows = items.map(row).join("");
|
||||
|
||||
out.innerHTML = `
|
||||
<div class="row" style="justify-content: space-between; margin-bottom: 8px;">
|
||||
<div class="muted">OpenAI base: <code>${resp.openai_base_url || curOriginWithPort(8000) + "/v1"}</code></div>
|
||||
<div class="row">
|
||||
<span class="muted">Page ${pageNo}</span>
|
||||
<button class="btn" id="prev" ${prevDisabled ? "disabled" : ""}>Prev</button>
|
||||
<button class="btn" id="next" ${nextDisabled ? "disabled" : ""}>Next</button>
|
||||
</div>
|
||||
</div>
|
||||
<table>
|
||||
<thead><tr><th>Model Key</th><th>Model ID</th><th>State</th><th>Resources</th><th>Updated</th></tr></thead>
|
||||
<tbody>${rows || "<tr><td colspan=5 class=muted>(none)</td></tr>"}</tbody>
|
||||
</table>
|
||||
`;
|
||||
|
||||
const prevBtn = document.getElementById("prev");
|
||||
const nextBtn = document.getElementById("next");
|
||||
if (prevBtn) prevBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(Math.max(0, off - lim))); refresh(); };
|
||||
if (nextBtn) nextBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(off + lim)); refresh(); };
|
||||
} catch (e) {
|
||||
let text = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
|
||||
if (e.body && String(e.body).includes("serving is not enabled")) {
|
||||
text = "Serving is not enabled in server config.\\nAsk admin to enable `serving:` in dev.yaml.";
|
||||
}
|
||||
out.textContent = text;
|
||||
}
|
||||
}
|
||||
|
||||
document.getElementById("refresh").onclick = refresh;
|
||||
refresh();
|
||||
""".strip()
|
||||
return HTMLResponse(content=_page("Serving", "serving", body, script))
|
||||
|
||||
@app.get("/ui/serving/new")
|
||||
async def ui_serving_new() -> HTMLResponse:
|
||||
example = """# ServingSpec (YAML)
|
||||
# Notes:
|
||||
# - model_id: the suffix only (the platform prepends <user_id>-<YYYYMMDDHHMM>- automatically)
|
||||
# - model_source: local model path ($HOME macros supported; prefer $HOME/common/hf, which points at the shared HF cache)
|
||||
#
|
||||
# Common paths:
|
||||
# - $HOME/common/hf -> /private/hf
|
||||
# - $HOME -> /private/users/<user_id>
|
||||
#
|
||||
model_id: qwen-0.5b
|
||||
model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<SNAPSHOT_HASH>
|
||||
num_replicas: 1
|
||||
gpus_per_replica: 1
|
||||
|
||||
# engine_kwargs: # optional: passed through to vLLM
|
||||
# gpu_memory_utilization: 0.4
|
||||
""".strip()
|
||||
body = f"""
|
||||
<h1>New Model</h1>
|
||||
<div class="card">
|
||||
<div class="muted">Paste ServingSpec YAML and submit to <code>/api/v2/serve/models</code>.</div>
|
||||
<div style="height:10px"></div>
|
||||
<textarea id="yaml" rows="14">{html.escape(example)}</textarea>
|
||||
<div style="height:10px"></div>
|
||||
<div class="row">
|
||||
<button class="btn" id="submit">Submit</button>
|
||||
<a class="btn" href="/ui/serving" style="display:inline-block">Back</a>
|
||||
</div>
|
||||
<div style="height:10px"></div>
|
||||
<pre id="out" class="muted"></pre>
|
||||
</div>
|
||||
""".strip()
|
||||
script = """
|
||||
document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265);
|
||||
const out = document.getElementById("out");
|
||||
document.getElementById("submit").onclick = async () => {
|
||||
out.textContent = "Submitting...";
|
||||
const yaml = document.getElementById("yaml").value || "";
|
||||
try {
|
||||
const resp = await apiJson("/api/v2/serve/models", { method: "POST", headers: { "Content-Type": "application/yaml" }, body: yaml });
|
||||
out.textContent = "Created: " + resp.model_key + "\\nState: " + resp.state;
|
||||
if (resp.model_key) window.location.href = "/ui/serving/" + encodeURIComponent(resp.model_key);
|
||||
} catch (e) {
|
||||
out.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
|
||||
}
|
||||
};
|
||||
""".strip()
|
||||
return HTMLResponse(content=_page("New Model", "serving", body, script))
|
||||
|
||||
@app.get("/ui/serving/{model_key}")
|
||||
async def ui_serving_detail(model_key: str) -> HTMLResponse:
|
||||
body = f"""
|
||||
<h1>Model</h1>
|
||||
<div class="card">
|
||||
<div class="row" style="justify-content: space-between;">
|
||||
<div class="muted">model_key: <code>{html.escape(model_key)}</code></div>
|
||||
<div class="row">
|
||||
<a class="btn" href="/ui/serving" style="display:inline-block">Back</a>
|
||||
<a class="btn" id="openai-models" target="_blank" rel="noopener" href="#">OpenAI /v1/models</a>
|
||||
</div>
|
||||
</div>
|
||||
<div style="height:10px"></div>
|
||||
<div class="row">
|
||||
<label class="muted" style="min-width:120px">Scale replicas</label>
|
||||
<input id="replicas" type="number" min="1" step="1" value="1" style="max-width: 180px" />
|
||||
<button class="btn" id="scale">Apply</button>
|
||||
<button class="btn danger" id="delete">Delete</button>
|
||||
</div>
|
||||
<div style="height:10px"></div>
|
||||
<div id="meta" class="muted">Loading...</div>
|
||||
<div style="height:12px"></div>
|
||||
<h3 style="margin-top:0">Resolved Spec (YAML)</h3>
|
||||
<pre id="spec" class="muted">(loading)</pre>
|
||||
<div style="height:12px"></div>
|
||||
<h3 style="margin-top:0">Events</h3>
|
||||
<div id="events" class="muted">(loading)</div>
|
||||
<div style="height:12px"></div>
|
||||
<h3 style="margin-top:0">OpenAI Example</h3>
|
||||
<pre id="example" class="muted">(loading)</pre>
|
||||
</div>
|
||||
""".strip()
|
||||
script = f"""
|
||||
document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265);
|
||||
document.getElementById("openai-models").href = curOriginWithPort(8000) + "/v1/models";
|
||||
const modelKey = {json.dumps(model_key)};
|
||||
const meta = document.getElementById("meta");
|
||||
const spec = document.getElementById("spec");
|
||||
const eventsEl = document.getElementById("events");
|
||||
const example = document.getElementById("example");
|
||||
const replicas = document.getElementById("replicas");
|
||||
|
||||
function pill(state) {{
|
||||
const s = String(state || "");
|
||||
if (s === "RUNNING") return `<span class="pill ok">${{s}}</span>`;
|
||||
if (s === "FAILED") return `<span class="pill bad">${{s}}</span>`;
|
||||
return `<span class="pill">${{s}}</span>`;
|
||||
}}
|
||||
|
||||
function renderEvents(events) {{
|
||||
if (!events || !events.length) return "<div class=muted>(none)</div>";
|
||||
const rows = events.map(e => {{
|
||||
const payload = (e.payload_json || "");
|
||||
const short = String(payload).length > 240 ? String(payload).slice(0, 240) + "..." : String(payload);
|
||||
return `<tr><td>${{e.created_at || ""}}</td><td><code>${{e.event_type}}</code></td><td><pre class=muted style=\\"margin:0\\">${{short}}</pre></td></tr>`;
|
||||
}}).join("");
|
||||
return `<table><thead><tr><th>Time</th><th>Type</th><th>Payload</th></tr></thead><tbody>${{rows}}</tbody></table>`;
|
||||
}}
|
||||
|
||||
async function refresh() {{
|
||||
meta.textContent = "Loading...";
|
||||
spec.textContent = "(loading)";
|
||||
eventsEl.textContent = "(loading)";
|
||||
example.textContent = "(loading)";
|
||||
try {{
|
||||
const obj = await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey));
|
||||
const m = obj.model || {{}};
|
||||
replicas.value = String(m.num_replicas || 1);
|
||||
meta.innerHTML = `
|
||||
<div class=row>
|
||||
<div>state: ${{pill(m.state)}}</div>
|
||||
<div class=muted>model_id: <code>${{m.model_id || ""}}</code></div>
|
||||
<div class=muted>source: <code>${{m.model_source || ""}}</code></div>
|
||||
</div>
|
||||
<div class=muted>endpoint: <code>${{(m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/v1")}}</code></div>
|
||||
`;
|
||||
spec.textContent = obj.resolved_spec_yaml || "";
|
||||
eventsEl.innerHTML = renderEvents(obj.events || []);
|
||||
const base = (m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/v1");
|
||||
const mid = m.model_id || "";
|
||||
example.textContent = `curl -sS -H 'Content-Type: application/json' -H 'Authorization: Bearer FAKE_KEY' \\\\\\n -X POST ${{base}}/chat/completions \\\\\\n --data-binary '{{\\"model\\":\\"${{mid}}\\",\\"messages\\":[{{\\"role\\":\\"user\\",\\"content\\":\\"hello\\"}}],\\"max_tokens\\":16,\\"stream\\":false}}' | python3 -m json.tool`;
|
||||
}} catch (e) {{
|
||||
meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
|
||||
spec.textContent = "";
|
||||
eventsEl.textContent = "";
|
||||
example.textContent = "";
|
||||
}}
|
||||
}}
|
||||
|
||||
document.getElementById("scale").onclick = async () => {{
|
||||
const n = Number(replicas.value || "1");
|
||||
if (!Number.isFinite(n) || n < 1) return;
|
||||
try {{
|
||||
await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "PATCH", headers: {{ "Content-Type": "application/json" }}, body: JSON.stringify({{ num_replicas: n }}) }});
|
||||
await refresh();
|
||||
}} catch (e) {{
|
||||
meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
|
||||
}}
|
||||
}};
|
||||
|
||||
document.getElementById("delete").onclick = async () => {{
|
||||
if (!confirm("Delete this model?")) return;
|
||||
try {{
|
||||
await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "DELETE" }});
|
||||
await refresh();
|
||||
}} catch (e) {{
|
||||
meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
|
||||
}}
|
||||
}};
|
||||
|
||||
refresh();
|
||||
""".strip()
|
||||
return HTMLResponse(content=_page("Model", "serving", body, script))
|
||||
|
||||
@app.get("/ui/data")
|
||||
async def ui_data() -> HTMLResponse:
|
||||
body = """
|
||||
|
||||
282
src/mvp/py/tests/test_app_serving_api.py
Normal file
@@ -0,0 +1,282 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
|
||||
def _write_config(tmp_path: Path) -> Path:
|
||||
cfg = {
|
||||
"ray": {
|
||||
"address": "http://127.0.0.1:8265",
|
||||
"shared_root": "/private",
|
||||
"entrypoint_resources": {"worker_node": 1},
|
||||
"runtime_env": {"env_vars": {}},
|
||||
},
|
||||
"data": {
|
||||
"user_root": str(tmp_path / "users"),
|
||||
},
|
||||
"service": {
|
||||
"api": {"host": "127.0.0.1", "port": 0},
|
||||
"auth": {"token_env": "MVP_INTERNAL_TOKEN"},
|
||||
"sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")},
|
||||
"scheduler": {"tick_s": 1, "retry_interval_s": 1, "max_running_tasks": 1},
|
||||
},
|
||||
"serving": {
|
||||
"serve": {"http_port": 8000, "proxy_location": "HeadOnly"},
|
||||
"llm": {"accelerator_type": "H20"},
|
||||
},
|
||||
}
|
||||
p = tmp_path / "cfg.yaml"
|
||||
p.write_text(yaml.safe_dump(cfg), encoding="utf-8")
|
||||
return p
|
||||
|
||||
|
||||
def test_serving_api_crud_flow(tmp_path: Path, monkeypatch):
|
||||
from argus.service import app as app_mod
|
||||
|
||||
cfg_path = _write_config(tmp_path)
|
||||
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
|
||||
|
||||
class _Scheduler:
|
||||
def __init__(self, **kwargs):
|
||||
self.tool = object()
|
||||
|
||||
def run_forever(self, stop_flag):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
|
||||
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
|
||||
|
||||
app = app_mod.create_app(str(cfg_path))
|
||||
|
||||
admin_headers = {"authorization": "Bearer admin-token"}
|
||||
with TestClient(app) as c:
|
||||
r = c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
|
||||
assert r.status_code == 200
|
||||
r2 = c.post("/api/v2/users/alice/tokens", headers=admin_headers)
|
||||
assert r2.status_code == 200
|
||||
user_token = r2.json()["token"]
|
||||
|
||||
headers = {"authorization": f"Bearer {user_token}"}
|
||||
|
||||
spec_yaml = (
|
||||
"model_id: qwen-0.5b\n"
|
||||
"model_source: $HOME/common/hf/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha\n"
|
||||
"num_replicas: 1\n"
|
||||
"gpus_per_replica: 1\n"
|
||||
)
|
||||
r3 = c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
|
||||
assert r3.status_code == 200
|
||||
assert r3.json()["model_key"] == "mk-alice"
|
||||
assert r3.json()["state"] == "QUEUED"
|
||||
|
||||
r4 = c.get("/api/v2/serve/models?limit=10&offset=0", headers=headers)
|
||||
assert r4.status_code == 200
|
||||
obj = r4.json()
|
||||
assert obj["openai_base_url"] == "http://testserver:8000/v1"
|
||||
assert len(obj["items"]) == 1
|
||||
assert obj["items"][0]["model_key"] == "mk-alice"
|
||||
|
||||
r5 = c.get("/api/v2/serve/models/mk-alice", headers=headers)
|
||||
assert r5.status_code == 200
|
||||
detail = r5.json()
|
||||
assert detail["model"]["model_key"] == "mk-alice"
|
||||
assert "model_id_prefix" in detail["model"]
|
||||
assert "resolved_spec_yaml" in detail
|
||||
assert isinstance(detail.get("events"), list)
|
||||
|
||||
r6 = c.patch("/api/v2/serve/models/mk-alice", headers=headers, json={"num_replicas": 2})
|
||||
assert r6.status_code == 200
|
||||
assert r6.json()["state"] == "QUEUED"
|
||||
|
||||
r7 = c.delete("/api/v2/serve/models/mk-alice", headers=headers)
|
||||
assert r7.status_code == 200
|
||||
assert r7.json()["state"] == "DELETING"
|
||||
|
||||
# Admin status endpoint
|
||||
r8 = c.get("/api/v2/serve/status", headers=admin_headers)
|
||||
assert r8.status_code == 200
|
||||
assert r8.json()["http_port"] == 8000
|
||||
|
||||
|
||||
def test_serving_api_rejects_path_outside_user_and_hf(tmp_path: Path, monkeypatch):
|
||||
from argus.service import app as app_mod
|
||||
|
||||
cfg_path = _write_config(tmp_path)
|
||||
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
|
||||
|
||||
class _Scheduler:
|
||||
def __init__(self, **kwargs):
|
||||
self.tool = object()
|
||||
|
||||
def run_forever(self, stop_flag):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
|
||||
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
|
||||
|
||||
app = app_mod.create_app(str(cfg_path))
|
||||
|
||||
admin_headers = {"authorization": "Bearer admin-token"}
|
||||
with TestClient(app) as c:
|
||||
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
|
||||
r2 = c.post("/api/v2/users/alice/tokens", headers=admin_headers)
|
||||
user_token = r2.json()["token"]
|
||||
headers = {"authorization": f"Bearer {user_token}"}
|
||||
|
||||
spec_yaml = (
|
||||
"model_id: x\n"
|
||||
"model_source: /private/users/bob/models/evil\n"
|
||||
"num_replicas: 1\n"
|
||||
"gpus_per_replica: 1\n"
|
||||
)
|
||||
r3 = c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
|
||||
assert r3.status_code == 403
|
||||
|
||||
|
||||
def test_serving_api_invalid_yaml_and_non_mapping(tmp_path: Path, monkeypatch):
|
||||
from argus.service import app as app_mod
|
||||
|
||||
cfg_path = _write_config(tmp_path)
|
||||
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
|
||||
|
||||
class _Scheduler:
|
||||
def __init__(self, **kwargs):
|
||||
self.tool = object()
|
||||
|
||||
def run_forever(self, stop_flag):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
|
||||
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
|
||||
app = app_mod.create_app(str(cfg_path))
|
||||
|
||||
with TestClient(app) as c:
|
||||
# Create a user token
|
||||
admin_headers = {"authorization": "Bearer admin-token"}
|
||||
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
|
||||
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
|
||||
headers = {"authorization": f"Bearer {token}"}
|
||||
|
||||
r = c.post("/api/v2/serve/models", headers=headers, data=": bad\n")
|
||||
assert r.status_code == 400
|
||||
|
||||
r2 = c.post("/api/v2/serve/models", headers=headers, data="- 1\n- 2\n")
|
||||
assert r2.status_code == 400
|
||||
|
||||
|
||||
def test_serving_api_engine_kwargs_binary_rejected(tmp_path: Path, monkeypatch):
|
||||
"""
|
||||
yaml !!binary is parsed as bytes, which is not JSON-serializable.
|
||||
"""
|
||||
from argus.service import app as app_mod
|
||||
|
||||
cfg_path = _write_config(tmp_path)
|
||||
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
|
||||
|
||||
class _Scheduler:
|
||||
def __init__(self, **kwargs):
|
||||
self.tool = object()
|
||||
|
||||
def run_forever(self, stop_flag):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
|
||||
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
|
||||
app = app_mod.create_app(str(cfg_path))
|
||||
|
||||
admin_headers = {"authorization": "Bearer admin-token"}
|
||||
with TestClient(app) as c:
|
||||
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
|
||||
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
|
||||
headers = {"authorization": f"Bearer {token}"}
|
||||
|
||||
spec_yaml = (
|
||||
"model_id: x\n"
|
||||
"model_source: $HOME/common/hf/x\n"
|
||||
"engine_kwargs:\n"
|
||||
" blob: !!binary \"AQID\"\n"
|
||||
)
|
||||
r = c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
|
||||
assert r.status_code == 400
|
||||
|
||||
|
||||
def test_serving_api_list_include_deleted_and_forwarded_base_url(tmp_path: Path, monkeypatch):
|
||||
from argus.service import app as app_mod
|
||||
from argus.service.config import V2Config
|
||||
from argus.service.db import Db
|
||||
|
||||
cfg_path = _write_config(tmp_path)
|
||||
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
|
||||
|
||||
class _Scheduler:
|
||||
def __init__(self, **kwargs):
|
||||
self.tool = object()
|
||||
|
||||
def run_forever(self, stop_flag):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
|
||||
keys = iter(["mk-alice-1", "mk-alice-2"])
|
||||
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: next(keys))
|
||||
|
||||
app = app_mod.create_app(str(cfg_path))
|
||||
|
||||
admin_headers = {"authorization": "Bearer admin-token"}
|
||||
with TestClient(app) as c:
|
||||
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
|
||||
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
|
||||
headers = {"authorization": f"Bearer {token}"}
|
||||
|
||||
spec_yaml = "model_id: x\nmodel_source: $HOME/common/hf/x\n"
|
||||
c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
|
||||
c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
|
||||
|
||||
# Mark one model as DELETED directly in DB (sets deleted_at).
|
||||
root = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
|
||||
v2_cfg = V2Config.from_root_dict(root)
|
||||
db = Db(v2_cfg.sqlite.db_path)
|
||||
db.set_serve_model_state(model_key="mk-alice-2", state="DELETED")
|
||||
|
||||
r1 = c.get(
|
||||
"/api/v2/serve/models?limit=10&offset=0&include_deleted=0",
|
||||
headers={**headers, "x-forwarded-host": "example.com:8080", "x-forwarded-proto": "https"},
|
||||
)
|
||||
assert r1.status_code == 200
|
||||
assert r1.json()["openai_base_url"] == "https://example.com:8000/v1"
|
||||
assert {m["model_key"] for m in r1.json()["items"]} == {"mk-alice-1"}
|
||||
|
||||
r2 = c.get("/api/v2/serve/models?include_deleted=1", headers=headers)
|
||||
assert r2.status_code == 200
|
||||
assert {m["model_key"] for m in r2.json()["items"]} == {"mk-alice-1", "mk-alice-2"}
|
||||
|
||||
|
||||
def test_serving_api_patch_invalid_num_replicas(tmp_path: Path, monkeypatch):
|
||||
from argus.service import app as app_mod
|
||||
|
||||
cfg_path = _write_config(tmp_path)
|
||||
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
|
||||
|
||||
class _Scheduler:
|
||||
def __init__(self, **kwargs):
|
||||
self.tool = object()
|
||||
|
||||
def run_forever(self, stop_flag):
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
|
||||
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: "mk-alice")
|
||||
|
||||
app = app_mod.create_app(str(cfg_path))
|
||||
|
||||
admin_headers = {"authorization": "Bearer admin-token"}
|
||||
with TestClient(app) as c:
|
||||
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
|
||||
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
|
||||
headers = {"authorization": f"Bearer {token}"}
|
||||
|
||||
c.post("/api/v2/serve/models", headers=headers, data="model_id: x\nmodel_source: $HOME/common/hf/x\n")
|
||||
r = c.patch("/api/v2/serve/models/mk-alice", headers=headers, json={"num_replicas": 0})
|
||||
assert r.status_code == 422
|
||||
79
src/mvp/py/tests/test_db_serving.py
Normal file
@@ -0,0 +1,79 @@
from __future__ import annotations

import json
from pathlib import Path


def test_db_serving_model_crud_and_events(tmp_path: Path) -> None:
    from argus.service.db import Db

    db = Db(str(tmp_path / "mvp.sqlite3"))
    db.init()

    m1 = db.create_serve_model(
        model_key="svc-001",
        user_id="alice",
        model_id_suffix="qwen-0.5b",
        model_id_prefix="alice-202601061235",
        model_id="alice-202601061235-qwen-0.5b",
        model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha",
        num_replicas=1,
        gpus_per_replica=1,
        engine_kwargs_json=json.dumps({"max_model_len": 8192}),
        spec_yaml="model_id: qwen-0.5b\nmodel_source: $HOME/common/hf/...\n",
        resolved_spec_yaml="model_id: alice-202601061235-qwen-0.5b\nmodel_source: /private/hf/...\n",
    )
    assert m1["model_key"] == "svc-001"
    assert m1["state"] == "QUEUED"

    # Same suffix may be created again; model_key is the identity.
    m2 = db.create_serve_model(
        model_key="svc-002",
        user_id="alice",
        model_id_suffix="qwen-0.5b",
        model_id_prefix="alice-202601061236",
        model_id="alice-202601061236-qwen-0.5b",
        model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha",
        num_replicas=1,
        gpus_per_replica=2,
        engine_kwargs_json=None,
        spec_yaml="model_id: qwen-0.5b\nmodel_source: $HOME/common/hf/...\n",
        resolved_spec_yaml="model_id: alice-202601061236-qwen-0.5b\nmodel_source: /private/hf/...\n",
    )
    assert m2["model_key"] == "svc-002"
    assert m2["model_id"] != m1["model_id"]

    got = db.get_serve_model("svc-001")
    assert got is not None
    assert got["gpus_per_replica"] == 1

    items = db.list_serve_models(user_id="alice")
    assert {i["model_key"] for i in items} == {"svc-001", "svc-002"}

    # State transition writes a serve event.
    db.set_serve_model_state(model_key="svc-001", state="DEPLOYING")
    got2 = db.get_serve_model("svc-001")
    assert got2 is not None
    assert got2["state"] == "DEPLOYING"

    events = db.list_serve_events("svc-001", limit=50)
    assert len(events) >= 2
    assert {e["event_type"] for e in events}.issuperset({"SERVE_MODEL_CREATED", "SERVE_STATE_UPDATE"})

    # Reconciler pick: QUEUED/DELETING only.
    picked = db.pick_next_runnable_serve_change()
    assert picked is not None
    assert picked["state"] == "QUEUED"

    db.set_serve_model_state(model_key="svc-002", state="DELETING")
    picked2 = db.pick_next_runnable_serve_change()
    assert picked2 is not None
    assert picked2["state"] in ("QUEUED", "DELETING")

    # Deleted models are hidden unless include_deleted.
    db.set_serve_model_state(model_key="svc-002", state="DELETED")
    items2 = db.list_serve_models(user_id="alice", include_deleted=False)
    assert {i["model_key"] for i in items2} == {"svc-001"}
    items3 = db.list_serve_models(user_id="alice", include_deleted=True)
    assert {i["model_key"] for i in items3} == {"svc-001", "svc-002"}
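The Db methods exercised above are not part of this diff. As rough orientation only, a SQLite schema consistent with the columns, states, and events the test touches might look like the sketch below; the table and column names are inferred from the test, not copied from argus.service.db.

```python
# Hypothetical sketch: a SQLite schema compatible with the calls used in
# test_db_serving.py (create_serve_model, set_serve_model_state, list_serve_events).
# Names are inferred from the test assertions, not from the real Db class.
import sqlite3

SCHEMA = """
CREATE TABLE IF NOT EXISTS serve_models (
    model_key          TEXT PRIMARY KEY,
    user_id            TEXT NOT NULL,
    model_id_suffix    TEXT NOT NULL,
    model_id_prefix    TEXT NOT NULL,
    model_id           TEXT NOT NULL,
    model_source       TEXT NOT NULL,
    num_replicas       INTEGER NOT NULL,
    gpus_per_replica   INTEGER NOT NULL,
    engine_kwargs_json TEXT,
    spec_yaml          TEXT NOT NULL,
    resolved_spec_yaml TEXT NOT NULL,
    state              TEXT NOT NULL DEFAULT 'QUEUED',
    error_summary      TEXT,
    created_at         TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deleted_at         TEXT
);
CREATE TABLE IF NOT EXISTS serve_events (
    id         INTEGER PRIMARY KEY AUTOINCREMENT,
    model_key  TEXT NOT NULL,
    event_type TEXT NOT NULL,  -- e.g. SERVE_MODEL_CREATED, SERVE_STATE_UPDATE
    created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
"""

if __name__ == "__main__":
    with sqlite3.connect(":memory:") as conn:
        conn.executescript(SCHEMA)  # sanity-check that the DDL parses
```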
@ -44,3 +44,32 @@ def test_attempt_submission_id_format():

    assert attempt_submission_id("t", 1) == "t--a01"
    assert attempt_submission_id("t", 12) == "t--a12"


def test_new_model_key_includes_user(monkeypatch):
    import argus.core.ids as ids

    class _FakeDatetime:
        @staticmethod
        def now():
            class _DT:
                def strftime(self, fmt: str) -> str:
                    assert fmt == "%Y%m%d-%H%M%S"
                    return "20250101-010203"

            return _DT()

    monkeypatch.setattr(ids, "datetime", _FakeDatetime)
    monkeypatch.setattr(ids.secrets, "token_hex", lambda n: "abcd")

    assert ids.new_model_key(user_id="Alice_01") == "mvp2-alice_01-serve-20250101-010203-abcd"


def test_new_model_key_requires_user_id():
    from argus.core.ids import new_model_key

    try:
        new_model_key(user_id="")
        assert False, "expected ValueError"
    except ValueError as e:
        assert "user_id is required" in str(e)
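For orientation, a minimal implementation consistent with these two tests could be the sketch below; the real argus.core.ids may differ beyond what the assertions pin down.

```python
# Hypothetical sketch of argus.core.ids.new_model_key, reverse-engineered from the
# assertions above; the real module may format the key differently.
import secrets
from datetime import datetime


def new_model_key(*, user_id: str) -> str:
    if not user_id:
        raise ValueError("user_id is required")
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    return f"mvp2-{user_id.lower()}-serve-{ts}-{secrets.token_hex(2)}"


# With datetime/secrets patched as in the test:
# new_model_key(user_id="Alice_01") -> "mvp2-alice_01-serve-20250101-010203-abcd"
```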
78
src/mvp/py/tests/test_llm_config_builder.py
Normal file
@ -0,0 +1,78 @@
from __future__ import annotations

import pytest


def test_build_llm_config_dict_maps_tp_and_bundles():
    from argus.service.serve_llm_config import build_llm_config_dict
    from argus.service.serving_spec import ResolvedServingSpec

    resolved = ResolvedServingSpec(
        user_id="alice",
        model_id_suffix="qwen-0.5b",
        model_id_prefix="alice-202601061235",
        model_id="alice-202601061235-qwen-0.5b",
        model_source="/private/hf/x",
        num_replicas=2,
        gpus_per_replica=4,
        engine_kwargs={"gpu_memory_utilization": 0.9},
    )

    cfg = build_llm_config_dict(
        resolved,
        accelerator_type="H20",
        runtime_env_env_vars={"HF_ENDPOINT": "https://hf-mirror.com"},
        cpu_per_gpu=2.0,
    )
    assert cfg["model_loading_config"]["model_id"] == "alice-202601061235-qwen-0.5b"
    assert cfg["model_loading_config"]["model_source"] == "/private/hf/x"
    assert cfg["accelerator_type"] == "H20"
    assert cfg["deployment_config"]["num_replicas"] == 2

    # gpus_per_replica -> tensor_parallel_size
    assert cfg["engine_kwargs"]["tensor_parallel_size"] == 4
    assert cfg["engine_kwargs"]["gpu_memory_utilization"] == 0.9

    # resources_per_bundle reserves the full TP GPU set for each replica.
    bundle = cfg["resources_per_bundle"]
    assert bundle["GPU"] == 4.0
    assert bundle["CPU"] == 8.0


def test_build_llm_config_dict_injects_hf_offline_defaults():
    from argus.service.serve_llm_config import build_llm_config_dict
    from argus.service.serving_spec import ResolvedServingSpec

    resolved = ResolvedServingSpec(
        user_id="alice",
        model_id_suffix="x",
        model_id_prefix="alice-202601061235",
        model_id="alice-202601061235-x",
        model_source="/private/users/alice/models/x",
        num_replicas=1,
        gpus_per_replica=1,
        engine_kwargs=None,
    )
    cfg = build_llm_config_dict(resolved, accelerator_type="H20", runtime_env_env_vars={})
    env = cfg["runtime_env"]["env_vars"]
    assert env["HF_HUB_OFFLINE"] == "1"
    assert env["HF_HOME"] == "/private/hf"
    assert env["HUGGINGFACE_HUB_CACHE"].startswith("/private/hf/")


def test_build_llm_config_dict_requires_accelerator_type():
    from argus.service.serve_llm_config import build_llm_config_dict
    from argus.service.serving_spec import ResolvedServingSpec

    resolved = ResolvedServingSpec(
        user_id="alice",
        model_id_suffix="x",
        model_id_prefix="alice-202601061235",
        model_id="alice-202601061235-x",
        model_source="/private/hf/x",
        num_replicas=1,
        gpus_per_replica=1,
        engine_kwargs=None,
    )
    with pytest.raises(ValueError, match="accelerator_type is required"):
        build_llm_config_dict(resolved, accelerator_type="", runtime_env_env_vars={})
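build_llm_config_dict itself is not shown in this excerpt. A sketch that would satisfy the assertions above follows; the HF cache paths, the cpu_per_gpu default, and the exact key set are assumptions shaped only by this test file.

```python
# Hypothetical sketch of argus.service.serve_llm_config.build_llm_config_dict,
# shaped only by the assertions in test_llm_config_builder.py.
from typing import Any


def build_llm_config_dict(resolved, *, accelerator_type: str,
                          runtime_env_env_vars: dict[str, str],
                          cpu_per_gpu: float = 2.0) -> dict[str, Any]:
    if not accelerator_type:
        raise ValueError("accelerator_type is required")
    env = {
        "HF_HUB_OFFLINE": "1",           # offline defaults injected for every deployment
        "HF_HOME": "/private/hf",
        "HUGGINGFACE_HUB_CACHE": "/private/hf/hub",
        **runtime_env_env_vars,
    }
    engine_kwargs = dict(resolved.engine_kwargs or {})
    engine_kwargs["tensor_parallel_size"] = resolved.gpus_per_replica  # gpus_per_replica -> TP
    return {
        "model_loading_config": {"model_id": resolved.model_id, "model_source": resolved.model_source},
        "accelerator_type": accelerator_type,
        "deployment_config": {"num_replicas": resolved.num_replicas},
        "engine_kwargs": engine_kwargs,
        "runtime_env": {"env_vars": env},
        # One bundle reserves the full TP GPU set (and proportional CPUs) per replica.
        "resources_per_bundle": {
            "GPU": float(resolved.gpus_per_replica),
            "CPU": float(cpu_per_gpu) * float(resolved.gpus_per_replica),
        },
    }
```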
55
src/mvp/py/tests/test_serve_client.py
Normal file
@ -0,0 +1,55 @@
from __future__ import annotations

import sys
import types


def test_ray_serve_client_calls_start_run_status(monkeypatch):
    import ray  # provided by conftest stub

    calls: list[tuple[str, object]] = []

    def _init(*args, **kwargs):
        calls.append(("ray.init", {"args": args, "kwargs": kwargs}))

    monkeypatch.setattr(ray, "init", _init, raising=False)

    serve = types.ModuleType("ray.serve")

    def _start(**kwargs):
        calls.append(("serve.start", kwargs))
        return None

    def _run(app, name=None, route_prefix=None):
        calls.append(("serve.run", {"app": app, "name": name, "route_prefix": route_prefix}))
        return {"deployed": True}

    def _status():
        calls.append(("serve.status", None))
        return {"ok": True}

    serve.start = _start  # type: ignore[attr-defined]
    serve.run = _run  # type: ignore[attr-defined]
    serve.status = _status  # type: ignore[attr-defined]

    sys.modules["ray.serve"] = serve
    ray.serve = serve  # type: ignore[attr-defined]

    from argus.service.serve_client import RayServeClient

    client = RayServeClient(http_port=8000, proxy_location="HeadOnly", ray_init_address="auto")
    client.ensure_started()
    out = client.apply_app(app="APP", app_name="argus_llm_app", route_prefix="/")
    st = client.get_status()

    assert out == {"deployed": True}
    assert st == {"ok": True}

    # Verify call order and key args.
    assert calls[0][0] == "ray.init"
    assert calls[0][1]["kwargs"].get("ignore_reinit_error") is True
    assert calls[1][0] == "serve.start"
    assert calls[1][1]["http_options"]["port"] == 8000
    assert calls[2][0] == "serve.run"
    assert calls[2][1]["name"] == "argus_llm_app"
    assert calls[3][0] == "serve.status"
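A sketch of a RayServeClient that produces exactly this call sequence is shown below, assuming the thin-wrapper design the test implies; the real argus.service.serve_client may differ in details.

```python
# Hypothetical sketch of argus.service.serve_client.RayServeClient matching the call
# sequence asserted above (ray.init -> serve.start -> serve.run -> serve.status).
from typing import Any


class RayServeClient:
    def __init__(self, *, http_port: int, proxy_location: str = "HeadOnly",
                 ray_init_address: str = "auto") -> None:
        self._http_port = int(http_port)
        self._proxy_location = proxy_location
        self._address = ray_init_address

    def ensure_started(self) -> None:
        import ray
        from ray import serve

        ray.init(address=self._address, ignore_reinit_error=True)
        serve.start(proxy_location=self._proxy_location,
                    http_options={"host": "0.0.0.0", "port": self._http_port})

    def apply_app(self, *, app: Any, app_name: str, route_prefix: str = "/") -> Any:
        from ray import serve

        return serve.run(app, name=app_name, route_prefix=route_prefix)

    def get_status(self) -> Any:
        from ray import serve

        return serve.status()
```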
@ -23,6 +23,7 @@ def test_v2_config_from_root_dict_new_format_defaults():
    assert cfg.sqlite.db_path.endswith(".sqlite3")
    assert cfg.scheduler.max_running_tasks == 3
    assert cfg.tracking.wandb.enabled is False
    assert cfg.serving.enabled is False


def test_v2_config_backward_compat_v2_section_and_default_db_path():
@ -57,6 +58,27 @@ def test_v2_config_requires_data_mappings():
        V2Config.from_root_dict({**base, "data": {"sftpgo": ["x"], "retention": {}}})


def test_v2_config_requires_tracking_and_serving_mappings():
    from argus.service.config import V2Config

    base = {
        "ray": {"shared_root": "/private"},
        "service": {"api": {}, "auth": {}, "sqlite": {}, "scheduler": {}},
        "data": {"sftpgo": {}, "retention": {}},
    }

    with pytest.raises(ValueError, match="config\\.tracking must be a mapping"):
        V2Config.from_root_dict({**base, "tracking": ["nope"]})

    with pytest.raises(ValueError, match="config\\.tracking\\.wandb must be a mapping"):
        V2Config.from_root_dict({**base, "tracking": {"wandb": ["nope"]}})

    with pytest.raises(ValueError, match="config\\.serving must be a mapping"):
        V2Config.from_root_dict({**base, "serving": ["nope"]})

    with pytest.raises(ValueError, match="config\\.serving\\.\\{serve,llm\\} must be mappings"):
        V2Config.from_root_dict({**base, "serving": {"serve": ["x"], "llm": {}}})


def test_tracking_wandb_defaults_disabled():
    from argus.service.config import V2Config

23
src/mvp/py/tests/test_serving_model_id_prefix.py
Normal file
@ -0,0 +1,23 @@
from __future__ import annotations

from datetime import datetime, timezone

import pytest

from argus.service.serving_spec import make_model_id_prefix


def test_make_model_id_prefix_uses_utc_minutes():
    dt = datetime(2026, 1, 6, 12, 35, 59, tzinfo=timezone.utc)
    assert make_model_id_prefix(user_id="alice", now_utc=dt) == "alice-202601061235"


def test_make_model_id_prefix_rejects_empty_user_id():
    with pytest.raises(ValueError, match="user_id is required"):
        make_model_id_prefix(user_id="", now_utc=datetime.now(timezone.utc))


def test_make_model_id_prefix_rejects_slash():
    with pytest.raises(ValueError, match="must not contain"):
        make_model_id_prefix(user_id="bad/user", now_utc=datetime.now(timezone.utc))
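A minimal make_model_id_prefix consistent with the minute-resolution UTC prefix asserted above is sketched here; the real validation may be stricter.

```python
# Hypothetical sketch of make_model_id_prefix, derived from the test expectations.
from datetime import datetime, timezone


def make_model_id_prefix(*, user_id: str, now_utc: datetime) -> str:
    if not user_id:
        raise ValueError("user_id is required")
    if "/" in user_id:
        raise ValueError("user_id must not contain '/'")
    # Seconds are dropped: 2026-01-06 12:35:59 UTC -> "alice-202601061235".
    return f"{user_id}-{now_utc.astimezone(timezone.utc).strftime('%Y%m%d%H%M')}"
```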
207
src/mvp/py/tests/test_serving_reconciler.py
Normal file
@ -0,0 +1,207 @@
from __future__ import annotations

import json
from pathlib import Path


class _FakeServeClient:
    def __init__(self):
        self.started = 0
        self.applied = []
        self.status_calls = 0
        self.fail_apply = False
        self.fail_status = False

    def ensure_started(self) -> None:
        self.started += 1

    def apply_app(self, *, app, app_name: str, route_prefix: str = "/"):
        if self.fail_apply:
            raise RuntimeError("boom")
        self.applied.append({"app": app, "app_name": app_name, "route_prefix": route_prefix})
        return {"ok": True}

    def get_status(self):
        self.status_calls += 1
        if self.fail_status:
            raise RuntimeError("status boom")
        return {"ok": True}


def _seed_model(db, *, model_key: str, user_id: str, state: str, num_replicas: int = 1, gpus_per_replica: int = 1):
    spec_yaml = "model_id: x\nmodel_source: $HOME/common/hf/x\n"
    resolved_yaml = f"user_id: {user_id}\nmodel_id: {user_id}-202601061235-x\n"
    db.create_serve_model(
        model_key=model_key,
        user_id=user_id,
        model_id_suffix="x",
        model_id_prefix=f"{user_id}-202601061235",
        model_id=f"{user_id}-202601061235-x",
        model_source="/private/hf/x",
        num_replicas=num_replicas,
        gpus_per_replica=gpus_per_replica,
        engine_kwargs_json=json.dumps({"gpu_memory_utilization": 0.9}),
        spec_yaml=spec_yaml,
        resolved_spec_yaml=resolved_yaml,
    )
    db.set_serve_model_state(model_key=model_key, state=state, event_type="TEST_SEED")


def test_reconciler_skips_when_no_changes(tmp_path: Path):
    from argus.service.config import V2Config
    from argus.service.db import Db
    from argus.service.serving_reconciler import ServingReconciler

    root = {
        "ray": {"shared_root": "/private"},
        "service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
        "data": {"sftpgo": {}, "retention": {}},
        "serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
    }
    cfg = V2Config.from_root_dict(root)
    db = Db(cfg.sqlite.db_path)
    db.init()

    client = _FakeServeClient()
    rec = ServingReconciler(db=db, v2_cfg=cfg, ray_runtime_env_env_vars={}, serve_client=client, get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})())
    rec.tick()
    assert client.started == 0
    assert client.applied == []


def test_reconciler_pending_resources_no_apply(tmp_path: Path):
    from argus.service.config import V2Config
    from argus.service.db import Db
    from argus.service.serving_reconciler import ServingReconciler

    cfg = V2Config.from_root_dict(
        {
            "ray": {"shared_root": "/private"},
            "service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
            "data": {"sftpgo": {}, "retention": {}},
            "serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
        }
    )
    db = Db(cfg.sqlite.db_path)
    db.init()
    _seed_model(db, model_key="mk1", user_id="alice", state="QUEUED", num_replicas=2, gpus_per_replica=4)

    client = _FakeServeClient()
    rec = ServingReconciler(
        db=db,
        v2_cfg=cfg,
        ray_runtime_env_env_vars={},
        serve_client=client,
        get_available_fn=lambda: type("A", (), {"total_available_gpus": 1, "total_available_npus": 0})(),
    )
    rec.tick()
    # Serve may be started even when resources are insufficient, but apply should not happen.
    assert client.started == 1
    assert client.applied == []
    # State remains QUEUED.
    row = db.get_serve_model("mk1")
    assert row and row["state"] == "QUEUED"
    ev = db.list_serve_events("mk1", limit=50)
    assert any(e["event_type"] == "SERVE_PENDING_RESOURCES" for e in ev)


def test_reconciler_apply_success_marks_running(tmp_path: Path):
    from argus.service.config import V2Config
    from argus.service.db import Db
    from argus.service.serving_reconciler import ServingReconciler

    cfg = V2Config.from_root_dict(
        {
            "ray": {"shared_root": "/private"},
            "service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
            "data": {"sftpgo": {}, "retention": {}},
            "serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
        }
    )
    db = Db(cfg.sqlite.db_path)
    db.init()
    _seed_model(db, model_key="mk1", user_id="alice", state="QUEUED", num_replicas=1, gpus_per_replica=1)

    client = _FakeServeClient()
    rec = ServingReconciler(
        db=db,
        v2_cfg=cfg,
        ray_runtime_env_env_vars={"HF_ENDPOINT": "https://hf-mirror.com"},
        serve_client=client,
        get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})(),
    )
    rec.tick()
    assert client.started == 1
    assert len(client.applied) == 1
    applied = client.applied[0]["app"]["llm_configs"]
    assert applied[0]["engine_kwargs"]["tensor_parallel_size"] == 1
    assert applied[0]["runtime_env"]["env_vars"]["HF_HUB_OFFLINE"] == "1"
    row = db.get_serve_model("mk1")
    assert row and row["state"] == "RUNNING"


def test_reconciler_delete_removes_and_marks_deleted(tmp_path: Path):
    from argus.service.config import V2Config
    from argus.service.db import Db
    from argus.service.serving_reconciler import ServingReconciler

    cfg = V2Config.from_root_dict(
        {
            "ray": {"shared_root": "/private"},
            "service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
            "data": {"sftpgo": {}, "retention": {}},
            "serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
        }
    )
    db = Db(cfg.sqlite.db_path)
    db.init()
    _seed_model(db, model_key="keep", user_id="alice", state="RUNNING", num_replicas=1, gpus_per_replica=1)
    _seed_model(db, model_key="del", user_id="alice", state="DELETING", num_replicas=1, gpus_per_replica=1)

    client = _FakeServeClient()
    rec = ServingReconciler(
        db=db,
        v2_cfg=cfg,
        ray_runtime_env_env_vars={},
        serve_client=client,
        get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})(),
    )
    rec.tick()
    assert len(client.applied) == 1
    cfgs = client.applied[0]["app"]["llm_configs"]
    assert {c["model_loading_config"]["model_id"] for c in cfgs} == {"alice-202601061235-x"}  # only keep remains
    row = db.get_serve_model("del")
    assert row and row["state"] == "DELETED"
    assert row.get("deleted_at")


def test_reconciler_apply_failure_marks_failed(tmp_path: Path):
    from argus.service.config import V2Config
    from argus.service.db import Db
    from argus.service.serving_reconciler import ServingReconciler

    cfg = V2Config.from_root_dict(
        {
            "ray": {"shared_root": "/private"},
            "service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
            "data": {"sftpgo": {}, "retention": {}},
            "serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
        }
    )
    db = Db(cfg.sqlite.db_path)
    db.init()
    _seed_model(db, model_key="mk1", user_id="alice", state="QUEUED")

    client = _FakeServeClient()
    client.fail_apply = True
    rec = ServingReconciler(
        db=db,
        v2_cfg=cfg,
        ray_runtime_env_env_vars={},
        serve_client=client,
        get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})(),
    )
    rec.tick()
    row = db.get_serve_model("mk1")
    assert row and row["state"] == "FAILED"
    assert row.get("error_summary")
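The reconciler itself is outside this excerpt. As a simplified, self-contained sketch of the control flow these tests pin down (Serve startup and the Db are reduced to plain dicts and a callable; this is not the real ServingReconciler):

```python
# Simplified sketch of one reconcile tick, inferred from the five tests above.
from typing import Any, Callable


def tick(models: list[dict[str, Any]], *, available_gpus: int,
         apply_app: Callable[[dict[str, Any]], Any]) -> None:
    changes = [m for m in models if m["state"] in ("QUEUED", "DELETING")]
    if not changes:
        return  # nothing to do; Serve is not even touched

    for m in changes:
        if m["state"] == "QUEUED":
            needed = m["num_replicas"] * m["gpus_per_replica"]
            if needed > available_gpus:
                m["events"].append("SERVE_PENDING_RESOURCES")  # stay QUEUED, no apply
                return

    # The desired app keeps every QUEUED/RUNNING model; DELETING models are simply
    # dropped from the submitted llm_configs.
    desired = [m for m in models if m["state"] in ("QUEUED", "RUNNING")]
    try:
        apply_app({"llm_configs": [m["llm_config"] for m in desired]})
    except Exception as exc:
        for m in changes:
            if m["state"] == "QUEUED":
                m["state"], m["error_summary"] = "FAILED", str(exc)
        return
    for m in changes:
        m["state"] = "RUNNING" if m["state"] == "QUEUED" else "DELETED"
```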
47
src/mvp/py/tests/test_serving_spec_paths.py
Normal file
@ -0,0 +1,47 @@
from __future__ import annotations

from datetime import datetime, timezone

import pytest

from argus.service.serving_spec import ServingSpec, resolve_serving_spec


def test_expand_home_macro_and_validate_user_path_ok():
    spec = ServingSpec(
        model_id="qwen-0.5b",
        model_source="$HOME/models/my_model",
        num_replicas=1,
        gpus_per_replica=1,
    )
    r = resolve_serving_spec(spec=spec, user_id="alice", now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc))
    assert r.model_source == "/private/users/alice/models/my_model"
    assert r.model_id == "alice-202601061235-qwen-0.5b"


def test_expand_common_hf_macro_ok():
    spec = ServingSpec(
        model_id="qwen-0.5b",
        model_source="$HOME/common/hf/hub/models--Qwen--Qwen2.5/snapshots/abc",
        num_replicas=1,
        gpus_per_replica=1,
    )
    r = resolve_serving_spec(spec=spec, user_id="alice", now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc))
    assert r.model_source.startswith("/private/hf/")


@pytest.mark.parametrize(
    "src",
    [
        "/etc/passwd",
        "relative/path",
        "/private/users/bob/models/x",
        "/private/users/alice/../bob/x",
        "/private/common/hf/x",
    ],
)
def test_model_source_path_rejected(src: str):
    spec = ServingSpec(model_id="qwen-0.5b", model_source=src, num_replicas=1, gpus_per_replica=1)
    with pytest.raises((ValueError, PermissionError)):
        resolve_serving_spec(spec=spec, user_id="alice", now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc))
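A sketch of the $HOME macro expansion and path confinement these tests imply is given below; the /private roots come from the expected values above, while the function name and signature are illustrative rather than the real resolve_serving_spec.

```python
# Hypothetical sketch of the model_source expansion rules exercised by
# test_serving_spec_paths.py; not the real implementation.
from pathlib import PurePosixPath


def expand_model_source(model_source: str, *, user_id: str, shared_root: str = "/private") -> str:
    if model_source.startswith("$HOME/common/hf/"):
        # Shared, read-only HF cache under /private/hf.
        candidate = f"{shared_root}/hf/" + model_source[len("$HOME/common/hf/"):]
    elif model_source.startswith("$HOME/"):
        # The requesting user's own subtree under /private/users/<user_id>/.
        candidate = f"{shared_root}/users/{user_id}/" + model_source[len("$HOME/"):]
    elif model_source.startswith(f"{shared_root}/hf/") or model_source.startswith(f"{shared_root}/users/{user_id}/"):
        candidate = model_source
    else:
        raise PermissionError(f"model_source outside allowed roots: {model_source}")
    if ".." in PurePosixPath(candidate).parts or not candidate.startswith("/"):
        raise ValueError(f"invalid model_source path: {model_source}")
    return candidate
```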
72
src/mvp/py/tests/test_serving_spec_validation.py
Normal file
@ -0,0 +1,72 @@
from __future__ import annotations

import pytest

from argus.service.serving_spec import ServingSpec, parse_serving_spec, validate_model_id_suffix


@pytest.mark.parametrize(
    "suffix",
    [
        "a",
        "qwen-0.5b",
        "Qwen2.5-0.5B",
        "a_b",
        "a.b-c",
        "a" * 64,
    ],
)
def test_validate_model_id_suffix_accepts(suffix: str):
    validate_model_id_suffix(suffix)


@pytest.mark.parametrize(
    "suffix",
    [
        "",
        " a",
        "a ",
        "-bad",
        ".bad",
        "bad/",
        "bad..",
        "bad\n",
        "bad\t",
        "a" * 65,
    ],
)
def test_validate_model_id_suffix_rejects(suffix: str):
    with pytest.raises(ValueError):
        validate_model_id_suffix(suffix)


def test_parse_serving_spec_smoke_defaults():
    spec = parse_serving_spec(
        {
            "model_id": "qwen-0.5b",
            "model_source": "/private/hf/x",
        }
    )
    assert isinstance(spec, ServingSpec)
    assert spec.num_replicas == 1
    assert spec.gpus_per_replica == 1
    assert spec.engine_kwargs is None


def test_parse_serving_spec_rejects_missing_fields():
    with pytest.raises(ValueError, match="missing required field: model_id"):
        parse_serving_spec({"model_source": "/private/hf/x"})
    with pytest.raises(ValueError, match="missing required field: model_source"):
        parse_serving_spec({"model_id": "x"})


def test_parse_serving_spec_rejects_bad_types():
    with pytest.raises(ValueError, match="serving spec must be a mapping"):
        parse_serving_spec(["nope"])
    with pytest.raises(ValueError, match="num_replicas"):
        parse_serving_spec({"model_id": "x", "model_source": "/private/hf/x", "num_replicas": 0})
    with pytest.raises(ValueError, match="gpus_per_replica"):
        parse_serving_spec({"model_id": "x", "model_source": "/private/hf/x", "gpus_per_replica": 0})
    with pytest.raises(ValueError, match="engine_kwargs"):
        parse_serving_spec({"model_id": "x", "model_source": "/private/hf/x", "engine_kwargs": "nope"})
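One possible suffix rule that satisfies every accepted and rejected case above is sketched here; the real validator may use a different exact rule.

```python
# Hypothetical sketch of validate_model_id_suffix: 1-64 chars, starts and ends with an
# alphanumeric, only [A-Za-z0-9._-] in between, and no "..".
import re

_SUFFIX_RE = re.compile(r"[A-Za-z0-9](?:[A-Za-z0-9._-]{0,62}[A-Za-z0-9])?")


def validate_model_id_suffix(suffix: str) -> None:
    if not isinstance(suffix, str) or not _SUFFIX_RE.fullmatch(suffix) or ".." in suffix:
        raise ValueError(f"invalid model_id suffix: {suffix!r}")
```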
@ -42,10 +42,13 @@ def test_ui_routes_render_200(tmp_path, monkeypatch):
        "/ui/login",
        "/ui/tasks",
        "/ui/tasks/new",
        "/ui/serving",
        "/ui/serving/new",
        "/ui/data",
        "/ui/admin",
        "/ui/tasks/any-task-id",
        "/ui/tasks/any-task-id/logs",
        "/ui/serving/any-model-key",
    ):
        r = c.get(path, allow_redirects=True)
        assert r.status_code == 200
@ -60,7 +63,7 @@ def test_ui_contains_sidebar_links(tmp_path, monkeypatch):

    r = c.get("/ui/tasks")
    assert r.status_code == 200
    for link in ("/ui/tasks", "/ui/tasks/new", "/ui/data", "/ui/login", "/ui/admin"):
    for link in ("/ui/tasks", "/ui/tasks/new", "/ui/serving", "/ui/data", "/ui/login", "/ui/admin"):
        assert link in r.text
    assert "Ray Dashboard" in r.text

56
src/mvp/py/tests/test_ui_serving.py
Normal file
@ -0,0 +1,56 @@
from __future__ import annotations

from pathlib import Path

from fastapi.testclient import TestClient

from argus.service.app import create_app


def _write_config(tmp_path: Path) -> Path:
    p = tmp_path / "cfg.yaml"
    p.write_text(
        """
ray:
  address: "http://127.0.0.1:8265"
  shared_root: "/private"
  entrypoint_num_cpus: 1
  entrypoint_resources: { worker_node: 1 }
  runtime_env: { env_vars: { PYTHONUNBUFFERED: "1" } }
service:
  api: { host: "127.0.0.1", port: 8080 }
  auth: { token_env: "MVP_INTERNAL_TOKEN" }
  sqlite: { db_path: "%(db)s" }
data:
  user_root: "%(users)s"
  sftpgo: { enabled: false }
  retention: { jobs_trash_after_days: 3, jobs_purge_after_days: 7, janitor_interval_s: 3600 }
serving: {}
"""
        % {"db": str(tmp_path / "mvp.sqlite3"), "users": str(tmp_path / "users")}
    )
    return p


def test_ui_serving_pages_render(tmp_path, monkeypatch):
    cfg = _write_config(tmp_path)
    monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
    app = create_app(str(cfg))
    c = TestClient(app)

    for path in ("/ui/serving", "/ui/serving/new", "/ui/serving/any-model-key"):
        r = c.get(path)
        assert r.status_code == 200
        assert "<html" in r.text.lower()


def test_ui_serving_contains_openai_port_8000(tmp_path, monkeypatch):
    cfg = _write_config(tmp_path)
    monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
    app = create_app(str(cfg))
    c = TestClient(app)

    r = c.get("/ui/serving")
    assert r.status_code == 200
    assert "curOriginWithPort(8000)" in r.text
    assert "/v1/models" in r.text
@ -11,10 +11,11 @@ fi

echo "[host] docker compose up -d (mvp)"
BUILD="${BUILD:-0}"
RAY_NODE_IMAGE="${RAY_NODE_IMAGE:-argus/argus-ray-node:vllm011.latest}"

# If the image isn't present locally, force build once.
if [[ "${BUILD}" != "1" ]]; then
  if ! docker image inspect argus/argus-ray-node:v2.5 >/dev/null 2>&1; then
  if ! docker image inspect "${RAY_NODE_IMAGE}" >/dev/null 2>&1; then
    BUILD="1"
  fi
fi

18
src/mvp/scripts/debug_serve_llm_smoke.sh
Normal file
@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail

container="${MVP_HEAD_CONTAINER:-argus-ray-head}"
model_source="${MODEL_SOURCE:-}"
if [[ -n "${1:-}" ]]; then
  model_source="$1"
fi

argv=(python3 /workspace/mvp/scripts/serve_llm_smoke.py)
if [[ -n "${model_source}" ]]; then
  argv+=(--model-source "${model_source}")
fi
argv+=(--accelerator-type "${ARGUS_ACCELERATOR_TYPE:-H20}")

echo "[host] run Ray Serve LLM smoke test in container: ${container}" >&2
docker exec -it "${container}" bash -lc "$(printf '%q ' "${argv[@]}")"
193
src/mvp/scripts/run_all_v38_serving.sh
Executable file
@ -0,0 +1,193 @@
#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=lib.sh
source "${SCRIPT_DIR}/lib.sh"

API_ADDR="${API_ADDR:-http://127.0.0.1:8080}"
OPENAI_BASE_URL="${OPENAI_BASE_URL:-http://127.0.0.1:8000/v1}"
ADMIN_TOKEN="${MVP_INTERNAL_TOKEN:-}"
USER_ID="${USER_ID:-alice}"
EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers

CONFIG_IN_CONTAINER="${CONFIG_IN_CONTAINER:-/workspace/mvp/configs/dev.yaml}"
SFTPGO_ADMIN_PASSWORD="${SFTPGO_ADMIN_PASSWORD:-my-dev-sftpgo-admin}"
export SFTPGO_ADMIN_PASSWORD

if [[ -z "${ADMIN_TOKEN}" ]]; then
  echo "ERROR: MVP_INTERNAL_TOKEN must be set in host env (admin token)" >&2
  exit 1
fi

api_curl_admin() {
  curl -sS -H "Authorization: Bearer ${ADMIN_TOKEN}" "$@"
}

api_wait_ready() {
  local tries="${1:-60}"
  for i in $(seq 1 "${tries}"); do
    if curl -sS -m 2 "${API_ADDR}/docs" >/dev/null 2>&1; then
      echo "[host] api_ready: ${API_ADDR}"
      return 0
    fi
    echo "[host] waiting api... (${i}/${tries})"
    sleep 2
  done
  echo "ERROR: api not ready: ${API_ADDR}" >&2
  return 1
}

ray_wait_ready() {
  local tries="${1:-60}"
  for i in $(seq 1 "${tries}"); do
    if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}/api/version" >/dev/null 2>&1; then
      echo "[host] ray_dashboard_ready: ${RAY_DASHBOARD_ADDR}"
      return 0
    fi
    echo "[host] waiting ray dashboard... (${i}/${tries})"
    sleep 2
  done
  echo "ERROR: ray dashboard not ready: ${RAY_DASHBOARD_ADDR}" >&2
  return 1
}

ray_wait_nodes() {
  local want="${1:-3}"
  local tries="${2:-60}"
  for i in $(seq 1 "${tries}"); do
    local out n
    out="$(docker exec -i "${HEAD_CONTAINER}" python3 -c "import ray; ray.init(address='auto', ignore_reinit_error=True, log_to_driver=False, logging_level='ERROR'); print(sum(1 for n in ray.nodes() if n.get('Alive')))" 2>/dev/null || true)"
    n="$(printf '%s\n' "${out}" | tail -n 1 | tr -cd '0-9' || true)"
    if [[ "${n}" =~ ^[0-9]+$ ]]; then
      echo "[host] ray_nodes_alive=${n} (want>=${want})"
      if [[ "${n}" -ge "${want}" ]]; then
        return 0
      fi
    else
      echo "[host] waiting ray nodes... (${i}/${tries})"
    fi
    sleep 2
  done
  echo "ERROR: ray nodes not ready (want>=${want})" >&2
  docker exec -i "${HEAD_CONTAINER}" bash -lc "ray status || true" >&2 || true
  return 1
}

openai_wait_ready() {
  local tries="${1:-120}"
  for i in $(seq 1 "${tries}"); do
    if curl -sS -m 2 "${OPENAI_BASE_URL}/models" >/dev/null 2>&1; then
      echo "[host] openai_ready: ${OPENAI_BASE_URL}"
      return 0
    fi
    echo "[host] waiting openai... (${i}/${tries})"
    sleep 2
  done
  echo "ERROR: openai not ready: ${OPENAI_BASE_URL}" >&2
  return 1
}

wait_model_state() {
  local token="$1"
  local model_key="$2"
  local want="$3"
  local tries="${4:-120}"
  for i in $(seq 1 "${tries}"); do
    local body state
    body="$(curl -sS -H "Authorization: Bearer ${token}" "${API_ADDR}/api/v2/serve/models/${model_key}")"
    state="$(printf '%s' "${body}" | python3 -c 'import sys,json; print(json.load(sys.stdin)["model"]["state"])' 2>/dev/null || true)"
    echo "[host] model ${model_key}: ${state}"
    if [[ "${state}" == "${want}" ]]; then
      return 0
    fi
    if [[ "${state}" == "FAILED" ]]; then
      echo "[host] model failed; detail:" >&2
      printf '%s\n' "${body}" | python3 -m json.tool >&2 || true
      return 1
    fi
    sleep 2
  done
  echo "ERROR: model not in state ${want} after timeout" >&2
  return 1
}

echo "[host] ===== run_all_v38_serving.sh begin ====="

"${SCRIPT_DIR}/00_prereq_check.sh"
"${SCRIPT_DIR}/03_cleanup_v1_legacy.sh"
"${SCRIPT_DIR}/04_cleanup_v2_legacy.sh"

echo "[host] bring down existing containers (best-effort)"
"${SCRIPT_DIR}/02_down.sh" || true

echo "[host] (re)create containers (Ray + SFTPGo + W&B)"
# For v3.8, we need the latest ray-node image (ray[llm] deps). Force build once.
BUILD="${BUILD:-1}" "${SCRIPT_DIR}/01_up.sh"

echo "[host] wait ray ready"
ray_wait_ready 60
ray_wait_nodes "${EXPECTED_RAY_NODES}" 120

echo "[host] prepare data/model (best-effort; uses shared caches)"
"${SCRIPT_DIR}/30_prepare_data_and_model.sh" || true

echo "[host] start api"
CONFIG_IN_CONTAINER="${CONFIG_IN_CONTAINER}" MVP_INTERNAL_TOKEN="${ADMIN_TOKEN}" "${SCRIPT_DIR}/60_start_api.sh"
api_wait_ready 60

echo "[host] create user (idempotent)"
api_curl_admin -X POST "${API_ADDR}/api/v2/users" -H "Content-Type: application/json" --data-binary "{\"user_id\":\"${USER_ID}\"}" >/dev/null || true

echo "[host] issue user token"
USER_TOKEN="$(api_curl_admin -X POST "${API_ADDR}/api/v2/users/${USER_ID}/tokens" | python3 -c 'import sys,json; print(json.load(sys.stdin)["token"])')"

echo "[host] resolve local model snapshot path (offline)"
LOCAL_MODEL_PATH="$(dexec "${HEAD_CONTAINER}" bash -lc "python3 -c \"import os; from huggingface_hub import snapshot_download; os.environ.setdefault('HF_HOME','/private/hf'); print(snapshot_download(repo_id='Qwen/Qwen2.5-0.5B-Instruct', local_files_only=True))\" " | tail -n 1)"
if [[ -z "${LOCAL_MODEL_PATH}" || "${LOCAL_MODEL_PATH}" != /* ]]; then
  echo "ERROR: failed to resolve LOCAL_MODEL_PATH: ${LOCAL_MODEL_PATH}" >&2
  exit 1
fi
echo "[host] local_model_path: ${LOCAL_MODEL_PATH}"

echo "[host] submit serving model via API"
SERVE_SPEC=$'model_id: qwen-0.5b\nmodel_source: '"${LOCAL_MODEL_PATH}"$'\nnum_replicas: 1\ngpus_per_replica: 1\n'
CREATE_RESP="$(curl -sS -H "Authorization: Bearer ${USER_TOKEN}" -H "Content-Type: application/yaml" --data-binary "${SERVE_SPEC}" "${API_ADDR}/api/v2/serve/models")"
echo "[host] create_model_resp: ${CREATE_RESP}"
MODEL_KEY="$(printf '%s' "${CREATE_RESP}" | python3 -c 'import sys,json; print(json.load(sys.stdin)["model_key"])')"

echo "[host] wait model RUNNING"
wait_model_state "${USER_TOKEN}" "${MODEL_KEY}" "RUNNING" 300

echo "[host] wait OpenAI ingress ready"
openai_wait_ready 120

echo "[host] verify /v1/models contains model"
MODEL_ID="$(
  curl -sS "${OPENAI_BASE_URL}/models" \
    | python3 -c 'import sys,json; obj=json.load(sys.stdin); print("\n".join([m.get("id","") for m in obj.get("data",[]) if isinstance(m,dict)]))' \
    | grep -E "^${USER_ID}-[0-9]{12}-qwen-0\\.5b$" \
    | head -n1 \
    || true
)"
if [[ -z "${MODEL_ID}" ]]; then
  echo "ERROR: model id not found in /v1/models" >&2
  curl -sS "${OPENAI_BASE_URL}/models" | python3 -m json.tool >&2 || true
  exit 1
fi
echo "[host] model_id: ${MODEL_ID}"

echo "[host] chat completion (best-effort)"
CHAT_RESP="$(curl -sS -H "Content-Type: application/json" -H "Authorization: Bearer FAKE_KEY" -X POST "${OPENAI_BASE_URL}/chat/completions" --data-binary "{\"model\":\"${MODEL_ID}\",\"messages\":[{\"role\":\"user\",\"content\":\"hello\"}],\"max_tokens\":16,\"stream\":false}")"
printf '%s\n' "${CHAT_RESP}" | python3 -m json.tool >/dev/null 2>&1 || {
  echo "ERROR: invalid chat response" >&2
  printf '%s\n' "${CHAT_RESP}" >&2
  exit 1
}
echo "[host] chat_ok"

echo "[host] delete model"
curl -sS -H "Authorization: Bearer ${USER_TOKEN}" -X DELETE "${API_ADDR}/api/v2/serve/models/${MODEL_KEY}" >/dev/null
wait_model_state "${USER_TOKEN}" "${MODEL_KEY}" "DELETED" 300

echo "[host] ===== run_all_v38_serving.sh done ====="
102
src/mvp/scripts/serve_llm_smoke.py
Normal file
@ -0,0 +1,102 @@
from __future__ import annotations

import argparse
import json
import os
import time
import urllib.request
from pathlib import Path
from typing import Any


def _pick_qwen_snapshot() -> str | None:
    base = Path("/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots")
    if not base.exists():
        return None
    snaps = sorted([p for p in base.iterdir() if p.is_dir()], reverse=True)
    return str(snaps[0]) if snaps else None


def _http_get_json(url: str) -> Any:
    with urllib.request.urlopen(url, timeout=10) as resp:
        raw = resp.read().decode("utf-8")
        return json.loads(raw)


def _wait_http_json(url: str, *, timeout_s: int) -> Any:
    deadline = time.time() + float(timeout_s)
    last_err: Exception | None = None
    while time.time() < deadline:
        try:
            return _http_get_json(url)
        except Exception as e:
            last_err = e
            time.sleep(2)
    raise RuntimeError(f"timeout waiting for {url}: {last_err!r}")


def main(argv: list[str] | None = None) -> int:
    ap = argparse.ArgumentParser(description="Ray Serve LLM smoke test (deploy + /v1/models probe).")
    ap.add_argument("--ray-address", default="auto")
    ap.add_argument("--http-port", type=int, default=8000)
    ap.add_argument("--app-name", default="argus_llm_smoke")
    ap.add_argument("--route-prefix", default="/")
    ap.add_argument("--accelerator-type", default=os.environ.get("ARGUS_ACCELERATOR_TYPE") or "H20")
    ap.add_argument("--model-id", default="smoke-qwen-0.5b")
    ap.add_argument("--model-source", default=None, help="Local path or HF id. Default: cached Qwen snapshot under /private/hf.")
    ap.add_argument("--tensor-parallel-size", type=int, default=1)
    ap.add_argument("--num-replicas", type=int, default=1)
    ap.add_argument("--wait-s", type=int, default=600)
    args = ap.parse_args(argv)

    model_source = str(args.model_source or _pick_qwen_snapshot() or "")
    if not model_source:
        raise SystemExit("missing --model-source and no cached Qwen snapshot found under /private/hf")

    # Force offline HF behavior for the smoke test.
    os.environ.setdefault("HF_HOME", "/private/hf")
    os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/private/hf/hub")
    os.environ.setdefault("TRANSFORMERS_CACHE", "/private/hf/transformers")
    os.environ["HF_HUB_OFFLINE"] = "1"

    import ray

    ray.init(address=str(args.ray_address), ignore_reinit_error=True, log_to_driver=False)

    from ray import serve

    try:
        serve.start(proxy_location="HeadOnly", http_options={"host": "0.0.0.0", "port": int(args.http_port)})
    except Exception:
        # Best-effort: Serve may already be running in the container (e.g., started by the MVP API scheduler).
        pass

    from ray.serve.llm import LLMConfig, build_openai_app

    # Build a config dict and filter by the current Ray's LLMConfig schema, since fields
    # may differ between Ray versions.
    cfg_dict: dict[str, Any] = {
        "model_loading_config": {"model_id": str(args.model_id), "model_source": model_source},
        "accelerator_type": str(args.accelerator_type),
        "deployment_config": {"num_replicas": int(args.num_replicas)},
        "engine_kwargs": {"tensor_parallel_size": int(args.tensor_parallel_size)},
        "runtime_env": {"env_vars": {"HF_HUB_OFFLINE": "1", "HF_HOME": "/private/hf"}},
    }
    allowed = set(getattr(LLMConfig, "model_fields", {}).keys())
    if allowed:
        cfg_dict = {k: v for k, v in cfg_dict.items() if k in allowed}

    llm_cfg = LLMConfig(**cfg_dict)
    app = build_openai_app({"llm_configs": [llm_cfg]})

    serve.run(app, name=str(args.app_name), route_prefix=str(args.route_prefix))

    models_url = f"http://127.0.0.1:{int(args.http_port)}/v1/models"
    payload = _wait_http_json(models_url, timeout_s=int(args.wait_s))
    print(json.dumps(payload, indent=2, sort_keys=True))

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
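Once the smoke app is up, the deployment can be probed with any OpenAI-style client. For example, using only the standard library and the script's default port and model id (adjust both if you overrode them):

```python
# Example probe against the smoke deployment started by serve_llm_smoke.py.
import json
import urllib.request

base = "http://127.0.0.1:8000/v1"
body = {
    "model": "smoke-qwen-0.5b",
    "messages": [{"role": "user", "content": "hello"}],
    "max_tokens": 16,
    "stream": False,
}
req = urllib.request.Request(
    f"{base}/chat/completions",
    data=json.dumps(body).encode("utf-8"),
    headers={"Content-Type": "application/json", "Authorization": "Bearer FAKE_KEY"},
)
with urllib.request.urlopen(req, timeout=60) as resp:
    print(json.dumps(json.loads(resp.read().decode("utf-8")), indent=2))
```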