v3.8 model serving deployed successfully

This commit is contained in:
yuyr 2026-01-06 22:43:29 +08:00
parent 63963eba29
commit 686739fea2
39 changed files with 6772 additions and 1830 deletions

File diff suppressed because it is too large

specs/mvp/v3.8/ray_serve.md (new file, 314 lines)

@ -0,0 +1,314 @@
API reference material
https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html
ray.serve.llm.LLMConfig
pydantic model ray.serve.llm.LLMConfig[source]
The configuration for starting an LLM deployment.
PublicAPI (alpha): This API is in alpha and may change before becoming stable.
field accelerator_type: str | None = None
The type of accelerator runs the model on. Only the following values are supported: [V100, P100, T4, P4, K80, A10G, L4, L40S, A100, H100, H200, H20, B200, Intel-GPU-Max-1550, Intel-GPU-Max-1100, Intel-GAUDI, AMD-Instinct-MI100, AMD-Instinct-MI250X, AMD-Instinct-MI250X-MI250, AMD-Instinct-MI210, AMD-Instinct-MI300A, AMD-Instinct-MI300X-OAM, AMD-Instinct-MI300X-HF, AMD-Instinct-MI308X, AMD-Instinct-MI325X-OAM, AMD-Instinct-MI350X-OAM, AMD-Instinct-MI355X-OAM, AMD-Radeon-R9-200-HD-7900, AMD-Radeon-HD-7900, aws-neuron-core, TPU-V2, TPU-V3, TPU-V4, TPU-V5P, TPU-V5LITEPOD, TPU-V6E, Ascend910B, Ascend910B4, MXC500, MXC550, A100-40G, A100-80G]
field callback_config: CallbackConfig [Optional]
Callback configuration to use for model initialization. Can be a string path to a class or a Callback subclass.
field deployment_config: Dict[str, Any] [Optional]
The Ray @serve.deployment options. Supported fields are: name, num_replicas, ray_actor_options, max_ongoing_requests, autoscaling_config, max_queued_requests, user_config, health_check_period_s, health_check_timeout_s, graceful_shutdown_wait_loop_s, graceful_shutdown_timeout_s, logging_config, request_router_config. For more details, see the Ray Serve Documentation.
field engine_kwargs: Dict[str, Any] = {}
Additional keyword arguments for the engine. In case of vLLM, this will include all the configuration knobs they provide out of the box, except for tensor-parallelism which is set automatically from Ray Serve configs.
field experimental_configs: Dict[str, Any] [Optional]
Experimental configurations for Ray Serve LLM. This is a dictionary of key-value pairs. Current supported keys are: - stream_batching_interval_ms: Ray Serve LLM batches streaming requests together. This config decides how long to wait for the batch before processing the requests. Defaults to 50.0. - num_ingress_replicas: The number of replicas for the router. Ray Serve will take the max amount all the replicas. Default would be 2 router replicas per model replica.
field llm_engine: str = 'vLLM'
The LLMEngine that should be used to run the model. Only the following values are supported: [vLLM]
field log_engine_metrics: bool | None = True
Enable additional engine metrics via Ray Prometheus port.
field lora_config: Dict[str, Any] | LoraConfig | None = None
Settings for LoRA adapter. Validated against LoraConfig.
field model_loading_config: Dict[str, Any] | ModelLoadingConfig [Required]
The settings for how to download and expose the model. Validated against ModelLoadingConfig.
field placement_group_config: Dict[str, Any] | None = None
Ray placement group configuration for scheduling vLLM engine workers. Defines resource bundles and placement strategy for multi-node deployments. Should contain bundles (list of resource dicts) and optionally strategy (defaults to PACK). Example: {bundles: [{GPU: 1, CPU: 2}], strategy: PACK}
field runtime_env: Dict[str, Any] | None = None
The runtime_env to use for the model deployment replica and the engine workers.
apply_checkpoint_info(model_id_or_path: str, trust_remote_code: bool = False) → None[source]
Apply the checkpoint info to the model config.
classmethod from_file(path: str, **kwargs) → ModelT
Load a model from a YAML file path.
get_engine_config() → None | VLLMEngineConfig[source]
Returns the engine config for the given LLM config.
LLMConfig not only has engine config but also deployment config, etc.
get_or_create_callback() → CallbackBase | None[source]
Get or create the callback instance for this process.
This ensures one callback instance per process (singleton pattern). The instance is cached so the same object is used across all hooks.
Returns: an instance of a class that implements Callback
multiplex_config() → ServeMultiplexConfig[source]
classmethod parse_yaml(file, **kwargs) → ModelT
setup_engine_backend()[source]
update_engine_kwargs(**kwargs: Any) → None[source]
Update the engine_kwargs and the engine_config engine_kwargs.
This is typically called during engine starts, when certain engine_kwargs (e.g., data_parallel_rank) become available.
validator validate_accelerator_type » accelerator_type[source]
validator validate_deployment_config » deployment_config[source]
Validates the deployment config dictionary.
validator validate_experimental_configs » experimental_configs[source]
Validates the experimental configs dictionary.
validator validate_llm_engine » llm_engine[source]
Validates the llm_engine string value.
validator validate_lora_config » lora_config[source]
Validates the lora config dictionary.
validator validate_model_loading_config » model_loading_config[source]
Validates the model loading config dictionary.
property input_modality: str
Returns the input modality of the model. There could be more types in the future. Right now it assumes that if the model doesn't support vision, it'll be text.
property max_request_context_length: int | None
property model_architecture: str
property model_id: str
property supports_vision: bool
# Python API
Ray Serve API
https://docs.ray.io/en/latest/serve/api/index.html#serve-api
Python API
Writing Applications
serve.Deployment
Class (or function) decorated with the @serve.deployment decorator.
serve.Application
One or more deployments bound with arguments that can be deployed together.
Deployment Decorators
serve.deployment
Decorator that converts a Python class to a Deployment.
serve.ingress
Wrap a deployment class with an ASGI application for HTTP request parsing.
serve.batch
Converts a function to asynchronously handle batches.
serve.multiplexed
Wrap a callable or method used to load multiplexed models in a replica.
Deployment Handles
Note
The deprecated RayServeHandle and RayServeSyncHandle APIs have been fully removed as of Ray 2.10. See the model composition guide for how to update code to use the DeploymentHandle API instead.
serve.handle.DeploymentHandle
A handle used to make requests to a deployment at runtime.
serve.handle.DeploymentResponse
A future-like object wrapping the result of a unary deployment handle call.
serve.handle.DeploymentResponseGenerator
A future-like object wrapping the result of a streaming deployment handle call.
Running Applications
serve.start
Start Serve on the cluster.
serve.run
Run an application and return a handle to its ingress deployment.
serve.delete
Delete an application by its name.
serve.status
Get the status of Serve on the cluster.
serve.shutdown
Completely shut down Serve on the cluster.
serve.shutdown_async
Completely shut down Serve on the cluster asynchronously.
Configurations
serve.config.ProxyLocation
Config for where to run proxies to receive ingress traffic to the cluster.
serve.config.gRPCOptions
gRPC options for the proxies.
serve.config.HTTPOptions
HTTP options for the proxies.
serve.config.AutoscalingConfig
Config for the Serve Autoscaler.
serve.config.AutoscalingPolicy
PublicAPI (alpha): This API is in alpha and may change before becoming stable.
serve.config.AutoscalingContext
Rich context provided to custom autoscaling policies.
serve.config.AggregationFunction
An enumeration.
serve.config.RequestRouterConfig
Config for the Serve request router.
Schemas
serve.schema.ServeActorDetails
Detailed info about a Ray Serve actor.
serve.schema.ProxyDetails
Detailed info about a Ray Serve ProxyActor.
serve.schema.ApplicationStatusOverview
Describes the status of an application and all its deployments.
serve.schema.ServeStatus
Describes the status of Serve.
serve.schema.DeploymentStatusOverview
Describes the status of a deployment.
serve.schema.EncodingType
Encoding type for the serve logs.
serve.schema.AutoscalingMetricsHealth
An enumeration.
serve.schema.AutoscalingStatus
An enumeration.
serve.schema.ScalingDecision
One autoscaling decision with minimal provenance.
serve.schema.DeploymentAutoscalingDetail
Deployment-level autoscaler observability.
serve.schema.ReplicaRank
Replica rank model.
Request Router
serve.request_router.ReplicaID
A unique identifier for a replica.
serve.request_router.PendingRequest
A request that is pending execution by a replica.
serve.request_router.RunningReplica
Contains info on a running replica.
serve.request_router.FIFOMixin
Mixin for FIFO routing.
serve.request_router.LocalityMixin
Mixin for locality routing.
serve.request_router.MultiplexMixin
Mixin for multiplex routing.
serve.request_router.RequestRouter
Abstract interface for a request router (how the router calls it).
Advanced APIs
serve.get_replica_context
Returns the deployment and replica tag from within a replica at runtime.
serve.context.ReplicaContext
Stores runtime context info for replicas.
serve.get_multiplexed_model_id
Get the multiplexed model ID for the current request.
serve.get_app_handle
Get a handle to the application's ingress deployment by name.
serve.get_deployment_handle
Get a handle to a deployment by name.
serve.grpc_util.RayServegRPCContext
Context manager to set and get gRPC context.
serve.exceptions.BackPressureError
Raised when max_queued_requests is exceeded on a DeploymentHandle.
serve.exceptions.RayServeException
serve.exceptions.RequestCancelledError
Raise when a Serve request is cancelled.
serve.exceptions.DeploymentUnavailableError
Raised when a Serve deployment is unavailable to receive requests.
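A minimal sketch tying several of the deployment and handle APIs above together (the deployment class and app name are illustrative; assumes a running local Ray instance):
```python
from ray import serve

@serve.deployment(num_replicas=1)
class Greeter:
    def __call__(self, name: str) -> str:
        return f"Hello, {name}!"

app = Greeter.bind()                      # serve.Application
serve.run(app, name="greeter", route_prefix=None)

handle = serve.get_app_handle("greeter")  # DeploymentHandle to the app's ingress deployment
print(handle.remote("Ray").result())      # DeploymentResponse -> "Hello, Ray!"

serve.delete("greeter")                   # tear the application down
```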


@ -0,0 +1,87 @@
Based on the provided sources, here are the principles and an operational plan for dynamically deploying a **medium-sized LLM** using the **Builder Pattern** with Ray Serve and vLLM.
### I. Core Principles
1. **What counts as a medium-sized LLM**: medium models (e.g., Llama-3.1-70B) typically have around 70B parameters. They usually run on a **single node** using **4 to 8 GPUs**.
2. **Builder Pattern mechanism**: the pattern offers a high-level abstraction through the `build_openai_app` function. Developers only define an `LLMConfig` object, and the underlying `LLMServer` and `OpenAiIngress` components are built and wired together automatically.
3. **High-performance backend (vLLM)**: Ray Serve LLM uses vLLM as the inference engine, providing high-throughput inference and GPU memory management.
4. **Dynamic scaling and resource scheduling**:
   * **Tensor parallelism**: `tensor_parallel_size` distributes the model weights evenly across all GPUs of a single node.
   * **Replica autoscaling**: `autoscaling_config` adjusts `min_replicas` and `max_replicas` dynamically, so the service can add or remove inference replicas based on live traffic.
---
### II. Operational Plan
#### 1. Environment setup
Make sure the required dependencies are installed and a Hugging Face access token is configured (needed for gated models such as Llama-3.1).
```bash
pip install "ray[serve,llm]"
export HF_TOKEN=<YOUR_HUGGINGFACE_TOKEN>
```
#### 2. Write the deployment script (`serve_medium_llm.py`)
Define the configuration with the **Builder Pattern** and build the application. The example below configures a typical 70B model deployment:
```python
# serve_medium_llm.py
from ray.serve.llm import LLMConfig, build_openai_app
import os
llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="my-llama-3.1-70b",
        model_source="meta-llama/Llama-3.1-70B-Instruct",
    ),
    accelerator_type="A100-40G",  # or L40S
    deployment_config=dict(
        autoscaling_config=dict(
            min_replicas=1,  # minimum number of replicas
            max_replicas=4,  # maximum number of replicas, enables dynamic scaling
        )
    ),
    runtime_env=dict(env_vars={"HF_TOKEN": os.environ.get("HF_TOKEN")}),
    engine_kwargs=dict(
        max_model_len=32768,  # context length
        tensor_parallel_size=8,  # split the weights across the 8 GPUs of a single node
    ),
)

# Build the application with the Builder Pattern
app = build_openai_app({"llm_configs": [llm_config]})
```
#### 3. Launch the deployment
Run the following command in a terminal to start the service:
```bash
serve run serve_medium_llm:app
```
Deployment usually takes a few minutes, covering cluster setup, starting the vLLM servers, and downloading the model weights.
#### 4. Send a test request
Once the service is up, it can be accessed through an OpenAI-compatible endpoint.
```python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="FAKE_KEY")
response = client.chat.completions.create(
    model="my-llama-3.1-70b",
    messages=[{"role": "user", "content": "Explain what quantum entanglement is."}],
    stream=True,
)
for chunk in response:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```
---
### III. Performance and Concurrency Tuning
* **Increase concurrency**: lowering `max_model_len` reduces the GPU memory needed for the KV cache and can significantly raise the maximum number of concurrent requests each replica can serve (see the sketch after this list).
* **Monitor metrics**: use the Ray Serve LLM dashboard to track **TTFT (time to first token)**, **TPOT (time per output token)**, and **token throughput** to evaluate service performance.
* **Precision trade-off**: in resource-constrained settings, a **quantized model** (e.g., FP8) reduces the model's memory footprint, leaving more room for the KV cache and thus allowing higher concurrency.
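A hedged sketch of how these knobs could be expressed via `engine_kwargs` (the values are illustrative; whether FP8 quantization is usable depends on the vLLM build and the hardware):
```python
# Illustrative engine_kwargs for higher concurrency; tune to your model and GPUs.
engine_kwargs = dict(
    max_model_len=16384,          # shorter context -> smaller KV cache per request
    gpu_memory_utilization=0.9,   # fraction of GPU memory vLLM may use
    # quantization="fp8",         # optional: shrink weight memory if supported
)
```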
**An analogy**:
Deploying a **medium-sized LLM** is like assembling a complex precision machine (the model weights) in a large workshop. The **Builder Pattern** is your fully automated assembly line: you only set the machine's parameters (the config), and the line fastens the parts and hooks up the power for you. **vLLM with tensor parallelism** is like having 8 skilled workers (the GPUs) lift the heavy machine together, each bearing only its own share of the weight, so the machine runs smoothly.


@ -0,0 +1,8 @@
1. Dynamically launch LLMs via Ray Serve (vLLM backend), supporting multi-model application deployment
2. By default each model has a single replica; users can configure more
3. Users can delete (undeploy) models
4. The number of GPUs per model can be specified
5. Configuration via the WebUI: view the list of currently deployed models and inspect their details
6. The model path can use the common path or a user-specified user path
7.

specs/mvp/v3.8/v3.8_api.md (new file, 224 lines)

@ -0,0 +1,224 @@
# MVP v3.8 API Reference (Serving)
> Note: this section covers the **Model Serving** APIs added in v3.8 (Ray Serve LLM / vLLM).
> Authentication: the Serving management APIs reuse the existing MVP API authentication (`Authorization: Bearer <user_token>`).
> Inference: the public OpenAI endpoint is **unauthenticated** (v3.8 convention).
## 0. Basics
### 0.1 Base URLs
- MVP API server: `http://<host>:8080`
- Ray Serve OpenAI ingress (fixed port 8000): `http://<host>:8000/v1`
### 0.2 Authentication
All `/api/v2/serve/*` endpoints require:
```
Authorization: Bearer <user_token>
```
The `user_token` is issued by an administrator via `/api/v2/users/<user_id>/tokens` (same mechanism as before).
### 0.3 Naming rule: `model_id = user_id-YYYYMMDDHHMM-<suffix>`
- The user fills in `model_id` (semantically a suffix) when submitting, e.g. `qwen-0.5b`
- The platform generates the prefix:
  - `prefix = "<user_id>-<YYYYMMDDHHMM>"`
- The OpenAI model name actually exposed by the platform is:
  - `model_id = "<prefix>-<suffix>"`
- Example: `alice-202601061235-qwen-0.5b`
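A minimal sketch of this naming rule (the helper name `make_model_id` is illustrative, not part of the platform API):
```python
from datetime import datetime, timezone

def make_model_id(user_id: str, suffix: str, now: datetime | None = None) -> str:
    # prefix = "<user_id>-<YYYYMMDDHHMM>", model_id = "<prefix>-<suffix>"
    now = now or datetime.now(timezone.utc)
    prefix = f"{user_id}-{now.strftime('%Y%m%d%H%M')}"
    return f"{prefix}-{suffix}"

# e.g. make_model_id("alice", "qwen-0.5b") -> "alice-202601061235-qwen-0.5b"
```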
## 1. Data Structures
### 1.1 ServingSpec (YAML)
Request bodies should use YAML (consistent with TaskSpec); example:
```yaml
model_id: qwen-0.5b              # required (suffix; the platform adds the user_id- prefix automatically)
model_source: $HOME/common/hf/.../<sha> # required: local path or repo id (the platform applies $HOME macro substitution and path validation)
num_replicas: 1                  # optional, default 1
gpus_per_replica: 1              # optional, default 1
# engine_kwargs:                 # optional: vLLM parameters passed through (allowlist/blocklist decided by the implementation)
# max_model_len: 8192
# gpu_memory_utilization: 0.9
```
Notes:
- `accelerator_type` is not exposed in the ServingSpec; the platform config (`serving.llm.accelerator_type` in `dev.yaml`) injects it uniformly into Ray Serve LLM's `LLMConfig.accelerator_type` (dev/h1: `H20`).
#### Macro substitution
- `$HOME` → `/private/users/<user_id>`
- `$HOME/common/hf` → `/private/hf`
- `$HOME/common/datasets` → `/private/datasets` (not strictly needed for serving, but kept for consistent semantics)
#### Path validation (v3.8 convention)
Allowed `model_source` values:
- `/private/hf/...` (common)
- `/private/users/<user_id>/...` (user)
Rejected:
- Other users' directories
- Paths outside `/private`
- Empty paths or suspicious paths containing `..`
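A sketch of the substitution and validation rules above (the function name is illustrative; the real implementation lives in `serving_spec.py`):
```python
def resolve_model_source(raw: str, user_id: str) -> str:
    # Macro substitution (most specific macros first).
    path = raw.replace("$HOME/common/hf", "/private/hf")
    path = path.replace("$HOME/common/datasets", "/private/datasets")
    path = path.replace("$HOME", f"/private/users/{user_id}")
    # Path validation per the v3.8 convention.
    if not path or ".." in path:
        raise ValueError("empty or suspicious model_source path")
    if not path.startswith(("/private/hf/", f"/private/users/{user_id}/")):
        raise PermissionError("model_source outside the allowed roots")
    return path
```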
### 1.2 ServingModel (response body, JSON)
```json
{
"model_key": "svc-alice-20260106-123000-abcd",
"user_id": "alice",
"model_id": "alice-202601061235-qwen-0.5b",
"model_id_suffix": "qwen-0.5b",
"model_id_prefix": "alice-202601061235",
"model_source": "/private/hf/hub/models--.../snapshots/<sha>",
"num_replicas": 1,
"gpus_per_replica": 1,
"total_gpus": 1,
"state": "RUNNING",
"endpoint": {
"openai_base_url": "http://<host>:8000/v1",
"model": "alice-202601061235-qwen-0.5b"
},
"error_summary": null,
"created_at": "2026-01-06T12:30:00Z",
"updated_at": "2026-01-06T12:31:02Z"
}
```
## 2. Management APIs (MVP API server)
### 2.1 Create / Upsert model
`POST /api/v2/serve/models`
#### Request
- Header: `Content-Type: application/yaml`
- Body: ServingSpec (YAML)
#### Response (202)
```json
{
"model_key": "svc-alice-20260106-123000-abcd",
"state": "QUEUED"
}
```
Semantics:
- Create a new model (if the suffix does not exist)
- Or update an existing model (if the same user already has that suffix): update replicas/GPU settings and enter `QUEUED`, waiting for the reconciler to apply
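A hedged client-side sketch of this call (uses the `requests` package; host, token, and model source are placeholders):
```python
import requests

spec_yaml = """\
model_id: qwen-0.5b
model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<sha>
num_replicas: 1
gpus_per_replica: 1
"""

resp = requests.post(
    "http://<host>:8080/api/v2/serve/models",
    headers={
        "Authorization": "Bearer <user_token>",
        "Content-Type": "application/yaml",
    },
    data=spec_yaml,
)
print(resp.status_code, resp.json())  # per this spec: 202 with {"model_key": ..., "state": "QUEUED"}
```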
### 2.2 List models (current user)
`GET /api/v2/serve/models`
#### Response (200)
```json
{
"items": [ ... ServingModel ... ],
"openai_base_url": "http://<host>:8000/v1"
}
```
### 2.3 Get model detail
`GET /api/v2/serve/models/{model_key}`
#### Response (200)
```json
{
"model": { ... ServingModel ... },
"resolved_spec_yaml": "model_id: ...\nmodel_source: ...\n",
"events": [
{ "event_type": "DEPLOY_REQUESTED", "created_at": "...", "payload": {...} }
],
"serve_status": {
"app_name": "argus_llm_app",
"app_status": "RUNNING"
}
}
```
### 2.4 Scale replicas (PATCH)
`PATCH /api/v2/serve/models/{model_key}`
#### Request (JSON)
```json
{ "num_replicas": 2 }
```
#### Response (200)
```json
{ "model_key": "...", "state": "QUEUED" }
```
> v3.8 only supports changing `num_replicas` (plus optional engine_kwargs); changing `gpus_per_replica` may trigger a redeployment.
### 2.5 Delete / Undeploy model
`DELETE /api/v2/serve/models/{model_key}`
#### Response (200)
```json
{ "model_key": "...", "state": "DELETING" }
```
Semantics: the model is removed from the declarative configuration; on the next tick the reconciler triggers `serve.run(...)` to update the app configuration and eventually makes the model invisible.
### 2.6 Admin: Serve cluster status (optional)
`GET /api/v2/serve/status`
#### Response (200)
Returns a summary of `serve.status()` (cluster level + app level).
> Accessible only with an admin token (reusing the v3.x admin gate)
## 3. Inference APIs (Ray Serve OpenAI ingress)
> No authentication in v3.8: no `Authorization` header is required.
### 3.1 List models
`GET http://<host>:8000/v1/models`
Returns the list of available models (including prefixed names such as `alice-202601061235-qwen-0.5b`).
### 3.2 Chat completions
`POST http://<host>:8000/v1/chat/completions`
```json
{
"model": "alice-202601061235-qwen-0.5b",
"messages": [{"role":"user","content":"Hello"}],
"stream": false
}
```
### 3.3 Completions / Embeddings
Provided according to what the Ray Serve LLM OpenAI ingress supports (v3.8 acceptance covers at least chat).
## 4. Error Code Conventions (MVP API server)
- `400 invalid yaml/spec`: YAML parse failure, missing fields, illegal values
- `403 forbidden`: path violation (model_source points into another user's directory)
- `409 conflict`: model_id_suffix conflict (when the same user creates a duplicate and overwriting is not allowed; not returned if upsert is chosen)
- `422 unprocessable`: illegal resource parameters (replica/gpu <= 0)
- `500 internal`: reconciler/serve call failure (details recorded in `serve_events` and written to `error_summary`)


@ -0,0 +1,371 @@
# MVP v3.8 Detailed Design: Dynamic Model Deployment and Management with Ray Serve (vLLM)
> Baseline: v3.7 capabilities are already in place (training platform + W&B + SFTPGo + WebUI/API + Ray stateless pool; training defaults to rollout=vllm).
> v3.8 goal: on the same Ray cluster, introduce model inference serving via **Ray Serve LLM (vLLM backend)** and manage the model lifecycle dynamically through the WebUI/API.
## 0. Scope of Requirements (from requirements.md)
1) Dynamically launch LLMs via Ray Serve (vLLM backend), supporting **multi-model application** deployment
2) One replica per model by default; users may configure more
3) Users can delete (undeploy) models
4) Users can specify how many GPUs a model uses
5) The WebUI can configure models, list deployed models, and show details
6) The model path may be a common path or a user path (local paths)
## 1. Overall Architecture
### 1.1 Component relationships
v3.8 adds a **Serving subsystem** on top of the existing training platform:
- **API server (existing)**
  - New Serving APIs (model deploy/delete/scale/status)
  - New Serving background thread (reconciler): periodically aligns the DB with the actual Ray Serve state
- **SQLite (existing)**
  - New tables such as `serve_models` and `serve_events`, storing declarative configuration and state
- **Ray cluster (existing stateless pool)**
  - Reuses the existing head/worker containers
  - Runs Ray Serve inside the cluster (controller + proxy + deployments)
- **Ray Serve LLM (new)**
  - Builds an OpenAI-compatible app via `ray.serve.llm.build_openai_app`
  - The app contains multiple `LLMConfig` objects (one per model)
### 1.2 Why a single multi-model application
Ray Serve supports multi-app deployments, but in the dev/docker scenario managing route_prefix for multiple apps is more complex, and the requirements call for multi-model application deployment, so v3.8 uses:
- One fixed app: `argus_llm_app` (name configurable)
- route_prefix fixed to `/` (exposing the `/v1/...` OpenAI endpoints)
- One `LLMConfig` per model, distinguished by `model_id` (i.e., the `model` field of the OpenAI API)
This is the most intuitive setup for users:
- The base_url is fixed: `http://<host>:8000/v1`
- Different models are selected with `model=` (`/v1/models` lists them automatically)
## 2. Ray Serve Deployment Strategy (dev/h1 constraints)
### 2.1 HTTP ingress port and docker compose
Ray Serve's default HTTP port is `8000`. v3.8 conventions:
- Map `8000:8000` on the **head container**
- The API server stays on `8080`
- The Ray Dashboard stays on `8265`
Rationale: in a single-host, multi-container docker environment, running a proxy on every node would make several containers try to bind the same host port (not feasible). Therefore v3.8 recommends:
- Set the Serve proxy location to **HeadOnly** (HTTP ingress only on the head)
- GPU replicas still run on the workers (the proxy only forwards traffic; it does not run inference)
> Caveats:
> - Serve's HTTP configuration (host/port/proxy_location) is a **cluster-global Ray setting** that cannot be changed dynamically after startup, so it should be set once at platform startup and persisted.
> - The proxy actor needs CPU resources: the head node's `num-cpus=0` policy may need a small adjustment in v3.8 (e.g., reserve a few CPUs on the head), while `entrypoint_resources` still keeps training drivers from being scheduled onto the head.
#### 2.1.1 Expected compose changes (to land during v3.8 implementation)
- `src/mvp/docker-compose.yaml` (ray_head) adds:
- `ports: - "8000:8000"`
> Worker containers do not expose 8000 (avoiding host port conflicts); the head proxy is the single external entry point.
### 2.2 Startup/configuration (Python SDK first)
v3.8 uses the Ray Serve Python SDK (a sketch follows this section):
- `ray.init(address="auto")`
- `serve.start(proxy_location="HeadOnly", http_options={"host":"0.0.0.0","port":8000})` (one-time global configuration)
- `serve.run(app, name=<app_name>, route_prefix="/")`
- `serve.delete(name=<app_name>)` (when necessary)
- `serve.status()` to query cluster/application status
Rationale:
- Avoids introducing an extra REST client dependency inside the platform (and reduces the risk of REST schema instability across versions)
- The API server itself runs inside the head container and can connect to the existing cluster directly with `ray.init(address="auto")`
> Alternative: the Ray Dashboard exposes a Serve REST API (`PUT /api/serve/applications/`, etc.) that could serve as a fallback, but v3.8 does not use it as the primary path.
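A minimal sketch of this SDK flow (assumes `ray[serve,llm]` is installed; the model and app values are illustrative):
```python
import ray
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app

ray.init(address="auto", ignore_reinit_error=True)

# One-time, cluster-global HTTP configuration.
serve.start(proxy_location="HeadOnly", http_options={"host": "0.0.0.0", "port": 8000})

llm_config = LLMConfig(
    model_loading_config=dict(
        model_id="alice-202601061235-qwen-0.5b",
        model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<sha>",
    ),
)
app = build_openai_app({"llm_configs": [llm_config]})
serve.run(app, name="argus_llm_app", route_prefix="/")

print(serve.status())  # cluster/application status summary
```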
### 2.3 Dependencies and image assumptions
v3.8 depends on:
- `ray[serve]` (Serve controller/proxy)
- `ray[llm]` (the `ray.serve.llm` module of Ray Serve LLM)
- vLLM (inference engine)
Since v3.7 already switched to `verlai/verl:vllm011.latest`, the image is expected to contain vLLM; whether `ray.serve.llm` works out of the box must be confirmed during implementation.
If it is missing, v3.8 will add `pip install "ray[serve,llm]"` (or the officially recommended minimal dependencies) at `argus-ray-node` image build time, with the version pinned.
### 2.4 Serving configuration (dev.yaml)
v3.8 adds a serving config section containing at least:
```yaml
serving:
  serve:
    http_port: 8000          # fixed at 8000
    proxy_location: HeadOnly # recommended for dev/docker
  llm:
    accelerator_type: H20    # H20 in the dev environment; maps to ray.serve.llm.LLMConfig.accelerator_type
```
Notes:
- `accelerator_type` is Ray Serve LLM's `LLMConfig.accelerator_type` field, expressing which accelerator type the model runs on. In the dev/h1 environment it is fixed to `H20`.
- v3.8 does not expose `accelerator_type` for ordinary users to edit (to avoid misconfiguration); the deployment environment config decides it uniformly.
## 3. Model Configuration and Resource Mapping
### 3.1 Key configuration object: `ray.serve.llm.LLMConfig`
Each model deployment is described by an `LLMConfig`; key fields (the subset v3.8 uses):
- `model_loading_config`
  - `model_id`: the model name shown externally and used in requests (unique key)
  - `model_source`: HF repo id / S3 / **local path**
- `accelerator_type`
  - Read from `serving.llm.accelerator_type` in `dev.yaml` (dev/h1: `H20`)
- `deployment_config`
  - `num_replicas` or `autoscaling_config` (v3.8 uses a fixed `num_replicas` for now)
  - `ray_actor_options` (CPU/resource constraints)
- `engine_kwargs`
  - vLLM parameters (`max_model_len`, `gpu_memory_utilization`, etc.)
- `placement_group_config`
  - Controls the resource bundles used by the vLLM engine workers (for multi-GPU / cross-node)
- `runtime_env`
  - Injects environment variables such as the HF cache and the offline switch
### 3.2 How the GPU count (gpus_per_replica) maps to LLMConfig
v3.8 takes the user input:
- `gpus_per_replica = N`
and maps it to (see the sketch below):
- `engine_kwargs.tensor_parallel_size = N` (single-node/cross-node tensor parallelism, following the official Ray Serve LLM examples)
- `placement_group_config = {"bundles": [{"GPU": 1, "CPU": <cpu_per_gpu>}] * N, "strategy": "PACK"}`
Other vLLM parameters (`max_model_len`, `gpu_memory_utilization`, etc.) stay in `engine_kwargs`.
> Compatibility note: Ray Serve LLM is still evolving quickly; v3.8 will do minimal adaptation and regression testing against the Ray version actually in production.
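A sketch of this mapping as a pure function (the name is illustrative; the real implementation lives in `serve_llm_config.py`):
```python
from typing import Any

def map_gpus_per_replica(n: int, cpu_per_gpu: float = 1.0) -> dict[str, Any]:
    # gpus_per_replica = n -> tensor parallel degree and one single-GPU bundle per GPU.
    return {
        "engine_kwargs": {"tensor_parallel_size": n},
        "placement_group_config": {
            "bundles": [{"GPU": 1, "CPU": cpu_per_gpu} for _ in range(n)],
            "strategy": "PACK",
        },
    }

# map_gpus_per_replica(4) -> tensor_parallel_size=4 and four {GPU: 1, CPU: 1.0} bundles.
```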
### 3.2.1 Cross-node scenario (N > GPUs per node)
Ray Serve LLM uses the `PACK` strategy by default, preferring to place GPU workers on as few nodes as possible; if they do not fit on one node, they automatically spill to other nodes, which enables cross-node tensor-parallel (TP) deployments.
### 3.3 Replica count (num_replicas)
v3.8 default:
- `num_replicas = 1`
Users may set it to `>=1` in the UI;
multiple replicas consume GPUs linearly (`num_replicas * gpus_per_replica`), so a resource pre-check is required.
### 3.4 Model paths and macro substitution (common / user)
v3.8 supports two kinds of model sources:
1) **common**
  - Typically `/private/hf/...` (shared HF cache / snapshots)
2) **user**
  - `/private/users/<user_id>/models/...`
  - As well as user training outputs (e.g., `jobs/<sid>/checkpoints/.../huggingface`)
To keep the UI easy to use, the platform's existing macro semantics are reused:
- `$HOME` → `/private/users/<user_id>`
- `$HOME/common/hf` → `/private/hf`
Path validation is applied:
- Allowed prefixes: `/private/hf` and `/private/users/<user_id>/`
- Rejected: accessing other users' directories or sensitive system paths
### 3.5 Offline mode (avoiding HF mirror 429s)
The training side already confirmed in v3.7 that `HF_HUB_OFFLINE=1` is necessary. The v3.8 serving side injects the same defaults:
- `HF_HOME=/private/hf`
- `HUGGINGFACE_HUB_CACHE=/private/hf/hub`
- `TRANSFORMERS_CACHE=/private/hf/transformers`
- `HF_HUB_OFFLINE=1`
- `HF_ENDPOINT=https://hf-mirror.com` (may be kept, but offline mode should not hit the network)
Users are also encouraged to put a **local path** in `model_source` in the ServingSpec rather than a bare repo id.
## 4. Platform Data Model (SQLite)
Two new primary tables:
### 4.1 `serve_models`
Each row represents a declarative model deployment:
- `model_key` (platform-internal unique ID, convenient for renaming/deduplication)
- `user_id`
- `model_id` (external OpenAI model name, must be unique per app)
- `model_source` (local path or repo id; the resolved result is stored)
- `num_replicas`
- `gpus_per_replica`
- `engine_kwargs_json` (optional)
- `state`: `QUEUED | DEPLOYING | RUNNING | FAILED | DELETING | DELETED`
- `serve_app_name` (default `argus_llm_app`)
- `created_at / updated_at`
- `error_summary`
### 4.2 `serve_events`
Records key events and troubleshooting information (similar to task_events):
- `id`
- `model_key`
- `event_type`: DEPLOY_REQUESTED/DEPLOY_APPLIED/STATUS_SYNC/DELETE_REQUESTED/...
- `payload_json`
- `created_at`
## 5. API Design (new)
Under the existing `Authorization: Bearer <user_token>` authentication scheme, new Serving APIs are added (paths are illustrative; the implementation aligns them with the existing `api/v2`).
### 5.1 User endpoints
- `POST /api/v2/serve/models`
  - body: YAML or JSON (v3.8 starts with YAML, consistent with the existing TaskSpec)
  - Creates/updates (upsert) a model configuration, entering `QUEUED`
- `GET /api/v2/serve/models`
  - Lists the current user's models (with state, resources, endpoint)
- `GET /api/v2/serve/models/{model_key}`
  - Detail: full spec + recent events + Serve status summary
- `PATCH /api/v2/serve/models/{model_key}`
  - Modify `num_replicas`, or optionally engine_kwargs
- `DELETE /api/v2/serve/models/{model_key}`
  - Undeploy the model (enters `DELETING`)
### 5.2 System endpoints (admin)
- `GET /api/v2/serve/status` (admin)
  - Returns a summary of `serve.status()` (cluster level / app level)
### 5.3 External inference endpoint
Fixed and surfaced in the UI/API:
- `openai_base_url = http://<host>:8000/v1`
- Supported:
  - `/v1/chat/completions`
  - `/v1/completions`
  - `/v1/embeddings`
  - `/v1/models`
> v3.8 adds no extra gateway or authentication (consistent with the current dev environment); if needed later, token validation or a reverse proxy can be introduced in v3.9+.
### 5.4 `model_id` prefix policy (user_id-)
To avoid multi-user conflicts while staying readable:
v3.8 uses **user_id + date-hour-minute** as a stable prefix, reducing conflicts and making the creation time easy to spot:
- Users only fill in `model_id_suffix` in the UI/API (the field may still be named `model_id`, but semantically it is the suffix)
- The platform computes the actual external `model_id`:
  - `prefix = f"{user_id}-{YYYYMMDDHHMM}"`
  - `model_id = f"{prefix}-{model_id_suffix}"`
- The list/detail views show all of:
  - `model_id_suffix` (user input)
  - `model_id_prefix` (platform generated, e.g. `alice-202601061235`)
  - `model_id` (external OpenAI name)
## 6. Background Execution Model (Serving Reconciler)
Following the pattern of the task scheduler, v3.8 introduces a lightweight reconciler (a sketch follows this section):
- Tick period (e.g., 5s)
- Each tick:
  1) Fetch models in `QUEUED/DEPLOYING/RUNNING/DELETING` from the DB
  2) Call `serve.status()` to read the current app and deployment states
  3) If there are `QUEUED` models or models needing changes: build a new multi-model app (containing the configs of all `RUNNING/DEPLOYING/QUEUED` models) and `serve.run(...)`
  4) If there are `DELETING` models: remove them from the app configuration and apply the change with `serve.run(...)`
  5) Update each model's state (based on the Serve status)
Important behavioral note (the cost of a multi-model app):
- Every add/delete/replica change triggers a `serve.run(...)` update of the same app;
- Ray Serve tries to update incrementally, but under some versions/configurations this may briefly restart the ingress/router;
- v3.8 accepts this cost for now (closing the requirements loop comes first); if deleting one model must not affect the others later, this can evolve into one app per model with its own route_prefix.
Resource pre-check:
- Before applying, use `ray.available_resources()` for a coarse-grained GPU pre-check:
  - Required total GPUs = `sum(num_replicas * gpus_per_replica)` (more precise if computed only over the delta of new/scaled-up models)
- If insufficient:
  - The model stays `QUEUED` and a `PENDING_RESOURCES` event is recorded
  - The UI shows "insufficient resources, waiting for release"
> v3.8 introduces no more complex preemption/priority. Serving and training compete for GPUs; users must plan resources themselves (or a later version can introduce unified scheduling).
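A sketch of one reconciler tick described above. Method names follow the Db/ServeClient interfaces elsewhere in this spec; `build_app` is a hypothetical helper that turns the kept model rows into a Ray Serve LLM application:
```python
import ray

def tick(db, serve_client, build_app) -> None:
    active = db.list_all_serve_models(include_deleted=False)
    keep = [m for m in active if m["state"] in ("QUEUED", "DEPLOYING", "RUNNING")]
    deleting = [m for m in active if m["state"] == "DELETING"]
    queued = [m for m in keep if m["state"] == "QUEUED"]

    # Coarse GPU pre-check over the queued (not yet applied) models.
    needed = sum(m["num_replicas"] * m["gpus_per_replica"] for m in queued)
    if needed > ray.available_resources().get("GPU", 0.0):
        for m in queued:
            db.append_serve_event(model_key=m["model_key"], event_type="PENDING_RESOURCES")
        return

    # Rebuild the single multi-model app from all kept models and apply it.
    serve_client.apply_app(app=build_app(keep), app_name="argus_llm_app")
    for m in queued:
        db.set_serve_model_state(model_key=m["model_key"], state="DEPLOYING")
    for m in deleting:
        db.set_serve_model_state(model_key=m["model_key"], state="DELETED")
```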
## 7. WebUI Design (new Serving pages)
A new sidebar entry: **Serving**.
### 7.1 Serving list page
- Displayed fields:
  - model_id
  - user_id (admin only)
  - replicas / gpus_per_replica / total_gpus
  - state (RUNNING/DEPLOYING/QUEUED/FAILED)
- Actions: Scale (change replicas), Delete
### 7.2 Serving create/edit page
Two modes (similar to New Task; the YAML mode is enough to start with):
Example YAML (v3.8):
```yaml
model_id: qwen-0.5b
model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<sha>
num_replicas: 1
gpus_per_replica: 1
# engine_kwargs:
# max_model_len: 8192
# gpu_memory_utilization: 0.9
```
### 7.3 Serving detail page
- Full configuration (resolved spec)
- Serve status summary (deployment states, replica health)
- OpenAI usage example (Python openai client)
## 8. Acceptance Criteria (v3.8)
1) Deployment:
  - One-click deployment of a model (1 replica, 1 GPU) succeeds and the state becomes RUNNING
  - `/v1/models` lists the model
2) Scaling:
  - Changing `num_replicas` takes effect (the replica count change is visible in Serve status)
3) Multi-model:
  - Two models (different model_ids) can be deployed in the same app simultaneously
  - Requests with different `model=` values through the OpenAI endpoint get responses
4) Undeploy:
  - After deleting a model, it no longer appears in `/v1/models`
5) Model paths:
  - Both local path types are supported: `/private/hf/...` (common) and `/private/users/<user>/...` (user)
6) Explainable resource shortage:
  - When GPUs are insufficient, the model enters `QUEUED` and the UI/detail view shows an "insufficient resources" hint
## 9. Points to Confirm (please confirm during review)
Confirmed (from review):
1) The inference port is fixed at `8000` (Ray Serve's default port).
2) The public OpenAI endpoint is **not bound to the existing token system** (no inference-side authentication in v3.8).
3) `model_id` naming rule: the platform uniformly adds a `user_id + date-hour-minute` prefix; users only fill in the suffix in the UI.
> Note: this avoids cross-user model_id conflicts while keeping the OpenAI `model=` field naturally readable.


@ -0,0 +1,266 @@
# MVP v3.8 Development Plan (TDD, detailed)
> Goal: on top of v3.7, introduce dynamic model deployment and management with Ray Serve (vLLM) (multi-model, single app), plus a WebUI + API management loop.
> Constraints (confirmed):
> - Inference port fixed at `8000` (Serve HTTP).
> - The inference side does not use the existing token authentication (the public OpenAI endpoint is unauthenticated).
> - External `model_id` is uniformly prefixed: `<user_id>-<YYYYMMDDHHMM>-<suffix>` (users only fill in the suffix).
> - `LLMConfig.accelerator_type` is read from `dev.yaml` (dev/h1: `H20`).
The plan is broken down to a verifiable granularity in a test-first → implement → regress rhythm; each milestone can be accepted independently.
---
## M0 - Baseline and dependency probing (no behavior change)
**Purpose**: confirm the v3.7 baseline is stable and clarify whether the Ray Serve LLM dependencies are already available (otherwise later work gets stuck on images/dependencies).
### M0.1 Local regression
- [ ] `.venv/bin/python -m pytest` passes (coverage ≥ 90%)
### M0.2 Remote regression (h1)
- [ ] `src/mvp/scripts/run_all_v30_api.sh` runs end to end (confirms the training loop has not regressed)
### M0.3 Dependency probing inside the head container (record conclusions)
- [ ] `python3 -c "import ray; import ray.serve; print(ray.__version__)"`
- [ ] `python3 -c "from ray.serve.llm import LLMConfig, build_openai_app; print('serve_llm_ok')"`
- [ ] If it fails (e.g., missing `gymnasium`): record what is missing and resolve it in M6 by adding `ray[llm]`
### M0.4 Configuration probing
- [ ] `configs/dev.yaml` contains:
  - `serving.llm.accelerator_type: H20`
  - `serving.serve.http_port: 8000`
  - `serving.serve.proxy_location: HeadOnly`
**Acceptance**
- No baseline regression; dependency probing conclusions are clear (available/unavailable)
---
## M1 - ServingSpec (parsing/validation/macro substitution/path validation) (unit-test driven)
**Purpose**: lock down the input layer first (shared by API/UI) to avoid repeated schema churn later.
### M1.1 New/extended data models
- [ ] `ServingSpec` (input)
  - `model_id` (suffix)
  - `model_source` (supports the `$HOME` macro)
  - `num_replicas` (default=1)
  - `gpus_per_replica` (default=1)
  - `engine_kwargs` (optional dict; stored in the DB verbatim for now, allowlist/blocklist decided during implementation)
- [ ] `ResolvedServingSpec` (internal)
  - `model_id_suffix`
  - `model_id_prefix` (platform generated: `user_id-YYYYMMDDHHMM`)
  - `model_id` (external: `<prefix>-<suffix>`)
  - `model_source` (resolved path)
### M1.2 Rules (written as pure functions for easy testing; a sketch follows this list)
- [ ] `validate_model_id_suffix(suffix)`: length/charset limits (suggested: `[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}`)
- [ ] `$HOME` macro substitution: `$HOME`, `$HOME/common/hf`, `$HOME/common/datasets`
- [ ] Path validation (local paths enforced):
  - Allowed: `/private/hf/...`, `/private/users/<user_id>/...`
  - Rejected: `..`, empty, other users' paths, paths outside `/private`
- [ ] `make_model_id_prefix(user_id, now_utc)`: `YYYYMMDDHHMM` (UTC) + user_id
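A sketch of the suggested suffix rule above (the regex is the one suggested in M1.2; the real implementation lives in `serving_spec.py`):
```python
import re

_SUFFIX_RE = re.compile(r"[a-zA-Z0-9][a-zA-Z0-9._-]{0,63}")

def validate_model_id_suffix(suffix: str) -> str:
    # Reject anything that is not a short, filesystem/URL-friendly identifier.
    if not _SUFFIX_RE.fullmatch(suffix):
        raise ValueError(f"invalid model_id suffix: {suffix!r}")
    return suffix
```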
### M1.3 Unit tests (write failing cases first, then implement)
- [ ] `test_serving_spec_validation.py`
  - Valid/invalid suffixes
  - replicas/gpus boundaries (0, negative, fractional, very large; whether to cap is an implementation decision)
- [ ] `test_serving_spec_paths.py`
  - `$HOME` substitution is correct
  - Out-of-scope paths return 403/ValueError (mapped at the API layer)
  - Both `/private/hf` and `/private/users/<user>` work
- [ ] `test_serving_model_id_prefix.py`
  - Fixed time input → consistent prefix output (avoids timezone/format issues)
**Acceptance**
- Input spec rules are stable; core validation/substitution is covered by unit tests
---
## M2 - SQLite schema and Db interface (unit-test driven)
**Purpose**: the declarative serving state must be persisted, auditable, and recoverable.
### M2.1 DB schema
- [ ] `serve_models`
  - Primary key: `model_key` (platform generated)
  - Unique: `(user_id, model_id_suffix)` (enables upsert)
  - Stores: the resolved spec (including prefix/full model_id) and the resolved model_source
  - State: `QUEUED/DEPLOYING/RUNNING/FAILED/DELETING/DELETED`
  - `error_summary`
- [ ] `serve_events` (append-only)
### M2.2 Db methods
- [ ] `upsert_serve_model(user_id, spec_yaml, now)` → (model_key, state)
- [ ] `list_serve_models(user_id, include_deleted=False, limit/offset?)`
- [ ] `get_serve_model(model_key)`
- [ ] `set_serve_model_state(model_key, state, error_summary=None)`
- [ ] `append_serve_event(model_key, event_type, payload_json=None)`
- [ ] `pick_next_runnable_serve_change()` (for the reconciler)
### M2.3 Unit tests
- [ ] `test_db_serving.py`
  - Upsert behavior (updating the same suffix either keeps the model_key or creates a new version; the policy must be decided before implementation)
  - State transitions + event recording
  - List filtering and ordering (by updated_at)
**Acceptance**
- DB behavior is predictable; upsert/unique semantics are settled and test-covered
---
## M3 - Serving management API (FastAPI) (unit-test driven)
**Purpose**: get the management API working first (no real Ray Serve yet; the reconciler is wired in later).
### M3.1 API routes (user)
- [ ] `POST /api/v2/serve/models` (Content-Type: application/yaml)
  - Input: ServingSpec YAML
  - Output: `{model_key,state}` (202)
- [ ] `GET /api/v2/serve/models`
  - Returns items + `openai_base_url=http://<host>:8000/v1`
- [ ] `GET /api/v2/serve/models/{model_key}`
  - Returns model + resolved_spec_yaml + events (pagination can come later) + serve_status (empty/placeholder at first)
- [ ] `PATCH /api/v2/serve/models/{model_key}` (JSON)
  - Supports `num_replicas` (minimal loop)
- [ ] `DELETE /api/v2/serve/models/{model_key}`
### M3.2 API routes (admin, optional)
- [ ] `GET /api/v2/serve/status` (admin token only)
### M3.3 Error mapping (must be tested)
- [ ] YAML parse failure: 400
- [ ] Spec validation failure: 422
- [ ] Out-of-scope path: 403
- [ ] Nonexistent model_key: 404
### M3.4 Unit tests
- [ ] `test_app_serving_api.py`
  - Happy path: create → list → get → patch → delete
  - Multi-user isolation: users only see their own models
  - Error code coverage: 400/403/404/422
**Acceptance**
- All management endpoints in the API reference (`v3.8_api.md`) return the expected structure (works even without Serve wired in)
---
## M4 - ServeClient abstraction + LLMConfig builder (unit-test driven)
**Purpose**: pin down how an LLMConfig is constructed from a ResolvedServingSpec, and isolate the Ray Serve dependency inside the client so it can be mocked.
### M4.1 `ServeClient` interface (mockable)
- [ ] `ensure_started(http_port=8000, proxy_location="HeadOnly")`
- [ ] `apply_app(app_name, llm_configs)` (multi-model)
- [ ] `get_status()` (serve.status summary)
### M4.2 `build_llm_config(resolved_spec, accelerator_type, runtime_env_defaults)` pure function
- [ ] Writes `LLMConfig.accelerator_type` (from dev.yaml: H20)
- [ ] `deployment_config.num_replicas`
- [ ] `engine_kwargs.tensor_parallel_size = gpus_per_replica`
- [ ] `placement_group_config` bundles generated according to the GPU count
- [ ] `runtime_env.env_vars` injection (at least the HF cache + `HF_HUB_OFFLINE=1`)
### M4.3 Unit tests
- [ ] `test_llm_config_builder.py`
  - gpus_per_replica=1/2/4 → tensor_parallel_size and bundle counts are correct
  - accelerator_type is injected correctly
  - runtime_env contains HF_HUB_OFFLINE and other key env vars
**Acceptance**
- The mapping from the platform spec to the Ray Serve LLMConfig is stable and locked in by unit tests
---
## M5 - Serving Reconciler (state machine + resource pre-check) (unit-test driven)
**Purpose**: implement declarative reconciliation (DB → Serve) while providing explainable QUEUED/FAILED states.
### M5.1 State machine (minimal loop)
- [ ] `QUEUED`: waiting for apply
- [ ] `DEPLOYING`: apply triggered, waiting for Serve running/healthy
- [ ] `RUNNING`: Serve status is running
- [ ] `FAILED`: apply or status failed (writes error_summary + event)
- [ ] `DELETING`: waiting for removal from the app
- [ ] `DELETED`: deletion complete (optionally keep the record)
### M5.2 Resource pre-check (a sketch follows this list)
- [ ] `needed_total_gpus = sum(num_replicas*gpus_per_replica)` (minimal viable pre-check)
- [ ] When `ray.available_resources()["GPU"]` (or a more robust per-node count) is insufficient:
  - Stay `QUEUED`
  - Record a `PENDING_RESOURCES` event
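A minimal sketch of this pre-check (assumes `ray.init()` has already connected to the cluster; the helper name is illustrative):
```python
import ray

def has_enough_gpus(models: list[dict]) -> bool:
    # Sum the desired GPUs over the models that still need to be applied.
    needed = sum(m["num_replicas"] * m["gpus_per_replica"] for m in models)
    return needed <= ray.available_resources().get("GPU", 0.0)
```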
### M5.3 Reconcile strategy (multi-model app)
- [ ] Each tick reads the active models and builds the full set of `llm_configs`
- [ ] Handle deleting: remove the model from the configs, then apply
### M5.4 Unit tests (mock ServeClient + mock ray resources)
- [ ] `test_serving_reconciler.py`
  - New model: apply_app is called, state enters DEPLOYING
  - Deleted model: the apply_app configs no longer contain that model
  - Insufficient GPUs: no apply, state stays QUEUED, event is written
  - apply raises: state FAILED, error_summary written
**Acceptance**
- Reconciler behavior is verifiable in a pure unit-test environment; failures are explainable
---
## M6 - Real integration (h1): Ray Serve startup + inference loop (E2E)
**Purpose**: really run end to end in the dev/h1 environment: deploy a model → visible in `/v1/models` → `chat/completions` succeeds → disappears after deletion.
### M6.1 compose/ports
- [ ] Add `8000:8000` to `ray_head` in `src/mvp/docker-compose.yaml`
### M6.2 Image dependencies (if M0 finds them missing)
- [ ] Add `ray[serve,llm]` to the `argus-ray-node` image (version aligned with the existing Ray to avoid an incompatible Ray upgrade)
  - Prefer adding `ray[llm]` first (includes the `ray.serve.llm` dependency closure, e.g., `gymnasium`), then `ray[serve]` as needed
  - Verification: `python3 -c "from ray.serve.llm import LLMConfig, build_openai_app; print('serve_llm_ok')"`
### M6.3 E2E script (idempotent)
- [ ] Add `scripts/run_all_v38_serving.sh`:
  - Bring up compose (ensure the Serve port is available)
  - Start the API
  - Create a user + token
  - `POST /api/v2/serve/models` creates a 1-GPU model
  - Poll the model state until RUNNING
  - `curl http://127.0.0.1:8000/v1/models` verifies it contains `<prefix>-<suffix>`
  - `curl http://127.0.0.1:8000/v1/chat/completions` does a minimal inference
  - `DELETE /api/v2/serve/models/{model_key}` undeploys it
  - Poll again and confirm `/v1/models` no longer contains it
**Acceptance**
- The E2E run is repeatable (at least two consecutive runs without manual cleanup)
---
## M7 - WebUI (Serving pages) (unit-test driven)
**Purpose**: give users a visual model management page (minimal necessary features).
### M7.1 Pages
- [ ] Add Serving to the sidebar
- [ ] `/ui/serving`: list + state + actions (delete/scale)
- [ ] `/ui/serving/new`: YAML input + submit
- [ ] `/ui/serving/{model_key}`: detail (resolved spec, events, OpenAI usage example)
### M7.2 Unit tests
- [ ] `test_ui_serving.py`: routes return 200, key links exist, includes openai_base_url on port 8000
**Acceptance**
- The WebUI covers the main create/list/detail/scale/delete path
---
## M8 - Documentation and acceptance cases (delivery)
**Purpose**: give users/operators a reusable way to run the system and a troubleshooting path.
- [ ] Update `specs/mvp/v3.8/v3.8_progress.md` (recorded per milestone)
- [ ] Extend the README (optional): port notes, warning that the inference API is unauthenticated, model path conventions
- [ ] Acceptance checklist:
  - Unit tests pass
  - h1 E2E passes
  - The main UI path is operable


@ -0,0 +1,48 @@
# MVP v3.8 Progress Log
## 2026-01-06
- Completed the v3.8 design doc: `specs/mvp/v3.8/v3.8_design.md`
- Completed the v3.8 Serving API reference: `specs/mvp/v3.8/v3.8_api.md`
- Completed the v3.8 TDD development plan: `specs/mvp/v3.8/v3.8_dev_plan.md`
- Completed M0: added the `serving` config to `configs/dev.yaml` (http_port=8000, proxy_location=HeadOnly, accelerator_type=H20)
- Completed M1: ServingSpec parsing/macro substitution/path validation + unit tests (`src/mvp/py/argus/service/serving_spec.py`)
- Completed M2: new SQLite tables `serve_models`/`serve_events` + Db API + unit tests (`src/mvp/py/argus/service/db.py`)
- Completed M3: FastAPI Serving management API + unit tests (`src/mvp/py/argus/service/app.py`)
- Completed M4: ServeClient abstraction + LLMConfig builder (dict form) + unit tests (`src/mvp/py/argus/service/serve_client.py`, `src/mvp/py/argus/service/serve_llm_config.py`)
- Completed M5: Serving reconciler (state machine + resource pre-check + mocked unit tests) (`src/mvp/py/argus/service/serving_reconciler.py`)
### M6 (real integration on h1)
- Added dependencies to the `argus-ray-node` image: `ray[serve,llm]` + `gymnasium` + `dm-tree` (avoids `ray.serve.llm` import failures)
- Fixed Ray 2.49.2 compatibility issues:
  - `LLMConfig` does not support `placement_group_config`; switched to `resources_per_bundle` (`src/mvp/py/argus/service/serve_llm_config.py`)
- Remote E2E:
  - `scripts/run_all_v38_serving.sh` runs end to end (create → RUNNING → `/v1/models` and `chat/completions` → delete → DELETED)
  - Fixed a bash heredoc quoting error in the script's `/v1/models` parsing (`src/mvp/scripts/run_all_v38_serving.sh`)
### M7 (WebUI - Serving)
- Added Serving pages to the WebUI:
  - List: `/ui/serving`
  - Create: `/ui/serving/new`
  - Detail/events/scale/delete: `/ui/serving/{model_key}`
- Unit test coverage:
  - `src/mvp/py/tests/test_ui_serving.py`
### M8 (docs/acceptance)
- Added v3.8 serving port and E2E script notes to `src/mvp/README.md`
### Environment probing (h1 / head container)
> Purpose: confirm whether the Ray Serve LLM dependencies work out of the box, so problems do not surface only during integration.
- `ray`: available, version `2.49.2`
- `ray.serve`: importable (base Serve works)
- `ray.serve.llm`: currently not importable
  - Error: `ModuleNotFoundError: No module named 'gymnasium'`
  - Cause: the `ray.serve.llm` import chain pulls in `ray.rllib`, and rllib depends on `gymnasium`
Conclusion:
- During implementation, v3.8 needs to add `ray[llm]` (recommended) or at least the necessary dependencies such as `gymnasium` to the `argus-ray-node` image, so that `from ray.serve.llm import ...` works.


@ -24,3 +24,9 @@ v3.0 access entry points (dev/h1):
- SFTPGo:
  - SFTP: `127.0.0.1:2022`
  - Admin API/UI: `http://127.0.0.1:8081` (8080 inside the container; mapped to host 8081 to avoid conflicting with the API server)
v3.8 (Ray Serve LLM / vLLM model serving):
- Inference port: `8000` (Ray Serve HTTP)
- OpenAI-compatible endpoint: `http://127.0.0.1:8000/v1`
- Note: the v3.8 inference endpoint is **unauthenticated**
- E2E script: `scripts/run_all_v38_serving.sh`


@ -69,3 +69,11 @@ data:
jobs_trash_after_days: 3
jobs_purge_after_days: 7
janitor_interval_s: 3600
# v3.8: model serving via Ray Serve LLM (vLLM backend)
serving:
  serve:
    http_port: 8000
    proxy_location: HeadOnly
  llm:
    accelerator_type: H20


@ -1,10 +1,16 @@
services:
ray_head:
image: argus/argus-ray-node:vllm011.latest
build:
context: .
dockerfile: images/argus-ray-node/Dockerfile
args:
BASE_IMAGE: verlai/verl:vllm011.latest
container_name: argus-ray-head
ports:
- "8265:8265"
- "8080:8080"
- "8000:8000"
volumes:
# NOTE: this compose file is intended for the dev env layout like:
# /home2/argus/infra/mvp/{shared,verl,src/mvp}
@ -92,6 +98,11 @@ services:
ray_worker_0:
image: argus/argus-ray-node:vllm011.latest
build:
context: .
dockerfile: images/argus-ray-node/Dockerfile
args:
BASE_IMAGE: verlai/verl:vllm011.latest
container_name: argus-ray-worker-0
volumes:
- ../../verl:/workspace/verl
@ -124,6 +135,11 @@ services:
ray_worker_1:
image: argus/argus-ray-node:vllm011.latest
build:
context: .
dockerfile: images/argus-ray-node/Dockerfile
args:
BASE_IMAGE: verlai/verl:vllm011.latest
container_name: argus-ray-worker-1
volumes:
- ../../verl:/workspace/verl


@ -6,6 +6,15 @@ SHELL ["/bin/bash", "-lc"]
# Install supervisord (prefer pip to avoid relying on distro package manager).
RUN python3 -m pip install --no-cache-dir supervisor
# v3.8: Ray Serve LLM deps (keep Ray version pinned to what's already in the base image).
# NOTE: base image already includes Ray; we only add extras.
RUN RAY_VER="$(python3 -c 'import ray; print(ray.__version__)')" && \
python3 -m pip install --no-cache-dir "ray[serve,llm]==${RAY_VER}"
# Ray Serve LLM's import chain currently pulls in ray.rllib which requires extra deps.
# Install them explicitly to make `from ray.serve.llm import ...` work reliably.
RUN python3 -m pip install --no-cache-dir gymnasium dm-tree && \
python3 -c "from ray.serve.llm import LLMConfig, build_openai_app; print('ray_serve_llm_ok')"
RUN mkdir -p /opt/argus/py/argus/ray
# Minimal embedded code for stateless pool (API code is intentionally excluded).


@ -16,9 +16,8 @@ exec ray start \
--port="${ray_port}" \
--dashboard-host=0.0.0.0 \
--dashboard-port="${dashboard_port}" \
--num-cpus=0 \
--num-cpus="${ARGUS_HEAD_NUM_CPUS:-1}" \
--num-gpus=0 \
--disable-usage-stats \
--block \
${ARGUS_RAY_EXTRA_ARGS:-}


@ -26,3 +26,19 @@ def new_task_id(workload: str, *, user_id: str | None = None) -> str:
def attempt_submission_id(task_id: str, attempt_no: int) -> str:
return f"{task_id}--a{attempt_no:02d}"
def new_model_key(*, user_id: str) -> str:
    """
    Internal identifier for a serving model record.
    Note:
    - model_id is the OpenAI-facing name (user_id + timestamp prefix + suffix).
    - model_key is used for stable DB identity and API resource path.
    """
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    suffix = secrets.token_hex(2)
    u = _normalize_user_id(user_id)
    if not u:
        raise ValueError("user_id is required")
    return f"mvp2-{u}-serve-{ts}-{suffix}"


@ -4,11 +4,13 @@ import os
import secrets
import threading
from typing import Any
import json
from dataclasses import asdict
import yaml
from fastapi import FastAPI, HTTPException, Request, Response
from argus.core.ids import new_task_id
from argus.core.ids import new_model_key, new_task_id
from argus.ray.models import AdvancedTaskSpec, JobSpec, RayConfig, parse_taskspec
from .advanced_command import expand_advanced_command, validate_advanced_command
@ -16,6 +18,7 @@ from .config import V2Config
from .db import Db
from .janitor import JobsJanitor
from .scheduler import Scheduler
from .serving_spec import ServingSpec, parse_serving_spec, resolve_serving_spec
from .sftpgo import SFTPGoAdminClient, SFTPGoError
from .ui import register_ui_routes
@ -85,6 +88,61 @@ def create_app(config_path: str) -> FastAPI:
common_root=f"{shared_root}/common",
)
def _serving_enabled() -> bool:
return bool(v2_cfg.serving.enabled)
def _openai_base_url(req: Request) -> str:
# Prefer forwarded headers if present; otherwise fall back to Host.
host = req.headers.get("x-forwarded-host") or req.headers.get("host") or req.url.hostname or "127.0.0.1"
# Strip port if present (common for Host header).
hostname = host
if hostname.startswith("[") and "]" in hostname:
# IPv6 like: [::1]:8080
hostname = hostname.split("]")[0] + "]"
else:
hostname = hostname.split(":")[0]
scheme = req.headers.get("x-forwarded-proto") or req.url.scheme or "http"
port = int(v2_cfg.serving.serve.http_port)
return f"{scheme}://{hostname}:{port}/v1"
def _dump_yaml(obj: Any) -> str:
return yaml.safe_dump(obj, sort_keys=False)
def _serving_spec_to_dict(spec: ServingSpec) -> dict[str, Any]:
return {
"model_id": spec.model_id,
"model_source": spec.model_source,
"num_replicas": int(spec.num_replicas),
"gpus_per_replica": int(spec.gpus_per_replica),
"engine_kwargs": spec.engine_kwargs,
}
def _serve_model_public(row: dict[str, Any], *, req: Request) -> dict[str, Any]:
num_replicas = int(row.get("num_replicas") or 0)
gpus_per_replica = int(row.get("gpus_per_replica") or 0)
total_gpus = num_replicas * gpus_per_replica
model_id = str(row.get("model_id") or "")
return {
"model_key": str(row.get("model_key") or ""),
"user_id": str(row.get("user_id") or ""),
"model_id": model_id,
"model_id_suffix": str(row.get("model_id_suffix") or ""),
"model_id_prefix": str(row.get("model_id_prefix") or ""),
"model_source": str(row.get("model_source") or ""),
"num_replicas": num_replicas,
"gpus_per_replica": gpus_per_replica,
"total_gpus": total_gpus,
"state": str(row.get("state") or ""),
"error_summary": row.get("error_summary"),
"created_at": str(row.get("created_at") or ""),
"updated_at": str(row.get("updated_at") or ""),
"deleted_at": row.get("deleted_at"),
"endpoint": {
"openai_base_url": _openai_base_url(req),
"model": model_id,
},
}
def _auth(req: Request) -> dict[str, Any]:
token_env = v2_cfg.auth.token_env
admin_token = os.environ.get(token_env, "")
@ -565,6 +623,162 @@ def create_app(config_path: str) -> FastAPI:
return db.list_queue()
return db.list_queue(user_id=str(subject["user_id"]))
# v3.8: Model serving (Ray Serve LLM) management APIs.
@app.post("/api/v2/serve/models")
async def create_serve_model(req: Request) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
body = (await req.body()).decode("utf-8")
try:
obj = yaml.safe_load(body) or {}
except Exception as e:
raise HTTPException(status_code=400, detail=f"invalid YAML: {e!r}")
if not isinstance(obj, dict):
raise HTTPException(status_code=400, detail="serving spec must be a YAML mapping")
user_id = str(subject["user_id"]).strip()
try:
spec = parse_serving_spec(obj)
resolved = resolve_serving_spec(spec=spec, user_id=user_id)
except PermissionError as e:
raise HTTPException(status_code=403, detail=str(e))
except ValueError as e:
msg = str(e)
code = 422 if ("num_replicas" in msg or "gpus_per_replica" in msg) else 400
raise HTTPException(status_code=code, detail=f"invalid serving spec: {e!r}")
model_key = new_model_key(user_id=user_id)
try:
engine_kwargs_json = json.dumps(resolved.engine_kwargs, sort_keys=True) if resolved.engine_kwargs is not None else None
except TypeError as e:
raise HTTPException(status_code=400, detail=f"engine_kwargs must be JSON-serializable: {e!r}")
spec_yaml = _dump_yaml(_serving_spec_to_dict(spec))
resolved_spec_yaml = _dump_yaml(asdict(resolved))
db.create_serve_model(
model_key=model_key,
user_id=user_id,
model_id_suffix=resolved.model_id_suffix,
model_id_prefix=resolved.model_id_prefix,
model_id=resolved.model_id,
model_source=resolved.model_source,
num_replicas=resolved.num_replicas,
gpus_per_replica=resolved.gpus_per_replica,
engine_kwargs_json=engine_kwargs_json,
spec_yaml=spec_yaml,
resolved_spec_yaml=resolved_spec_yaml,
)
return {"model_key": model_key, "state": "QUEUED"}
@app.get("/api/v2/serve/models")
async def list_serve_models(req: Request, limit: int = 200, offset: int = 0, include_deleted: int = 0) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
lim = max(1, min(int(limit), 1000))
off = max(0, int(offset))
inc = bool(int(include_deleted))
user_id = str(subject["user_id"])
items = db.list_serve_models(user_id=user_id, include_deleted=inc, limit=lim, offset=off)
out = [_serve_model_public(i, req=req) for i in items]
return {
"items": out,
"openai_base_url": _openai_base_url(req),
"limit": lim,
"offset": off,
"has_more": bool(len(items) == lim),
}
@app.get("/api/v2/serve/models/{model_key}")
async def get_serve_model(model_key: str, req: Request) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
row = db.get_serve_model(model_key)
if not row:
raise HTTPException(status_code=404, detail="model not found")
if not subject.get("is_admin"):
if str(row.get("user_id") or "") != str(subject["user_id"]):
raise HTTPException(status_code=404, detail="model not found")
events = db.list_serve_events(model_key, limit=200, offset=0)
ev_out = [
{
"id": int(e.get("id") or 0),
"model_key": str(e.get("model_key") or ""),
"created_at": str(e.get("ts") or ""),
"event_type": str(e.get("event_type") or ""),
"payload_json": e.get("payload_json"),
}
for e in events
]
return {
"model": _serve_model_public(row, req=req),
"resolved_spec_yaml": str(row.get("resolved_spec_yaml") or ""),
"events": ev_out,
"serve_status": None,
}
@app.patch("/api/v2/serve/models/{model_key}")
async def patch_serve_model(model_key: str, req: Request) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
row = db.get_serve_model(model_key)
if not row:
raise HTTPException(status_code=404, detail="model not found")
if not subject.get("is_admin"):
if str(row.get("user_id") or "") != str(subject["user_id"]):
raise HTTPException(status_code=404, detail="model not found")
obj = await req.json()
if not isinstance(obj, dict):
raise HTTPException(status_code=400, detail="body must be a JSON object")
if "num_replicas" not in obj:
raise HTTPException(status_code=400, detail="missing num_replicas")
num_replicas = obj.get("num_replicas")
if not isinstance(num_replicas, int) or int(num_replicas) < 1:
raise HTTPException(status_code=422, detail="num_replicas must be an integer >= 1")
db.update_serve_model_num_replicas(model_key=model_key, num_replicas=int(num_replicas))
return {"model_key": model_key, "state": "QUEUED"}
@app.delete("/api/v2/serve/models/{model_key}")
async def delete_serve_model(model_key: str, req: Request) -> dict[str, Any]:
subject = _auth(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
row = db.get_serve_model(model_key)
if not row:
raise HTTPException(status_code=404, detail="model not found")
if not subject.get("is_admin"):
if str(row.get("user_id") or "") != str(subject["user_id"]):
raise HTTPException(status_code=404, detail="model not found")
db.set_serve_model_state(model_key=model_key, state="DELETING", event_type="SERVE_DELETE_REQUESTED")
return {"model_key": model_key, "state": "DELETING"}
@app.get("/api/v2/serve/status")
async def serve_status(req: Request) -> dict[str, Any]:
_require_admin(req)
if not _serving_enabled():
raise HTTPException(status_code=400, detail="serving is not enabled")
return {
"enabled": True,
"openai_base_url": _openai_base_url(req),
"http_port": int(v2_cfg.serving.serve.http_port),
"proxy_location": str(v2_cfg.serving.serve.proxy_location),
"accelerator_type": str(v2_cfg.serving.llm.accelerator_type),
}
# v3.0: minimal WebUI (no server-side session; token stored in browser localStorage).
register_ui_routes(app)


@ -57,6 +57,24 @@ class V2SFTPGoConfig:
admin_password_env: str = "SFTPGO_ADMIN_PASSWORD"
@dataclass(frozen=True)
class V2ServingServeConfig:
http_port: int = 8000
proxy_location: str = "HeadOnly"
@dataclass(frozen=True)
class V2ServingLLMConfig:
accelerator_type: str = ""
@dataclass(frozen=True)
class V2ServingConfig:
enabled: bool = False
serve: V2ServingServeConfig = V2ServingServeConfig()
llm: V2ServingLLMConfig = V2ServingLLMConfig()
@dataclass(frozen=True)
class V2DataConfig:
user_root: str
@ -72,6 +90,7 @@ class V2Config:
scheduler: V2SchedulerConfig
tracking: V2TrackingConfig
data: V2DataConfig
serving: V2ServingConfig
@staticmethod
def from_root_dict(root: dict[str, Any]) -> "V2Config":
@ -112,6 +131,15 @@ class V2Config:
if not isinstance(sftpgo, dict) or not isinstance(retention, dict):
raise ValueError("config.data.{sftpgo,retention} must be mappings")
serving = root.get("serving") or {}
if not isinstance(serving, dict):
raise ValueError("config.serving must be a mapping")
serving_enabled = bool(serving.get("enabled")) if "enabled" in serving else bool(serving)
serving_serve = serving.get("serve") or {}
serving_llm = serving.get("llm") or {}
if not isinstance(serving_serve, dict) or not isinstance(serving_llm, dict):
raise ValueError("config.serving.{serve,llm} must be mappings")
default_db_path = f"{shared_root}/common/db/mvp.sqlite3"
db_path = str(sqlite.get("db_path") or default_db_path)
@ -158,4 +186,14 @@ class V2Config:
janitor_interval_s=int(retention.get("janitor_interval_s") or 3600),
),
),
serving=V2ServingConfig(
enabled=serving_enabled,
serve=V2ServingServeConfig(
http_port=int(serving_serve.get("http_port") or 8000),
proxy_location=str(serving_serve.get("proxy_location") or "HeadOnly"),
),
llm=V2ServingLLMConfig(
accelerator_type=str(serving_llm.get("accelerator_type") or ""),
),
),
)


@ -117,6 +117,43 @@ class Db:
)
"""
)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS serve_models (
model_key TEXT PRIMARY KEY,
user_id TEXT NOT NULL,
model_id_suffix TEXT NOT NULL,
model_id_prefix TEXT NOT NULL,
model_id TEXT NOT NULL,
model_source TEXT NOT NULL,
num_replicas INTEGER NOT NULL,
gpus_per_replica INTEGER NOT NULL,
engine_kwargs_json TEXT,
state TEXT NOT NULL,
spec_yaml TEXT NOT NULL,
resolved_spec_yaml TEXT NOT NULL,
error_summary TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
deleted_at TEXT
)
"""
)
conn.execute(
"""
CREATE TABLE IF NOT EXISTS serve_events (
id INTEGER PRIMARY KEY AUTOINCREMENT,
model_key TEXT NOT NULL,
ts TEXT NOT NULL,
event_type TEXT NOT NULL,
payload_json TEXT,
FOREIGN KEY (model_key) REFERENCES serve_models(model_key) ON DELETE CASCADE
)
"""
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_serve_models_user ON serve_models(user_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_serve_models_state ON serve_models(state)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_serve_events_model ON serve_events(model_key)")
@contextmanager
def tx(self) -> Iterator[sqlite3.Connection]:
@ -493,3 +530,239 @@ class Db:
(str(end_time_le), int(limit)),
).fetchall()
return [dict(r) for r in rows]
def create_serve_model(
self,
*,
model_key: str,
user_id: str,
model_id_suffix: str,
model_id_prefix: str,
model_id: str,
model_source: str,
num_replicas: int,
gpus_per_replica: int,
spec_yaml: str,
resolved_spec_yaml: str,
engine_kwargs_json: str | None = None,
) -> dict[str, Any]:
now = _utc_now_iso()
with self.tx() as conn:
conn.execute(
"""
INSERT INTO serve_models (
model_key,
user_id,
model_id_suffix,
model_id_prefix,
model_id,
model_source,
num_replicas,
gpus_per_replica,
engine_kwargs_json,
state,
spec_yaml,
resolved_spec_yaml,
created_at,
updated_at
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 'QUEUED', ?, ?, ?, ?)
""",
(
model_key,
user_id,
model_id_suffix,
model_id_prefix,
model_id,
model_source,
int(num_replicas),
int(gpus_per_replica),
engine_kwargs_json,
spec_yaml,
resolved_spec_yaml,
now,
now,
),
)
conn.execute(
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, 'SERVE_MODEL_CREATED', ?)",
(model_key, now, None),
)
row = conn.execute("SELECT * FROM serve_models WHERE model_key = ?", (model_key,)).fetchone()
return dict(row) if row else {}
def list_serve_models(
self,
*,
user_id: str,
include_deleted: bool = False,
limit: int = 200,
offset: int = 0,
) -> list[dict[str, Any]]:
with self._connect() as conn:
where_sql = "WHERE user_id = ?"
params: list[Any] = [user_id]
if not include_deleted:
where_sql += " AND deleted_at IS NULL"
params.append(int(limit))
params.append(max(0, int(offset)))
rows = conn.execute(
f"""
SELECT
model_key,
user_id,
model_id_suffix,
model_id_prefix,
model_id,
model_source,
num_replicas,
gpus_per_replica,
engine_kwargs_json,
state,
error_summary,
created_at,
updated_at,
deleted_at
FROM serve_models
{where_sql}
ORDER BY created_at DESC, model_key DESC
LIMIT ? OFFSET ?
""",
tuple(params),
).fetchall()
return [dict(r) for r in rows]
def list_all_serve_models(
self,
*,
include_deleted: bool = False,
limit: int = 2000,
offset: int = 0,
) -> list[dict[str, Any]]:
with self._connect() as conn:
where_sql = ""
if not include_deleted:
where_sql = "WHERE deleted_at IS NULL"
rows = conn.execute(
f"""
SELECT
model_key,
user_id,
model_id_suffix,
model_id_prefix,
model_id,
model_source,
num_replicas,
gpus_per_replica,
engine_kwargs_json,
state,
error_summary,
spec_yaml,
resolved_spec_yaml,
created_at,
updated_at,
deleted_at
FROM serve_models
{where_sql}
ORDER BY created_at ASC, model_key ASC
LIMIT ? OFFSET ?
""",
(int(limit), max(0, int(offset))),
).fetchall()
return [dict(r) for r in rows]
def get_serve_model(self, model_key: str) -> dict[str, Any] | None:
with self._connect() as conn:
row = conn.execute("SELECT * FROM serve_models WHERE model_key = ?", (model_key,)).fetchone()
return dict(row) if row else None
def list_serve_events(self, model_key: str, limit: int = 200, offset: int = 0) -> list[dict[str, Any]]:
with self._connect() as conn:
rows = conn.execute(
"""
SELECT id, model_key, ts, event_type, payload_json
FROM serve_events
WHERE model_key = ?
ORDER BY id DESC
LIMIT ? OFFSET ?
""",
(model_key, int(limit), max(0, int(offset))),
).fetchall()
return [dict(r) for r in rows]
def append_serve_event(self, *, model_key: str, event_type: str, payload_json: str | None = None) -> None:
now = _utc_now_iso()
with self.tx() as conn:
conn.execute(
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, ?, ?)",
(model_key, now, event_type, payload_json),
)
def set_serve_model_state(
self,
*,
model_key: str,
state: str,
error_summary: str | None = None,
event_type: str = "SERVE_STATE_UPDATE",
payload_json: str | None = None,
) -> None:
now = _utc_now_iso()
with self.tx() as conn:
sets = ["state = ?", "updated_at = ?"]
params: list[Any] = [state, now]
if error_summary is not None:
sets.append("error_summary = ?")
params.append(error_summary)
if state == "DELETED":
sets.append("deleted_at = ?")
params.append(now)
params.append(model_key)
conn.execute(f"UPDATE serve_models SET {', '.join(sets)} WHERE model_key = ?", tuple(params))
conn.execute(
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, ?, ?)",
(model_key, now, event_type, payload_json),
)
def update_serve_model_num_replicas(self, *, model_key: str, num_replicas: int) -> None:
if not isinstance(num_replicas, int) or num_replicas < 1:
raise ValueError("num_replicas must be an integer >= 1")
now = _utc_now_iso()
with self.tx() as conn:
conn.execute(
"""
UPDATE serve_models
SET num_replicas = ?, state = 'QUEUED', error_summary = NULL, updated_at = ?
WHERE model_key = ?
""",
(int(num_replicas), now, model_key),
)
conn.execute(
"INSERT INTO serve_events (model_key, ts, event_type, payload_json) VALUES (?, ?, 'SERVE_PATCH_NUM_REPLICAS', ?)",
(model_key, now, str(num_replicas)),
)
def pick_next_runnable_serve_change(self) -> dict[str, Any] | None:
"""
Returns the next serve model that needs reconciliation.
Minimal state machine for now:
- QUEUED: needs (re)apply
- DELETING: needs removal
"""
with self._connect() as conn:
row = conn.execute(
"""
SELECT *
FROM serve_models
WHERE deleted_at IS NULL
AND state IN ('QUEUED','DELETING')
ORDER BY updated_at ASC
LIMIT 1
"""
).fetchone()
return dict(row) if row else None
# Backward compatible naming (v3.8 docs originally used "upsert").
def upsert_serve_model(self, **kwargs: Any) -> dict[str, Any]:
return self.create_serve_model(**kwargs)


@ -16,6 +16,8 @@ from argus.ray.ray_job_tool import RayJobTool
from .config import V2Config
from .db import Db
from .ray_resources import ensure_ray_connected, get_cluster_available
from .serve_client import RayServeClient
from .serving_reconciler import ServingReconciler
_INSUFFICIENT_RE = re.compile(r"Total available GPUs\\s+\\d+\\s+is less than total desired GPUs\\s+\\d+")
@ -37,6 +39,18 @@ class Scheduler:
def __post_init__(self) -> None:
self.tool = RayJobTool(self.ray_cfg)
self._serving: ServingReconciler | None = None
if bool(self.v2_cfg.serving.enabled):
self._serving = ServingReconciler(
db=self.db,
v2_cfg=self.v2_cfg,
ray_runtime_env_env_vars=self.ray_cfg.runtime_env_env_vars,
serve_client=RayServeClient(
http_port=int(self.v2_cfg.serving.serve.http_port),
proxy_location=str(self.v2_cfg.serving.serve.proxy_location),
ray_init_address="auto",
),
)
def _job_dir_for_task(self, *, user_id: str | None, ray_submission_id: str) -> str:
root = self.ray_cfg.shared_root.rstrip("/")
@ -251,6 +265,14 @@ class Scheduler:
def tick(self) -> None:
ensure_ray_connected()
# v3.8: reconcile serve_models (best-effort).
if self._serving is not None:
try:
self._serving.tick()
except Exception:
# Keep scheduler alive even if serving tick fails.
pass
# Sync active tasks
for row in self.db.list_active_tasks(limit=50):
self._sync_one_running(row)

View File

@ -0,0 +1,45 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Any
@dataclass(frozen=True)
class RayServeClient:
"""
Minimal Ray Serve client wrapper.
This is intentionally tiny and uses runtime imports so that:
- unit tests can stub `ray` modules without needing real Ray installed
- production can run with the real Ray Serve stack (v3.8+)
"""
http_port: int = 8000
proxy_location: str = "HeadOnly"
ray_init_address: str = "auto"
def ensure_started(self) -> None:
import ray # runtime import
# Scheduler already calls ray.init(); make this idempotent.
ray.init(address=self.ray_init_address, ignore_reinit_error=True, log_to_driver=False) # type: ignore[call-arg]
# Import serve lazily to allow tests to stub it.
from ray import serve # type: ignore
serve.start(proxy_location=self.proxy_location, http_options={"host": "0.0.0.0", "port": int(self.http_port)})
def apply_app(self, *, app: Any, app_name: str, route_prefix: str = "/") -> Any:
from ray import serve # type: ignore
# If Ray Serve LLM isn't available, callers may pass a plain dict placeholder.
# Running that through serve.run() results in a confusing TypeError; fail fast.
if isinstance(app, dict):
raise ValueError("invalid serve app object (Ray Serve LLM not available or build_openai_app failed)")
return serve.run(app, name=app_name, route_prefix=route_prefix)
def get_status(self) -> Any:
from ray import serve # type: ignore
return serve.status()
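Typical production use is three calls in order, mirroring what the reconciler below does: ensure_started(), apply_app(), get_status(). A minimal sketch, where `app_obj` stands in for whatever `build_openai_app` returned:

from argus.service.serve_client import RayServeClient


def deploy(app_obj) -> None:
    client = RayServeClient(http_port=8000, proxy_location="HeadOnly", ray_init_address="auto")
    client.ensure_started()  # idempotent ray.init() + serve.start()
    client.apply_app(app=app_obj, app_name="argus_llm_app", route_prefix="/")
    print(client.get_status())  # serve.status() snapshot, e.g. for logging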

View File

@ -0,0 +1,63 @@
from __future__ import annotations
from typing import Any
from .serving_spec import ResolvedServingSpec
def _ensure_hf_env_defaults(env: dict[str, str]) -> dict[str, str]:
out = dict(env or {})
# Prefer existing values if present, but always force offline mode in the platform.
out.setdefault("HF_HOME", "/private/hf")
out.setdefault("HUGGINGFACE_HUB_CACHE", "/private/hf/hub")
out.setdefault("TRANSFORMERS_CACHE", "/private/hf/transformers")
out["HF_HUB_OFFLINE"] = "1"
return out
def build_llm_config_dict(
resolved: ResolvedServingSpec,
*,
accelerator_type: str,
runtime_env_env_vars: dict[str, str] | None,
cpu_per_gpu: float = 1.0,
) -> dict[str, Any]:
"""
Pure builder: maps a platform ResolvedServingSpec to a Ray Serve LLM-like config.
We return a plain dict here to keep this layer unit-testable without depending on
a specific Ray Serve LLM version. The reconciler (later milestone) can choose to
instantiate `ray.serve.llm.LLMConfig` using this dict.
"""
if not accelerator_type:
raise ValueError("accelerator_type is required")
if resolved.num_replicas < 1:
raise ValueError("num_replicas must be >= 1")
if resolved.gpus_per_replica < 1:
raise ValueError("gpus_per_replica must be >= 1")
if cpu_per_gpu <= 0:
raise ValueError("cpu_per_gpu must be > 0")
engine_kwargs: dict[str, Any] = dict(resolved.engine_kwargs or {})
# Enforce tensor parallel mapping; user-provided value must not contradict requested GPUs.
engine_kwargs["tensor_parallel_size"] = int(resolved.gpus_per_replica)
# Ray Serve LLM (Ray 2.49.x) exposes `resources_per_bundle` instead of the older
# `placement_group_config`. Use a single bundle that reserves the full GPU set
# required by tensor-parallel execution.
resources_per_bundle = {
"GPU": float(resolved.gpus_per_replica),
"CPU": float(cpu_per_gpu) * float(resolved.gpus_per_replica),
}
env_vars = _ensure_hf_env_defaults(dict(runtime_env_env_vars or {}))
return {
# Ray Serve LLM expects `model_loading_config` with model_id/model_source.
"model_loading_config": {"model_id": resolved.model_id, "model_source": resolved.model_source},
"accelerator_type": accelerator_type,
"deployment_config": {"num_replicas": int(resolved.num_replicas)},
"engine_kwargs": engine_kwargs,
"resources_per_bundle": resources_per_bundle,
"runtime_env": {"env_vars": env_vars},
}
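The returned dict is either kept as-is (unit tests, environments without Ray Serve LLM) or unpacked into `ray.serve.llm.LLMConfig`, which is what the reconciler below does. A worked sketch with illustrative values; whether `LLMConfig` accepts every field (e.g. `resources_per_bundle` vs. `placement_group_config`) depends on the installed Ray version, which is why the real code wraps construction in try/except:

from argus.service.serve_llm_config import build_llm_config_dict
from argus.service.serving_spec import ResolvedServingSpec

resolved = ResolvedServingSpec(
    user_id="alice",
    model_id_suffix="qwen-0.5b",
    model_id_prefix="alice-202601061235",
    model_id="alice-202601061235-qwen-0.5b",
    model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/abc",
    num_replicas=1,
    gpus_per_replica=2,
    engine_kwargs={"gpu_memory_utilization": 0.9},
)
cfg_dict = build_llm_config_dict(
    resolved, accelerator_type="H20", runtime_env_env_vars={}, cpu_per_gpu=1.0
)
# cfg_dict["engine_kwargs"]["tensor_parallel_size"] == 2
# cfg_dict["resources_per_bundle"] == {"GPU": 2.0, "CPU": 2.0}

try:
    from ray.serve.llm import LLMConfig  # requires ray[llm]

    llm_config = LLMConfig(**cfg_dict)
except Exception:
    llm_config = cfg_dict  # fall back to the plain dict when Ray Serve LLM is unavailable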

View File

@ -0,0 +1,151 @@
from __future__ import annotations
import json
import traceback
from dataclasses import dataclass
from typing import Any, Protocol
from argus.service.ray_resources import ClusterAvailable, get_cluster_available
from .config import V2Config
from .db import Db
from .serve_llm_config import build_llm_config_dict
from .serving_spec import ResolvedServingSpec
class ServeClient(Protocol):
def ensure_started(self) -> None: ...
def apply_app(self, *, app: Any, app_name: str, route_prefix: str = "/") -> Any: ...
def get_status(self) -> Any: ...
def _parse_engine_kwargs(row: dict[str, Any]) -> dict[str, Any] | None:
raw = row.get("engine_kwargs_json")
if raw in (None, ""):
return None
try:
obj = json.loads(str(raw))
return obj if isinstance(obj, dict) else None
except Exception:
return None
def _row_to_resolved_spec(row: dict[str, Any]) -> ResolvedServingSpec:
return ResolvedServingSpec(
user_id=str(row["user_id"]),
model_id_suffix=str(row["model_id_suffix"]),
model_id_prefix=str(row["model_id_prefix"]),
model_id=str(row["model_id"]),
model_source=str(row["model_source"]),
num_replicas=int(row["num_replicas"]),
gpus_per_replica=int(row["gpus_per_replica"]),
engine_kwargs=_parse_engine_kwargs(row),
)
def _needed_total_gpus(rows: list[dict[str, Any]]) -> int:
total = 0
for r in rows:
total += int(r.get("num_replicas") or 0) * int(r.get("gpus_per_replica") or 0)
return total
@dataclass
class ServingReconciler:
"""
v3.8: reconcile declared serve_models (SQLite) into a multi-model Ray Serve app.
This reconciler is intentionally conservative:
- Only acts on models in states QUEUED/DELETING.
- Performs a minimal GPU precheck using ray available GPU totals.
- Writes events and state transitions for explainability.
"""
db: Db
v2_cfg: V2Config
ray_runtime_env_env_vars: dict[str, str]
serve_client: ServeClient
app_name: str = "argus_llm_app"
route_prefix: str = "/"
cpu_per_gpu: float = 1.0
get_available_fn: Any = get_cluster_available
def tick(self) -> None:
# Pick the next desired change.
change = self.db.pick_next_runnable_serve_change()
if not change:
return
model_key = str(change["model_key"])
state = str(change.get("state") or "")
# Ensure Ray (and Serve) can be started before doing anything else.
try:
self.serve_client.ensure_started()
except Exception as e:
self.db.append_serve_event(model_key=model_key, event_type="SERVE_START_ERROR", payload_json=repr(e))
return
# Desired set: all non-deleted models except those marked DELETING.
all_rows = self.db.list_all_serve_models(include_deleted=False, limit=5000, offset=0)
# FAILED models are not part of the desired running set. A user can PATCH to
# re-queue a failed model (e.g., after fixing env/deps) which will move it back to QUEUED.
desired_rows = [r for r in all_rows if str(r.get("state") or "") not in ("DELETING", "DELETED", "FAILED")]
# Precheck resources: multi-model app apply needs enough GPUs for the whole desired set.
needed = _needed_total_gpus(desired_rows)
avail: ClusterAvailable = self.get_available_fn()
if float(avail.total_available_gpus) < float(needed):
msg = f"Insufficient GPUs: need {needed}, available {avail.total_available_gpus}"
self.db.append_serve_event(model_key=model_key, event_type="SERVE_PENDING_RESOURCES", payload_json=msg)
return
# Build per-model LLM configs (dict form in M4).
llm_cfg_dicts: list[dict[str, Any]] = []
accelerator_type = str(self.v2_cfg.serving.llm.accelerator_type or "")
for r in desired_rows:
resolved = _row_to_resolved_spec(r)
llm_cfg_dicts.append(
build_llm_config_dict(
resolved,
accelerator_type=accelerator_type,
runtime_env_env_vars=self.ray_runtime_env_env_vars,
cpu_per_gpu=self.cpu_per_gpu,
)
)
# Build a Ray Serve OpenAI-compatible app if Ray Serve LLM is available.
# Fall back to a plain dict so unit tests can run without real Ray Serve.
app_obj: Any
try:
from ray.serve.llm import LLMConfig, build_openai_app # type: ignore
llm_cfgs = [LLMConfig(**d) for d in llm_cfg_dicts]
app_obj = build_openai_app({"llm_configs": llm_cfgs})
except Exception as e:
self.db.append_serve_event(model_key=model_key, event_type="SERVE_LLM_IMPORT_ERROR", payload_json=repr(e))
app_obj = {"llm_configs": llm_cfg_dicts}
try:
self.db.append_serve_event(model_key=model_key, event_type="SERVE_APPLY_REQUESTED", payload_json=str(len(llm_cfg_dicts)))
self.serve_client.apply_app(app=app_obj, app_name=self.app_name, route_prefix=self.route_prefix)
except Exception as e:
err = f"{type(e).__name__}: {e}"
tb = traceback.format_exc(limit=10)
self.db.set_serve_model_state(model_key=model_key, state="FAILED", error_summary=err, event_type="SERVE_APPLY_FAILED", payload_json=tb)
return
# Apply succeeded. Update the changing model's state.
if state == "DELETING":
self.db.set_serve_model_state(model_key=model_key, state="DELETED", event_type="SERVE_DELETE_APPLIED")
return
# Mark as deploying; best-effort status probe can promote to RUNNING.
self.db.set_serve_model_state(model_key=model_key, state="DEPLOYING", event_type="SERVE_DEPLOYING")
try:
_ = self.serve_client.get_status()
self.db.set_serve_model_state(model_key=model_key, state="RUNNING", event_type="SERVE_RUNNING")
except Exception as e:
self.db.append_serve_event(model_key=model_key, event_type="SERVE_STATUS_ERROR", payload_json=repr(e))

View File

@ -0,0 +1,144 @@
from __future__ import annotations
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Any
_MODEL_ID_SUFFIX_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$")
@dataclass(frozen=True)
class ServingSpec:
model_id: str
model_source: str
num_replicas: int = 1
gpus_per_replica: int = 1
engine_kwargs: dict[str, Any] | None = None
@dataclass(frozen=True)
class ResolvedServingSpec:
user_id: str
model_id_suffix: str
model_id_prefix: str
model_id: str
model_source: str
num_replicas: int
gpus_per_replica: int
engine_kwargs: dict[str, Any] | None
def validate_model_id_suffix(suffix: str) -> None:
if not isinstance(suffix, str):
raise ValueError("model_id must be a string")
s = suffix.strip()
if s != suffix:
raise ValueError("model_id must not contain leading/trailing whitespace")
if not s:
raise ValueError("model_id is required")
if not _MODEL_ID_SUFFIX_RE.match(s):
raise ValueError("model_id must match regex: ^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$")
if ".." in s:
raise ValueError("model_id must not contain '..'")
def make_model_id_prefix(*, user_id: str, now_utc: datetime | None = None) -> str:
if not user_id or not isinstance(user_id, str):
raise ValueError("user_id is required")
if "/" in user_id:
raise ValueError("user_id must not contain '/'")
dt = now_utc or datetime.now(timezone.utc)
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
dt = dt.astimezone(timezone.utc)
stamp = dt.strftime("%Y%m%d%H%M")
return f"{user_id}-{stamp}"
def expand_home_macros(*, user_id: str, text: str) -> str:
if not isinstance(text, str):
raise ValueError("model_source must be a string")
if not text:
raise ValueError("model_source is required")
out = text
out = out.replace("$HOME/common/hf", "/private/hf")
out = out.replace("$HOME/common/datasets", "/private/datasets")
out = out.replace("$HOME", f"/private/users/{user_id}")
return out
def validate_model_source_path(*, user_id: str, model_source: str) -> None:
if not isinstance(model_source, str):
raise ValueError("model_source must be a string")
if not model_source.startswith("/"):
raise ValueError("model_source must be an absolute path")
if not model_source.startswith("/private/"):
raise ValueError("model_source must be under /private")
if "\x00" in model_source:
raise ValueError("model_source contains null byte")
parts = [p for p in model_source.split("/") if p]
if any(p == ".." for p in parts):
raise ValueError("model_source must not contain '..'")
allowed_user_prefix = f"/private/users/{user_id}/"
allowed = model_source.startswith("/private/hf/") or model_source.startswith(allowed_user_prefix)
if not allowed:
raise PermissionError("model_source is not allowed (must be under /private/hf or your /private/users/<user_id>)")
def parse_serving_spec(obj: Any) -> ServingSpec:
if not isinstance(obj, dict):
raise ValueError("serving spec must be a mapping")
model_id = obj.get("model_id")
model_source = obj.get("model_source")
num_replicas = obj.get("num_replicas", 1)
gpus_per_replica = obj.get("gpus_per_replica", 1)
engine_kwargs = obj.get("engine_kwargs", None)
if not isinstance(model_id, str):
raise ValueError("missing required field: model_id")
validate_model_id_suffix(model_id)
if not isinstance(model_source, str) or not model_source:
raise ValueError("missing required field: model_source")
if not isinstance(num_replicas, int) or num_replicas < 1:
raise ValueError("num_replicas must be an integer >= 1")
if not isinstance(gpus_per_replica, int) or gpus_per_replica < 1:
raise ValueError("gpus_per_replica must be an integer >= 1")
if engine_kwargs is not None and not isinstance(engine_kwargs, dict):
raise ValueError("engine_kwargs must be a mapping when provided")
return ServingSpec(
model_id=model_id,
model_source=model_source,
num_replicas=num_replicas,
gpus_per_replica=gpus_per_replica,
engine_kwargs=engine_kwargs,
)
def resolve_serving_spec(*, spec: ServingSpec, user_id: str, now_utc: datetime | None = None) -> ResolvedServingSpec:
validate_model_id_suffix(spec.model_id)
prefix = make_model_id_prefix(user_id=user_id, now_utc=now_utc)
full_model_id = f"{prefix}-{spec.model_id}"
resolved_source = expand_home_macros(user_id=user_id, text=spec.model_source)
validate_model_source_path(user_id=user_id, model_source=resolved_source)
return ResolvedServingSpec(
user_id=user_id,
model_id_suffix=spec.model_id,
model_id_prefix=prefix,
model_id=full_model_id,
model_source=resolved_source,
num_replicas=spec.num_replicas,
gpus_per_replica=spec.gpus_per_replica,
engine_kwargs=spec.engine_kwargs,
)
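A worked example of the resolution pipeline (parse, prefix, macro expansion, path check), matching the unit tests further down; the snapshot hash is illustrative:

from datetime import datetime, timezone

from argus.service.serving_spec import parse_serving_spec, resolve_serving_spec

spec = parse_serving_spec(
    {
        "model_id": "qwen-0.5b",
        "model_source": "$HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/abc",
    }
)
resolved = resolve_serving_spec(
    spec=spec,
    user_id="alice",
    now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc),
)
assert resolved.model_id == "alice-202601061235-qwen-0.5b"
assert resolved.model_source.startswith("/private/hf/hub/")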

View File

@ -112,6 +112,7 @@ def _nav(active: str) -> str:
links = [
("login", "/ui/login", "Login"),
("tasks", "/ui/tasks", "Tasks"),
("serving", "/ui/serving", "Serving"),
("new", "/ui/tasks/new", "New Task"),
("data", "/ui/data", "Data"),
("admin", "/ui/admin", "Admin"),
@ -992,6 +993,253 @@ refresh();
""".strip()
return HTMLResponse(content=_page(f"Logs {task_id}", "tasks", body, script))
@app.get("/ui/serving")
async def ui_serving() -> HTMLResponse:
body = """
<h1>Serving</h1>
<div class="card">
<div class="row">
<button class="btn" id="refresh">Refresh</button>
<a class="btn" href="/ui/serving/new" style="display:inline-block">New Model</a>
<a class="btn" id="openai-models" target="_blank" rel="noopener" href="#">OpenAI /v1/models</a>
</div>
<div style="height:10px"></div>
<div id="out" class="muted">Loading...</div>
</div>
""".strip()
script = """
document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265);
document.getElementById("openai-models").href = curOriginWithPort(8000) + "/v1/models";
const out = document.getElementById("out");
function pill(state) {
const s = String(state || "");
if (s === "RUNNING") return `<span class="pill ok">${s}</span>`;
if (s === "FAILED") return `<span class="pill bad">${s}</span>`;
return `<span class="pill">${s}</span>`;
}
async function refresh() {
out.textContent = "Loading...";
try {
const lim = 50;
const off = Number(localStorage.getItem("mvp_serving_offset") || "0") || 0;
const resp = await apiJson("/api/v2/serve/models?limit=" + lim + "&offset=" + off + "&include_deleted=0");
const items = resp.items || [];
const hasMore = !!resp.has_more;
const pageNo = Math.floor(off / lim) + 1;
const prevDisabled = off <= 0;
const nextDisabled = !hasMore;
function row(m) {
return `<tr>
<td><a href="/ui/serving/${m.model_key}">${m.model_key}</a></td>
<td><code>${m.model_id}</code></td>
<td>${pill(m.state)}</td>
<td>${m.num_replicas} × ${m.gpus_per_replica} GPU</td>
<td>${m.updated_at || ""}</td>
</tr>`;
}
const rows = items.map(row).join("");
out.innerHTML = `
<div class="row" style="justify-content: space-between; margin-bottom: 8px;">
<div class="muted">OpenAI base: <code>${resp.openai_base_url || curOriginWithPort(8000) + "/v1"}</code></div>
<div class="row">
<span class="muted">Page ${pageNo}</span>
<button class="btn" id="prev" ${prevDisabled ? "disabled" : ""}>Prev</button>
<button class="btn" id="next" ${nextDisabled ? "disabled" : ""}>Next</button>
</div>
</div>
<table>
<thead><tr><th>Model Key</th><th>Model ID</th><th>State</th><th>Resources</th><th>Updated</th></tr></thead>
<tbody>${rows || "<tr><td colspan=5 class=muted>(none)</td></tr>"}</tbody>
</table>
`;
const prevBtn = document.getElementById("prev");
const nextBtn = document.getElementById("next");
if (prevBtn) prevBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(Math.max(0, off - lim))); refresh(); };
if (nextBtn) nextBtn.onclick = () => { localStorage.setItem("mvp_serving_offset", String(off + lim)); refresh(); };
} catch (e) {
let text = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
if (e.body && String(e.body).includes("serving is not enabled")) {
text = "Serving is not enabled in server config.\\nAsk admin to enable `serving:` in dev.yaml.";
}
out.textContent = text;
}
}
document.getElementById("refresh").onclick = refresh;
refresh();
""".strip()
return HTMLResponse(content=_page("Serving", "serving", body, script))
@app.get("/ui/serving/new")
async def ui_serving_new() -> HTMLResponse:
example = """# ServingSpec (YAML)
# Notes:
# - model_id: this is the suffix; the platform automatically adds a prefix, producing <user_id>-<YYYYMMDDHHMM>-<suffix>
# - model_source: local model path (supports $HOME macros; $HOME/common/hf pointing at the shared HF cache is recommended)
#
# Common paths:
# - $HOME/common/hf -> /private/hf
# - $HOME -> /private/users/<user_id>
#
model_id: qwen-0.5b
model_source: $HOME/common/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/<SNAPSHOT_HASH>
num_replicas: 1
gpus_per_replica: 1
# engine_kwargs: # optional: passed through to vLLM
# gpu_memory_utilization: 0.4
""".strip()
body = f"""
<h1>New Model</h1>
<div class="card">
<div class="muted">Paste ServingSpec YAML and submit to <code>/api/v2/serve/models</code>.</div>
<div style="height:10px"></div>
<textarea id="yaml" rows="14">{html.escape(example)}</textarea>
<div style="height:10px"></div>
<div class="row">
<button class="btn" id="submit">Submit</button>
<a class="btn" href="/ui/serving" style="display:inline-block">Back</a>
</div>
<div style="height:10px"></div>
<pre id="out" class="muted"></pre>
</div>
""".strip()
script = """
document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265);
const out = document.getElementById("out");
document.getElementById("submit").onclick = async () => {
out.textContent = "Submitting...";
const yaml = document.getElementById("yaml").value || "";
try {
const resp = await apiJson("/api/v2/serve/models", { method: "POST", headers: { "Content-Type": "application/yaml" }, body: yaml });
out.textContent = "Created: " + resp.model_key + "\\nState: " + resp.state;
if (resp.model_key) window.location.href = "/ui/serving/" + encodeURIComponent(resp.model_key);
} catch (e) {
out.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
}
};
""".strip()
return HTMLResponse(content=_page("New Model", "serving", body, script))
@app.get("/ui/serving/{model_key}")
async def ui_serving_detail(model_key: str) -> HTMLResponse:
body = f"""
<h1>Model</h1>
<div class="card">
<div class="row" style="justify-content: space-between;">
<div class="muted">model_key: <code>{html.escape(model_key)}</code></div>
<div class="row">
<a class="btn" href="/ui/serving" style="display:inline-block">Back</a>
<a class="btn" id="openai-models" target="_blank" rel="noopener" href="#">OpenAI /v1/models</a>
</div>
</div>
<div style="height:10px"></div>
<div class="row">
<label class="muted" style="min-width:120px">Scale replicas</label>
<input id="replicas" type="number" min="1" step="1" value="1" style="max-width: 180px" />
<button class="btn" id="scale">Apply</button>
<button class="btn danger" id="delete">Delete</button>
</div>
<div style="height:10px"></div>
<div id="meta" class="muted">Loading...</div>
<div style="height:12px"></div>
<h3 style="margin-top:0">Resolved Spec (YAML)</h3>
<pre id="spec" class="muted">(loading)</pre>
<div style="height:12px"></div>
<h3 style="margin-top:0">Events</h3>
<div id="events" class="muted">(loading)</div>
<div style="height:12px"></div>
<h3 style="margin-top:0">OpenAI Example</h3>
<pre id="example" class="muted">(loading)</pre>
</div>
""".strip()
script = f"""
document.getElementById("nav-ray-dashboard").href = curOriginWithPort(8265);
document.getElementById("openai-models").href = curOriginWithPort(8000) + "/v1/models";
const modelKey = {json.dumps(model_key)};
const meta = document.getElementById("meta");
const spec = document.getElementById("spec");
const eventsEl = document.getElementById("events");
const example = document.getElementById("example");
const replicas = document.getElementById("replicas");
function pill(state) {{
const s = String(state || "");
if (s === "RUNNING") return `<span class="pill ok">${{s}}</span>`;
if (s === "FAILED") return `<span class="pill bad">${{s}}</span>`;
return `<span class="pill">${{s}}</span>`;
}}
function renderEvents(events) {{
if (!events || !events.length) return "<div class=muted>(none)</div>";
const rows = events.map(e => {{
const payload = (e.payload_json || "");
const short = String(payload).length > 240 ? String(payload).slice(0, 240) + "..." : String(payload);
return `<tr><td>${{e.created_at || ""}}</td><td><code>${{e.event_type}}</code></td><td><pre class=muted style=\\"margin:0\\">${{short}}</pre></td></tr>`;
}}).join("");
return `<table><thead><tr><th>Time</th><th>Type</th><th>Payload</th></tr></thead><tbody>${{rows}}</tbody></table>`;
}}
async function refresh() {{
meta.textContent = "Loading...";
spec.textContent = "(loading)";
eventsEl.textContent = "(loading)";
example.textContent = "(loading)";
try {{
const obj = await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey));
const m = obj.model || {{}};
replicas.value = String(m.num_replicas || 1);
meta.innerHTML = `
<div class=row>
<div>state: ${{pill(m.state)}}</div>
<div class=muted>model_id: <code>${{m.model_id || ""}}</code></div>
<div class=muted>source: <code>${{m.model_source || ""}}</code></div>
</div>
<div class=muted>endpoint: <code>${{(m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/v1")}}</code></div>
`;
spec.textContent = obj.resolved_spec_yaml || "";
eventsEl.innerHTML = renderEvents(obj.events || []);
const base = (m.endpoint && m.endpoint.openai_base_url) || (curOriginWithPort(8000) + "/v1");
const mid = m.model_id || "";
example.textContent = `curl -sS -H 'Content-Type: application/json' -H 'Authorization: Bearer FAKE_KEY' \\\\\\n -X POST ${{base}}/chat/completions \\\\\\n --data-binary '{{\\"model\\":\\"${{mid}}\\",\\"messages\\":[{{\\"role\\":\\"user\\",\\"content\\":\\"hello\\"}}],\\"max_tokens\\":16,\\"stream\\":false}}' | python3 -m json.tool`;
}} catch (e) {{
meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
spec.textContent = "";
eventsEl.textContent = "";
example.textContent = "";
}}
}}
document.getElementById("scale").onclick = async () => {{
const n = Number(replicas.value || "1");
if (!Number.isFinite(n) || n < 1) return;
try {{
await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "PATCH", headers: {{ "Content-Type": "application/json" }}, body: JSON.stringify({{ num_replicas: n }}) }});
await refresh();
}} catch (e) {{
meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
}}
}};
document.getElementById("delete").onclick = async () => {{
if (!confirm("Delete this model?")) return;
try {{
await apiJson("/api/v2/serve/models/" + encodeURIComponent(modelKey), {{ method: "DELETE" }});
await refresh();
}} catch (e) {{
meta.textContent = "Error: " + (e.status || "") + "\\n" + (e.body || String(e));
}}
}};
refresh();
""".strip()
return HTMLResponse(content=_page("Model", "serving", body, script))
@app.get("/ui/data")
async def ui_data() -> HTMLResponse:
body = """

View File

@ -0,0 +1,282 @@
from __future__ import annotations
from pathlib import Path
import yaml
from fastapi.testclient import TestClient
def _write_config(tmp_path: Path) -> Path:
cfg = {
"ray": {
"address": "http://127.0.0.1:8265",
"shared_root": "/private",
"entrypoint_resources": {"worker_node": 1},
"runtime_env": {"env_vars": {}},
},
"data": {
"user_root": str(tmp_path / "users"),
},
"service": {
"api": {"host": "127.0.0.1", "port": 0},
"auth": {"token_env": "MVP_INTERNAL_TOKEN"},
"sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")},
"scheduler": {"tick_s": 1, "retry_interval_s": 1, "max_running_tasks": 1},
},
"serving": {
"serve": {"http_port": 8000, "proxy_location": "HeadOnly"},
"llm": {"accelerator_type": "H20"},
},
}
p = tmp_path / "cfg.yaml"
p.write_text(yaml.safe_dump(cfg), encoding="utf-8")
return p
def test_serving_api_crud_flow(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
r = c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
assert r.status_code == 200
r2 = c.post("/api/v2/users/alice/tokens", headers=admin_headers)
assert r2.status_code == 200
user_token = r2.json()["token"]
headers = {"authorization": f"Bearer {user_token}"}
spec_yaml = (
"model_id: qwen-0.5b\n"
"model_source: $HOME/common/hf/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha\n"
"num_replicas: 1\n"
"gpus_per_replica: 1\n"
)
r3 = c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
assert r3.status_code == 200
assert r3.json()["model_key"] == "mk-alice"
assert r3.json()["state"] == "QUEUED"
r4 = c.get("/api/v2/serve/models?limit=10&offset=0", headers=headers)
assert r4.status_code == 200
obj = r4.json()
assert obj["openai_base_url"] == "http://testserver:8000/v1"
assert len(obj["items"]) == 1
assert obj["items"][0]["model_key"] == "mk-alice"
r5 = c.get("/api/v2/serve/models/mk-alice", headers=headers)
assert r5.status_code == 200
detail = r5.json()
assert detail["model"]["model_key"] == "mk-alice"
assert "model_id_prefix" in detail["model"]
assert "resolved_spec_yaml" in detail
assert isinstance(detail.get("events"), list)
r6 = c.patch("/api/v2/serve/models/mk-alice", headers=headers, json={"num_replicas": 2})
assert r6.status_code == 200
assert r6.json()["state"] == "QUEUED"
r7 = c.delete("/api/v2/serve/models/mk-alice", headers=headers)
assert r7.status_code == 200
assert r7.json()["state"] == "DELETING"
# Admin status endpoint
r8 = c.get("/api/v2/serve/status", headers=admin_headers)
assert r8.status_code == 200
assert r8.json()["http_port"] == 8000
def test_serving_api_rejects_path_outside_user_and_hf(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
r2 = c.post("/api/v2/users/alice/tokens", headers=admin_headers)
user_token = r2.json()["token"]
headers = {"authorization": f"Bearer {user_token}"}
spec_yaml = (
"model_id: x\n"
"model_source: /private/users/bob/models/evil\n"
"num_replicas: 1\n"
"gpus_per_replica: 1\n"
)
r3 = c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
assert r3.status_code == 403
def test_serving_api_invalid_yaml_and_non_mapping(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
app = app_mod.create_app(str(cfg_path))
with TestClient(app) as c:
# Create a user token
admin_headers = {"authorization": "Bearer admin-token"}
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
headers = {"authorization": f"Bearer {token}"}
r = c.post("/api/v2/serve/models", headers=headers, data=": bad\n")
assert r.status_code == 400
r2 = c.post("/api/v2/serve/models", headers=headers, data="- 1\n- 2\n")
assert r2.status_code == 400
def test_serving_api_engine_kwargs_binary_rejected(tmp_path: Path, monkeypatch):
"""
yaml !!binary is parsed as bytes, which is not JSON-serializable.
"""
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: f"mk-{user_id}")
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
headers = {"authorization": f"Bearer {token}"}
spec_yaml = (
"model_id: x\n"
"model_source: $HOME/common/hf/x\n"
"engine_kwargs:\n"
" blob: !!binary \"AQID\"\n"
)
r = c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
assert r.status_code == 400
def test_serving_api_list_include_deleted_and_forwarded_base_url(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
from argus.service.config import V2Config
from argus.service.db import Db
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
keys = iter(["mk-alice-1", "mk-alice-2"])
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: next(keys))
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
headers = {"authorization": f"Bearer {token}"}
spec_yaml = "model_id: x\nmodel_source: $HOME/common/hf/x\n"
c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
c.post("/api/v2/serve/models", headers=headers, data=spec_yaml)
# Mark one model as DELETED directly in DB (sets deleted_at).
root = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
v2_cfg = V2Config.from_root_dict(root)
db = Db(v2_cfg.sqlite.db_path)
db.set_serve_model_state(model_key="mk-alice-2", state="DELETED")
r1 = c.get(
"/api/v2/serve/models?limit=10&offset=0&include_deleted=0",
headers={**headers, "x-forwarded-host": "example.com:8080", "x-forwarded-proto": "https"},
)
assert r1.status_code == 200
assert r1.json()["openai_base_url"] == "https://example.com:8000/v1"
assert {m["model_key"] for m in r1.json()["items"]} == {"mk-alice-1"}
r2 = c.get("/api/v2/serve/models?include_deleted=1", headers=headers)
assert r2.status_code == 200
assert {m["model_key"] for m in r2.json()["items"]} == {"mk-alice-1", "mk-alice-2"}
def test_serving_api_patch_invalid_num_replicas(tmp_path: Path, monkeypatch):
from argus.service import app as app_mod
cfg_path = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
class _Scheduler:
def __init__(self, **kwargs):
self.tool = object()
def run_forever(self, stop_flag):
return None
monkeypatch.setattr(app_mod, "Scheduler", _Scheduler)
monkeypatch.setattr(app_mod, "new_model_key", lambda user_id: "mk-alice")
app = app_mod.create_app(str(cfg_path))
admin_headers = {"authorization": "Bearer admin-token"}
with TestClient(app) as c:
c.post("/api/v2/users", headers=admin_headers, json={"user_id": "alice"})
token = c.post("/api/v2/users/alice/tokens", headers=admin_headers).json()["token"]
headers = {"authorization": f"Bearer {token}"}
c.post("/api/v2/serve/models", headers=headers, data="model_id: x\nmodel_source: $HOME/common/hf/x\n")
r = c.patch("/api/v2/serve/models/mk-alice", headers=headers, json={"num_replicas": 0})
assert r.status_code == 422

View File

@ -0,0 +1,79 @@
from __future__ import annotations
import json
from pathlib import Path
def test_db_serving_model_crud_and_events(tmp_path: Path) -> None:
from argus.service.db import Db
db = Db(str(tmp_path / "mvp.sqlite3"))
db.init()
m1 = db.create_serve_model(
model_key="svc-001",
user_id="alice",
model_id_suffix="qwen-0.5b",
model_id_prefix="alice-202601061235",
model_id="alice-202601061235-qwen-0.5b",
model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha",
num_replicas=1,
gpus_per_replica=1,
engine_kwargs_json=json.dumps({"max_model_len": 8192}),
spec_yaml="model_id: qwen-0.5b\nmodel_source: $HOME/common/hf/...\n",
resolved_spec_yaml="model_id: alice-202601061235-qwen-0.5b\nmodel_source: /private/hf/...\n",
)
assert m1["model_key"] == "svc-001"
assert m1["state"] == "QUEUED"
# Same suffix may be created again; model_key is the identity.
m2 = db.create_serve_model(
model_key="svc-002",
user_id="alice",
model_id_suffix="qwen-0.5b",
model_id_prefix="alice-202601061236",
model_id="alice-202601061236-qwen-0.5b",
model_source="/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/sha",
num_replicas=1,
gpus_per_replica=2,
engine_kwargs_json=None,
spec_yaml="model_id: qwen-0.5b\nmodel_source: $HOME/common/hf/...\n",
resolved_spec_yaml="model_id: alice-202601061236-qwen-0.5b\nmodel_source: /private/hf/...\n",
)
assert m2["model_key"] == "svc-002"
assert m2["model_id"] != m1["model_id"]
got = db.get_serve_model("svc-001")
assert got is not None
assert got["gpus_per_replica"] == 1
items = db.list_serve_models(user_id="alice")
assert {i["model_key"] for i in items} == {"svc-001", "svc-002"}
# State transition writes a serve event.
db.set_serve_model_state(model_key="svc-001", state="DEPLOYING")
got2 = db.get_serve_model("svc-001")
assert got2 is not None
assert got2["state"] == "DEPLOYING"
events = db.list_serve_events("svc-001", limit=50)
assert len(events) >= 2
assert {e["event_type"] for e in events}.issuperset({"SERVE_MODEL_CREATED", "SERVE_STATE_UPDATE"})
# Reconciler pick: QUEUED/DELETING only.
picked = db.pick_next_runnable_serve_change()
assert picked is not None
assert picked["state"] == "QUEUED"
db.set_serve_model_state(model_key="svc-002", state="DELETING")
picked2 = db.pick_next_runnable_serve_change()
assert picked2 is not None
assert picked2["state"] in ("QUEUED", "DELETING")
# Deleted models are hidden unless include_deleted.
db.set_serve_model_state(model_key="svc-002", state="DELETED")
items2 = db.list_serve_models(user_id="alice", include_deleted=False)
assert {i["model_key"] for i in items2} == {"svc-001"}
items3 = db.list_serve_models(user_id="alice", include_deleted=True)
assert {i["model_key"] for i in items3} == {"svc-001", "svc-002"}

View File

@ -44,3 +44,32 @@ def test_attempt_submission_id_format():
assert attempt_submission_id("t", 1) == "t--a01"
assert attempt_submission_id("t", 12) == "t--a12"
def test_new_model_key_includes_user(monkeypatch):
import argus.core.ids as ids
class _FakeDatetime:
@staticmethod
def now():
class _DT:
def strftime(self, fmt: str) -> str:
assert fmt == "%Y%m%d-%H%M%S"
return "20250101-010203"
return _DT()
monkeypatch.setattr(ids, "datetime", _FakeDatetime)
monkeypatch.setattr(ids.secrets, "token_hex", lambda n: "abcd")
assert ids.new_model_key(user_id="Alice_01") == "mvp2-alice_01-serve-20250101-010203-abcd"
def test_new_model_key_requires_user_id():
from argus.core.ids import new_model_key
try:
new_model_key(user_id="")
assert False, "expected ValueError"
except ValueError as e:
assert "user_id is required" in str(e)

View File

@ -0,0 +1,78 @@
from __future__ import annotations
import pytest
def test_build_llm_config_dict_maps_tp_and_bundles():
from argus.service.serve_llm_config import build_llm_config_dict
from argus.service.serving_spec import ResolvedServingSpec
resolved = ResolvedServingSpec(
user_id="alice",
model_id_suffix="qwen-0.5b",
model_id_prefix="alice-202601061235",
model_id="alice-202601061235-qwen-0.5b",
model_source="/private/hf/x",
num_replicas=2,
gpus_per_replica=4,
engine_kwargs={"gpu_memory_utilization": 0.9},
)
cfg = build_llm_config_dict(
resolved,
accelerator_type="H20",
runtime_env_env_vars={"HF_ENDPOINT": "https://hf-mirror.com"},
cpu_per_gpu=2.0,
)
assert cfg["model_loading_config"]["model_id"] == "alice-202601061235-qwen-0.5b"
assert cfg["model_loading_config"]["model_source"] == "/private/hf/x"
assert cfg["accelerator_type"] == "H20"
assert cfg["deployment_config"]["num_replicas"] == 2
# gpus_per_replica -> tensor_parallel_size
assert cfg["engine_kwargs"]["tensor_parallel_size"] == 4
assert cfg["engine_kwargs"]["gpu_memory_utilization"] == 0.9
# resources_per_bundle reserves the full TP GPU set for each replica.
bundle = cfg["resources_per_bundle"]
assert bundle["GPU"] == 4.0
assert bundle["CPU"] == 8.0
def test_build_llm_config_dict_injects_hf_offline_defaults():
from argus.service.serve_llm_config import build_llm_config_dict
from argus.service.serving_spec import ResolvedServingSpec
resolved = ResolvedServingSpec(
user_id="alice",
model_id_suffix="x",
model_id_prefix="alice-202601061235",
model_id="alice-202601061235-x",
model_source="/private/users/alice/models/x",
num_replicas=1,
gpus_per_replica=1,
engine_kwargs=None,
)
cfg = build_llm_config_dict(resolved, accelerator_type="H20", runtime_env_env_vars={})
env = cfg["runtime_env"]["env_vars"]
assert env["HF_HUB_OFFLINE"] == "1"
assert env["HF_HOME"] == "/private/hf"
assert env["HUGGINGFACE_HUB_CACHE"].startswith("/private/hf/")
def test_build_llm_config_dict_requires_accelerator_type():
from argus.service.serve_llm_config import build_llm_config_dict
from argus.service.serving_spec import ResolvedServingSpec
resolved = ResolvedServingSpec(
user_id="alice",
model_id_suffix="x",
model_id_prefix="alice-202601061235",
model_id="alice-202601061235-x",
model_source="/private/hf/x",
num_replicas=1,
gpus_per_replica=1,
engine_kwargs=None,
)
with pytest.raises(ValueError, match="accelerator_type is required"):
build_llm_config_dict(resolved, accelerator_type="", runtime_env_env_vars={})

View File

@ -0,0 +1,55 @@
from __future__ import annotations
import sys
import types
def test_ray_serve_client_calls_start_run_status(monkeypatch):
import ray # provided by conftest stub
calls: list[tuple[str, object]] = []
def _init(*args, **kwargs):
calls.append(("ray.init", {"args": args, "kwargs": kwargs}))
monkeypatch.setattr(ray, "init", _init, raising=False)
serve = types.ModuleType("ray.serve")
def _start(**kwargs):
calls.append(("serve.start", kwargs))
return None
def _run(app, name=None, route_prefix=None):
calls.append(("serve.run", {"app": app, "name": name, "route_prefix": route_prefix}))
return {"deployed": True}
def _status():
calls.append(("serve.status", None))
return {"ok": True}
serve.start = _start # type: ignore[attr-defined]
serve.run = _run # type: ignore[attr-defined]
serve.status = _status # type: ignore[attr-defined]
sys.modules["ray.serve"] = serve
ray.serve = serve # type: ignore[attr-defined]
from argus.service.serve_client import RayServeClient
client = RayServeClient(http_port=8000, proxy_location="HeadOnly", ray_init_address="auto")
client.ensure_started()
out = client.apply_app(app="APP", app_name="argus_llm_app", route_prefix="/")
st = client.get_status()
assert out == {"deployed": True}
assert st == {"ok": True}
# Verify call order and key args.
assert calls[0][0] == "ray.init"
assert calls[0][1]["kwargs"].get("ignore_reinit_error") is True
assert calls[1][0] == "serve.start"
assert calls[1][1]["http_options"]["port"] == 8000
assert calls[2][0] == "serve.run"
assert calls[2][1]["name"] == "argus_llm_app"
assert calls[3][0] == "serve.status"

View File

@ -23,6 +23,7 @@ def test_v2_config_from_root_dict_new_format_defaults():
assert cfg.sqlite.db_path.endswith(".sqlite3")
assert cfg.scheduler.max_running_tasks == 3
assert cfg.tracking.wandb.enabled is False
assert cfg.serving.enabled is False
def test_v2_config_backward_compat_v2_section_and_default_db_path():
@ -57,6 +58,27 @@ def test_v2_config_requires_data_mappings():
V2Config.from_root_dict({**base, "data": {"sftpgo": ["x"], "retention": {}}})
def test_v2_config_requires_tracking_and_serving_mappings():
from argus.service.config import V2Config
base = {
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
}
with pytest.raises(ValueError, match="config\\.tracking must be a mapping"):
V2Config.from_root_dict({**base, "tracking": ["nope"]})
with pytest.raises(ValueError, match="config\\.tracking\\.wandb must be a mapping"):
V2Config.from_root_dict({**base, "tracking": {"wandb": ["nope"]}})
with pytest.raises(ValueError, match="config\\.serving must be a mapping"):
V2Config.from_root_dict({**base, "serving": ["nope"]})
with pytest.raises(ValueError, match="config\\.serving\\.\\{serve,llm\\} must be mappings"):
V2Config.from_root_dict({**base, "serving": {"serve": ["x"], "llm": {}}})
def test_tracking_wandb_defaults_disabled():
from argus.service.config import V2Config

View File

@ -0,0 +1,23 @@
from __future__ import annotations
from datetime import datetime, timezone
import pytest
from argus.service.serving_spec import make_model_id_prefix
def test_make_model_id_prefix_uses_utc_minutes():
dt = datetime(2026, 1, 6, 12, 35, 59, tzinfo=timezone.utc)
assert make_model_id_prefix(user_id="alice", now_utc=dt) == "alice-202601061235"
def test_make_model_id_prefix_rejects_empty_user_id():
with pytest.raises(ValueError, match="user_id is required"):
make_model_id_prefix(user_id="", now_utc=datetime.now(timezone.utc))
def test_make_model_id_prefix_rejects_slash():
with pytest.raises(ValueError, match="must not contain"):
make_model_id_prefix(user_id="bad/user", now_utc=datetime.now(timezone.utc))

View File

@ -0,0 +1,207 @@
from __future__ import annotations
import json
from pathlib import Path
class _FakeServeClient:
def __init__(self):
self.started = 0
self.applied = []
self.status_calls = 0
self.fail_apply = False
self.fail_status = False
def ensure_started(self) -> None:
self.started += 1
def apply_app(self, *, app, app_name: str, route_prefix: str = "/"):
if self.fail_apply:
raise RuntimeError("boom")
self.applied.append({"app": app, "app_name": app_name, "route_prefix": route_prefix})
return {"ok": True}
def get_status(self):
self.status_calls += 1
if self.fail_status:
raise RuntimeError("status boom")
return {"ok": True}
def _seed_model(db, *, model_key: str, user_id: str, state: str, num_replicas: int = 1, gpus_per_replica: int = 1):
spec_yaml = "model_id: x\nmodel_source: $HOME/common/hf/x\n"
resolved_yaml = f"user_id: {user_id}\nmodel_id: {user_id}-202601061235-x\n"
db.create_serve_model(
model_key=model_key,
user_id=user_id,
model_id_suffix="x",
model_id_prefix=f"{user_id}-202601061235",
model_id=f"{user_id}-202601061235-x",
model_source="/private/hf/x",
num_replicas=num_replicas,
gpus_per_replica=gpus_per_replica,
engine_kwargs_json=json.dumps({"gpu_memory_utilization": 0.9}),
spec_yaml=spec_yaml,
resolved_spec_yaml=resolved_yaml,
)
db.set_serve_model_state(model_key=model_key, state=state, event_type="TEST_SEED")
def test_reconciler_skips_when_no_changes(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
root = {
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
cfg = V2Config.from_root_dict(root)
db = Db(cfg.sqlite.db_path)
db.init()
client = _FakeServeClient()
rec = ServingReconciler(db=db, v2_cfg=cfg, ray_runtime_env_env_vars={}, serve_client=client, get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})())
rec.tick()
assert client.started == 0
assert client.applied == []
def test_reconciler_pending_resources_no_apply(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
cfg = V2Config.from_root_dict(
{
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
)
db = Db(cfg.sqlite.db_path)
db.init()
_seed_model(db, model_key="mk1", user_id="alice", state="QUEUED", num_replicas=2, gpus_per_replica=4)
client = _FakeServeClient()
rec = ServingReconciler(
db=db,
v2_cfg=cfg,
ray_runtime_env_env_vars={},
serve_client=client,
get_available_fn=lambda: type("A", (), {"total_available_gpus": 1, "total_available_npus": 0})(),
)
rec.tick()
# Serve may be started even when resources are insufficient, but apply should not happen.
assert client.started == 1
assert client.applied == []
# State remains QUEUED.
row = db.get_serve_model("mk1")
assert row and row["state"] == "QUEUED"
ev = db.list_serve_events("mk1", limit=50)
assert any(e["event_type"] == "SERVE_PENDING_RESOURCES" for e in ev)
def test_reconciler_apply_success_marks_running(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
cfg = V2Config.from_root_dict(
{
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
)
db = Db(cfg.sqlite.db_path)
db.init()
_seed_model(db, model_key="mk1", user_id="alice", state="QUEUED", num_replicas=1, gpus_per_replica=1)
client = _FakeServeClient()
rec = ServingReconciler(
db=db,
v2_cfg=cfg,
ray_runtime_env_env_vars={"HF_ENDPOINT": "https://hf-mirror.com"},
serve_client=client,
get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})(),
)
rec.tick()
assert client.started == 1
assert len(client.applied) == 1
applied = client.applied[0]["app"]["llm_configs"]
assert applied[0]["engine_kwargs"]["tensor_parallel_size"] == 1
assert applied[0]["runtime_env"]["env_vars"]["HF_HUB_OFFLINE"] == "1"
row = db.get_serve_model("mk1")
assert row and row["state"] == "RUNNING"
def test_reconciler_delete_removes_and_marks_deleted(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
cfg = V2Config.from_root_dict(
{
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
)
db = Db(cfg.sqlite.db_path)
db.init()
_seed_model(db, model_key="keep", user_id="alice", state="RUNNING", num_replicas=1, gpus_per_replica=1)
_seed_model(db, model_key="del", user_id="alice", state="DELETING", num_replicas=1, gpus_per_replica=1)
client = _FakeServeClient()
rec = ServingReconciler(
db=db,
v2_cfg=cfg,
ray_runtime_env_env_vars={},
serve_client=client,
get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})(),
)
rec.tick()
assert len(client.applied) == 1
cfgs = client.applied[0]["app"]["llm_configs"]
assert {c["model_loading_config"]["model_id"] for c in cfgs} == {"alice-202601061235-x"} # only keep remains
row = db.get_serve_model("del")
assert row and row["state"] == "DELETED"
assert row.get("deleted_at")
def test_reconciler_apply_failure_marks_failed(tmp_path: Path):
from argus.service.config import V2Config
from argus.service.db import Db
from argus.service.serving_reconciler import ServingReconciler
cfg = V2Config.from_root_dict(
{
"ray": {"shared_root": "/private"},
"service": {"api": {}, "auth": {}, "sqlite": {"db_path": str(tmp_path / "mvp.sqlite3")}, "scheduler": {}},
"data": {"sftpgo": {}, "retention": {}},
"serving": {"serve": {"http_port": 8000}, "llm": {"accelerator_type": "H20"}},
}
)
db = Db(cfg.sqlite.db_path)
db.init()
_seed_model(db, model_key="mk1", user_id="alice", state="QUEUED")
client = _FakeServeClient()
client.fail_apply = True
rec = ServingReconciler(
db=db,
v2_cfg=cfg,
ray_runtime_env_env_vars={},
serve_client=client,
get_available_fn=lambda: type("A", (), {"total_available_gpus": 8, "total_available_npus": 0})(),
)
rec.tick()
row = db.get_serve_model("mk1")
assert row and row["state"] == "FAILED"
assert row.get("error_summary")

View File

@ -0,0 +1,47 @@
from __future__ import annotations
from datetime import datetime, timezone
import pytest
from argus.service.serving_spec import ServingSpec, resolve_serving_spec
def test_expand_home_macro_and_validate_user_path_ok():
spec = ServingSpec(
model_id="qwen-0.5b",
model_source="$HOME/models/my_model",
num_replicas=1,
gpus_per_replica=1,
)
r = resolve_serving_spec(spec=spec, user_id="alice", now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc))
assert r.model_source == "/private/users/alice/models/my_model"
assert r.model_id == "alice-202601061235-qwen-0.5b"
def test_expand_common_hf_macro_ok():
spec = ServingSpec(
model_id="qwen-0.5b",
model_source="$HOME/common/hf/hub/models--Qwen--Qwen2.5/snapshots/abc",
num_replicas=1,
gpus_per_replica=1,
)
r = resolve_serving_spec(spec=spec, user_id="alice", now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc))
assert r.model_source.startswith("/private/hf/")
@pytest.mark.parametrize(
"src",
[
"/etc/passwd",
"relative/path",
"/private/users/bob/models/x",
"/private/users/alice/../bob/x",
"/private/common/hf/x",
],
)
def test_model_source_path_rejected(src: str):
spec = ServingSpec(model_id="qwen-0.5b", model_source=src, num_replicas=1, gpus_per_replica=1)
with pytest.raises((ValueError, PermissionError)):
resolve_serving_spec(spec=spec, user_id="alice", now_utc=datetime(2026, 1, 6, 12, 35, tzinfo=timezone.utc))

View File

@ -0,0 +1,72 @@
from __future__ import annotations
import pytest
from argus.service.serving_spec import ServingSpec, parse_serving_spec, validate_model_id_suffix
@pytest.mark.parametrize(
"suffix",
[
"a",
"qwen-0.5b",
"Qwen2.5-0.5B",
"a_b",
"a.b-c",
"a" * 64,
],
)
def test_validate_model_id_suffix_accepts(suffix: str):
validate_model_id_suffix(suffix)
@pytest.mark.parametrize(
"suffix",
[
"",
" a",
"a ",
"-bad",
".bad",
"bad/",
"bad..",
"bad\n",
"bad\t",
"a" * 65,
],
)
def test_validate_model_id_suffix_rejects(suffix: str):
with pytest.raises(ValueError):
validate_model_id_suffix(suffix)
def test_parse_serving_spec_smoke_defaults():
spec = parse_serving_spec(
{
"model_id": "qwen-0.5b",
"model_source": "/private/hf/x",
}
)
assert isinstance(spec, ServingSpec)
assert spec.num_replicas == 1
assert spec.gpus_per_replica == 1
assert spec.engine_kwargs is None
def test_parse_serving_spec_rejects_missing_fields():
with pytest.raises(ValueError, match="missing required field: model_id"):
parse_serving_spec({"model_source": "/private/hf/x"})
with pytest.raises(ValueError, match="missing required field: model_source"):
parse_serving_spec({"model_id": "x"})
def test_parse_serving_spec_rejects_bad_types():
with pytest.raises(ValueError, match="serving spec must be a mapping"):
parse_serving_spec(["nope"])
with pytest.raises(ValueError, match="num_replicas"):
parse_serving_spec({"model_id": "x", "model_source": "/private/hf/x", "num_replicas": 0})
with pytest.raises(ValueError, match="gpus_per_replica"):
parse_serving_spec({"model_id": "x", "model_source": "/private/hf/x", "gpus_per_replica": 0})
with pytest.raises(ValueError, match="engine_kwargs"):
parse_serving_spec({"model_id": "x", "model_source": "/private/hf/x", "engine_kwargs": "nope"})

View File

@ -42,10 +42,13 @@ def test_ui_routes_render_200(tmp_path, monkeypatch):
"/ui/login",
"/ui/tasks",
"/ui/tasks/new",
"/ui/serving",
"/ui/serving/new",
"/ui/data",
"/ui/admin",
"/ui/tasks/any-task-id",
"/ui/tasks/any-task-id/logs",
"/ui/serving/any-model-key",
):
r = c.get(path, allow_redirects=True)
assert r.status_code == 200
@ -60,7 +63,7 @@ def test_ui_contains_sidebar_links(tmp_path, monkeypatch):
r = c.get("/ui/tasks")
assert r.status_code == 200
for link in ("/ui/tasks", "/ui/tasks/new", "/ui/data", "/ui/login", "/ui/admin"):
for link in ("/ui/tasks", "/ui/tasks/new", "/ui/serving", "/ui/data", "/ui/login", "/ui/admin"):
assert link in r.text
assert "Ray Dashboard" in r.text

View File

@ -0,0 +1,56 @@
from __future__ import annotations
from pathlib import Path
from fastapi.testclient import TestClient
from argus.service.app import create_app
def _write_config(tmp_path: Path) -> Path:
p = tmp_path / "cfg.yaml"
p.write_text(
"""
ray:
address: "http://127.0.0.1:8265"
shared_root: "/private"
entrypoint_num_cpus: 1
entrypoint_resources: { worker_node: 1 }
runtime_env: { env_vars: { PYTHONUNBUFFERED: "1" } }
service:
api: { host: "127.0.0.1", port: 8080 }
auth: { token_env: "MVP_INTERNAL_TOKEN" }
sqlite: { db_path: "%(db)s" }
data:
user_root: "%(users)s"
sftpgo: { enabled: false }
retention: { jobs_trash_after_days: 3, jobs_purge_after_days: 7, janitor_interval_s: 3600 }
serving: {}
"""
% {"db": str(tmp_path / "mvp.sqlite3"), "users": str(tmp_path / "users")}
)
return p
def test_ui_serving_pages_render(tmp_path, monkeypatch):
cfg = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
app = create_app(str(cfg))
c = TestClient(app)
for path in ("/ui/serving", "/ui/serving/new", "/ui/serving/any-model-key"):
r = c.get(path)
assert r.status_code == 200
assert "<html" in r.text.lower()
def test_ui_serving_contains_openai_port_8000(tmp_path, monkeypatch):
cfg = _write_config(tmp_path)
monkeypatch.setenv("MVP_INTERNAL_TOKEN", "admin-token")
app = create_app(str(cfg))
c = TestClient(app)
r = c.get("/ui/serving")
assert r.status_code == 200
assert "curOriginWithPort(8000)" in r.text
assert "/v1/models" in r.text

View File

@ -11,10 +11,11 @@ fi
echo "[host] docker compose up -d (mvp)"
BUILD="${BUILD:-0}"
RAY_NODE_IMAGE="${RAY_NODE_IMAGE:-argus/argus-ray-node:vllm011.latest}"
# If the image isn't present locally, force build once.
if [[ "${BUILD}" != "1" ]]; then
if ! docker image inspect argus/argus-ray-node:v2.5 >/dev/null 2>&1; then
if ! docker image inspect "${RAY_NODE_IMAGE}" >/dev/null 2>&1; then
BUILD="1"
fi
fi

View File

@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
container="${MVP_HEAD_CONTAINER:-argus-ray-head}"
model_source="${MODEL_SOURCE:-}"
if [[ -n "${1:-}" ]]; then
model_source="$1"
fi
argv=(python3 /workspace/mvp/scripts/serve_llm_smoke.py)
if [[ -n "${model_source}" ]]; then
argv+=(--model-source "${model_source}")
fi
argv+=(--accelerator-type "${ARGUS_ACCELERATOR_TYPE:-H20}")
echo "[host] run Ray Serve LLM smoke test in container: ${container}" >&2
docker exec -it "${container}" bash -lc "$(printf '%q ' "${argv[@]}")"

View File

@ -0,0 +1,193 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=lib.sh
source "${SCRIPT_DIR}/lib.sh"
API_ADDR="${API_ADDR:-http://127.0.0.1:8080}"
OPENAI_BASE_URL="${OPENAI_BASE_URL:-http://127.0.0.1:8000/v1}"
ADMIN_TOKEN="${MVP_INTERNAL_TOKEN:-}"
USER_ID="${USER_ID:-alice}"
EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers
CONFIG_IN_CONTAINER="${CONFIG_IN_CONTAINER:-/workspace/mvp/configs/dev.yaml}"
SFTPGO_ADMIN_PASSWORD="${SFTPGO_ADMIN_PASSWORD:-my-dev-sftpgo-admin}"
export SFTPGO_ADMIN_PASSWORD
if [[ -z "${ADMIN_TOKEN}" ]]; then
echo "ERROR: MVP_INTERNAL_TOKEN must be set in host env (admin token)" >&2
exit 1
fi
api_curl_admin() {
curl -sS -H "Authorization: Bearer ${ADMIN_TOKEN}" "$@"
}
api_wait_ready() {
local tries="${1:-60}"
for i in $(seq 1 "${tries}"); do
if curl -sS -m 2 "${API_ADDR}/docs" >/dev/null 2>&1; then
echo "[host] api_ready: ${API_ADDR}"
return 0
fi
echo "[host] waiting api... (${i}/${tries})"
sleep 2
done
echo "ERROR: api not ready: ${API_ADDR}" >&2
return 1
}
ray_wait_ready() {
local tries="${1:-60}"
for i in $(seq 1 "${tries}"); do
if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}/api/version" >/dev/null 2>&1; then
echo "[host] ray_dashboard_ready: ${RAY_DASHBOARD_ADDR}"
return 0
fi
echo "[host] waiting ray dashboard... (${i}/${tries})"
sleep 2
done
echo "ERROR: ray dashboard not ready: ${RAY_DASHBOARD_ADDR}" >&2
return 1
}
# Wait until at least $want alive Ray nodes are visible from inside the head container.
ray_wait_nodes() {
local want="${1:-3}"
local tries="${2:-60}"
for i in $(seq 1 "${tries}"); do
local out n
out="$(docker exec -i "${HEAD_CONTAINER}" python3 -c "import ray; ray.init(address='auto', ignore_reinit_error=True, log_to_driver=False, logging_level='ERROR'); print(sum(1 for n in ray.nodes() if n.get('Alive')))" 2>/dev/null || true)"
n="$(printf '%s\n' "${out}" | tail -n 1 | tr -cd '0-9' || true)"
if [[ "${n}" =~ ^[0-9]+$ ]]; then
echo "[host] ray_nodes_alive=${n} (want>=${want})"
if [[ "${n}" -ge "${want}" ]]; then
return 0
fi
else
echo "[host] waiting ray nodes... (${i}/${tries})"
fi
sleep 2
done
echo "ERROR: ray nodes not ready (want>=${want})" >&2
docker exec -i "${HEAD_CONTAINER}" bash -lc "ray status || true" >&2 || true
return 1
}
openai_wait_ready() {
local tries="${1:-120}"
for i in $(seq 1 "${tries}"); do
if curl -sS -m 2 "${OPENAI_BASE_URL}/models" >/dev/null 2>&1; then
echo "[host] openai_ready: ${OPENAI_BASE_URL}"
return 0
fi
echo "[host] waiting openai... (${i}/${tries})"
sleep 2
done
echo "ERROR: openai not ready: ${OPENAI_BASE_URL}" >&2
return 1
}
# Poll the serving API until the model reaches the wanted state; bail out early if it reports FAILED.
wait_model_state() {
local token="$1"
local model_key="$2"
local want="$3"
local tries="${4:-120}"
for i in $(seq 1 "${tries}"); do
local body state
body="$(curl -sS -H "Authorization: Bearer ${token}" "${API_ADDR}/api/v2/serve/models/${model_key}")"
state="$(printf '%s' "${body}" | python3 -c 'import sys,json; print(json.load(sys.stdin)["model"]["state"])' 2>/dev/null || true)"
echo "[host] model ${model_key}: ${state}"
if [[ "${state}" == "${want}" ]]; then
return 0
fi
if [[ "${state}" == "FAILED" ]]; then
echo "[host] model failed; detail:" >&2
printf '%s\n' "${body}" | python3 -m json.tool >&2 || true
return 1
fi
sleep 2
done
echo "ERROR: model not in state ${want} after timeout" >&2
return 1
}
echo "[host] ===== run_all_v38_serving.sh begin ====="
"${SCRIPT_DIR}/00_prereq_check.sh"
"${SCRIPT_DIR}/03_cleanup_v1_legacy.sh"
"${SCRIPT_DIR}/04_cleanup_v2_legacy.sh"
echo "[host] bring down existing containers (best-effort)"
"${SCRIPT_DIR}/02_down.sh" || true
echo "[host] (re)create containers (Ray + SFTPGo + W&B)"
# For v3.8, we need the latest ray-node image (ray[llm] deps). Force build once.
BUILD="${BUILD:-1}" "${SCRIPT_DIR}/01_up.sh"
echo "[host] wait ray ready"
ray_wait_ready 60
ray_wait_nodes "${EXPECTED_RAY_NODES}" 120
echo "[host] prepare data/model (best-effort; uses shared caches)"
"${SCRIPT_DIR}/30_prepare_data_and_model.sh" || true
echo "[host] start api"
CONFIG_IN_CONTAINER="${CONFIG_IN_CONTAINER}" MVP_INTERNAL_TOKEN="${ADMIN_TOKEN}" "${SCRIPT_DIR}/60_start_api.sh"
api_wait_ready 60
echo "[host] create user (idempotent)"
api_curl_admin -X POST "${API_ADDR}/api/v2/users" -H "Content-Type: application/json" --data-binary "{\"user_id\":\"${USER_ID}\"}" >/dev/null || true
echo "[host] issue user token"
USER_TOKEN="$(api_curl_admin -X POST "${API_ADDR}/api/v2/users/${USER_ID}/tokens" | python3 -c 'import sys,json; print(json.load(sys.stdin)["token"])')"
echo "[host] resolve local model snapshot path (offline)"
LOCAL_MODEL_PATH="$(dexec "${HEAD_CONTAINER}" bash -lc "python3 -c \"import os; from huggingface_hub import snapshot_download; os.environ.setdefault('HF_HOME','/private/hf'); print(snapshot_download(repo_id='Qwen/Qwen2.5-0.5B-Instruct', local_files_only=True))\" " | tail -n 1)"
if [[ -z "${LOCAL_MODEL_PATH}" || "${LOCAL_MODEL_PATH}" != /* ]]; then
echo "ERROR: failed to resolve LOCAL_MODEL_PATH: ${LOCAL_MODEL_PATH}" >&2
exit 1
fi
echo "[host] local_model_path: ${LOCAL_MODEL_PATH}"
echo "[host] submit serving model via API"
SERVE_SPEC=$'model_id: qwen-0.5b\nmodel_source: '"${LOCAL_MODEL_PATH}"$'\nnum_replicas: 1\ngpus_per_replica: 1\n'
CREATE_RESP="$(curl -sS -H "Authorization: Bearer ${USER_TOKEN}" -H "Content-Type: application/yaml" --data-binary "${SERVE_SPEC}" "${API_ADDR}/api/v2/serve/models")"
echo "[host] create_model_resp: ${CREATE_RESP}"
MODEL_KEY="$(printf '%s' "${CREATE_RESP}" | python3 -c 'import sys,json; print(json.load(sys.stdin)["model_key"])')"
echo "[host] wait model RUNNING"
wait_model_state "${USER_TOKEN}" "${MODEL_KEY}" "RUNNING" 300
echo "[host] wait OpenAI ingress ready"
openai_wait_ready 120
echo "[host] verify /v1/models contains model"
MODEL_ID="$(
curl -sS "${OPENAI_BASE_URL}/models" \
| python3 -c 'import sys,json; obj=json.load(sys.stdin); print("\n".join([m.get("id","") for m in obj.get("data",[]) if isinstance(m,dict)]))' \
| grep -E "^${USER_ID}-[0-9]{12}-qwen-0\\.5b$" \
| head -n1 \
|| true
)"
if [[ -z "${MODEL_ID}" ]]; then
echo "ERROR: model id not found in /v1/models" >&2
curl -sS "${OPENAI_BASE_URL}/models" | python3 -m json.tool >&2 || true
exit 1
fi
echo "[host] model_id: ${MODEL_ID}"
echo "[host] chat completion (best-effort)"
CHAT_RESP="$(curl -sS -H "Content-Type: application/json" -H "Authorization: Bearer FAKE_KEY" -X POST "${OPENAI_BASE_URL}/chat/completions" --data-binary "{\"model\":\"${MODEL_ID}\",\"messages\":[{\"role\":\"user\",\"content\":\"hello\"}],\"max_tokens\":16,\"stream\":false}")"
printf '%s\n' "${CHAT_RESP}" | python3 -m json.tool >/dev/null 2>&1 || {
echo "ERROR: invalid chat response" >&2
printf '%s\n' "${CHAT_RESP}" >&2
exit 1
}
echo "[host] chat_ok"
echo "[host] delete model"
curl -sS -H "Authorization: Bearer ${USER_TOKEN}" -X DELETE "${API_ADDR}/api/v2/serve/models/${MODEL_KEY}" >/dev/null
wait_model_state "${USER_TOKEN}" "${MODEL_KEY}" "DELETED" 300
echo "[host] ===== run_all_v38_serving.sh done ====="

View File

@@ -0,0 +1,102 @@
from __future__ import annotations
import argparse
import json
import os
import time
import urllib.request
from pathlib import Path
from typing import Any
def _pick_qwen_snapshot() -> str | None:
    """Return a cached Qwen2.5-0.5B-Instruct snapshot dir from the shared HF cache (lexicographically last hash), or None."""
base = Path("/private/hf/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots")
if not base.exists():
return None
snaps = sorted([p for p in base.iterdir() if p.is_dir()], reverse=True)
return str(snaps[0]) if snaps else None
def _http_get_json(url: str) -> Any:
with urllib.request.urlopen(url, timeout=10) as resp:
raw = resp.read().decode("utf-8")
return json.loads(raw)
def _wait_http_json(url: str, *, timeout_s: int) -> Any:
deadline = time.time() + float(timeout_s)
last_err: Exception | None = None
while time.time() < deadline:
try:
return _http_get_json(url)
except Exception as e:
last_err = e
time.sleep(2)
raise RuntimeError(f"timeout waiting for {url}: {last_err!r}")
def main(argv: list[str] | None = None) -> int:
ap = argparse.ArgumentParser(description="Ray Serve LLM smoke test (deploy + /v1/models probe).")
ap.add_argument("--ray-address", default="auto")
ap.add_argument("--http-port", type=int, default=8000)
ap.add_argument("--app-name", default="argus_llm_smoke")
ap.add_argument("--route-prefix", default="/")
ap.add_argument("--accelerator-type", default=os.environ.get("ARGUS_ACCELERATOR_TYPE") or "H20")
ap.add_argument("--model-id", default="smoke-qwen-0.5b")
ap.add_argument("--model-source", default=None, help="Local path or HF id. Default: cached Qwen snapshot under /private/hf.")
ap.add_argument("--tensor-parallel-size", type=int, default=1)
ap.add_argument("--num-replicas", type=int, default=1)
ap.add_argument("--wait-s", type=int, default=600)
args = ap.parse_args(argv)
model_source = str(args.model_source or _pick_qwen_snapshot() or "")
if not model_source:
raise SystemExit("missing --model-source and no cached Qwen snapshot found under /private/hf")
# Force offline HF behavior for the smoke test.
os.environ.setdefault("HF_HOME", "/private/hf")
os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/private/hf/hub")
os.environ.setdefault("TRANSFORMERS_CACHE", "/private/hf/transformers")
os.environ["HF_HUB_OFFLINE"] = "1"
import ray
ray.init(address=str(args.ray_address), ignore_reinit_error=True, log_to_driver=False)
from ray import serve
try:
serve.start(proxy_location="HeadOnly", http_options={"host": "0.0.0.0", "port": int(args.http_port)})
except Exception:
# Best-effort: Serve may already be running in the container (e.g., started by the MVP API scheduler).
pass
from ray.serve.llm import LLMConfig, build_openai_app
# Build a config dict and filter by the current Ray's LLMConfig schema, since fields
# may differ between Ray versions.
cfg_dict: dict[str, Any] = {
"model_loading_config": {"model_id": str(args.model_id), "model_source": model_source},
"accelerator_type": str(args.accelerator_type),
"deployment_config": {"num_replicas": int(args.num_replicas)},
"engine_kwargs": {"tensor_parallel_size": int(args.tensor_parallel_size)},
"runtime_env": {"env_vars": {"HF_HUB_OFFLINE": "1", "HF_HOME": "/private/hf"}},
}
allowed = set(getattr(LLMConfig, "model_fields", {}).keys())
if allowed:
cfg_dict = {k: v for k, v in cfg_dict.items() if k in allowed}
llm_cfg = LLMConfig(**cfg_dict)
app = build_openai_app({"llm_configs": [llm_cfg]})
serve.run(app, name=str(args.app_name), route_prefix=str(args.route_prefix))
models_url = f"http://127.0.0.1:{int(args.http_port)}/v1/models"
payload = _wait_http_json(models_url, timeout_s=int(args.wait_s))
print(json.dumps(payload, indent=2, sort_keys=True))
return 0
if __name__ == "__main__":
raise SystemExit(main())
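
The cfg_dict above deliberately sticks to the smallest useful set of LLMConfig fields and filters them against the installed Ray's schema. For reference, a hypothetical variant for a larger model (names and paths below are placeholders, not part of this commit) that replaces the fixed replica count with Serve autoscaling and uses two GPUs per replica could look like:

cfg_dict = {
    "model_loading_config": {
        "model_id": "qwen-7b",  # placeholder id
        "model_source": "/private/models/Qwen2.5-7B-Instruct",  # placeholder path
    },
    "accelerator_type": "H20",
    "deployment_config": {
        # autoscaling_config replaces num_replicas; both are plain Serve deployment options.
        "autoscaling_config": {"min_replicas": 1, "max_replicas": 2},
    },
    "engine_kwargs": {"tensor_parallel_size": 2},
    "runtime_env": {"env_vars": {"HF_HUB_OFFLINE": "1", "HF_HOME": "/private/hf"}},
}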