V2.5: end-to-end run_all_api passes; automatically deploys containers, installs the API server, and submits tasks

yuyr 2025-12-29 16:58:19 +08:00
parent c06f61ed03
commit 7b89d60b3b
8 changed files with 136 additions and 86 deletions

View File

@@ -10,7 +10,15 @@ if [[ "${SKIP_CLEANUP_V1:-0}" != "1" ]]; then
 fi
 echo "[host] docker compose up -d (mvp)"
-BUILD="${BUILD:-1}"
+BUILD="${BUILD:-0}"
+
+# If the image isn't present locally, force build once.
+if [[ "${BUILD}" != "1" ]]; then
+  if ! docker image inspect argus/argus-ray-node:v2.5 >/dev/null 2>&1; then
+    BUILD="1"
+  fi
+fi
+
 if [[ "${BUILD}" == "1" ]]; then
   dc up -d --build
 else
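With the default flipped to BUILD=0, repeat runs reuse the existing image, and only a first run (or an explicit BUILD=1) triggers a compose build. Below is a minimal standalone sketch of the same image-presence check, using the image tag from the diff; the ensure_image helper name is illustrative and not part of the repo:

#!/usr/bin/env bash
set -euo pipefail

# Illustrative helper (not in the repo): build only when the image is
# absent, mirroring the BUILD fallback in the hunk above.
ensure_image() {
  local image="$1"
  if docker image inspect "${image}" >/dev/null 2>&1; then
    echo "[host] image present, skipping build: ${image}"
    docker compose up -d
  else
    echo "[host] image missing, building once: ${image}"
    docker compose up -d --build
  fi
}

ensure_image "argus/argus-ray-node:v2.5"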

View File

@@ -1,21 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# shellcheck source=lib.sh
-source "${SCRIPT_DIR}/lib.sh"
-
-echo "[head] restart head container (ray + publisher are supervised in-container): ${HEAD_CONTAINER}"
-docker restart "${HEAD_CONTAINER}" >/dev/null
-
-echo "[head] wait ray dashboard ready (best-effort)"
-for i in $(seq 1 60); do
-  if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}" >/dev/null 2>&1; then
-    echo "[head] dashboard ready: ${RAY_DASHBOARD_ADDR}"
-    break
-  fi
-  sleep 2
-done
-
-echo "[head] ray status (best-effort)"
-dexec "${HEAD_CONTAINER}" bash -lc "ray status || true"

View File

@@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# shellcheck source=lib.sh
-source "${SCRIPT_DIR}/lib.sh"
-
-HEAD_IP="$(container_ip "${HEAD_CONTAINER}")"
-HEAD_ADDR="${HEAD_IP}:6379"
-
-start_one() {
-  local worker="$1"
-  local ip
-  ip="$(container_ip "${worker}")"
-  echo "[${worker}] ray stop (best-effort)"
-  dexec "${worker}" bash -lc "ray stop --force || true"
-  echo "[${worker}] start ray worker -> head ${HEAD_ADDR}"
-  dexec "${worker}" bash -lc "ray start --address='${HEAD_ADDR}' --node-ip-address='${ip}' --resources='{\"worker_node\": 100}' --disable-usage-stats"
-}
-
-start_one "${WORKER0_CONTAINER}"
-start_one "${WORKER1_CONTAINER}"
-
-echo "[head] ray status"
-dexec "${HEAD_CONTAINER}" bash -lc "ray status || true"
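The deleted worker script registered a custom Ray resource (worker_node: 100) at ray start time so jobs can be pinned off the head node, and the in-container supervisor presumably preserves that flag. A sketch of how a driver consumes that resource; the task body is illustrative, HEAD_CONTAINER is assumed set as in the lib.sh-sourced scripts, and the docker exec -i pattern mirrors the node probe added elsewhere in this commit:

# Illustrative driver (not in the repo): pin a task to worker nodes via
# the custom "worker_node" resource registered at ray start time.
docker exec -i "${HEAD_CONTAINER}" python3 - <<'PY'
import ray

# Attach to the running cluster instead of starting a local one.
ray.init(address="auto")

# Requesting one unit of "worker_node" keeps the task off the head,
# which registers no units of that resource.
@ray.remote(resources={"worker_node": 1})
def where_am_i():
    import socket
    return socket.gethostname()

print(ray.get(where_am_i.remote()))
PY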

View File

@@ -1,12 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# shellcheck source=lib.sh
-source "${SCRIPT_DIR}/lib.sh"
-
-CLUSTER_NAME="${CLUSTER_NAME:-argus-ray}"
-HEAD_IP_FILE="${HEAD_IP_FILE:-${SHARED_ROOT}/ray/discovery/${CLUSTER_NAME}/head.json}"
-
-echo "[head] discovery publisher is supervised in-container; verify head record exists: ${HEAD_IP_FILE}"
-dexec "${HEAD_CONTAINER}" bash -lc "test -f '${HEAD_IP_FILE}' && python3 -c 'import json,sys; print(json.load(open(sys.argv[1]))[\"job_server_url\"])' '${HEAD_IP_FILE}' || true"

View File

@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# shellcheck source=lib.sh
-source "${SCRIPT_DIR}/lib.sh"
-
-echo "[workers] restart worker containers (ray is supervised in-container)"
-docker restart "${WORKER0_CONTAINER}" >/dev/null
-docker restart "${WORKER1_CONTAINER}" >/dev/null
-
-echo "[head] ray status"
-dexec "${HEAD_CONTAINER}" bash -lc "ray status || true"

View File

@@ -12,6 +12,7 @@ source "${SCRIPT_DIR}/lib.sh"
 API_ADDR="${API_ADDR:-http://127.0.0.1:8080}"
 TOKEN="${MVP_INTERNAL_TOKEN:-}"
 DB_IN_CONTAINER="${DB_IN_CONTAINER:-/private/common/db/mvp.sqlite3}"
+EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers
 if [[ -z "${TOKEN}" ]]; then
   echo "ERROR: MVP_INTERNAL_TOKEN must be set in the host env for run_all_api.sh" >&2
@@ -36,6 +37,42 @@ api_wait_ready() {
   return 1
 }
+
+ray_wait_ready() {
+  local tries="${1:-60}"
+  for i in $(seq 1 "${tries}"); do
+    if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}/api/version" >/dev/null 2>&1; then
+      echo "[host] ray_dashboard_ready: ${RAY_DASHBOARD_ADDR}"
+      return 0
+    fi
+    echo "[host] waiting ray dashboard... (${i}/${tries})"
+    sleep 2
+  done
+  echo "ERROR: ray dashboard not ready: ${RAY_DASHBOARD_ADDR}" >&2
+  return 1
+}
+
+ray_wait_nodes() {
+  local want="${1:-3}"
+  local tries="${2:-60}"
+  for i in $(seq 1 "${tries}"); do
+    local out n
+    out="$(docker exec -i "${HEAD_CONTAINER}" python3 -c "import ray; ray.init(address='auto', ignore_reinit_error=True, log_to_driver=False, logging_level='ERROR'); print(sum(1 for n in ray.nodes() if n.get('Alive')))" 2>/dev/null || true)"
+    n="$(printf '%s\n' "${out}" | tail -n 1 | tr -cd '0-9' || true)"
+    if [[ "${n}" =~ ^[0-9]+$ ]]; then
+      echo "[host] ray_nodes_alive=${n} (want>=${want})"
+      if [[ "${n}" -ge "${want}" ]]; then
+        return 0
+      fi
+    else
+      echo "[host] waiting ray nodes... (${i}/${tries})"
+    fi
+    sleep 2
+  done
+  echo "ERROR: ray nodes not ready (want>=${want})" >&2
+  docker exec -i "${HEAD_CONTAINER}" bash -lc "ray status || true" >&2 || true
+  return 1
+}
+
 submit_taskspec() {
   local taskspec_path="$1"
   echo "[host] submit via API: ${taskspec_path}" >&2
@@ -82,9 +119,9 @@ echo "[host] ===== run_all_api.sh begin ====="
 echo "[host] (re)create containers"
 "${SCRIPT_DIR}/01_up.sh"
-echo "[host] restart ray cluster"
-"${SCRIPT_DIR}/20_start_head.sh"
-"${SCRIPT_DIR}/21_start_workers.sh"
+echo "[host] wait ray cluster ready (supervised in-container)"
+ray_wait_ready 60
+ray_wait_nodes "${EXPECTED_RAY_NODES}" 120
 echo "[host] prepare data/model/code snapshot"
 "${SCRIPT_DIR}/30_prepare_data_and_model.sh"

View File

@@ -8,6 +8,44 @@ source "${SCRIPT_DIR}/lib.sh"
 # Run the end-to-end flow using the CLI submitter (argus.cli).
 # This script restarts the Ray cluster and submits PPO/GRPO/SFT sequentially.
+EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers
+
+ray_wait_ready() {
+  local tries="${1:-60}"
+  for i in $(seq 1 "${tries}"); do
+    if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}/api/version" >/dev/null 2>&1; then
+      echo "[host] ray_dashboard_ready: ${RAY_DASHBOARD_ADDR}"
+      return 0
+    fi
+    echo "[host] waiting ray dashboard... (${i}/${tries})"
+    sleep 2
+  done
+  echo "ERROR: ray dashboard not ready: ${RAY_DASHBOARD_ADDR}" >&2
+  return 1
+}
+
+ray_wait_nodes() {
+  local want="${1:-3}"
+  local tries="${2:-60}"
+  for i in $(seq 1 "${tries}"); do
+    local out n
+    out="$(docker exec -i "${HEAD_CONTAINER}" python3 -c "import ray; ray.init(address='auto', ignore_reinit_error=True, log_to_driver=False, logging_level='ERROR'); print(sum(1 for n in ray.nodes() if n.get('Alive')))" 2>/dev/null || true)"
+    n="$(printf '%s\n' "${out}" | tail -n 1 | tr -cd '0-9' || true)"
+    if [[ "${n}" =~ ^[0-9]+$ ]]; then
+      echo "[host] ray_nodes_alive=${n} (want>=${want})"
+      if [[ "${n}" -ge "${want}" ]]; then
+        return 0
+      fi
+    else
+      echo "[host] waiting ray nodes... (${i}/${tries})"
+    fi
+    sleep 2
+  done
+  echo "ERROR: ray nodes not ready (want>=${want})" >&2
+  docker exec -i "${HEAD_CONTAINER}" bash -lc "ray status || true" >&2 || true
+  return 1
+}
+
 submit_and_wait() {
   local taskspec_in_container="$1"
   local sid
@@ -49,8 +87,8 @@ submit_and_wait() {
 "${SCRIPT_DIR}/03_cleanup_v1_legacy.sh"
 "${SCRIPT_DIR}/05_ensure_verl_repo.sh"
 "${SCRIPT_DIR}/01_up.sh"
-"${SCRIPT_DIR}/20_start_head.sh"
-"${SCRIPT_DIR}/21_start_workers.sh"
+ray_wait_ready 60
+ray_wait_nodes "${EXPECTED_RAY_NODES}" 120
 "${SCRIPT_DIR}/30_prepare_data_and_model.sh"
 "${SCRIPT_DIR}/12_install_py_deps.sh"
 submit_and_wait /workspace/mvp/taskspecs/ppo.yaml

View File

@@ -8,8 +8,8 @@ source "${SCRIPT_DIR}/lib.sh"
 # E2E v2.5:
 # - Clean legacy env
 # - Start containers
-# - Start ray head + discovery publisher
-# - Start stateless worker watchdogs (auto-connect)
+# - Ray head/worker auto-start (supervisor in-container)
+# - Verify discovery + worker join
 # - Prepare data/model/code (reuses existing downloads)
 # - Start API
 # - Create user + issue token
@@ -19,6 +19,8 @@ API_ADDR="${API_ADDR:-http://127.0.0.1:8080}"
 ADMIN_TOKEN="${MVP_INTERNAL_TOKEN:-}"
 USER_ID="${USER_ID:-alice}"
 RESET_DB="${RESET_DB:-1}"
+EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers
+CLUSTER_NAME="${CLUSTER_NAME:-argus-ray}"
 if [[ -z "${ADMIN_TOKEN}" ]]; then
   echo "ERROR: MVP_INTERNAL_TOKEN must be set in host env (admin token)" >&2
@@ -43,6 +45,42 @@ api_wait_ready() {
   return 1
 }
+
+ray_wait_ready() {
+  local tries="${1:-60}"
+  for i in $(seq 1 "${tries}"); do
+    if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}/api/version" >/dev/null 2>&1; then
+      echo "[host] ray_dashboard_ready: ${RAY_DASHBOARD_ADDR}"
+      return 0
+    fi
+    echo "[host] waiting ray dashboard... (${i}/${tries})"
+    sleep 2
+  done
+  echo "ERROR: ray dashboard not ready: ${RAY_DASHBOARD_ADDR}" >&2
+  return 1
+}
+
+ray_wait_nodes() {
+  local want="${1:-3}"
+  local tries="${2:-60}"
+  for i in $(seq 1 "${tries}"); do
+    local out n
+    out="$(docker exec -i "${HEAD_CONTAINER}" python3 -c "import ray; ray.init(address='auto', ignore_reinit_error=True, log_to_driver=False, logging_level='ERROR'); print(sum(1 for n in ray.nodes() if n.get('Alive')))" 2>/dev/null || true)"
+    n="$(printf '%s\n' "${out}" | tail -n 1 | tr -cd '0-9' || true)"
+    if [[ "${n}" =~ ^[0-9]+$ ]]; then
+      echo "[host] ray_nodes_alive=${n} (want>=${want})"
+      if [[ "${n}" -ge "${want}" ]]; then
+        return 0
+      fi
+    else
+      echo "[host] waiting ray nodes... (${i}/${tries})"
+    fi
+    sleep 2
+  done
+  echo "ERROR: ray nodes not ready (want>=${want})" >&2
+  docker exec -i "${HEAD_CONTAINER}" bash -lc "ray status || true" >&2 || true
+  return 1
+}
+
 submit_taskspec() {
   local token="$1"
   local taskspec_path="$2"
@@ -86,14 +124,15 @@ echo "[host] bring down existing containers (best-effort)"
 echo "[host] (re)create containers"
 "${SCRIPT_DIR}/01_up.sh"
-echo "[host] restart ray head (no compute on head)"
-"${SCRIPT_DIR}/20_start_head.sh"
+echo "[host] wait ray head ready (ray is supervised in-container; head has no compute)"
+ray_wait_ready 60
-echo "[host] start head discovery publisher"
-"${SCRIPT_DIR}/22_start_head_discovery.sh"
+echo "[host] verify head discovery record (supervised in-container)"
+HEAD_IP_FILE="${SHARED_ROOT}/ray/discovery/${CLUSTER_NAME}/head.json"
+dexec "${HEAD_CONTAINER}" bash -lc "test -f '${HEAD_IP_FILE}' && python3 -c 'import json,sys; print(json.load(open(sys.argv[1]))[\"job_server_url\"])' '${HEAD_IP_FILE}' || true"
-echo "[host] start stateless workers (watchdog auto-connect)"
-"${SCRIPT_DIR}/23_start_workers_stateless.sh"
+echo "[host] wait workers join (watchdog auto-connect; supervised in-container)"
+ray_wait_nodes "${EXPECTED_RAY_NODES}" 120
 echo "[host] prepare data/model/code snapshot (idempotent; reuse cache)"
 "${SCRIPT_DIR}/30_prepare_data_and_model.sh"