From 7b89d60b3bc504f357bb0a409dd18c18bfdba54e Mon Sep 17 00:00:00 2001 From: yuyr Date: Mon, 29 Dec 2025 16:58:19 +0800 Subject: [PATCH] =?UTF-8?q?V2.5=20=E7=89=88=E6=9C=AC=E7=AB=AF=E5=88=B0?= =?UTF-8?q?=E7=AB=AFrun=5Fall=5Fapi=20=E9=80=9A=E8=BF=87=EF=BC=8C=E8=87=AA?= =?UTF-8?q?=E5=8A=A8=E9=83=A8=E7=BD=B2=E5=AE=B9=E5=99=A8=EF=BC=8C=E5=AE=89?= =?UTF-8?q?=E8=A3=85API=20server=EF=BC=8C=E5=B9=B6=E4=B8=94=E6=89=A7?= =?UTF-8?q?=E8=A1=8C=E4=BB=BB=E5=8A=A1=E6=8F=90=E4=BA=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mvp/scripts/01_up.sh | 10 +++- src/mvp/scripts/20_start_head.sh | 21 ------- src/mvp/scripts/21_start_workers.sh | 26 --------- src/mvp/scripts/22_start_head_discovery.sh | 12 ---- src/mvp/scripts/23_start_workers_stateless.sh | 13 ----- src/mvp/scripts/run_all_api.sh | 43 ++++++++++++++- src/mvp/scripts/run_all_cli.sh | 42 +++++++++++++- src/mvp/scripts/run_all_v25_api.sh | 55 ++++++++++++++++--- 8 files changed, 136 insertions(+), 86 deletions(-) delete mode 100755 src/mvp/scripts/20_start_head.sh delete mode 100755 src/mvp/scripts/21_start_workers.sh delete mode 100755 src/mvp/scripts/22_start_head_discovery.sh delete mode 100755 src/mvp/scripts/23_start_workers_stateless.sh diff --git a/src/mvp/scripts/01_up.sh b/src/mvp/scripts/01_up.sh index 48ca8c6..6196ecb 100755 --- a/src/mvp/scripts/01_up.sh +++ b/src/mvp/scripts/01_up.sh @@ -10,7 +10,15 @@ if [[ "${SKIP_CLEANUP_V1:-0}" != "1" ]]; then fi echo "[host] docker compose up -d (mvp)" -BUILD="${BUILD:-1}" +BUILD="${BUILD:-0}" + +# If the image isn't present locally, force build once. +if [[ "${BUILD}" != "1" ]]; then + if ! docker image inspect argus/argus-ray-node:v2.5 >/dev/null 2>&1; then + BUILD="1" + fi +fi + if [[ "${BUILD}" == "1" ]]; then dc up -d --build else diff --git a/src/mvp/scripts/20_start_head.sh b/src/mvp/scripts/20_start_head.sh deleted file mode 100755 index 8d299c5..0000000 --- a/src/mvp/scripts/20_start_head.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=lib.sh -source "${SCRIPT_DIR}/lib.sh" - -echo "[head] restart head container (ray + publisher are supervised in-container): ${HEAD_CONTAINER}" -docker restart "${HEAD_CONTAINER}" >/dev/null - -echo "[head] wait ray dashboard ready (best-effort)" -for i in $(seq 1 60); do - if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}" >/dev/null 2>&1; then - echo "[head] dashboard ready: ${RAY_DASHBOARD_ADDR}" - break - fi - sleep 2 -done - -echo "[head] ray status (best-effort)" -dexec "${HEAD_CONTAINER}" bash -lc "ray status || true" diff --git a/src/mvp/scripts/21_start_workers.sh b/src/mvp/scripts/21_start_workers.sh deleted file mode 100755 index 2f55a95..0000000 --- a/src/mvp/scripts/21_start_workers.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=lib.sh -source "${SCRIPT_DIR}/lib.sh" - -HEAD_IP="$(container_ip "${HEAD_CONTAINER}")" -HEAD_ADDR="${HEAD_IP}:6379" - -start_one() { - local worker="$1" - local ip - ip="$(container_ip "${worker}")" - echo "[${worker}] ray stop (best-effort)" - dexec "${worker}" bash -lc "ray stop --force || true" - echo "[${worker}] start ray worker -> head ${HEAD_ADDR}" - dexec "${worker}" bash -lc "ray start --address='${HEAD_ADDR}' --node-ip-address='${ip}' --resources='{\"worker_node\": 100}' --disable-usage-stats" -} - -start_one "${WORKER0_CONTAINER}" -start_one "${WORKER1_CONTAINER}" - -echo "[head] ray status" -dexec "${HEAD_CONTAINER}" bash -lc "ray status || true" - diff --git a/src/mvp/scripts/22_start_head_discovery.sh b/src/mvp/scripts/22_start_head_discovery.sh deleted file mode 100755 index 2973672..0000000 --- a/src/mvp/scripts/22_start_head_discovery.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=lib.sh -source "${SCRIPT_DIR}/lib.sh" - -CLUSTER_NAME="${CLUSTER_NAME:-argus-ray}" -HEAD_IP_FILE="${HEAD_IP_FILE:-${SHARED_ROOT}/ray/discovery/${CLUSTER_NAME}/head.json}" - -echo "[head] discovery publisher is supervised in-container; verify head record exists: ${HEAD_IP_FILE}" -dexec "${HEAD_CONTAINER}" bash -lc "test -f '${HEAD_IP_FILE}' && python3 -c 'import json,sys; print(json.load(open(sys.argv[1]))[\"job_server_url\"])' '${HEAD_IP_FILE}' || true" diff --git a/src/mvp/scripts/23_start_workers_stateless.sh b/src/mvp/scripts/23_start_workers_stateless.sh deleted file mode 100755 index b76fab7..0000000 --- a/src/mvp/scripts/23_start_workers_stateless.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# shellcheck source=lib.sh -source "${SCRIPT_DIR}/lib.sh" - -echo "[workers] restart worker containers (ray is supervised in-container)" -docker restart "${WORKER0_CONTAINER}" >/dev/null -docker restart "${WORKER1_CONTAINER}" >/dev/null - -echo "[head] ray status" -dexec "${HEAD_CONTAINER}" bash -lc "ray status || true" diff --git a/src/mvp/scripts/run_all_api.sh b/src/mvp/scripts/run_all_api.sh index 3ff190b..3cf4a06 100755 --- a/src/mvp/scripts/run_all_api.sh +++ b/src/mvp/scripts/run_all_api.sh @@ -12,6 +12,7 @@ source "${SCRIPT_DIR}/lib.sh" API_ADDR="${API_ADDR:-http://127.0.0.1:8080}" TOKEN="${MVP_INTERNAL_TOKEN:-}" DB_IN_CONTAINER="${DB_IN_CONTAINER:-/private/common/db/mvp.sqlite3}" +EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers if [[ -z "${TOKEN}" ]]; then echo "ERROR: MVP_INTERNAL_TOKEN must be set in the host env for run_all_api.sh" >&2 @@ -36,6 +37,42 @@ api_wait_ready() { return 1 } +ray_wait_ready() { + local tries="${1:-60}" + for i in $(seq 1 "${tries}"); do + if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}/api/version" >/dev/null 2>&1; then + echo "[host] ray_dashboard_ready: ${RAY_DASHBOARD_ADDR}" + return 0 + fi + echo "[host] waiting ray dashboard... (${i}/${tries})" + sleep 2 + done + echo "ERROR: ray dashboard not ready: ${RAY_DASHBOARD_ADDR}" >&2 + return 1 +} + +ray_wait_nodes() { + local want="${1:-3}" + local tries="${2:-60}" + for i in $(seq 1 "${tries}"); do + local out n + out="$(docker exec -i "${HEAD_CONTAINER}" python3 -c "import ray; ray.init(address='auto', ignore_reinit_error=True, log_to_driver=False, logging_level='ERROR'); print(sum(1 for n in ray.nodes() if n.get('Alive')))" 2>/dev/null || true)" + n="$(printf '%s\n' "${out}" | tail -n 1 | tr -cd '0-9' || true)" + if [[ "${n}" =~ ^[0-9]+$ ]]; then + echo "[host] ray_nodes_alive=${n} (want>=${want})" + if [[ "${n}" -ge "${want}" ]]; then + return 0 + fi + else + echo "[host] waiting ray nodes... (${i}/${tries})" + fi + sleep 2 + done + echo "ERROR: ray nodes not ready (want>=${want})" >&2 + docker exec -i "${HEAD_CONTAINER}" bash -lc "ray status || true" >&2 || true + return 1 +} + submit_taskspec() { local taskspec_path="$1" echo "[host] submit via API: ${taskspec_path}" >&2 @@ -82,9 +119,9 @@ echo "[host] ===== run_all_api.sh begin =====" echo "[host] (re)create containers" "${SCRIPT_DIR}/01_up.sh" -echo "[host] restart ray cluster" -"${SCRIPT_DIR}/20_start_head.sh" -"${SCRIPT_DIR}/21_start_workers.sh" +echo "[host] wait ray cluster ready (supervised in-container)" +ray_wait_ready 60 +ray_wait_nodes "${EXPECTED_RAY_NODES}" 120 echo "[host] prepare data/model/code snapshot" "${SCRIPT_DIR}/30_prepare_data_and_model.sh" diff --git a/src/mvp/scripts/run_all_cli.sh b/src/mvp/scripts/run_all_cli.sh index 3e89801..ca42f3f 100755 --- a/src/mvp/scripts/run_all_cli.sh +++ b/src/mvp/scripts/run_all_cli.sh @@ -8,6 +8,44 @@ source "${SCRIPT_DIR}/lib.sh" # Run the end-to-end flow using the CLI submitter (argus.cli). # This script restarts the Ray cluster and submits PPO/GRPO/SFT sequentially. +EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers + +ray_wait_ready() { + local tries="${1:-60}" + for i in $(seq 1 "${tries}"); do + if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}/api/version" >/dev/null 2>&1; then + echo "[host] ray_dashboard_ready: ${RAY_DASHBOARD_ADDR}" + return 0 + fi + echo "[host] waiting ray dashboard... (${i}/${tries})" + sleep 2 + done + echo "ERROR: ray dashboard not ready: ${RAY_DASHBOARD_ADDR}" >&2 + return 1 +} + +ray_wait_nodes() { + local want="${1:-3}" + local tries="${2:-60}" + for i in $(seq 1 "${tries}"); do + local out n + out="$(docker exec -i "${HEAD_CONTAINER}" python3 -c "import ray; ray.init(address='auto', ignore_reinit_error=True, log_to_driver=False, logging_level='ERROR'); print(sum(1 for n in ray.nodes() if n.get('Alive')))" 2>/dev/null || true)" + n="$(printf '%s\n' "${out}" | tail -n 1 | tr -cd '0-9' || true)" + if [[ "${n}" =~ ^[0-9]+$ ]]; then + echo "[host] ray_nodes_alive=${n} (want>=${want})" + if [[ "${n}" -ge "${want}" ]]; then + return 0 + fi + else + echo "[host] waiting ray nodes... (${i}/${tries})" + fi + sleep 2 + done + echo "ERROR: ray nodes not ready (want>=${want})" >&2 + docker exec -i "${HEAD_CONTAINER}" bash -lc "ray status || true" >&2 || true + return 1 +} + submit_and_wait() { local taskspec_in_container="$1" local sid @@ -49,8 +87,8 @@ submit_and_wait() { "${SCRIPT_DIR}/03_cleanup_v1_legacy.sh" "${SCRIPT_DIR}/05_ensure_verl_repo.sh" "${SCRIPT_DIR}/01_up.sh" -"${SCRIPT_DIR}/20_start_head.sh" -"${SCRIPT_DIR}/21_start_workers.sh" +ray_wait_ready 60 +ray_wait_nodes "${EXPECTED_RAY_NODES}" 120 "${SCRIPT_DIR}/30_prepare_data_and_model.sh" "${SCRIPT_DIR}/12_install_py_deps.sh" submit_and_wait /workspace/mvp/taskspecs/ppo.yaml diff --git a/src/mvp/scripts/run_all_v25_api.sh b/src/mvp/scripts/run_all_v25_api.sh index 1d926a8..3c2d3b5 100755 --- a/src/mvp/scripts/run_all_v25_api.sh +++ b/src/mvp/scripts/run_all_v25_api.sh @@ -8,8 +8,8 @@ source "${SCRIPT_DIR}/lib.sh" # E2E v2.5: # - Clean legacy env # - Start containers -# - Start ray head + discovery publisher -# - Start stateless worker watchdogs (auto-connect) +# - Ray head/worker auto-start (supervisor in-container) +# - Verify discovery + worker join # - Prepare data/model/code (reuses existing downloads) # - Start API # - Create user + issue token @@ -19,6 +19,8 @@ API_ADDR="${API_ADDR:-http://127.0.0.1:8080}" ADMIN_TOKEN="${MVP_INTERNAL_TOKEN:-}" USER_ID="${USER_ID:-alice}" RESET_DB="${RESET_DB:-1}" +EXPECTED_RAY_NODES="${EXPECTED_RAY_NODES:-3}" # head + 2 workers +CLUSTER_NAME="${CLUSTER_NAME:-argus-ray}" if [[ -z "${ADMIN_TOKEN}" ]]; then echo "ERROR: MVP_INTERNAL_TOKEN must be set in host env (admin token)" >&2 @@ -43,6 +45,42 @@ api_wait_ready() { return 1 } +ray_wait_ready() { + local tries="${1:-60}" + for i in $(seq 1 "${tries}"); do + if curl -sS -m 2 "${RAY_DASHBOARD_ADDR}/api/version" >/dev/null 2>&1; then + echo "[host] ray_dashboard_ready: ${RAY_DASHBOARD_ADDR}" + return 0 + fi + echo "[host] waiting ray dashboard... (${i}/${tries})" + sleep 2 + done + echo "ERROR: ray dashboard not ready: ${RAY_DASHBOARD_ADDR}" >&2 + return 1 +} + +ray_wait_nodes() { + local want="${1:-3}" + local tries="${2:-60}" + for i in $(seq 1 "${tries}"); do + local out n + out="$(docker exec -i "${HEAD_CONTAINER}" python3 -c "import ray; ray.init(address='auto', ignore_reinit_error=True, log_to_driver=False, logging_level='ERROR'); print(sum(1 for n in ray.nodes() if n.get('Alive')))" 2>/dev/null || true)" + n="$(printf '%s\n' "${out}" | tail -n 1 | tr -cd '0-9' || true)" + if [[ "${n}" =~ ^[0-9]+$ ]]; then + echo "[host] ray_nodes_alive=${n} (want>=${want})" + if [[ "${n}" -ge "${want}" ]]; then + return 0 + fi + else + echo "[host] waiting ray nodes... (${i}/${tries})" + fi + sleep 2 + done + echo "ERROR: ray nodes not ready (want>=${want})" >&2 + docker exec -i "${HEAD_CONTAINER}" bash -lc "ray status || true" >&2 || true + return 1 +} + submit_taskspec() { local token="$1" local taskspec_path="$2" @@ -86,14 +124,15 @@ echo "[host] bring down existing containers (best-effort)" echo "[host] (re)create containers" "${SCRIPT_DIR}/01_up.sh" -echo "[host] restart ray head (no compute on head)" -"${SCRIPT_DIR}/20_start_head.sh" +echo "[host] wait ray head ready (ray is supervised in-container; head has no compute)" +ray_wait_ready 60 -echo "[host] start head discovery publisher" -"${SCRIPT_DIR}/22_start_head_discovery.sh" +echo "[host] verify head discovery record (supervised in-container)" +HEAD_IP_FILE="${SHARED_ROOT}/ray/discovery/${CLUSTER_NAME}/head.json" +dexec "${HEAD_CONTAINER}" bash -lc "test -f '${HEAD_IP_FILE}' && python3 -c 'import json,sys; print(json.load(open(sys.argv[1]))[\"job_server_url\"])' '${HEAD_IP_FILE}' || true" -echo "[host] start stateless workers (watchdog auto-connect)" -"${SCRIPT_DIR}/23_start_workers_stateless.sh" +echo "[host] wait workers join (watchdog auto-connect; supervised in-container)" +ray_wait_nodes "${EXPECTED_RAY_NODES}" 120 echo "[host] prepare data/model/code snapshot (idempotent; reuse cache)" "${SCRIPT_DIR}/30_prepare_data_and_model.sh"