#!/usr/bin/env bash
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=lib.sh
source "${SCRIPT_DIR}/lib.sh"

# Run the end-to-end flow using the CLI submitter (argus.cli).
# This script restarts the Ray cluster and submits PPO/GRPO/SFT sequentially.

#######################################
# Submit one taskspec through argus.cli.run inside the head container and
# poll its status until it reaches a terminal state.
# Globals:
#   HEAD_CONTAINER (read) - not defined in this file; presumably set by
#                           lib.sh -- confirm there.
# Arguments:
#   $1 - path of the taskspec YAML as seen from inside the container
# Outputs:
#   Progress lines on stdout; on job failure, diagnostics and the last
#   200 log lines on stderr.
# Returns:
#   0 when the reported status contains SUCCEEDED,
#   1 when it contains FAILED or STOPPED.
#   Exits the whole script with status 1 if no submission id can be parsed.
#######################################
submit_and_wait() {
  local taskspec_in_container="$1"
  local sid
  local out
  local st
  # Shared prefix of every CLI call; expanded into the string handed to
  # `bash -lc` inside the container.
  local cli="cd /workspace/mvp/py && python3 -m argus.cli.run --config /workspace/mvp/configs/dev.yaml"

  echo "[host] submit via SDK: ${taskspec_in_container}"
  # dexec is not defined in this file (presumably a helper from lib.sh).
  # Carriage returns are stripped so the parsed id contains no stray \r.
  out="$(dexec "${HEAD_CONTAINER}" bash -lc "${cli} --taskspec '${taskspec_in_container}' --action submit --no-wait" | tr -d '\r')"

  # The last line of the submit output is taken as the submission id.
  sid="$(printf '%s\n' "${out}" | tail -n 1)"
  if [[ -z "${sid}" ]]; then
    echo "[host] failed to parse submission id from output:" >&2
    printf '%s\n' "${out}" >&2
    exit 1
  fi
  echo "[host] submitted: ${sid}"

  # Poll every 10 seconds; only the last line of the status output is used.
  while true; do
    st="$(dexec "${HEAD_CONTAINER}" bash -lc "${cli} --action status --submission-id '${sid}'" | tr -d '\r' | tail -n 1)"
    echo "[host] status: ${sid} -> ${st}"
    case "${st}" in
      *SUCCEEDED*)
        return 0
        ;;
      *FAILED*|*STOPPED*)
        echo "[host] job failed: ${sid} (${st})" >&2
        echo "[host] last logs:" >&2
        # Best effort: a failure while fetching logs must not mask the
        # job failure being reported.
        dexec "${HEAD_CONTAINER}" bash -lc "${cli} --action logs --submission-id '${sid}' --tail 200" >&2 || true
        return 1
        ;;
      *)
        sleep 10
        ;;
    esac
  done
}

# Preparation steps, run in this exact order before any job is submitted.
"${SCRIPT_DIR}/00_prereq_check.sh"
"${SCRIPT_DIR}/03_cleanup_v1_legacy.sh"
"${SCRIPT_DIR}/05_ensure_verl_repo.sh"
"${SCRIPT_DIR}/01_up.sh"
"${SCRIPT_DIR}/20_start_head.sh"
"${SCRIPT_DIR}/21_start_workers.sh"
"${SCRIPT_DIR}/30_prepare_data_and_model.sh"
"${SCRIPT_DIR}/12_install_py_deps.sh"

# Jobs run one after another; under `set -e` a non-zero return from
# submit_and_wait stops the script before the next submission.
submit_and_wait /workspace/mvp/taskspecs/ppo.yaml
submit_and_wait /workspace/mvp/taskspecs/grpo.yaml
submit_and_wait /workspace/mvp/taskspecs/sft.yaml

"${SCRIPT_DIR}/50_status.sh"