argus-cluster/src/mvp/scripts/run_all_cli.sh

#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=lib.sh
source "${SCRIPT_DIR}/lib.sh"
# Run the end-to-end flow using the CLI submitter (argus.cli).
# This script restarts the Ray cluster and submits PPO/GRPO/SFT sequentially.
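# submit_and_wait TASKSPEC_PATH
#   Submits the given taskspec (path as seen inside the head container) via
#   `python3 -m argus.cli.run --action submit --no-wait`, takes the last line of
#   the CLI output as the submission id, then polls `--action status` every 10s
#   until the job reports SUCCEEDED (return 0) or FAILED/STOPPED (dump the last
#   200 log lines and return 1).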
submit_and_wait() {
  local taskspec_in_container="$1"
  local sid
  local out
  local st
  echo "[host] submit via CLI: ${taskspec_in_container}"
  out="$(dexec "${HEAD_CONTAINER}" bash -lc "cd /workspace/mvp/py && python3 -m argus.cli.run --config /workspace/mvp/configs/dev.yaml --taskspec '${taskspec_in_container}' --action submit --no-wait" | tr -d '\r')"
  sid="$(printf '%s\n' "${out}" | tail -n 1)"
  if [[ -z "${sid}" ]]; then
    echo "[host] failed to parse submission id from output:" >&2
    printf '%s\n' "${out}" >&2
    exit 1
  fi
  echo "[host] submitted: ${sid}"
  while true; do
    st="$(dexec "${HEAD_CONTAINER}" bash -lc "cd /workspace/mvp/py && python3 -m argus.cli.run --config /workspace/mvp/configs/dev.yaml --action status --submission-id '${sid}'" | tr -d '\r' | tail -n 1)"
    echo "[host] status: ${sid} -> ${st}"
    case "${st}" in
      *SUCCEEDED*)
        return 0
        ;;
      *FAILED*|*STOPPED*)
        echo "[host] job failed: ${sid} (${st})" >&2
        echo "[host] last logs:" >&2
        dexec "${HEAD_CONTAINER}" bash -lc "cd /workspace/mvp/py && python3 -m argus.cli.run --config /workspace/mvp/configs/dev.yaml --action logs --submission-id '${sid}' --tail 200" >&2 || true
        return 1
        ;;
      *)
        sleep 10
        ;;
    esac
  done
}
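
# Bring the cluster up and prepare it: prereq check, legacy cleanup, verl repo,
# cluster up, Ray head and workers, data/model preparation, Python deps.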
"${SCRIPT_DIR}/00_prereq_check.sh"
"${SCRIPT_DIR}/03_cleanup_v1_legacy.sh"
"${SCRIPT_DIR}/05_ensure_verl_repo.sh"
"${SCRIPT_DIR}/01_up.sh"
"${SCRIPT_DIR}/20_start_head.sh"
"${SCRIPT_DIR}/21_start_workers.sh"
"${SCRIPT_DIR}/30_prepare_data_and_model.sh"
"${SCRIPT_DIR}/12_install_py_deps.sh"
submit_and_wait /workspace/mvp/taskspecs/ppo.yaml
submit_and_wait /workspace/mvp/taskspecs/grpo.yaml
submit_and_wait /workspace/mvp/taskspecs/sft.yaml
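
# Finish with a final status summary.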
"${SCRIPT_DIR}/50_status.sh"