#!/usr/bin/env bash
# Abort on any command failure, on use of an unset variable, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# Absolute directory containing this script; lib.sh and the sibling step
# scripts are resolved relative to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# shellcheck source=lib.sh
source "${SCRIPT_DIR}/lib.sh"

# Run the end-to-end flow using the CLI submitter (argus.cli).
# This script restarts the Ray cluster and submits the PPO, GRPO, and SFT
# jobs sequentially.

#######################################
# Submit one taskspec through the argus CLI inside the head container and
# poll its status every 10 seconds until it reaches a terminal state.
# Globals:
#   HEAD_CONTAINER (read) - container name handed to dexec
#   dexec (called)        - helper not defined in this file; presumably
#                           provided by the sourced lib.sh (confirm there)
# Arguments:
#   $1 - taskspec path as seen from inside the container
# Outputs:
#   Progress lines on stdout. On job failure, a diagnostic plus the last
#   200 log lines on stderr.
# Returns:
#   0 once the status line contains SUCCEEDED.
#   1 once the status line contains FAILED or STOPPED.
#   Exits the whole script with status 1 when no submission id can be
#   parsed from the submit output.
#######################################
submit_and_wait() {
  local taskspec_in_container="$1"
  # Shared command prefix for every CLI call executed inside the container.
  local cli="cd /workspace/mvp/py && python3 -m argus.cli.run --config /workspace/mvp/configs/dev.yaml"
  local sid
  local out
  # Declared local so the polling state does not leak into the caller's scope.
  local st

  echo "[host] submit via SDK: ${taskspec_in_container}"
  # Carriage returns are stripped before parsing the captured output.
  out="$(dexec "${HEAD_CONTAINER}" bash -lc "${cli} --taskspec '${taskspec_in_container}' --action submit --no-wait" | tr -d '\r')"
  # The submission id is taken from the last line of the submit output.
  sid="$(printf '%s\n' "${out}" | tail -n 1)"

  if [[ -z "${sid}" ]]; then
    echo "[host] failed to parse submission id from output:" >&2
    printf '%s\n' "${out}" >&2
    exit 1
  fi

  echo "[host] submitted: ${sid}"

  while true; do
    st="$(dexec "${HEAD_CONTAINER}" bash -lc "${cli} --action status --submission-id '${sid}'" | tr -d '\r' | tail -n 1)"
    echo "[host] status: ${sid} -> ${st}"
    case "${st}" in
      *SUCCEEDED*)
        return 0
        ;;
      *FAILED* | *STOPPED*)
        echo "[host] job failed: ${sid} (${st})" >&2
        echo "[host] last logs:" >&2
        # Best effort: a failure while fetching logs must not mask the job failure.
        dexec "${HEAD_CONTAINER}" bash -lc "${cli} --action logs --submission-id '${sid}' --tail 200" >&2 || true
        return 1
        ;;
      *)
        # Any other status is treated as still in progress.
        sleep 10
        ;;
    esac
  done
}

"${SCRIPT_DIR}/00_prereq_check.sh"
|
|
"${SCRIPT_DIR}/03_cleanup_v1_legacy.sh"
|
|
"${SCRIPT_DIR}/05_ensure_verl_repo.sh"
|
|
"${SCRIPT_DIR}/01_up.sh"
|
|
"${SCRIPT_DIR}/20_start_head.sh"
|
|
"${SCRIPT_DIR}/21_start_workers.sh"
|
|
"${SCRIPT_DIR}/30_prepare_data_and_model.sh"
|
|
"${SCRIPT_DIR}/12_install_py_deps.sh"
|
|
submit_and_wait /workspace/mvp/taskspecs/ppo.yaml
|
|
submit_and_wait /workspace/mvp/taskspecs/grpo.yaml
|
|
submit_and_wait /workspace/mvp/taskspecs/sft.yaml
|
|
"${SCRIPT_DIR}/50_status.sh"
|