#!/usr/bin/env bash
#
# End-to-end driver: runs the numbered setup scripts that live next to this
# file, then submits the PPO, GRPO and SFT jobspecs one after another,
# waiting for each to finish, and finally runs the status script.
#
# Relies on `dexec` and `HEAD_CONTAINER`, which are not defined here.
# NOTE(review): presumably both are provided by lib.sh — confirm.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=lib.sh
source "${SCRIPT_DIR}/lib.sh"

#######################################
# Submit a jobspec through run.py inside the head container and poll its
# status every 10 seconds until it reaches a terminal state.
# Globals:
#   HEAD_CONTAINER (read)
# Arguments:
#   $1 - path of the jobspec file as seen from inside the container
# Outputs:
#   Progress lines on stdout; on failure, diagnostics and the last 200 log
#   lines of the job on stderr.
# Returns:
#   0 when the reported status contains SUCCEEDED;
#   1 when it contains FAILED or STOPPED, or when no submission id could be
#   parsed from the submit output.
#######################################
submit_and_wait() {
  local jobspec_in_container="$1"
  # Common command prefix shared by the submit / status / logs invocations.
  local run_cmd="python3 /workspace/mvp/v1.1/py/run.py --config /workspace/mvp/v1.1/py/configs/dev.yaml"
  local jobspec_q
  local sid
  local sid_q
  local out
  local st

  # The command is handed to an inner `bash -lc` as a single string, so the
  # values must be shell-quoted for that inner shell; %q stays correct even
  # if a value contains a single quote.
  printf -v jobspec_q '%q' "${jobspec_in_container}"

  echo "[host] submit via SDK: ${jobspec_in_container}"
  out="$(dexec "${HEAD_CONTAINER}" bash -lc \
    "${run_cmd} --jobspec ${jobspec_q} --action submit --no-wait" \
    | tr -d '\r')"

  # The submission id is taken from the last line of the submit output.
  sid="$(printf '%s\n' "${out}" | tail -n 1)"
  if [[ -z "${sid}" ]]; then
    echo "[host] failed to parse submission id from output:" >&2
    printf '%s\n' "${out}" >&2
    return 1
  fi
  printf -v sid_q '%q' "${sid}"

  echo "[host] submitted: ${sid}"

  while true; do
    st="$(dexec "${HEAD_CONTAINER}" bash -lc \
      "${run_cmd} --action status --submission-id ${sid_q}" \
      | tr -d '\r' \
      | tail -n 1)"
    echo "[host] status: ${sid} -> ${st}"
    case "${st}" in
      *SUCCEEDED*)
        return 0
        ;;
      *FAILED* | *STOPPED*)
        echo "[host] job failed: ${sid} (${st})" >&2
        echo "[host] last logs:" >&2
        # Best effort: a failure to fetch logs must not mask the job failure.
        dexec "${HEAD_CONTAINER}" bash -lc \
          "${run_cmd} --action logs --submission-id ${sid_q} --tail 200" \
          >&2 || true
        return 1
        ;;
      *)
        # Any other status is treated as "still in progress".
        sleep 10
        ;;
    esac
  done
}

# Setup steps; with `set -e` the first failing step aborts the whole run.
"${SCRIPT_DIR}/00_prereq_check.sh"
"${SCRIPT_DIR}/03_cleanup_v1_legacy.sh"
"${SCRIPT_DIR}/05_ensure_verl_repo.sh"
"${SCRIPT_DIR}/01_up.sh"
"${SCRIPT_DIR}/20_start_head.sh"
"${SCRIPT_DIR}/21_start_workers.sh"
"${SCRIPT_DIR}/30_prepare_data_and_model.sh"
"${SCRIPT_DIR}/12_install_py_deps.sh"

# Jobs run sequentially; a failed job stops the script before the next one.
submit_and_wait /workspace/mvp/v1.1/py/jobspecs/ppo.yaml
submit_and_wait /workspace/mvp/v1.1/py/jobspecs/grpo.yaml
submit_and_wait /workspace/mvp/v1.1/py/jobspecs/sft.yaml

"${SCRIPT_DIR}/50_status.sh"