#!/usr/bin/env bash
#
# Launch a GRPO run through the `swift rlhf` CLI.
#
# Environment (set for this one command only):
#   CUDA_VISIBLE_DEVICES=2,3  restricts the run to GPU indices 2 and 3.
#   NPROC_PER_NODE=2          presumably one worker process per visible GPU;
#                             confirm against the swift launcher docs.
#
# Notes on the flags below:
#   * --vllm_mode server with --vllm_server_host/--vllm_server_port points at
#     localhost:8000. NOTE(review): this presumably expects a rollout server to
#     be listening there before this script starts; confirm and start it first.
#   * The dataset spec is quoted so the '#' is unmistakably part of the value.
#     NOTE(review): the "#5000" suffix looks like a sample-count limit; confirm.
#   * Each line must end in a backslash immediately followed by the newline.
#     A backslash followed by a space is an escaped space, not a continuation,
#     and would turn the next word into a bogus command name or argument.
#     For the same reason, comments cannot be placed between the lines below.
CUDA_VISIBLE_DEVICES=2,3 \
NPROC_PER_NODE=2 \
swift rlhf \
  --rlhf_type grpo \
  --model /data1/yuyr/qwen3-8b \
  --dataset 'AI-MO/NuminaMath-TIR#5000' \
  --reward_funcs accuracy cosine \
  --use_vllm true \
  --vllm_mode server \
  --vllm_server_host localhost \
  --vllm_server_port 8000 \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --async_generate true \
  --num_generations 4 \
  --deepspeed zero3