You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
exception calling callback for <Future at 0x7f9eaf6502b0 state=finished raised RuntimeError>
Traceback (most recent call last):
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 342, in _invoke_callbacks
callback(self)
File "/program/ms-swift/swift/trainers/rlhf_trainer/grpo_trainer.py", line 697, in done
current_queue.put(DataCache(inputs, _self.result(), distributed_idx))
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 451, in result
return self.__get_result()
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/program/ms-swift/swift/trainers/rlhf_trainer/grpo_trainer.py", line 689, in infer_task
with set_device_context(self.infer_device), self.multi_turn_completion_length_context():
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/program/ms-swift/swift/llm/infer/infer_engine/utils.py", line 478, in set_device_context
set_device(origin_device)
File "/program/ms-swift/swift/utils/torch_utils.py", line 276, in set_device
torch.cuda.set_device(local_rank)
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/site-packages/torch/cuda/__init__.py", line 476, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
exception calling callback for <Future at 0x7f86e5749f60 state=finished raised RuntimeError>
Traceback (most recent call last):
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 342, in _invoke_callbacks
callback(self)
File "/program/ms-swift/swift/trainers/rlhf_trainer/grpo_trainer.py", line 697, in done
current_queue.put(DataCache(inputs, _self.result(), distributed_idx))
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 451, in result
return self.__get_result()
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/program/ms-swift/swift/trainers/rlhf_trainer/grpo_trainer.py", line 689, in infer_task
with set_device_context(self.infer_device), self.multi_turn_completion_length_context():
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/program/ms-swift/swift/llm/infer/infer_engine/utils.py", line 478, in set_device_context
set_device(origin_device)
File "/program/ms-swift/swift/utils/torch_utils.py", line 276, in set_device
torch.cuda.set_device(local_rank)
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/site-packages/torch/cuda/__init__.py", line 476, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
The text was updated successfully, but these errors were encountered:
ms-swift: git cloned on 4/17/2025
vllm: 0.8.4
torch: 2.6.0
deepspeed: 0.16.7
python: 3.10.6
8 GPUs total, 4:4 Async Mode:
NPROC_PER_NODE=4
swift rlhf
--rlhf_type grpo
--model Qwen/Qwen2.5-7B-Instruct
--train_type lora
--dataset $DATA_DIR
--torch_dtype bfloat16
--num_train_epochs 1
--max_length $MAX_TOKEN_LENGTH
--per_device_train_batch_size 2
--per_device_eval_batch_size 2
--gradient_accumulation_steps 8
--eval_steps 250
--save_steps 250
--learning_rate 5e-6
--lr_scheduler_type cosine
--save_total_limit 5
--logging_steps 3
--output_dir $OUTPUT_DIR
--warmup_ratio 0.05
--dataloader_num_workers 4
--max_completion_length $RESPONSE_TOKEN_LENGTH
--reward_funcs accuracy format
--num_generations 8
--system examples/train/grpo/prompt.txt
--use_vllm true
--vllm_gpu_memory_utilization 0.85
--vllm_max_model_len $MAX_TOKEN_LENGTH
--deepspeed zero3
--temperature 0.9
--top_p 0.85
--top_k 50
--log_completions true
--num_infer_workers 4
--tensor_parallel_size 1
--async_generate true
--move_model_batches 16
--offload_optimizer false
--offload_model false
--report_to wandb
--ds3_gather_for_generation true
exception calling callback for <Future at 0x7f9eaf6502b0 state=finished raised RuntimeError>
Traceback (most recent call last):
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 342, in _invoke_callbacks
callback(self)
File "/program/ms-swift/swift/trainers/rlhf_trainer/grpo_trainer.py", line 697, in done
current_queue.put(DataCache(inputs, _self.result(), distributed_idx))
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 451, in result
return self.__get_result()
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/program/ms-swift/swift/trainers/rlhf_trainer/grpo_trainer.py", line 689, in infer_task
with set_device_context(self.infer_device), self.multi_turn_completion_length_context():
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/program/ms-swift/swift/llm/infer/infer_engine/utils.py", line 478, in set_device_context
set_device(origin_device)
File "/program/ms-swift/swift/utils/torch_utils.py", line 276, in set_device
torch.cuda.set_device(local_rank)
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/site-packages/torch/cuda/__init__.py", line 476, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
exception calling callback for <Future at 0x7f86e5749f60 state=finished raised RuntimeError>
Traceback (most recent call last):
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 342, in _invoke_callbacks
callback(self)
File "/program/ms-swift/swift/trainers/rlhf_trainer/grpo_trainer.py", line 697, in done
current_queue.put(DataCache(inputs, _self.result(), distributed_idx))
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 451, in result
return self.__get_result()
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/program/ms-swift/swift/trainers/rlhf_trainer/grpo_trainer.py", line 689, in infer_task
with set_device_context(self.infer_device), self.multi_turn_completion_length_context():
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/program/ms-swift/swift/llm/infer/infer_engine/utils.py", line 478, in set_device_context
set_device(origin_device)
File "/program/ms-swift/swift/utils/torch_utils.py", line 276, in set_device
torch.cuda.set_device(local_rank)
File "/program/miniconda3/envs/myenv_msswift/lib/python3.10/site-packages/torch/cuda/__init__.py", line 476, in set_device
torch._C._cuda_setDevice(device)
RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
The text was updated successfully, but these errors were encountered: