3 Commits

Author      SHA1          Message     Date
PCL-张晗    3386461756    v0          3 months ago
PCL-张晗    ffed470341    v0          3 months ago
PCL-张晗    9c7bd93de6    HeteroRL    3 months ago
100 changed files with 104 additions and 6484 deletions
  1. +0 -4 .gitignore
  2. +8 -0 .idea/.gitignore
  3. +1 -0 .idea/.name
  4. +12 -0 .idea/HeteroRL.iml
  5. +6 -0 .idea/inspectionProfiles/profiles_settings.xml
  6. +7 -0 .idea/misc.xml
  7. +8 -0 .idea/modules.xml
  8. +6 -0 .idea/vcs.xml
  9. +0 -137 GPUdebug.py
  10. +0 -201 LICENSE
  11. +0 -53 Makefile
  12. +17 -19 README.md
  13. BIN assets/plan-of-attack.png
  14. +0 -618 install.sh
  15. +0 -142 lighteval_results/main.py
  16. +0 -3 once.py
  17. +0 -60 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v0d.yaml
  18. +0 -65 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v1.yaml
  19. +0 -65 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v2.yaml
  20. +0 -65 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v3.yaml
  21. +0 -72 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v4.yaml
  22. +0 -76 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v4a.yaml
  23. +0 -76 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v4b.yaml
  24. +0 -76 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5.yaml
  25. +0 -78 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5a.yaml
  26. +0 -78 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5b.yaml
  27. +0 -78 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5c.yaml
  28. +0 -78 recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5d.yaml
  29. +0 -78 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0.yaml
  30. +0 -78 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0a.yaml
  31. +0 -78 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0b.yaml
  32. +0 -79 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0c1.yaml
  33. +0 -78 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0c2.yaml
  34. +0 -78 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0d.yaml
  35. +0 -77 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v1.yaml
  36. +0 -79 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v2.yaml
  37. +0 -79 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v2a.yaml
  38. +0 -79 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6a.yaml
  39. +0 -82 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6b.yaml
  40. +0 -81 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6c.yaml
  41. +0 -81 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6d.yaml
  42. +0 -81 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6e.yaml
  43. +0 -81 recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v7.yaml
  44. +7 -4 recipes/HeteoRL/config.yaml
  45. +8 -6 recipes/Online/config.yaml
  46. +0 -54 recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml
  47. +0 -54 recipes/Qwen2.5-7B-Instruct/grpo/config_demo_v1.yaml
  48. +0 -64 recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1_vllm.yaml
  49. +0 -60 recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2.yaml
  50. +0 -60 recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2_g16.yaml
  51. +0 -73 recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2_vllm.yaml
  52. +0 -73 recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v3_vllm.yaml
  53. +0 -15 recipes/README.md
  54. +0 -16 recipes/accelerate_configs/ddp.yaml
  55. +0 -16 recipes/accelerate_configs/ddp_4gpus.yaml
  56. +0 -16 recipes/accelerate_configs/ddp_6gpus.yaml
  57. +0 -16 recipes/accelerate_configs/ddp_8gpus.yaml
  58. +0 -27 recipes/accelerate_configs/fsdp.yaml
  59. +0 -21 recipes/accelerate_configs/zero1.yaml
  60. +0 -21 recipes/accelerate_configs/zero2.yaml
  61. +0 -21 recipes/accelerate_configs/zero2_2A100s.yaml
  62. +0 -22 recipes/accelerate_configs/zero3.yaml
  63. +0 -22 recipes/accelerate_configs/zero3_4A100s.yaml
  64. +0 -12 retrieve_analysis.sh
  65. +0 -479 retrieve_and_analysis.py
  66. +0 -472 retrieve_and_analysis_bak.py
  67. +0 -146 scripts/decontaminate.py
  68. +0 -174 scripts/generate_reasoning.py
  69. +0 -28 scripts/get_tensor_parallel_size.py
  70. +0 -61 scripts/run_benchmarks.py
  71. +0 -55 scripts/upload_details.py
  72. +0 -41 setup.cfg
  73. +0 -145 setup.py
  74. +4 -5 sh_dir/HeteroRL_Learner_4gpus.sh
  75. +13 -13 sh_dir/HeteroRL_Sampler_4gpus.sh
  76. +0 -60 sh_dir/Learner_4gpus_nRMs_LogNorm_benchmark_checkpoint.sh
  77. +7 -9 sh_dir/Online_gXpo_4gpus.sh
  78. +0 -44 sh_dir/Online_gXpo_4gpus_benchmark.sh
  79. +0 -27 sh_dir/README.md
  80. +0 -71 sh_dir/Sampler_4gpus_single_benchmark_checkpoint.sh
  81. +0 -55 sh_dir/debug/MoIS_Learner_4gpus_nRMs_debug.sh
  82. +0 -62 sh_dir/debug/MoIS_Sampler_4gpus_debug.sh
  83. +0 -24 sh_dir/debug/train_grpo_4gpus_debug.sh
  84. BIN sh_dir/old/.MoIS_Learner_4gpus_nRMs.sh.swp
  85. +0 -49 sh_dir/old/Debug_Learner_MoIS_4gpus_nRMs.sh
  86. +0 -27 sh_dir/old/Debug_Learner_v2_4gpus.sh
  87. +0 -38 sh_dir/old/Debug_Sampler_4gpus_Part1.sh
  88. +0 -2 sh_dir/old/Kill_Learner.sh
  89. +0 -2 sh_dir/old/Kill_Sampler.sh
  90. +0 -2 sh_dir/old/Kill_debug.sh
  91. +0 -56 sh_dir/old/MoIS_Learner_4gpus_nRMs.sh
  92. +0 -54 sh_dir/old/MoIS_Learner_4gpus_nRMs_LogNorm.sh
  93. +0 -59 sh_dir/old/MoIS_Learner_4gpus_nRMs_LogNorm_benchmark_checkpoint_bak.sh
  94. +0 -48 sh_dir/old/MoIS_Learner_4gpus_nRMs_debug.sh
  95. +0 -41 sh_dir/old/MoIS_SamplerV2_4gpus_Part1.sh
  96. +0 -41 sh_dir/old/MoIS_SamplerV2_4gpus_Part2.sh
  97. +0 -41 sh_dir/old/MoIS_SamplerV2_4gpus_Part3.sh
  98. +0 -41 sh_dir/old/MoIS_SamplerV2_4gpus_Part4.sh
  99. +0 -66 sh_dir/old/MoIS_Sampler_4gpus.sh
  100. +0 -40 sh_dir/old/MoIS_Sampler_4gpus_Part1.sh

+0 -4  .gitignore

@@ -1,4 +0,0 @@
wandb
log_dir
*.txt
A100_vs_910A

+8 -0  .idea/.gitignore

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

+1 -0  .idea/.name

@@ -0,0 +1 @@
HeteroRL

+12 -0  .idea/HeteroRL.iml

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$/../HeteroRL" />
<orderEntry type="jdk" jdkName="Python 3.13" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="GOOGLE" />
<option name="myDocStringFormat" value="Google" />
</component>
</module>

+6 -0  .idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

+7 -0  .idea/misc.xml

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.13" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13" project-jdk-type="Python SDK" />
</project>

+8 -0  .idea/modules.xml

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/AsyGPG.iml" filepath="$PROJECT_DIR$/.idea/AsyGPG.iml" />
</modules>
</component>
</project>

+6 -0  .idea/vcs.xml

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

+0 -137  GPUdebug.py

@@ -1,137 +0,0 @@
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import os


def setup_process(rank, world_size):
    """Set up the distributed process."""
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12356'
    # Initialize the process group, using the NCCL backend for GPU communication
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)


def cleanup_process():
    """Tear down the process group."""
    dist.destroy_process_group()


def all_reduce_demo(rank, world_size):
    """All-Reduce demo function."""
    # Select the current GPU device
    torch.cuda.set_device(rank)
    device = f'cuda:{rank}'
    print(f"\n=== Process {rank} (GPU {rank}) ===")
    # Create a test tensor - each process uses different values
    original_tensor = torch.tensor([
        [1.0 + rank, 2.0 + rank],
        [3.0 + rank, 4.0 + rank]
    ], device=device)
    print(f"Original tensor on process {rank}:")
    print(original_tensor)
    # Try the different All-Reduce operations
    operations = [
        (dist.ReduceOp.SUM, "sum"),
        (dist.ReduceOp.AVG, "average"),
        (dist.ReduceOp.MAX, "max"),
        (dist.ReduceOp.MIN, "min")
    ]
    for op, op_name in operations:
        # Clone the original tensor
        tensor_copy = original_tensor.clone()
        # Run All-Reduce
        dist.all_reduce(tensor_copy, op=op)
        print(f"\nProcess {rank} - result after {op_name}:")
        print(tensor_copy)


def main():
    """Main entry point."""
    # Use GPUs 0,1,2,3,4
    gpu_ids = [0, 1, 2, 3, 4]
    available_gpus = torch.cuda.device_count()
    print(f"GPUs available on this system: {available_gpus}")
    # Keep only the GPUs that actually exist
    valid_gpu_ids = [gpu_id for gpu_id in gpu_ids if gpu_id < available_gpus]
    if len(valid_gpu_ids) == 0:
        print("No GPUs available!")
        return
    world_size = len(valid_gpu_ids)
    print(f"Using GPUs: {valid_gpu_ids}")
    # Set the environment variable
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, valid_gpu_ids))
    try:
        # Launch the worker processes
        mp.spawn(
            all_reduce_demo,
            args=(world_size,),
            nprocs=world_size,
            join=True
        )
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        print("Demo finished")


# Simplified version - only performs a basic sum
def simple_all_reduce_demo(rank, world_size):
    """Simplified All-Reduce demo."""
    setup_process(rank, world_size)
    # Select the GPU device
    device = torch.device(f'cuda:{rank}')
    # Create the tensor
    tensor = torch.tensor([[rank + 1.0, rank + 2.0],
                           [rank + 3.0, rank + 4.0]], device=device)
    print(f"Process {rank} (GPU {rank}) - original tensor:")
    print(tensor)
    print("-" * 30)
    # All-Reduce operation (sum by default)
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    print(f"Process {rank} (GPU {rank}) - after All-Reduce:")
    print(tensor)
    cleanup_process()


if __name__ == "__main__":
    # Check whether CUDA is available
    if not torch.cuda.is_available():
        print("CUDA is not available!")
        exit(1)
    # Run the simplified version
    world_size = min(5, torch.cuda.device_count())  # use at most 5 GPUs
    if world_size == 0:
        print("No GPUs available!")
        exit(1)
    print(f"Running the demo with {world_size} GPUs")
    mp.spawn(
        simple_all_reduce_demo,
        args=(world_size,),
        nprocs=world_size,
        join=True
    )

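As a quick sanity check of what the deleted demo above computes (not part of the diff): with two ranks, `simple_all_reduce_demo` builds `[[1, 2], [3, 4]]` on rank 0 and `[[2, 3], [4, 5]]` on rank 1, and a SUM all-reduce leaves the element-wise sum on both ranks. A CPU-only sketch of that arithmetic:

```python
import torch

# Tensors as constructed in simple_all_reduce_demo for ranks 0 and 1.
rank0 = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
rank1 = torch.tensor([[2.0, 3.0], [4.0, 5.0]])

# dist.all_reduce(..., op=dist.ReduceOp.SUM) would leave this on every rank:
print(rank0 + rank1)  # tensor([[3., 5.], [7., 9.]])
```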
+0 -201  LICENSE

@@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

+0 -53  Makefile

@@ -1,53 +0,0 @@
.PHONY: style quality

# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
export PYTHONPATH = src

check_dirs := src tests


# dev dependencies
install:
	uv venv openr1 --python 3.11 && . openr1/bin/activate && uv pip install --upgrade pip
	uv pip install vllm==0.7.2
	uv pip install setuptools
	uv pip install flash-attn --no-build-isolation
	GIT_LFS_SKIP_SMUDGE=1 uv pip install -e ".[dev]"

style:
	ruff format --line-length 119 --target-version py310 $(check_dirs) setup.py
	isort $(check_dirs) setup.py

quality:
	ruff check --line-length 119 --target-version py310 $(check_dirs) setup.py
	isort --check-only $(check_dirs) setup.py
	flake8 --max-line-length 119 $(check_dirs) setup.py

test:
	pytest -sv --ignore=tests/slow/ tests/

slow_test:
	pytest -sv -vv tests/slow/

# Evaluation

evaluate:
	$(eval PARALLEL_ARGS := $(if $(PARALLEL),$(shell \
		if [ "$(PARALLEL)" = "data" ]; then \
			echo "data_parallel_size=$(NUM_GPUS)"; \
		elif [ "$(PARALLEL)" = "tensor" ]; then \
			echo "tensor_parallel_size=$(NUM_GPUS)"; \
		fi \
	),))
	$(if $(filter tensor,$(PARALLEL)),export VLLM_WORKER_MULTIPROC_METHOD=spawn &&,) \
	MODEL_ARGS="pretrained=$(MODEL),dtype=bfloat16,$(PARALLEL_ARGS),max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={max_new_tokens:32768,temperature:0.6,top_p:0.95}" && \
	if [ "$(TASK)" = "lcb" ]; then \
		lighteval vllm $$MODEL_ARGS "extended|lcb:codegeneration|0|0" \
			--use-chat-template \
			--output-dir data/evals/$(MODEL); \
	else \
		lighteval vllm $$MODEL_ARGS "custom|$(TASK)|0|0" \
			--custom-tasks src/open_r1/evaluate.py \
			--use-chat-template \
			--output-dir data/evals/$(MODEL); \
	fi

+17 -19  README.md

@@ -1,30 +1,28 @@
An heterogeneous RL algorithm built on [GPG](https://github.com/AMAP-ML/GPG)/[trl](https://github.com/huggingface/trl)/[openR1](https://github.com/huggingface/open-r1).

Asynchronous Reinforcement Learning
```shell
# Enter the current directory (if the directory is different, you need to replace the corresponding path variables in the script).
cd /userhome/Research_HUB/GPG/open-r1

# Launch the learner firstly(using 4 * 80GB Nvidia A100 by default)
CUDA_VISIBLE_DEVICES=2,3,4,5 bash sh_dir/MoIS_Learner_4gpus_nRMs_LogNorm_benchmark.sh learner_script_EqQ_v0_benchmark EqQ_1th 1 v6b EqQ Async_EqQ_diff_32

# Then launch the sampler(using 4 * 80GB Nvidia A100 for each sampler by default)
# Asynchronous Reinforcement Learning

## Option 1: launch all samplers at once
CUDA_VISIBLE_DEVICES=0,1,2,3 bash sh_dir/MoIS_Learner_4gpus_nRMs_LogNorm_benchmark_checkpoint.sh learner_script_checkpoint GEPO_nothink_1th 1 v6b gepo 1L2S_GEPO_diff32_nothink
## Enter the current directory (if the directory is different, you need to replace the corresponding path variables in the script).

## Option 2: launch samplers one by one in sequence
## (Optional) Resume from checkpoint
# please put the path of checkpoint into model_name_or_path
bash sh_dir/MoIS_Sampler_4gpus_single_benchmark_checkpoint.sh sampler_script_checkpoint GEPO_nothink_1th v6b gepo 1L2S_GEPO_diff32_nothink 0 &
bash sh_dir/MoIS_Sampler_4gpus_single_benchmark_checkpoint.sh sampler_script_checkpoint GEPO_nothink_1th v6b gepo 1L2S_GEPO_diff32_nothink 1 &
## Launch the learner firstly(using 4 * 80GB Nvidia A100 by default)
```shell
cd ./open-r1
CUDA_VISIBLE_DEVICES=0,1,2,3 bash sh_dir/HeteroRL_Learner_4gpus.sh learner_script_checkpoint GEPO_think_1th 1 v6b gepo 1L2S_GEPO_diff32_think
```
## Sampler: launch samplers one by one in sequence
### resume from checkpoint: put the path of checkpoint into model_name_or_path
```shell
bash sh_dir/HeteroRL_Sampler_4gpus.sh sampler_script_checkpoint GEPO_think_1th v6b gepo 1L2S_GEPO_diff32_think 0 &
bash sh_dir/HeteroRL_Sampler_4gpus.sh sampler_script_checkpoint GEPO_think_1th v6b gepo 1L2S_GEPO_diff32_think 1 &
bash sh_dir/HeteroRL_Sampler_4gpus.sh sampler_script_checkpoint GEPO_think_1th v6b gepo 1L2S_GEPO_diff32_think 2 &
bash sh_dir/HeteroRL_Sampler_4gpus.sh sampler_script_checkpoint GEPO_think_1th v6b gepo 1L2S_GEPO_diff32_think 3 &
```


Online-policy(using 4 * 80GB Nvidia A100 by default):

# We support grpo/bnpo/dr_grpo/gepo/gspo loss currently.
```shell
# Enter the current directory (if the directory is different, you need to replace the corresponding path variables in the script).
cd /userhome/Research_HUB/GPG/open-r1
# We support grpo/bnpo/dr_grpo/EqP/EqQ/gspo loss currently.
CUDA_VISIBLE_DEVICES="0,1,2,3" MASTER_PORT=29510 bash sh_dir/train_grpo_4gpus_benchmark.sh grpo
CUDA_VISIBLE_DEVICES="0,1,2,3" MASTER_PORT=29510 bash sh_dir/Online_gXpo_4gpus.sh gepo
```

BIN  assets/plan-of-attack.png (1761 × 2019 px, 371 KiB)

+0 -618  install.sh

@@ -1,618 +0,0 @@
#!/bin/sh
set -eu

# code-server's automatic install script.
# See https://coder.com/docs/code-server/latest/install

usage() {
arg0="$0"
if [ "$0" = sh ]; then
arg0="curl -fsSL https://code-server.dev/install.sh | sh -s --"
else
not_curl_usage="The latest script is available at https://code-server.dev/install.sh
"
fi

cath << EOF
Installs code-server.
It tries to use the system package manager if possible.
After successful installation it explains how to start using code-server.

Pass in user@host to install code-server on user@host over ssh.
The remote host must have internet access.
${not_curl_usage-}
Usage:

$arg0 [--dry-run] [--version X.X.X] [--edge] [--method detect] \
[--prefix ~/.local] [--rsh ssh] [user@host]

--dry-run
Echo the commands for the install process without running them.

--version X.X.X
Install a specific version instead of the latest.

--edge
Install the latest edge version instead of the latest stable version.

--method [detect | standalone]
Choose the installation method. Defaults to detect.
- detect detects the system package manager and tries to use it.
Full reference on the process is further below.
- standalone installs a standalone release archive into ~/.local
Add ~/.local/bin to your \$PATH to use it.

--prefix <dir>
Sets the prefix used by standalone release archives. Defaults to ~/.local
The release is unarchived into ~/.local/lib/code-server-X.X.X
and the binary symlinked into ~/.local/bin/code-server
To install system wide pass --prefix=/usr/local

--rsh <bin>
Specifies the remote shell for remote installation. Defaults to ssh.

The detection method works as follows:
- Debian, Ubuntu, Raspbian: install the deb package from GitHub.
- Fedora, CentOS, RHEL, openSUSE: install the rpm package from GitHub.
- Arch Linux: install from the AUR (which pulls releases from GitHub).
- FreeBSD, Alpine: install from npm.
- macOS: install using Homebrew if installed otherwise install from GitHub.
- All others: install the release from GitHub.

We only build releases on GitHub for amd64 and arm64 on Linux and amd64 for
macOS. When the detection method tries to pull a release from GitHub it will
fall back to installing from npm when there is no matching release for the
system's operating system and architecture.

The standalone method will force installion using GitHub releases. It will not
fall back to npm so on architectures without pre-built releases this will error.

The installer will cache all downloaded assets into ~/.cache/code-server

More installation docs are at https://coder.com/docs/code-server/latest/install
EOF
}

echo_latest_version() {
if [ "${EDGE-}" ]; then
version="$(curl -fsSL https://api.githubfast.com/repos/coder/code-server/releases | awk 'match($0,/.*"html_url": "(.*\/releases\/tag\/.*)".*/)' | head -n 1 | awk -F '"' '{print $4}')"
else
# https://gist.githubfast.com/lukechilds/a83e1d7127b78fef38c2914c4ececc3c#gistcomment-2758860
version="$(curl -fsSLI -o /dev/null -w "%{url_effective}" https://githubfast.com/coder/code-server/releases/latest)"
fi
version="${version#https://githubfast.com/coder/code-server/releases/tag/}"
version="${version#v}"
echo "$version"
}

echo_npm_postinstall() {
echoh
cath << EOF
npm package has been installed.

Extend your path to use code-server:
PATH="$NPM_BIN_DIR:\$PATH"
Then run with:
code-server
EOF
}

echo_standalone_postinstall() {
echoh
cath << EOF
Standalone release has been installed into $STANDALONE_INSTALL_PREFIX/lib/code-server-$VERSION

Extend your path to use code-server:
PATH="$STANDALONE_INSTALL_PREFIX/bin:\$PATH"
Then run with:
code-server
EOF
}

echo_brew_postinstall() {
echoh
cath << EOF
Brew release has been installed.

Run with:
code-server
EOF
}

echo_systemd_postinstall() {
echoh
cath << EOF
$1 package has been installed.

To have systemd start code-server now and restart on boot:
sudo systemctl enable --now code-server@\$USER
Or, if you don't want/need a background service you can run:
code-server
EOF
}

echo_coder_postinstall() {
echoh
echoh "Deploy code-server for your team with Coder: https://githubfast.com/coder/coder"
}

main() {
if [ "${TRACE-}" ]; then
set -x
fi

unset \
DRY_RUN \
METHOD \
OPTIONAL \
ALL_FLAGS \
RSH_ARGS \
EDGE \
RSH

ALL_FLAGS=""
while [ "$#" -gt 0 ]; do
case "$1" in
-*)
ALL_FLAGS="${ALL_FLAGS} $1"
;;
esac

case "$1" in
--dry-run)
DRY_RUN=1
;;
--method)
METHOD="$(parse_arg "$@")"
shift
;;
--method=*)
METHOD="$(parse_arg "$@")"
;;
--prefix)
STANDALONE_INSTALL_PREFIX="$(parse_arg "$@")"
shift
;;
--prefix=*)
STANDALONE_INSTALL_PREFIX="$(parse_arg "$@")"
;;
--version)
VERSION="$(parse_arg "$@")"
shift
;;
--version=*)
VERSION="$(parse_arg "$@")"
;;
--edge)
EDGE=1
;;
--rsh)
RSH="$(parse_arg "$@")"
shift
;;
--rsh=*)
RSH="$(parse_arg "$@")"
;;
-h | --h | -help | --help)
usage
exit 0
;;
--)
shift
# We remove the -- added above.
ALL_FLAGS="${ALL_FLAGS% --}"
RSH_ARGS="$*"
break
;;
-*)
echoerr "Unknown flag $1"
echoerr "Run with --help to see usage."
exit 1
;;
*)
RSH_ARGS="$*"
break
;;
esac

shift
done

if [ "${RSH_ARGS-}" ]; then
RSH="${RSH-ssh}"
echoh "Installing remotely with $RSH $RSH_ARGS"
curl -fsSL https://code-server.dev/install.sh | prefix "$RSH_ARGS" "$RSH" "$RSH_ARGS" sh -s -- "$ALL_FLAGS"
return
fi

METHOD="${METHOD-detect}"
if [ "$METHOD" != detect ] && [ "$METHOD" != standalone ]; then
echoerr "Unknown install method \"$METHOD\""
echoerr "Run with --help to see usage."
exit 1
fi

# These are used by the various install_* functions that make use of GitHub
# releases in order to download and unpack the right release.
CACHE_DIR=$(echo_cache_dir)
STANDALONE_INSTALL_PREFIX=${STANDALONE_INSTALL_PREFIX:-$HOME/.local}
VERSION=${VERSION:-$(echo_latest_version)}
# These can be overridden for testing but shouldn't normally be used as it can
# result in a broken code-server.
OS=${OS:-$(os)}
ARCH=${ARCH:-$(arch)}

distro_name

# Standalone installs by pulling pre-built releases from GitHub.
if [ "$METHOD" = standalone ]; then
if has_standalone; then
install_standalone
echo_coder_postinstall
exit 0
else
echoerr "There are no standalone releases for $ARCH"
echoerr "Please try again without '--method standalone'"
exit 1
fi
fi

# DISTRO can be overridden for testing but shouldn't normally be used as it
# can result in a broken code-server.
DISTRO=${DISTRO:-$(distro)}

case $DISTRO in
# macOS uses brew when available and falls back to standalone. We only have
# amd64 for macOS so for anything else use npm.
macos)
BREW_PATH="${BREW_PATH-brew}"
if command_exists "$BREW_PATH"; then
install_brew
else
echoh "Homebrew not installed."
echoh "Falling back to standalone installation."
npm_fallback install_standalone
fi
;;
# The .deb and .rpm files are pulled from GitHub and we only have amd64 and
# arm64 there and need to fall back to npm otherwise.
debian) npm_fallback install_deb ;;
fedora | opensuse) npm_fallback install_rpm ;;
# Arch uses the AUR package which only supports amd64 and arm64 since it
# pulls releases from GitHub so we need to fall back to npm.
arch) npm_fallback install_aur ;;
# We don't have GitHub releases that work on Alpine or FreeBSD so we have no
# choice but to use npm here.
alpine | freebsd) install_npm ;;
# For anything else we'll try to install standalone but fall back to npm if
# we don't have releases for the architecture.
*)
echoh "Unsupported package manager."
echoh "Falling back to standalone installation."
npm_fallback install_standalone
;;
esac

echo_coder_postinstall
}

parse_arg() {
case "$1" in
*=*)
# Remove everything after first equal sign.
opt="${1%%=*}"
# Remove everything before first equal sign.
optarg="${1#*=}"
if [ ! "$optarg" ] && [ ! "${OPTIONAL-}" ]; then
echoerr "$opt requires an argument"
echoerr "Run with --help to see usage."
exit 1
fi
echo "$optarg"
return
;;
esac

case "${2-}" in
"" | -*)
if [ ! "${OPTIONAL-}" ]; then
echoerr "$1 requires an argument"
echoerr "Run with --help to see usage."
exit 1
fi
;;
*)
echo "$2"
return
;;
esac
}

fetch() {
URL="$1"
FILE="$2"

if [ -e "$FILE" ]; then
echoh "+ Reusing $FILE"
return
fi

sh_c mkdir -p "$CACHE_DIR"
sh_c curl \
-#fL \
-o "$FILE.incomplete" \
-C - \
"$URL"
sh_c mv "$FILE.incomplete" "$FILE"
}

install_brew() {
echoh "Installing latest from Homebrew."
echoh

sh_c "$BREW_PATH" install code-server

echo_brew_postinstall
}

install_deb() {
echoh "Installing v$VERSION of the $ARCH deb package from GitHub."
echoh

fetch "https://githubfast.com/coder/code-server/releases/download/v$VERSION/code-server_${VERSION}_$ARCH.deb" \
"$CACHE_DIR/code-server_${VERSION}_$ARCH.deb"
sudo_sh_c dpkg -i "$CACHE_DIR/code-server_${VERSION}_$ARCH.deb"

echo_systemd_postinstall deb
}

install_rpm() {
echoh "Installing v$VERSION of the $ARCH rpm package from GitHub."
echoh

fetch "https://githubfast.com/coder/code-server/releases/download/v$VERSION/code-server-$VERSION-$ARCH.rpm" \
"$CACHE_DIR/code-server-$VERSION-$ARCH.rpm"
sudo_sh_c rpm -U "$CACHE_DIR/code-server-$VERSION-$ARCH.rpm"

echo_systemd_postinstall rpm
}

install_aur() {
echoh "Installing latest from the AUR."
echoh

sh_c mkdir -p "$CACHE_DIR/code-server-aur"
sh_c "curl -#fsSL https://aur.archlinux.org/cgit/aur.git/snapshot/code-server.tar.gz | tar -xzC $CACHE_DIR/code-server-aur --strip-components 1"
echo "+ cd $CACHE_DIR/code-server-aur"
if [ ! "${DRY_RUN-}" ]; then
cd "$CACHE_DIR/code-server-aur"
fi
sh_c makepkg -si --noconfirm

echo_systemd_postinstall AUR
}

install_standalone() {
echoh "Installing v$VERSION of the $ARCH release from GitHub."
echoh

fetch "https://githubfast.com/coder/code-server/releases/download/v$VERSION/code-server-$VERSION-$OS-$ARCH.tar.gz" \
"$CACHE_DIR/code-server-$VERSION-$OS-$ARCH.tar.gz"

# -w only works if the directory exists so try creating it first. If this
# fails we can ignore the error as the -w check will then swap us to sudo.
sh_c mkdir -p "$STANDALONE_INSTALL_PREFIX" 2> /dev/null || true

sh_c="sh_c"
if [ ! -w "$STANDALONE_INSTALL_PREFIX" ]; then
sh_c="sudo_sh_c"
fi

if [ -e "$STANDALONE_INSTALL_PREFIX/lib/code-server-$VERSION" ]; then
echoh
echoh "code-server-$VERSION is already installed at $STANDALONE_INSTALL_PREFIX/lib/code-server-$VERSION"
echoh "Remove it to reinstall."
exit 0
fi

"$sh_c" mkdir -p "$STANDALONE_INSTALL_PREFIX/lib" "$STANDALONE_INSTALL_PREFIX/bin"
"$sh_c" tar -C "$STANDALONE_INSTALL_PREFIX/lib" -xzf "$CACHE_DIR/code-server-$VERSION-$OS-$ARCH.tar.gz"
"$sh_c" mv -f "$STANDALONE_INSTALL_PREFIX/lib/code-server-$VERSION-$OS-$ARCH" "$STANDALONE_INSTALL_PREFIX/lib/code-server-$VERSION"
"$sh_c" ln -fs "$STANDALONE_INSTALL_PREFIX/lib/code-server-$VERSION/bin/code-server" "$STANDALONE_INSTALL_PREFIX/bin/code-server"

echo_standalone_postinstall
}

install_npm() {
echoh "Installing v$VERSION from npm."
echoh

NPM_PATH="${YARN_PATH-npm}"

if command_exists "$NPM_PATH"; then
sh_c="sh_c"
if [ ! "${DRY_RUN-}" ] && [ ! -w "$(NPM_PATH config get prefix)" ]; then
sh_c="sudo_sh_c"
fi
echoh "Installing with npm."
echoh
"$sh_c" "$NPM_PATH" install -g "code-server@$VERSION" --unsafe-perm
NPM_BIN_DIR="\$($NPM_PATH bin -g)" echo_npm_postinstall
return
fi
echoerr "Please install npm to install code-server!"
echoerr "You will need at least node v20 and a few C dependencies."
echoerr "See the docs https://coder.com/docs/code-server/latest/install#npm"

exit 1
}

# Run $1 if we have a standalone otherwise run install_npm.
npm_fallback() {
if has_standalone; then
$1
else
echoh "No standalone releases for $ARCH."
echoh "Falling back to installation from npm."
install_npm
fi
}

# Determine if we have standalone releases on GitHub for the system's arch.
has_standalone() {
case $ARCH in
arm64) return 0 ;;
# We only have arm64 for macOS.
amd64)
[ "$(distro)" != macos ]
return
;;
*) return 1 ;;
esac
}

os() {
uname="$(uname)"
case $uname in
Linux) echo linux ;;
Darwin) echo macos ;;
FreeBSD) echo freebsd ;;
*) echo "$uname" ;;
esac
}

# Print the detected Linux distro, otherwise print the OS name.
#
# Example outputs:
# - macos -> macos
# - freebsd -> freebsd
# - ubuntu, raspbian, debian ... -> debian
# - amzn, centos, rhel, fedora, ... -> fedora
# - opensuse-{leap,tumbleweed} -> opensuse
# - alpine -> alpine
# - arch, manjaro, endeavouros, ... -> arch
#
# Inspired by https://githubfast.com/docker/docker-install/blob/26ff363bcf3b3f5a00498ac43694bf1c7d9ce16c/install.sh#L111-L120.
distro() {
if [ "$OS" = "macos" ] || [ "$OS" = "freebsd" ]; then
echo "$OS"
return
fi

if [ -f /etc/os-release ]; then
(
. /etc/os-release
if [ "${ID_LIKE-}" ]; then
for id_like in $ID_LIKE; do
case "$id_like" in debian | fedora | opensuse | arch)
echo "$id_like"
return
;;
esac
done
fi

echo "$ID"
)
return
fi
}

# Print a human-readable name for the OS/distro.
distro_name() {
if [ "$(uname)" = "Darwin" ]; then
echo "macOS v$(sw_vers -productVersion)"
return
fi

if [ -f /etc/os-release ]; then
(
. /etc/os-release
echo "$PRETTY_NAME"
)
return
fi

# Prints something like: Linux 4.19.0-9-amd64
uname -sr
}

arch() {
uname_m=$(uname -m)
case $uname_m in
aarch64) echo arm64 ;;
x86_64) echo amd64 ;;
*) echo "$uname_m" ;;
esac
}

command_exists() {
if [ ! "$1" ]; then return 1; fi
command -v "$@" > /dev/null
}

sh_c() {
echoh "+ $*"
if [ ! "${DRY_RUN-}" ]; then
sh -c "$*"
fi
}

sudo_sh_c() {
if [ "$(id -u)" = 0 ]; then
sh_c "$@"
elif command_exists doas; then
sh_c "doas $*"
elif command_exists sudo; then
sh_c "sudo $*"
elif command_exists su; then
sh_c "su root -c '$*'"
else
echoh
echoerr "This script needs to run the following command as root."
echoerr " $*"
echoerr "Please install doas, sudo, or su."
exit 1
fi
}

echo_cache_dir() {
if [ "${XDG_CACHE_HOME-}" ]; then
echo "$XDG_CACHE_HOME/code-server"
elif [ "${HOME-}" ]; then
echo "$HOME/.cache/code-server"
else
echo "/tmp/code-server-cache"
fi
}

echoh() {
echo "$@" | humanpath
}

cath() {
humanpath
}

echoerr() {
echoh "$@" >&2
}

# humanpath replaces all occurrences of " $HOME" with " ~"
# and all occurrences of '"$HOME' with the literal '"$HOME'.
humanpath() {
sed "s# $HOME# ~#g; s#\"$HOME#\"\$HOME#g"
}

# We need to make sure we exit with a non zero exit if the command fails.
# /bin/sh does not support -o pipefail unfortunately.
prefix() {
PREFIX="$1"
shift
fifo="$(mktemp -d)/fifo"
mkfifo "$fifo"
sed -e "s#^#$PREFIX: #" "$fifo" &
"$@" > "$fifo" 2>&1
}

main "$@"

+0 -142  lighteval_results/main.py

@@ -1,142 +0,0 @@
# MIT License

# Copyright (c) 2025 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""Usage:
lighteval vllm \
"pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.6,top_p:0.95}" \
"extended|lcb:codegeneration|0|0"

lighteval vllm \
"pretrained=Qwen/Qwen2.5-Coder-3B-Instruct,dtype=bfloat16,data_parallel_size=8,max_model_length=32768,gpu_memory_utilization=0.8,generation_parameters={temperature:0.2,top_p:0.95}" \
"extended|lcb:codegeneration|0|0"
"""

import json
from typing import Any

import numpy as np
from aenum import extend_enum
from datasets import get_dataset_config_names

from lighteval.metrics.metrics import MetricCategory, Metrics, MetricUseCase, SampleLevelMetric
from lighteval.tasks.extended.lcb.codegen_metrics import (
codegen_metrics,
extract_code,
translate_private_test_cases,
)
from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig


def prepare_prompt(line: dict[str, Any]) -> str:
    query = "You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests.\n\n"
    query += f"Question: {line['question_content']}\n\n"
    if starter_code := line.get("starter_code", None):
        query += "You will use the following starter code to write the solution to the problem and enclose your code within delimiters."
        query += f"```python\n{starter_code}\n```\n\n"
    else:
        query += "Read the inputs from stdin solve the problem and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows."
        query += "```python\n# YOUR CODE HERE\n```\n\n"
    return query


def lcb_codegeneration_prompt_fn(line, task_name: str = "lcb:codegeneration") -> Doc:
    # For the prompt we need a more general function that can be used tweaked like in:
    # https://github.com/LiveCodeBench/LiveCodeBench/blob/main/lcb_runner/prompts/code_generation.py
    query = prepare_prompt(line)
    # List of dicts of the form: [{"input": "6\nabc\nacb\nbac\nbca\ncab\ncba\n", "output": "YES\nYES\nYES\nNO\nNO\nYES\n", "testtype": "stdin"}]
    public_test_cases = json.loads(line["public_test_cases"])
    private_test_cases = translate_private_test_cases(line["private_test_cases"])
    inputs = [test["input"] for test in public_test_cases + private_test_cases]
    outputs = [test["output"] for test in public_test_cases + private_test_cases]
    return Doc(
        task_name=task_name,
        query=query,
        choices=[""],
        gold_index=0,
        specific={
            "inputs": inputs,
            "outputs": outputs,
            "fn_name": json.loads(line["metadata"]).get("func_name", None),
        },
    )


def codegen_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> float:
    """Estimates the Pass@1 metric for the code generation task.
    Extract the code from each prediction, Runs it for each sample and generations,
    and computes the Pass@1 over the outputs.
    """
    # Extract generated code snippets
    generated_code_snippets = [[extract_code(pred) for pred in predictions]]  # noqa: F841
    evaluation_sample = {  # noqa: F841
        "inputs": formatted_doc.specific["inputs"],
        "outputs": formatted_doc.specific["outputs"],
        "fn_name": formatted_doc.specific["fn_name"],
    }
    # This is a list of lists because
    evaluation_sample = [{"input_output": json.dumps(evaluation_sample)}]

    metrics, _ = codegen_metrics(
        evaluation_sample,
        generated_code_snippets,
        k_list=[1],  # Only run for Pass@1
        num_process_evaluate=8,
    )
    return metrics["pass@1"]


lcb_codegen_metric = SampleLevelMetric(
    metric_name="codegen_pass@1:16",  # This is the way of informing the number of generations currently
    category=MetricCategory.GENERATIVE_SAMPLING,
    use_case=MetricUseCase.REASONING,
    higher_is_better=True,
    sample_level_fn=codegen_metric,
    corpus_level_fn=np.mean,
)


extend_enum(Metrics, "lcb_codegen_metric", lcb_codegen_metric)

configs = get_dataset_config_names("livecodebench/code_generation_lite", trust_remote_code=True)

tasks = []

for subset in configs:
    # To keep the base subset as the default, the others are named "lcb:codegeneration_v4", "lcb:codegeneration_v5"... etc
    name = "lcb:codegeneration" if subset == "v4_v5" else f"lcb:codegeneration_{subset}"
    task = LightevalTaskConfig(
        name=name,
        suite=["extended"],
        prompt_function=lcb_codegeneration_prompt_fn,
        hf_repo="livecodebench/code_generation_lite",
        hf_subset=subset,  # https://github.com/LiveCodeBench/LiveCodeBench/tree/main?tab=readme-ov-file#dataset-versions
        hf_avail_splits=["test"],
        evaluation_splits=["test"],
        generation_size=32768,
        metric=[Metrics.lcb_codegen_metric],
        stop_sequence=[],  # no stop sequence, will use EOS token
        trust_dataset=True,
        version=0,
    )
    tasks.append(task)


TASKS_TABLE = tasks

+0 -3  once.py

@@ -1,3 +0,0 @@
def main():
    print()


+0 -60  recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v0d.yaml

@@ -1,60 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags. Please reason step by step, and put your final answer within \\boxed{}."
#system_prompt: " Please reason step by step, and put your final answer within \\boxed{}."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20

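For orientation, the recipes above pair each `reward_funcs` entry with the `reward_weights` entry at the same index, and a TRL/open-r1 style GRPO trainer combines them as a weighted sum over the reward functions. A minimal sketch of that combination, with placeholder reward functions (the real `accuracy_lv35` etc. live elsewhere in the repo and are not shown in this diff):

```python
from typing import Callable, Sequence

def combined_reward(
    reward_funcs: Sequence[Callable[[str], float]],
    reward_weights: Sequence[float],
    completion: str,
) -> float:
    """Weighted sum over per-function rewards, index-paired as in the recipe YAMLs."""
    assert len(reward_funcs) == len(reward_weights)
    return sum(w * f(completion) for f, w in zip(reward_funcs, reward_weights))

# Placeholder stand-in for accuracy_lv35 with weight 1.0, as in the v0d recipe.
print(combined_reward([lambda c: 1.0], [1.0], r"<think>...</think> \boxed{42}"))
```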
+0 -65  recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v1.yaml

@@ -1,65 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- tag_count
- length
- repetition_penalty
reward_weights:
- 0.25
- 0.25
- 0.25
- 0.25
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20

+0 -65  recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v2.yaml

@@ -1,65 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- tag_count
- length
- repetition_penalty
reward_weights:
- 0.45
- 0.25
- 0.05
- 0.25
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20

+0 -65  recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v3.yaml

@@ -1,65 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- tag_count
- extra_box_v1
- repetition_penalty
reward_weights:
- 0.45
- 0.25
- 0.05
- 0.25
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20

+0 -72  recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v4.yaml

@@ -1,72 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60

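The v4-series recipes introduce `delay_sampler: "lognormal"` together with `lower_bound`, `upper_bound`, `confidence`, and `default_delay`. The sampler implementation itself is not part of this diff; one plausible reading of those keys, sketched below purely as an assumption, is a log-normal delay (in seconds) whose central `confidence` mass spans the two bounds, clamped to them, with `default_delay` as the fallback:

```python
import math
import random
import statistics

def sample_delay(lower_bound: float = 60.0, upper_bound: float = 1920.0,
                 confidence: float = 0.995, default_delay: float = 60.0) -> float:
    """Hypothetical log-normal delay sampler matching the recipe keys above."""
    try:
        # z such that the standard normal puts `confidence` mass inside (-z, z).
        z = statistics.NormalDist().inv_cdf(0.5 + confidence / 2.0)
        mu = (math.log(lower_bound) + math.log(upper_bound)) / 2.0
        sigma = (math.log(upper_bound) - math.log(lower_bound)) / (2.0 * z)
        return min(max(random.lognormvariate(mu, sigma), lower_bound), upper_bound)
    except (ValueError, ZeroDivisionError):
        return default_delay

print(sample_delay())  # a delay in seconds somewhere between the two bounds
```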
+0 -76  recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v4a.yaml

@@ -1,76 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 0.35
- 0.35
- 0.1
- 0.0
- 0.0
- 0.1
- 0.1
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
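Unlike most variants here, v4a spreads its weight across several reward functions: 0.35 each on `accuracy_lv35` and `extra_box_v1`, and 0.1 each on `fused_format`, `repetition_penalty`, and `language_penalty`. The `reward_funcs` and `reward_weights` lists pair element-wise, and the per-completion reward is presumably the weighted sum of the individual scores. A minimal illustration follows; the callables are dummies standing in for the repository's reward functions, which live in the training code, not in this diff.

```python
# Hypothetical sketch of element-wise reward aggregation: each function in
# reward_funcs is scaled by the weight at the same index and the products
# are summed per completion.
from typing import Callable, List


def combine_rewards(completions: List[str],
                    reward_funcs: List[Callable[[str], float]],
                    reward_weights: List[float]) -> List[float]:
    """Return one scalar reward per completion as a weighted sum."""
    assert len(reward_funcs) == len(reward_weights)
    return [
        sum(w * f(text) for f, w in zip(reward_funcs, reward_weights))
        for text in completions
    ]


# Toy usage with dummy reward functions standing in for the recipe's names.
funcs = [
    lambda t: 1.0 if "\\boxed{" in t else 0.0,   # accuracy-style check
    lambda t: -min(len(t) / 3000.0, 1.0),        # length penalty
]
print(combine_rewards(["\\boxed{42}", "no box here"], funcs, [0.35, 0.1]))
```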

+ 0
- 76
recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v4b.yaml View File

@@ -1,76 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
#system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60

+ 0
- 76
recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5.yaml View File

@@ -1,76 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60

+ 0
- 78
recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5a.yaml View File

@@ -1,78 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "weibull"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60

+ 0
- 78
recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5b.yaml View File

@@ -1,78 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "nodelay"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60

+ 0
- 78
recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5c.yaml View File

@@ -1,78 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 600
upper_bound: 3600
confidence: 0.995
default_delay: 60

+ 0
- 78
recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v5d.yaml View File

@@ -1,78 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "weibull"
lower_bound: 600
upper_bound: 3600
confidence: 0.995
default_delay: 60

+ 0
- 78
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0.yaml View File

@@ -1,78 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "weibull"
lower_bound: 600
upper_bound: 3600
confidence: 0.995
default_delay: 60
loss_type: "mois"

+ 0
- 78
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0a.yaml View File

@@ -1,78 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "weibull"
lower_bound: 600
upper_bound: 3600
confidence: 0.995
default_delay: 60
loss_type: "bnpo"

+ 0
- 78
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0b.yaml View File

@@ -1,78 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "weibull"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "bnpo"

+ 0
- 79
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0c1.yaml View File

@@ -1,79 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "nodelay"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "is_bnpo"

+ 0
- 78
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0c2.yaml View File

@@ -1,78 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "nodelay"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "pg"

+ 0
- 78
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0d.yaml View File

@@ -1,78 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "nodelay"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "pg"

+ 0
- 77
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v1.yaml View File

@@ -1,77 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 0.5
- 0.05
- 0.05
- 0.1
- 0.0
- 0.1
- 0.1
- 0.1
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "weibull"
lower_bound: 600
upper_bound: 3600
confidence: 0.995
default_delay: 60

+ 0
- 79
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v2.yaml View File

@@ -1,79 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "nodelay"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "ais_bnpo"

+ 0
- 79
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v2a.yaml View File

@@ -1,79 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "nodelay"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "pg"

+ 0
- 79
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6a.yaml View File

@@ -1,79 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "nodelay"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "amis_gspo"

+ 0
- 82
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6b.yaml View File

@@ -1,82 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
#system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "ais_bnpo"
cppo_beta: 0.005
max_diff_step: 32

+ 0
- 81
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6c.yaml View File

@@ -1,81 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "ais_bnpo"
cppo_beta: 0.005
max_diff_step: 4

+ 0
- 81
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6d.yaml View File

@@ -1,81 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "ais_bnpo"
cppo_beta: 0.005
max_diff_step: 16

+ 0
- 81
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v6e.yaml View File

@@ -1,81 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 60
upper_bound: 1920
confidence: 0.995
default_delay: 60
loss_type: "ais_bnpo"
cppo_beta: 0.005
max_diff_step: 8

+ 0
- 81
recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v7.yaml View File

@@ -1,81 +0,0 @@

# Model arguments
model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.45
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen3-1.7B
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 1.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
delay_sampler: "lognormal"
lower_bound: 36000
upper_bound: 72000
confidence: 0.995
default_delay: 36000
loss_type: "ais_bnpo"
cppo_beta: 0.0
max_diff_step: 32000

recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_v0c.yaml → recipes/HeteoRL/config.yaml View File

@@ -1,8 +1,9 @@
 
 # Model arguments
-model_name_or_path: /extrahome0/HF_models/Qwen/Qwen3-1.7B
+model_name_or_path: Qwen/Qwen3-1.7B
 model_revision: main
 torch_dtype: bfloat16
+# torch_dtype: float32
 attn_implementation: flash_attention_2
 
 # Data training arguments
@@ -38,7 +39,7 @@ max_completion_length: 3000
 max_steps: -1
 num_generations: 8
 num_train_epochs: 3
-output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
+output_dir: "/output_dir"
 overwrite_output_dir: true
 per_device_eval_batch_size: 16
 per_device_train_batch_size: 8
@@ -70,9 +71,11 @@ warmup_ratio: 0.03
 temperature: 0.6
 top_p : 0.95
 top_k : 20
-delay_sampler: "nodelay"
+delay_sampler: "lognormal"
 lower_bound: 60
 upper_bound: 1920
 confidence: 0.995
 default_delay: 60
-loss_type: "bnpo"
+loss_type: "gepo"
+cppo_beta: 0.005
+max_diff_step: 32
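The rename above promotes the experimental AsyncGRPO v0c recipe to `recipes/HeteoRL/config.yaml`, pointing `model_name_or_path` at the Hub ID, switching `delay_sampler` from "nodelay" to "lognormal", changing `loss_type` from "bnpo" to "gepo", and adding `cppo_beta` and `max_diff_step`. Since this recipe and `recipes/Online/config.yaml` are the ones this commit keeps, a small pre-flight check can catch obvious mistakes (mismatched `reward_funcs`/`reward_weights` lengths, inverted delay bounds) before a run. The sketch below is not part of the repository and assumes PyYAML is installed.

```python
# Hypothetical pre-flight check for the surviving recipe files
# (recipes/HeteoRL/config.yaml, recipes/Online/config.yaml).
import yaml


def check_recipe(path: str) -> dict:
    with open(path, "r", encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)
    funcs = cfg.get("reward_funcs", [])
    weights = cfg.get("reward_weights", [])
    if len(funcs) != len(weights):
        raise ValueError(f"{path}: {len(funcs)} reward_funcs vs {len(weights)} reward_weights")
    if "lower_bound" in cfg and cfg["lower_bound"] > cfg.get("upper_bound", float("inf")):
        raise ValueError(f"{path}: lower_bound exceeds upper_bound")
    return cfg


cfg = check_recipe("recipes/HeteoRL/config.yaml")
print(cfg["loss_type"], cfg["delay_sampler"])
```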

recipes/AsyncGPG/config_simple_rl_math_l35_nRMs_v0.yaml → recipes/Online/config.yaml View File

@@ -1,23 +1,23 @@
 
 # Model arguments
-model_name_or_path: models/Qwen2.5-Math-7B
-#model_name_or_path: Qwen/Qwen2.5-Math-7B
+model_name_or_path: Qwen/Qwen3-1.7B
 model_revision: main
 torch_dtype: bfloat16
+# torch_dtype: float32
 attn_implementation: flash_attention_2
 
 # Data training arguments
 #dataset_name: DigitalLearningGmbH/MATH-lighteval
 dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
 dataset_config: "train.parquet"
-system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
+system_prompt: "You are ara helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
 # GRPO trainer config
 beta: 0.0
 bf16: true
 use_vllm: true
 vllm_mode: "colocate"
 # vllm_device: auto
-vllm_gpu_memory_utilization: 0.45
+vllm_gpu_memory_utilization: 0.25
 do_eval: true
 eval_strategy: steps
 eval_steps: 33
@@ -25,7 +25,7 @@ gradient_accumulation_steps: 1
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-hub_model_id: Qwen-2.5-7B-Simple-RL
+hub_model_id: None
 hub_strategy: every_save
 learning_rate: 1.0e-06
 log_completions: false
@@ -39,7 +39,7 @@ max_completion_length: 3000
 max_steps: -1
 num_generations: 8
 num_train_epochs: 3
-output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
+output_dir: output_dir
 overwrite_output_dir: true
 per_device_eval_batch_size: 16
 per_device_train_batch_size: 8
@@ -57,3 +57,5 @@ warmup_ratio: 0.03
 temperature: 0.6
 top_p : 0.95
 top_k : 20
+# loss_type : "pg"
+loss_type: "grpo"

+ 0
- 54
recipes/Qwen2.5-7B-Instruct/grpo/config_demo.yaml View File

@@ -1,54 +0,0 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen2.5-7B-Instruct-GRPO
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_train_epochs: 1
output_dir: data/Qwen2.5-7B-Instruct-GRPO
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
warmup_ratio: 0.1

+ 0
- 54
recipes/Qwen2.5-7B-Instruct/grpo/config_demo_v1.yaml View File

@@ -1,54 +0,0 @@
# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
do_eval: false
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen2.5-7B-Instruct-GRPO
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 16
num_train_epochs: 1
output_dir: data/Qwen2.5-7B-Instruct-GRPO
overwrite_output_dir: true
per_device_train_batch_size: 4
push_to_hub: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
warmup_ratio: 0.1

+ 0
- 64
recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1_vllm.yaml View File

@@ -1,64 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
# torch_dtype: float32
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
#system_prompt: "You are ara helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. Please put your final answer within \\boxed{}. Also, indicate that it is the answer."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.25
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p : 0.95
top_k : 20
# loss_type : "pg"
loss_type: "grpo"

+ 0
- 60
recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2.yaml View File

@@ -1,60 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: false
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: true
eval_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 5
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 1
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v2
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 4
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- format
reward_weights:
- 1.0
- 1.0
save_strategy: "no"
seed: 42
warmup_ratio: 0.03
temperature: 1.0
top_p : 1.0
scale_rewards: false


+ 0
- 60
recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2_g16.yaml View File

@@ -1,60 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: false
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
do_eval: true
eval_strategy: steps
eval_steps: 100
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 5
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 16
num_train_epochs: 1
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v2-g16
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- format
reward_weights:
- 1.0
- 1.0
save_strategy: "no"
seed: 42
warmup_ratio: 0.03
temperature: 1.0
top_p : 1.0
scale_rewards: false


+ 0
- 73
recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v2_vllm.yaml View File

@@ -1,73 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.25
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 0.8
- 0.0
- 0.1
- 0.0
- 0.0
- 0.0
- 0.1
- 0.0
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p: 0.95
top_k: 20

+ 0
- 73
recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v3_vllm.yaml View File

@@ -1,73 +0,0 @@

# Model arguments
model_name_or_path: models/Qwen2.5-Math-7B
#model_name_or_path: Qwen/Qwen2.5-Math-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
#dataset_name: DigitalLearningGmbH/MATH-lighteval
dataset_name: "datas/SimpleRL-Zoo-Data/simplelr_qwen_level3to5"
dataset_config: "train.parquet"
system_prompt: "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
# GRPO trainer config
beta: 0.0
bf16: true
use_vllm: true
vllm_mode: "colocate"
# vllm_device: auto
vllm_gpu_memory_utilization: 0.25
do_eval: true
eval_strategy: steps
eval_steps: 33
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
hub_model_id: Qwen-2.5-7B-Simple-RL
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: false
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_prompt_length: 1024
max_completion_length: 3000
max_steps: -1
num_generations: 8
num_train_epochs: 3
output_dir: data/Qwen-2.5-7B-Simple-RL-GPG-math35-v1
overwrite_output_dir: true
per_device_eval_batch_size: 16
per_device_train_batch_size: 8
push_to_hub: false
report_to:
- tensorboard
reward_funcs:
- accuracy_lv35
- extra_box_v1
- extra_box_v2
- fused_format
- tag_count
- length
- repetition_penalty
- language_penalty
reward_weights:
- 0.5
- 0.05
- 0.05
- 0.1
- 0.0
- 0.1
- 0.1
- 0.1
save_strategy: "steps"
save_steps: 33
seed: 42
warmup_ratio: 0.03
temperature: 0.6
top_p: 0.95
top_k: 20

+ 0
- 15
recipes/README.md View File

@@ -1,15 +0,0 @@
# Post-training recipes

## OlympicCoder

To train the OlympicCoder models, run:

```
# 7B
sbatch --nodes=1 slurm/train.slurm OlympicCoder-7B sft v00.00 zero3

# 32B
sbatch --nodes=16 slurm/train.slurm OlympicCoder-32B sft v00.00 fsdp
```

Note that we found it necessary to switch to FSDP1 and paged AdamW 8-bit for the 32B model in order to fit the largest possible context size.
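For reference, a minimal sketch of what that switch could look like at the recipe level (hypothetical keys and values for illustration only; the actual OlympicCoder-32B recipe is not part of this diff, and the usable context length depends on your hardware):

```yaml
# Hypothetical 32B recipe excerpt — assumed settings, not a file in this repository
optim: paged_adamw_8bit      # paged 8-bit AdamW to shrink optimizer-state memory pressure
gradient_checkpointing: true
bf16: true
max_length: 16384            # assumption: raise to the largest context that still fits under FSDP
```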

+ 0
- 16
recipes/accelerate_configs/ddp.yaml View File

@@ -1,16 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 16
recipes/accelerate_configs/ddp_4gpus.yaml View File

@@ -1,16 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 16
recipes/accelerate_configs/ddp_6gpus.yaml View File

@@ -1,16 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 6
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 16
recipes/accelerate_configs/ddp_8gpus.yaml View File

@@ -1,16 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 27
recipes/accelerate_configs/fsdp.yaml View File

@@ -1,27 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
fsdp_activation_checkpointing: false # Need fix from: https://github.com/huggingface/transformers/pull/36610
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_backward_prefetch: BACKWARD_PRE
fsdp_cpu_ram_efficient_loading: true
fsdp_forward_prefetch: true
fsdp_offload_params: false
fsdp_sharding_strategy: FULL_SHARD
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sync_module_states: true
fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 21
recipes/accelerate_configs/zero1.yaml View File

@@ -1,21 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
deepspeed_multinode_launcher: standard
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: false
zero_stage: 1
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 21
recipes/accelerate_configs/zero2.yaml View File

@@ -1,21 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
deepspeed_multinode_launcher: standard
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: false
zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 21
recipes/accelerate_configs/zero2_2A100s.yaml View File

@@ -1,21 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
deepspeed_multinode_launcher: standard
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: false
zero_stage: 2
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 22
recipes/accelerate_configs/zero3.yaml View File

@@ -1,22 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
deepspeed_multinode_launcher: standard
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: true
zero3_save_16bit_model: true
zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 22
recipes/accelerate_configs/zero3_4A100s.yaml View File

@@ -1,22 +0,0 @@
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
deepspeed_multinode_launcher: standard
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: true
zero3_save_16bit_model: true
zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false

+ 0
- 12
retrieve_analysis.sh View File

@@ -1,12 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/retrieve_and_analysis/${formatted_time}.log
sync_weights_path=$1
max_num_model_weight=64
num_samples=64
num_generations=8
echo $log_path
# --skip_retrieve_model_weight
nohup python retrieve_and_analysis.py --sync_weights_path $sync_weights_path \
--num_samples $num_samples --num_generations $num_generations \
--max_num_model_weight $max_num_model_weight --random_dataset > $log_path 2>&1 &


+ 0
- 479
retrieve_and_analysis.py View File

@@ -1,479 +0,0 @@
import argparse
import atexit
import datetime
import os
import re
import shutil
import time
from contextlib import nullcontext
from pathlib import Path  # needed for Path(target_directory).glob(...) below

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.distributed as dist
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from latex2sympy2_extended import NormalizationConfig
from math_verify import LatexExtractionConfig, parse, verify
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl.extras.profiling import profiling_context
from trl.trainer.utils import pad, selective_log_softmax
from vllm import LLM, SamplingParams

def retrieve_model_weight(sync_weights_path=None, target_directory="/extrahome0/retrieve_model_weight", max_num_model_weight=32):  # initial global_step
    # if os.path.exists(sync_weights_path) and os.path.isdir(sync_weights_path):
    #     files = os.listdir(sync_weights_path)
    #     if len(files) == 0:
    #         print(f"Directory {save_dir} exists but is empty.")
    #     else:
    #         print(f"Directory {save_dir} exists and contains {len(files)} files:")
    #         print(files)
    # else:
    #     print(f"Directory {save_dir} does not exist!")
current_global_step = 0

    # Path to the weight file
    # sync_weights_path = "/extrahome0/save_dir/AsyncGRPO/4gpus/Async_MoISv6i_1th_cfgv6b/tmp/Qwen3-1.7B/gpg_async_weights.pt"

    # Target directory
    # target_directory = "/extrahome0/retrieve_model_weight"

    # Make sure the target directory exists
    os.makedirs(target_directory, exist_ok=True)

    print(f"Started monitoring file: {sync_weights_path}")
    print(f"Target directory: {target_directory}")
num_model_weight = 0
try:
while num_model_weight < max_num_model_weight:
try:
                # Read the currently saved global_step
                global_step, _ = torch.load(sync_weights_path, map_location="cpu")
                # Check whether the step is exactly one greater than the last recorded one
                if global_step == current_global_step + 1 or current_global_step == 0:
                    target_path = os.path.join(target_directory, f"gpg_async_weights_{global_step}.pt")
                    shutil.copy(sync_weights_path, target_path)
                    print(f"✅ Step increased by 1: {current_global_step} → {global_step}")
                    print(f"Copied weight file to: {target_path}")
                    num_model_weight += 1
                    # Update the recorded step
                    current_global_step = global_step
                elif global_step > current_global_step + 1:
                    print(f"⚠️ Step jump: {current_global_step} → {global_step} (intermediate steps were skipped)")
                    break
                    # current_global_step = global_step  # optional: whether to update depends on your needs
                else:
                    # global_step <= current_global_step, nothing to do
                    pass  # optionally log here

            # except FileNotFoundError:
            #     print(f"❌ File not found: {sync_weights_path}")
            except Exception as e:
                print(f"❌ Error while reading the file: {e}")

            # Wait 1 second before checking again
            time.sleep(1)

    except KeyboardInterrupt:
        print("\n\nMonitoring stopped manually.")


def custom_loading_dataset(dataset_name, train_name='train.parquet', test_name='test.parquet', max_length=512, tokenizer=None):
"""
Load and preprocess a dataset from Parquet files, and filter out samples exceeding a specified length.

Args:
dataset_name (str): The base directory of the dataset.
train_name (str, optional): The name of the training file. Defaults to 'train.parquet'.
test_name (str, optional): The name of the test file. Defaults to 'test.parquet'.
max_length (int, optional): Maximum length of the samples to keep. Defaults to 512.
        tokenizer (PreTrainedTokenizer, optional): tokenizer used to measure prompt lengths. Defaults to None.

Returns:
DatasetDict: A dictionary-like object containing the training and test datasets.
"""
    # Define the data file paths
train_path = os.path.join(dataset_name, train_name)
test_path = os.path.join(dataset_name, test_name)


    # Helper that computes the tokenized length of a text
def get_length(text):
inputs = tokenizer(text, return_tensors="pt", padding=False, truncation=False)
return inputs["input_ids"].shape[1]

    # Load the training data
try:
train_data = pd.read_parquet(train_path)
        train_data['split'] = 'train'  # add a split column
except FileNotFoundError:
raise FileNotFoundError(f"Training file not found at {train_path}")

    # Load the test data
try:
test_data = pd.read_parquet(test_path)
        test_data['split'] = 'test'  # add a split column
except FileNotFoundError:
print(f"Test file not found at {test_path}. Skipping test data.")
test_data = None

    # Define the column-name mapping
column_mapping = {
'ground_truth_answer': 'ground_truth',
'subject': 'topic',
'target': 'solution',
# 'data_source': 'source',
'input': 'instruction',
# 'ability': 'skill',
# 'reward_model': 'reward',
# 'extra_info': 'metadata',
'question': 'problem'
}


    # Rename the columns
train_data.rename(columns=column_mapping, inplace=True)

if test_data is not None:
test_data.rename(columns=column_mapping, inplace=True)


    # Compute the length of each sample
train_data['length'] = train_data['instruction'].apply(get_length)
if test_data is not None:
test_data['length'] = test_data['instruction'].apply(get_length)

    # Filter out samples longer than max_length
train_data = train_data[train_data['length'] <= max_length]
if test_data is not None:
test_data = test_data[test_data['length'] <= max_length]

    # Convert to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_data)
if test_data is not None:
test_dataset = Dataset.from_pandas(test_data)
else:
test_dataset = None

    # Create the DatasetDict
dataset_dict = DatasetDict({
'train': train_dataset,
'test': test_dataset
})

return dataset_dict
def make_conversation(example):
prompt = []
system_prompt = "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
if system_prompt is not None:
prompt.append({"role": "system", "content": system_prompt})
prompt.append({"role": "user", "content": example["problem"]})


# prompt = example["problem"] + " The reasoning process MUST BE enclosed within <think> and </think> tags. Please reason step by step, and put your final answer within \\boxed{}."
# if add_think:
# prompt += " /think"

return {"prompt": prompt}


def pre_process(completions):
"""retrieve the completion content from input"""
if isinstance(completions[0],(list,)):
completion_contents = [completion[0]["content"] for completion in completions]
elif isinstance(completions[0],(dict)):
completion_contents = [completion["content"] for completion in completions]
else:
completion_contents = [completion for completion in completions]
return completion_contents

def accuracy_reward_lv35(completions, solution, **kwargs):
"""Reward function that checks if the completion is the same as the ground truth."""
# if isinstance(completions[0],(dict)):
# contents = [completion["content"] for completion in completions]
# else:
# contents = [completion for completion in completions]
contents = pre_process(completions)
rewards = []
for content, sol in zip(contents, solution):
box_sol = "$\\\\boxed{}$".format(sol)
try:
gold_parsed = parse(
box_sol,
extraction_mode="first_match",
)
except TimeoutError:
rank = dist.get_rank() if dist.is_initialized() else 0
print(f"[Rank {rank}] gold parse timeout | content='{content}' | sol='{sol}' | box_sol='{box_sol}'")
rewards.append(1.0)
continue
if len(gold_parsed) != 0:
# We require the answer to be provided in correct latex (no malformed operators)
try:
answer_parsed = parse(
content,
extraction_config=[
LatexExtractionConfig(
normalization_config=NormalizationConfig(
nits=False,
malformed_operators=False,
basic_latex=True,
equations=True,
boxed="all",
units=True,
),
# Ensures that boxed is tried first
boxed_match_priority=0,
try_extract_without_anchor=False,
)
],
extraction_mode="first_match",
)
# print(f'answer_parsed:{answer_parsed}')
# if len(anxswer_parsed) == 0:
# print(f"answer_parsed is None | content='{content}' | sol='{sol}'")
except TimeoutError:
rank = dist.get_rank() if dist.is_initialized() else 0
print(f"[Rank {rank}] answer parse timeout | content='{content}' | sol='{sol}'")
rewards.append(0.0)
continue
# Reward 1 if the content is the same as the ground truth, 0 otherwise
try:
reward = float(verify(answer_parsed, gold_parsed))
except Exception as e:
print(f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}")
reward = 0.0
else:
# If the gold solution is not parseable, we reward 1 to skip this example
reward = 1.0
print("accuracy_reward_lv35: Failed to parse gold solution: ", box_sol)
rewards.append(reward)

return torch.Tensor(rewards)

def _get_per_token_logps(temperature, model, input_ids, attention_mask, logits_to_keep, batch_size=None) -> torch.Tensor:
batch_size = batch_size or input_ids.size(0) # Chunk inputs into smaller batches to reduce memory peak
all_logps = []
for i in range(0, input_ids.size(0), batch_size):
input_ids_batch = input_ids[i : i + batch_size]
attention_mask_batch = attention_mask[i : i + batch_size]

# We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded
logits = model(
input_ids=input_ids_batch, attention_mask=attention_mask_batch, logits_to_keep=logits_to_keep + 1
).logits
logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
input_ids_batch = input_ids_batch[:, -logits_to_keep:]
# Divide logits by sampling temperature.
# See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
logits = logits / temperature
logps = selective_log_softmax(logits, input_ids_batch) # compute logprobs for the input tokens
all_logps.append(logps)
return torch.cat(all_logps, dim=0)

def move_to_vllm(model, llm):
for name, param in model.named_parameters():
with nullcontext([param]):
llm_model = llm.llm_engine.model_executor.driver_worker.model_runner.model
llm_model.load_weights([(name, param.data)])
llm.reset_prefix_cache()
print('vllm updated!')

def cleanup_dist():
if dist.is_initialized():
print("Cleaning up distributed process group...")
dist.destroy_process_group()

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--sync_weights_path", type=str, required=True, help="The path to model weights")
parser.add_argument("--max_num_model_weight", type=int, required=True, help="The number of model weights")
parser.add_argument("--num_samples", type=int, required=True, help="The number of samples")
parser.add_argument("--num_generations", type=int, required=True, help="The number of generations per sample")
parser.add_argument("--skip_retrieve_model_weight", type=bool, default=False, help="skip the retrival of model weight")
parser.add_argument("--random_dataset", type=bool, default=False, help="re-sample different data from dataset")
return parser.parse_args()

def main():
args = get_args()
print(args)
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
target_directory = f"/extrahome0/retrieve_model_weight/{timestamp}"
max_num_model_weight = args.max_num_model_weight
if not args.skip_retrieve_model_weight:
retrieve_model_weight(args.sync_weights_path, target_directory, max_num_model_weight)
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

temperature=0.6
top_p=0.95
N=args.num_samples
top_k=20
max_length = 2048
num_generations=args.num_generations
begin_ind=0
end_ind=begin_ind+N
resample = True
scale_rewards = False
solutions = []
prompts_text = []
sampling_params = SamplingParams(temperature=temperature, top_p=top_p, top_k=top_k, max_tokens=max_length)

# Initialize the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("/extrahome0/HF_models/Qwen/Qwen3-1.7B")
# os.environ["VLLM_USE_V1"] = "0"

    # Configure the sampling parameters (for thinking mode)
sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=20, max_tokens=2048)

# Initialize the vLLM engine
# llm = LLM(model="/extrahome0/HF_models/Qwen/Qwen3-1.7B")
llm = LLM(model="/extrahome0/HF_models/Qwen/Qwen3-1.7B",gpu_memory_utilization=0.8)
tokenizer =AutoTokenizer.from_pretrained("/extrahome0/HF_models/Qwen/Qwen3-1.7B", trust_remote_code=True)
data_path = "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5"
model_id = "/extrahome0/HF_models/Qwen/Qwen3-1.7B"
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True
)

device = model.device
batch_size = 8
max_prompt_length = 768
seed = 42
mode = "test"
dataset = custom_loading_dataset(data_path, max_length=max_prompt_length, tokenizer=tokenizer)
dataset = dataset.map(make_conversation)
for split in dataset:
if "messages" in dataset[split].column_names:
dataset[split] = dataset[split].remove_columns("messages")

current_dataset = dataset['train'] if mode == "train" else dataset['test']
current_dataset = current_dataset.shuffle(seed=seed)

# model_list = sorted(os.listdir(target_directory))
model_list = sorted([file.name for file in Path(target_directory).glob('gpg_*.pt')], key=lambda x: int(re.search(r'gpg_async_weights_(\d+)', x).group(1)))
assert len(model_list) == max_num_model_weight, f"Error! got {len(model_list)} < {max_num_model_weight} models in model_list: {model_list}"
# def get_logprobs_and_reward(model_list):
log_probs = []
advantages_list = []
completion_ids_list = []
prompt_completion_ids_list = []
completion_mask_list = []
attention_mask_list = []
for model_name in tqdm(model_list):
model_id, state_dict = torch.load(f"{target_directory}/{model_name}", map_location="cpu")
model.load_state_dict(state_dict)
print(f"model_id {model_id} loaded!")
move_to_vllm(model, llm)

if resample:
for ind in range(begin_ind, end_ind):
for _ in range(num_generations):
prompts_text.append("<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer.<|im_start|>user\n" + current_dataset[ind]['problem'] + "<|im_end|>\n<|im_start|>assistant\n")
# prompts_text.append(dataset['train'][ind]['problem'] + "/no_think")
solutions.append(current_dataset[ind]['solution'])
# prompts_text = [maybe_apply_chat_template(example, tokenizer)["prompt"] for example in inputs]
prompt_inputs = tokenizer(
text=prompts_text, return_tensors="pt", padding=True, padding_side="left", add_special_tokens=False
)
prompt_ids, prompt_mask = prompt_inputs["input_ids"].to(device), prompt_inputs["attention_mask"].to(device)
if max_prompt_length is not None:
# If max_prompt_length is set, we trim the prompt to keep only the last `max_prompt_length` tokens.
# Then we decode those tokens back into text. We manually remove leading pad tokens from the decoded text,
# because we can't use `skip_special_tokens=True` (some special tokens are still needed for generation).
prompt_ids = prompt_ids[:, -max_prompt_length :]
prompt_mask = prompt_mask[:, -max_prompt_length :]
prompts_text = tokenizer.batch_decode(
prompt_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
)
prompts_text = [
re.sub(rf"^({re.escape(tokenizer.pad_token)})+", "", text) for text in prompts_text
]
begin_ind = end_ind
if not args.random_dataset:
resample = False

all_outputs = llm.generate(prompts_text, sampling_params, use_tqdm=False)
completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs]
completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
completion_ids = pad(completion_ids, padding_value=tokenizer.pad_token_id)
prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
is_eos = completion_ids == tokenizer.eos_token_id
eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
logits_to_keep = completion_ids.size(1)
with torch.no_grad():
logps = _get_per_token_logps(
model=model,
input_ids=prompt_completion_ids,
attention_mask=attention_mask,
logits_to_keep=logits_to_keep,
temperature=temperature,
batch_size=batch_size
)
log_probs.append(logps)
completions_text = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)
rewards = accuracy_reward_lv35(completions=completions_text, solution=solutions).to(device)
mean_grouped_rewards = rewards.view(-1, num_generations).mean(dim=1)
std_grouped_rewards = rewards.view(-1, num_generations).std(dim=1)
# is_std_zero = torch.isclose(std_grouped_rewards, torch.zeros_like(std_grouped_rewards))

# Normalize the rewards to compute the advantages
mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(num_generations, dim=0)
std_grouped_rewards = std_grouped_rewards.repeat_interleave(num_generations, dim=0)
advantages = rewards - mean_grouped_rewards
if scale_rewards:
advantages = advantages / (std_grouped_rewards + 1e-4)
advantages_list.append(advantages)
completion_ids_list.append(completion_ids)
completion_mask_list.append(completion_mask)
prompt_completion_ids_list.append(prompt_completion_ids)
attention_mask_list.append(attention_mask)
print("sampling finish!")
learner_log_probs = []
with torch.no_grad():
for i in tqdm(range(max_num_model_weight-1)):
learner_logps = _get_per_token_logps(
model=model,
input_ids=prompt_completion_ids_list[i],
attention_mask=attention_mask_list[i],
logits_to_keep=completion_ids_list[i].size(1),
temperature=temperature,
batch_size=batch_size
)
learner_log_probs.append(learner_logps)

save_path = f"{target_directory}/log_probs_and_advantages.pt"
torch.save({
'sampler_log_probs': log_probs,
'advantages_list': advantages_list,
'prompt_ids_list': prompt_inputs["input_ids"],
'prompt_mask_list': prompt_inputs["attention_mask"],
'completion_ids_list': completion_ids_list,
'prompt_completion_ids_list': prompt_completion_ids_list,
'completion_mask_list': completion_mask_list,
'attention_mask_list': attention_mask_list,
'learner_log_probs': learner_log_probs
}, save_path)

print("learning finish!")
# calculation

if __name__ == "__main__":
atexit.register(cleanup_dist)
main()

+ 0
- 472
retrieve_and_analysis_bak.py View File

@@ -1,472 +0,0 @@
import argparse
import atexit
import datetime
import os
import re
import shutil
import time
from contextlib import nullcontext
from pathlib import Path  # needed for Path(target_directory).glob(...) below

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.distributed as dist
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from latex2sympy2_extended import NormalizationConfig
from math_verify import LatexExtractionConfig, parse, verify
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl.extras.profiling import profiling_context
from trl.trainer.utils import pad, selective_log_softmax
from vllm import LLM, SamplingParams

def retrieve_model_weight(sync_weights_path=None, target_directory="/extrahome0/retrieve_model_weight", max_num_model_weight=32):  # initial global_step
    # if os.path.exists(sync_weights_path) and os.path.isdir(sync_weights_path):
    #     files = os.listdir(sync_weights_path)
    #     if len(files) == 0:
    #         print(f"Directory {save_dir} exists but is empty.")
    #     else:
    #         print(f"Directory {save_dir} exists and contains {len(files)} files:")
    #         print(files)
    # else:
    #     print(f"Directory {save_dir} does not exist!")
current_global_step = 0

    # Path to the weight file
    # sync_weights_path = "/extrahome0/save_dir/AsyncGRPO/4gpus/Async_MoISv6i_1th_cfgv6b/tmp/Qwen3-1.7B/gpg_async_weights.pt"

    # Target directory
    # target_directory = "/extrahome0/retrieve_model_weight"

    # Make sure the target directory exists
    os.makedirs(target_directory, exist_ok=True)

    print(f"Started monitoring file: {sync_weights_path}")
    print(f"Target directory: {target_directory}")
num_model_weight = 0
try:
while num_model_weight < max_num_model_weight:
try:
                # Read the currently saved global_step
                global_step, _ = torch.load(sync_weights_path, map_location="cpu")
                # Check whether the step is exactly one greater than the last recorded one
                if global_step == current_global_step + 1 or current_global_step == 0:
                    target_path = os.path.join(target_directory, f"gpg_async_weights_{global_step}.pt")
                    shutil.copy(sync_weights_path, target_path)
                    print(f"✅ Step increased by 1: {current_global_step} → {global_step}")
                    print(f"Copied weight file to: {target_path}")
                    num_model_weight += 1
                    # Update the recorded step
                    current_global_step = global_step
                elif global_step > current_global_step + 1:
                    print(f"⚠️ Step jump: {current_global_step} → {global_step} (intermediate steps were skipped)")
                    break
                    # current_global_step = global_step  # optional: whether to update depends on your needs
                else:
                    # global_step <= current_global_step, nothing to do
                    pass  # optionally log here

            # except FileNotFoundError:
            #     print(f"❌ File not found: {sync_weights_path}")
            except Exception as e:
                print(f"❌ Error while reading the file: {e}")

            # Wait 1 second before checking again
            time.sleep(1)

    except KeyboardInterrupt:
        print("\n\nMonitoring stopped manually.")


def custom_loading_dataset(dataset_name, train_name='train.parquet', test_name='test.parquet', max_length=512, tokenizer=None):
"""
Load and preprocess a dataset from Parquet files, and filter out samples exceeding a specified length.

Args:
dataset_name (str): The base directory of the dataset.
train_name (str, optional): The name of the training file. Defaults to 'train.parquet'.
test_name (str, optional): The name of the test file. Defaults to 'test.parquet'.
max_length (int, optional): Maximum length of the samples to keep. Defaults to 512.
        tokenizer (PreTrainedTokenizer, optional): tokenizer used to measure prompt lengths. Defaults to None.

Returns:
DatasetDict: A dictionary-like object containing the training and test datasets.
"""
    # Define the data file paths
train_path = os.path.join(dataset_name, train_name)
test_path = os.path.join(dataset_name, test_name)


    # Helper that computes the tokenized length of a text
def get_length(text):
inputs = tokenizer(text, return_tensors="pt", padding=False, truncation=False)
return inputs["input_ids"].shape[1]

    # Load the training data
try:
train_data = pd.read_parquet(train_path)
        train_data['split'] = 'train'  # add a split column
except FileNotFoundError:
raise FileNotFoundError(f"Training file not found at {train_path}")

    # Load the test data
try:
test_data = pd.read_parquet(test_path)
        test_data['split'] = 'test'  # add a split column
except FileNotFoundError:
print(f"Test file not found at {test_path}. Skipping test data.")
test_data = None

    # Define the column-name mapping
column_mapping = {
'ground_truth_answer': 'ground_truth',
'subject': 'topic',
'target': 'solution',
# 'data_source': 'source',
'input': 'instruction',
# 'ability': 'skill',
# 'reward_model': 'reward',
# 'extra_info': 'metadata',
'question': 'problem'
}


    # Rename the columns
train_data.rename(columns=column_mapping, inplace=True)

if test_data is not None:
test_data.rename(columns=column_mapping, inplace=True)


    # Compute the length of each sample
train_data['length'] = train_data['instruction'].apply(get_length)
if test_data is not None:
test_data['length'] = test_data['instruction'].apply(get_length)

    # Filter out samples longer than max_length
train_data = train_data[train_data['length'] <= max_length]
if test_data is not None:
test_data = test_data[test_data['length'] <= max_length]

    # Convert to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_data)
if test_data is not None:
test_dataset = Dataset.from_pandas(test_data)
else:
test_dataset = None

    # Create the DatasetDict
dataset_dict = DatasetDict({
'train': train_dataset,
'test': test_dataset
})

return dataset_dict
def make_conversation(example):
prompt = []
system_prompt = "You are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
if system_prompt is not None:
prompt.append({"role": "system", "content": system_prompt})
prompt.append({"role": "user", "content": example["problem"]})


# prompt = example["problem"] + " The reasoning process MUST BE enclosed within <think> and </think> tags. Please reason step by step, and put your final answer within \\boxed{}."
# if add_think:
# prompt += " /think"

return {"prompt": prompt}


def pre_process(completions):
"""retrieve the completion content from input"""
if isinstance(completions[0],(list,)):
completion_contents = [completion[0]["content"] for completion in completions]
elif isinstance(completions[0],(dict)):
completion_contents = [completion["content"] for completion in completions]
else:
completion_contents = [completion for completion in completions]
return completion_contents

def accuracy_reward_lv35(completions, solution, **kwargs):
"""Reward function that checks if the completion is the same as the ground truth."""
# if isinstance(completions[0],(dict)):
# contents = [completion["content"] for completion in completions]
# else:
# contents = [completion for completion in completions]
contents = pre_process(completions)
rewards = []
for content, sol in zip(contents, solution):
box_sol = "$\\\\boxed{}$".format(sol)
try:
gold_parsed = parse(
box_sol,
extraction_mode="first_match",
)
except TimeoutError:
rank = dist.get_rank() if dist.is_initialized() else 0
print(f"[Rank {rank}] gold parse timeout | content='{content}' | sol='{sol}' | box_sol='{box_sol}'")
rewards.append(1.0)
continue
if len(gold_parsed) != 0:
# We require the answer to be provided in correct latex (no malformed operators)
try:
answer_parsed = parse(
content,
extraction_config=[
LatexExtractionConfig(
normalization_config=NormalizationConfig(
nits=False,
malformed_operators=False,
basic_latex=True,
equations=True,
boxed="all",
units=True,
),
# Ensures that boxed is tried first
boxed_match_priority=0,
try_extract_without_anchor=False,
)
],
extraction_mode="first_match",
)
# print(f'answer_parsed:{answer_parsed}')
# if len(anxswer_parsed) == 0:
# print(f"answer_parsed is None | content='{content}' | sol='{sol}'")
except TimeoutError:
rank = dist.get_rank() if dist.is_initialized() else 0
print(f"[Rank {rank}] answer parse timeout | content='{content}' | sol='{sol}'")
rewards.append(0.0)
continue
# Reward 1 if the content is the same as the ground truth, 0 otherwise
try:
reward = float(verify(answer_parsed, gold_parsed))
except Exception as e:
print(f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}")
reward = 0.0
else:
# If the gold solution is not parseable, we reward 1 to skip this example
reward = 1.0
print("accuracy_reward_lv35: Failed to parse gold solution: ", box_sol)
rewards.append(reward)

return torch.Tensor(rewards)

def _get_per_token_logps(temperature, model, input_ids, attention_mask, logits_to_keep, batch_size=None) -> torch.Tensor:
batch_size = batch_size or input_ids.size(0) # Chunk inputs into smaller batches to reduce memory peak
all_logps = []
for i in range(0, input_ids.size(0), batch_size):
input_ids_batch = input_ids[i : i + batch_size]
attention_mask_batch = attention_mask[i : i + batch_size]

# We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded
logits = model(
input_ids=input_ids_batch, attention_mask=attention_mask_batch, logits_to_keep=logits_to_keep + 1
).logits
logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
input_ids_batch = input_ids_batch[:, -logits_to_keep:]
# Divide logits by sampling temperature.
# See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
logits = logits / temperature
logps = selective_log_softmax(logits, input_ids_batch) # compute logprobs for the input tokens
all_logps.append(logps)
return torch.cat(all_logps, dim=0)

def move_to_vllm(model, llm):
for name, param in model.named_parameters():
with nullcontext([param]):
llm_model = llm.llm_engine.model_executor.driver_worker.model_runner.model
llm_model.load_weights([(name, param.data)])
llm.reset_prefix_cache()
print('vllm updated!')

def cleanup_dist():
if dist.is_initialized():
print("Cleaning up distributed process group...")
dist.destroy_process_group()

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--sync_weights_path", type=str, required=True, help="The path to model weights")
parser.add_argument("--max_num_model_weight", type=int, required=True, help="The number of model weights")
parser.add_argument("--num_samples", type=int, required=True, help="The number of samples")
parser.add_argument("--num_generations", type=int, required=True, help="The number of generations per sample")
parser.add_argument("--skip_retrieve_model_weight", type=bool, default=False, required=True, help="skip the retrival of model weight")
parser.add_argument("--random_dataset", type=bool, default=False, required=True, help="skip the retrival of model weight")
return parser.parse_args()

def main():
args = get_args()
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
target_directory = f"/extrahome0/retrieve_model_weight/{timestamp}"
max_num_model_weight = args.max_num_model_weight
if not args.skip_retrieve_model_weight:
retrieve_model_weight(args.sync_weights_path, target_directory, max_num_model_weight)
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

temperature=0.6
top_p=0.95
N=args.num_samples
top_k=20
max_length = 2048
num_generations=args.num_generations
begin_ind=0
end_ind=begin_ind+N
scale_rewards = False
solutions = []
prompts_text = []
sampling_params = SamplingParams(temperature=temperature, top_p=top_p, top_k=top_k, max_tokens=max_length)

# Initialize the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("/extrahome0/HF_models/Qwen/Qwen3-1.7B")
# os.environ["VLLM_USE_V1"] = "0"

# Configurae the sampling parameters (for thinking mode)
sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=20, max_tokens=2048)

# Initialize the vLLM engine
# llm = LLM(model="/extrahome0/HF_models/Qwen/Qwen3-1.7B")
llm = LLM(model="/extrahome0/HF_models/Qwen/Qwen3-1.7B",gpu_memory_utilization=0.8)
tokenizer =AutoTokenizer.from_pretrained("/extrahome0/HF_models/Qwen/Qwen3-1.7B", trust_remote_code=True)
data_path = "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5"
model_id = "/extrahome0/HF_models/Qwen/Qwen3-1.7B"
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True
)

device = model.device
batch_size = 8
max_prompt_length = 768
seed = 42
mode = "test"
dataset = custom_loading_dataset(data_path, max_length=max_prompt_length, tokenizer=tokenizer)
dataset = dataset.map(make_conversation)
for split in dataset:
if "messages" in dataset[split].column_names:
dataset[split] = dataset[split].remove_columns("messages")

current_dataset = dataset['train'] if mode == "train" else dataset['test']
current_dataset = current_dataset.shuffle(seed=seed)

for ind in range(begin_ind, end_ind):
for _ in range(num_generations):
prompts_text.append("<|im_start|>system\nYou are a helpful AI Assistant, designed to provided well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer.<|im_start|>user\n" + current_dataset[ind]['problem'] + "<|im_end|>\n<|im_start|>assistant\n")
# prompts_text.append(dataset['train'][ind]['problem'] + "/no_think")
solutions.append(current_dataset[ind]['solution'])
# prompts_text = [maybe_apply_chat_template(example, tokenizer)["prompt"] for example in inputs]
prompt_inputs = tokenizer(
text=prompts_text, return_tensors="pt", padding=True, padding_side="left", add_special_tokens=False
)
prompt_ids, prompt_mask = prompt_inputs["input_ids"].to(device), prompt_inputs["attention_mask"].to(device)
if max_prompt_length is not None:
# If max_prompt_length is set, we trim the prompt to keep only the last `max_prompt_length` tokens.
# Then we decode those tokens back into text. We manually remove leading pad tokens from the decoded text,
# because we can't use `skip_special_tokens=True` (some special tokens are still needed for generation).
prompt_ids = prompt_ids[:, -max_prompt_length :]
prompt_mask = prompt_mask[:, -max_prompt_length :]
prompts_text = tokenizer.batch_decode(
prompt_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
)
prompts_text = [
re.sub(rf"^({re.escape(tokenizer.pad_token)})+", "", text) for text in prompts_text
]

print(len(prompts_text))
# model_list = sorted(os.listdir(target_directory))
model_list = sorted([file.name for file in Path(target_directory).glob('gpg_*.pt')], key=lambda x: int(re.search(r'gpg_async_weights_(\d+)', x).group(1)))
assert len(model_list) == max_num_model_weight, f"Error! got {len(model_list)} < {max_num_model_weight} models in model_list: {model_list}"
# def get_logprobs_and_reward(model_list):
log_probs = []
advantages_list = []
completion_ids_list = []
prompt_completion_ids_list = []
completion_mask_list = []
attention_mask_list = []
for model_name in tqdm(model_list):
model_id, state_dict = torch.load(f"{target_directory}/{model_name}", map_location="cpu")
model.load_state_dict(state_dict)
print(f"model_id {model_id} loaded!")
move_to_vllm(model, llm)
all_outputs = llm.generate(prompts_text, sampling_params, use_tqdm=False)
completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs]
completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
completion_ids = pad(completion_ids, padding_value=tokenizer.pad_token_id)
prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
is_eos = completion_ids == tokenizer.eos_token_id
eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
logits_to_keep = completion_ids.size(1)
with torch.no_grad():
logps = _get_per_token_logps(
model=model,
input_ids=prompt_completion_ids,
attention_mask=attention_mask,
logits_to_keep=logits_to_keep,
temperature=temperature,
batch_size=batch_size
)
log_probs.append(logps)
completions_text = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)
rewards = accuracy_reward_lv35(completions=completions_text, solution=solutions).to(device)
mean_grouped_rewards = rewards.view(-1, num_generations).mean(dim=1)
std_grouped_rewards = rewards.view(-1, num_generations).std(dim=1)
# is_std_zero = torch.isclose(std_grouped_rewards, torch.zeros_like(std_grouped_rewards))

# Normalize the rewards to compute the advantages
mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(num_generations, dim=0)
std_grouped_rewards = std_grouped_rewards.repeat_interleave(num_generations, dim=0)
advantages = rewards - mean_grouped_rewards
if scale_rewards:
advantages = advantages / (std_grouped_rewards + 1e-4)
advantages_list.append(advantages)
completion_ids_list.append(completion_ids)
completion_mask_list.append(completion_mask)
prompt_completion_ids_list.append(prompt_completion_ids)
attention_mask_list.append(attention_mask)
print("sampling finish!")
learner_log_probs = []
with torch.no_grad():
for i in tqdm(range(max_num_model_weight-1)):
learner_logps = _get_per_token_logps(
model=model,
input_ids=prompt_completion_ids_list[i],
attention_mask=attention_mask_list[i],
logits_to_keep=completion_ids_list[i].size(1),
temperature=temperature,
batch_size=batch_size
)
learner_log_probs.append(learner_logps)

save_path = f"{target_directory}/log_probs_and_advantages.pt"
torch.save({
'sampler_log_probs': log_probs,
'advantages_list': advantages_list,
'prompt_ids_list': prompt_inputs["input_ids"],
'prompt_mask_list': prompt_inputs["attention_mask"],
'completion_ids_list': completion_ids_list,
'prompt_completion_ids_list': prompt_completion_ids_list,
'completion_mask_list': completion_mask_list,
'attention_mask_list': attention_mask_list,
'learner_log_probs': learner_log_probs
}, save_path)

print("learning finish!")
# calculation

if __name__ == "__main__":
atexit.register(cleanup_dist)
main()

+ 0
- 146
scripts/decontaminate.py View File

@@ -1,146 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script is used to decontaminate a dataset by checking for n-gram overlap with other datasets.
It uses the same approach presented in https://arxiv.org/abs/2501.19393,
as found in: https://github.com/simplescaling/s1/blob/main/data/decontaminate_util.py

Usage:

python scripts/decontaminate.py \
--dataset open-r1/verifiable-coding-problems-python \
--split train \
--ngram_size 8 \
--problem_column problem \
--cleanup
"""

import collections

from tqdm import tqdm


def normalize_string(text: str) -> str:
"""Basic string normalization."""
# Convert to lowercase and normalize whitespace
text = text.lower().strip()
# Replace multiple spaces with single space
text = " ".join(text.split())
return text


def word_ngrams(text: str, n: int) -> list:
"""Generate word-level n-grams from text."""
words = text.split()
return [" ".join(words[i : i + n]) for i in range(len(words) - n + 1)]


def build_ngram_lookup(documents: list[str], ngram_size: int = 8) -> dict[str, set[int]]:
"""Build ngram lookup for documents."""
lookup = collections.defaultdict(set)

for doc_id, document in enumerate(tqdm(documents)):
normalized_text = normalize_string(document)
ngrams = word_ngrams(normalized_text, ngram_size)
for ngram in ngrams:
lookup[ngram].add(doc_id)

return lookup


def build_ngram_single(document: str, ngram_size: int = 8) -> set[str]:
normalized_text = normalize_string(document)
ngrams = word_ngrams(normalized_text, ngram_size)

return set(ngrams)


if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, required=True, help="Name of the dataset to check for contamination.")
parser.add_argument("--config", type=str, default=None, help="Name of the dataset config to load.")
parser.add_argument("--split", type=str, default="train", help="Split to check for contamination, defaults to `train`.")
parser.add_argument("--ngram_size", type=int, default=8, help="Size of n-grams to build, defaults to 8.")
parser.add_argument(
"--problem_column", type=str, default="problem", help="Name of the column containing the problem (prompt)."
)
parser.add_argument(
"--cleanup",
action="store_true",
help="Whether to remove the contaminated rows before pushing the dataset.",
)
parser.add_argument(
"--new_dataset_name",
type=str,
default=None,
help="New name for the dataset. If not provided, will reuse the name and add a `_decontaminated` to the name."
)
args = parser.parse_args()

from datasets import load_dataset, Dataset

# Load the dataset to check for contamination
ds = load_dataset(args.dataset, name=args.config, split=args.split)

eval_datasets = {
"aime_2024": (load_dataset("HuggingFaceH4/aime_2024", split="train"), "problem"),
"aime_2025": (load_dataset("yentinglin/aime_2025", split="train"), "problem"),
"math_500": (load_dataset("HuggingFaceH4/MATH-500", split="test"), "problem"),
"gpqa": (load_dataset("Idavidrein/gpqa", "gpqa_diamond", split="train", trust_remote_code=True), "Question"),
"lcb": (
load_dataset(
"livecodebench/code_generation_lite", split="test", version_tag="v4_v5", trust_remote_code=True
),
"question_content",
),
}
ngram_lookups = {}
for ds_name, (eval_dataset, problem_col) in eval_datasets.items():
ngram_lookups[ds_name] = build_ngram_lookup(eval_dataset[problem_col], ngram_size=args.ngram_size)

for eval_name, ngram_lookup in ngram_lookups.items():
# Update the ngram_lookup variable for each dataset
def find_contaminated(row):
# For each example we have to build the ngrams and check for all of them on each row
ngrams = build_ngram_single(row[args.problem_column], ngram_size=args.ngram_size)
row[f"contaminated_{eval_name}"] = any(set(ngram in ngram_lookup for ngram in ngrams))
return row

ds = ds.map(find_contaminated, num_proc=8)

# Allow cleaning up via CLI args (removing the contaminated examples and dropping the columns)
def cleanup(dataset: Dataset) -> Dataset:
initial_size = len(dataset)
contamination_cols = [col for col in dataset.column_names if col.startswith("contaminated_")]
for col in contamination_cols:
if col.startswith("contaminated_"):
size_prior = len(dataset)
dataset = dataset.filter(lambda x: not x[col], num_proc=8)
if len(dataset) < size_prior:
print(f"Removed {size_prior - len(dataset)} samples from '{col.replace('contaminated_', '')}'")
dataset = dataset.remove_columns(contamination_cols)
print(f"Initial size: {initial_size}, Final size: {len(dataset)}")
return dataset

if args.cleanup:
ds = cleanup(ds)

new_ds_name = args.new_dataset_name or f"{args.dataset}_decontaminated"
config_name = args.config if args.config is not None else "default"
url = ds.push_to_hub(new_ds_name, config_name=config_name, split="train")
print(f"Decontaminated dataset: {url}")

+ 0
- 174
scripts/generate_reasoning.py View File

@@ -1,174 +0,0 @@
import argparse
import asyncio
import hashlib
import json
import os
import random
from asyncio import Lock
from typing import Set

from datasets import load_dataset
from tqdm.asyncio import tqdm

import aiofiles
import aiohttp
import uvloop


file_lock = Lock()


async def generate_completion(session, prompt, args):
retry_budget = 10
while retry_budget > 0:
try:
await asyncio.sleep(random.uniform(0.0, 0.1))
async with session.post(
f"http://{args.api_addr}/v1/chat/completions",
json={
"model": "default",
"messages": [{"role": "user", "content": prompt}],
"max_tokens": args.max_tokens,
"temperature": args.temperature,
"top_p": args.top_p,
},
headers={"Authorization": "Bearer EMPTY"},
) as response:
return await response.json(content_type=None)
except Exception as e:
print(f"API error (will retry): {e}")
retry_budget -= 1
await asyncio.sleep(10)
return None


async def process_example(example, session, args, output_file, pbar):
prompt = args.prompt_template.format(prompt=example[args.prompt_column])

try:
tasks = [generate_completion(session, prompt, args) for _ in range(args.num_generations)]

completions = await asyncio.gather(*tasks)

if any(completion is None for completion in completions):
print(f"Error processing example")
pbar.update(1)
return None

generations = []
finish_reasons = []
api_metadata = []

for completion in completions:
generations.append(completion["choices"][0]["message"]["content"])
finish_reasons.append(completion["choices"][0]["finish_reason"])
api_metadata.append(completion["usage"])

# Combine original dataset fields with generations
result = {
**example, # Preserve all original dataset fields
"generations": generations,
"finish_reasons": finish_reasons,
"api_metadata": api_metadata,
}

# Write to file with lock
async with file_lock:
async with aiofiles.open(output_file, mode="a") as f:
await f.write(json.dumps(result) + "\n")
await f.flush()

pbar.set_postfix(active=len(pbar.active_tasks), refresh=False)
pbar.update(1)

return result
except Exception as e:
print(f"Error processing example: {e}")
pbar.update(1)
return None


async def load_processed_uuids(output_file, uuid_column):
processed_uuids = set()
if os.path.exists(output_file):
async with aiofiles.open(output_file, mode="r") as f:
async for line in f:
try:
data = json.loads(line)
processed_uuids.add(hashlib.md5(str(data[uuid_column]).encode()).hexdigest())
except json.JSONDecodeError:
continue
return processed_uuids


async def main():
parser = argparse.ArgumentParser()
parser.add_argument("--dataset-name", type=str, required=True)
parser.add_argument("--output-file", type=str, required=True)
parser.add_argument("--prompt-column", type=str, required=True)
parser.add_argument("--uuid-column", type=str, required=True)
parser.add_argument("--api-addr", type=str, default="localhost:39876")
parser.add_argument("--num-generations", type=int, default=4)
parser.add_argument(
"--prompt-template",
type=str,
default="You will be given a problem. Please reason step by step, and put your final answer within \\boxed{{}}:\n{prompt}",
)
parser.add_argument("--temperature", type=float, default=0.6)
parser.add_argument("--top-p", type=float, default=0.95)
parser.add_argument("--max-tokens", type=int, default=16384)
parser.add_argument("--max-concurrent", type=int, default=1000)
args = parser.parse_args()

dataset = load_dataset(args.dataset_name, split="train").shuffle()
processed_uuids = await load_processed_uuids(args.output_file, args.uuid_column)
if processed_uuids:
print(f"Found {len(processed_uuids)} already processed examples, resuming from there...")

if not os.path.exists(args.output_file):
async with aiofiles.open(args.output_file, mode="w") as f:
await f.write("")

active_tasks: Set[asyncio.Task] = set()

pbar = tqdm(
total=len(dataset) - len(processed_uuids),
desc="Generating responses",
unit="row",
mininterval=2,
smoothing=0.0001,
)
pbar.active_tasks = active_tasks

async with aiohttp.ClientSession(
timeout=aiohttp.ClientTimeout(total=60 * 60),
connector=aiohttp.TCPConnector(limit=args.max_concurrent, ttl_dns_cache=300, keepalive_timeout=60 * 60),
) as session:
for example in dataset:
uuid = hashlib.md5(str(example[args.uuid_column]).encode()).hexdigest()
if uuid not in processed_uuids:
# Wait if we've hit the concurrency limit
while len(active_tasks) >= args.max_concurrent:
done, active_tasks = await asyncio.wait(active_tasks, return_when=asyncio.FIRST_COMPLETED)
for task in done:
try:
await task
except Exception as e:
print(f"Task failed: {e}")

task = asyncio.create_task(process_example(example, session, args, args.output_file, pbar))
active_tasks.add(task)
task.add_done_callback(active_tasks.discard)

pbar.set_postfix(active=len(active_tasks), refresh=True)

# Wait for remaining tasks
if active_tasks:
await asyncio.gather(*active_tasks, return_exceptions=True)

pbar.close()


if __name__ == "__main__":
uvloop.install()
asyncio.run(main())

+ 0
- 28
scripts/get_tensor_parallel_size.py View File

@@ -1,28 +0,0 @@
import argparse
from transformers import AutoConfig
from math import gcd

def get_tensor_parallel_size(model_name: str, revision: str = None, default_tp: int = 8) -> int:
try:
config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=True)
num_heads = getattr(config, 'num_attention_heads', None)

if num_heads is not None and num_heads % default_tp != 0:
tp = gcd(num_heads, default_tp)
return max(tp, 1)
else:
return default_tp
except Exception as e:
print(f"Warning: Failed to fetch config for {model_name}@{revision}: {e}")
return default_tp

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, required=True, help="Hugging Face model name or path")
parser.add_argument("--revision", type=str, default=None, help="Model revision if applicable")
parser.add_argument("--default_tp", type=int, default=8, help="Default TP size (usually GPUs per node)")

args = parser.parse_args()

tp = get_tensor_parallel_size(args.model_name, args.revision, args.default_tp)
print(tp)

+ 0
- 61
scripts/run_benchmarks.py View File

@@ -1,61 +0,0 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass, field
from typing import List, Optional

from open_r1.utils.evaluation import SUPPORTED_BENCHMARKS, run_benchmark_jobs
from open_r1.configs import SFTConfig
from trl import ModelConfig, TrlParser


@dataclass
class ScriptArguments:
    model_id: str = field(
        default="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
        metadata={"help": "The Hub model id to push the model to."},
    )
    model_revision: str = field(default="main", metadata={"help": "The Hub model branch to push the model to."})
    trust_remote_code: bool = field(default=False, metadata={"help": "Trust the remote code."})
    benchmarks: List[str] = field(
        default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."}
    )
    list_benchmarks: bool = field(default=False, metadata={"help": "List all supported benchmarks."})
    system_prompt: Optional[str] = field(
        default=None, metadata={"help": "The system prompt to use for the benchmark."}
    )


def main():
    parser = TrlParser(ScriptArguments)
    args = parser.parse_args_and_config()[0]
    if args.list_benchmarks:
        print("Supported benchmarks:")
        for benchmark in SUPPORTED_BENCHMARKS:
            print(f" - {benchmark}")
        return
    benchmark_args = SFTConfig(
        output_dir="",
        hub_model_id=args.model_id,
        hub_model_revision=args.model_revision,
        benchmarks=args.benchmarks,
        system_prompt=args.system_prompt,
    )
    run_benchmark_jobs(
        benchmark_args,
        ModelConfig(model_name_or_path="", model_revision="", trust_remote_code=args.trust_remote_code),
    )


if __name__ == "__main__":
    main()

+ 0
- 55
scripts/upload_details.py View File

@@ -1,55 +0,0 @@
# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Push the details from a LightEval run to the Hub.

Usage:

python src/open_r1/utils/upload_details.py \
--data_files {path_to_parquet_file} \
--hub_repo_id {hub_repo_id} \
--config_name {config_name}
"""

from dataclasses import dataclass, field
from typing import List

from datasets import load_dataset
from transformers import HfArgumentParser


@dataclass
class ScriptArguments:
    data_files: List[str] = field(default_factory=list)
    hub_repo_id: str = None
    config_name: str = None


def main():
    parser = HfArgumentParser(ScriptArguments)
    args = parser.parse_args_into_dataclasses()[0]

    if all(file.endswith(".json") for file in args.data_files):
        ds = load_dataset("json", data_files=args.data_files)
    elif all(file.endswith(".jsonl") for file in args.data_files):
        ds = load_dataset("json", data_files=args.data_files)
    else:
        ds = load_dataset("parquet", data_files=args.data_files)
    url = ds.push_to_hub(args.hub_repo_id, config_name=args.config_name, private=True)
    print(f"Dataset available at: {url}")


if __name__ == "__main__":
    main()

+ 0
- 41
setup.cfg View File

@@ -1,41 +0,0 @@
[isort]
default_section = FIRSTPARTY
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = open_r1
known_third_party =
    transformers
    datasets
    fugashi
    git
    h5py
    matplotlib
    nltk
    numpy
    packaging
    pandas
    psutil
    pytest
    rouge_score
    sacrebleu
    seqeval
    sklearn
    streamlit
    torch
    tqdm

line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, E741, W503, W605
max-line-length = 119
per-file-ignores =
    # imported but unused
    __init__.py: F401

[tool:pytest]
doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS

+ 0
- 145
setup.py View File

@@ -1,145 +0,0 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Adapted from huggingface/transformers: https://github.com/huggingface/transformers/blob/21a2d900eceeded7be9edc445b56877b95eda4ca/setup.py


import re
import shutil
from pathlib import Path

from setuptools import find_packages, setup


# Remove stale open_r1.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
stale_egg_info = Path(__file__).parent / "open_r1.egg-info"
if stale_egg_info.exists():
    print(
        (
            "Warning: {} exists.\n\n"
            "If you recently updated open_r1, this is expected,\n"
            "but it may prevent open_r1 from installing in editable mode.\n\n"
            "This directory is automatically generated by Python's packaging tools.\n"
            "I will remove it now.\n\n"
            "See https://github.com/pypa/pip/issues/5466 for details.\n"
        ).format(stale_egg_info)
    )
    shutil.rmtree(stale_egg_info)


# IMPORTANT: all dependencies should be listed here with their version requirements, if any.
# * If a dependency is fast-moving (e.g. trl), pin to the exact version
_deps = [
"accelerate==1.4.0",
"bitsandbytes>=0.43.0",
"datasets>=3.2.0",
"deepspeed==0.15.4",
"distilabel[vllm,ray,openai]>=1.5.2",
"e2b-code-interpreter>=1.0.5",
"einops>=0.8.0",
"flake8>=6.0.0",
"hf_transfer>=0.1.4",
"huggingface-hub[cli]>=0.19.2,<1.0",
"isort>=5.12.0",
"langdetect", # Needed for LightEval's extended tasks
"latex2sympy2_extended>=1.0.6",
"liger_kernel==0.5.3",
"lighteval @ git+https://github.com/huggingface/lighteval.git@ed084813e0bd12d82a06d9f913291fdbee774905",
"math-verify==0.5.2", # Used for math verification in grpo
"packaging>=23.0",
"parameterized>=0.9.0",
"peft>=0.14.0",
"pytest",
"python-dotenv",
"ruff>=0.9.0",
"safetensors>=0.3.3",
"sentencepiece>=0.1.99",
"torch==2.5.1",
"transformers==4.50.0",
"trl==0.16.0",
"vllm==0.7.2",
"wandb>=0.19.1",
]

# this is a lookup table with items like:
#
# tokenizers: "tokenizers==0.9.4"
# packaging: "packaging"
#
# some of the values are versioned whereas others aren't.
deps = {b: a for a, b in (re.findall(r"^(([^!=<>~ \[\]]+)(?:\[[^\]]+\])?(?:[!=<>~ ].*)?$)", x)[0] for x in _deps)}


def deps_list(*pkgs):
    return [deps[pkg] for pkg in pkgs]


extras = {}
extras["tests"] = deps_list("pytest", "parameterized", "math-verify")
extras["torch"] = deps_list("torch")
extras["quality"] = deps_list("ruff", "isort", "flake8")
extras["code"] = deps_list("e2b-code-interpreter", "python-dotenv")
extras["eval"] = deps_list("lighteval", "math-verify")
extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"]

# core dependencies shared across the whole project - keep this to a bare minimum :)
install_requires = [
deps["accelerate"],
deps["bitsandbytes"],
deps["einops"],
deps["datasets"],
deps["deepspeed"],
deps["hf_transfer"],
deps["huggingface-hub"],
deps["langdetect"],
deps["latex2sympy2_extended"],
deps["math-verify"],
deps["liger_kernel"],
deps["packaging"], # utilities from PyPA to e.g., compare versions
deps["safetensors"],
deps["sentencepiece"],
deps["transformers"],
deps["trl"],
deps["wandb"],
]

setup(
name="open-r1",
version="0.1.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future)",
author_email="lewis@huggingface.co",
description="Open R1",
long_description=open("README.md", "r", encoding="utf-8").read(),
long_description_content_type="text/markdown",
keywords="llm inference-time compute reasoning",
license="Apache",
url="https://github.com/huggingface/open-r1",
package_dir={"": "src"},
packages=find_packages("src"),
zip_safe=False,
extras_require=extras,
python_requires=">=3.10.9",
install_requires=install_requires,
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
)
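For reference, the `deps` lookup table built above maps a bare package name to its full requirement string (extras and pins included). The snippet below is a small standalone check of that regex on a subset of the entries; it is an illustration only and not part of setup.py.

```python
import re

# A few entries copied from the _deps list above.
_deps = ["accelerate==1.4.0", "huggingface-hub[cli]>=0.19.2,<1.0", "pytest"]
pattern = r"^(([^!=<>~ \[\]]+)(?:\[[^\]]+\])?(?:[!=<>~ ].*)?$)"
# Group 1 is the full spec, group 2 the bare name; the dict maps name -> spec.
deps = {name: spec for spec, name in (re.findall(pattern, x)[0] for x in _deps)}

assert deps["accelerate"] == "accelerate==1.4.0"
assert deps["huggingface-hub"] == "huggingface-hub[cli]>=0.19.2,<1.0"
assert deps["pytest"] == "pytest"
```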

sh_dir/Learner_4gpus_nRMs_LogNorm_benchmark.sh → sh_dir/HeteroRL_Learner_4gpus.sh View File

@@ -8,13 +8,12 @@ cfg=$4
loss_type=$5
wandb_name=$6
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/learner/${loss_type}/$1_$2_SyncF$3_cfg${cfg}_${formatted_time}.log
log_path=/userhome/Research_HUB/HeteroRL/open-r1/log_dir/learner/${loss_type}/$1_$2_SyncF$3_cfg${cfg}_${formatted_time}.log
mkdir -p "$(dirname "$log_path")"
checkpoint=/extrahome0/save_dir/4gpus/Learner_EqQ_2th_cfgv6b/Qwen3-1.7B/checkpoint-64
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/learner/${loss_type}
export WANDB_DIR=/userhome/Research_HUB/HeteroRL/open-r1/wandb/learner/${loss_type}
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export PYTHONPATH=/userhome/Research_HUB/HeteroRL/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
@@ -39,7 +38,7 @@ accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--max_steps 1295 \
--save_strategy "steps" --save_steps 64000 --save_total_limit 32 \
--eval_strategy 'steps' --eval_steps 64 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--wandb_entity "xxxx" --wandb_project "HeteroRL" --report_to "wandb" \
--log_completions True --logging_steps 1 \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_${cfg}.yaml \
--vllm_gpu_memory_utilization 0.25 \

sh_dir/Sampler_4gpus_single_benchmark.sh → sh_dir/HeteroRL_Sampler_4gpus.sh View File

@@ -8,13 +8,13 @@ loss_type=$4
wandb_name=$5
sampler_id=$6
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/sampler/${loss_type}/$1_sampler${sampler_id}_$2_cfg$3_${formatted_time}.log
log_path=/userhome/Research_HUB/HeteroRL/open-r1/log_dir/sampler/${loss_type}/$1_sampler${sampler_id}_$2_cfg$3_${formatted_time}.log
mkdir -p "$(dirname "$log_path")"
echo $log_path
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/sampler${sampler_id}
export WANDB_DIR=/userhome/Research_HUB/HeteroRL/open-r1/wandb/sampler${sampler_id}
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export PYTHONPATH=/userhome/Research_HUB/HeteroRL/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
@@ -37,14 +37,14 @@ elif [[ $sampler_id -eq 1 ]]; then
export CUDA_VISIBLE_DEVICES="4,5,6,7"
export MASTER_PORT=29522
vllm_gpu_memory_utilization=0.6
#elif [[ $sampler_id -eq 2 ]]; then
# export CUDA_VISIBLE_DEVICES="4,5,6,7"
# export MASTER_PORT=29523
# vllm_gpu_memory_utilization=0.3
#elif [[ $sampler_id -eq 3 ]]; then
# export CUDA_VISIBLE_DEVICES="4,5,6,7"
# export MASTER_PORT=29524
# vllm_gpu_memory_utilization=0.6
elif [[ $sampler_id -eq 2 ]]; then
export CUDA_VISIBLE_DEVICES="4,5,6,7"
export MASTER_PORT=29523
vllm_gpu_memory_utilization=0.3
elif [[ $sampler_id -eq 3 ]]; then
export CUDA_VISIBLE_DEVICES="4,5,6,7"
export MASTER_PORT=29524
vllm_gpu_memory_utilization=0.6
fi

#rm $SYNC_WEIGHTS_PATH
@@ -61,9 +61,9 @@ accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--num_generations 8 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--wandb_entity "xxx" --wandb_project "HeteroRL" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--num_samplers 2 --sampler_id $sampler_id \
--num_samplers 4 --sampler_id $sampler_id \
--wandb_name $wandb_name \
--loss_type $loss_type \
--vllm_gpu_memory_utilization $vllm_gpu_memory_utilization > $log_path 2>&1

+ 0
- 60
sh_dir/Learner_4gpus_nRMs_LogNorm_benchmark_checkpoint.sh View File

@@ -1,60 +0,0 @@
export TZ='Asia/Shanghai'
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
export SYNC_SAMPLER_STEPS=$3
cfg=$4
loss_type=$5
wandb_name=$6
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/learner/${loss_type}/$1_$2_SyncF$3_cfg${cfg}_${formatted_time}.log
mkdir -p "$(dirname "$log_path")"
# checkpoint=/extrahome0/save_dir/4gpus/Learner_EqQ_2th_cfgv6b/Qwen3-1.7B/checkpoint-64
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/learner/${loss_type}
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29510
export SAVEPATH=/extrahome0/save_dir/4gpus/Learner_${xth}_cfg${cfg}/Qwen3-1.7B
export FS_QUEUE_PATH=/extrahome0/save_dir/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/extrahome0/save_dir/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/async_checkpoint.pt
export QUEUE_TIMEOUT_SECONDS=3600

echo $log_path
# export CUDA_VISIBLE_DEVICES=1,2,3,4
rm -r $FS_QUEUE_PATH
rm $SYNC_WEIGHTS_PATH
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--max_prompt_length 768 \
--scale_rewards False \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--max_steps 1295 \
--save_strategy "steps" --save_steps 64 --save_total_limit 1 \
--eval_strategy 'steps' --eval_steps 64 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--log_completions True --logging_steps 1 \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_${cfg}.yaml \
--vllm_gpu_memory_utilization 0.25 \
--max_completion_length 2048 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--num_generations 8 \
--wandb_name $wandb_name \
--ais_beta 0.5 \
--use_benchmark \
--loss_type $loss_type \
--resume_from_checkpoint False \
--use_think False \
--eval_on_start False > $log_path 2>&1 &


# --resume_from_checkpoint $checkpoint \

sh_dir/Online_gXpo_4gpus_benchmark_bs512.sh → sh_dir/Online_gXpo_4gpus.sh View File

@@ -1,26 +1,24 @@
export TZ='Asia/Shanghai'
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
loss_type=$1
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/online/${loss_type}/${formatted_time}.log
log_path=/userhome/Research_HUB/HeteroRL/open-r1/log_dir/online/${loss_type}/${formatted_time}.log
mkdir -p "$(dirname "$log_path")"
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/online/${loss_type}
export WANDB_DIR=/userhome/Research_HUB/HeteroRL/open-r1/wandb/online/${loss_type}
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export PYTHONPATH=/userhome/Research_HUB/HeteroRL/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
# export MASTER_PORT=29506
export SAVEPATH="/extrahome0/save_dir/GPG/4gpus/${loss_type}/${formatted_time}/Qwen3-1.7B"

export SAVEPATH="/extrahome0/save_dir/4gpus/${loss_type}/${formatted_time}/Qwen3-1.7B"
echo $log_path
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/online_rl.py --config recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1_vllm.yaml --output_dir $SAVEPATH \
--save_total_limit 1 --num_train_epochs 10 --gradient_accumulation_steps 16 --max_completion_length 2048 --max_prompt_length 768 \
--save_total_limit 1 --num_train_epochs 5 --gradient_accumulation_steps 8 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" --dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--save_strategy "steps" --save_steps 64 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--save_strategy "steps" --save_steps 64 --log_completions False \
--wandb_entity "xxx" --wandb_project "HeteroRL" --report_to "wandb" \
--per_device_eval_batch_size 16 --per_device_train_batch_size 8 --eval_strategy "steps" --eval_steps 64 --eval_on_start True --use_benchmark \
--logging_steps 1 --use_vllm True --loss_type $loss_type > $log_path 2>&1 &

+ 0
- 44
sh_dir/Online_gXpo_4gpus_benchmark.sh View File

@@ -1,44 +0,0 @@
export TZ='Asia/Shanghai'
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
loss_type=$1
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/online/${loss_type}/${formatted_time}.log
mkdir -p "$(dirname "$log_path")"
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/online/${loss_type}
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
# export MASTER_PORT=29506
export SAVEPATH="/extrahome0/save_dir/GPG/4gpus/${loss_type}/${formatted_time}/Qwen3-1.7B"
# export CUDA_VISIBLE_DEVICES=4,5,6,7
# if [[ $loss_type == "gspo" ]]; then
# export CUDA_VISIBLE_DEVICES="0,1,2,3"
# export MASTER_PORT=29508
# elif [[ $loss_type == "EqQ" ]]; then
# export CUDA_VISIBLE_DEVICES="4,5,6,7"
# export MASTER_PORT=29507
# elif [[ $loss_type == "grpo" ]]; then
# export CUDA_VISIBLE_DEVICES="0,1,2,3"
# export MASTER_PORT=29506
# fi

# if [[ $loss_type == "dr_grpo" ]]; then
# export CUDA_VISIBLE_DEVICES="0,1,2,3"
# export MASTER_PORT=29508
# elif [[ $loss_type == "bnpo" ]]; then
# export CUDA_VISIBLE_DEVICES="4,5,6,7"
# export MASTER_PORT=29507
# fi
echo $log_path
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/online_rl.py --config recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1_vllm.yaml --output_dir $SAVEPATH \
--save_total_limit 1 --num_train_epochs 5 --gradient_accumulation_steps 8 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" --dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--save_strategy "steps" --save_steps 64 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--per_device_eval_batch_size 16 --per_device_train_batch_size 8 --eval_strategy "steps" --eval_steps 64 --eval_on_start True --use_benchmark \
--logging_steps 1 --use_vllm True --loss_type $loss_type > $log_path 2>&1 &

+ 0
- 27
sh_dir/README.md View File

@@ -1,27 +0,0 @@
A heterogeneous RL algorithm built on [GPG](https://github.com/AMAP-ML/GPG)/[trl](https://github.com/huggingface/trl)/[openR1](https://github.com/huggingface/open-r1).

# Asynchronous Reinforcement Learning

## Enter the project directory (if your directory differs, replace the corresponding path variables in the scripts).


## Launch the learner first (4 × 80GB NVIDIA A100 GPUs by default)
```shell
cd /userhome/Research_HUB/GPG/open-r1
CUDA_VISIBLE_DEVICES=0,1,2,3 bash sh_dir/Learner_4gpus_nRMs_LogNorm_benchmark_checkpoint.sh learner_script_checkpoint GEPO_nothink_1th 1 v6b gepo 1L2S_GEPO_diff32_nothink
```
## Sampler: launch samplers one by one in sequence
### To resume from a checkpoint, set model_name_or_path to the checkpoint path
```shell
bash sh_dir/Sampler_4gpus_single_benchmark_checkpoint.sh sampler_script_checkpoint GEPO_nothink_1th v6b gepo 1L2S_GEPO_diff32_nothink 0 &
bash sh_dir/Sampler_4gpus_single_benchmark_checkpoint.sh sampler_script_checkpoint GEPO_nothink_1th v6b gepo 1L2S_GEPO_diff32_nothink 1 &
```


Online (on-policy) training (4 × 80GB NVIDIA A100 GPUs by default):

# Currently supported loss types: grpo / bnpo / dr_grpo / gepo / gspo.
```shell
cd /userhome/Research_HUB/GPG/open-r1
CUDA_VISIBLE_DEVICES="0,1,2,3" MASTER_PORT=29510 bash sh_dir/Online_gXpo_4gpus_benchmark.sh gepo
```
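The learner and sampler scripts referenced above communicate only through the file system (FS_QUEUE_PATH for rollouts, SYNC_WEIGHTS_PATH for weights). The sketch below is a guess at that hand-off, included only to make the launch order easier to follow; all function names are hypothetical and this is not the repository's actual implementation.

```python
# Hypothetical sketch of the file-system hand-off implied by FS_QUEUE_PATH and
# SYNC_WEIGHTS_PATH; NOT the repository's actual code.
import os
import time
from pathlib import Path

import torch

QUEUE_DIR = Path(os.environ.get("FS_QUEUE_PATH", "/tmp/rollout_queue"))
WEIGHTS_FILE = Path(os.environ.get("SYNC_WEIGHTS_PATH", "/tmp/async_checkpoint.pt"))


def sampler_push(rollout: dict, step: int) -> None:
    """Sampler side: write one rollout batch atomically (temp file + rename)."""
    QUEUE_DIR.mkdir(parents=True, exist_ok=True)
    tmp = QUEUE_DIR / f".tmp_{step}.pt"
    torch.save(rollout, tmp)
    tmp.rename(QUEUE_DIR / f"rollout_{step:08d}.pt")


def learner_pop(timeout_s: float = 3600.0) -> dict:
    """Learner side: block until the oldest rollout file appears, then consume it."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        files = sorted(QUEUE_DIR.glob("rollout_*.pt"))
        if files:
            batch = torch.load(files[0])
            files[0].unlink()
            return batch
        time.sleep(1.0)
    raise TimeoutError("no rollouts arrived within the timeout")


def learner_publish_weights(state_dict: dict) -> None:
    """Learner side: publish updated weights for the samplers to reload."""
    WEIGHTS_FILE.parent.mkdir(parents=True, exist_ok=True)
    torch.save(state_dict, WEIGHTS_FILE)
```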

+ 0
- 71
sh_dir/Sampler_4gpus_single_benchmark_checkpoint.sh View File

@@ -1,71 +0,0 @@
export TZ='Asia/Shanghai'
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
cfg=$3
loss_type=$4
wandb_name=$5
sampler_id=$6
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/sampler/${loss_type}/$1_sampler${sampler_id}_$2_cfg$3_${formatted_time}.log
mkdir -p "$(dirname "$log_path")"
echo $log_path
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/sampler${sampler_id}
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export SAVEPATH=/extrahome0/save_dir/4gpus/Sampler_${xth}_cfg${cfg}/sampler${sampler_id}/Qwen3-1.7B
export FS_QUEUE_PATH=/extrahome0/save_dir/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/extrahome0/save_dir/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/async_checkpoint.pt
export SYNC_SAMPLER_STEPS=1

if ! [[ "$sampler_id" =~ ^[0-3]$ ]]; then
echo "Error: sampler_id must be 0, 1, 2 or 3"
exit 1
fi

if [[ $sampler_id -eq 0 ]]; then
export CUDA_VISIBLE_DEVICES="4,5,6,7"
export MASTER_PORT=29521
vllm_gpu_memory_utilization=0.3
elif [[ $sampler_id -eq 1 ]]; then
export CUDA_VISIBLE_DEVICES="4,5,6,7"
export MASTER_PORT=29522
vllm_gpu_memory_utilization=0.6
#elif [[ $sampler_id -eq 2 ]]; then
# export CUDA_VISIBLE_DEVICES="4,5,6,7"
# export MASTER_PORT=29523
# vllm_gpu_memory_utilization=0.3
#elif [[ $sampler_id -eq 3 ]]; then
# export CUDA_VISIBLE_DEVICES="4,5,6,7"
# export MASTER_PORT=29524
# vllm_gpu_memory_utilization=0.6
fi

#rm $SYNC_WEIGHTS_PATH
#echo "rm$SYNC_WEIGHTS_PATH"
accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--save_strategy "steps" --save_steps 100000 --save_total_limit 5 \
--num_train_epochs 3 --gradient_accumulation_steps 8 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --eval_strategy 'no' \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--log_completions True --logging_steps 32 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--num_generations 8 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--num_samplers 2 --sampler_id $sampler_id \
--wandb_name $wandb_name \
--loss_type $loss_type \
--resume_from_checkpoint False \
--use_think False \
--vllm_gpu_memory_utilization $vllm_gpu_memory_utilization > $log_path 2>&1

+ 0
- 55
sh_dir/debug/MoIS_Learner_4gpus_nRMs_debug.sh View File

@@ -1,55 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
export SYNC_SAMPLER_STEPS=$3
cfg=$4
wandb_name=$5
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/learner/$1_$2_SyncF$3_cfg${cfg}_${formatted_time}.log


export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/learner
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29510
export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Learner_${xth}_cfg${cfg}/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt
export QUEUE_TIMEOUT_SECONDS=3600

echo $log_path
export CUDA_VISIBLE_DEVICES=0,1,2,3
# rm -r $FS_QUEUE_PATH
# rm $SYNC_WEIGHTS_PATH
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--max_prompt_length 768 \
--scale_rewards False \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--max_steps 1295 \
--save_strategy "steps" --save_steps 3200 --save_total_limit 5 \
--eval_strategy 'steps' --eval_steps 64 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--log_completions True --logging_steps 1 \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_${cfg}.yaml \
--vllm_gpu_memory_utilization 0.25 \
--max_completion_length 2048 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--num_generations 8 \
--wandb_name $wandb_name \
--ais_beta 0.5 \
--cppo_beta 0.1 \
--max_diff_step 4 \
--eval_on_start False



+ 0
- 62
sh_dir/debug/MoIS_Sampler_4gpus_debug.sh View File

@@ -1,62 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
cfg=$3
sampler_id=$4
########################## parameters ##########################
# log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/sampler/$1_sampler${sampler_id}_$2_cfg$3_${formatted_time}.log
# echo $log_path
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/sampler/debug
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Sampler_${xth}_cfg${cfg}/sampler${sampler_id}/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt
export SYNC_SAMPLER_STEPS=1

if ! [[ "$sampler_id" =~ ^[0-3]$ ]]; then
echo "Error: sampler_id must be 0, 1, 2 or 3"
exit 1
fi

if [[ $sampler_id -eq 0 ]]; then
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export MASTER_PORT=29521
vllm_gpu_memory_utilization=0.45
elif [[ $sampler_id -eq 1 ]]; then
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export MASTER_PORT=29522
vllm_gpu_memory_utilization=0.90
elif [[ $sampler_id -eq 2 ]]; then
export CUDA_VISIBLE_DEVICES="4,5,6,7"
export MASTER_PORT=29523
vllm_gpu_memory_utilization=0.45
elif [[ $sampler_id -eq 3 ]]; then
export CUDA_VISIBLE_DEVICES="4,5,6,7"
export MASTER_PORT=29524
vllm_gpu_memory_utilization=0.90
fi
#rm $SYNC_WEIGHTS_PATH
#echo "rm$SYNC_WEIGHTS_PATH"
accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--save_strategy "steps" --save_steps 100000 --save_total_limit 5 \
--num_train_epochs 3 --gradient_accumulation_steps 8 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --eval_strategy 'no' \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--log_completions True --logging_steps 32 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--num_generations 8 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--num_samplers 1 --sampler_id $sampler_id \
--vllm_gpu_memory_utilization $vllm_gpu_memory_utilization

+ 0
- 24
sh_dir/debug/train_grpo_4gpus_debug.sh View File

@@ -1,24 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/grpo/${formatted_time}.log

export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/grpo
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29506
export SAVEPATH="/extrahome0/save_dir/GPG/4gpus/GRPO/Qwen3-1.7B"
export CUDA_VISIBLE_DEVICES=0,1,2,3
echo $log_path
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/grpo.py --config recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1_vllm.yaml --output_dir $SAVEPATH \
--save_total_limit 5 --num_train_epochs 5 --gradient_accumulation_steps 8 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" --dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--save_strategy "steps" --save_steps 32000 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--per_device_eval_batch_size 16 --per_device_train_batch_size 8 --eval_strategy "steps" --eval_steps 32 --eval_on_start True \
--logging_steps 1 --use_vllm True > $log_path 2>&1

BIN
sh_dir/old/.MoIS_Learner_4gpus_nRMs.sh.swp View File


+ 0
- 49
sh_dir/old/Debug_Learner_MoIS_4gpus_nRMs.sh View File

@@ -1,49 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
export SYNC_SAMPLER_STEPS=$3
cfg=$4
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/learner/$1_$2_SyncF$3_cfg$4_${formatted_time}.log


export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/learner
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29510
export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Learner_${xth}/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}/tmp/Qwen3-1.7B/gpg_async_weights.pt
export QUEUE_TIMEOUT_SECONDS=3600

echo $log_path
export CUDA_VISIBLE_DEVICES=0,1,2,3
rm -r $FS_QUEUE_PATH
rm $SYNC_WEIGHTS_PATH
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--max_prompt_length 768 \
--scale_rewards False \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-0.6B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--max_steps 1295 \
--save_strategy "steps" --save_steps 3200 --save_total_limit 5 \
--eval_strategy 'steps' --eval_steps 32 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--log_completions True --logging_steps 1 \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$4.yaml \
--vllm_gpu_memory_utilization 0.25 \
--max_completion_length 2048 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 2 \
--num_generations 2 \
--eval_on_start False \
--seed 2025 > $log_path 2>&1



+ 0
- 27
sh_dir/old/Debug_Learner_v2_4gpus.sh View File

@@ -1,27 +0,0 @@
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/debug
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29510
export SAVEPATH="/extrahome0/save_dir/GPG/4gpus/LearnerV2_debug/Qwen3-1.7B"
export FS_QUEUE_PATH="/extrahome0/save_dir/GPG/4gpus/AsyncV2_debug/Rollout/Qwen3-1.7B"
export SYNC_WEIGHTS_PATH="/extrahome0/save_dir/GPG/4gpus/AsyncV2_debug/tmp_3th/Qwen3-1.7B/gpg_async_weights.pt"
export SYNC_SAMPLER_STEPS=1
#export CUDA_VISIBLE_DEVICES=4,5,6,7


accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/learner_script_v2.py --config recipes/Qwen2.5-Math-7B/grpo/config_simple_rl_math_l35_v1.yaml --output_dir $SAVEPATH \
--num_train_epochs 5 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --adjust_gd --min_inverse_alpha 0.5 \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" --dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--max_steps 1295 --per_device_train_batch_size 8 --gradient_accumulation_steps 8 \
--save_strategy "steps" --save_steps 32 --save_total_limit 5 \
--eval_strategy 'steps' --eval_steps 32 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
# --vllm_gpu_memory_utilization 1.0 \
# --eval_on_start True --log_completions True --logging_steps 1 \

+ 0
- 38
sh_dir/old/Debug_Sampler_4gpus_Part1.sh View File

@@ -1,38 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
cfg=$3
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/sampler/$1_part1_$2_cfg$3_${formatted_time}.log
echo $log_path
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/sampler
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29521

export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Sampler_${xth}/part1/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}/tmp/Qwen3-1.7B/gpg_async_weights.pt

export SYNC_SAMPLER_STEPS=1
export CUDA_VISIBLE_DEVICES=0,1,2,3
#rm $SYNC_WEIGHTS_PATH
#echo "rm$SYNC_WEIGHTS_PATH"
accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--save_strategy "steps" --save_steps 100000 --save_total_limit 5 \
--num_train_epochs 3 --gradient_accumulation_steps 2 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --eval_strategy 'no' \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-0.6B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--per_device_train_batch_size 2 --log_completions True \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--seed 41 --num_generations 2 \
--vllm_gpu_memory_utilization 0.5 > $log_path 2>&1

+ 0
- 2
sh_dir/old/Kill_Learner.sh View File

@@ -1,2 +0,0 @@
sleep 9600s
ps -ef | grep learner | grep -v grep | awk '{print $2}' | xargs kill -9

+ 0
- 2
sh_dir/old/Kill_Sampler.sh View File

@@ -1,2 +0,0 @@
sleep 9600s
ps -ef | grep sampler | grep -v grep | awk '{print $2}' | xargs kill -9

+ 0
- 2
sh_dir/old/Kill_debug.sh View File

@@ -1,2 +0,0 @@
sleep 30s
echo "debug"

+ 0
- 56
sh_dir/old/MoIS_Learner_4gpus_nRMs.sh View File

@@ -1,56 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
export SYNC_SAMPLER_STEPS=$3
cfg=$4
wandb_name=$5
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/learner/$1_$2_SyncF$3_cfg${cfg}_${formatted_time}.log


export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/learner
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29510
export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Learner_${xth}_cfg${cfg}/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt
export QUEUE_TIMEOUT_SECONDS=3600

echo $log_path
export CUDA_VISIBLE_DEVICES=0,1,2,3
rm -r $FS_QUEUE_PATH
rm $SYNC_WEIGHTS_PATH
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--max_prompt_length 768 \
--scale_rewards False \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--max_steps 1295 \
--save_strategy "steps" --save_steps 3200 --save_total_limit 5 \
--eval_strategy 'steps' --eval_steps 64 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--log_completions True --logging_steps 1 \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_${cfg}.yaml \
--vllm_gpu_memory_utilization 0.25 \
--max_completion_length 2048 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--num_generations 8 \
--wandb_name $wandb_name \
--ais_beta 0.5 \
--cppo_beta 0.00 \
--max_diff_step 4 \
--loss_type "ais_bnpo" \
--eval_on_start False > $log_path 2>&1



+ 0
- 54
sh_dir/old/MoIS_Learner_4gpus_nRMs_LogNorm.sh View File

@@ -1,54 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
export SYNC_SAMPLER_STEPS=$3
cfg=$4
wandb_name=$5
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/learner/$1_$2_SyncF$3_cfg${cfg}_${formatted_time}.log


export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/learner
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29510
export SAVEPATH=/extrahome0/save_dir/AsyncGRPO/4gpus/Learner_${xth}_cfg${cfg}/Qwen3-1.7B
export FS_QUEUE_PATH=/extrahome0/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/extrahome0/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt
export QUEUE_TIMEOUT_SECONDS=3600

echo $log_path
export CUDA_VISIBLE_DEVICES=0,1,2,3
rm -r $FS_QUEUE_PATH
rm $SYNC_WEIGHTS_PATH
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--max_prompt_length 768 \
--scale_rewards False \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--max_steps 1295 \
--save_strategy "steps" --save_steps 3200 --save_total_limit 5 \
--eval_strategy 'steps' --eval_steps 64 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--log_completions True --logging_steps 1 \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_${cfg}.yaml \
--vllm_gpu_memory_utilization 0.25 \
--max_completion_length 2048 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--num_generations 8 \
--wandb_name $wandb_name \
--ais_beta 0.5 \
--loss_type "ais_gspo" \
--eval_on_start False > $log_path 2>&1



+ 0
- 59
sh_dir/old/MoIS_Learner_4gpus_nRMs_LogNorm_benchmark_checkpoint_bak.sh View File

@@ -1,59 +0,0 @@
export TZ='Asia/Shanghai'
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
export SYNC_SAMPLER_STEPS=$3
cfg=$4
loss_type=$5
wandb_name=$6
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/learner/${loss_type}/$1_$2_SyncF$3_cfg${cfg}_${formatted_time}.log
mkdir -p "$(dirname "$log_path")"
# checkpoint=/extrahome0/save_dir/4gpus/Learner_EqQ_2th_cfgv6b/Qwen3-1.7B/checkpoint-64
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/learner/${loss_type}
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29510
export SAVEPATH=/extrahome0/save_dir/4gpus/Learner_${xth}_cfg${cfg}/Qwen3-1.7B
export FS_QUEUE_PATH=/extrahome0/save_dir/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/extrahome0/save_dir/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/async_checkpoint.pt
export QUEUE_TIMEOUT_SECONDS=3600

echo $log_path
export CUDA_VISIBLE_DEVICES=1,2,3,4
rm -r $FS_QUEUE_PATH
rm $SYNC_WEIGHTS_PATH
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--max_prompt_length 768 \
--scale_rewards False \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-0.6B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--max_steps 1295 \
--save_strategy "steps" --save_steps 2 --save_total_limit 1 \
--eval_strategy 'steps' --eval_steps 64 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--log_completions True --logging_steps 1 \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_${cfg}.yaml \
--vllm_gpu_memory_utilization 0.15 \
--max_completion_length 512 \
--per_device_train_batch_size 2 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--num_generations 8 \
--wandb_name $wandb_name \
--ais_beta 0.5 \
--use_benchmark \
--loss_type $loss_type \
--resume_from_checkpoint True \
--eval_on_start False > $log_path 2>&1 &


# --resume_from_checkpoint $checkpoint \

+ 0
- 48
sh_dir/old/MoIS_Learner_4gpus_nRMs_debug.sh View File

@@ -1,48 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
export SYNC_SAMPLER_STEPS=$3
cfg=$4
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/learner/$1_$2_SyncF$3_cfg${cfg}_${formatted_time}.log


export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/learner
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29510
export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Learner_${xth}_cfg${cfg}/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt
export QUEUE_TIMEOUT_SECONDS=3600
export CUDA_VISIBLE_DEVICES=0,1,2,3
# rm -r $FS_QUEUE_PATH
# rm $SYNC_WEIGHTS_PATH
accelerate launch --config_file recipes/accelerate_configs/zero2_4A100s.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--max_prompt_length 768 \
--scale_rewards False \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--max_steps 1295 \
--save_strategy "steps" --save_steps 3200 --save_total_limit 5 \
--eval_strategy 'steps' --eval_steps 32 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--log_completions True --logging_steps 1 \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_${cfg}.yaml \
--vllm_gpu_memory_utilization 0.25 \
--max_completion_length 2048 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 8 \
--num_generations 8 \
--eval_on_start False \
--seed 2025



+ 0
- 41
sh_dir/old/MoIS_SamplerV2_4gpus_Part1.sh View File

@@ -1,41 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
cfg=$3
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/sampler/$1_part1_$2_cfg$3_${formatted_time}.log
echo $log_path
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/sampler
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29521

export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Sampler_${xth}_cfg${cfg}/part1/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt

export SYNC_SAMPLER_STEPS=1
export CUDA_VISIBLE_DEVICES=0,1,2,3
#rm $SYNC_WEIGHTS_PATH
#echo "rm$SYNC_WEIGHTS_PATH"
accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--save_strategy "steps" --save_steps 100000 --save_total_limit 5 \
--num_train_epochs 3 --gradient_accumulation_steps 1 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --eval_strategy 'no' \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--log_completions False --logging_steps 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--num_generations 8 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--sampler_id 0 \
--vllm_gpu_memory_utilization 0.45 > $log_path 2>&1

+ 0
- 41
sh_dir/old/MoIS_SamplerV2_4gpus_Part2.sh View File

@@ -1,41 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
cfg=$3
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/sampler/$1_part1_$2_cfg$3_${formatted_time}.log
echo $log_path
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/sampler
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29522

export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Sampler_${xth}_cfg${cfg}/part1/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt

export SYNC_SAMPLER_STEPS=1
export CUDA_VISIBLE_DEVICES=0,1,2,3
#rm $SYNC_WEIGHTS_PATH
#echo "rm$SYNC_WEIGHTS_PATH"
accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--save_strategy "steps" --save_steps 100000 --save_total_limit 5 \
--num_train_epochs 3 --gradient_accumulation_steps 1 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --eval_strategy 'no' \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--log_completions False --logging_steps 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--num_generations 8 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--sampler_id 1 \
--vllm_gpu_memory_utilization 0.90 > $log_path 2>&1

+ 0
- 41
sh_dir/old/MoIS_SamplerV2_4gpus_Part3.sh View File

@@ -1,41 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
cfg=$3
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/sampler/$1_part1_$2_cfg$3_${formatted_time}.log
echo $log_path
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/sampler
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29523

export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Sampler_${xth}_cfg${cfg}/part1/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt

export SYNC_SAMPLER_STEPS=1
export CUDA_VISIBLE_DEVICES=4,5,6,7
#rm $SYNC_WEIGHTS_PATH
#echo "rm$SYNC_WEIGHTS_PATH"
accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--save_strategy "steps" --save_steps 100000 --save_total_limit 5 \
--num_train_epochs 3 --gradient_accumulation_steps 1 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --eval_strategy 'no' \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--log_completions False --logging_steps 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--num_generations 8 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--sampler_id 2 \
--vllm_gpu_memory_utilization 0.45 > $log_path 2>&1

+ 0
- 41
sh_dir/old/MoIS_SamplerV2_4gpus_Part4.sh View File

@@ -1,41 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
cfg=$3
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/sampler/$1_part1_$2_cfg$3_${formatted_time}.log
echo $log_path
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/sampler
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29524

export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Sampler_${xth}_cfg${cfg}/part1/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt

export SYNC_SAMPLER_STEPS=1
export CUDA_VISIBLE_DEVICES=4,5,6,7
#rm $SYNC_WEIGHTS_PATH
#echo "rm$SYNC_WEIGHTS_PATH"
accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--save_strategy "steps" --save_steps 100000 --save_total_limit 5 \
--num_train_epochs 3 --gradient_accumulation_steps 1 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --eval_strategy 'no' \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--log_completions False --logging_steps 1 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--num_generations 8 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--sampler_id 3 \
--vllm_gpu_memory_utilization 0.90 > $log_path 2>&1

+ 0
- 66
sh_dir/old/MoIS_Sampler_4gpus.sh View File

@@ -1,66 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
cfg=$3
wandb_name=$4
########################## parameters ##########################
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/sampler
export USE_FLASH_ATTN=true
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt
export SYNC_SAMPLER_STEPS=1

for sampler_id in 0 2 1 3; do
if ! [[ "$sampler_id" =~ ^[0-3]$ ]]; then
echo "Error: sampler_id must be 0, 1, 2 or 3"
exit 1
fi

if [[ $sampler_id -eq 0 ]]; then
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export MASTER_PORT=29521
vllm_gpu_memory_utilization=0.45
elif [[ $sampler_id -eq 1 ]]; then
export CUDA_VISIBLE_DEVICES="0,1,2,3"
export MASTER_PORT=29522
vllm_gpu_memory_utilization=0.9
elif [[ $sampler_id -eq 2 ]]; then
export CUDA_VISIBLE_DEVICES="4,5,6,7"
export MASTER_PORT=29523
vllm_gpu_memory_utilization=0.45
elif [[ $sampler_id -eq 3 ]]; then
export CUDA_VISIBLE_DEVICES="4,5,6,7"
export MASTER_PORT=29524
vllm_gpu_memory_utilization=0.9
fi
export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Sampler_${xth}_cfg${cfg}/sampler${sampler_id}/Qwen3-1.7B
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/sampler/$1_sampler${sampler_id}_$2_cfg$3_${formatted_time}.log
echo $log_path
#rm $SYNC_WEIGHTS_PATH
#echo "rm$SYNC_WEIGHTS_PATH"
accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--save_strategy "steps" --save_steps 100000 --save_total_limit 5 \
--num_train_epochs 3 --gradient_accumulation_steps 8 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --eval_strategy 'no' \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--log_completions True --logging_steps 32 \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--num_generations 8 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--num_samplers 4 --sampler_id $sampler_id \
--wandb_name $wandb_name \
--vllm_gpu_memory_utilization $vllm_gpu_memory_utilization > $log_path 2>&1 &
sleep 2
done

+ 0
- 40
sh_dir/old/MoIS_Sampler_4gpus_Part1.sh View File

@@ -1,40 +0,0 @@
formatted_time=$(date "+%Y%m%d-%H-%M-%S")
########################## parameters ##########################
scriptname=$1
xth=$2
cfg=$3
########################## parameters ##########################
log_path=/userhome/Research_HUB/GPG/open-r1/log_dir/AsyncGRPO/sampler/$1_part1_$2_cfg$3_${formatted_time}.log
echo $log_path
export WANDB_MODE=offline
export WANDB_DIR=/userhome/Research_HUB/GPG/open-r1/wandb/AsyncGRPO/sampler
export PYTHONPATH=/userhome/Research_HUB/GPG/open-r1/src
export WORLD_SIZE=1
export RANK=0
export GPUS=4
export MASTER_ADDR="localhost"
export MASTER_PORT=29521

export SAVEPATH=/userhome/save_dir/AsyncGRPO/4gpus/Sampler_${xth}_cfg${cfg}/part1/Qwen3-1.7B
export FS_QUEUE_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/Rollout/Qwen3-1.7B
export SYNC_WEIGHTS_PATH=/userhome/save_dir/AsyncGRPO/4gpus/Async_${xth}_cfg${cfg}/tmp/Qwen3-1.7B/gpg_async_weights.pt

export SYNC_SAMPLER_STEPS=1
export CUDA_VISIBLE_DEVICES=0,1,2,3
#rm $SYNC_WEIGHTS_PATH
#echo "rm$SYNC_WEIGHTS_PATH"
accelerate launch --config_file recipes/accelerate_configs/ddp_4gpus.yaml \
--num_machines $WORLD_SIZE --machine_rank $RANK --num_processes=$GPUS --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
src/open_r1/$scriptname.py --output_dir $SAVEPATH \
--save_strategy "steps" --save_steps 100000 --save_total_limit 5 \
--num_train_epochs 3 --gradient_accumulation_steps 1 --max_completion_length 2048 --max_prompt_length 768 \
--scale_rewards False --eval_strategy 'no' \
--model_name_or_path "/extrahome0/HF_models/Qwen/Qwen3-1.7B" \
--dataset_name "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5" \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--num_generations 8 \
--wandb_entity "pcl-zh" --wandb_project "GPG" --report_to "wandb" \
--config recipes/AsyncGRPO/config_simple_rl_math_l35_nRMs_$3.yaml \
--seed 41 \
--vllm_gpu_memory_utilization 0.45 > $log_path 2>&1

Some files were not shown because too many files changed in this diff
