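"""Sampler-side script for the asynchronous GRPO/GPG setup used in this repo.

It watches a learner-written weight file and copies each new checkpoint, then,
for every retrieved checkpoint, replays vLLM sampling on math prompts, scores
the completions with a math-verify accuracy reward, computes group-centered
advantages and per-token log-probabilities, and saves everything to
`log_probs_and_advantages.pt` for later analysis.
"""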
import argparse
import atexit
import datetime
import os
import re
import shutil
import time
from contextlib import nullcontext
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.distributed as dist
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from latex2sympy2_extended import NormalizationConfig
from math_verify import LatexExtractionConfig, parse, verify
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl.extras.profiling import profiling_context
from trl.trainer.utils import pad, selective_log_softmax
from vllm import LLM, SamplingParams

def retrieve_model_weight(sync_weights_path=None, target_directory="/extrahome0/retrieve_model_weight", max_num_model_weight=32):
    """Watch `sync_weights_path` and copy each newly written checkpoint into `target_directory`."""
    # if os.path.exists(sync_weights_path) and os.path.isdir(sync_weights_path):
    #     files = os.listdir(sync_weights_path)
    #     if len(files) == 0:
    #         print(f"Directory {sync_weights_path} exists but is empty.")
    #     else:
    #         print(f"Directory {sync_weights_path} exists and contains {len(files)} files:")
    #         print(files)
    # else:
    #     print(f"Directory {sync_weights_path} does not exist!")

    # Initial global_step
    current_global_step = 0

    # Weight file path, e.g.:
    # sync_weights_path = "/extrahome0/save_dir/AsyncGRPO/4gpus/Async_MoISv6i_1th_cfgv6b/tmp/Qwen3-1.7B/gpg_async_weights.pt"

    # Target directory, e.g.:
    # target_directory = "/extrahome0/retrieve_model_weight"

    # Make sure the target directory exists
    os.makedirs(target_directory, exist_ok=True)

    print(f"Start watching file: {sync_weights_path}")
    print(f"Target directory: {target_directory}")
    num_model_weight = 0
    try:
        while num_model_weight < max_num_model_weight:
            try:
                # Read the currently saved global_step
                global_step, _ = torch.load(sync_weights_path, map_location="cpu")

                # Check whether the step increased by exactly 1 since the last copy
                if global_step == current_global_step + 1 or current_global_step == 0:
                    target_path = os.path.join(target_directory, f"gpg_async_weights_{global_step}.pt")
                    shutil.copy(sync_weights_path, target_path)
                    print(f"✅ Step advanced by 1: {current_global_step} → {global_step}")
                    print(f"Copied weight file to: {target_path}")
                    num_model_weight += 1
                    # Remember the new step
                    current_global_step = global_step
                elif global_step > current_global_step + 1:
                    print(f"⚠️ Step jump: {current_global_step} → {global_step} (intermediate steps were skipped)")
                    break
                    # current_global_step = global_step  # optional: whether to update depends on the use case
                else:
                    # global_step <= current_global_step, nothing to do
                    pass  # optionally log here

            # except FileNotFoundError:
            #     print(f"❌ File not found: {sync_weights_path}")
            except Exception as e:
                print(f"❌ Error while reading the file: {e}")

            # Wait 1 second before checking again
            time.sleep(1)

    except KeyboardInterrupt:
        print("\n\nMonitoring stopped manually.")


def custom_loading_dataset(dataset_name, train_name='train.parquet', test_name='test.parquet', max_length=512, tokenizer=None):
    """
    Load and preprocess a dataset from Parquet files, filtering out samples that exceed a specified length.

    Args:
        dataset_name (str): The base directory of the dataset.
        train_name (str, optional): The name of the training file. Defaults to 'train.parquet'.
        test_name (str, optional): The name of the test file. Defaults to 'test.parquet'.
        max_length (int, optional): Maximum token length of the samples to keep. Defaults to 512.
        tokenizer (PreTrainedTokenizer, optional): Tokenizer used to measure sample length. Defaults to None.

    Returns:
        DatasetDict: A dictionary-like object containing the training and test datasets.
    """
    # Paths to the data files
    train_path = os.path.join(dataset_name, train_name)
    test_path = os.path.join(dataset_name, test_name)

    # Helper that computes the token length of a text
    def get_length(text):
        inputs = tokenizer(text, return_tensors="pt", padding=False, truncation=False)
        return inputs["input_ids"].shape[1]

    # Load the training data
    try:
        train_data = pd.read_parquet(train_path)
        train_data['split'] = 'train'  # add a split column
    except FileNotFoundError:
        raise FileNotFoundError(f"Training file not found at {train_path}")

    # Load the test data
    try:
        test_data = pd.read_parquet(test_path)
        test_data['split'] = 'test'  # add a split column
    except FileNotFoundError:
        print(f"Test file not found at {test_path}. Skipping test data.")
        test_data = None

    # Column name mapping
    column_mapping = {
        'ground_truth_answer': 'ground_truth',
        'subject': 'topic',
        'target': 'solution',
        # 'data_source': 'source',
        'input': 'instruction',
        # 'ability': 'skill',
        # 'reward_model': 'reward',
        # 'extra_info': 'metadata',
        'question': 'problem'
    }

    # Rename columns
    train_data.rename(columns=column_mapping, inplace=True)
    if test_data is not None:
        test_data.rename(columns=column_mapping, inplace=True)

    # Compute the token length of each sample
    train_data['length'] = train_data['instruction'].apply(get_length)
    if test_data is not None:
        test_data['length'] = test_data['instruction'].apply(get_length)

    # Filter out samples longer than max_length
    train_data = train_data[train_data['length'] <= max_length]
    if test_data is not None:
        test_data = test_data[test_data['length'] <= max_length]

    # Convert to Hugging Face Datasets
    train_dataset = Dataset.from_pandas(train_data)
    if test_data is not None:
        test_dataset = Dataset.from_pandas(test_data)
    else:
        test_dataset = None

    # Build the DatasetDict
    dataset_dict = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })

    return dataset_dict


def make_conversation(example):
    prompt = []
    system_prompt = "You are a helpful AI Assistant, designed to provide well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. The reasoning process MUST BE enclosed within <think> and </think> tags."
    if system_prompt is not None:
        prompt.append({"role": "system", "content": system_prompt})
    prompt.append({"role": "user", "content": example["problem"]})

    # prompt = example["problem"] + " The reasoning process MUST BE enclosed within <think> and </think> tags. Please reason step by step, and put your final answer within \\boxed{}."
    # if add_think:
    #     prompt += " /think"

    return {"prompt": prompt}


def pre_process(completions):
    """Retrieve the completion content from the input."""
    if isinstance(completions[0], list):
        completion_contents = [completion[0]["content"] for completion in completions]
    elif isinstance(completions[0], dict):
        completion_contents = [completion["content"] for completion in completions]
    else:
        completion_contents = [completion for completion in completions]
    return completion_contents


def accuracy_reward_lv35(completions, solution, **kwargs):
    """Reward function that checks if the completion is the same as the ground truth."""
    # if isinstance(completions[0], dict):
    #     contents = [completion["content"] for completion in completions]
    # else:
    #     contents = [completion for completion in completions]
    contents = pre_process(completions)
    rewards = []
    for content, sol in zip(contents, solution):
        box_sol = f"$\\boxed{{{sol}}}$"  # wrap the ground-truth answer in \boxed{...} so the gold parser can extract it
        try:
            gold_parsed = parse(
                box_sol,
                extraction_mode="first_match",
            )
        except TimeoutError:
            rank = dist.get_rank() if dist.is_initialized() else 0
            print(f"[Rank {rank}] gold parse timeout | content='{content}' | sol='{sol}' | box_sol='{box_sol}'")
            rewards.append(1.0)
            continue
        if len(gold_parsed) != 0:
            # We require the answer to be provided in correct LaTeX (no malformed operators)
            try:
                answer_parsed = parse(
                    content,
                    extraction_config=[
                        LatexExtractionConfig(
                            normalization_config=NormalizationConfig(
                                nits=False,
                                malformed_operators=False,
                                basic_latex=True,
                                equations=True,
                                boxed="all",
                                units=True,
                            ),
                            # Ensures that boxed is tried first
                            boxed_match_priority=0,
                            try_extract_without_anchor=False,
                        )
                    ],
                    extraction_mode="first_match",
                )
                # print(f'answer_parsed:{answer_parsed}')
                # if len(answer_parsed) == 0:
                #     print(f"answer_parsed is None | content='{content}' | sol='{sol}'")
            except TimeoutError:
                rank = dist.get_rank() if dist.is_initialized() else 0
                print(f"[Rank {rank}] answer parse timeout | content='{content}' | sol='{sol}'")
                rewards.append(0.0)
                continue
            # Reward 1 if the content is the same as the ground truth, 0 otherwise
            try:
                reward = float(verify(answer_parsed, gold_parsed))
            except Exception as e:
                print(f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}")
                reward = 0.0
        else:
            # If the gold solution is not parseable, we reward 1 to skip this example
            reward = 1.0
            print("accuracy_reward_lv35: Failed to parse gold solution: ", box_sol)
        rewards.append(reward)

    return torch.Tensor(rewards)

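# Example (illustrative; assumes math_verify can parse these simple boxed answers):
#   accuracy_reward_lv35(completions=["... so the result is \\boxed{4}."], solution=["4"])
#   -> tensor([1.])
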
def _get_per_token_logps(temperature, model, input_ids, attention_mask, logits_to_keep, batch_size=None) -> torch.Tensor:
    batch_size = batch_size or input_ids.size(0)  # chunk inputs into smaller batches to reduce the memory peak
    all_logps = []
    for i in range(0, input_ids.size(0), batch_size):
        input_ids_batch = input_ids[i : i + batch_size]
        attention_mask_batch = attention_mask[i : i + batch_size]

        # We add 1 to `logits_to_keep` because the last logit of the sequence is later excluded
        logits = model(
            input_ids=input_ids_batch, attention_mask=attention_mask_batch, logits_to_keep=logits_to_keep + 1
        ).logits
        logits = logits[:, :-1, :]  # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
        input_ids_batch = input_ids_batch[:, -logits_to_keep:]
        # Divide logits by the sampling temperature.
        # See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
        logits = logits / temperature
        logps = selective_log_softmax(logits, input_ids_batch)  # compute logprobs for the input tokens
        all_logps.append(logps)
    return torch.cat(all_logps, dim=0)

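# Note on shapes (as used below): for `input_ids` of shape (B, P+C) and
# `logits_to_keep=C`, `_get_per_token_logps` returns one log-probability per
# completion token, i.e. a tensor of shape (B, C).
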
def move_to_vllm(model, llm):
    # Push the current HF model weights into the colocated vLLM engine, parameter by parameter.
    for name, param in model.named_parameters():
        # nullcontext is a no-op here; it stands in for a parameter-gathering context
        # needed in sharded setups (e.g. ZeRO-3).
        with nullcontext([param]):
            llm_model = llm.llm_engine.model_executor.driver_worker.model_runner.model
            llm_model.load_weights([(name, param.data)])
    llm.reset_prefix_cache()
    print('vllm updated!')


def cleanup_dist():
    if dist.is_initialized():
        print("Cleaning up distributed process group...")
        dist.destroy_process_group()


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--sync_weights_path", type=str, required=True, help="The path to the model weights file")
    parser.add_argument("--max_num_model_weight", type=int, required=True, help="The number of model weight checkpoints")
    parser.add_argument("--num_samples", type=int, required=True, help="The number of samples")
    parser.add_argument("--num_generations", type=int, required=True, help="The number of generations per sample")
    # `type=bool` does not behave as expected with argparse (any non-empty string is truthy),
    # so these are plain on/off flags.
    parser.add_argument("--skip_retrieve_model_weight", action="store_true", help="Skip the retrieval of model weights")
    parser.add_argument("--random_dataset", action="store_true", help="Re-sample different data from the dataset")
    return parser.parse_args()

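# Example invocation (illustrative values; the script name is a placeholder for this file):
#   python async_gpg_sampler.py \
#       --sync_weights_path /extrahome0/save_dir/AsyncGRPO/4gpus/Async_MoISv6i_1th_cfgv6b/tmp/Qwen3-1.7B/gpg_async_weights.pt \
#       --max_num_model_weight 32 --num_samples 8 --num_generations 8
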
def main():
    args = get_args()
    print(args)
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    target_directory = f"/extrahome0/retrieve_model_weight/{timestamp}"
    max_num_model_weight = args.max_num_model_weight
    if not args.skip_retrieve_model_weight:
        retrieve_model_weight(args.sync_weights_path, target_directory, max_num_model_weight)
    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

    temperature = 0.6
    top_p = 0.95
    N = args.num_samples
    top_k = 20
    max_length = 2048
    num_generations = args.num_generations
    begin_ind = 0
    end_ind = begin_ind + N
    resample = True
    scale_rewards = False
    solutions = []
    prompts_text = []

    # Initialize the tokenizer
    # tokenizer = AutoTokenizer.from_pretrained("/extrahome0/HF_models/Qwen/Qwen3-1.7B")
    # os.environ["VLLM_USE_V1"] = "0"

    # Configure the sampling parameters (for thinking mode)
    sampling_params = SamplingParams(temperature=temperature, top_p=top_p, top_k=top_k, max_tokens=max_length)

    # Initialize the vLLM engine
    # llm = LLM(model="/extrahome0/HF_models/Qwen/Qwen3-1.7B")
    llm = LLM(model="/extrahome0/HF_models/Qwen/Qwen3-1.7B", gpu_memory_utilization=0.8)
    tokenizer = AutoTokenizer.from_pretrained("/extrahome0/HF_models/Qwen/Qwen3-1.7B", trust_remote_code=True)
    data_path = "/extrahome0/HF_datasets/open-r1/simplelr_qwen_level3to5"
    model_id = "/extrahome0/HF_models/Qwen/Qwen3-1.7B"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )

    device = model.device
    batch_size = 8
    max_prompt_length = 768
    seed = 42
    mode = "test"
    dataset = custom_loading_dataset(data_path, max_length=max_prompt_length, tokenizer=tokenizer)
    dataset = dataset.map(make_conversation)
    for split in dataset:
        if "messages" in dataset[split].column_names:
            dataset[split] = dataset[split].remove_columns("messages")

    current_dataset = dataset['train'] if mode == "train" else dataset['test']
    current_dataset = current_dataset.shuffle(seed=seed)
    # model_list = sorted(os.listdir(target_directory))
    model_list = sorted(
        [file.name for file in Path(target_directory).glob('gpg_*.pt')],
        key=lambda x: int(re.search(r'gpg_async_weights_(\d+)', x).group(1))
    )
    assert len(model_list) == max_num_model_weight, f"Error! got {len(model_list)} != {max_num_model_weight} models in model_list: {model_list}"
    # def get_logprobs_and_reward(model_list):
    log_probs = []
    advantages_list = []
    completion_ids_list = []
    prompt_completion_ids_list = []
    completion_mask_list = []
    attention_mask_list = []
    for model_name in tqdm(model_list):
        model_id, state_dict = torch.load(f"{target_directory}/{model_name}", map_location="cpu")
        model.load_state_dict(state_dict)
        print(f"model_id {model_id} loaded!")
        move_to_vllm(model, llm)

        if resample:
            for ind in range(begin_ind, end_ind):
                for _ in range(num_generations):
                    prompts_text.append("<|im_start|>system\nYou are a helpful AI Assistant, designed to provide well-reasoned and detailed responses. You FIRST think about the reasoning process as an internal monologue and then provide the user with the answer. Please put your final answer within \\boxed{}. Also, indicate that it is the answer.<|im_end|>\n<|im_start|>user\n" + current_dataset[ind]['problem'] + "<|im_end|>\n<|im_start|>assistant\n")
                    # prompts_text.append(dataset['train'][ind]['problem'] + "/no_think")
                    solutions.append(current_dataset[ind]['solution'])
            # prompts_text = [maybe_apply_chat_template(example, tokenizer)["prompt"] for example in inputs]
            prompt_inputs = tokenizer(
                text=prompts_text, return_tensors="pt", padding=True, padding_side="left", add_special_tokens=False
            )
            prompt_ids, prompt_mask = prompt_inputs["input_ids"].to(device), prompt_inputs["attention_mask"].to(device)
            if max_prompt_length is not None:
                # If max_prompt_length is set, we trim the prompt to keep only the last `max_prompt_length` tokens.
                # Then we decode those tokens back into text. We manually remove leading pad tokens from the decoded text,
                # because we can't use `skip_special_tokens=True` (some special tokens are still needed for generation).
                prompt_ids = prompt_ids[:, -max_prompt_length:]
                prompt_mask = prompt_mask[:, -max_prompt_length:]
                prompts_text = tokenizer.batch_decode(
                    prompt_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
                )
                prompts_text = [
                    re.sub(rf"^({re.escape(tokenizer.pad_token)})+", "", text) for text in prompts_text
                ]
            begin_ind = end_ind

            if not args.random_dataset:
                resample = False

        all_outputs = llm.generate(prompts_text, sampling_params, use_tqdm=False)
        completion_ids = [output.token_ids for outputs in all_outputs for output in outputs.outputs]
        completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
        completion_ids = pad(completion_ids, padding_value=tokenizer.pad_token_id)
        prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
        is_eos = completion_ids == tokenizer.eos_token_id
        eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
        eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
        sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
        completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
        logits_to_keep = completion_ids.size(1)
        with torch.no_grad():
            logps = _get_per_token_logps(
                model=model,
                input_ids=prompt_completion_ids,
                attention_mask=attention_mask,
                logits_to_keep=logits_to_keep,
                temperature=temperature,
                batch_size=batch_size
            )
        log_probs.append(logps)
        completions_text = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)
        rewards = accuracy_reward_lv35(completions=completions_text, solution=solutions).to(device)
        mean_grouped_rewards = rewards.view(-1, num_generations).mean(dim=1)
        std_grouped_rewards = rewards.view(-1, num_generations).std(dim=1)
        # is_std_zero = torch.isclose(std_grouped_rewards, torch.zeros_like(std_grouped_rewards))

        # Normalize the rewards to compute the advantages
        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(num_generations, dim=0)
        std_grouped_rewards = std_grouped_rewards.repeat_interleave(num_generations, dim=0)
        advantages = rewards - mean_grouped_rewards
        if scale_rewards:
            advantages = advantages / (std_grouped_rewards + 1e-4)
        advantages_list.append(advantages)
        completion_ids_list.append(completion_ids)
        completion_mask_list.append(completion_mask)
        prompt_completion_ids_list.append(prompt_completion_ids)
        attention_mask_list.append(attention_mask)

    print("sampling finish!")
    learner_log_probs = []
    with torch.no_grad():
        # After the sampling loop, `model` holds the last retrieved checkpoint; recompute its
        # log-probs on the rollouts produced by each of the earlier checkpoints.
        for i in tqdm(range(max_num_model_weight - 1)):
            learner_logps = _get_per_token_logps(
                model=model,
                input_ids=prompt_completion_ids_list[i],
                attention_mask=attention_mask_list[i],
                logits_to_keep=completion_ids_list[i].size(1),
                temperature=temperature,
                batch_size=batch_size
            )
            learner_log_probs.append(learner_logps)

    save_path = f"{target_directory}/log_probs_and_advantages.pt"
    torch.save({
        'sampler_log_probs': log_probs,
        'advantages_list': advantages_list,
        'prompt_ids_list': prompt_inputs["input_ids"],
        'prompt_mask_list': prompt_inputs["attention_mask"],
        'completion_ids_list': completion_ids_list,
        'prompt_completion_ids_list': prompt_completion_ids_list,
        'completion_mask_list': completion_mask_list,
        'attention_mask_list': attention_mask_list,
        'learner_log_probs': learner_log_probs
    }, save_path)

    print("learning finish!")
    # calculation


if __name__ == "__main__":
    atexit.register(cleanup_dist)
    main()
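
# A minimal sketch of loading the saved tensors afterwards (the downstream
# analysis itself is not part of this script):
#   data = torch.load(f"{target_directory}/log_probs_and_advantages.pt", map_location="cpu")
#   sampler_logps = data['sampler_log_probs']   # one (B, C) tensor per retrieved checkpoint
#   learner_logps = data['learner_log_probs']   # last checkpoint's log-probs on earlier rollouts
#   advantages = data['advantages_list']        # one (B,) tensor per retrieved checkpoint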