diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..d450c3b4d260e680d6a4049336f4f4beb3670d85 --- /dev/null +++ b/.github/workflows/build_documentation.yml @@ -0,0 +1,18 @@ +name: Build documentation + +on: + push: + branches: + - main + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main + with: + commit_sha: ${{ github.sha }} + package: alignment-handbook + path_to_docs: alignment-handbook/chapters/ + additional_args: --not_python_module + languages: en + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..964698367dbe520442f16f431889fb99527e1435 --- /dev/null +++ b/.github/workflows/build_pr_documentation.yml @@ -0,0 +1,19 @@ +name: Build PR Documentation + +on: + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main + with: + commit_sha: ${{ github.event.pull_request.head.sha }} + pr_number: ${{ github.event.number }} + package: alignment-handbook + path_to_docs: alignment-handbook/chapters/ + additional_args: --not_python_module + languages: en \ No newline at end of file diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml new file mode 100644 index 0000000000000000000000000000000000000000..4b4e012ee7576e1243ad28b2ad93ce2b0edf92d6 --- /dev/null +++ b/.github/workflows/quality.yml @@ -0,0 +1,31 @@ +name: Quality + +on: + push: + branches: + - main + - v*-release + pull_request: + branches: + - main + +jobs: + + check_code_quality: + name: Check code quality + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Setup Python environment + uses: actions/setup-python@v2 + with: + python-version: 3.10.10 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install ".[quality]" + - name: Code quality + run: | + make quality + diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..990795fd57d4c140d348863fb43350a2d06bdcf3 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,31 @@ +name: Tests + +on: + push: + branches: + - main + - v*-release + pull_request: + branches: + - main + +jobs: + + unit-tests: + name: Run unit tests + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + - name: Setup Python environment + uses: actions/setup-python@v2 + with: + python-version: 3.10.10 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install ".[dev, torch]" + - name: Run unit tests + run: HF_TOKEN=$HF_TOKEN pytest -sv tests/ \ No newline at end of file diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..d80d92c36e2329b00698fdea35e2f1aca4f3440e --- /dev/null +++ b/.github/workflows/upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload PR Documentation + +on: + workflow_run: + workflows: ["Build PR Documentation"] + types: + - completed + +jobs: + build: + uses: 
huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: alignment-handbook + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..1445d93c085f3d90a89be70f7900949a76098d24 --- /dev/null +++ b/.gitignore @@ -0,0 +1,164 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. 
For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# Temp folders +data/ +wandb/ \ No newline at end of file diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..1e2b22935bd9b90fe61458051f0ce848cec59716 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,29 @@ +cff-version: 1.2.0 +title: The Alignment Handbook +message: >- + Robust recipes to align language models with human and AI + preferences. +type: software +authors: + - given-names: Lewis + family-names: Tunstall + - given-names: Edward + family-names: Beeching + - given-names: Nathan + family-names: Lambert + - given-names: Nazneen + family-names: Rajani + - given-names: Shengyi + family-names: Huang + - given-names: Kashif + family-names: Rasul + - given-names: Alvaro + family-names: Bartolome + - given-names: Alexander + name-particle: M. + family-names: Rush + - given-names: Thomas + family-names: Wolf +repository-code: 'https://github.com/huggingface/alignment-handbook' +license: Apache-2.0 +version: 0.3.0.dev0 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..e2e4d2cb71b3aa6b9b686ee7c4bcae3560422a1e --- /dev/null +++ b/Makefile @@ -0,0 +1,44 @@ +.PHONY: style quality + +# make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) 
+export PYTHONPATH = src + +check_dirs := src tests scripts + +style: + black --line-length 119 --target-version py310 $(check_dirs) setup.py + isort $(check_dirs) setup.py + +quality: + black --check --line-length 119 --target-version py310 $(check_dirs) setup.py + isort --check-only $(check_dirs) setup.py + flake8 --max-line-length 119 $(check_dirs) setup.py + + +# Release stuff + +pre-release: + python src/alignment/release.py + +pre-patch: + python src/alignment/release.py --patch + +post-release: + python src/alignment/release.py --post_release + +post-patch: + python src/alignment/release.py --post_release --patch + +wheels: + python setup.py bdist_wheel && python setup.py sdist + +wheels_clean: + rm -rf build && rm -rf dist + +pypi_upload: + python -m pip install twine + twine upload dist/* -r pypi + +pypi_test_upload: + python -m pip install twine + twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ce625c6b693a23be0199cfe0030b723c4f6f1373 --- /dev/null +++ b/README.md @@ -0,0 +1,130 @@ +

+<p align="center">
+  <img src="assets/handbook.png">
+</p>
+
+<p align="center">
+  🤗 Models & Datasets | 📃 Technical Report
+</p>
+
+# The Alignment Handbook
+
+Robust recipes to continue pretraining and to align language models with human and AI preferences.
+
+## What is this?
+
+Just one year ago, chatbots were out of fashion and most people hadn't heard about techniques like Reinforcement Learning from Human Feedback (RLHF) to align language models with human preferences. Then, OpenAI broke the internet with ChatGPT and Meta followed suit by releasing the Llama series of language models, which enabled the ML community to build their very own capable chatbots. This has led to a rich ecosystem of datasets and models that have mostly focused on teaching language models to follow instructions through supervised fine-tuning (SFT).
+
+However, we know from the [InstructGPT](https://huggingface.co/papers/2203.02155) and [Llama2](https://huggingface.co/papers/2307.09288) papers that significant gains in helpfulness and safety can be had by augmenting SFT with human (or AI) preferences. At the same time, aligning language models to a set of preferences is a fairly novel idea and there are few public resources available on how to train these models, what data to collect, and what metrics to measure for best downstream performance.
+
+The Alignment Handbook aims to fill that gap by providing the community with a series of robust training recipes that span the whole pipeline.
+
+## News 🗞️
+* **November 21, 2024**: We release the [recipe](recipes/smollm2/README.md) for fine-tuning SmolLM2-Instruct.
+* **August 18, 2024**: We release SmolLM-Instruct v0.2, along with the [recipe](recipes/smollm/README.md) for fine-tuning small LLMs 💻
+* **April 12, 2024**: We release Zephyr 141B (A35B), in collaboration with Argilla and Kaist AI, along with the recipe to fine-tune Mixtral 8x22B with ORPO 🪁
+* **March 12, 2024:** We release StarChat2 15B, along with the recipe to train capable coding assistants 🌟
+* **March 1, 2024:** We release Zephyr 7B Gemma, which is a new recipe to align Gemma 7B with RLAIF 🔥
+* **February 1, 2024:** We release a recipe to align open LLMs with Constitutional AI 📜! See the [recipe](https://github.com/huggingface/alignment-handbook/tree/main/recipes/constitutional-ai) and the [blog post](https://huggingface.co/blog/constitutional_ai) for details.
+* **January 18, 2024:** We release a suite of evaluations of DPO vs KTO vs IPO; see the [recipe](recipes/pref_align_scan/README.md) and the [blog post](https://huggingface.co/blog/pref-tuning) for details.
+* **November 10, 2023:** We release all the training code to replicate Zephyr-7b-β 🪁! We also release [No Robots](https://huggingface.co/datasets/HuggingFaceH4/no_robots), a brand new dataset of 10,000 instructions and demonstrations written entirely by skilled human annotators.
+
+## Links 🔗
+
+* [Zephyr 7B models, datasets, and demos](https://huggingface.co/collections/HuggingFaceH4/zephyr-7b-6538c6d6d5ddd1cbb1744a66)
+
+## How to navigate this project 🧭
+
+This project is simple by design and mostly consists of:
+
+* [`scripts`](./scripts/) to train and evaluate models. Four steps are included: continued pretraining, supervised fine-tuning (SFT) for chat, preference alignment with DPO, and supervised fine-tuning combined with preference alignment via ORPO. Each script supports distributed training of the full model weights with DeepSpeed ZeRO-3, or LoRA/QLoRA for parameter-efficient fine-tuning.
+* [`recipes`](./recipes/) to reproduce models like Zephyr 7B.
Each recipe takes the form of a YAML file which contains all the parameters associated with a single training run. A `gpt2-nl` recipe is also provided to illustrate how this handbook can be used for language or domain adaptation, e.g. by continuing to pretrain on a different language and then applying SFT and DPO on top of the result.
+
+We are also working on a series of guides to explain how methods like direct preference optimization (DPO) work, along with lessons learned from gathering human preferences in practice. To get started, we recommend the following:
+
+1. Follow the [installation instructions](#installation-instructions) to set up your environment etc.
+2. Replicate Zephyr-7b-β by following the [recipe instructions](./recipes/zephyr-7b-beta/README.md).
+
+If you would like to train chat models on your own datasets, we recommend following the dataset formatting instructions [here](./scripts/README.md#fine-tuning-on-your-datasets); a minimal sketch of the expected format is shown after the installation instructions below.
+
+
+## Contents
+
+The initial release of the handbook will focus on the following techniques:
+
+* **Continued pretraining:** adapt language models to a new language or domain, or simply improve them by continued pretraining (causal language modeling) on a new dataset.
+* **Supervised fine-tuning:** teach language models to follow instructions, with tips on how to collect and curate your training dataset.
+* **Reward modeling:** teach language models to distinguish model responses according to human or AI preferences.
+* **Rejection sampling:** a simple, but powerful technique to boost the performance of your SFT model.
+* **Direct preference optimisation (DPO):** a powerful and promising alternative to PPO.
+* **Odds Ratio Preference Optimisation (ORPO)**: a technique to fine-tune language models with human preferences, combining SFT and DPO in a single stage.
+
+## Installation instructions
+
+To run the code in this project, first create a Python virtual environment using e.g. Conda:
+
+```shell
+conda create -n handbook python=3.10 && conda activate handbook
+```
+
+Next, install PyTorch `v2.1.2` - the precise version is important for reproducibility! Since this is hardware-dependent, we
+direct you to the [PyTorch Installation Page](https://pytorch.org/get-started/locally/).
+
+You can then install the remaining package dependencies as follows:
+
+```shell
+git clone https://github.com/huggingface/alignment-handbook.git
+cd ./alignment-handbook/
+python -m pip install .
+```
+
+You will also need Flash Attention 2 installed, which can be done by running:
+
+```shell
+python -m pip install flash-attn --no-build-isolation
+```
+
+> **Note**
+> If your machine has less than 96GB of RAM and many CPU cores, reduce the `MAX_JOBS` argument, e.g. `MAX_JOBS=4 pip install flash-attn --no-build-isolation`
+
+Next, log into your Hugging Face account as follows:
+
+```shell
+huggingface-cli login
+```
+
+Finally, install Git LFS so that you can push models to the Hugging Face Hub:
+
+```shell
+sudo apt-get install git-lfs
+```
+
+You can now check out the `scripts` and `recipes` directories for instructions on how to train some models 🪁!
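If you plan to fine-tune on your own conversations, it helps to know the shape of the data the SFT scripts consume. The following is a minimal, illustrative sketch (the repo id is a placeholder; the formatting instructions linked above remain the source of truth for column and split names):

```python
# Illustrative sketch of the chat format used throughout the handbook:
# a `messages` column holding a list of role/content turns per example.
from datasets import Dataset

train_ds = Dataset.from_list(
    [
        {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What does SFT stand for?"},
                {"role": "assistant", "content": "SFT stands for supervised fine-tuning."},
            ]
        }
    ]
)

# Push to the Hub so a recipe can reference it in its `dataset_mixer` section
train_ds.push_to_hub("your-username/my-chat-dataset")  # placeholder repo id
```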
+
+## Project structure
+
+```
+├── LICENSE
+├── Makefile    <- Makefile with commands like `make style`
+├── README.md   <- The top-level README for developers using this project
+├── chapters    <- Educational content to render on hf.co/learn
+├── recipes     <- Recipe configs, accelerate configs, slurm scripts
+├── scripts     <- Scripts to train and evaluate chat models
+├── setup.cfg   <- Installation config (mostly used for configuring code quality & tests)
+├── setup.py    <- Makes project pip installable (pip install -e .) so `alignment` can be imported
+├── src         <- Source code for use in this project
+└── tests       <- Unit tests
+```
+
+## Citation
+
+If you find the content of this repo useful in your work, please cite it as follows (e.g. via `biblatex`):
+
+```bibtex
+@software{Tunstall_The_Alignment_Handbook,
+  author = {Tunstall, Lewis and Beeching, Edward and Lambert, Nathan and Rajani, Nazneen and Huang, Shengyi and Rasul, Kashif and Bartolome, Alvaro and M. Rush, Alexander and Wolf, Thomas},
+  license = {Apache-2.0},
+  title = {{The Alignment Handbook}},
+  url = {https://github.com/huggingface/alignment-handbook},
+  version = {0.3.0.dev0}
+}
+```
diff --git a/assets/handbook.png b/assets/handbook.png
new file mode 100644
index 0000000000000000000000000000000000000000..a1146bfe78725918584c889749fbb5415db2fca5
Binary files /dev/null and b/assets/handbook.png differ
diff --git a/chapters/en/_toctree.yml b/chapters/en/_toctree.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e8fc7c0a08cbe2dcab7769214b1535d7caca08e0
--- /dev/null
+++ b/chapters/en/_toctree.yml
@@ -0,0 +1,4 @@
+- title: Unit 0. Welcome to the RLHF Handbook!
+  sections:
+  - local: chapter0/introduction
+    title: What is this about?
\ No newline at end of file
diff --git a/chapters/en/chapter0/introduction.mdx b/chapters/en/chapter0/introduction.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..26f500f4fb5db72094f6cac3aefe7c0b204a4d21
--- /dev/null
+++ b/chapters/en/chapter0/introduction.mdx
@@ -0,0 +1,3 @@
+# Welcome to the RLHF Handbook!
+
+Stay tuned for more details 🤗
\ No newline at end of file
diff --git a/config_dpo_run.yaml b/config_dpo_run.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..00933eb3675b009ce7df8ea7baf468beb5888634
--- /dev/null
+++ b/config_dpo_run.yaml
@@ -0,0 +1,42 @@
+# Model arguments
+model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}" +dataset_mixer: + data/my: 1.0 +dataset_splits: +- train +preprocessing_num_workers: 2 + +# dpo trainer config +bf16: true +do_eval: False +eval_strategy: epoch +gradient_accumulation_steps: 1 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False +learning_rate: 1.0e-05 +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +max_length: 4096 +num_train_epochs: 5 +output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_test_model +overwrite_output_dir: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +push_to_hub: False +remove_unused_columns: true +report_to: +- tensorboard +save_strategy: "steps" +save_steps: 51 +save_total_limit: 30 +seed: 42 +warmup_ratio: 0.2 \ No newline at end of file diff --git a/config_grpo_offline.yaml b/config_grpo_offline.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fd8681278737681c361d31cd8c0e09e3d1e60b15 --- /dev/null +++ b/config_grpo_offline.yaml @@ -0,0 +1,45 @@ +# Model arguments +model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/ +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
+dataset_mixer:
+  data/my: 1.0
+dataset_splits:
+- train
+preprocessing_num_workers: 32
+
+# GRPO trainer config
+bf16: true
+do_eval: False
+eval_strategy: epoch
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+learning_rate: 1.0e-05
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_prompt_length: 512
+max_completion_length: 512
+num_train_epochs: 5
+output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_grpo
+overwrite_output_dir: true
+# per_device_batch_size = num_generations * per_device_prompt_num (generations per prompt * prompts per device)
+per_device_eval_batch_size: 4
+per_device_train_batch_size: 4
+num_generations: 4
+push_to_hub: False
+remove_unused_columns: false
+report_to:
+- tensorboard
+save_strategy: "steps"
+save_steps: 50
+save_total_limit: 30
+seed: 42
+warmup_ratio: 0.2
\ No newline at end of file
diff --git a/config_sft_test_env.yaml b/config_sft_test_env.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e2c2e656aba1bc9763910a76122c2bae4e75cb61
--- /dev/null
+++ b/config_sft_test_env.yaml
@@ -0,0 +1,42 @@
+# Model arguments
+model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
+dataset_mixer:
+  data/my: 1.0
+dataset_splits:
+- train
+preprocessing_num_workers: 2
+
+# SFT trainer config
+bf16: true
+do_eval: False
+eval_strategy: epoch
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+learning_rate: 1.0e-05
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 4096
+num_train_epochs: 5
+output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_test_model
+overwrite_output_dir: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+push_to_hub: False
+remove_unused_columns: true
+report_to:
+- tensorboard
+save_strategy: "steps"
+save_steps: 51
+save_total_limit: 30
+seed: 42
+warmup_ratio: 0.2
\ No newline at end of file
diff --git a/grpo_offline_run.py b/grpo_offline_run.py
new file mode 100644
index 0000000000000000000000000000000000000000..c90c6cffe1e62f868e9f6358e36e627c47d9fe9a
--- /dev/null
+++ b/grpo_offline_run.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Offline GRPO training script for decoder language models.
+CUDA_VISIBLE_DEVICES=1,2,3,4,5 ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml grpo_offline_run.py config_grpo_offline.yaml +""" + +import logging +import random +import sys + +import datasets +import torch +import transformers +from transformers import AutoModelForCausalLM, set_seed +from trl.data_utils import maybe_apply_chat_template +from datasets import load_dataset +from alignment import ( + DataArguments, + H4ArgumentParser, + ModelArguments, + SFTConfig, + apply_chat_template, + decontaminate_humaneval, + get_checkpoint, + get_datasets, + get_kbit_device_map, + get_peft_config, + get_quantization_config, + get_tokenizer, +) +from trl import SFTTrainer, setup_chat_format +from trl_012_grpo.grpo_trainer import GRPOTrainer +from trl_012_grpo.grpo_config import GRPOConfig + +logger = logging.getLogger(__name__) + + +def main(): + parser = H4ArgumentParser((ModelArguments, DataArguments, GRPOConfig)) + model_args, data_args, training_args = parser.parse() + + # Set seed for reproducibility + set_seed(training_args.seed) + + ############### + # Setup logging + ############### + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process a small summary + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Model parameters {model_args}") + logger.info(f"Data parameters {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + + # Check for last checkpoint + last_checkpoint = get_checkpoint(training_args) + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") + + ############### + # Load datasets + ############### + raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json") + eval_raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/grpo_data_ori/grpo_del_lowscore/shuffle/grpo_test_shuffle.json") + logger.info( + f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}" + ) + column_names = list(raw_datasets["train"].features) + + ################ + # Load tokenizer + ################ + tokenizer = get_tokenizer(model_args, data_args) + + ####################### + # Load pretrained model + ####################### + logger.info("*** Load pretrained model ***") + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) + quantization_config = get_quantization_config(model_args) + + model_kwargs = dict( + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + 
device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+
+    model = model_args.model_name_or_path
+    # For ChatML we need to add special tokens and resize the embedding layer
+    if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path:
+        model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
+        model, tokenizer = setup_chat_format(model, tokenizer)
+        model_kwargs = None
+
+    #####################
+    # Apply chat template
+    #####################
+    def modify_completion(example):
+        # Wrap the raw prompt in a single user turn and apply the chat template
+        example['prompt'] = \
+            maybe_apply_chat_template({"prompt": [{"role": "user", "content": example['prompt']}]}, tokenizer=tokenizer)['prompt']
+        return example
+
+    raw_datasets = raw_datasets.map(modify_completion)
+    eval_raw_datasets = eval_raw_datasets.map(modify_completion)
+
+    train_dataset = raw_datasets["train"]
+    eval_dataset = eval_raw_datasets["train"]
+
+    ########################
+    # Initialize the Trainer
+    ########################
+
+    # Note: this reward function is not actually used here
+    def reward_len(completions, **kwargs):
+        return [-abs(20 - len(completion)) for completion in completions]
+
+    training_args.model_init_kwargs = model_kwargs
+    trainer = GRPOTrainer(
+        model=model,
+        reward_funcs=reward_len,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
+
+    ###############
+    # Training loop
+    ###############
+    logger.info("*** Train ***")
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        checkpoint = last_checkpoint
+    train_result = trainer.train(resume_from_checkpoint=checkpoint)
+    metrics = train_result.metrics
+    metrics["train_samples"] = len(train_dataset)
+    trainer.log_metrics("train", metrics)
+    trainer.save_metrics("train", metrics)
+    trainer.save_state()
+
+    ##################################
+    # Save model and create model card
+    ##################################
+    logger.info("*** Save model ***")
+    trainer.save_model(training_args.output_dir)
+    logger.info(f"Model saved to {training_args.output_dir}")
+
+    # Save everything else on main process
+    kwargs = {
+        "finetuned_from": model_args.model_name_or_path,
+        "dataset": list(data_args.dataset_mixer.keys()),
+        "dataset_tags": list(data_args.dataset_mixer.keys()),
+        "tags": ["alignment-handbook"],
+    }
+    if trainer.accelerator.is_main_process:
+        trainer.create_model_card(**kwargs)
+        # Restore k,v cache for fast inference
+        trainer.model.config.use_cache = True
+        trainer.model.config.save_pretrained(training_args.output_dir)
+
+    ##########
+    # Evaluate
+    ##########
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+        metrics = trainer.evaluate()
+        metrics["eval_samples"] = len(eval_dataset)
+        trainer.log_metrics("eval", metrics)
+        trainer.save_metrics("eval", metrics)
+
+    if training_args.push_to_hub is True:
+        logger.info("Pushing to hub...")
+        trainer.push_to_hub(**kwargs)
+
+    logger.info("*** Training complete ***")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/recipes/accelerate_configs/deepspeed_zero3.yaml b/recipes/accelerate_configs/deepspeed_zero3.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b25267e8f499f6b98462d2964e61236193d2996d
--- /dev/null
+++ b/recipes/accelerate_configs/deepspeed_zero3.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_multinode_launcher: standard + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: true + zero3_save_16bit_model: true + zero_stage: 3 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/recipes/accelerate_configs/fsdp.yaml b/recipes/accelerate_configs/fsdp.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2ec911682c8681e5fbd4b2c1d75d0c46d13bbe3f --- /dev/null +++ b/recipes/accelerate_configs/fsdp.yaml @@ -0,0 +1,26 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +enable_cpu_affinity: false +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: true + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/recipes/accelerate_configs/fsdp_qlora.yaml b/recipes/accelerate_configs/fsdp_qlora.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f28a0f1046a735579045655dcdb9d3bf7c6ffdcc --- /dev/null +++ b/recipes/accelerate_configs/fsdp_qlora.yaml @@ -0,0 +1,25 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: true + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: false +machine_rank: 0 +main_training_function: main +mixed_precision: 'no' +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/recipes/accelerate_configs/multi_gpu.yaml b/recipes/accelerate_configs/multi_gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4f0557131aa2c1bded4cb4cfdc1cc58a3b25765b --- /dev/null +++ b/recipes/accelerate_configs/multi_gpu.yaml @@ -0,0 +1,16 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: MULTI_GPU +downcast_bf16: 'no' +gpu_ids: all +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/recipes/constitutional-ai/README.md b/recipes/constitutional-ai/README.md new file mode 100644 index 0000000000000000000000000000000000000000..08f4520ae4e73e14adcec2f2b0cf440dc6c4c831 --- /dev/null +++ b/recipes/constitutional-ai/README.md @@ -0,0 +1,24 @@ +# Constitutional AI + +This repo includes the recipe for training the following models: + +* https://huggingface.co/HuggingFaceH4/mistral-7b-anthropic +* https://huggingface.co/HuggingFaceH4/mistral-7b-grok + + 
+## Full training examples
+
+You will require 8 GPUs (80GB of VRAM each) to train the full model.
+```shell
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/constitutional-ai/sft/config_{grok,anthropic}.yaml
+
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/constitutional-ai/dpo/config_anthropic.yaml
+# Note that we did not include the DPO recipe for grok, as that model seems overtrained and too snarky.
+```
+
+
+## Advanced: generating your own dataset
+
+If you want to build or customize the constitutional AI dataset, see https://github.com/huggingface/llm-swarm/tree/main/examples/constitutional-ai for detailed instructions.
diff --git a/recipes/constitutional-ai/dpo/config_anthropic.yaml b/recipes/constitutional-ai/dpo/config_anthropic.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..48f576760de7eb7a0b0b1f13f0b9577c5826a987
--- /dev/null
+++ b/recipes/constitutional-ai/dpo/config_anthropic.yaml
@@ -0,0 +1,41 @@
+# Model arguments
+model_name_or_path: alignment-handbook/mistral-7b-sft-constitutional-ai
+torch_dtype: null
+
+# Data training arguments
+# For definitions, see: src/h4/training/config.py
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+  HuggingFaceH4/cai-conversation-harmless: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.1
+do_eval: true
+do_train: true
+eval_strategy: steps
+eval_steps: 1000
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+hub_model_id: mistral-7b-dpo-constitutional-ai
+learning_rate: 5.0e-7
+log_level: info
+logging_steps: 10
+lr_scheduler_type: linear
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 3
+optim: rmsprop
+output_dir: data/mistral-7b-dpo-constitutional-ai
+per_device_train_batch_size: 2
+per_device_eval_batch_size: 8
+push_to_hub: true
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/recipes/constitutional-ai/sft/config_anthropic.yaml b/recipes/constitutional-ai/sft/config_anthropic.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..6724de0cdc90b0245e1965199bedc0b6d4cfa637
--- /dev/null
+++ b/recipes/constitutional-ai/sft/config_anthropic.yaml
@@ -0,0 +1,48 @@
+# Model arguments
+model_name_or_path: mistralai/Mistral-7B-v0.1
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  HuggingFaceH4/cai-conversation-harmless: 1.0
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+do_train: true
+eval_strategy: epoch # One of ["no", "steps", "epoch"]
+gradient_accumulation_steps: 4
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: mistral-7b-sft-constitutional-ai
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 1
+output_dir: data/mistral-7b-sft-constitutional-ai
+overwrite_output_dir: true
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 8
+push_to_hub: true
+remove_unused_columns: true
+report_to:
+- tensorboard
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/recipes/constitutional-ai/sft/config_grok.yaml b/recipes/constitutional-ai/sft/config_grok.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c79031dc757b4ac36af3b01522844e1f1ad884f6
--- /dev/null
+++ b/recipes/constitutional-ai/sft/config_grok.yaml
@@ -0,0 +1,48 @@
+# Model arguments
+model_name_or_path: mistralai/Mistral-7B-v0.1
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  HuggingFaceH4/grok-conversation-harmless: 0.15
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+do_train: true
+eval_strategy: epoch # One of ["no", "steps", "epoch"]
+gradient_accumulation_steps: 4
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: mistral-7b-sft-constitutional-ai
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 1
+output_dir: data/mistral-7b-sft-constitutional-ai
+overwrite_output_dir: true
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 8
+push_to_hub: true
+remove_unused_columns: true
+report_to:
+- tensorboard
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1
\ No newline at end of file
diff --git a/recipes/gpt2-nl/README.md b/recipes/gpt2-nl/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..68eccfc8b0a1b093c61313757e712d80bf44c911
--- /dev/null
+++ b/recipes/gpt2-nl/README.md
@@ -0,0 +1,43 @@
+# Language Adaptation through Continued Pretraining
+
+This directory shows a base example of how to use continued pretraining and further tuning to adapt a language model to new data (e.g. a new language or domain).
+
+Three steps are needed: continued pretraining (`cpt`), supervised finetuning (`sft`), and direct preference optimisation (`dpo`). In this dummy example, we'll continue pretraining gpt2 on raw Dutch data, then SFT-tune it, and finally align it with DPO. Note that no extensive hyperparameter tuning was done for this example and that the output models are poor - it is just to show you how you can use the scripts for LM adaptation. The scripts work on 4x 3090s (24GB VRAM). If you have less powerful hardware you may need to reduce the batch size. A quick way to inspect the Dutch corpus used here is sketched below.
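As a sanity check before launching the `cpt` step, you can stream a few examples from the Dutch corpus referenced in `recipes/gpt2-nl/cpt/config_full.yaml`. A minimal sketch (the `text` column is the script's documented default; streaming avoids downloading the whole corpus):

```python
# Peek at the raw Dutch corpus used by the `cpt` recipe below.
# The dataset name and `tiny` config come from recipes/gpt2-nl/cpt/config_full.yaml.
from itertools import islice

from datasets import load_dataset

ds = load_dataset("yhavinga/mc4_nl_cleaned", "tiny", split="train", streaming=True)
for example in islice(ds, 3):
    # run_cpt.py reads the `text` column by default (override with `text_column`)
    print(example["text"][:200])
    print("---")
```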
+
+## Continued pretraining
+
+This step will further pretrain the original `gpt2` model on plain Dutch text. Note that the script will by default use the `text` column in the dataset, but you can change that by specifying `text_column` in the yaml file or on the command-line.
+
+```shell
+ACCELERATE_LOG_LEVEL=info accelerate launch \
+    --config_file recipes/accelerate_configs/multi_gpu.yaml \
+    --num_processes 4 \
+    scripts/run_cpt.py \
+    recipes/gpt2-nl/cpt/config_full.yaml
+```
+
+## Supervised finetuning
+
+As other recipes, such as the famous zephyr-7b-beta recipe, have shown, we can then teach our model how to hold a conversation by finetuning it on chat-formatted data. As a base model, we'll make use of the output of the previous step.
+
+```shell
+ACCELERATE_LOG_LEVEL=info accelerate launch \
+    --config_file recipes/accelerate_configs/multi_gpu.yaml \
+    --num_processes 4 \
+    scripts/run_sft.py recipes/gpt2-nl/sft/config_full.yaml
+```
+
+## Direct preference optimisation
+
+Finally, to align the model better with feedback, we can finetune the SFT output with the DPO algorithm. This should improve the quality of the chat capabilities of the model.
+
+```shell
+ACCELERATE_LOG_LEVEL=info accelerate launch \
+    --config_file recipes/accelerate_configs/multi_gpu.yaml \
+    --num_processes 4 \
+    scripts/run_dpo.py recipes/gpt2-nl/dpo/config_full.yaml
+```
+
+## Conclusion
+
+With the steps above you can adapt an LM to a new domain, more data, or even a different language, and then build a powerful chatbot with SFT and DPO on top - all within just three simple commands. All of these steps follow a very similar approach, which makes them well suited to parameterized Slurm jobs. Conveniently, you can also override any argument in the yaml files by passing it as a command-line argument, so the setup is highly adaptable. A sketch of chatting with the resulting model follows below.
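To sanity-check the aligned model, a minimal inference sketch like the following can be used. It assumes the DPO step wrote its output to `data/gpt2-dpo-dutch` (the `output_dir` in the DPO config); the Dutch prompt and sampling settings are illustrative only:

```python
# Minimal chat sanity check for the DPO-aligned model.
# `data/gpt2-dpo-dutch` is the `output_dir` from recipes/gpt2-nl/dpo/config_full.yaml;
# point this at the Hub id instead if you pushed the model.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "data/gpt2-dpo-dutch"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir)

messages = [{"role": "user", "content": "Wat is de hoofdstad van Nederland?"}]
# The chat template was attached to the tokenizer during the SFT step
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, top_p=0.95)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```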
diff --git a/recipes/gpt2-nl/cpt/config_full.yaml b/recipes/gpt2-nl/cpt/config_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c7056cf1c5316b27704cbd67e57dc8bb389af6f --- /dev/null +++ b/recipes/gpt2-nl/cpt/config_full.yaml @@ -0,0 +1,45 @@ +# Model arguments +model_name_or_path: gpt2 +model_revision: main +torch_dtype: bfloat16 + +# Data training arguments +dataset_mixer: + yhavinga/mc4_nl_cleaned: 1.0 +dataset_splits: + - train +dataset_configs: + - tiny +preprocessing_num_workers: 12 + +# SFT trainer config +bf16: true +do_eval: False +eval_strategy: "no" +gradient_accumulation_steps: 1 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False +hub_model_id: gpt2-cpt-dutch +hub_strategy: every_save +learning_rate: 2.0e-04 +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +max_seq_length: 1024 +max_steps: -1 +num_train_epochs: 1 +output_dir: data/gpt2-cpt-dutch +overwrite_output_dir: true +per_device_eval_batch_size: 8 +per_device_train_batch_size: 16 +push_to_hub: true +remove_unused_columns: true +report_to: +- wandb +save_strategy: "steps" +save_steps: 100 +save_total_limit: 1 +seed: 42 +warmup_ratio: 0.1 diff --git a/recipes/gpt2-nl/dpo/config_full.yaml b/recipes/gpt2-nl/dpo/config_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..976c253756435d99c1c4bf54a8a5437e6c41c081 --- /dev/null +++ b/recipes/gpt2-nl/dpo/config_full.yaml @@ -0,0 +1,44 @@ +# Model arguments +model_name_or_path: BramVanroy/gpt2-sft-dutch +model_revision: main +torch_dtype: bfloat16 + +# Data training arguments +# For definitions, see: src/h4/training/config.py +dataset_mixer: + BramVanroy/ultra_feedback_dutch: 1.0 +dataset_splits: +- train_prefs +- test_prefs +preprocessing_num_workers: 12 + +# DPOTrainer arguments +bf16: true +beta: 0.1 +do_eval: true +eval_strategy: steps +eval_steps: 100 +gradient_accumulation_steps: 8 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False +hub_model_id: gpt2-dpo-dutch +learning_rate: 5.0e-7 +log_level: info +logging_steps: 10 +lr_scheduler_type: cosine +max_length: 1024 +max_prompt_length: 512 +num_train_epochs: 1 +optim: adamw_torch +output_dir: data/gpt2-dpo-dutch +per_device_train_batch_size: 8 +per_device_eval_batch_size: 8 +push_to_hub: true +save_strategy: "steps" +save_steps: 100 +save_total_limit: 1 +seed: 42 +warmup_ratio: 0.1 +report_to: +- wandb diff --git a/recipes/gpt2-nl/sft/config_full.yaml b/recipes/gpt2-nl/sft/config_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f80d8efcdb9d3ecd4c5c236aedbc251c8c06cd37 --- /dev/null +++ b/recipes/gpt2-nl/sft/config_full.yaml @@ -0,0 +1,45 @@ +# Model arguments +model_name_or_path: BramVanroy/gpt2-cpt-dutch +model_revision: main +torch_dtype: bfloat16 + +# Data training arguments +chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" +dataset_mixer: + BramVanroy/ultrachat_200k_dutch: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 12 + +# SFT trainer config +bf16: true +do_eval: true +eval_strategy: epoch +gradient_accumulation_steps: 1 
+gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False +hub_model_id: gpt2-sft-dutch +hub_strategy: every_save +learning_rate: 2.0e-05 +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +max_seq_length: 1024 +max_steps: -1 +num_train_epochs: 1 +output_dir: data/gpt2-sft-dutch +overwrite_output_dir: true +per_device_eval_batch_size: 8 +per_device_train_batch_size: 8 +push_to_hub: true +remove_unused_columns: true +report_to: +- wandb +save_strategy: "steps" +save_steps: 100 +save_total_limit: 1 +seed: 42 +warmup_ratio: 0.1 diff --git a/recipes/launch.slurm b/recipes/launch.slurm new file mode 100644 index 0000000000000000000000000000000000000000..d90fdae9bf29cb2671ace11b76261b7a912c41cc --- /dev/null +++ b/recipes/launch.slurm @@ -0,0 +1,86 @@ +#!/bin/bash +#SBATCH --ntasks-per-node=1 +#SBATCH --exclusive +#SBATCH --gres=gpu:8 +#SBATCH --partition=hopper-prod # Adjust this for your cluster +#SBATCH --output=/fsx/h4/logs/%x-%j.out # Adjust this for your cluster +#SBATCH --err=/fsx/h4/logs/%x-%j.err # Adjust this for your cluster + +set -x -e + +source ~/.bashrc +conda activate handbook +echo "START TIME: $(date)" + +MODEL=$1 +TASK=$2 +PRECISION=$3 +ACCELERATOR=$4 +OPTIONAL_ARGS=$5 + +# Training setup +NUM_NODES=$SLURM_NNODES +GPUS_PER_NODE=8 +WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE)) +# Due to conflicts between Accelerate's DeepSpeed configs and Transformers' TrainingArguments, we need to parse the gradient accumulation steps from the config file to ensure they match +CONFIG_FILE=recipes/$MODEL/$TASK/config_$PRECISION.yaml +GRAD_ACC_STEPS=$(grep 'gradient_accumulation_steps' $CONFIG_FILE | awk '{print $2}') + +# Split the string into individual arguments +IFS=' ' read -ra ARGS <<< "$OPTIONAL_ARGS" + +# Loop through the arguments and find the one with "--gradient_accumulation_steps" +for arg in "${ARGS[@]}"; do + if [[ "$arg" == "--gradient_accumulation_steps="* ]]; then + # Extract the value after the equals sign + GRAD_ACC_STEPS="${arg#*=}" + break # Exit the loop once we find the desired argument + fi +done + +echo "Gradient accumulation steps: $GRAD_ACC_STEPS" +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 + +export CMD=" \ + scripts/run_$TASK.py $CONFIG_FILE $OPTIONAL_ARGS + " + +export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch \ + --config_file recipes/accelerate_configs/$ACCELERATOR.yaml \ + --gradient_accumulation_steps $GRAD_ACC_STEPS \ + --num_machines $NUM_NODES \ + --num_processes $WORLD_SIZE \ + --main_process_ip $MASTER_ADDR \ + --main_process_port $MASTER_PORT \ + --machine_rank \$SLURM_PROCID \ + --rdzv_conf "rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT" \ + --max_restarts 1 \ + --role \$(hostname -s): \ + --tee 3 \ + " + +# force crashing on nccl issues like hanging broadcast +export NCCL_ASYNC_ERROR_HANDLING=1 +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=COLL +# export NCCL_SOCKET_NTHREADS=1 +# export NCCL_NSOCKS_PERTHREAD=1 +# export CUDA_LAUNCH_BLOCKING=1 + +# Specific configuration optimized for the Hugging Face Compute Cluster +# Be ye warned this may not work on other clusters! 
+module load cuda/12.1
+
+# srun error handling:
+# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
+# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
+SRUN_ARGS=" \
+ --wait=60 \
+ --kill-on-bad-exit=1 \
+ "
+
+clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD" 2>&1
+
+echo "END TIME: $(date)" \ No newline at end of file
diff --git a/recipes/pref_align_scan/README.md b/recipes/pref_align_scan/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f9c81a51c68450e96c1969e58285e78f48bb682b --- /dev/null +++ b/recipes/pref_align_scan/README.md @@ -0,0 +1,49 @@
+# Comparing Preference Alignment Algorithms
+This directory contains comparisons of three preference alignment algorithms: DPO, IPO, and KTO. Each algorithm was run in several hyperparameter configurations to study its performance. Two model-and-dataset pairs were used to compare the algorithms:
+
+- zephyr-beta-sft and Ultrafeedback
+- OpenHermes-2.5 and the OpenOrca datasets
+
+We release a collection containing the datasets and models used for these experiments. If you require any of the other trained models, we can release them on request.
+You can find a longer description of these results in our [blogpost](https://huggingface.co/blog/pref-tuning).
+
+## Comparisons
+For each algorithm, we aim to tune the beta parameter for a fixed learning rate. We vary beta from 0.1 to 0.9 in steps of 0.1. We have also found that in certain configurations a very small value of beta (0.01) can be effective, so we include this smaller value in all our comparisons.
+
+## Usage
+The experiments can be launched with the following bash script:
+```bash
+#!/bin/bash
+
+# Define an array containing the base configs we wish to fine-tune
+configs=("zephyr" "openhermes")
+# Define an array of loss types
+loss_types=("sigmoid" "kto_pair" "ipo")
+
+# Define an array of beta values
+betas=("0.01" "0.1" "0.2" "0.3" "0.4" "0.5" "0.6" "0.7" "0.8" "0.9")
+
+# Outer loop over base configs and loss types
+for config in "${configs[@]}"; do
+    for loss_type in "${loss_types[@]}"; do
+
+        # Inner loop for beta values
+        for beta in "${betas[@]}"; do
+
+            # Determine the job name and model revision based on loss type.
+            # Use ${config} with braces so the underscore is not parsed as part of the variable name
+            job_name="${config}_${loss_type}_beta_${beta}"
+            model_revision="${loss_type}-${beta}"
+
+            # Submit the job
+            sbatch --job-name=${job_name} recipes/launch.slurm pref_align_scan dpo $config deepspeed_zero3 \
+                "--beta=${beta} --loss_type=${loss_type} --output_dir=data/$config-7b-align-scan-${loss_type}-beta-${beta} --hub_model_revision=${model_revision}"
+        done
+    done
+done
+```
+
+
+
+
+
+
diff --git a/recipes/pref_align_scan/dpo/config_openhermes.yaml b/recipes/pref_align_scan/dpo/config_openhermes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43e8a2309352d6fc5ed9a89b5ae5f2a876ad0952 --- /dev/null +++ b/recipes/pref_align_scan/dpo/config_openhermes.yaml @@ -0,0 +1,41 @@
+# Model arguments
+model_name_or_path: teknium/OpenHermes-2.5-Mistral-7B
+torch_dtype: null
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceH4/orca_dpo_pairs: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# Training arguments with sensible defaults
+bf16: true
+beta: 0.01
+loss_type: sigmoid
+do_eval: true
+do_train: true
+eval_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: HuggingFaceH4/openhermes-2.5-mistral-7b-dpo
+hub_model_revision: v1.0
+
+learning_rate: 5.0e-7
+logging_steps: 10
+lr_scheduler_type: cosine
+max_prompt_length: 512
+num_train_epochs: 1
+optim: adamw_torch
+output_dir: data/openhermes-2.5-mistral-7b-dpo-v1.0
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 8
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/pref_align_scan/dpo/config_zephyr.yaml b/recipes/pref_align_scan/dpo/config_zephyr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0dd6d37921b3f2f1eb5238470e3b0a38ae0bb78e --- /dev/null +++ b/recipes/pref_align_scan/dpo/config_zephyr.yaml @@ -0,0 +1,39 @@
+# Model arguments
+model_name_or_path: alignment-handbook/zephyr-7b-sft-full
+torch_dtype: null
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# Training arguments with sensible defaults
+bf16: true
+beta: 0.01
+loss_type: sigmoid
+do_eval: true
+eval_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: zephyr-7b-align-scan
+hub_model_revision: dpo-beta-0.01
+learning_rate: 5.0e-7
+logging_steps: 10
+lr_scheduler_type: cosine
+max_prompt_length: 512
+num_train_epochs: 1
+optim: adamw_torch
+output_dir: data/zephyr-7b-align-scan-dpo-beta-0.01
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 8
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/pref_align_scan/launch_scan.sh b/recipes/pref_align_scan/launch_scan.sh new file mode 100644 index 0000000000000000000000000000000000000000..334b9472cb96c23d9f73a654eb856ecd4dbc3f1e --- /dev/null +++ b/recipes/pref_align_scan/launch_scan.sh @@ -0,0 +1,24 @@
+#!/bin/bash
+# Define an array containing the base configs we wish to fine-tune
+configs=("zephyr" "openhermes")
+# Define an array of loss types
+loss_types=("sigmoid" "kto_pair" "ipo")
+# Define an array of beta values
+betas=("0.01" "0.1" "0.2" "0.3" "0.4" "0.5" "0.6" "0.7" "0.8" "0.9")
+
+# Outer loop over base configs and loss types
+for config in "${configs[@]}"; do
+    for loss_type in "${loss_types[@]}"; do
+
+        # Inner loop for beta values
+        for beta in "${betas[@]}"; do
+            # Determine the job name and model revision based on loss type.
+            # Use ${config} with braces so the underscore is not parsed as part of the variable name
+            job_name="${config}_${loss_type}_beta_${beta}"
+            model_revision="${loss_type}-${beta}"
+
+            # Submit the job
+            sbatch --job-name=${job_name} recipes/launch.slurm pref_align_scan dpo $config deepspeed_zero3 \
+                "--beta=${beta} --loss_type=${loss_type} --output_dir=data/$config-7b-align-scan-${loss_type}-beta-${beta} --hub_model_revision=${model_revision}"
+        done
+    done
+done \ No newline at end of file
diff --git a/recipes/smollm/README.md b/recipes/smollm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d636ed3f80e41883fd3668d5bd40802059a543c4 --- /dev/null +++ b/recipes/smollm/README.md @@ -0,0 +1,19 @@
+
+# Instructions to train SmolLM-Instruct
+
+We build the [SmolLM-Instruct](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966) (v0.2) models (135M, 360M and 1.7B) by doing SFT on a mix of these datasets:
+- a dataset of 2k simple everyday conversations we generated with llama3.1-70B: [everyday-conversations-llama3.1-2k](https://huggingface.co/datasets/HuggingFaceTB/everyday-conversations-llama3.1-2k/)
+- [Magpie-Pro-300K-Filtered](https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered)
+- [StarCoder2-Self-OSS-Instruct](https://huggingface.co/datasets/bigcode/self-oss-instruct-sc2-exec-filter-50k)
+- A small subset of [OpenHermes-2.5](https://huggingface.co/datasets/teknium/OpenHermes-2.5)
+
+## Setup
+
+Follow the installation instructions in https://github.com/huggingface/alignment-handbook/tree/main?tab=readme-ov-file#installation-instructions
+
+## Training
+We train the models on 8 GPUs using the following command:
+
+```shell
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/smollm/sft/config.yaml
+```
diff --git a/recipes/smollm/sft/config.yaml b/recipes/smollm/sft/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2462191caeec1baf8d0dfa62b5e3166421b62d5c --- /dev/null +++ b/recipes/smollm/sft/config.yaml @@ -0,0 +1,53 @@
+# Model arguments
+model_name_or_path: HuggingFaceTB/SmolLM-360M
+model_revision: main
+tokenizer_name_or_path: HuggingFaceTB/SmolLM-360M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceTB/Magpie-Pro-300K-Filtered-H4: 1.0
+  HuggingFaceTB/self-oss-instruct-sc2-H4: 1.0
+  HuggingFaceTB/OpenHermes-2.5-H4: 0.001
+  HuggingFaceTB/everyday-conversations-llama3.1-2k: 1.0
+  HuggingFaceTB/instruct-data-basics-smollm-H4: 1.0
+
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 36
+
+# SFT trainer config
+bf16: true
+dataset_kwargs:
+  add_special_tokens: false # We already wrap <|im_start|> and <|im_end|> in the chat template
+  append_concat_token: false # No need to add an EOS token across samples
+do_eval: true
+evaluation_strategy: epoch
+gradient_accumulation_steps: 4
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: smollm-360M-instruct-new
+hub_strategy: every_save
+learning_rate: 1.0e-03 # 3e-4
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 1
+output_dir: data/smollm-360M-instruct-new
+overwrite_output_dir: true
+per_device_eval_batch_size: 4
+per_device_train_batch_size: 4
+push_to_hub: true
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/smollm2/README.md b/recipes/smollm2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2afc8844dcaf042ea9f7a9ff50efc1d5297d2600 --- /dev/null +++ b/recipes/smollm2/README.md @@ -0,0 +1,28 @@
+
+# Instructions to train SmolLM2-1.7B-Instruct
+
+We build the [SmolLM2-Instruct](https://huggingface.co/collections/HuggingFaceTB/smollm2-6723884218bcda64b34d7db9) models by doing SFT on [SmolTalk](https://huggingface.co/datasets/HuggingFaceTB/smoltalk) and then DPO on [UltraFeedback](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized).
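+
+The DPO stage starts from the SFT checkpoint. If you train your own SFT model rather than using the released one, you can point the DPO recipe at the local checkpoint instead; the following is a sketch that assumes the default `output_dir` of the SFT config below, using the same CLI-overrides-YAML mechanism as the other parameters in this handbook:
+
+```shell
+# Hypothetical: start DPO from the local SFT output directory (data/smollm2-1.7B-sft)
+# instead of the loubnabnl/smollm2-1.7B-sft Hub checkpoint.
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/smollm2/dpo/config.yaml --model_name_or_path=data/smollm2-1.7B-sft
+```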
+
+## Setup
+
+Follow the installation instructions in https://github.com/huggingface/alignment-handbook/tree/main?tab=readme-ov-file#installation-instructions
+
+## Training
+We train the 1.7B model on 8 GPUs using the following command:
+
+```shell
+# SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/smollm2/sft/config.yaml
+
+# DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/smollm2/dpo/config.yaml
+```
+
+For the 135M and 360M models, we use the [smol-smoltalk](https://huggingface.co/datasets/HuggingFaceTB/smol-smoltalk) dataset for SFT and UltraFeedback for DPO:
+```shell
+# SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/smollm2/sft/config_smol.yaml
+
+# DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/smollm2/dpo/config_smol.yaml
+``` \ No newline at end of file
diff --git a/recipes/smollm2/dpo/config.yaml b/recipes/smollm2/dpo/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f35f8dcdc1203886f04d0eae34e1a204675bb6d --- /dev/null +++ b/recipes/smollm2/dpo/config.yaml @@ -0,0 +1,43 @@
+# Model arguments
+model_name_or_path: loubnabnl/smollm2-1.7B-sft
+torch_dtype: bfloat16
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.5
+do_eval: true
+hub_private_repo: true
+eval_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 8
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: smollm2-1.7B-dpo
+learning_rate: 1.0e-6
+log_level: info
+logging_steps: 10
+lr_scheduler_type: cosine
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 3
+optim: adamw_torch
+output_dir: data/smollm2-1.7B-dpo
+per_device_train_batch_size: 2
+per_device_eval_batch_size: 4
+push_to_hub: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/smollm2/dpo/config_smol.yaml b/recipes/smollm2/dpo/config_smol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b629bc3a36d167a06d32ca844e9f1047c886e3a8 --- /dev/null +++ b/recipes/smollm2/dpo/config_smol.yaml @@ -0,0 +1,43 @@
+# Model arguments
+model_name_or_path: loubnabnl/smollm2-360M-sft # we use this config for the 135M model too
+torch_dtype: bfloat16
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.5
+do_eval: true
+hub_private_repo: true
+eval_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 8
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: smollm2-360M-dpo
+learning_rate: 1.0e-6
+log_level: info
+logging_steps: 10
+lr_scheduler_type: cosine
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 2
+optim: adamw_torch
+output_dir: data/smollm2-360M-dpo
+per_device_train_batch_size: 2
+per_device_eval_batch_size: 4
+push_to_hub: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/smollm2/sft/config.yaml b/recipes/smollm2/sft/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f6cd5168e43a1b3e5be334793d0534215ef443e --- /dev/null +++ b/recipes/smollm2/sft/config.yaml @@ -0,0 +1,49 @@
+# Model arguments
+model_name_or_path: HuggingFaceTB/SmolLM2-1.7B
+model_revision: main
+tokenizer_name_or_path: HuggingFaceTB/SmolLM2-1.7B-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceTB/smoltalk: 1.0
+
+dataset_configs:
+- all
+
+dataset_splits:
+- train
+- test
+preprocessing_num_workers: 36
+
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: epoch
+gradient_accumulation_steps: 4
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: smollm2-1.7B-sft
+hub_strategy: every_save
+learning_rate: 3.0e-04
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 8192
+max_steps: -1
+num_train_epochs: 2
+output_dir: data/smollm2-1.7B-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 4
+per_device_train_batch_size: 4
+push_to_hub: true
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/smollm2/sft/config_smol.yaml b/recipes/smollm2/sft/config_smol.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70be48cc552559be35e03ee675311a7d63a3c04c --- /dev/null +++ b/recipes/smollm2/sft/config_smol.yaml @@ -0,0 +1,46 @@
+# Model arguments
+model_name_or_path: HuggingFaceTB/SmolLM2-360M # we use this config for the 135M model too
+model_revision: main
+tokenizer_name_or_path: HuggingFaceTB/SmolLM2-360M-Instruct # Custom tokenizer with <|im_start|> and <|im_end|> tokens
+torch_dtype: bfloat16
+use_flash_attention_2: true
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceTB/smol-smoltalk: 1.0
+
+dataset_splits:
+- train
+- test
+preprocessing_num_workers: 36
+
+# SFT trainer config
+bf16: true
+do_eval: true
+evaluation_strategy: epoch
+gradient_accumulation_steps: 4
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: smollm2-360M-sft
+hub_strategy: every_save
+learning_rate: 1.0e-03 # 3e-4
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 8192
+max_steps: -1
+num_train_epochs: 2
+output_dir: data/smollm2-360M-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 4
+per_device_train_batch_size: 4
+push_to_hub: true
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/starchat2-15b/README.md b/recipes/starchat2-15b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..06e807f11518194dcd3ffa6c04931967ffb6a15f --- /dev/null +++ b/recipes/starchat2-15b/README.md @@ -0,0 +1,21 @@
+
+# Instructions to train StarChat2
+
+Similar to how we trained Zephyr 7B Beta in our [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
+
+1. Apply SFT to fine-tune [StarCoder2 15B](https://huggingface.co/bigcode/starcoder2-15b) on a blend of chat, code, and math datasets. The result is an SFT model like [`starchat2-15b-sft-v0.1`](https://huggingface.co/HuggingFaceH4/starchat2-15b-sft-v0.1).
+2. Align the SFT model to AI feedback via DPO on the UltraFeedback and Orca DPO Pairs datasets. The result is a DPO model like [`starchat2-15b-v0.1`](https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1).
+
+See below for commands to train these models using DeepSpeed ZeRO-3.
+
+## Full training examples
+
+You will require 8 GPUs (80GB of VRAM) to train the full model - alternatively, you can train on 1 GPU by adjusting `per_device_train_batch_size` and `gradient_accumulation_steps` to keep the global batch size constant. A recipe involving QLoRA will come later 🤗.
+
+```shell
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/starchat2-15b/sft/config_v0.1.yaml
+
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/starchat2-15b/dpo/config_v0.1.yaml
+```
diff --git a/recipes/starchat2-15b/dpo/config_v0.1.yaml b/recipes/starchat2-15b/dpo/config_v0.1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf0ddb3f4aee43823ab35b30494a1049d6d7737e --- /dev/null +++ b/recipes/starchat2-15b/dpo/config_v0.1.yaml @@ -0,0 +1,43 @@
+# Model arguments
+model_name_or_path: HuggingFaceH4/starchat2-15b-sft-v0.1
+torch_dtype: bfloat16
+
+# Data training arguments
+# For definitions, see: src/h4/training/config.py
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+  HuggingFaceH4/orca_dpo_pairs: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.05
+do_eval: true
+eval_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 8
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: starchat2-15b-dpo-v0.1
+learning_rate: 5.0e-7
+log_level: info
+logging_steps: 10
+lr_scheduler_type: cosine
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 2
+optim: adamw_torch
+output_dir: data/starchat2-15b-dpo-v0.1
+per_device_train_batch_size: 2
+per_device_eval_batch_size: 4
+push_to_hub: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/starchat2-15b/sft/config_v0.1.yaml b/recipes/starchat2-15b/sft/config_v0.1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f5892de59a6945006a6ffd7b219aec6dd8433ea2 --- /dev/null +++ b/recipes/starchat2-15b/sft/config_v0.1.yaml @@ -0,0 +1,49 @@
+# Model arguments
+model_name_or_path: bigcode/starcoder2-15b
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+dataset_mixer:
+  HuggingFaceH4/airoboros-3.2: 1.0
+  HuggingFaceH4/Code-Feedback: 1.0
+  HuggingFaceH4/orca-math-word-problems-200k: 1.0
+  HuggingFaceH4/SystemChat: 1.0
+  HuggingFaceH4/capybara: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 24
+
+# SFT trainer config
+bf16: true
+do_eval: true
+eval_strategy: epoch
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: starchat2-15b-v0.1
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 3
+output_dir: data/starchat2-15b-v0.1
+overwrite_output_dir: true
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 8
+push_to_hub: true
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/zephyr-141b-A35b/README.md b/recipes/zephyr-141b-A35b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..203cd14cfe4b3f3c303a3bbe086415234bbf001f --- /dev/null +++ b/recipes/zephyr-141b-A35b/README.md @@ -0,0 +1,23 @@
+
+# Instructions to train Zephyr-141B-A35B with ORPO
+
+This model is fine-tuned via a novel alignment algorithm called [Odds Ratio Preference Optimization (ORPO)](https://huggingface.co/papers/2403.07691). ORPO does not require an SFT step to achieve high performance and is thus much more computationally efficient than methods like DPO and PPO. To train Zephyr-141B-A35B, we used the [`argilla/distilabel-capybara-dpo-7k-binarized`](https://huggingface.co/datasets/argilla/distilabel-capybara-dpo-7k-binarized) preference dataset, which consists of synthetic, high-quality, multi-turn preferences that have been scored via LLMs.
+
+See below for commands to train these models using FSDP. **Note:** we found it was not possible to train this large model with DeepSpeed ZeRO-3 due to unresolved NCCL errors which cause GPUs to hang.
+
+## Full training examples
+
+You will require 4 nodes of 8 GPUs (80GB of VRAM) to train the full model - alternatively, you may be able to train on fewer GPUs by adjusting `per_device_train_batch_size`, `gradient_accumulation_steps`, and `num_train_epochs` to keep the global batch size constant. A recipe involving QLoRA will come later 🤗.
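+
+As a rough sketch (assuming the batch settings in the ORPO config below: `per_device_train_batch_size: 1` and `gradient_accumulation_steps: 1` across 4 x 8 = 32 GPUs, i.e. a global batch size of 32), a single 8-GPU node would need `gradient_accumulation_steps=4` to preserve 8 x 1 x 4 = 32, memory permitting; this is an untested illustration, not a supported command:
+
+```shell
+# Hypothetical single-node launch: keep the global batch size at 32
+# by raising gradient accumulation from 1 to 4 across 8 processes.
+ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch --config_file recipes/accelerate_configs/fsdp.yaml --num_processes=8 scripts/run_orpo.py recipes/zephyr-141b-A35b/orpo/config_full.yaml --gradient_accumulation_steps=4
+```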
+
+To run with Slurm, use:
+
+```shell
+sbatch --job-name=handbook_sft --nodes=4 recipes/launch.slurm zephyr-141b-A35b orpo full fsdp
+```
+
+Under the hood, this calls the following script, which can be adapted to other models and datasets:
+
+```shell
+ACCELERATE_LOG_LEVEL=info TRANSFORMERS_VERBOSITY=info accelerate launch --config_file recipes/accelerate_configs/fsdp.yaml scripts/run_orpo.py recipes/zephyr-141b-A35b/orpo/config_full.yaml
+``` \ No newline at end of file
diff --git a/recipes/zephyr-141b-A35b/orpo/config_full.yaml b/recipes/zephyr-141b-A35b/orpo/config_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b521013207f301d09b745adc6fbb23ddad8ae5de --- /dev/null +++ b/recipes/zephyr-141b-A35b/orpo/config_full.yaml @@ -0,0 +1,39 @@
+# Model arguments
+model_name_or_path: mistral-community/Mixtral-8x22B-v0.1
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  argilla/distilabel-capybara-dpo-7k-binarized: 1.0
+dataset_splits:
+- train
+preprocessing_num_workers: 8
+
+# ORPOTrainer arguments
+bf16: true
+beta: 0.05
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: true
+hub_model_id: zephyr-orpo-141b-A35b
+learning_rate: 5.0e-6
+log_level: info
+logging_steps: 10
+lr_scheduler_type: inverse_sqrt
+max_length: 2048
+max_prompt_length: 1792
+num_train_epochs: 3
+optim: adamw_bnb_8bit
+output_dir: data/zephyr-orpo-141b-A35b
+per_device_train_batch_size: 1
+push_to_hub: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_steps: 100
diff --git a/recipes/zephyr-7b-beta/README.md b/recipes/zephyr-7b-beta/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8c082f174f0c0d4a7ae8bb7f61e78108738c0767 --- /dev/null +++ b/recipes/zephyr-7b-beta/README.md @@ -0,0 +1,44 @@
+
+# Instructions to Replicate Zephyr-7b-β
+
+As described in the Zephyr [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
+
+1. Apply SFT to fine-tune Mistral 7B on a filtered version of the UltraChat dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)). The result is an SFT model like [`zephyr-7b-sft-full`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) or [`zephyr-7b-sft-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-sft-qlora).
+2. Align the SFT model to AI feedback via DPO on a preprocessed version of the UltraFeedback dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized)). The result is a DPO model like [`zephyr-7b-dpo-full`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-full) or [`zephyr-7b-dpo-qlora`](https://huggingface.co/alignment-handbook/zephyr-7b-dpo-qlora).
+
+**Note:** After the release of Zephyr, the team at [Argilla](https://argilla.io) found that the source UltraFeedback dataset had a few thousand incorrect preference labels from GPT-4. Additionally, TRL's `SFTTrainer` had a bug in the learning rate scheduler that terminated training early. Accounting for these changes led us to find a better set of hyperparameters than those described in the technical report. In particular, for DPO training we found that training for 1 epoch with `beta=0.01` was sufficient to achieve comparable performance to `zephyr-7b-beta` (vs. 3 epochs with `beta=0.1`).
+
+See below for commands to train these models using either DeepSpeed ZeRO-3 or LoRA.
+
+## Full training examples
+
+You will require 8 GPUs (80GB of VRAM) to train the full model.
+```shell
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_full.yaml
+
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_full.yaml
+```
+
+## QLoRA training examples
+
+Train faster with Flash Attention 2 (on GPUs that support FA2, e.g. A100, H100):
+```shell
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_qlora.yaml --load_in_4bit=true
+
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_qlora.yaml
+```
+
+P.S. Using Flash Attention also allows you to drastically increase the batch size (2x in our case).
+
+Train without Flash Attention (i.e. via PyTorch's scaled dot product attention):
+```shell
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_qlora.yaml --load_in_4bit=true --attn_implementation=sdpa
+
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_qlora.yaml --attn_implementation=sdpa
+``` \ No newline at end of file
diff --git a/recipes/zephyr-7b-beta/dpo/config_full.yaml b/recipes/zephyr-7b-beta/dpo/config_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12b47b18fb315184ae908108b32550451f9076e6 --- /dev/null +++ b/recipes/zephyr-7b-beta/dpo/config_full.yaml @@ -0,0 +1,41 @@
+# Model arguments
+model_name_or_path: alignment-handbook/zephyr-7b-sft-full
+torch_dtype: null
+
+# Data training arguments
+# For definitions, see: src/h4/training/config.py
+dataset_mixer:
+  HuggingFaceH4/ultrafeedback_binarized: 1.0
+dataset_splits:
+- train_prefs
+- test_prefs
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.01
+do_eval: true
+eval_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: zephyr-7b-dpo-full
+learning_rate: 5.0e-7
+log_level: info
+logging_steps: 10
+lr_scheduler_type: cosine
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 1
+optim: adamw_torch
+output_dir: data/zephyr-7b-dpo-full
+per_device_train_batch_size: 8
+per_device_eval_batch_size: 8
+push_to_hub: true
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git 
a/recipes/zephyr-7b-beta/dpo/config_qlora.yaml b/recipes/zephyr-7b-beta/dpo/config_qlora.yaml new file mode 100644 index 0000000000000000000000000000000000000000..46fbccd9f35f041ee942e54a9f8a1c5ada25f6f0 --- /dev/null +++ b/recipes/zephyr-7b-beta/dpo/config_qlora.yaml @@ -0,0 +1,57 @@ +# Model arguments +model_name_or_path: alignment-handbook/zephyr-7b-sft-qlora +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# LoRA arguments +use_peft: true +load_in_4bit: true +lora_r: 128 +lora_alpha: 128 +lora_dropout: 0.05 +lora_target_modules: +- q_proj +- k_proj +- v_proj +- o_proj +- gate_proj +- up_proj +- down_proj + +# Data training arguments + +dataset_mixer: + HuggingFaceH4/ultrafeedback_binarized: 1.0 +dataset_splits: +- train_prefs +- test_prefs +preprocessing_num_workers: 12 + +# DPOTrainer arguments +bf16: true +beta: 0.01 +do_eval: true +eval_strategy: steps +eval_steps: 100 +gradient_accumulation_steps: 4 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +hub_model_id: zephyr-7b-dpo-qlora +learning_rate: 5.0e-6 +log_level: info +logging_steps: 10 +lr_scheduler_type: cosine +max_length: 1024 +max_prompt_length: 512 +num_train_epochs: 1 +optim: paged_adamw_32bit +output_dir: data/zephyr-7b-dpo-qlora # It is handy to append `hub_model_revision` to keep track of your local experiments +per_device_train_batch_size: 4 +per_device_eval_batch_size: 8 +push_to_hub: true +save_strategy: "steps" +save_steps: 100 +save_total_limit: 1 +seed: 42 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/zephyr-7b-beta/sft/config_full.yaml b/recipes/zephyr-7b-beta/sft/config_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f1e8457d85b2aa917d74a98d9f9bfe6dccefd9fb --- /dev/null +++ b/recipes/zephyr-7b-beta/sft/config_full.yaml @@ -0,0 +1,46 @@ +# Model arguments +model_name_or_path: mistralai/Mistral-7B-v0.1 +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}" +dataset_mixer: + HuggingFaceH4/ultrachat_200k: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 12 + +# SFT trainer config +bf16: true +do_eval: true +eval_strategy: epoch +gradient_accumulation_steps: 1 +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: False +hub_model_id: zephyr-7b-sft-full +hub_strategy: every_save +learning_rate: 2.0e-05 +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +max_seq_length: 2048 +max_steps: -1 +num_train_epochs: 1 +output_dir: data/zephyr-7b-sft-full +overwrite_output_dir: true +per_device_eval_batch_size: 8 +per_device_train_batch_size: 16 +push_to_hub: true +remove_unused_columns: true +report_to: +- tensorboard +save_strategy: "steps" +save_steps: 100 +save_total_limit: 1 +seed: 42 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/recipes/zephyr-7b-beta/sft/config_qlora.yaml b/recipes/zephyr-7b-beta/sft/config_qlora.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4881757c2c62c5a448d292bb6272350a363b45f1 
--- /dev/null +++ b/recipes/zephyr-7b-beta/sft/config_qlora.yaml @@ -0,0 +1,60 @@
+# Model arguments
+model_name_or_path: mistralai/Mistral-7B-v0.1
+model_revision: main
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# LoRA arguments
+load_in_4bit: true
+use_peft: true
+lora_r: 16
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+- q_proj
+- k_proj
+- v_proj
+- o_proj
+- gate_proj
+- up_proj
+- down_proj
+
+# Data training arguments
+chat_template: "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+dataset_mixer:
+  HuggingFaceH4/ultrachat_200k: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+do_eval: true
+eval_strategy: epoch
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: zephyr-7b-sft-qlora
+hub_strategy: every_save
+learning_rate: 2.0e-04
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 1
+output_dir: data/zephyr-7b-sft-qlora
+overwrite_output_dir: true
+per_device_eval_batch_size: 8
+per_device_train_batch_size: 4
+push_to_hub: true
+report_to:
+- tensorboard
+save_strategy: "steps"
+save_steps: 100
+save_total_limit: 1
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/zephyr-7b-gemma/README.md b/recipes/zephyr-7b-gemma/README.md new file mode 100644 index 0000000000000000000000000000000000000000..416462e3553649a4aab138c2c7f86995ac028026 --- /dev/null +++ b/recipes/zephyr-7b-gemma/README.md @@ -0,0 +1,21 @@
+
+# Instructions to Replicate Zephyr 7B Gemma
+
+Similar to how we trained Zephyr 7B Beta in our [technical report](https://huggingface.co/papers/2310.16944), training this model proceeds in two steps:
+
+1. Apply SFT to fine-tune Gemma 7B on the Deita 10k dataset ([link](https://huggingface.co/datasets/HuggingFaceH4/deita-10k-v0-sft)). The result is an SFT model like [`zephyr-7b-gemma-sft`](https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-sft-v0.1).
+2. Align the SFT model to AI feedback via DPO on a curated mix of 7k examples by Argilla ([link](https://huggingface.co/datasets/argilla/dpo-mix-7k)). The result is a DPO model like [`zephyr-7b-gemma`](https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1).
+
+See below for commands to train these models using either DeepSpeed ZeRO-3 or LoRA.
+
+## Full training examples
+
+You will require 8 GPUs (80GB of VRAM) to train the full model - alternatively, you can train on 1 GPU by adjusting the micro batch size and gradient accumulation steps to keep the global batch size constant. A recipe involving QLoRA will come later 🤗.
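+
+For instance, the SFT config below uses `per_device_train_batch_size: 4` and `gradient_accumulation_steps: 4` on 8 GPUs, i.e. a global batch size of 8 x 4 x 4 = 128; a hypothetical single-GPU run would preserve it with `gradient_accumulation_steps=32` (1 x 4 x 32 = 128). This is a sketch, not a tested command:
+
+```shell
+# Hypothetical single-GPU SFT launch with the global batch size held at 128
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml --num_processes=1 scripts/run_sft.py recipes/zephyr-7b-gemma/sft/config_full.yaml --gradient_accumulation_steps=32
+```
+
+The full 8-GPU commands are: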
+
+```shell
+# Step 1 - SFT
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b-gemma/sft/config_full.yaml
+
+# Step 2 - DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b-gemma/dpo/config_full.yaml
+```
diff --git a/recipes/zephyr-7b-gemma/dpo/config_full.yaml b/recipes/zephyr-7b-gemma/dpo/config_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f17ac683ffc5eb5069d1e2ae95e7935dd892fd14 --- /dev/null +++ b/recipes/zephyr-7b-gemma/dpo/config_full.yaml @@ -0,0 +1,42 @@
+# Model arguments
+model_name_or_path: HuggingFaceH4/zephyr-7b-gemma-sft-v0.1
+torch_dtype: bfloat16
+
+# Data training arguments
+# For definitions, see: src/h4/training/config.py
+dataset_mixer:
+  argilla/dpo-mix-7k: 1.0
+dataset_splits:
+- train
+- test
+preprocessing_num_workers: 12
+
+# DPOTrainer arguments
+bf16: true
+beta: 0.05
+do_eval: true
+eval_strategy: steps
+eval_steps: 100
+gradient_accumulation_steps: 8
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: False
+hub_model_id: zephyr-7b-gemma-dpo
+learning_rate: 5.0e-7
+log_level: info
+logging_steps: 10
+lr_scheduler_type: cosine
+max_length: 1024
+max_prompt_length: 512
+num_train_epochs: 2
+optim: adamw_torch
+output_dir: data/zephyr-7b-gemma-dpo
+per_device_train_batch_size: 2
+per_device_eval_batch_size: 4
+push_to_hub: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/recipes/zephyr-7b-gemma/sft/config_full.yaml b/recipes/zephyr-7b-gemma/sft/config_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03226ab3b4cd13d38bb68d14c8eaa85274bdcb91 --- /dev/null +++ b/recipes/zephyr-7b-gemma/sft/config_full.yaml @@ -0,0 +1,48 @@
+# Model arguments
+model_name_or_path: google/gemma-7b
+model_revision: main
+tokenizer_name_or_path: philschmid/gemma-tokenizer-chatml # Custom tokenizer with <|im_start|> and <|im_end|> tokens
+torch_dtype: bfloat16
+attn_implementation: flash_attention_2
+
+# Data training arguments
+dataset_mixer:
+  HuggingFaceH4/deita-10k-v0-sft: 1.0
+dataset_splits:
+- train_sft
+- test_sft
+preprocessing_num_workers: 12
+
+# SFT trainer config
+bf16: true
+dataset_kwargs:
+  add_special_tokens: false # We already wrap <bos> and <eos> in the chat template
+  append_concat_token: false # No need to add <eos> across samples
+do_eval: true
+eval_strategy: epoch
+gradient_accumulation_steps: 4
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: zephyr-7b-gemma-sft
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+max_seq_length: 2048
+max_steps: -1
+num_train_epochs: 3
+output_dir: data/zephyr-7b-gemma-sft
+overwrite_output_dir: true
+per_device_eval_batch_size: 4
+per_device_train_batch_size: 4
+push_to_hub: true
+remove_unused_columns: true
+report_to:
+- tensorboard
+- wandb
+save_strategy: "no"
+seed: 42
+warmup_ratio: 0.1 \ No newline at end of file
diff --git a/run_dpo.py b/run_dpo.py new file mode 100644 index 0000000000000000000000000000000000000000..31cdc39bf832da5404a2deeeed293683b62f03ad --- /dev/null +++ b/run_dpo.py @@ -0,0 +1,269 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import random
+import sys
+from datasets import load_dataset
+
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, set_seed
+
+from alignment import (
+    DataArguments,
+    DPOConfig,
+    H4ArgumentParser,
+    ModelArguments,
+    apply_chat_template,
+    decontaminate_humaneval,
+    get_checkpoint,
+    get_datasets,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+    get_tokenizer,
+    is_adapter_model,
+)
+from peft import PeftConfig, PeftModel
+from trl import DPOTrainer
+
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    parser = H4ArgumentParser((ModelArguments, DataArguments, DPOConfig))
+    model_args, data_args, training_args = parser.parse()
+
+    #######
+    # Setup
+    #######
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+
+    # Log on each process the small summary:
+    logger.info(f"Model parameters {model_args}")
+    logger.info(f"Data parameters {data_args}")
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    # Check for last checkpoint
+    last_checkpoint = get_checkpoint(training_args)
+    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+        logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")
+
+    # Set seed for reproducibility
+    set_seed(training_args.seed)
+
+    ###############
+    # Load datasets
+    ###############
+    # Load a local JSON preference dataset; here the same file doubles as the eval split
+    raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/dpo_data/dpo_test.json")
+    eval_raw_datasets = load_dataset("json", data_files="/data01/swzhang/dataset/dpo_data/dpo_test.json")
+    raw_datasets["test"] = eval_raw_datasets["train"]
+
+    def process_dpo_data(example):
+        # Convert the flat prompt/chosen/rejected strings into the list-of-messages
+        # format that apply_chat_template expects for the "dpo" task
+        example["chosen"] = [
+            {"content": example["prompt"], "role": "user"},
+            {"content": example["chosen"], "role": "assistant"},
+        ]
+        example["rejected"] = [
+            {"content": example["prompt"], "role": "user"},
+            {"content": example["rejected"], "role": "assistant"},
+        ]
+        return example
+
+    raw_datasets = raw_datasets.map(process_dpo_data)
+    print(raw_datasets["train"][0])  # Sanity-check one formatted example
+
+    logger.info(
+        f"Training on the following splits: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}"
+    )
+    column_names = list(raw_datasets["train"].features)
+
+    #####################################
+    # Load tokenizer and process datasets
+    #####################################
+    data_args.truncation_side = "left"  # Truncate from left to ensure we don't lose labels in final turn
+    tokenizer = get_tokenizer(model_args, data_args)
+
+    #####################
+    # Apply chat template
+    #####################
+    raw_datasets = raw_datasets.map(
+        apply_chat_template,
+        fn_kwargs={
+            "tokenizer":
tokenizer, + "task": "dpo", + "auto_insert_empty_system_msg": data_args.auto_insert_empty_system_msg, + }, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + desc="Formatting comparisons with prompt template", + ) + print(raw_datasets['train'][0]) + + + ########################## + # Decontaminate benchmarks + ########################## + num_raw_train_samples = len(raw_datasets["train"]) + raw_datasets = raw_datasets.filter( + decontaminate_humaneval, + fn_kwargs={"text_column": "text_chosen"}, + batched=True, + batch_size=10_000, + num_proc=1, + desc="Decontaminating HumanEval samples", + ) + num_filtered_train_samples = num_raw_train_samples - len(raw_datasets["train"]) + logger.info( + f"Decontaminated {num_filtered_train_samples} ({num_filtered_train_samples/num_raw_train_samples * 100:.2f}%) samples from the training set." + ) + + # Replace column names with what TRL needs, text_chosen -> chosen and text_rejected -> rejected + for split in ["train", "test"]: + raw_datasets[split] = raw_datasets[split].rename_columns( + {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"} + ) + + # Log a few random samples from the training set: + for index in random.sample(range(len(raw_datasets["train"])), 3): + logger.info(f"Prompt sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['prompt']}") + logger.info(f"Chosen sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['chosen']}") + logger.info(f"Rejected sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['rejected']}") + + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) + quantization_config = get_quantization_config(model_args) + + model_kwargs = dict( + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + + model = model_args.model_name_or_path + if is_adapter_model(model, model_args.model_revision) is True: + logger.info(f"Loading SFT adapter for {model_args.model_name_or_path=}") + peft_config = PeftConfig.from_pretrained(model_args.model_name_or_path, revision=model_args.model_revision) + model_kwargs = dict( + revision=model_args.base_model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + base_model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + **model_kwargs, + ) + model = PeftModel.from_pretrained( + base_model, + model_args.model_name_or_path, + revision=model_args.model_revision, + ) + model_kwargs = None + + ref_model = model + ref_model_kwargs = model_kwargs + + if model_args.use_peft is True: + ref_model = None + ref_model_kwargs = None + + ######################### + # Instantiate DPO trainer + ######################### + trainer = DPOTrainer( + model, + ref_model, + model_init_kwargs=model_kwargs, + ref_model_init_kwargs=ref_model_kwargs, + args=training_args, + beta=training_args.beta, + 
train_dataset=raw_datasets["train"], + eval_dataset=raw_datasets["test"], + tokenizer=tokenizer, + max_length=training_args.max_length, + max_prompt_length=training_args.max_prompt_length, + peft_config=get_peft_config(model_args), + loss_type=training_args.loss_type, + ) + + ############### + # Training loop + ############### + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + metrics["train_samples"] = len(raw_datasets["train"]) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + logger.info("*** Training complete ***") + + ################################## + # Save model and create model card + ################################## + logger.info("*** Save model ***") + trainer.save_model(training_args.output_dir) + logger.info(f"Model saved to {training_args.output_dir}") + + # Save everything else on main process + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": list(data_args.dataset_mixer.keys()), + "dataset_tags": list(data_args.dataset_mixer.keys()), + "tags": ["alignment-handbook"], + } + if trainer.accelerator.is_main_process: + trainer.create_model_card(**kwargs) + # Restore k,v cache for fast inference + trainer.model.config.use_cache = True + trainer.model.config.save_pretrained(training_args.output_dir) + + ########## + # Evaluate + ########## + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + metrics["eval_samples"] = len(raw_datasets["test"]) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.push_to_hub is True: + logger.info("Pushing to hub...") + trainer.push_to_hub(**kwargs) + + logger.info("*** Training complete! ***") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/run_sft_test_env.py b/run_sft_test_env.py new file mode 100644 index 0000000000000000000000000000000000000000..4f19c912ca7833c9a7c9b5be93deec736cf4e171 --- /dev/null +++ b/run_sft_test_env.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Supervised fine-tuning script for decoder language models. 
+""" + +import logging +import random +import sys + +import datasets +import torch +import transformers +from transformers import AutoModelForCausalLM, set_seed +from datasets import load_dataset +from alignment import ( + DataArguments, + H4ArgumentParser, + ModelArguments, + SFTConfig, + apply_chat_template, + decontaminate_humaneval, + get_checkpoint, + get_datasets, + get_kbit_device_map, + get_peft_config, + get_quantization_config, + get_tokenizer, +) +from trl import SFTTrainer, setup_chat_format + + +logger = logging.getLogger(__name__) + + +def main(): + parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig)) + model_args, data_args, training_args = parser.parse() + + # Set seed for reproducibility + set_seed(training_args.seed) + + ############### + # Setup logging + ############### + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process a small summary + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Model parameters {model_args}") + logger.info(f"Data parameters {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + + # Check for last checkpoint + last_checkpoint = get_checkpoint(training_args) + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") + + ############### + # Load datasets + ############### + raw_datasets = load_dataset("json", data_files="/home/swzhang/LLM_alignment/alignment-handbook/test.json") + eval_raw_datasets = load_dataset("json", data_files="/home/swzhang/LLM_alignment/alignment-handbook/test.json") + logger.info( + f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}" + ) + column_names = list(raw_datasets["train"].features) + + ################ + # Load tokenizer + ################ + tokenizer = get_tokenizer(model_args, data_args) + + ####################### + # Load pretrained model + ####################### + logger.info("*** Load pretrained model ***") + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) + quantization_config = get_quantization_config(model_args) + + model_kwargs = dict( + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + + model = model_args.model_name_or_path + # For ChatML we need to add special tokens and resize the embedding layer + if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path: + model = 
AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs) + model, tokenizer = setup_chat_format(model, tokenizer) + model_kwargs = None + + ##################### + # Apply chat template + ##################### + raw_datasets = raw_datasets.map( + apply_chat_template, + fn_kwargs={ + "tokenizer": tokenizer, + "task": "sft", + "auto_insert_empty_system_msg": False, + }, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + desc="Applying chat template", + ) + eval_raw_datasets = eval_raw_datasets.map( + apply_chat_template, + fn_kwargs={ + "tokenizer": tokenizer, + "task": "sft", + "auto_insert_empty_system_msg": False, + }, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + desc="Applying chat template", + ) + + + train_dataset = raw_datasets["train"] + eval_dataset = eval_raw_datasets["train"] + + ######################## + # Initialize the Trainer + ######################## + trainer = SFTTrainer( + model=model, + model_init_kwargs=model_kwargs, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + dataset_text_field="text", + max_seq_length=training_args.max_seq_length, + tokenizer=tokenizer, + packing=True, + peft_config=get_peft_config(model_args), + dataset_kwargs=training_args.dataset_kwargs, + ) + + ############### + # Training loop + ############### + logger.info("*** Train ***") + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + metrics["train_samples"] = len(train_dataset) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + ################################## + # Save model and create model card + ################################## + logger.info("*** Save model ***") + trainer.save_model(training_args.output_dir) + logger.info(f"Model saved to {training_args.output_dir}") + + # Save everything else on main process + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": list(data_args.dataset_mixer.keys()), + "dataset_tags": list(data_args.dataset_mixer.keys()), + "tags": ["alignment-handbook"], + } + if trainer.accelerator.is_main_process: + trainer.create_model_card(**kwargs) + # Restore k,v cache for fast inference + trainer.model.config.use_cache = True + trainer.model.config.save_pretrained(training_args.output_dir) + + ########## + # Evaluate + ########## + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + metrics["eval_samples"] = len(eval_dataset) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.push_to_hub is True: + logger.info("Pushing to hub...") + trainer.push_to_hub(**kwargs) + + logger.info("*** Training complete ***") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 0000000000000000000000000000000000000000..79d2e195b9dbe5acc61f413c51fd19cebbcf07de --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,141 @@ +# Scripts to Train and Evaluate Chat Models + +## Fine-tuning + +In the handbook, we provide four main ways to align LLMs for chat: + +- Full fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on an 8 x A100 (80GB) node). 
+- LoRA or QLoRA fine-tuning on a single consumer 24GB GPU (tested on an RTX 4090).
+- LoRA fine-tuning on a multi-GPU machine with DeepSpeed ZeRO-3 (tested on 2 x A100s (80GB)).
+- QLoRA fine-tuning on a multi-GPU machine with FSDP (tested on 2 x A6000s (48GB)).
+
+In practice, we find comparable performance for both full and QLoRA fine-tuning, with the latter having the advantage of producing small adapter weights that are fast to upload to and download from the Hugging Face Hub. Here are the general commands to fine-tune your models:
+
+```shell
+# Full training with ZeRO-3 on 8 GPUs
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml
+
+# QLoRA 4-bit training on a single GPU
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_{task}.py recipes/{model_name}/{task}/config_qlora.yaml
+
+# LoRA training on a single GPU
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/multi_gpu.yaml --num_processes=1 scripts/run_{task}.py recipes/{model_name}/{task}/config_qlora.yaml --load_in_4bit=false
+
+# LoRA training with ZeRO-3 on two or more GPUs
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml --num_processes={num_gpus} scripts/run_{task}.py recipes/{model_name}/{task}/config_qlora.yaml --load_in_4bit=false
+
+# QLoRA training with FSDP on two or more GPUs
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/fsdp+qlora.yaml --num_processes={num_gpus} scripts/run_{task}.py recipes/{model_name}/{task}/config_qlora.yaml --torch_dtype=bfloat16 --bnb_4bit_quant_storage=bfloat16
+```
+
+Here `{task}` refers to the type of training you wish to run. Currently, the following tasks are supported:
+* continued pretraining `cpt` (note that `cpt` is only present in the `gpt-nl` example recipe)
+* supervised fine-tuning `sft`
+* direct preference optimisation `dpo`
+* odds ratio preference optimisation `orpo`
+
+`{model_name}` refers to the choice of a recipe in the `recipes` directory. For example, to replicate Zephyr-7B-β you can run:
+
+```shell
+# Step 1 - train SFT policy
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_sft.py recipes/zephyr-7b-beta/sft/config_full.yaml
+
+# Step 2 - align with DPO
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_dpo.py recipes/zephyr-7b-beta/dpo/config_full.yaml
+```
+
+**💡 Tip:** If you scale up/down the number of GPUs, we recommend also adjusting the per-device batch size or the number of gradient accumulation steps to keep the global batch size constant (and thus replicate our results).
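+To make this concrete: the global batch size is the product of the per-device batch size, the number of gradient accumulation steps, and the number of GPUs. A quick sanity check (a standalone sketch, not part of the handbook's scripts; the helper name is ours):
+
+```python
+# Global batch size = per-device batch size x gradient accumulation steps x number of GPUs.
+def global_batch_size(per_device_train_batch_size: int, gradient_accumulation_steps: int, num_gpus: int) -> int:
+    return per_device_train_batch_size * gradient_accumulation_steps * num_gpus
+
+# Moving a recipe from 8 GPUs to 2 GPUs: quadruple the accumulation steps to keep a global batch of 128.
+assert global_batch_size(per_device_train_batch_size=8, gradient_accumulation_steps=2, num_gpus=8) == 128
+assert global_batch_size(per_device_train_batch_size=8, gradient_accumulation_steps=8, num_gpus=2) == 128
+```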
+By default, these scripts will push each model to your Hugging Face Hub username, i.e. `{username}/{model_name}-{task}`. You can override the parameters in each YAML config by appending them to the command as follows:
+
+```shell
+# Change batch size, number of epochs etc
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml --per_device_train_batch_size=42 --num_train_epochs=5
+```
+
+## Logging with Weights and Biases
+
+By default, all training metrics are logged with TensorBoard. If you have a [Weights and Biases](https://wandb.ai/site) account and are logged in, you can view the training metrics by appending `--report_to=wandb`, e.g.
+
+```shell
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file recipes/accelerate_configs/deepspeed_zero3.yaml scripts/run_{task}.py recipes/{model_name}/{task}/config_full.yaml --report_to=wandb
+```
+
+## Launching jobs on a Slurm cluster
+
+If you have access to a Slurm cluster, we provide a `recipes/launch.slurm` script that will automatically queue training jobs for you. Here's how you can use it:
+
+```shell
+sbatch --job-name=handbook_{task} --nodes=1 recipes/launch.slurm {model_name} {task} {precision} {accelerator}
+```
+
+Here `{model_name}` and `{task}` are defined as above, while `{precision}` refers to the type of training (`full` vs `qlora`) and `{accelerator}` refers to the choice of 🤗 Accelerate config in `recipes/accelerate_configs`. If you wish to override the default config parameters, you can provide them by appending a space-separated string like `'--arg1=value1 --arg2=value2'`. Here's a concrete example to run SFT on 1 node of 8 GPUs:
+
+```shell
+# Launch on Slurm and override default hyperparameters
+sbatch --job-name=handbook_sft --nodes=1 recipes/launch.slurm zephyr-7b-beta sft full deepspeed_zero3 '--per_device_train_batch_size=42 --num_train_epochs=5'
+```
+
+You can scale the number of nodes by increasing the `--nodes` flag.
+
+**⚠️ Note:** The configuration in `recipes/launch.slurm` is optimised for the Hugging Face Compute Cluster and may require tweaking to be adapted to your own compute nodes.
+
+## Fine-tuning on your datasets
+
+Under the hood, each training script uses the `get_datasets()` function, which allows one to easily combine multiple datasets with varying proportions. For instance, this is how one can specify multiple datasets and which splits to combine in one of the YAML configs:
+
+```yaml
+dataset_mixer:
+  dataset_1: 0.5 # Use 50% of the training examples
+  dataset_2: 0.66 # Use 66% of the training examples
+  dataset_3: 0.10 # Use 10% of the training examples
+dataset_splits:
+- train_xxx # The training splits to mix
+- test_xxx # The test splits to mix
+```
+
+If you want to fine-tune on your own datasets, the main thing to keep in mind is how the chat templates are applied to the dataset blend. Since each task (SFT, DPO, ORPO, etc.) requires a different format, we assume the datasets have the following columns:
+
+**SFT**
+
+* `messages`: A list of `dicts` in the form `{"role": "{role}", "content": "{content}"}`.
+* See [ultrachat_200k](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) for an example.
+
+**DPO and ORPO**
+
+* `chosen`: A list of `dicts` in the form `{"role": "{role}", "content": "{content}"}` corresponding to the preferred dialogue.
+* `rejected`: A list of `dicts` in the form `{"role": "{role}", "content": "{content}"}` corresponding to the dispreferred dialogue.
+* See [ultrafeedback_binarized](https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized) for an example.
+
+We also find it useful to include dedicated splits per task in our datasets, so e.g. we have:
+
+* `{train,test}_sft`: Splits for SFT training.
+* `{train,test}_gen`: Splits for generation ranking like rejection sampling or PPO.
+* `{train,test}_prefs`: Splits for preference modelling, like reward modelling or DPO.
+
+If you format your dataset in the same way, our training scripts should work out of the box!
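+For illustration, here is a minimal sketch of the expected schemas, built with the `datasets` library (the row contents and repo id are placeholders, not real data):
+
+```python
+from datasets import Dataset, DatasetDict
+
+# SFT rows: a `messages` column holding a list of {"role", "content"} dicts per example.
+sft = Dataset.from_dict(
+    {
+        "messages": [
+            [
+                {"role": "user", "content": "What is RLHF?"},
+                {"role": "assistant", "content": "Reinforcement learning from human feedback."},
+            ]
+        ]
+    }
+)
+
+# DPO/ORPO rows: `chosen` and `rejected` columns holding the preferred and dispreferred dialogues.
+prefs = Dataset.from_dict(
+    {
+        "chosen": [[{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello, how can I help?"}]],
+        "rejected": [[{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Go away."}]],
+    }
+)
+
+# Name the splits per task so `get_datasets()` can mix them via `dataset_splits`.
+dataset = DatasetDict({"train_sft": sft, "test_sft": sft, "train_prefs": prefs, "test_prefs": prefs})
+# dataset.push_to_hub("your-username/my-alignment-dataset")  # then reference it in `dataset_mixer`
+```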
+ +## Evaluating chat models + +We recommend benchmarking chat models on: + +* [MT-Bench](https://huggingface.co/spaces/lmsys/mt-bench): a multi-turn benchmark spanning 80 dialogues and 10 domains. +* [AlpacaEval](https://github.com/tatsu-lab/alpaca_eval): a single-turn benchmark that evaluates the helpfulness of chat and instruct models against `text-davinci-003`. + +For both benchmarks, we have added support for the [Zephyr chat template](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full/blob/ac6e600eefcce74f5e8bae1035d4f66019e93190/tokenizer_config.json#L30) (which is the default produced by our scripts), so you can evaluate models produced by our scripts as follows: + +**MT-Bench** + +* Follow the installation instructions [here](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) +* Make sure the word `zephyr` exists in the `--model-path` argument when generating the model responses [here](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge#step-1-generate-model-answers-to-mt-bench-questions). This will ensure the correct chat template is loaded. For example, the following model name is valid: `--model-path {hub_username}/my-baby-zephyr` +* Generate the model responses and GPT-4 rankings. + +**AlpacaEval** + +* Follow the installation instructions [here](https://github.com/tatsu-lab/alpaca_eval#quick-start) +* Copy-paste the [config](https://github.com/tatsu-lab/alpaca_eval/blob/main/src/alpaca_eval/models_configs/zephyr-7b-beta/configs.yaml) for `zephyr-7b-beta` and place it in the `model_configs` directory under `{your_zephyr_model}`. + * Next, update the [config name](https://github.com/tatsu-lab/alpaca_eval/blob/2daa6e11b194653043ca74f735728dc068e04aae/src/alpaca_eval/models_configs/zephyr-7b-beta/configs.yaml#L1) and [Hub model ID](https://github.com/tatsu-lab/alpaca_eval/blob/2daa6e11b194653043ca74f735728dc068e04aae/src/alpaca_eval/models_configs/zephyr-7b-beta/configs.yaml#L5) to match your model name. +* Follow the steps to evaluate your model [here](https://github.com/tatsu-lab/alpaca_eval/tree/main#evaluating-a-model). + +Note that MT-Bench and AlpacaEval rely on LLMs like GPT-4 to judge the quality of the model responses, and thus the ranking exhibits various biases including a preference for models distilled from GPTs. For that reason, we also recommend submitting your best models for human evaluation in: + +* [Chatbot Arena](https://chat.lmsys.org): a live, human evaluation of chat models in head-to-head comparisons. diff --git a/scripts/run_cpt.py b/scripts/run_cpt.py new file mode 100644 index 0000000000000000000000000000000000000000..06c9e9d9fd16f151cde6b5ee77c3d0cf9e7545f2 --- /dev/null +++ b/scripts/run_cpt.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Continued pretraining script for decoder language models. 
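+The text column selected by the dataset mixer is packed into fixed-length sequences with TRL's SFTTrainer.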
+""" + +import logging +import random +import sys + +import datasets +import torch +import transformers +from transformers import set_seed + +from alignment import ( + DataArguments, + H4ArgumentParser, + ModelArguments, + SFTConfig, + get_checkpoint, + get_datasets, + get_kbit_device_map, + get_peft_config, + get_quantization_config, + get_tokenizer, +) +from trl import SFTTrainer + + +logger = logging.getLogger(__name__) + + +def main(): + parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig)) + model_args, data_args, training_args = parser.parse() + + # Set seed for reproducibility + set_seed(training_args.seed) + + ############### + # Setup logging + ############### + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process a small summary + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Model parameters {model_args}") + logger.info(f"Data parameters {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + + # Check for last checkpoint + last_checkpoint = get_checkpoint(training_args) + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") + + ############### + # Load datasets + ############### + raw_datasets = get_datasets( + data_args, + splits=data_args.dataset_splits, + configs=data_args.dataset_configs, + columns_to_keep=[data_args.text_column], + ) + + logger.info( + f"Training on the following datasets and their proportions:" + f" {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}" + ) + + train_dataset = raw_datasets["train"] if "train" in raw_datasets else None + eval_dataset = raw_datasets["test"] if "test" in raw_datasets else None + + if train_dataset is None: + raise ValueError( + "Training set must be included (so make sure that your dataset has a split with" " 'train' in the name)." 
+ ) + + if training_args.do_eval and eval_dataset is None: + raise ValueError("'--do_eval' enabled so make sure that your dataset has a split with 'test' in the name.") + + ################ + # Load tokenizer + ################ + tokenizer = get_tokenizer(model_args, data_args, auto_set_chat_template=False) + + with training_args.main_process_first(desc="Log a few random samples from the processed training set"): + for index in random.sample(range(len(raw_datasets["train"])), 3): + logger.info(f"Sample {index} of the processed training set:\n\n{raw_datasets['train'][index]['text']}") + + ####################### + # Load pretrained model + ####################### + logger.info("*** Load pretrained model ***") + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) + quantization_config = get_quantization_config(model_args) + + model_kwargs = dict( + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + + ######################## + # Initialize the Trainer + ######################## + trainer = SFTTrainer( + model=model_args.model_name_or_path, + model_init_kwargs=model_kwargs, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + dataset_text_field=data_args.text_column, + max_seq_length=training_args.max_seq_length, + tokenizer=tokenizer, + packing=True, + peft_config=get_peft_config(model_args), + dataset_kwargs=training_args.dataset_kwargs, + ) + + ############### + # Training loop + ############### + logger.info("*** Train ***") + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + metrics["train_samples"] = len(train_dataset) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + ################################## + # Save model and create model card + ################################## + logger.info("*** Save model ***") + trainer.save_model(training_args.output_dir) + logger.info(f"Model saved to {training_args.output_dir}") + + # Save everything else on main process + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": list(data_args.dataset_mixer.keys()), + "dataset_tags": list(data_args.dataset_mixer.keys()), + "tags": ["alignment-handbook"], + } + if trainer.accelerator.is_main_process: + trainer.create_model_card(**kwargs) + # Restore k,v cache for fast inference + trainer.model.config.use_cache = True + trainer.model.config.save_pretrained(training_args.output_dir) + + ########## + # Evaluate + ########## + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + metrics["eval_samples"] = len(eval_dataset) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.push_to_hub is True: + logger.info("Pushing to hub...") + trainer.push_to_hub(**kwargs) + + logger.info("*** Training complete ***") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_dpo.py 
b/scripts/run_dpo.py new file mode 100644 index 0000000000000000000000000000000000000000..972d969af417bce39469989339701c80fe89c69b --- /dev/null +++ b/scripts/run_dpo.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import random +import sys + +import torch +import transformers +from transformers import AutoModelForCausalLM, set_seed + +from alignment import ( + DataArguments, + DPOConfig, + H4ArgumentParser, + ModelArguments, + apply_chat_template, + decontaminate_humaneval, + get_checkpoint, + get_datasets, + get_kbit_device_map, + get_peft_config, + get_quantization_config, + get_tokenizer, + is_adapter_model, +) +from peft import PeftConfig, PeftModel +from trl import DPOTrainer + + +logger = logging.getLogger(__name__) + + +def main(): + parser = H4ArgumentParser((ModelArguments, DataArguments, DPOConfig)) + model_args, data_args, training_args = parser.parse() + + ####### + # Setup + ####### + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.info(f"Model parameters {model_args}") + logger.info(f"Data parameters {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + + # Check for last checkpoint + last_checkpoint = get_checkpoint(training_args) + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") + + # Set seed for reproducibility + set_seed(training_args.seed) + + ############### + # Load datasets + ############### + raw_datasets = get_datasets( + data_args, + splits=data_args.dataset_splits, + configs=data_args.dataset_configs, + columns_to_keep=["messages", "chosen", "rejected", "prompt", "completion", "label"], + ) + logger.info( + f"Training on the following splits: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}" + ) + column_names = list(raw_datasets["train"].features) + + ##################################### + # Load tokenizer and process datasets + ##################################### + data_args.truncation_side = "left" # Truncate from left to ensure we don't lose labels in final turn + tokenizer = get_tokenizer(model_args, data_args) + + ##################### + # Apply chat template + ##################### + raw_datasets = raw_datasets.map( + apply_chat_template, + fn_kwargs={ + "tokenizer": tokenizer, + "task": "dpo", + "auto_insert_empty_system_msg": data_args.auto_insert_empty_system_msg, + }, + num_proc=data_args.preprocessing_num_workers, + 
remove_columns=column_names, + desc="Formatting comparisons with prompt template", + ) + + ########################## + # Decontaminate benchmarks + ########################## + num_raw_train_samples = len(raw_datasets["train"]) + raw_datasets = raw_datasets.filter( + decontaminate_humaneval, + fn_kwargs={"text_column": "text_chosen"}, + batched=True, + batch_size=10_000, + num_proc=1, + desc="Decontaminating HumanEval samples", + ) + num_filtered_train_samples = num_raw_train_samples - len(raw_datasets["train"]) + logger.info( + f"Decontaminated {num_filtered_train_samples} ({num_filtered_train_samples/num_raw_train_samples * 100:.2f}%) samples from the training set." + ) + + # Replace column names with what TRL needs, text_chosen -> chosen and text_rejected -> rejected + for split in ["train", "test"]: + raw_datasets[split] = raw_datasets[split].rename_columns( + {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"} + ) + + # Log a few random samples from the training set: + for index in random.sample(range(len(raw_datasets["train"])), 3): + logger.info(f"Prompt sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['prompt']}") + logger.info(f"Chosen sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['chosen']}") + logger.info(f"Rejected sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['rejected']}") + + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) + quantization_config = get_quantization_config(model_args) + + model_kwargs = dict( + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + + model = model_args.model_name_or_path + if is_adapter_model(model, model_args.model_revision) is True: + logger.info(f"Loading SFT adapter for {model_args.model_name_or_path=}") + peft_config = PeftConfig.from_pretrained(model_args.model_name_or_path, revision=model_args.model_revision) + model_kwargs = dict( + revision=model_args.base_model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + base_model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + **model_kwargs, + ) + model = PeftModel.from_pretrained( + base_model, + model_args.model_name_or_path, + revision=model_args.model_revision, + ) + model_kwargs = None + + ref_model = model + ref_model_kwargs = model_kwargs + + if model_args.use_peft is True: + ref_model = None + ref_model_kwargs = None + + ######################### + # Instantiate DPO trainer + ######################### + trainer = DPOTrainer( + model, + ref_model, + model_init_kwargs=model_kwargs, + ref_model_init_kwargs=ref_model_kwargs, + args=training_args, + beta=training_args.beta, + train_dataset=raw_datasets["train"], + eval_dataset=raw_datasets["test"], + tokenizer=tokenizer, + max_length=training_args.max_length, + max_prompt_length=training_args.max_prompt_length, + 
peft_config=get_peft_config(model_args), + loss_type=training_args.loss_type, + ) + + ############### + # Training loop + ############### + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + metrics["train_samples"] = len(raw_datasets["train"]) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + logger.info("*** Training complete ***") + + ################################## + # Save model and create model card + ################################## + logger.info("*** Save model ***") + trainer.save_model(training_args.output_dir) + logger.info(f"Model saved to {training_args.output_dir}") + + # Save everything else on main process + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": list(data_args.dataset_mixer.keys()), + "dataset_tags": list(data_args.dataset_mixer.keys()), + "tags": ["alignment-handbook"], + } + if trainer.accelerator.is_main_process: + trainer.create_model_card(**kwargs) + # Restore k,v cache for fast inference + trainer.model.config.use_cache = True + trainer.model.config.save_pretrained(training_args.output_dir) + + ########## + # Evaluate + ########## + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + metrics["eval_samples"] = len(raw_datasets["test"]) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.push_to_hub is True: + logger.info("Pushing to hub...") + trainer.push_to_hub(**kwargs) + + logger.info("*** Training complete! ***") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_orpo.py b/scripts/run_orpo.py new file mode 100644 index 0000000000000000000000000000000000000000..ce864d310fa7095988d0b799b559bb5e382208fa --- /dev/null +++ b/scripts/run_orpo.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
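+"""
+Odds ratio preference optimisation (ORPO) training script for decoder language models.
+"""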
+ +import logging +import random +import sys +from typing import Any, Dict + +import torch +import transformers +from transformers import AutoModelForCausalLM, set_seed + +from alignment import ( + DataArguments, + H4ArgumentParser, + ModelArguments, + apply_chat_template, + decontaminate_humaneval, + get_checkpoint, + get_datasets, + get_kbit_device_map, + get_peft_config, + get_quantization_config, + get_tokenizer, +) +from trl import ORPOConfig, ORPOTrainer, setup_chat_format + + +logger = logging.getLogger(__name__) + + +def main(): + parser = H4ArgumentParser((ModelArguments, DataArguments, ORPOConfig)) + model_args, data_args, training_args = parser.parse() + + ####### + # Setup + ####### + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.info(f"Model parameters {model_args}") + logger.info(f"Data parameters {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + + # Check for last checkpoint + last_checkpoint = get_checkpoint(training_args) + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") + + # Set seed for reproducibility + set_seed(training_args.seed) + + ############### + # Load datasets + ############### + raw_datasets = get_datasets( + data_args, + splits=data_args.dataset_splits, + configs=data_args.dataset_configs, + columns_to_keep=[ + "prompt", + "chosen", + "rejected", + ], + ) + logger.info( + f"Training on the following splits: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}" + ) + column_names = list(raw_datasets["train"].features) + + ##################################### + # Load tokenizer and process datasets + ##################################### + data_args.truncation_side = "left" # Truncate from left to ensure we don't lose labels in final turn + tokenizer = get_tokenizer(model_args, data_args) + + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) + quantization_config = get_quantization_config(model_args) + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + + # For ChatML we need to add special tokens and resize the embedding layer + if "<|im_start|>" in tokenizer.chat_template: + model, tokenizer = setup_chat_format(model, tokenizer) + + ##################### + # Apply chat template + ##################### + raw_datasets = raw_datasets.map( + apply_chat_template, + fn_kwargs={ + "tokenizer": tokenizer, + "task": "orpo", + "auto_insert_empty_system_msg": data_args.auto_insert_empty_system_msg, + }, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + desc="Formatting comparisons with 
prompt template", + ) + + ############################# + # Filter out seq > max_length + ############################# + if training_args.max_prompt_length is not None: + unfiltered_train_samples = len(raw_datasets["train"]) + if "test" in raw_datasets: + unfiltered_test_samples = len(raw_datasets["test"]) + + def filter_fn(sample: Dict[str, Any]) -> Dict[str, Any]: + prompt_length = tokenizer( + sample["text_prompt"], + return_tensors="pt", + )[ + "input_ids" + ].size(dim=-1) + + return prompt_length < training_args.max_prompt_length + + raw_datasets = raw_datasets.filter( + filter_fn, + desc="Filtering out the samples where len(text_prompt) > max_prompt_length", + ) + + filtered_train_samples = unfiltered_train_samples - len(raw_datasets["train"]) + logger.info( + f"Filtered out {filtered_train_samples} training samples out of the {unfiltered_train_samples} samples." + ) + if "test" in raw_datasets: + filtered_test_samples = unfiltered_test_samples - len(raw_datasets["test"]) + logger.info( + f"Filtered out {filtered_test_samples} test samples out of the {unfiltered_test_samples} samples." + ) + + ########################## + # Decontaminate benchmarks + ########################## + num_raw_train_samples = len(raw_datasets["train"]) + raw_datasets = raw_datasets.filter( + decontaminate_humaneval, + fn_kwargs={"text_column": "text_chosen"}, + batched=True, + batch_size=10_000, + num_proc=1, + desc="Decontaminating HumanEval samples", + ) + num_filtered_train_samples = num_raw_train_samples - len(raw_datasets["train"]) + logger.info( + f"Decontaminated {num_filtered_train_samples} ({num_filtered_train_samples/num_raw_train_samples * 100:.2f}%) samples from the training set." + ) + + # Replace column names with what TRL needs, text_prompt -> prompt, text_chosen -> chosen and text_rejected -> rejected + for split in raw_datasets.keys(): + raw_datasets[split] = raw_datasets[split].rename_columns( + { + "text_prompt": "prompt", + "text_chosen": "chosen", + "text_rejected": "rejected", + } + ) + + # Log a few random samples from the training set: + for index in random.sample(range(len(raw_datasets["train"])), 3): + logger.info(f"Prompt sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['prompt']}") + logger.info(f"Chosen sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['chosen']}") + logger.info(f"Rejected sample {index} of the raw training set:\n\n{raw_datasets['train'][index]['rejected']}") + + ########################## + # Instantiate ORPO trainer + ########################## + trainer = ORPOTrainer( + model, + args=training_args, + train_dataset=raw_datasets["train"], + eval_dataset=raw_datasets["test"] if "test" in raw_datasets else None, + tokenizer=tokenizer, + peft_config=get_peft_config(model_args), # type: ignore + ) + + ############### + # Training loop + ############### + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + metrics["train_samples"] = len(raw_datasets["train"]) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + logger.info("*** Training complete ***") + + ################################## + # Save model and create model card + ################################## + logger.info("*** Save model ***") + if trainer.is_fsdp_enabled: + 
trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") + trainer.save_model(training_args.output_dir) + logger.info(f"Model saved to {training_args.output_dir}") + + # Save everything else on main process + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": list(data_args.dataset_mixer.keys()), + "dataset_tags": list(data_args.dataset_mixer.keys()), + "tags": ["alignment-handbook"], + } + if trainer.accelerator.is_main_process: + trainer.create_model_card(**kwargs) + # Restore k,v cache for fast inference + trainer.model.config.use_cache = True + trainer.model.config.save_pretrained(training_args.output_dir) + + ########## + # Evaluate + ########## + if training_args.do_eval and "test" in raw_datasets: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + metrics["eval_samples"] = len(raw_datasets["test"]) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.push_to_hub is True: + logger.info("Pushing to hub...") + trainer.push_to_hub(**kwargs) + + logger.info("*** Training complete! ***") + + +if __name__ == "__main__": + main() diff --git a/scripts/run_sft.py b/scripts/run_sft.py new file mode 100644 index 0000000000000000000000000000000000000000..60a2dfdb6ba38b8045e540ea94bcbe84d225403a --- /dev/null +++ b/scripts/run_sft.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Supervised fine-tuning script for decoder language models. 
+""" + +import logging +import random +import sys + +import datasets +import torch +import transformers +from transformers import AutoModelForCausalLM, set_seed + +from alignment import ( + DataArguments, + H4ArgumentParser, + ModelArguments, + SFTConfig, + apply_chat_template, + decontaminate_humaneval, + get_checkpoint, + get_datasets, + get_kbit_device_map, + get_peft_config, + get_quantization_config, + get_tokenizer, +) +from trl import SFTTrainer, setup_chat_format + + +logger = logging.getLogger(__name__) + + +def main(): + parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig)) + model_args, data_args, training_args = parser.parse() + + # Set seed for reproducibility + set_seed(training_args.seed) + + ############### + # Setup logging + ############### + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process a small summary + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Model parameters {model_args}") + logger.info(f"Data parameters {data_args}") + logger.info(f"Training/evaluation parameters {training_args}") + + # Check for last checkpoint + last_checkpoint = get_checkpoint(training_args) + if last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.") + + ############### + # Load datasets + ############### + raw_datasets = get_datasets( + data_args, + splits=data_args.dataset_splits, + configs=data_args.dataset_configs, + columns_to_keep=["messages", "chosen", "rejected", "prompt", "completion", "label"], + ) + logger.info( + f"Training on the following datasets and their proportions: {[split + ' : ' + str(dset.num_rows) for split, dset in raw_datasets.items()]}" + ) + column_names = list(raw_datasets["train"].features) + + ################ + # Load tokenizer + ################ + tokenizer = get_tokenizer(model_args, data_args) + + ####################### + # Load pretrained model + ####################### + logger.info("*** Load pretrained model ***") + torch_dtype = ( + model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) + ) + quantization_config = get_quantization_config(model_args) + + model_kwargs = dict( + revision=model_args.model_revision, + trust_remote_code=model_args.trust_remote_code, + attn_implementation=model_args.attn_implementation, + torch_dtype=torch_dtype, + use_cache=False if training_args.gradient_checkpointing else True, + device_map=get_kbit_device_map() if quantization_config is not None else None, + quantization_config=quantization_config, + ) + + model = model_args.model_name_or_path + # For ChatML we need to add special tokens and resize the embedding layer + if "<|im_start|>" in tokenizer.chat_template and "gemma-tokenizer-chatml" not in tokenizer.name_or_path: + model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs) 
+ model, tokenizer = setup_chat_format(model, tokenizer) + model_kwargs = None + + ##################### + # Apply chat template + ##################### + raw_datasets = raw_datasets.map( + apply_chat_template, + fn_kwargs={ + "tokenizer": tokenizer, + "task": "sft", + "auto_insert_empty_system_msg": data_args.auto_insert_empty_system_msg, + }, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + desc="Applying chat template", + ) + + ########################## + # Decontaminate benchmarks + ########################## + num_raw_train_samples = len(raw_datasets["train"]) + raw_datasets = raw_datasets.filter(decontaminate_humaneval, batched=True, batch_size=10_000, num_proc=1) + num_filtered_train_samples = num_raw_train_samples - len(raw_datasets["train"]) + logger.info( + f"Decontaminated {num_filtered_train_samples} ({num_filtered_train_samples/num_raw_train_samples * 100:.2f}%) samples from the training set." + ) + + train_dataset = raw_datasets["train"] + eval_dataset = raw_datasets["test"] + + with training_args.main_process_first(desc="Log a few random samples from the processed training set"): + for index in random.sample(range(len(raw_datasets["train"])), 3): + logger.info(f"Sample {index} of the processed training set:\n\n{raw_datasets['train'][index]['text']}") + + ######################## + # Initialize the Trainer + ######################## + trainer = SFTTrainer( + model=model, + model_init_kwargs=model_kwargs, + args=training_args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + dataset_text_field="text", + max_seq_length=training_args.max_seq_length, + tokenizer=tokenizer, + packing=True, + peft_config=get_peft_config(model_args), + dataset_kwargs=training_args.dataset_kwargs, + ) + + ############### + # Training loop + ############### + logger.info("*** Train ***") + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: + checkpoint = last_checkpoint + train_result = trainer.train(resume_from_checkpoint=checkpoint) + metrics = train_result.metrics + metrics["train_samples"] = len(train_dataset) + trainer.log_metrics("train", metrics) + trainer.save_metrics("train", metrics) + trainer.save_state() + + ################################## + # Save model and create model card + ################################## + logger.info("*** Save model ***") + trainer.save_model(training_args.output_dir) + logger.info(f"Model saved to {training_args.output_dir}") + + # Save everything else on main process + kwargs = { + "finetuned_from": model_args.model_name_or_path, + "dataset": list(data_args.dataset_mixer.keys()), + "dataset_tags": list(data_args.dataset_mixer.keys()), + "tags": ["alignment-handbook"], + } + if trainer.accelerator.is_main_process: + trainer.create_model_card(**kwargs) + # Restore k,v cache for fast inference + trainer.model.config.use_cache = True + trainer.model.config.save_pretrained(training_args.output_dir) + + ########## + # Evaluate + ########## + if training_args.do_eval: + logger.info("*** Evaluate ***") + metrics = trainer.evaluate() + metrics["eval_samples"] = len(eval_dataset) + trainer.log_metrics("eval", metrics) + trainer.save_metrics("eval", metrics) + + if training_args.push_to_hub is True: + logger.info("Pushing to hub...") + trainer.push_to_hub(**kwargs) + + logger.info("*** Training complete ***") + + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 
0000000000000000000000000000000000000000..c2ee645befd5933b618f5f9912609acc5fab1445 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,41 @@ +[isort] +default_section = FIRSTPARTY +ensure_newline_before_comments = True +force_grid_wrap = 0 +include_trailing_comma = True +known_first_party = alignment +known_third_party = + transformers + datasets + fugashi + git + h5py + matplotlib + nltk + numpy + packaging + pandas + psutil + pytest + rouge_score + sacrebleu + seqeval + sklearn + streamlit + torch + tqdm + +line_length = 119 +lines_after_imports = 2 +multi_line_output = 3 +use_parentheses = True + +[flake8] +ignore = E203, E501, E741, W503, W605 +max-line-length = 119 +per-file-ignores = + # imported but unused + __init__.py: F401 + +[tool:pytest] +doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..73214bf51cff0521d5c71528e9664e29ae1cf786 --- /dev/null +++ b/setup.py @@ -0,0 +1,147 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Adapted from huggingface/transformers: https://github.com/huggingface/transformers/blob/21a2d900eceeded7be9edc445b56877b95eda4ca/setup.py + + +import re +import shutil +from pathlib import Path + +from setuptools import find_packages, setup + + +# Remove stale alignment.egg-info directory to avoid https://github.com/pypa/pip/issues/5466 +stale_egg_info = Path(__file__).parent / "alignment.egg-info" +if stale_egg_info.exists(): + print( + ( + "Warning: {} exists.\n\n" + "If you recently updated alignment, this is expected,\n" + "but it may prevent alignment from installing in editable mode.\n\n" + "This directory is automatically generated by Python's packaging tools.\n" + "I will remove it now.\n\n" + "See https://github.com/pypa/pip/issues/5466 for details.\n" + ).format(stale_egg_info) + ) + shutil.rmtree(stale_egg_info) + + +# IMPORTANT: all dependencies should be listed here with their version requirements, if any. +# * If a dependency is fast-moving (e.g. transformers), pin to the exact version +_deps = [ + "accelerate>=0.29.2", + "bitsandbytes>=0.43.0", + "black>=24.4.2", + "datasets>=2.18.0", + "deepspeed>=0.14.4", + "einops>=0.6.1", + "evaluate==0.4.0", + "flake8>=6.0.0", + "hf-doc-builder>=0.4.0", + "hf_transfer>=0.1.4", + "huggingface-hub>=0.19.2,<1.0", + "isort>=5.12.0", + "ninja>=1.11.1", + "numpy>=1.24.2", + "packaging>=23.0", + "parameterized>=0.9.0", + "peft>=0.9.0", + "protobuf<=3.20.2", # Needed to avoid conflicts with `transformers` + "pytest", + "safetensors>=0.3.3", + "sentencepiece>=0.1.99", + "scipy", + "tensorboard", + "torch>=2.1.2", + "transformers>=4.39.3", + "trl>=0.9.6", + "jinja2>=3.0.0", + "tqdm>=4.64.1", +] + +# this is a lookup table with items like: +# +# tokenizers: "tokenizers==0.9.4" +# packaging: "packaging" +# +# some of the values are versioned whereas others aren't. 
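+#
+# e.g. _deps = ["accelerate>=0.29.2", "pytest"] yields
+#      {"accelerate": "accelerate>=0.29.2", "pytest": "pytest"}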
+deps = {b: a for a, b in (re.findall(r"^(([^!=<>~ \[\]]+)(?:\[[^\]]+\])?(?:[!=<>~ ].*)?$)", x)[0] for x in _deps)} + + +def deps_list(*pkgs): + return [deps[pkg] for pkg in pkgs] + + +extras = {} +extras["tests"] = deps_list("pytest", "parameterized") +extras["torch"] = deps_list("torch") +extras["quality"] = deps_list("black", "isort", "flake8") +extras["docs"] = deps_list("hf-doc-builder") +extras["dev"] = extras["docs"] + extras["quality"] + extras["tests"] + +# core dependencies shared across the whole project - keep this to a bare minimum :) +install_requires = [ + deps["accelerate"], + deps["bitsandbytes"], + deps["einops"], + deps["evaluate"], + deps["datasets"], + deps["deepspeed"], + deps["hf_transfer"], + deps["huggingface-hub"], + deps["jinja2"], + deps["ninja"], + deps["numpy"], + deps["packaging"], # utilities from PyPA to e.g., compare versions + deps["peft"], + deps["protobuf"], + deps["safetensors"], + deps["sentencepiece"], + deps["scipy"], + deps["tensorboard"], + deps["tqdm"], # progress bars in model download and training scripts + deps["transformers"], + deps["trl"], +] + +setup( + name="alignment-handbook", + version="0.4.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + author="The Hugging Face team (past and future)", + author_email="lewis@huggingface.co", + description="The Alignment Handbook", + long_description=open("README.md", "r", encoding="utf-8").read(), + long_description_content_type="text/markdown", + keywords="nlp deep learning rlhf llm", + license="Apache", + url="https://github.com/huggingface/alignment-handbook", + package_dir={"": "src"}, + packages=find_packages("src"), + zip_safe=False, + extras_require=extras, + python_requires=">=3.10.9", + install_requires=install_requires, + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + ], +) diff --git a/src/alignment/__init__.py b/src/alignment/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2fafb542a7754d1fb6c0deb90463afcf06a94c6a --- /dev/null +++ b/src/alignment/__init__.py @@ -0,0 +1,31 @@ +__version__ = "0.3.0.dev0" + +from .configs import DataArguments, DPOConfig, H4ArgumentParser, ModelArguments, SFTConfig +from .data import apply_chat_template, get_datasets +from .decontaminate import decontaminate_humaneval +from .model_utils import ( + get_checkpoint, + get_kbit_device_map, + get_peft_config, + get_quantization_config, + get_tokenizer, + is_adapter_model, +) + + +__all__ = [ + "DataArguments", + "DPOConfig", + "H4ArgumentParser", + "ModelArguments", + "SFTConfig", + "apply_chat_template", + "get_datasets", + "decontaminate_humaneval", + "get_checkpoint", + "get_kbit_device_map", + "get_peft_config", + "get_quantization_config", + "get_tokenizer", + "is_adapter_model", +] diff --git a/src/alignment/configs.py b/src/alignment/configs.py new file mode 100644 index 0000000000000000000000000000000000000000..aff079202af93c273af3f2e9f724c5651ba76721 --- /dev/null +++ b/src/alignment/configs.py @@ -0,0 +1,271 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import dataclasses +import os +import sys +from dataclasses import dataclass, field +from typing import Any, Dict, List, NewType, Optional, Tuple + +from transformers import MODEL_FOR_CAUSAL_LM_MAPPING, HfArgumentParser + +import trl + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +DataClassType = NewType("DataClassType", Any) + + +class H4ArgumentParser(HfArgumentParser): + def parse_yaml_and_args(self, yaml_arg: str, other_args: Optional[List[str]] = None) -> List[dataclass]: + """ + Parse a YAML file and overwrite the default/loaded values with the values provided to the command line. + + Args: + yaml_arg (`str`): + The path to the config file used + other_args (`List[str]`, *optional`): + A list of strings to parse as command line arguments, e.g. ['--arg=val', '--arg2=val2']. + + Returns: + [`List[dataclass]`]: a list of dataclasses with the values from the YAML file and the command line + """ + arg_list = self.parse_yaml_file(os.path.abspath(yaml_arg)) + + outputs = [] + # strip other args list into dict of key-value pairs + other_args = {arg.split("=")[0].strip("-"): arg.split("=")[1] for arg in other_args} + used_args = {} + + # overwrite the default/loaded value with the value provided to the command line + # adapted from https://github.com/huggingface/transformers/blob/d0b5002378daabf62769159add3e7d66d3f83c3b/src/transformers/hf_argparser.py#L327 + for data_yaml, data_class in zip(arg_list, self.dataclass_types): + keys = {f.name for f in dataclasses.fields(data_yaml) if f.init} + inputs = {k: v for k, v in vars(data_yaml).items() if k in keys} + for arg, val in other_args.items(): + # add only if in keys + + if arg in keys: + base_type = data_yaml.__dataclass_fields__[arg].type + inputs[arg] = val + + # cast type for ints, floats (default to strings) + if base_type in [int, float]: + inputs[arg] = base_type(val) + + if base_type == List[str]: + inputs[arg] = [str(v) for v in val.split(",")] + + # bool of a non-empty string is True, so we manually check for bools + if base_type is bool: + if val in ["true", "True"]: + inputs[arg] = True + else: + inputs[arg] = False + + # add to used-args so we can check if double add + if arg not in used_args: + used_args[arg] = val + else: + raise ValueError(f"Duplicate argument provided: {arg}, may cause unexpected behavior") + + obj = data_class(**inputs) + outputs.append(obj) + + return outputs + + def parse(self) -> DataClassType | Tuple[DataClassType]: + if len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"): + # If we pass only one argument to the script and it's the path to a YAML file, + # let's parse it to get our arguments. 
+ output = self.parse_yaml_file(os.path.abspath(sys.argv[1]))
+ # parse command line args and yaml file
+ elif len(sys.argv) > 2 and sys.argv[1].endswith(".yaml"):
+ output = self.parse_yaml_and_args(os.path.abspath(sys.argv[1]), sys.argv[2:])
+ # parse command line args only
+ else:
+ output = self.parse_args_into_dataclasses()
+
+ if len(output) == 1:
+ output = output[0]
+ return output
+
+
+@dataclass
+class ModelArguments:
+ """
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune.
+ """
+
+ base_model_revision: Optional[str] = field(
+ default=None,
+ metadata={"help": ("The base model checkpoint for weights initialization with PEFT adapters.")},
+ )
+ model_name_or_path: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": (
+ "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch."
+ )
+ },
+ )
+ model_revision: str = field(
+ default="main",
+ metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+ )
+ model_code_revision: Optional[str] = field(default=None, metadata={"help": "The branch of the IFT model"})
+ torch_dtype: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": (
+ "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the "
+ "dtype will be automatically derived from the model's weights."
+ ),
+ "choices": ["auto", "bfloat16", "float16", "float32"],
+ },
+ )
+ tokenizer_name_or_path: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": (
+ "The path to the tokenizer. Useful if you want to use a different tokenizer from the one stored in `model_name_or_path`."
+ )
+ },
+ )
+ trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."})
+ attn_implementation: Optional[str] = field(
+ default=None,
+ metadata={
+ "help": (
+ "Which attention implementation to use; you can use --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`"
+ )
+ },
+ )
+ use_peft: bool = field(
+ default=False,
+ metadata={"help": ("Whether to use PEFT or not for training.")},
+ )
+ lora_r: Optional[int] = field(
+ default=16,
+ metadata={"help": ("LoRA R value.")},
+ )
+ lora_alpha: Optional[int] = field(
+ default=32,
+ metadata={"help": ("LoRA alpha.")},
+ )
+ lora_dropout: Optional[float] = field(
+ default=0.05,
+ metadata={"help": ("LoRA dropout.")},
+ )
+ lora_target_modules: Optional[List[str]] = field(
+ default=None,
+ metadata={"help": ("LoRA target modules.")},
+ )
+ lora_modules_to_save: Optional[List[str]] = field(
+ default=None,
+ metadata={"help": ("Model layers to unfreeze & train")},
+ )
+ load_in_8bit: bool = field(default=False, metadata={"help": "use 8-bit precision"})
+ load_in_4bit: bool = field(default=False, metadata={"help": "use 4-bit precision"})
+
+ bnb_4bit_quant_type: Optional[str] = field(
+ default="nf4", metadata={"help": "specify the quantization type (fp4 or nf4)"}
+ )
+ use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"})
+ bnb_4bit_quant_storage: Optional[str] = field(
+ default="uint8",
+ metadata={"help": "storage type to pack the quantized 4-bit params."},
+ )
+
+ def __post_init__(self):
+ if self.load_in_8bit and self.load_in_4bit:
+ raise ValueError("You can't use 8-bit and 4-bit precision at the same time")
+
+
+@dataclass
+class DataArguments:
+ """
+ Arguments pertaining to what data we are going
to input our model for training and eval. + """ + + chat_template: Optional[str] = field(default=None, metadata={"help": "The chat template to use."}) + dataset_mixer: Optional[Dict[str, float]] = field( + default=None, + metadata={"help": ("Datasets and their proportions to be used for training ift/rl.")}, + ) + text_column: Optional[str] = field( + default="text", + metadata={"help": "The column name to use for the text in the dataset (only used for continued pretraining)."}, + ) + dataset_splits: Optional[List[str]] = field( + default_factory=lambda: ["train", "test"], + metadata={"help": ("List of train test splits to use in the dataset")}, + ) + dataset_configs: Optional[List[str]] = field( + default=None, + metadata={"help": "List of dataset config names. If given must be the same length as 'dataset_mixer' keys."}, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) + truncation_side: Optional[str] = field( + default=None, metadata={"help": "Truncation side to use for the tokenizer."} + ) + auto_insert_empty_system_msg: bool = field( + default=True, + metadata={ + "help": ( + "Whether to automatically insert an empty system message as the first message if `system` is mentioned in the chat template." + ) + }, + ) + + +@dataclass +class SFTConfig(trl.SFTConfig): + """ + Arguments related to the training process itself. For all parameters, see: https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/trainer#transformers.TrainingArguments + Also used for the continued pretraining task. + """ + + hub_model_revision: Optional[str] = field( + default="main", + metadata={"help": ("The Hub model branch to push the model to.")}, + ) + logging_first_step: bool = field( + default=True, + metadata={"help": ("Whether to log and evaluate the first global_step or not.")}, + ) + + +@dataclass +class DPOConfig(trl.DPOConfig): + """ + Arguments related to the DPO training process itself. For all parameters, see: https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/trainer#transformers.TrainingArguments + """ + + hub_model_revision: Optional[str] = field( + default="main", + metadata={"help": ("The Hub model branch to push the model to.")}, + ) + logging_first_step: bool = field( + default=True, + metadata={"help": ("Whether to log and evaluate the first global_step or not.")}, + ) + optim: Optional[str] = field(default="rmsprop") + remove_unused_columns: bool = field(default=False) diff --git a/src/alignment/data.py b/src/alignment/data.py new file mode 100644 index 0000000000000000000000000000000000000000..56a4af62a89d7c8d7a27f6be0ac90ffd18bcb88d --- /dev/null +++ b/src/alignment/data.py @@ -0,0 +1,256 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
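+# Dataset loading, mixing, and chat-template application utilities for the training scripts.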
+
+import os
+from typing import Any, List, Literal, Optional
+
+from datasets import DatasetDict, concatenate_datasets, load_dataset, load_from_disk
+from datasets.builder import DatasetGenerationError
+
+from .configs import DataArguments
+
+
+DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+
+
+def maybe_insert_system_message(messages, tokenizer):
+    if messages[0]["role"] == "system":
+        return
+
+    # chat template can be one of two attributes, we check in order
+    chat_template = tokenizer.chat_template
+    if chat_template is None:
+        chat_template = tokenizer.get_chat_template()
+
+    # confirm the jinja template refers to a system message before inserting
+    if "system" in chat_template or "<|im_start|>" in chat_template:
+        messages.insert(0, {"role": "system", "content": ""})
+
+
+def apply_chat_template(
+    example,
+    tokenizer,
+    task: Literal["sft", "generation", "rm", "dpo", "orpo"],
+    auto_insert_empty_system_msg: bool = True,
+):
+    if task in ["sft", "generation"]:
+        messages = example["messages"]
+        # We add an empty system message if there is none
+        if auto_insert_empty_system_msg:
+            maybe_insert_system_message(messages, tokenizer)
+        example["text"] = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=(task == "generation"),
+        )
+    elif task == "rm":
+        if all(k in example.keys() for k in ("chosen", "rejected")):
+            chosen_messages = example["chosen"]
+            rejected_messages = example["rejected"]
+            # We add an empty system message if there is none
+            if auto_insert_empty_system_msg:
+                maybe_insert_system_message(chosen_messages, tokenizer)
+                maybe_insert_system_message(rejected_messages, tokenizer)
+
+            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
+            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
+        else:
+            raise ValueError(
+                f"Could not format example as dialogue for `rm` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
+            )
+    elif task in ["dpo", "orpo"]:
+        if all(k in example.keys() for k in ("chosen", "rejected")):
+            if not is_openai_format(example["chosen"]) or not is_openai_format(example["rejected"]):
+                raise ValueError(
+                    f"Could not format example as dialogue for `{task}` task! Require OpenAI format for all messages"
+                )
+
+            # For DPO/ORPO, the inputs are triples of (prompt, chosen, rejected), where `chosen` and `rejected` are the final turn of a dialogue
+            # We therefore need to extract the N-1 turns to form the prompt
+            if "prompt" in example and is_openai_format(example["prompt"]):
+                prompt_messages = example["prompt"]
+                chosen_messages = example["chosen"]
+                rejected_messages = example["rejected"]
+            else:
+                prompt_messages = example["chosen"][:-1]
+                # Now we extract the final turn to define chosen/rejected responses
+                chosen_messages = example["chosen"][-1:]
+                rejected_messages = example["rejected"][-1:]
+
+            # Prepend a system message if the first message is not a system message
+            if auto_insert_empty_system_msg:
+                maybe_insert_system_message(prompt_messages, tokenizer)
+
+            example["text_prompt"] = tokenizer.apply_chat_template(prompt_messages, tokenize=False)
+            example["text_chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False)
+            example["text_rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False)
+        else:
+            raise ValueError(
+                f"Could not format example as dialogue for `{task}` task! Require either the "
+                f"`[chosen, rejected]` or `[prompt, chosen, rejected]` keys but found {list(example.keys())}"
+            )
+    else:
+        raise ValueError(
+            f"Task {task} not supported, please ensure that the provided task is one of ['sft', 'generation', 'rm', 'dpo', 'orpo']"
+        )
+    return example
+
+
+def is_openai_format(messages: Any) -> bool:
+    """
+    Check if the input messages are in OpenAI format.
+
+    Args:
+        messages (`Any`):
+            Messages to check.
+
+    Returns:
+        `bool`: Whether the messages are in OpenAI format.
+    """
+    if isinstance(messages, list) and all(isinstance(message, dict) for message in messages):
+        return all("role" in message and "content" in message for message in messages)
+    return False
+
+
+def get_datasets(
+    data_config: DataArguments | dict,
+    splits: Optional[List[str]] = None,
+    configs: Optional[List[str]] = None,
+    columns_to_keep: Optional[List[str]] = None,
+    shuffle: bool = True,
+) -> DatasetDict:
+    """
+    Loads one or more datasets with varying training set proportions.
+
+    Args:
+        data_config (`DataArguments` or `dict`):
+            Dataset configuration and split proportions.
+        splits (`List[str]`, *optional*, defaults to `['train', 'test']`):
+            Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix.
+        configs (`List[str]`, *optional*, defaults to `None`):
+            List of dataset config names. If given, must be the same length as the `data_config` keys.
+        columns_to_keep (`List[str]`, *optional*, defaults to `None`):
+            Column names to keep in the dataset. Useful in the datamixer to avoid schema conflicts,
+            and for cpt this should be (at least) the text column.
+        shuffle (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the training and testing/validation data.
+
+    Returns:
+        [`DatasetDict`]: The dataset dictionary containing the loaded datasets.
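+
+    Example (illustrative only — assumes the named Hub dataset, splits, and column exist):
+
+    ```python
+    data_args = DataArguments(dataset_mixer={"HuggingFaceH4/ultrachat_200k": 1.0})
+    raw_datasets = get_datasets(data_args, splits=["train_sft", "test_sft"], columns_to_keep=["messages"])
+    ```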
+ """ + if type(data_config) is DataArguments: + # Structure of the config to read the datasets and their mix + # datasets_mixer: + # - 'dataset1': 0.5 + # - 'dataset2': 0.3 + # - 'dataset3': 0.2 + dataset_mixer = data_config.dataset_mixer + elif isinstance(data_config, dict): + # Structure of the input is: + # dataset_mixer = { + # "dataset1": 0.5, + # "dataset1": 0.3, + # "dataset1": 0.2, + # } + dataset_mixer = data_config + else: + raise ValueError(f"Data config {data_config} not recognized.") + + raw_datasets = mix_datasets( + dataset_mixer, + splits=splits, + configs=configs, + columns_to_keep=columns_to_keep, + shuffle=shuffle, + ) + return raw_datasets + + +def mix_datasets( + dataset_mixer: dict, + splits: Optional[List[str]] = None, + configs: Optional[List[str]] = None, + columns_to_keep: Optional[List[str]] = None, + shuffle=True, +) -> DatasetDict: + """ + Loads and mixes datasets according to proportions specified in `dataset_mixer`. + + Args: + dataset_mixer (`dict`): + Dictionary containing the dataset names and their training proportions. By default, all test proportions are 1. + splits (Optional[List[str]], *optional*, defaults to `None`): + Dataset splits to load and mix. Assumes the splits exist in all datasets and have a `train_` or `test_` prefix. + configs (Optional[List[str]], *optional*, defaults to `None`): + List of dataset config names. If given must be the same length as 'dataset_mixer' keys. + columns_to_keep (Optional[List[str]], *optional*, defaults to `None`): + Column names to keep in the dataset. Useful in the datamixer to avoid schema conflicts, + and for cpt this should be (at least) the text column. + shuffle (`bool`, *optional*, defaults to `True`): + Whether to shuffle the training and testing/validation data. 
+ """ + splits = ["train", "test"] if splits is None else splits + configs = [None] * len(dataset_mixer) if not configs else configs + columns_to_keep = [] if columns_to_keep is None else columns_to_keep + + if configs is not None and len(configs) != len(dataset_mixer): + raise ValueError("The number of given dataset config names must be the same as the given number of datasets.") + + raw_datasets = DatasetDict() + raw_train_datasets = [] + raw_val_datasets = [] + fracs = [] + for (ds, frac), ds_config in zip(dataset_mixer.items(), configs): + fracs.append(frac) + for split in splits: + try: + # Try first if dataset on a Hub repo + dataset = load_dataset(ds, ds_config, split=split) + except DatasetGenerationError: + # If not, check local dataset + dataset = load_from_disk(os.path.join(ds, split)) + + # Remove redundant columns to avoid schema conflicts on load + dataset = dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep]) + if "train" in split: + raw_train_datasets.append(dataset) + elif "test" in split: + raw_val_datasets.append(dataset) + else: + raise ValueError(f"Split type {split} not recognized as one of test or train.") + + if any(frac < 0 for frac in fracs): + raise ValueError("Dataset fractions cannot be negative.") + + if len(raw_train_datasets) > 0: + train_subsets = [] + for dataset, frac in zip(raw_train_datasets, fracs): + train_subset = dataset.select(range(int(frac * len(dataset)))) + train_subsets.append(train_subset) + if shuffle: + raw_datasets["train"] = concatenate_datasets(train_subsets).shuffle(seed=42) + else: + raw_datasets["train"] = concatenate_datasets(train_subsets) + # No subsampling for test datasets to enable fair comparison across models + if len(raw_val_datasets) > 0: + if shuffle: + raw_datasets["test"] = concatenate_datasets(raw_val_datasets).shuffle(seed=42) + else: + raw_datasets["test"] = concatenate_datasets(raw_val_datasets) + + if len(raw_datasets) == 0: + raise ValueError( + f"Dataset {dataset_mixer} not recognized with splits {splits}. Check the dataset has been correctly formatted." + ) + + return raw_datasets diff --git a/src/alignment/decontaminate.py b/src/alignment/decontaminate.py new file mode 100644 index 0000000000000000000000000000000000000000..45cba95cca5c276ccdbaa6b0cc26f48e8eee844f --- /dev/null +++ b/src/alignment/decontaminate.py @@ -0,0 +1,91 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from typing import Any, Dict, List
+
+from datasets import load_dataset
+
+
+# HumanEval solutions that are considered simple/generic enough to be kept in the training dataset
+HUMAN_EVAL_STRINGS_OK = ["return x + y", "return len(string)", "return n**2", 'return " ".join(strings)']
+
+
+def extract_docstring(prompt: str) -> str:
+    if '"""' in prompt:
+        if prompt.count('"""') == 2:
+            return prompt.split('"""')[1].strip()
+        elif prompt.count('"""') == 4:
+            return prompt.split('"""')[3].strip()
+        else:
+            raise ValueError("Expected the prompt to contain exactly 2 or 4 triple-quote delimiters")
+    elif "'''" in prompt:
+        assert prompt.count("'''") == 2
+        return prompt.split("'''")[1].strip()
+    else:
+        raise ValueError("No docstring delimiters found in prompt")
+
+
+def human_eval_docstrings() -> List[str]:
+    ds = load_dataset("openai_humaneval", split="test")
+    docstrings = [extract_docstring(v["prompt"]) for v in ds]
+    return docstrings
+
+
+def load_dataset_column(dataset: str, column: str, split: str, name=None) -> List[str]:
+    ds = load_dataset(dataset, split=split, name=name)
+    res = [sample[column].strip() for sample in ds]
+    # Only return non-empty strings
+    return [sample for sample in res if len(sample) > 0]
+
+
+# Built at import time: importing this module downloads HumanEval via `datasets`.
+FILTER_OUT = {
+    "human_eval_docstrings": human_eval_docstrings(),
+    "human_eval_solutions": [
+        s
+        for s in load_dataset_column("openai_humaneval", "canonical_solution", "test")
+        if s not in HUMAN_EVAL_STRINGS_OK
+    ],
+}
+
+
+def normalize_whitespace(text: str) -> str:
+    return " ".join(text.split())
+
+
+def decontaminate_humaneval(
+    samples: List[Dict[str, Any]], text_column: str = "text", filter_out: Dict[str, List[str]] = FILTER_OUT
+) -> List[bool]:
+    """
+    filter_out: Dict[str, List[str]] mapping from benchmark name to list of strings that need to be
+    filtered-out.
+    Returns a list of booleans, one per sample: True if the sample should be kept in the dataset,
+    False if it matches a benchmark string and should be dropped.
+    """
+    output = []
+
+    for content in samples[text_column]:
+        content = normalize_whitespace(content.lower())
+        matched = False
+        for _, substrings in filter_out.items():
+            for substring in substrings:
+                if normalize_whitespace(substring.lower()) in content:
+                    matched = True
+                    break
+            if matched:
+                break
+        # we keep samples that did not match any benchmark string
+        output.append(not matched)
+
+    return output
diff --git a/src/alignment/model_utils.py b/src/alignment/model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..650517eec8aefac5822627d5ea8e09fd9a307cb4
--- /dev/null
+++ b/src/alignment/model_utils.py
@@ -0,0 +1,128 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
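+# Helpers shared by the training scripts: tokenizer construction, quantization
+# and PEFT configs, and checkpoint/adapter discovery.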
+import os
+from pathlib import Path
+from typing import Dict
+
+import torch
+from transformers import AutoTokenizer, BitsAndBytesConfig, PreTrainedTokenizer
+from transformers.trainer_utils import get_last_checkpoint
+
+from accelerate import Accelerator
+from huggingface_hub import list_repo_files
+from huggingface_hub.errors import RepositoryNotFoundError
+from huggingface_hub.utils._validators import HFValidationError
+from peft import LoraConfig, PeftConfig
+
+from .configs import DataArguments, DPOConfig, ModelArguments, SFTConfig
+from .data import DEFAULT_CHAT_TEMPLATE
+
+
+def get_current_device() -> int | str:
+    """Get the current device. For GPU, we return the local process index to enable multiple GPU training; on CPU-only machines we return "cpu"."""
+    return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"
+
+
+def get_kbit_device_map() -> Dict[str, int] | None:
+    """Useful for running inference with quantized models by setting `device_map=get_kbit_device_map()`"""
+    return {"": get_current_device()} if torch.cuda.is_available() else None
+
+
+def get_quantization_config(model_args: ModelArguments) -> dict | None:
+    # Note: `.to_dict()` means callers receive a plain, serializable dict rather than a `BitsAndBytesConfig`.
+    if model_args.load_in_4bit:
+        compute_dtype = torch.float16
+        if model_args.torch_dtype not in {"auto", None}:
+            compute_dtype = getattr(torch, model_args.torch_dtype)
+
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=compute_dtype,
+            bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
+            bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
+            bnb_4bit_quant_storage=model_args.bnb_4bit_quant_storage,
+        ).to_dict()
+    elif model_args.load_in_8bit:
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+        ).to_dict()
+    else:
+        quantization_config = None
+
+    return quantization_config
+
+
+def get_tokenizer(
+    model_args: ModelArguments, data_args: DataArguments, auto_set_chat_template: bool = True
+) -> PreTrainedTokenizer:
+    """Get the tokenizer for the model."""
+    tokenizer = AutoTokenizer.from_pretrained(
+        (
+            model_args.model_name_or_path
+            if model_args.tokenizer_name_or_path is None
+            else model_args.tokenizer_name_or_path
+        ),
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+    )
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+
+    if data_args.truncation_side is not None:
+        tokenizer.truncation_side = data_args.truncation_side
+
+    # Set reasonable default for models without max length
+    if tokenizer.model_max_length > 100_000:
+        tokenizer.model_max_length = 2048
+
+    if data_args.chat_template is not None:
+        tokenizer.chat_template = data_args.chat_template
+    elif auto_set_chat_template and tokenizer.get_chat_template() is None:
+        tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
+
+    return tokenizer
+
+
+def get_peft_config(model_args: ModelArguments) -> PeftConfig | None:
+    if model_args.use_peft is False:
+        return None
+
+    peft_config = LoraConfig(
+        r=model_args.lora_r,
+        lora_alpha=model_args.lora_alpha,
+        lora_dropout=model_args.lora_dropout,
+        bias="none",
+        task_type="CAUSAL_LM",
+        target_modules=model_args.lora_target_modules,
+        modules_to_save=model_args.lora_modules_to_save,
+    )
+
+    return peft_config
+
+
+def is_adapter_model(model_name_or_path: str, revision: str = "main") -> bool:
+    try:
+        # Try first if model on a Hub repo
+        repo_files = list_repo_files(model_name_or_path, revision=revision)
+    except (HFValidationError, RepositoryNotFoundError):
+        # If not, check local repo
+        repo_files = os.listdir(model_name_or_path)
+    # A PEFT adapter repo ships adapter weights instead of full model weights.
+    
return "adapter_model.safetensors" in repo_files or "adapter_model.bin" in repo_files + + +def get_checkpoint(training_args: SFTConfig | DPOConfig) -> Path | None: + last_checkpoint = None + if os.path.isdir(training_args.output_dir): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + return last_checkpoint diff --git a/src/alignment/release.py b/src/alignment/release.py new file mode 100644 index 0000000000000000000000000000000000000000..5a9f6603fe6a675b052f05bcfe4ec8c330fdb281 --- /dev/null +++ b/src/alignment/release.py @@ -0,0 +1,125 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import re + +import packaging.version + + +REPLACE_PATTERNS = { + "init": ( + re.compile(r'^__version__\s+=\s+"([^"]+)"\s*$', re.MULTILINE), + '__version__ = "VERSION"\n', + ), + "setup": ( + re.compile(r'^(\s*)version\s*=\s*"[^"]+",', re.MULTILINE), + r'\1version="VERSION",', + ), + "citation": (re.compile(r"^version:\s+[^ ]+", re.MULTILINE), "version: VERSION"), + "readme": ( + re.compile(r"version\s+=\s+\{[^}]+\}", re.MULTILINE), + "version = {VERSION}", + ), +} + +README_FILE = "README.md" + +REPLACE_FILES = { + "init": "src/alignment/__init__.py", + "setup": "setup.py", + "citation": "CITATION.cff", + "readme": README_FILE, +} + + +def update_version_in_file(fname, version, pattern): + """Update the version in one file using a specific pattern.""" + with open(fname, "r", encoding="utf-8", newline="\n") as f: + code = f.read() + re_pattern, replace = REPLACE_PATTERNS[pattern] + replace = replace.replace("VERSION", version) + code = re_pattern.sub(replace, code) + with open(fname, "w", encoding="utf-8", newline="\n") as f: + f.write(code) + + +def global_version_update(version, patch=False): + """Update the version in all needed files.""" + for pattern, fname in REPLACE_FILES.items(): + update_version_in_file(fname, version, pattern) + + +def get_version(): + """Reads the current version in the __init__.""" + with open(REPLACE_FILES["init"], "r") as f: + code = f.read() + default_version = REPLACE_PATTERNS["init"][0].search(code).groups()[0] + return packaging.version.parse(default_version) + + +def pre_release_work(patch=False): + """Do all the necessary pre-release steps.""" + # First let's get the default version: base version if we are in dev, bump minor otherwise. + default_version = get_version() + if patch and default_version.is_devrelease: + raise ValueError("Can't create a patch version from the dev branch, checkout a released version!") + if default_version.is_devrelease: + default_version = default_version.base_version + elif patch: + default_version = f"{default_version.major}.{default_version.minor}.{default_version.micro + 1}" + else: + default_version = f"{default_version.major}.{default_version.minor + 1}.0" + + # Now let's ask nicely if that's the right one. + version = input(f"Which version are you releasing? 
[{default_version}]") + if len(version) == 0: + version = default_version + + print(f"Updating version to {version}.") + global_version_update(version, patch=patch) + + +def post_release_work(): + """Do all the necessary post-release steps.""" + # First let's get the current version + current_version = get_version() + dev_version = f"{current_version.major}.{current_version.minor + 1}.0.dev0" + current_version = current_version.base_version + + # Check with the user we got that right. + version = input(f"Which version are we developing now? [{dev_version}]") + if len(version) == 0: + version = dev_version + + print(f"Updating version to {version}.") + global_version_update(version) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--post_release", + action="store_true", + help="Whether this is pre or post release.", + ) + parser.add_argument("--patch", action="store_true", help="Whether or not this is a patch release.") + args = parser.parse_args() + if not args.post_release: + pre_release_work(patch=args.patch) + elif args.patch: + print("Nothing to do after a patch :-)") + else: + post_release_work() diff --git a/test.json b/test.json new file mode 100644 index 0000000000000000000000000000000000000000..3055ae8db33650c9c12f330cec6e64fe4c3aaf94 --- /dev/null +++ b/test.json @@ -0,0 +1,4167 @@ +[ + { + "prompt": "22", + "completion": [ + "\u5236\u5b9a\u8be6\u7ec6\u7684\u5b66\u4e60\u8ba1\u5212\u5e76\u4e25\u683c\u6267\u884c", + "\u91c7\u7528\u591a\u79cd\u5b66\u4e60\u65b9\u6cd5\uff0c\u5982\u9605\u8bfb\u3001\u505a\u7b14\u8bb0\u3001\u8ba8\u8bba\u7b49", + "\u4fdd\u6301\u826f\u597d\u7684\u5b66\u4e60\u73af\u5883\uff0c\u51cf\u5c11\u5e72\u6270\u56e0\u7d20", + "\u5b9a\u671f\u8fdb\u884c\u81ea\u6211\u6d4b\u8bd5\uff0c\u68c0\u67e5\u5b66\u4e60\u6548\u679c", + "\u4fdd\u8bc1\u5145\u8db3\u7684\u7761\u7720\u548c\u9002\u5f53\u7684\u4f11\u606f" + ], + "reward": [ + 5, + 4, + 2, + 1, + -5 + ] + }, + { + "prompt": "333", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -4, + -3, + -2, + 2, + 0 + ] + }, + { + "prompt": "4444", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 1, + -2, + 1, + 4, + 2 + ] + }, + { + "prompt": "55555", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + 
"\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -4, + 3, + 5, + 1, + -1 + ] + }, + { + "prompt": "666666", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 0, + 1, + -5, + 1, + 0 + ] + }, + { + "prompt": "7777777", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 4, + -4, + 4, + 4, + -4 + ] + }, + { + "prompt": "88888888", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 1, + -3, + 1, + -4, + 4 + ] + }, + { + "prompt": "999999999", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -2, + -5, + 1, + 1, + -2 + ] + }, + { + "prompt": "10101010101010101010", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 0, + -5, + -4, + -3, + -1 + ] + }, + { + "prompt": "1111111111111111111111", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + 
"\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -2, + -3, + -2, + -5, + -5 + ] + }, + { + "prompt": "121212121212121212121212", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + 5, + -3, + 0, + 5 + ] + }, + { + "prompt": "13131313131313131313131313", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -5, + 2, + -3, + -1, + 5 + ] + }, + { + "prompt": "1414141414141414141414141414", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 2, + -3, + -5, + -2, + -3 + ] + }, + { + "prompt": "151515151515151515151515151515", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 5, + -1, + -2, + -3, + 4 + ] + }, + { + "prompt": "16161616161616161616161616161616", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 4, + 0, + -1, + -5, + 1 + ] + }, + { + "prompt": "1717171717171717171717171717171717", + 
"completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 1, + -5, + -4, + -3, + 2 + ] + }, + { + "prompt": "181818181818181818181818181818181818", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -4, + 5, + -2, + 5, + -2 + ] + }, + { + "prompt": "19191919191919191919191919191919191919", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -2, + 1, + 0, + -1, + 0 + ] + }, + { + "prompt": "2020202020202020202020202020202020202020", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 4, + 2, + 4, + -3, + 1 + ] + }, + { + "prompt": "212121212121212121212121212121212121212121", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 4, + -5, + 3, + 2, + 0 + ] + }, + { + "prompt": "22222222222222222222222222222222222222222222", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + 
"\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 4, + 5, + -4, + -4, + 0 + ] + }, + { + "prompt": "2323232323232323232323232323232323232323232323", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 4, + 5, + -5, + 1, + -4 + ] + }, + { + "prompt": "242424242424242424242424242424242424242424242424", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -3, + -2, + 5, + 2, + -5 + ] + }, + { + "prompt": "25252525252525252525252525252525252525252525252525", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 5, + 5, + -2, + -5, + 1 + ] + }, + { + "prompt": "2626262626262626262626262626262626262626262626262626", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 5, + -5, + -5, + -5, + 4 + ] + }, + { + "prompt": "272727272727272727272727272727272727272727272727272727", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -4, + 4, + 5, + -2, + 2 + ] + }, + { + "prompt": "28282828282828282828282828282828282828282828282828282828", + "completion": [ + 
"\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -3, + -3, + -2, + 5, + 3 + ] + }, + { + "prompt": "2929292929292929292929292929292929292929292929292929292929", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 4, + 2, + 3, + 2, + -2 + ] + }, + { + "prompt": "303030303030303030303030303030303030303030303030303030303030", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 4, + -1, + 0, + 0, + 0 + ] + }, + { + "prompt": "31313131313131313131313131313131313131313131313131313131313131", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -2, + -4, + 5, + 2, + 1 + ] + }, + { + "prompt": "3232323232323232323232323232323232323232323232323232323232323232", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 0, + -5, + -4, + -3, + 0 + ] + }, + { + "prompt": "333333333333333333333333333333333333333333333333333333333333333333", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + 
"\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -4, + -4, + -3, + -4, + 2 + ] + }, + { + "prompt": "34343434343434343434343434343434343434343434343434343434343434343434", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 3, + -4, + 2, + 1, + -4 + ] + }, + { + "prompt": "3535353535353535353535353535353535353535353535353535353535353535353535", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -5, + 1, + -2, + -5, + 0 + ] + }, + { + "prompt": "363636363636363636363636363636363636363636363636363636363636363636363636", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -1, + -1, + 5, + 5, + -3 + ] + }, + { + "prompt": "37373737373737373737373737373737373737373737373737373737373737373737373737", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -4, + -2, + 1, + 1, + -1 + ] + }, + { + "prompt": "3838383838383838383838383838383838383838383838383838383838383838383838383838", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + 
"\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 0, + 1, + -5, + 5, + -4 + ] + }, + { + "prompt": "393939393939393939393939393939393939393939393939393939393939393939393939393939", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 2, + -4, + 2, + 2, + -2 + ] + }, + { + "prompt": "40404040404040404040404040404040404040404040404040404040404040404040404040404040", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 1, + -1, + -3, + 0, + 3 + ] + }, + { + "prompt": "4141414141414141414141414141414141414141414141414141414141414141414141414141414141", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -2, + -3, + 4, + -1, + 0 + ] + }, + { + "prompt": "424242424242424242424242424242424242424242424242424242424242424242424242424242424242", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 1, + -3, + 5, + 4, + -5 + ] + }, + { + "prompt": "43434343434343434343434343434343434343434343434343434343434343434343434343434343434343", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 0, + -2, + -1, + 1, + 4 + ] + 
}, + { + "prompt": "4444444444444444444444444444444444444444444444444444444444444444444444444444444444444444", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -5, + 1, + 1, + -1, + -2 + ] + }, + { + "prompt": "454545454545454545454545454545454545454545454545454545454545454545454545454545454545454545", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -4, + -1, + 3, + 1, + -5 + ] + }, + { + "prompt": "46464646464646464646464646464646464646464646464646464646464646464646464646464646464646464646", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -3, + -2, + -4, + 0, + -3 + ] + }, + { + "prompt": "4747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 1, + 4, + -1, + -2, + -5 + ] + }, + { + "prompt": "484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 4, + 2, + -1, + -5, + 3 + ] + }, + { + "prompt": 
"49494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 3, + 3, + -5, + 1, + -2 + ] + }, + { + "prompt": "5050505050505050505050505050505050505050505050505050505050505050505050505050505050505050505050505050", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + 4, + 1, + -5, + -5 + ] + }, + { + "prompt": "515151515151515151515151515151515151515151515151515151515151515151515151515151515151515151515151515151", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 5, + 3, + 5, + 2, + -2 + ] + }, + { + "prompt": "52525252525252525252525252525252525252525252525252525252525252525252525252525252525252525252525252525252", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + -5, + -5, + -1, + 1 + ] + }, + { + "prompt": "5353535353535353535353535353535353535353535353535353535353535353535353535353535353535353535353535353535353", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 1, + -3, + 3, + 0, + 3 + ] + }, + { + "prompt": "545454545454545454545454545454545454545454545454545454545454545454545454545454545454545454545454545454545454", 
+ "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 0, + -4, + -4, + -4, + 1 + ] + }, + { + "prompt": "55555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 0, + 3, + 3, + 2, + -5 + ] + }, + { + "prompt": "5656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -3, + -5, + -3, + -3, + 3 + ] + }, + { + "prompt": "575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 1, + -4, + 5, + 5, + 4 + ] + }, + { + "prompt": "58585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + -2, + -1, + -1, + -4 + ] + }, + { + "prompt": "5959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959", + "completion": [ + 
"\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 1, + 4, + 2, + -5, + -4 + ] + }, + { + "prompt": "606060606060606060606060606060606060606060606060606060606060606060606060606060606060606060606060606060606060606060606060", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -1, + 3, + -3, + -5, + 2 + ] + }, + { + "prompt": "61616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161616161", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 2, + 2, + 4, + -4, + 1 + ] + }, + { + "prompt": "6262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262626262", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + -3, + -2, + -1, + 3 + ] + }, + { + "prompt": "636363636363636363636363636363636363636363636363636363636363636363636363636363636363636363636363636363636363636363636363636363", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -2, + 1, + -3, + 1, + -1 + ] + }, + { + "prompt": 
"64646464646464646464646464646464646464646464646464646464646464646464646464646464646464646464646464646464646464646464646464646464", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + -4, + 4, + -3, + 2 + ] + }, + { + "prompt": "6565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565656565", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -2, + 2, + -4, + -5, + -2 + ] + }, + { + "prompt": "666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666666", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 5, + 0, + -5, + 1, + -4 + ] + }, + { + "prompt": "67676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 2, + 4, + 2, + -3, + 4 + ] + }, + { + "prompt": "6868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + 
"\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -2, + -2, + 4, + 0, + -3 + ] + }, + { + "prompt": "696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 2, + -2, + 5, + 0, + 0 + ] + }, + { + "prompt": "70707070707070707070707070707070707070707070707070707070707070707070707070707070707070707070707070707070707070707070707070707070707070707070", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -3, + 0, + 1, + 3, + 1 + ] + }, + { + "prompt": "7171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171717171", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -5, + 1, + -2, + -5, + -3 + ] + }, + { + "prompt": "727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272727272", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 3, + 1, + 0, + 4, + -2 + ] + }, + { + "prompt": "73737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373737373", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + 
"\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + 5, + 4, + 2, + -4 + ] + }, + { + "prompt": "7474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474747474", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 0, + -1, + -3, + 5, + 0 + ] + }, + { + "prompt": "757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575757575", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -5, + -2, + -3, + -1, + 5 + ] + }, + { + "prompt": "76767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676767676", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 4, + 2, + -5, + -1, + -5 + ] + }, + { + "prompt": "7777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777777", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + -5, + 5, + -5, + -3 + ] + }, + { + "prompt": "787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878", + "completion": [ + 
"\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 1, + 3, + 0, + -5, + -3 + ] + }, + { + "prompt": "79797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -2, + -3, + -3, + -1, + -1 + ] + }, + { + "prompt": "8080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080808080", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -5, + -5, + 5, + 0, + 2 + ] + }, + { + "prompt": "818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181818181", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -5, + -4, + 0, + 4, + -5 + ] + }, + { + "prompt": "82828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282828282", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 1, + -5, + 4, + 1, + 4 + ] + }, + { + "prompt": 
"8383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383838383", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 2, + -3, + 2, + -2, + -4 + ] + }, + { + "prompt": "848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484848484", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -1, + -4, + -1, + -5, + -2 + ] + }, + { + "prompt": "85858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585858585", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 2, + 1, + -3, + 1, + -1 + ] + }, + { + "prompt": "8686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686868686", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 3, + -1, + 1, + 5, + 3 + ] + }, + { + "prompt": "878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787878787", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + 
"\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 1, + -3, + -2, + 1, + 1 + ] + }, + { + "prompt": "88888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888888", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 1, + 5, + -1, + 5, + 1 + ] + }, + { + "prompt": "8989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -1, + 2, + 4, + -5, + 3 + ] + }, + { + "prompt": "909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090909090", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + 0, + 4, + 4, + -1 + ] + }, + { + "prompt": "91919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191919191", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 4, + -5, + -3, + -2, + -3 + ] + }, + { + "prompt": 
"9292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292929292", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -3, + 4, + -3, + 5, + 0 + ] + }, + { + "prompt": "939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393939393", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -1, + 3, + -3, + -2, + -5 + ] + }, + { + "prompt": "94949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494949494", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 3, + 4, + -5, + 2, + 3 + ] + }, + { + "prompt": "9595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595959595", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 0, + 3, + -2, + -5, + 3 + ] + }, + { + "prompt": "969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696969696", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + 
"\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 5, + 2, + -3, + 3, + -2 + ] + }, + { + "prompt": "97979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797979797", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -1, + 3, + 3, + -4, + 2 + ] + }, + { + "prompt": "9898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898989898", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -4, + 5, + -4, + 3, + 4 + ] + }, + { + "prompt": "999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999999", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 4, + -2, + -5, + 4, + -4 + ] + }, + { + "prompt": "100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100100", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + 
"\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -1, + 1, + 0, + 1, + -4 + ] + }, + { + "prompt": "101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101101", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 0, + 5, + 3, + -4, + 5 + ] + }, + { + "prompt": "102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102102", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 2, + -5, + 0, + 5, + -5 + ] + }, + { + "prompt": "103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103103", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -1, + -5, + 0, + 4, + 1 + ] + }, + { + "prompt": "104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104104", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + 
"\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 3, + -4, + -1, + -1, + -4 + ] + }, + { + "prompt": "105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105105", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -2, + 1, + 1, + 2, + -4 + ] + }, + { + "prompt": "106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106106", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -3, + 3, + -1, + 4, + 1 + ] + }, + { + "prompt": "107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107107", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -5, + -2, + -3, + -2, + 1 + ] + }, + { + "prompt": "108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108108", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + 
"\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -3, + 4, + 5, + 2, + 4 + ] + }, + { + "prompt": "109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109109", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 5, + 5, + 5, + 0, + -1 + ] + }, + { + "prompt": "110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110110", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + -5, + 4, + 4, + -2 + ] + }, + { + "prompt": "111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 0, + -3, + 4, + -1, + -4 + ] + }, + { + "prompt": "112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112112", + "completion": [ + 
"\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 3, + 3, + -1, + -2, + -2 + ] + }, + { + "prompt": "113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113113", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 0, + -4, + 2, + -5, + 3 + ] + }, + { + "prompt": "114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114114", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 4, + 0, + 1, + 1, + 4 + ] + }, + { + "prompt": "115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115115", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 5, + 5, + 1, + -2, + 1 + ] + }, + { + "prompt": 
"116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116116", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 1, + 2, + -3, + -3, + -5 + ] + }, + { + "prompt": "117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117117", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -2, + -4, + -4, + 4, + -5 + ] + }, + { + "prompt": "118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118118", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 3, + 5, + -4, + 1, + -5 + ] + }, + { + "prompt": "119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119119", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + 
"\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 0, + 5, + -2, + -5, + 3 + ] + }, + { + "prompt": "120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120120", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + -3, + 1, + 5, + 4 + ] + }, + { + "prompt": "121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121121", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 3, + 5, + -2, + -2, + 4 + ] + }, + { + "prompt": "122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122122", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -2, + 1, + 0, + -3, + 1 + ] + }, + { + "prompt": "123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123123", + "completion": [ + 
"\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 3, + 1, + -1, + -1, + -1 + ] + }, + { + "prompt": "124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124124", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 0, + -2, + -5, + 4, + 3 + ] + }, + { + "prompt": "125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125125", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + -2, + 1, + 1, + -2 + ] + }, + { + "prompt": "126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126126", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -4, + 1, + 2, + 2, + 2 + ] + }, + { + "prompt": 
"127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127127", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -5, + -3, + -2, + -5, + -2 + ] + }, + { + "prompt": "128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128128", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -3, + 5, + 0, + -3, + 1 + ] + }, + { + "prompt": "129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129129", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -5, + -5, + -4, + -2, + 3 + ] + }, + { + "prompt": "130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130130", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + 
"\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 3, + -5, + 2, + 1, + 1 + ] + }, + { + "prompt": "131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131131", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -1, + 2, + -4, + -4, + -3 + ] + }, + { + "prompt": "132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132132", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -1, + -3, + -2, + 2, + -3 + ] + }, + { + "prompt": "133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133133", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + 5, + -2, + -2, + -3 + ] + }, + { + "prompt": 
"134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134134", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + -1, + 1, + 1, + -2 + ] + }, + { + "prompt": "135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135135", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 1, + -3, + -5, + -1, + -5 + ] + }, + { + "prompt": "136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136136", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + -3, + -3, + -1, + 3 + ] + }, + { + "prompt": "137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137137", + "completion": [ + 
"\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 1, + 0, + 3, + -3, + -5 + ] + }, + { + "prompt": "138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138138", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 5, + -4, + 5, + -2, + -4 + ] + }, + { + "prompt": "139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139139", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 5, + 1, + -1, + -1, + -4 + ] + }, + { + "prompt": "140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140140", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 
1, + -1, + -3, + -2, + -5 + ] + }, + { + "prompt": "141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141141", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + -3, + -5, + 0, + 3 + ] + }, + { + "prompt": "142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142142", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 2, + -1, + -1, + -4, + -5 + ] + }, + { + "prompt": "143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143143", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 5, + 4, + 1, + -3, + -1 + ] + }, + { + "prompt": "144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144144", + "completion": [ + 
"\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 4, + 2, + 5, + -5, + -2 + ] + }, + { + "prompt": "145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145145", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + 0, + 2, + 0, + -3 + ] + }, + { + "prompt": "146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146146", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 1, + 1, + 2, + -4, + -5 + ] + }, + { + "prompt": "147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147147", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + 
"\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 2, + 4, + 5, + -5, + 3 + ] + }, + { + "prompt": "148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148148", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 1, + -1, + 0, + -4, + 0 + ] + }, + { + "prompt": "149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149149", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + -2, + -1, + -4, + 4 + ] + }, + { + "prompt": "150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150150", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + -3, + -2, + 1, + 2 + ] + }, + { + "prompt": 
"151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151151", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 0, + -4, + -3, + -5, + 1 + ] + }, + { + "prompt": "152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152152", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 5, + 2, + -3, + 3, + 0 + ] + }, + { + "prompt": "153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153153", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 5, + -3, + 2, + -1, + 5 + ] + }, + { + "prompt": 
"154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154154", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 1, + 0, + -5, + -5, + 2 + ] + }, + { + "prompt": "155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155155", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -4, + -4, + 3, + 0, + 5 + ] + }, + { + "prompt": "156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156156", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 4, + -1, + -5, + -4, + 5 + ] + }, + { + "prompt": 
"157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157157", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 2, + -1, + -2, + 2, + -1 + ] + }, + { + "prompt": "158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158158", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -5, + -3, + -3, + -5, + -5 + ] + }, + { + "prompt": "159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159159", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -4, + -3, + 0, + -1, + -4 + ] + }, + { + "prompt": 
"160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160160", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 0, + 3, + 0, + -4, + 1 + ] + }, + { + "prompt": "161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161161", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 5, + 0, + 0, + -4, + -3 + ] + }, + { + "prompt": "162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162162", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -2, + 3, + -1, + 4, + 0 + ] + }, + { + "prompt": 
"163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163163", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -5, + 4, + -1, + -5, + 4 + ] + }, + { + "prompt": "164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164164", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 0, + 1, + 4, + -2, + -5 + ] + }, + { + "prompt": "165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165165", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -5, + 3, + -2, + 2, + 5 + ] + }, + { + "prompt": 
"166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166166", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 1, + -4, + 0, + -1, + -1 + ] + }, + { + "prompt": "167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167167", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -4, + -2, + 5, + 2, + -4 + ] + }, + { + "prompt": "168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168168", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -4, + 1, + 3, + -5, + 0 + ] + }, + { + "prompt": 
"169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169169", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -1, + -4, + 4, + 2, + -3 + ] + }, + { + "prompt": "170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170170", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -4, + 5, + -3, + 0, + 4 + ] + }, + { + "prompt": "171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171171", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 5, + 4, + -4, + -4, + -2 + ] + }, + { + "prompt": 
"172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172172", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -5, + -4, + -3, + -2, + 0 + ] + }, + { + "prompt": "173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173173", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -4, + 4, + 4, + 4, + 2 + ] + }, + { + "prompt": "174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174174", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 4, + 3, + -1, + -3, + 4 + ] + }, + { + "prompt": 
"175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175175", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 2, + -3, + 5, + -5, + 1 + ] + }, + { + "prompt": "176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176176", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 1, + 0, + 4, + 5, + 3 + ] + }, + { + "prompt": "177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177177", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 1, + 1, + -5, + -2, + -1 + ] + }, + { + "prompt": 
"178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178178", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + 2, + 0, + -3, + 2 + ] + }, + { + "prompt": "179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179179", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -3, + -3, + 5, + 4, + 1 + ] + }, + { + "prompt": "180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180180", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -2, + -4, + 1, + 0, + 4 + ] + }, + { + "prompt": 
"181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181181", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + 0, + 1, + -3, + 0 + ] + }, + { + "prompt": "182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182182", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -5, + 0, + 0, + -4, + -5 + ] + }, + { + "prompt": "183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183183", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -5, + 2, + -3, + -4, + 2 + ] + }, + { + "prompt": 
"184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184184", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 0, + -1, + -4, + 2, + -1 + ] + }, + { + "prompt": "185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185185", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -2, + 5, + 4, + -1, + -1 + ] + }, + { + "prompt": "186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186186", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -2, + 1, + 4, + 5, + 5 + ] + }, + { + "prompt": 
"187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187187", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -2, + -3, + 3, + 2, + -1 + ] + }, + { + "prompt": "188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188188", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 5, + -1, + 2, + 3, + -3 + ] + }, + { + "prompt": "189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189189", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 3, + 1, + -5, + 3, + -2 + ] + }, + { + "prompt": 
"190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190190", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -2, + -3, + 5, + -1, + -3 + ] + }, + { + "prompt": "191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191191", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 1, + 2, + -2, + -2, + -5 + ] + }, + { + "prompt": "192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192192", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -1, + -2, + -5, + -5, + -4 + ] + }, + { + "prompt": 
"193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193193", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 4, + 2, + 1, + 5, + -5 + ] + }, + { + "prompt": "194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194194", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -3, + -2, + -3, + 2, + 1 + ] + }, + { + "prompt": "195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195195", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 3, + 2, + 4, + 0, + 5 + ] + }, + { + "prompt": 
"196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196196", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + -5, + -2, + -1, + -1 + ] + }, + { + "prompt": "197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197197", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 2, + -2, + -1, + -3, + -4 + ] + }, + { + "prompt": "198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198198", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -4, + 3, + 0, + 4, + -3 + ] + }, + { + "prompt": 
"199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199199", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 4, + 4, + 1, + -3, + -1 + ] + }, + { + "prompt": "200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200200", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + 2, + -3, + 4, + 0 + ] + }, + { + "prompt": "201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201201", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], 
+ "reward": [ + 2, + -2, + 3, + -3, + 4 + ] + }, + { + "prompt": "202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202202", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 4, + -1, + 3, + -4, + -4 + ] + }, + { + "prompt": "203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203203", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 2, + -4, + 2, + -2, + 0 + ] + }, + { + "prompt": "204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204204", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + 
"\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + -4, + 5, + -5, + 5 + ] + }, + { + "prompt": "205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205205", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 2, + 3, + 4, + 0, + -3 + ] + }, + { + "prompt": "206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206206", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -5, + -4, + -3, + -4, + 4 + ] + }, + { + "prompt": "207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207207", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + 
"\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 5, + 1, + 2, + 1, + 0 + ] + }, + { + "prompt": "208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208208", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 0, + -5, + -4, + -4, + 0 + ] + }, + { + "prompt": "209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209209", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -4, + -4, + -1, + 1, + 0 + ] + }, + { + "prompt": "210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210210", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + 
"\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + -3, + 1, + 1, + -1 + ] + }, + { + "prompt": "211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211211", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 1, + -2, + -3, + 2, + 3 + ] + }, + { + "prompt": "212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212212", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -2, + 4, + -2, + 3, + -4 + ] + }, + { + "prompt": "213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213213", + "completion": [ + 
"\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 5, + 3, + 5, + 2, + -4 + ] + }, + { + "prompt": "214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214214", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -2, + -3, + -1, + -4, + 2 + ] + }, + { + "prompt": "215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215215", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -4, + 5, + -5, + 3, + 4 + ] + }, + { + "prompt": 
"216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216216", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + 3, + 2, + 2, + 1 + ] + }, + { + "prompt": "217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217217", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 3, + 4, + -3, + 1, + 5 + ] + }, + { + "prompt": "218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218218", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + 
"\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 3, + 5, + 1, + -3, + 4 + ] + }, + { + "prompt": "219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219219", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -4, + 4, + 4, + 5, + 2 + ] + }, + { + "prompt": "220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220220", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + 0, + -4, + 4, + 3 + ] + }, + { + "prompt": "221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221221", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + 
"\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + 5, + 4, + 0, + -1 + ] + }, + { + "prompt": "222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + -2, + 5, + -5, + 1 + ] + }, + { + "prompt": "223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223223", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 2, + 0, + 4, + 2, + 5 + ] + }, + { + "prompt": 
"224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224224", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 2, + 1, + -4, + 2, + -5 + ] + }, + { + "prompt": "225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225225", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + 4, + 1, + 4, + -2 + ] + }, + { + "prompt": "226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226226", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + 
"\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -5, + -2, + -4, + -1, + -1 + ] + }, + { + "prompt": "227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227227", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -4, + 1, + -1, + 4, + 2 + ] + }, + { + "prompt": "228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228228", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -2, + -4, + -5, + -1, + 1 + ] + }, + { + "prompt": 
"229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229229", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 0, + -2, + 1, + 2, + -1 + ] + }, + { + "prompt": "230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230230", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -3, + -2, + 1, + 0, + -1 + ] + }, + { + "prompt": "231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231231", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + 
"\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 1, + -2, + 1, + 4, + 4 + ] + }, + { + "prompt": "232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232232", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 4, + -4, + -4, + 2, + 3 + ] + }, + { + "prompt": "233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233233", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + 3, + 1, + -5, + -2, + 1 + ] + }, + { + "prompt": "234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234234", 
+ "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 3, + 3, + -4, + -3, + -4 + ] + }, + { + "prompt": "235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235235", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -3, + 2, + -3, + 4, + -2 + ] + }, + { + "prompt": "236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236236", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + 3, + -1, + -4, + 5, + 2 + ] + }, + { + "prompt": 
"237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237237", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -3, + -2, + 1, + -1, + 1 + ] + }, + { + "prompt": "238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238238", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -1, + 5, + 5, + -4, + 3 + ] + }, + { + "prompt": "239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239239", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + 
"\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + -2, + 1, + -5, + -4, + 0 + ] + }, + { + "prompt": "240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240240", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + "\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -1, + 0, + -2, + 2, + 3 + ] + }, + { + "prompt": "241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241241", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -5, + -1, + 0, + 3, + -3 + ] + }, + { + "prompt": 
"242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242242", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + 0, + -1, + 1, + 3, + -2 + ] + }, + { + "prompt": "243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243243", + "completion": [ + "\u5747\u8861\u996e\u98df\uff0c\u591a\u5403\u852c\u83dc\u6c34\u679c\uff0c\u5c11\u5403\u6cb9\u817b\u548c\u9ad8\u7cd6\u98df\u7269", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u5982\u8dd1\u6b65\u3001\u6e38\u6cf3\u3001\u5065\u8eab\u7b49", + "\u4fdd\u6301\u89c4\u5f8b\u7684\u4f5c\u606f\u65f6\u95f4\uff0c\u65e9\u7761\u65e9\u8d77", + "\u4fdd\u6301\u826f\u597d\u7684\u5fc3\u6001\uff0c\u907f\u514d\u8fc7\u5ea6\u7126\u8651\u548c\u538b\u529b", + "\u5b9a\u671f\u8fdb\u884c\u4f53\u68c0\uff0c\u53ca\u65f6\u53d1\u73b0\u548c\u9884\u9632\u75be\u75c5" + ], + "reward": [ + 3, + 4, + -3, + -1, + 4 + ] + }, + { + "prompt": "244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244244", + "completion": [ + "\u5408\u7406\u5b89\u6392\u5de5\u4f5c\u4efb\u52a1\uff0c\u5236\u5b9a\u4f18\u5148\u7ea7", + 
"\u907f\u514d\u591a\u4efb\u52a1\u5904\u7406\uff0c\u4e13\u6ce8\u5b8c\u6210\u4e00\u9879\u4efb\u52a1", + "\u4f7f\u7528\u5de5\u5177\u548c\u8f6f\u4ef6\u63d0\u9ad8\u5de5\u4f5c\u81ea\u52a8\u5316\u7a0b\u5ea6", + "\u5b9a\u671f\u4f11\u606f\uff0c\u907f\u514d\u957f\u65f6\u95f4\u8fde\u7eed\u5de5\u4f5c\u5bfc\u81f4\u75b2\u52b3", + "\u4e0e\u540c\u4e8b\u6709\u6548\u6c9f\u901a\uff0c\u51cf\u5c11\u4e0d\u5fc5\u8981\u7684\u4f1a\u8bae\u548c\u5e72\u6270" + ], + "reward": [ + -2, + 2, + 3, + -3, + 4 + ] + }, + { + "prompt": "245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245245", + "completion": [ + "\u5c0a\u91cd\u4ed6\u4eba\uff0c\u503e\u542c\u4ed6\u4eba\u7684\u610f\u89c1\u548c\u60f3\u6cd5", + "\u79ef\u6781\u6c9f\u901a\uff0c\u8868\u8fbe\u81ea\u5df1\u7684\u611f\u53d7\u548c\u9700\u6c42", + "\u4e50\u4e8e\u52a9\u4eba\uff0c\u5173\u5fc3\u4ed6\u4eba\u7684\u56f0\u96be\u548c\u95ee\u9898", + "\u4fdd\u6301\u8bda\u4fe1\uff0c\u9075\u5b88\u627f\u8bfa\u548c\u7ea6\u5b9a", + "\u53c2\u4e0e\u793e\u4ea4\u6d3b\u52a8\uff0c\u6269\u5927\u81ea\u5df1\u7684\u793e\u4ea4\u5708\u5b50" + ], + "reward": [ + -4, + 0, + 1, + -1, + 3 + ] + }, + { + "prompt": "246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246246", + "completion": [ + "\u8fdb\u884c\u653e\u677e\u8bad\u7ec3\uff0c\u5982\u6df1\u547c\u5438\u3001\u51a5\u60f3\u7b49", + "\u9002\u5f53\u8fdb\u884c\u4f53\u80b2\u953b\u70bc\uff0c\u91ca\u653e\u538b\u529b", + "\u4e0e\u670b\u53cb\u6216\u5bb6\u4eba\u4ea4\u6d41\uff0c\u5bfb\u6c42\u652f\u6301\u548c\u5e2e\u52a9", + "\u5408\u7406\u5b89\u6392\u65f6\u95f4\uff0c\u907f\u514d\u8fc7\u5ea6\u5de5\u4f5c\u548c\u5b66\u4e60", + "\u57f9\u517b\u5174\u8da3\u7231\u597d\uff0c\u8f6c\u79fb\u6ce8\u610f\u529b" + ], + "reward": [ + -5, + -5, + -1, + -3, + -4 + ] + } +] \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/fixtures/config_dpo_full.yaml b/tests/fixtures/config_dpo_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ed1387370377ae7ed51e31e68d4c8afb677cf25 --- /dev/null +++ b/tests/fixtures/config_dpo_full.yaml @@ -0,0 +1,37 @@ +# Model 
arguments +model_name_or_path: alignment-handbook/zephyr-7b-sft-full + +# Data training arguments +# For definitions, see: src/h4/training/config.py +dataset_mixer: + HuggingFaceH4/ultrafeedback_binarized: 1.0 +dataset_splits: +- train_prefs +- test_prefs +preprocessing_num_workers: 12 + +# DPOTrainer arguments +bf16: true +beta: 0.1 +do_eval: true +eval_strategy: steps +eval_steps: 100 +gradient_accumulation_steps: 1 +gradient_checkpointing: true +hub_model_id: zephyr-7b-dpo-full +learning_rate: 5.0e-7 +log_level: info +logging_steps: 10 +lr_scheduler_type: linear +max_length: 1024 +max_prompt_length: 512 +num_train_epochs: 3 +optim: rmsprop +output_dir: data/zephyr-7b-dpo-full +per_device_train_batch_size: 8 +per_device_eval_batch_size: 4 +push_to_hub: true +save_strategy: "no" +save_total_limit: null +seed: 42 +warmup_ratio: 0.1 \ No newline at end of file diff --git a/tests/fixtures/config_sft_full.yaml b/tests/fixtures/config_sft_full.yaml new file mode 100644 index 0000000000000000000000000000000000000000..297dc06a0d86897a068a685483f609c387da7527 --- /dev/null +++ b/tests/fixtures/config_sft_full.yaml @@ -0,0 +1,41 @@ +# Model arguments +model_name_or_path: mistralai/Mistral-7B-v0.1 +model_revision: main +torch_dtype: bfloat16 +attn_implementation: flash_attention_2 + +# Data training arguments +dataset_mixer: + HuggingFaceH4/ultrachat_200k: 1.0 +dataset_splits: +- train_sft +- test_sft +preprocessing_num_workers: 12 + +# SFT trainer config +bf16: true +do_eval: true +eval_strategy: epoch +gradient_accumulation_steps: 2 +gradient_checkpointing: true +hub_model_id: zephyr-7b-sft-full +hub_strategy: every_save +learning_rate: 2.0e-05 +log_level: info +logging_steps: 5 +logging_strategy: steps +lr_scheduler_type: cosine +max_seq_length: 2048 +max_steps: -1 +num_train_epochs: 1 +output_dir: data/zephyr-7b-sft-full +overwrite_output_dir: true +per_device_eval_batch_size: 16 +per_device_train_batch_size: 32 +push_to_hub: true +remove_unused_columns: true +report_to: +- tensorboard +save_strategy: "no" +save_total_limit: null +seed: 42 \ No newline at end of file diff --git a/tests/test_configs.py b/tests/test_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..2a4a7a6d0e642df5a1aade36e5912d5239c6deb6 --- /dev/null +++ b/tests/test_configs.py @@ -0,0 +1,43 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
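+# The YAML fixtures above double as worked examples of full training configs. As a
+# rough sketch (assuming only that the YAML keys mirror the dataclass field names and
+# that PyYAML is installed), the raw file can be inspected without the parser:
+#
+#     import yaml
+#
+#     with open("tests/fixtures/config_sft_full.yaml") as f:
+#         cfg = yaml.safe_load(f)
+#     assert cfg["model_name_or_path"] == "mistralai/Mistral-7B-v0.1"
+#     assert cfg["learning_rate"] == 2.0e-05
+#
+# The H4ArgumentParser tests below exercise the supported loading path.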
+ +import os +import unittest + +from alignment import DataArguments, H4ArgumentParser, ModelArguments, SFTConfig + + +class H4ArgumentParserTest(unittest.TestCase): + def setUp(self): + self.parser = H4ArgumentParser((ModelArguments, DataArguments, SFTConfig)) + self.yaml_file_path = "tests/fixtures/config_sft_full.yaml" + + def test_load_yaml(self): + model_args, data_args, training_args = self.parser.parse_yaml_file(os.path.abspath(self.yaml_file_path)) + self.assertEqual(model_args.model_name_or_path, "mistralai/Mistral-7B-v0.1") + + def test_load_yaml_and_args(self): + command_line_args = [ + "--model_name_or_path=test", + "--use_peft=true", + "--lora_r=16", + "--lora_dropout=0.5", + ] + model_args, data_args, training_args = self.parser.parse_yaml_and_args( + os.path.abspath(self.yaml_file_path), command_line_args + ) + self.assertEqual(model_args.model_name_or_path, "test") + self.assertEqual(model_args.use_peft, True) + self.assertEqual(model_args.lora_r, 16) + self.assertEqual(model_args.lora_dropout, 0.5) diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000000000000000000000000000000000000..bcf600f586bbcde791f04eb7647e86e3fb1593e5 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,196 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
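+# A note on the sizes asserted below: each testing dataset has 100 examples, so a
+# mixer such as {A: 0.5, B: 0.3, C: 0.2} subsamples the train splits to
+# 0.5*100 + 0.3*100 + 0.2*100 = 100 examples in total, while the test splits are
+# concatenated in full (3 * 100 = 300).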
+import unittest +from copy import deepcopy + +import pytest +from datasets import Dataset +from transformers import AutoTokenizer + +from alignment import DataArguments, ModelArguments, apply_chat_template, get_datasets, get_tokenizer +from alignment.data import maybe_insert_system_message + + +class GetDatasetsTest(unittest.TestCase): + """Each of these test datasets has 100 examples""" + + def test_loading_data_args(self): + dataset_mixer = { + "HuggingFaceH4/testing_alpaca_small": 0.5, + "HuggingFaceH4/testing_self_instruct_small": 0.3, + "HuggingFaceH4/testing_codealpaca_small": 0.2, + } + data_args = DataArguments(dataset_mixer=dataset_mixer) + datasets = get_datasets(data_args, columns_to_keep=["prompt", "completion"]) + self.assertEqual(len(datasets["train"]), 100) + self.assertEqual(len(datasets["test"]), 300) + + def test_loading_data_dict(self): + dataset_mixer = { + "HuggingFaceH4/testing_alpaca_small": 0.5, + "HuggingFaceH4/testing_self_instruct_small": 0.3, + "HuggingFaceH4/testing_codealpaca_small": 0.2, + } + datasets = get_datasets(dataset_mixer, columns_to_keep=["prompt", "completion"]) + self.assertEqual(len(datasets["train"]), 100) + self.assertEqual(len(datasets["test"]), 300) + + def test_loading_with_unit_fractions(self): + dataset_mixer = { + "HuggingFaceH4/testing_alpaca_small": 1.0, + "HuggingFaceH4/testing_self_instruct_small": 1.0, + "HuggingFaceH4/testing_codealpaca_small": 1.0, + } + datasets = get_datasets(dataset_mixer, columns_to_keep=["prompt", "completion"]) + self.assertEqual(len(datasets["train"]), 300) + self.assertEqual(len(datasets["test"]), 300) + + def test_loading_with_fractions_greater_than_unity(self): + dataset_mixer = { + "HuggingFaceH4/testing_alpaca_small": 0.7, + "HuggingFaceH4/testing_self_instruct_small": 0.4, + } + datasets = get_datasets(dataset_mixer, columns_to_keep=["prompt", "completion"]) + self.assertEqual(len(datasets["train"]), 70 + 40) + self.assertEqual(len(datasets["test"]), 200) + + def test_loading_fails_with_negative_fractions(self): + dataset_mixer = { + "HuggingFaceH4/testing_alpaca_small": 0.7, + "HuggingFaceH4/testing_self_instruct_small": -0.3, + } + with pytest.raises(ValueError, match=r"Dataset fractions cannot be negative."): + get_datasets(dataset_mixer, columns_to_keep=["prompt", "completion"]) + + def test_loading_single_split_with_unit_fractions(self): + dataset_mixer = { + "HuggingFaceH4/testing_alpaca_small": 1.0, + } + datasets = get_datasets(dataset_mixer, splits=["test"], columns_to_keep=["prompt", "completion"]) + self.assertEqual(len(datasets["test"]), 100) + self.assertRaises(KeyError, lambda: datasets["train"]) + + +class ApplyChatTemplateTest(unittest.TestCase): + def setUp(self): + model_args = ModelArguments(model_name_or_path="HuggingFaceH4/zephyr-7b-alpha") + data_args = DataArguments() + self.tokenizer = get_tokenizer(model_args, data_args) + self.dataset = Dataset.from_dict( + { + "prompt": ["Hello!"], + "messages": [ + [ + {"role": "system", "content": "You are a happy chatbot"}, + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Bonjour!"}, + {"role": "user", "content": "How are you?"}, + {"role": "assistant", "content": "I am doing well, thanks!"}, + ] + ], + "chosen": [ + [ + {"role": "system", "content": "You are a happy chatbot"}, + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Bonjour!"}, + {"role": "user", "content": "How are you?"}, + {"role": "assistant", "content": "I am doing well, thanks!"}, + ] + ], + "rejected": [ + [ + 
{"role": "system", "content": "You are a happy chatbot"}, + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Bonjour!"}, + {"role": "user", "content": "How are you?"}, + {"role": "assistant", "content": "Not so good tbh"}, + ] + ], + } + ) + + def test_maybe_insert_system_message(self): + # Chat template that does not accept system prompt. Use community checkpoint since it has no HF token requirement + tokenizer_sys_excl = AutoTokenizer.from_pretrained("mistral-community/Mistral-7B-Instruct-v0.3") + # Chat template that accepts system prompt + tokenizer_sys_incl = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") + messages_sys_excl = [{"role": "user", "content": "Tell me a joke."}] + messages_sys_incl = [{"role": "system", "content": ""}, {"role": "user", "content": "Tell me a joke."}] + + messages_proc_excl = deepcopy(messages_sys_excl) + message_proc_incl = deepcopy(messages_sys_excl) + maybe_insert_system_message(messages_proc_excl, tokenizer_sys_excl) + maybe_insert_system_message(message_proc_incl, tokenizer_sys_incl) + + # output from mistral should not have a system message, output from llama should + self.assertEqual(messages_proc_excl, messages_sys_excl) + self.assertEqual(message_proc_incl, messages_sys_incl) + + def test_sft(self): + dataset = self.dataset.map( + apply_chat_template, + fn_kwargs={"tokenizer": self.tokenizer, "task": "sft"}, + remove_columns=self.dataset.column_names, + ) + self.assertDictEqual( + dataset[0], + { + "text": "<|system|>\nYou are a happy chatbot\n<|user|>\nHello!\n<|assistant|>\nBonjour!\n<|user|>\nHow are you?\n<|assistant|>\nI am doing well, thanks!\n" + }, + ) + + def test_generation(self): + # Remove last turn from messages + dataset = self.dataset.map(lambda x: {"messages": x["messages"][:-1]}) + dataset = dataset.map( + apply_chat_template, + fn_kwargs={"tokenizer": self.tokenizer, "task": "generation"}, + remove_columns=self.dataset.column_names, + ) + self.assertDictEqual( + dataset[0], + { + "text": "<|system|>\nYou are a happy chatbot\n<|user|>\nHello!\n<|assistant|>\nBonjour!\n<|user|>\nHow are you?\n<|assistant|>\n" + }, + ) + + def test_rm(self): + dataset = self.dataset.map( + apply_chat_template, + fn_kwargs={"tokenizer": self.tokenizer, "task": "rm"}, + remove_columns=self.dataset.column_names, + ) + self.assertDictEqual( + dataset[0], + { + "text_chosen": "<|system|>\nYou are a happy chatbot\n<|user|>\nHello!\n<|assistant|>\nBonjour!\n<|user|>\nHow are you?\n<|assistant|>\nI am doing well, thanks!\n", + "text_rejected": "<|system|>\nYou are a happy chatbot\n<|user|>\nHello!\n<|assistant|>\nBonjour!\n<|user|>\nHow are you?\n<|assistant|>\nNot so good tbh\n", + }, + ) + + def test_dpo(self): + dataset = self.dataset.map( + apply_chat_template, + fn_kwargs={"tokenizer": self.tokenizer, "task": "dpo"}, + remove_columns=self.dataset.column_names, + ) + self.assertDictEqual( + dataset[0], + { + "text_prompt": "<|system|>\nYou are a happy chatbot\n<|user|>\nHello!\n<|assistant|>\nBonjour!\n<|user|>\nHow are you?\n", + "text_chosen": "<|assistant|>\nI am doing well, thanks!\n", + "text_rejected": "<|assistant|>\nNot so good tbh\n", + }, + ) diff --git a/tests/test_decontaminate.py b/tests/test_decontaminate.py new file mode 100644 index 0000000000000000000000000000000000000000..334501ef44008a1fc7a88430a84825770b11f48e --- /dev/null +++ b/tests/test_decontaminate.py @@ -0,0 +1,48 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from unittest import TestCase
+
+from datasets import Dataset
+from transformers import AutoTokenizer
+
+from alignment import apply_chat_template, decontaminate_humaneval
+
+
+class DecontaminateHumanEvalTest(TestCase):
+    """Test that we decontaminate HumanEval samples correctly"""
+
+    def setUp(self) -> None:
+        # Create a dataset with a HumanEval sample wrapped in some fake text
+        dataset = Dataset.from_dict(
+            {
+                "messages": [
+                    [{"content": "Hello", "role": "user"}],
+                    [
+                        {
+                            "content": 'Hello, I am\nfrom\n\n typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n """ Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n """\n',
+                            "role": "assistant",
+                        }
+                    ],
+                ]
+            }
+        )
+        tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+        self.dataset = dataset.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer, "task": "sft"})
+
+    def test_decontamination(self):
+        """Test that we decontaminate HumanEval samples correctly"""
+        decontaminated_dataset = self.dataset.filter(decontaminate_humaneval, batched=True)
+        # Check that we recover just the first message
+        self.assertEqual(decontaminated_dataset[0]["text"], self.dataset[0]["text"])
diff --git a/tests/test_model_utils.py b/tests/test_model_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..16ada9236c2e0303c3e2c1f44bc3cd7db65eeb65
--- /dev/null
+++ b/tests/test_model_utils.py
@@ -0,0 +1,88 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
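+# The quantization tests below index the config like a dict; a sketch of the expected
+# 4-bit shape (key names inferred from the assertions here, not from the bitsandbytes
+# API itself):
+#
+#     expected = {
+#         "load_in_4bit": True,
+#         "bnb_4bit_compute_dtype": "float16",
+#         "bnb_4bit_quant_type": "nf4",
+#         "bnb_4bit_use_double_quant": False,
+#     }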
+import unittest + +from alignment import ( + DataArguments, + ModelArguments, + get_peft_config, + get_quantization_config, + get_tokenizer, + is_adapter_model, +) +from alignment.data import DEFAULT_CHAT_TEMPLATE + + +class GetQuantizationConfigTest(unittest.TestCase): + def test_4bit(self): + model_args = ModelArguments(load_in_4bit=True) + quantization_config = get_quantization_config(model_args) + self.assertTrue(quantization_config["load_in_4bit"]) + self.assertEqual(quantization_config["bnb_4bit_compute_dtype"], "float16") + self.assertEqual(quantization_config["bnb_4bit_quant_type"], "nf4") + self.assertFalse(quantization_config["bnb_4bit_use_double_quant"]) + + def test_8bit(self): + model_args = ModelArguments(load_in_8bit=True) + quantization_config = get_quantization_config(model_args) + self.assertTrue(quantization_config["load_in_8bit"]) + + def test_no_quantization(self): + model_args = ModelArguments() + quantization_config = get_quantization_config(model_args) + self.assertIsNone(quantization_config) + + +class GetTokenizerTest(unittest.TestCase): + def setUp(self) -> None: + self.model_args = ModelArguments(model_name_or_path="HuggingFaceH4/zephyr-7b-alpha") + + def test_right_truncation_side(self): + tokenizer = get_tokenizer(self.model_args, DataArguments(truncation_side="right")) + self.assertEqual(tokenizer.truncation_side, "right") + + def test_left_truncation_side(self): + tokenizer = get_tokenizer(self.model_args, DataArguments(truncation_side="left")) + self.assertEqual(tokenizer.truncation_side, "left") + + def test_default_chat_template(self): + tokenizer = get_tokenizer(self.model_args, DataArguments()) + self.assertEqual(tokenizer.chat_template, DEFAULT_CHAT_TEMPLATE) + + def test_chatml_chat_template(self): + chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" + tokenizer = get_tokenizer(self.model_args, DataArguments(chat_template=chat_template)) + self.assertEqual(tokenizer.chat_template, chat_template) + + +class GetPeftConfigTest(unittest.TestCase): + def test_peft_config(self): + model_args = ModelArguments(use_peft=True, lora_r=42, lora_alpha=0.66, lora_dropout=0.99) + peft_config = get_peft_config(model_args) + self.assertEqual(peft_config.r, 42) + self.assertEqual(peft_config.lora_alpha, 0.66) + self.assertEqual(peft_config.lora_dropout, 0.99) + + def test_no_peft_config(self): + model_args = ModelArguments(use_peft=False) + peft_config = get_peft_config(model_args) + self.assertIsNone(peft_config) + + +class IsAdapterModelTest(unittest.TestCase): + def test_is_adapter_model_calls_listdir(self): + # Assert that for an invalid repo name it gets to the point where it calls os.listdir, + # which is expected to raise a FileNotFoundError + self.assertRaises(FileNotFoundError, is_adapter_model, "nonexistent/model") diff --git a/trl_012_grpo/__init__.py b/trl_012_grpo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/trl_012_grpo/grpo_config.py b/trl_012_grpo/grpo_config.py new file mode 100644 index 0000000000000000000000000000000000000000..cd6cc9174892e676b08544679ccaef3c93214b5f --- /dev/null +++ b/trl_012_grpo/grpo_config.py @@ -0,0 +1,251 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Optional + +from transformers import TrainingArguments + + +@dataclass +class GRPOConfig(TrainingArguments): + r""" + Configuration class for the [`GRPOTrainer`]. + + Only the parameters specific to GRPO training are listed here. For details on other parameters, refer to the + [`~transformers.TrainingArguments`] documentation. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + > Parameters that control the model and reference model + + model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`): + Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model` + argument of the [`GRPOTrainer`] is provided as a string. + + > Parameters that control the data preprocessing + + remove_unused_columns (`bool`, *optional*, defaults to `False`): + Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that + requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`. + max_prompt_length (`int` or `None`, *optional*, defaults to `512`): + Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left. + num_generations (`int` or `None`, *optional*, defaults to `8`): + Number of generations per prompt to sample. The global batch size (num_processes * per_device_batch_size) + must be divisible by this value. + temperature (`float`, *optional*, defaults to `0.9`): + Temperature for sampling. The higher the temperature, the more random the completions. + max_completion_length (`int` or `None`, *optional*, defaults to `256`): + Maximum length of the generated completion. + ds3_gather_for_generation (`bool`, *optional*, defaults to `True`): + This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation, + improving generation speed. However, disabling this option allows training models that exceed the VRAM + capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible + with vLLM generation. + + > Parameters that control generation acceleration powered by vLLM + + use_vllm (`bool`, *optional*, defaults to `False`): + Whether to use vLLM for generating completions. If set to `True`, ensure that a GPU is kept unused for + training, as vLLM will require one for generation. vLLM must be installed (`pip install vllm`). + vllm_device (`str`, *optional*, defaults to `"auto"`): + Device where vLLM generation will run, e.g. `"cuda:1"`. If set to `"auto"` (default), the system will + automatically select the next available GPU after the last one used for training. This assumes that + training has not already occupied all available GPUs. 
If only one device is available, the device will be
+            shared between both training and vLLM.
+        vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.9`):
+            Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the
+            device dedicated to generation powered by vLLM. Higher values will increase the KV cache size and thus
+            improve the model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors
+            during initialization.
+        vllm_dtype (`str`, *optional*, defaults to `"auto"`):
+            Data type to use for vLLM generation. If set to `"auto"`, the data type will be automatically determined
+            based on the model configuration. Find the supported values in the vLLM documentation.
+        vllm_max_model_len (`int` or `None`, *optional*, defaults to `None`):
+            If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced
+            `vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model
+            context size, which might be much larger than the KV cache, leading to inefficiencies.
+
+        > Parameters that control the training
+
+        learning_rate (`float`, *optional*, defaults to `1e-6`):
+            Initial learning rate for [`AdamW`] optimizer. The default value replaces that of
+            [`~transformers.TrainingArguments`].
+        beta (`float`, *optional*, defaults to `0.04`):
+            KL coefficient.
+        reward_weights (`list[float]` or `None`, *optional*, defaults to `None`):
+            Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are
+            weighted equally with weight `1.0`.
+        sync_ref_model (`bool`, *optional*, defaults to `False`):
+            Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using
+            the `ref_model_mixup_alpha` parameter. This synchronization originates from the
+            [TR-DPO](https://huggingface.co/papers/2404.09656) paper.
+        ref_model_mixup_alpha (`float`, *optional*, defaults to `0.9`):
+            α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix
+            between the current policy and the previous reference policy during updates. The reference policy is
+            updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you
+            must set `sync_ref_model=True`.
+        ref_model_sync_steps (`int`, *optional*, defaults to `64`):
+            τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how
+            frequently the current policy is synchronized with the reference policy. To use this parameter, you must
+            set `sync_ref_model=True`.
+
+        > Parameters that control the logging
+
+        log_completions (`bool`, *optional*, defaults to `False`):
+            Whether to log the completions during training.
+    """
+
+    # Parameters that control the model and reference model
+    model_init_kwargs: Optional[dict] = field(
+        default=None,
+        metadata={
+            "help": "Keyword arguments for `transformers.AutoModelForCausalLM.from_pretrained`, used when the `model` "
+            "argument of the `GRPOTrainer` is provided as a string."
+        },
+    )
+
+    # Parameters that control the data preprocessing
+    # The default value remove_unused_columns is overwritten from the parent class, because in GRPO we usually rely on
+    # additional columns to compute the reward
+    remove_unused_columns: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "Whether to only keep the column 'prompt' in the dataset. 
If you use a custom reward function " + "that requires any column other than 'prompts' and 'completions', you should keep this to `False`." + }, + ) + max_prompt_length: Optional[int] = field( + default=512, + metadata={ + "help": "Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left." + }, + ) + num_generations: Optional[int] = field( + default=8, + metadata={ + "help": "Number of generations to sample. The global batch size (num_processes * per_device_batch_size) " + "must be divisible by this value." + }, + ) + temperature: Optional[float] = field( + default=0.9, + metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."}, + ) + max_completion_length: Optional[int] = field( + default=256, + metadata={"help": "Maximum length of the generated completion."}, + ) + ds3_gather_for_generation: bool = field( + default=True, + metadata={ + "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for " + "generation, improving generation speed. However, disabling this option allows training models that " + "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option " + "is not compatible with vLLM generation." + }, + ) + + # Parameters that control generation acceleration powered by vLLM + use_vllm: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether to use vLLM for generating completions. If set to `True`, ensure that a GPU is kept " + "unused for training, as vLLM will require one for generation. vLLM must be installed " + "(`pip install vllm`)." + }, + ) + vllm_device: Optional[str] = field( + default="auto", + metadata={ + "help": "Device where vLLM generation will run, e.g. 'cuda:1'. If set to 'auto' (default), the system " + "will automatically select the next available GPU after the last one used for training. This assumes " + "that training has not already occupied all available GPUs." + }, + ) + vllm_gpu_memory_utilization: float = field( + default=0.9, + metadata={ + "help": "Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV " + "cache on the device dedicated to generation powered by vLLM. Higher values will increase the KV cache " + "size and thus improve the model's throughput. However, if the value is too high, it may cause " + "out-of-memory (OOM) errors during initialization." + }, + ) + vllm_dtype: Optional[str] = field( + default="auto", + metadata={ + "help": "Data type to use for vLLM generation. If set to 'auto', the data type will be automatically " + "determined based on the model configuration. Find the supported values in the vLLM documentation." + }, + ) + vllm_max_model_len: Optional[int] = field( + default=None, + metadata={ + "help": "If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced " + "`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model " + "context size, which might be much larger than the KV cache, leading to inefficiencies." + }, + ) + + # Parameters that control the training + learning_rate: float = field( + default=1e-6, + metadata={ + "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of " + "`transformers.TrainingArguments`." 
+        },
+    )
+    beta: float = field(
+        default=0.04,
+        metadata={"help": "KL coefficient."},
+    )
+    reward_weights: Optional[list[float]] = field(
+        default=None,
+        metadata={
+            "help": "Weights for each reward function. Must match the number of reward functions. If `None`, all "
+            "rewards are weighted equally with weight `1.0`."
+        },
+    )
+    sync_ref_model: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether to synchronize the reference model with the active model every `ref_model_sync_steps` "
+            "steps, using the `ref_model_mixup_alpha` parameter."
+        },
+    )
+    ref_model_mixup_alpha: float = field(
+        default=0.9,
+        metadata={
+            "help": "α parameter from the TR-DPO paper, which controls the mix between the current policy and the "
+            "previous reference policy during updates. The reference policy is updated according to the equation: "
+            "`π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you must set `sync_ref_model=True`."
+        },
+    )
+    ref_model_sync_steps: int = field(
+        default=64,
+        metadata={
+            "help": "τ parameter from the TR-DPO paper, which determines how frequently the current policy is "
+            "synchronized with the reference policy. To use this parameter, you must set `sync_ref_model=True`."
+        },
+    )
+
+    # Parameters that control the logging
+    log_completions: bool = field(
+        default=False,
+        metadata={"help": "Whether to log the completions during training."},
+    )
diff --git a/trl_012_grpo/grpo_run.py b/trl_012_grpo/grpo_run.py
new file mode 100644
index 0000000000000000000000000000000000000000..b13735739e6a2c4e1422ee3a757430a76efff268
--- /dev/null
+++ b/trl_012_grpo/grpo_run.py
@@ -0,0 +1,42 @@
+# train_grpo.py
+# CUDA_VISIBLE_DEVICES=1,2,3,4 python grpo_run.py
+from datasets import load_dataset
+from trl.data_utils import maybe_apply_chat_template
+from transformers import (
+    AutoTokenizer,
+)
+from grpo_trainer import GRPOTrainer
+from grpo_config import GRPOConfig
+model_path = "qwen/Qwen2"
+# Load the dataset
+dataset = load_dataset("trl-lib/tldr", split="train", cache_dir='./')
+tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")
+# Example of a chat-templated record:
+# {'prompt': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nWhat color is the sky?<|im_end|>\n<|im_start|>assistant\n', 'completion': 'It is blue.<|im_end|>\n'}
+# Use .map() to transform the dataset in batches.
+"""
+Data format:
+{'prompt': str,
+ 'completion': list[str],
+ 'reward': list[float | int]}
+"""
+def format_prompt(example):
+    # Wrap the raw prompt in a user message and apply the chat template
+    example['prompt'] = maybe_apply_chat_template({"prompt": [{"role": "user", "content": example['prompt']}]}, tokenizer=tokenizer)['prompt']
+    return example
+# Apply the transformation
+dataset = dataset.map(format_prompt)
+print(dataset[0])
+
+
+# Define the reward function, which rewards completions that are close to 20 characters
+def reward_len(completions, **kwargs):
+    return [-abs(20 - len(completion)) for completion in completions]
+
+training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO", logging_steps=10, per_device_train_batch_size=5, num_generations=5)
+trainer = GRPOTrainer(
+    model=model_path,
+    reward_funcs=reward_len,
+    args=training_args,
+    train_dataset=dataset,
+)
+trainer.train()
\ No newline at end of file
diff --git a/trl_012_grpo/grpo_trainer.py b/trl_012_grpo/grpo_trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e095819b973ec836047328162e18a7f025a6f1d
--- /dev/null
+++ b/trl_012_grpo/grpo_trainer.py
@@ -0,0 +1,890 @@
+# Copyright 2025 The HuggingFace Team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import textwrap +import warnings +from collections import defaultdict +from typing import Any, Callable, Optional, Sized, Union +from unittest.mock import patch + +import torch +import torch.utils.data +import transformers +from accelerate.utils import broadcast_object_list, gather, gather_object, is_peft_model, set_seed +from accelerate.utils.other import is_compiled_module +from datasets import Dataset, IterableDataset +from packaging import version +from torch import nn +from torch.utils.data import Sampler +from transformers import ( + AutoModelForCausalLM, + AutoModelForSequenceClassification, + AutoTokenizer, + GenerationConfig, + PreTrainedModel, + PreTrainedTokenizerBase, + Trainer, + TrainerCallback, + is_wandb_available, +) +from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from transformers.utils import is_peft_available +from accelerate.utils import is_deepspeed_available + +from trl.data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template +from trl.models import create_reference_model, unwrap_model_for_generation +from trl.trainer.callbacks import SyncRefModelCallback +from trl_012_grpo.grpo_config import GRPOConfig + +from trl_012_grpo.utils import generate_model_card, get_comet_experiment_url, pad, selective_log_softmax +from copy import deepcopy + + +if is_peft_available(): + from peft import PeftConfig, get_peft_model + + +if is_wandb_available(): + import wandb + +# What we call a reward function is a callable that takes a list of prompts and completions and returns a list of +# rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model. 
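+# A minimal sketch of such a callable (mirroring `reward_len` in grpo_run.py; any extra
+# dataset columns are forwarded to the function as keyword arguments):
+#
+#     def reward_short(completions, **kwargs):
+#         return [-float(len(completion)) for completion in completions]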
+RewardFunc = Union[str, PreTrainedModel, Callable[[list, list], list[float]]] +if is_deepspeed_available(): + import deepspeed + +def prepare_deepspeed(model, accelerator): + # Adapted from accelerate: https://github.com/huggingface/accelerate/blob/739b135f8367becb67ffaada12fe76e3aa60fefd/src/accelerate/accelerator.py#L1473 + deepspeed_plugin = accelerator.state.deepspeed_plugin + config_kwargs = deepcopy(deepspeed_plugin.deepspeed_config) + stage = config_kwargs["zero_optimization"]["stage"] + + if model is not None: + hidden_size = ( + max(model.config.hidden_sizes) + if getattr(model.config, "hidden_sizes", None) + else getattr(model.config, "hidden_size", None) + ) + if hidden_size is not None and stage == 3: + # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache + # @ step 0: expected module 1, but got module 0` + # This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081 + config_kwargs.update( + { + "zero_optimization.reduce_bucket_size": hidden_size * hidden_size, + "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size, + "zero_optimization.stage3_prefetch_bucket_size": 0.9 * hidden_size * hidden_size, + } + ) + + # If ZeRO-3 is used, we shard both the active and reference model. + # Otherwise, we assume the reference model fits in memory and is initialized on each device with ZeRO + # disabled (stage 0) + if stage != 3: + config_kwargs["zero_optimization"]["stage"] = 0 + model, *_ = deepspeed.initialize(model=model, config=config_kwargs) + model.eval() + return model + +class RepeatRandomSampler(Sampler): + """ + Sampler that repeats the indices of a dataset N times. + + Args: + data_source (`Sized`): + Dataset to sample from. + repeat_count (`int`): + Number of times to repeat each index. + seed (`Optional[int]`): + Random seed for reproducibility (only affects this sampler). + + Example: + ```python + >>> sampler = RepeatRandomSampler(["a", "b", "c", "d"], repeat_count=2) + >>> list(sampler) + [2, 2, 0, 0, 3, 3, 1, 1] + ``` + """ + + def __init__(self, data_source: Sized, repeat_count: int, seed: Optional[int] = None): + self.data_source = data_source + self.repeat_count = repeat_count + self.num_samples = len(data_source) + self.seed = seed + self.generator = torch.Generator() # Create a local random generator + if seed is not None: + self.generator.manual_seed(seed) + + def __iter__(self): + indexes = [ + idx + for idx in torch.randperm(self.num_samples, generator=self.generator).tolist() + for _ in range(self.repeat_count) + ] + return iter(indexes) + + def __len__(self): + return self.num_samples * self.repeat_count + + +class GRPOTrainer(Trainer): + """ + Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the + paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300). + + Example: + + ```python + from datasets import load_dataset + from trl import GRPOTrainer + + dataset = load_dataset("trl-lib/tldr", split="train") + + def reward_func(completions, **kwargs): + # Dummy reward function that rewards completions with more unique letters. + return [float(len(set(completion))) for completion in completions] + + trainer = GRPOTrainer( + model="Qwen/Qwen2-0.5B-Instruct", + reward_funcs=reward_func, + train_dataset=dataset, + ) + + trainer.train() + ``` + + Args: + model (`Union[str, PreTrainedModel]`): + Model to be trained. 
Can be either:
+
+            - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or
+              a path to a *directory* containing model weights saved using
+              [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is
+              loaded using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments
+              in `args.model_init_kwargs`.
+            - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
+        reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
+            Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
+            functions with the prompts and completions and sum the rewards. Can be either:
+
+            - A single reward function, such as:
+                - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
+                  path to a *directory* containing model weights saved using
+                  [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
+                  using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
+                  keyword arguments in `args.model_init_kwargs`.
+                - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
+                - A custom reward function: The function is provided with the prompts and the generated completions,
+                  plus any additional columns in the dataset. It should return a list of rewards. For more details, see
+                  [Using a custom reward function](#using-a-custom-reward-function).
+            - A list of reward functions, where each item can independently be any of the above types. Mixing different
+              types within the list (e.g., a string model ID and a custom reward function) is allowed.
+        args ([`GRPOConfig`], *optional*, defaults to `None`):
+            Configuration for this trainer. If `None`, a default configuration is used.
+        train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
+            Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset are
+            ignored. The format of the samples can be either:
+
+            - [Standard](dataset_formats#standard): Each sample contains plain text.
+            - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
+              and content).
+        eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
+            Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
+        processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`):
+            Processing class used to process the data. The padding side must be set to "left". If `None`, the
+            processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`].
+        reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
+            Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
+
+            - A single processing class: Used when `reward_funcs` contains only one reward function.
+            - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
+            If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
+            `None`, the tokenizer for the model is automatically loaded using [`~transformers.AutoTokenizer.from_pretrained`].
+            For elements in `reward_funcs` that are custom reward functions (not [`~transformers.PreTrainedModel`]),
+            the corresponding entries in `reward_processing_classes` are ignored.
+        callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
+            List of callbacks to customize the training loop. Will add those to the list of default callbacks
+            detailed [here](https://huggingface.co/docs/transformers/main_classes/callback).
+
+            If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
+            method.
+        optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
+            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
+            model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
+        peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
+            PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
+    """
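+    # Editorial sketch (hypothetical values, not part of the original file): mixing a reward-model ID with a
+    # custom function and weighting them via `GRPOConfig.reward_weights`:
+    #
+    #     args = GRPOConfig(output_dir="out", reward_weights=[0.8, 0.2])
+    #     trainer = GRPOTrainer(
+    #         model="Qwen/Qwen2-0.5B-Instruct",
+    #         reward_funcs=["my-org/my-reward-model", reward_len],  # "my-org/my-reward-model" is hypothetical
+    #         args=args,
+    #         train_dataset=dataset,
+    #     )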
+
+    _tag_names = ["trl", "grpo"]
+
+    def __init__(
+        self,
+        model: Union[str, PreTrainedModel],
+        reward_funcs: Union[RewardFunc, list[RewardFunc]],
+        args: GRPOConfig = None,
+        train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
+        eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None,
+        processing_class: Optional[PreTrainedTokenizerBase] = None,
+        reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None),
+        peft_config: Optional["PeftConfig"] = None,
+    ):
+        print("Using GRPO in the offline setting...")
+        # Args
+        if args is None:
+            model_name = model if isinstance(model, str) else model.config._name_or_path
+            model_name = model_name.split("/")[-1]
+            args = GRPOConfig(f"{model_name}-GRPO")
+
+        # Models
+        # Trained model
+        model_init_kwargs = args.model_init_kwargs or {}
+        if isinstance(model, str):
+            model_id = model
+            torch_dtype = model_init_kwargs.get("torch_dtype")
+            if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None:
+                pass  # torch_dtype is already a torch.dtype or "auto" or None
+            elif isinstance(torch_dtype, str):  # it's a str, but not "auto"
+                torch_dtype = getattr(torch, torch_dtype)
+                model_init_kwargs["torch_dtype"] = torch_dtype
+            else:
+                raise ValueError(
+                    "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
+                    f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}."
+                )
+            # Disable caching if gradient checkpointing is enabled (not supported)
+            model_init_kwargs["use_cache"] = (
+                False if args.gradient_checkpointing else model_init_kwargs.get("use_cache")
+            )
+            model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
+        else:
+            model_id = model.config._name_or_path
+            if args.model_init_kwargs is not None:
+                raise ValueError(
+                    "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. "
+                    "This argument can only be used when the `model` argument is a string."
+                )
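+        # Editorial note: per the branch above, `model_init_kwargs` accepts `torch_dtype` either as a string
+        # or as a `torch.dtype`; e.g. both of these hypothetical configs resolve to `torch.bfloat16`:
+        #
+        #     GRPOConfig(output_dir="out", model_init_kwargs={"torch_dtype": "bfloat16"})
+        #     GRPOConfig(output_dir="out", model_init_kwargs={"torch_dtype": torch.bfloat16})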
+
+        if peft_config is not None:
+            model = get_peft_model(model, peft_config)
+
+        # Reference model
+        if is_deepspeed_zero3_enabled():
+            self.ref_model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs)
+        elif not is_peft_model(model):
+            # If PEFT configuration is not provided, create a reference model based on the initial model.
+            self.ref_model = create_reference_model(model)
+        else:
+            # If PEFT is used, the reference model is not needed since the adapter can be disabled
+            # to revert to the initial model.
+            self.ref_model = None
+
+        # Processing class
+        if processing_class is None:
+            processing_class = AutoTokenizer.from_pretrained(model.config._name_or_path, padding_side="left")
+
+        # Reward functions
+        if not isinstance(reward_funcs, list):
+            reward_funcs = [reward_funcs]
+        for i, reward_func in enumerate(reward_funcs):
+            if isinstance(reward_func, str):
+                reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
+                    reward_func, num_labels=1, **model_init_kwargs
+                )
+        self.reward_funcs = reward_funcs
+
+        # Reward weights
+        if args.reward_weights is not None:
+            if len(args.reward_weights) != len(reward_funcs):
+                raise ValueError(
+                    f"Number of reward weights ({len(args.reward_weights)}) must match number of reward "
+                    f"functions ({len(reward_funcs)})"
+                )
+            self.reward_weights = torch.tensor(args.reward_weights, dtype=torch.float32)
+        else:
+            self.reward_weights = torch.ones(len(reward_funcs), dtype=torch.float32)
+
+        # Reward processing class
+        if reward_processing_classes is None:
+            reward_processing_classes = [None] * len(reward_funcs)
+        elif not isinstance(reward_processing_classes, list):
+            reward_processing_classes = [reward_processing_classes]
+        else:
+            if len(reward_processing_classes) != len(reward_funcs):
+                raise ValueError("The number of reward processing classes must match the number of reward functions.")
+
+        for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)):
+            if isinstance(reward_func, PreTrainedModel):
+                if reward_processing_class is None:
+                    reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path)
+                if reward_processing_class.pad_token_id is None:
+                    reward_processing_class.pad_token = reward_processing_class.eos_token
+                # The reward model computes the reward for the latest non-padded token in the input sequence.
+                # So it's important to set the pad token ID to the padding token ID of the processing class.
+                reward_func.config.pad_token_id = reward_processing_class.pad_token_id
+                reward_processing_classes[i] = reward_processing_class
+        self.reward_processing_classes = reward_processing_classes
+
+        # Data collator
+        def data_collator(features):  # No data collation is needed in GRPO
+            return features
+
+        # Training arguments
+        self.max_prompt_length = args.max_prompt_length
+        self.max_completion_length = args.max_completion_length  # = |o_i| in the GRPO paper
+        self.num_generations = args.num_generations  # = G in the GRPO paper
+        self.use_vllm = args.use_vllm
+
+        self.beta = args.beta
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the
+        # "input_ids" key. Instead, the only available key is "prompt". As a result, the trainer issues the warning:
+        # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
+        # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
+        # This acts as a flag to indicate that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        # Initialize the metrics
+        self._metrics = defaultdict(list)
+        self.log_completions = args.log_completions
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            callbacks=callbacks,
+            optimizers=optimizers,
+        )
+
+        # Check if the per_device_train/eval_batch_size * num processes can be divided by the number of generations
+        num_processes = self.accelerator.num_processes
+        global_batch_size = args.per_device_train_batch_size * num_processes
+        possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0]
+        if self.num_generations not in possible_values:
+            raise ValueError(
+                f"The global train batch size ({num_processes} x {args.per_device_train_batch_size}) must be evenly "
+                f"divisible by the number of generations per prompt ({self.num_generations}). Given the current train "
+                f"batch size, the valid values for the number of generations are: {possible_values}."
+            )
+        if self.args.eval_strategy != "no":
+            global_batch_size = args.per_device_eval_batch_size * num_processes
+            possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0]
+            if self.num_generations not in possible_values:
+                raise ValueError(
+                    f"The global eval batch size ({num_processes} x {args.per_device_eval_batch_size}) must be evenly "
+                    f"divisible by the number of generations per prompt ({self.num_generations}). Given the current "
+                    f"eval batch size, the valid values for the number of generations are: {possible_values}."
+                )
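+        # Worked example (illustrative): with 4 processes and `per_device_train_batch_size=5`, the global batch
+        # size is 20, so the valid values for `num_generations` are 2, 4, 5, 10 and 20; `num_generations=3`
+        # would trigger the error above because the groups would not tile the batch evenly.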
+
+        # Ensure each process receives a unique seed to prevent duplicate completions when generating with
+        # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but
+        # it's safer to set it in all cases.
+        set_seed(args.seed, device_specific=True)
+
+        # In the offline setting there is no need to sample with vLLM
+        # if self.use_vllm:
+        #     if not is_vllm_available():
+        #         raise ImportError(
+        #             "vLLM is not available and `use_vllm` is set to True. Please install vLLM with "
+        #             "`pip install vllm` to use it."
+        #         )
+        #
+        #     if self.accelerator.is_main_process:
+        #         vllm_device = self.args.vllm_device
+        #         if vllm_device == "auto":
+        #             if torch.cuda.device_count() == 1:
+        #                 vllm_device = "cuda:0"  # particular case when training with only 1 GPU: share it
+        #             else:
+        #                 vllm_device = f"cuda:{self.accelerator.num_processes}"  # take the next GPU idx
+        #         # Check that the requested device is available
+        #         if vllm_device.split(":")[0] == "cuda" and int(vllm_device.split(":")[1]) >= torch.cuda.device_count():
+        #             raise ValueError(
+        #                 f"The requested device for vllm ({vllm_device}) is not available. You are likely using vLLM "
+        #                 "without restricting the number of GPUs for training. Set the `--num_processes` argument to a "
+        #                 "value lower than the number of GPUs available on your machine; typically, reducing it by one "
+        #                 f"is sufficient. In your case: `--num_processes {torch.cuda.device_count() - 1}`."
+ # ) + # # Check that the requested device is not also used for training + # if vllm_device in {f"cuda:{idx}" for idx in range(self.accelerator.num_processes)}: + # warnings.warn( + # f"The requested device {vllm_device} is also being used for training. For higher throughput " + # "and to avoid out-of-memory errors, it is recommended to use a dedicated device for vLLM. " + # "If this is intentional, you may ignore this warning but should adjust " + # "`vllm_gpu_memory_utilization` accordingly." + # ) + # # vLLM is not compatible with accelerate. So we need to patch it to make sure we can (1) place the vLLM + # # model on the desired device (world_size_patch) and (2) avoid a test that is not designed for our + # # setting (profiling_patch). + # world_size_patch = patch("torch.distributed.get_world_size", return_value=1) + # profiling_patch = patch( + # "vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling", return_value=None + # ) + # with world_size_patch, profiling_patch: + # self.llm = LLM( + # model=model.name_or_path, + # device=vllm_device, + # gpu_memory_utilization=self.args.vllm_gpu_memory_utilization, + # dtype=self.args.vllm_dtype, + # # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can + # # directly reuse the KV cache if it shares the same prefix with one of the existing queries. + # # This is particularly useful here because we generate completions from the same prompts. + # enable_prefix_caching=True, + # max_model_len=self.args.vllm_max_model_len, + # ) + # self.sampling_params = SamplingParams( + # temperature=args.temperature, + # max_tokens=self.max_completion_length, + # ) + # + # self._last_loaded_step = 0 # tag to avoid useless loading during grad accumulation + # + # # When using vLLM, the main process is responsible for loading the model weights. This can cause process + # # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we + # # synchronize all processes after vLLM has been fully initialized. + # self.accelerator.wait_for_everyone() + # else: + # self.generation_config = GenerationConfig( + # max_new_tokens=self.max_completion_length, + # do_sample=True, + # temperature=args.temperature, + # pad_token_id=processing_class.pad_token_id, + # ) + + # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the + # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set + # self.model_accepts_loss_kwargs to False to enable scaling. + self.model_accepts_loss_kwargs = False + + # Add tags to the model + self.model.add_model_tags(self._tag_names) + + if self.ref_model is not None: + if self.is_deepspeed_enabled: + self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator) + else: + self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True) + + if args.sync_ref_model: + self.add_callback(SyncRefModelCallback(ref_model=self.ref_model, accelerator=self.accelerator)) + + for i, reward_func in enumerate(self.reward_funcs): + if isinstance(reward_func, PreTrainedModel): + self.reward_funcs[i] = self.accelerator.prepare_model(reward_func, evaluation_mode=True) + + def _set_signature_columns_if_needed(self): + # If `self.args.remove_unused_columns` is True, non-signature columns are removed. + # By default, this method sets `self._signature_columns` to the model's expected inputs. 
+ # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work. + # Instead, we set them to the columns expected by the `training_step` method, hence the override. + if self._signature_columns is None: + self._signature_columns = ["prompt"] + + def _get_train_sampler(self) -> Sampler: + # Returns a sampler that ensures each prompt is repeated across multiple processes. This guarantees that + # identical prompts are distributed to different GPUs, allowing rewards to be computed and normalized correctly + # within each prompt group. Using the same seed across processes ensures consistent prompt assignment, + # preventing discrepancies in group formation. + return RepeatRandomSampler(self.train_dataset, self.num_generations, seed=self.args.seed) + + def _get_eval_sampler(self, eval_dataset) -> Sampler: + # Returns a sampler that ensures each prompt is repeated across multiple processes. This guarantees that + # identical prompts are distributed to different GPUs, allowing rewards to be computed and normalized correctly + # within each prompt group. Using the same seed across processes ensures consistent prompt assignment, + # preventing discrepancies in group formation. + return RepeatRandomSampler(eval_dataset, self.num_generations, seed=self.args.seed) + + # Get the per-token log probabilities for the completions for the model and the reference model + def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep): + # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded + logits = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits + logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + + input_ids = input_ids[:, -logits_to_keep:] + # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves. + # See https://github.com/huggingface/trl/issues/2770 + logits = logits[:, -logits_to_keep:] + return selective_log_softmax(logits, input_ids) # compute logprobs for the input tokens + + def _move_model_to_vllm(self): + with unwrap_model_for_generation( + self.model, self.accelerator, gather_deepspeed3_params=self.args.ds3_gather_for_generation + ) as unwrapped_model: + if is_compiled_module(unwrapped_model): + unwrapped_model = unwrapped_model._orig_mod + if is_peft_model(unwrapped_model): + unwrapped_model.merge_adapter() + state_dict = unwrapped_model.state_dict() + # Remove base_model and base_layer prefixes + state_dict = { + k.removeprefix("base_model.model.").replace(".base_layer", ""): v for k, v in state_dict.items() + } + # Remove values with adapter prefix (example: "_lora") + state_dict = {k: v for k, v in state_dict.items() if unwrapped_model.prefix not in k} + # When module to save, remove its prefix and discard the original module + state_dict = { + k.replace("modules_to_save.default.", ""): v + for k, v in state_dict.items() + if "original_module" not in k + } + else: + state_dict = unwrapped_model.state_dict() + if self.accelerator.is_main_process: + llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model + llm_model.load_weights(state_dict.items()) + # Unmerge the adapter to restore the model to its original state. + # This must be done after loading weights to ensure they correspond to the merged state. 
+            if is_peft_model(unwrapped_model):
+                unwrapped_model.unmerge_adapter()
+
+    def _prepare_inputs(self, inputs: dict[str, Union[torch.Tensor, Any]]) -> dict[str, Union[torch.Tensor, Any]]:
+        # original input: {'prompt': "...", 'completion': "..."}
+        # modified input: {'prompt': "...", 'completion': ['answer_1', 'answer_2', ...], 'reward': [reward_1, reward_2, ...]}
+        # input: [device_num * sample_num, max_prompt_len]
+        device = self.accelerator.device
+        prompts = [x["prompt"] for x in inputs]  # the prompts have already been repeated num_generations times by this point
+        prompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs]  # the data format is unchanged here
+        prompt_inputs = self.processing_class(  # processing_class is simply the tokenizer; converts text to ids
+            prompts_text, return_tensors="pt", padding=True, padding_side="left", add_special_tokens=False
+        )
+        prompt_inputs = super()._prepare_inputs(prompt_inputs)  # now ids, with the attention_mask padding zeros on the left
+        prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"]
+
+        # the completions were repeated num_generations times along with the prompts, so deduplicate
+        completion_list, prompt_list, reward_list = [], [], []
+        for x in inputs:
+            if x["prompt"] not in prompt_list:
+                completion_list += x["completion"]
+                reward_list += x["reward"]
+                prompt_list.append(x["prompt"])
+        # Convert the input completion_list to ids
+        completion_list_text = [completion + self.processing_class.eos_token for completion in completion_list]
+        completion_list = self.processing_class(
+            completion_list_text, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False
+        )
+        completion_list = super()._prepare_inputs(completion_list)
+        completion_ids, completion_mask = completion_list["input_ids"], completion_list["attention_mask"]
+
+        if self.max_prompt_length is not None:  # truncate prompts that exceed the configured max length
+            prompt_ids = prompt_ids[:, -self.max_prompt_length :]
+            prompt_mask = prompt_mask[:, -self.max_prompt_length :]
+
+        prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
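+        # Illustrative example (hypothetical values) of one offline input row consumed by this method, after
+        # the sampler has repeated it num_generations times; with num_generations=2:
+        #
+        #     {"prompt": "Summarize: ...", "completion": ["answer_1", "answer_2"], "reward": [1.0, -0.5]}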
+        # # Generate completions using either vLLM or regular generation
+        # if self.args.use_vllm:
+        #     # First, have main process load weights if needed
+        #     if self.state.global_step != self._last_loaded_step:  # analogous to policy-model sampling
+        #         self._move_model_to_vllm()
+        #         self._last_loaded_step = self.state.global_step
+        #
+        #     # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
+        #     all_prompts_text = gather_object(prompts_text)
+        #     if self.accelerator.is_main_process:
+        #         outputs = self.llm.generate(all_prompts_text, sampling_params=self.sampling_params, use_tqdm=False)
+        #         completion_ids = [out.token_ids for completions in outputs for out in completions.outputs]
+        #     else:
+        #         completion_ids = [None] * len(all_prompts_text)
+        #     # Broadcast the completions from the main process to all processes, ensuring each process receives its
+        #     # corresponding slice.
+        #     completion_ids = broadcast_object_list(completion_ids, from_process=0)
+        #     process_slice = slice(
+        #         self.accelerator.process_index * len(prompts),
+        #         (self.accelerator.process_index + 1) * len(prompts),
+        #     )
+        #     completion_ids = completion_ids[process_slice]
+
+        #     # Pad the completions, and concatenate them with the prompts
+        #     completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
+        #     completion_ids = pad(completion_ids, padding_value=self.processing_class.pad_token_id)
+        #     prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        # else:
+        #     # Regular generation path
+        #     with unwrap_model_for_generation(self.model, self.accelerator) as unwrapped_model:
+        #         prompt_completion_ids = unwrapped_model.generate(
+        #             prompt_ids, attention_mask=prompt_mask, generation_config=self.generation_config
+        #         )
+
+        #     # Compute prompt length and extract completion ids
+        #     prompt_length = prompt_ids.size(1)
+        #     prompt_ids = prompt_completion_ids[:, :prompt_length]
+        #     completion_ids = prompt_completion_ids[:, prompt_length:]
+
+        # Mask everything after the first EOS token
+        # Recomputing completion_mask here is technically redundant (it was already computed above); kept as a safeguard
+        is_eos = completion_ids == self.processing_class.eos_token_id
+        eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
+        eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
+        sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
+        completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
+
+        # Concatenate prompt_mask with completion_mask for logit computation
+        # (used to get the reference model's distribution over the completions)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)  # (B*G, P+C)
+
+        logits_to_keep = completion_ids.size(1)  # we only need to compute the logits for the completion tokens
+
+        with torch.inference_mode():
+            if self.ref_model is not None:
+                ref_per_token_logps = self._get_per_token_logps(
+                    self.ref_model, prompt_completion_ids, attention_mask, logits_to_keep
+                )
+            else:
+                with self.accelerator.unwrap_model(self.model).disable_adapter():
+                    ref_per_token_logps = self._get_per_token_logps(
+                        self.model, prompt_completion_ids, attention_mask, logits_to_keep
+                    )
+
+        # Decode the generated completions
+        # Re-wrapping completions in conversational format was for scoring with a reward model; the offline setting no longer needs it
+        completions_text = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True)
+        # if is_conversational(inputs[0]):
+        #     completions = []
+        #     for prompt, completion in zip(prompts, completions_text):
+        #         bootstrap = prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
+        #         completions.append([{"role": "assistant", "content": bootstrap + completion}])
+        # else:
+        #     completions = completions_text
+        #
+        # rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device)
+        # for i, (reward_func, reward_processing_class) in enumerate(
+        #     zip(self.reward_funcs, self.reward_processing_classes)
+        # ):
+        #     if isinstance(reward_func, nn.Module):  # Module instead of PretrainedModel for compat with compiled models
+        #         if is_conversational(inputs[0]):
+        #             messages = [{"messages": p + c} for p, c in zip(prompts, completions)]
+        #             texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages]
+        #         else:
+        #             texts = [p + c for p, c in zip(prompts, completions)]
+        #         reward_inputs = reward_processing_class(
+        #             texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False
+        #         )
+        #         reward_inputs = super()._prepare_inputs(reward_inputs)
+        #         with torch.inference_mode():
+        #             rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0]  # Shape (B*G,)
+        #     else:
+        #         # Repeat all input columns (but "prompt" and "completion") to match the number of generations
+        #         keys = [key for key in inputs[0] if key not in ["prompt", "completion"]]
+        #         reward_kwargs = {key: [example[key] for example in inputs] for key in keys}
+        #         output_reward_func = reward_func(prompts=prompts, completions=completions, **reward_kwargs)
+        #         rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device)
+        #
+        # # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
+        # # completions may be distributed across processes
+        # rewards_per_func = gather(rewards_per_func)
+
+        # Apply weights to each reward function's output and sum
+        # rewards = (rewards_per_func * self.reward_weights.to(device).unsqueeze(0)).sum(dim=1)
+        rewards = torch.tensor(reward_list, device=device, dtype=torch.float32)
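+        # Worked example (illustrative): with num_generations=2 and rewards [1.0, 3.0, 0.0, 2.0], the groups
+        # are [1.0, 3.0] and [0.0, 2.0] with means 2.0 and 1.0 and (unbiased) stds of about 1.414, so the
+        # advantages computed below come out to roughly [-0.71, 0.71, -0.71, 0.71].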
+        # Compute grouped-wise rewards
+        mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
+        std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
+
+        # Normalize the rewards to compute the advantages
+        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
+        std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
+        advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4)
+
+        # Slice to keep only the local part of the data
+        # process_slice = slice(
+        #     self.accelerator.process_index * len(prompts),
+        #     (self.accelerator.process_index + 1) * len(prompts),
+        # )
+        # advantages = advantages[process_slice]
+
+        # Log the metrics
+        # reward_per_func = rewards_per_func.mean(0)
+        # for i, reward_func in enumerate(self.reward_funcs):
+        #     if isinstance(reward_func, nn.Module):  # Module instead of PretrainedModel for compat with compiled models
+        #         reward_func_name = reward_func.config._name_or_path.split("/")[-1]
+        #     else:
+        #         reward_func_name = reward_func.__name__
+        #     self._metrics[f"rewards/{reward_func_name}"].append(reward_per_func[i].item())
+
+        self._metrics["reward"].append(rewards.mean().item())
+        self._metrics["reward_std"].append(std_grouped_rewards.mean().item())
+
+        if (
+            self.log_completions
+            and self.state.global_step % self.args.logging_steps == 0
+            and "wandb" in self.args.report_to
+        ):
+            import pandas as pd
+
+            # For logging
+            table = {
+                "step": [str(self.state.global_step)] * len(rewards),
+                "prompt": gather_object(prompts_text),
+                "completion": gather_object(completions_text),
+                "reward": rewards.tolist(),
+            }
+            df = pd.DataFrame(table)
+
+            if wandb.run is not None and self.accelerator.is_main_process:
+                wandb.log({"completions": wandb.Table(dataframe=df)})
+        """
+        original_shape:
+        {'prompt_ids': torch.Size([10, 395]),
+         'prompt_mask': torch.Size([10, 395]),
+         'completion_ids': torch.Size([10, 256]),  # second dim is the max completion length among the sampled completions
+         'completion_mask': torch.Size([10, 256]),
+         'ref_per_token_logps': torch.Size([10, 256]),
+         'advantages': torch.Size([10])}
+
+        modified_shape:
+        {'prompt_ids': torch.Size([10, 395]),
+         'prompt_mask': torch.Size([10, 395]),
+         'completion_ids': torch.Size([10, 38]),  # second dim is the max completion length in the offline data
+         'completion_mask': torch.Size([10, 38]),
+         'ref_per_token_logps': torch.Size([10, 38]),
+         'advantages': torch.Size([10])}
+        """
+        return {
+            "prompt_text": prompts_text,
+            "prompt_ids": prompt_ids,
+            "prompt_mask": prompt_mask,
+            "completion_ids": completion_ids,
+            "completion_mask": completion_mask,
+            "ref_per_token_logps": ref_per_token_logps,
+            "advantages": advantages,
+        }
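+    # Editorial note on `compute_loss` below: the per-token KL term uses the estimator
+    # exp(q - p) - (q - p) - 1 with q = ref logprob and p = policy logprob; it is always >= 0 and vanishes
+    # when the two agree (e.g. a gap of 0.1 gives exp(0.1) - 0.1 - 1 ≈ 0.005). The factor
+    # exp(per_token_logps - per_token_logps.detach()) always evaluates to 1 in the forward pass, but its
+    # gradient w.r.t. per_token_logps is 1, so the advantage still steers the policy update.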
+
+    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
+        if return_outputs:
+            raise ValueError("The GRPOTrainer does not support returning outputs")
+        # Compute the per-token log probabilities for the model
+
+        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
+        # prompt_ids: [batch_size * sample_num, max_prompt_len]
+
+        completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"]
+        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
+        logits_to_keep = completion_ids.size(1)  # we only need to compute the logits for the completion tokens
+
+        per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep)
+        # per_token_logps: [batch_size * sample_num, max_response_len]
+        # print('input_ids:')
+        # print(prompt_ids[0])
+        # print('completion')
+        # print(completion_ids[0])
+        # print('#' * 50)
+
+        # Compute the KL divergence between the model and the reference model
+        ref_per_token_logps = inputs["ref_per_token_logps"]
+        per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1
+
+        # x - x.detach() allows for preserving gradients from x
+        advantages = inputs["advantages"]  # advantages: [batch_size * sample_num]
+        per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1)
+        per_token_loss = -(per_token_loss - self.beta * per_token_kl)
+        loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
+
+        # Log the metrics
+        completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item()
+        self._metrics["completion_length"].append(completion_length)
+
+        mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
+        self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
+        return loss
+
+    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys: Optional[list[str]] = None):
+        inputs = self._prepare_inputs(inputs)
+        with torch.no_grad():
+            with self.compute_loss_context_manager():
+                loss = self.compute_loss(model, inputs)
+            loss = loss.mean().detach()
+        return loss, None, None
+
+    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
+        metrics = {key: sum(val) / len(val) for key, val in self._metrics.items()}  # average the metrics
+
+        # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
+        # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
+ if next(iter(logs.keys())).startswith("eval_"): + metrics = {f"eval_{key}": val for key, val in metrics.items()} + + logs = {**logs, **metrics} + if version.parse(transformers.__version__) >= version.parse("4.47.0.dev0"): + super().log(logs, start_time) + else: # transformers<=4.46 + super().log(logs) + self._metrics.clear() + + def create_model_card( + self, + model_name: Optional[str] = None, + dataset_name: Optional[str] = None, + tags: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + model_name (`str` or `None`, *optional*, defaults to `None`): + Name of the model. + dataset_name (`str` or `None`, *optional*, defaults to `None`): + Name of the dataset used for training. + tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`): + Tags to be associated with the model card. + """ + if not self.is_world_process_zero(): + return + + if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path): + base_model = self.model.config._name_or_path + else: + base_model = None + + tags = tags or [] + if isinstance(tags, str): + tags = [tags] + + if hasattr(self.model.config, "unsloth_version"): + tags.append("unsloth") + + citation = textwrap.dedent( + """\ + @article{zhihong2024deepseekmath, + title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}}, + author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo}, + year = 2024, + eprint = {arXiv:2402.03300}, + } + """ + ) + + model_card = generate_model_card( + base_model=base_model, + model_name=model_name, + hub_model_id=self.hub_model_id, + dataset_name=dataset_name, + tags=tags, + wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None, + comet_url=get_comet_experiment_url(), + trainer_name="GRPO", + trainer_citation=citation, + paper_title="DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", + paper_id="2402.03300", + ) + + model_card.save(os.path.join(self.args.output_dir, "README.md")) diff --git a/trl_012_grpo/utils.py b/trl_012_grpo/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..32578e5023d50cd312dcae1f701763cf2d38213a --- /dev/null +++ b/trl_012_grpo/utils.py @@ -0,0 +1,1685 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import dataclasses
+import importlib.resources as pkg_resources
+import json
+import random
+import warnings
+from collections import deque
+from dataclasses import dataclass, field
+from importlib.metadata import version
+from typing import Any, Literal, Optional, Union
+
+import datasets
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+import torch.utils.data
+from accelerate import Accelerator, PartialState
+from accelerate.state import AcceleratorState
+from huggingface_hub import ModelCard, ModelCardData
+from rich.console import Console
+from rich.table import Table
+from torch.nn.utils.rnn import pad_sequence
+from torch.utils.data import IterableDataset
+from transformers import (
+    BitsAndBytesConfig,
+    DataCollatorForLanguageModeling,
+    EvalPrediction,
+    GenerationConfig,
+    PreTrainedTokenizerBase,
+    TrainerState,
+    TrainingArguments,
+    is_comet_available,
+)
+from transformers.utils import (
+    is_peft_available,
+    is_torch_mlu_available,
+    is_torch_npu_available,
+    is_torch_xpu_available,
+)
+
+from trl.trainer.model_config import ModelConfig
+
+
+if is_comet_available():
+    import comet_ml
+
+if is_peft_available():
+    from peft import LoraConfig, PeftConfig
+
+
+class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
+    """
+    Data collator used for completion tasks. It ensures that all the tokens of the labels are set to an 'ignore_index'
+    when they do not come from the assistant. This ensures that the loss is only
+    calculated on the completion made by the assistant.
+
+    Args:
+        response_template (`Union[str, list[int]]`): the template form that indicates the start of the response, typically something like
+            '### Response:\n'. It can also be passed as tokenized ids, which can be useful when using a tokenizer that encodes the response
+            differently if it does not have proper context.
+        instruction_template (`Union[str, list[int]]`): the template form that indicates the start of the human instruction, typically something like
+            '### Human:\n'. Useful for assistant-style conversation datasets. It can also be passed as tokenized ids.
+        mlm (`bool`, *optional*, defaults to `False`): Whether to use masked language modeling in the underlying
+            `DataCollatorForLanguageModeling` class. Note that this option currently has no effect but is present
+            for flexibility and backwards-compatibility.
+ ignore_index (`int`, *optional*, defaults to `-100`): + The index to use to ignore the initial tokens with + """ + + def __init__( + self, + response_template: Union[str, list[int]], + instruction_template: Optional[Union[str, list[int]]] = None, + *args, + mlm: bool = False, + ignore_index: int = -100, + padding_free: bool = False, + **kwargs, + ): + super().__init__(*args, mlm=mlm, **kwargs) + + self.instruction_template = instruction_template + if isinstance(instruction_template, str): + # The user provides a string, must tokenize + self.instruction_token_ids = self.tokenizer.encode(self.instruction_template, add_special_tokens=False) + else: + # The user already provides the token ids + self.instruction_token_ids = instruction_template + + self.response_template = response_template + if isinstance(response_template, str): + # The user provides a string, must tokenize + self.response_token_ids = self.tokenizer.encode(self.response_template, add_special_tokens=False) + else: + # The user already provides the token ids + self.response_token_ids = response_template + + if not self.mlm and self.instruction_template and self.tokenizer.pad_token_id == self.tokenizer.eos_token_id: + warnings.warn( + "The pad_token_id and eos_token_id values of this tokenizer are identical. " + "If you are planning for multi-turn training, " + "it can result in the model continuously generating questions and answers without eos token. " + "To avoid this, set the pad_token_id to a different value.", + UserWarning, + ) + + self.ignore_index = ignore_index + self.padding_free = padding_free + + def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]: + batch = super().torch_call(examples) + + if self.instruction_template is None: + for i in range(len(examples)): + response_token_ids_start_idx = None + + for idx in np.where(batch["labels"][i] == self.response_token_ids[0])[0]: + # `response_token_ids` is `'### Response:\n'`, here we are just making sure that the token IDs match + if ( + self.response_token_ids + == batch["labels"][i][idx : idx + len(self.response_token_ids)].tolist() + ): + response_token_ids_start_idx = idx + + if response_token_ids_start_idx is None: + warnings.warn( + f"Could not find response key `{self.response_template}` in the following instance: " + f"{self.tokenizer.decode(batch['input_ids'][i])}. This instance will be ignored in loss " + "calculation. Note, if this happens often, consider increasing the `max_seq_length`.", + UserWarning, + ) + batch["labels"][i, :] = self.ignore_index + else: + response_token_ids_end_idx = response_token_ids_start_idx + len(self.response_token_ids) + + # Make pytorch loss function ignore all tokens up through the end of the response key + batch["labels"][i, :response_token_ids_end_idx] = self.ignore_index + + else: + for i in range(len(examples)): + response_token_ids_idxs = [] + human_token_ids_idxs = [] + + for assistant_idx in np.where(batch["labels"][i] == self.response_token_ids[0])[0]: + # find the indexes of the start of a response. + if ( + self.response_token_ids + == batch["labels"][i][assistant_idx : assistant_idx + len(self.response_token_ids)].tolist() + ): + response_token_ids_idxs.append(assistant_idx + len(self.response_token_ids)) + + if len(response_token_ids_idxs) == 0: + warnings.warn( + f"Could not find response key `{self.response_template}` in the following instance: " + f"{self.tokenizer.decode(batch['input_ids'][i])}. This instance will be ignored in loss " + "calculation. 
Note, if this happens often, consider increasing the `max_seq_length`.", + UserWarning, + ) + batch["labels"][i, :] = self.ignore_index + + human_token_ids = self.instruction_token_ids + for human_idx in np.where(batch["labels"][i] == human_token_ids[0])[0]: + # find the indexes of the start of a human answer. + if human_token_ids == batch["labels"][i][human_idx : human_idx + len(human_token_ids)].tolist(): + human_token_ids_idxs.append(human_idx) + + if len(human_token_ids_idxs) == 0: + warnings.warn( + f"Could not find instruction key `{self.instruction_template}` in the following instance: " + f"{self.tokenizer.decode(batch['input_ids'][i])}. This instance will be ignored in loss " + "calculation. Note, if this happens often, consider increasing the `max_seq_length`.", + UserWarning, + ) + batch["labels"][i, :] = self.ignore_index + + if ( + len(human_token_ids_idxs) > 0 + and len(response_token_ids_idxs) > 0 + and human_token_ids_idxs[0] > response_token_ids_idxs[0] + ): + human_token_ids_idxs = [0] + human_token_ids_idxs + + for idx, (start, end) in enumerate(zip(human_token_ids_idxs, response_token_ids_idxs)): + # Make pytorch loss function ignore all non response tokens + if idx != 0: + batch["labels"][i, start:end] = self.ignore_index + else: + batch["labels"][i, :end] = self.ignore_index + + if len(response_token_ids_idxs) < len(human_token_ids_idxs): + batch["labels"][i, human_token_ids_idxs[-1] :] = self.ignore_index + + if self.padding_free: + # remove padding, `attention_mask` and add `position_ids` + attn_mask = batch.pop("attention_mask") + batch["input_ids"] = batch["input_ids"][attn_mask.bool()].unsqueeze(0) + batch["position_ids"] = attn_mask.cumsum(1)[attn_mask.bool()].unsqueeze(0) - 1 + batch["labels"] = batch["labels"][attn_mask.bool()].unsqueeze(0) + batch["labels"][batch["position_ids"] == 0] = self.ignore_index + + # Calculate cumulative sequence lengths for queries and keys to prevent graph breaks during further computations. + flattened_position_ids = batch["position_ids"].flatten() + indices_q = torch.arange( + flattened_position_ids.size(0), device=flattened_position_ids.device, dtype=torch.int32 + ) + batch["cu_seq_lens_q"] = torch.cat( + ( + indices_q[flattened_position_ids == 0], + torch.tensor( + flattened_position_ids.size(), device=flattened_position_ids.device, dtype=torch.int32 + ), + ) + ) + batch["cu_seq_lens_k"] = batch["cu_seq_lens_q"] + + # Determine maximum sequence lengths to prevent graph breaks during further computations. + batch["max_length_k"] = flattened_position_ids.max().item() + 1 + batch["max_length_q"] = batch["max_length_k"] + + return batch + + +@dataclass +class DataCollatorForChatML: + """ + Data collator for ChatML format datasets. + """ + + tokenizer: PreTrainedTokenizerBase + ignore_index: int = -100 + max_length: int = None + prompt_key: str = "prompt" + messages_key: str = "messages" + + def __post_init__(self): + if self.tokenizer.pad_token_id is None: + raise ValueError("The tokenizer does not have a pad token. 
Please set `pad_token_id` in the tokenizer.") + if self.max_length is None: + # set a sensible default + self.max_length = min(self.tokenizer.model_max_length, 1024) + + def __call__(self, examples: list[dict[str, Any]]) -> dict[str, torch.Tensor]: + input_ids = [] + attention_mask = [] + prompts_input_ids = [] + prompt_attention_mask = [] + labels = [] + + for example in examples: + formatted_prompt = example.get(self.prompt_key, None) + if formatted_prompt is None: + prompt = example[self.messages_key][:-1] + formatted_prompt = self.tokenizer.apply_chat_template( + prompt, tokenize=False, add_generation_prompt=True + ) + + if "input_ids" not in example: + message = example[self.messages_key] + formatted_message = self.tokenizer.apply_chat_template( + message, tokenize=False, add_generation_prompt=False + ) + tokenized_message = self.tokenizer( + formatted_message, + truncation=True, + max_length=self.max_length, + padding=False, + return_tensors=None, + add_special_tokens=False, + ) + input_ids.append(tokenized_message["input_ids"]) + attention_mask.append(tokenized_message["attention_mask"]) + else: + input_ids.append(example["input_ids"]) + attention_mask.append(example["attention_mask"]) + + tokenized_prompt = self.tokenizer( + formatted_prompt, + truncation=True, + max_length=len(input_ids[-1]), + padding=False, + return_tensors=None, + add_special_tokens=False, + ) + + prompts_input_ids.append(tokenized_prompt["input_ids"]) + prompt_attention_mask.append(tokenized_prompt["attention_mask"]) + + # Create the labels that will have all but the completion tokens of the example["input_ids"] set to ignore_index + label = [self.ignore_index] * len(input_ids[-1]) + completion_start_idx = len(tokenized_prompt["input_ids"]) + label[completion_start_idx:] = input_ids[-1][completion_start_idx:] + labels.append(label) + + # convert to list of tensors and pad + input_ids = [torch.tensor(ids, dtype=torch.long) for ids in input_ids] + attention_mask = [torch.tensor(mask, dtype=torch.long) for mask in attention_mask] + labels = [torch.tensor(label, dtype=torch.long) for label in labels] + input_ids = pad(input_ids, padding_side="left", padding_value=self.tokenizer.pad_token_id) + attention_mask = pad(attention_mask, padding_side="left", padding_value=0) + labels = pad(labels, padding_side="left", padding_value=self.ignore_index) + + prompts_input_ids = [torch.tensor(ids, dtype=torch.long) for ids in prompts_input_ids] + prompt_attention_mask = [torch.tensor(mask, dtype=torch.long) for mask in prompt_attention_mask] + prompts_input_ids = pad(prompts_input_ids, padding_side="left", padding_value=self.tokenizer.pad_token_id) + prompt_attention_mask = pad(prompt_attention_mask, padding_side="left", padding_value=0) + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + "prompts": prompts_input_ids, + "prompt_attention_mask": prompt_attention_mask, + } + + +@dataclass +class RewardDataCollatorWithPadding: + r""" + Reward DataCollator class that pads the inputs to the maximum length of the batch. + + Args: + tokenizer (`PreTrainedTokenizerBase`): + The tokenizer used for encoding the data. + padding (`Union[bool, str, `PaddingStrategy`]`, `optional`, defaults to `True`): + padding_strategy to pass to the tokenizer. + pad_to_multiple_of (`int` or `None`, `optional`, defaults to `None`): + If set will pad the sequence to a multiple of the provided value. + return_tensors (`str`, `optional`, defaults to `"pt"`): + The tensor type to use. 
+ """ + + tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str] = True + pad_to_multiple_of: Optional[int] = None + return_tensors: str = "pt" + + def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]: + features_chosen = [] + features_rejected = [] + margin = [] + # check if we have a margin. If we do, we need to batch it as well + has_margin = "margin" in features[0] + for feature in features: + # check if the keys are named as expected + if ( + "input_ids_chosen" not in feature + or "input_ids_rejected" not in feature + or "attention_mask_chosen" not in feature + or "attention_mask_rejected" not in feature + ): + raise ValueError( + "The features should include `input_ids_chosen`, `attention_mask_chosen`, `input_ids_rejected` and `attention_mask_rejected`" + ) + + features_chosen.append( + { + "input_ids": feature["input_ids_chosen"], + "attention_mask": feature["attention_mask_chosen"], + } + ) + features_rejected.append( + { + "input_ids": feature["input_ids_rejected"], + "attention_mask": feature["attention_mask_rejected"], + } + ) + if has_margin: + margin.append(feature["margin"]) + batch_chosen = self.tokenizer.pad( + features_chosen, + padding=self.padding, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + batch_rejected = self.tokenizer.pad( + features_rejected, + padding=self.padding, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + ) + batch = { + "input_ids_chosen": batch_chosen["input_ids"], + "attention_mask_chosen": batch_chosen["attention_mask"], + "input_ids_rejected": batch_rejected["input_ids"], + "attention_mask_rejected": batch_rejected["attention_mask"], + "return_loss": True, + } + if has_margin: + margin = torch.tensor(margin, dtype=torch.float) + batch["margin"] = margin + return batch + + +def pad(tensors: list[torch.Tensor], padding_value: int = 0, padding_side: str = "right") -> torch.Tensor: + """ + Pads a list of tensors to the same shape along the first dimension. + + Args: + tensors (`list[torch.Tensor]`): + List of input tensors to pad. + padding_value (`int`): + Value to use for padding. Default is 0. + padding_side (`str`): + Side on which to add padding. Must be 'left' or 'right'. Default is 'right'. + + Returns: + `torch.Tensor`: + A single tensor containing the padded tensors. + + Examples: + >>> import torch + >>> pad([torch.tensor([1, 2, 3]), torch.tensor([4, 5])]) + tensor([[1, 2, 3], + [4, 5, 0]]) + >>> pad([torch.tensor([[1, 2], [3, 4]]), torch.tensor([[5, 6]])]) + tensor([[[1, 2], + [3, 4]], + + [[5, 6], + [0, 0]]]) + """ + # Determine the maximum shape for each dimension + output_shape = np.max([t.shape for t in tensors], 0).tolist() + + # Create an output tensor filled with the padding value + output = torch.full((len(tensors), *output_shape), padding_value, dtype=tensors[0].dtype, device=tensors[0].device) + + for i, t in enumerate(tensors): + # Determine the slice for the sequence dimension + if padding_side == "left": + seq_slice = slice(output_shape[0] - t.shape[0], output_shape[0]) + elif padding_side == "right": + seq_slice = slice(0, t.shape[0]) + else: + raise ValueError("padding_side must be 'left' or 'right'") + + slices = (seq_slice,) + tuple(slice(0, s) for s in t.shape[1:]) + output[i][slices] = t + + return output + + +@dataclass +class DPODataCollatorWithPadding: + r""" + DPO DataCollator class that pads the tokenized inputs to the maximum length of the batch. 
+
+    Args:
+        pad_token_id (`int`, defaults to `0`):
+            The tokenizer's pad_token_id.
+        label_pad_token_id (`int`, defaults to `-100`):
+            The label used for masking.
+        is_encoder_decoder (`bool` or `None`, `optional`, defaults to `None`):
+            Whether your model has an encoder-decoder architecture.
+    """
+
+    pad_token_id: int = 0
+    label_pad_token_id: int = -100
+    is_encoder_decoder: Optional[bool] = False
+
+    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
+        # first, pad everything to the same length
+        padded_batch = {}
+        for k in features[0].keys():
+            if k.endswith(("_input_ids", "_attention_mask", "_labels", "_pixel_values")):
+                if self.is_encoder_decoder:
+                    to_pad = [torch.LongTensor(ex[k]) for ex in features]
+
+                    if (k.startswith("prompt")) and (k.endswith("input_ids")):
+                        if self.pad_token_id is None:
+                            raise ValueError(
+                                "Padding is enabled, but the tokenizer is not configured with a padding token."
+                                " Explicitly set `tokenizer.pad_token` (e.g. `tokenizer.pad_token = tokenizer.eos_token`)"
+                                " before calling the trainer."
+                            )
+                        padding_value = self.pad_token_id
+                    elif k.endswith("_attention_mask"):
+                        padding_value = 0
+                    elif k.startswith(("chosen", "rejected", "completion")) or ("decoder" in k):
+                        padding_value = self.label_pad_token_id
+                    else:
+                        raise ValueError(f"Unexpected key in batch '{k}'")
+                    padded_batch[k] = pad_sequence(to_pad, batch_first=True, padding_value=padding_value)
+                else:
+                    # Set padding value based on the key
+                    if k.endswith("_input_ids"):
+                        if self.pad_token_id is None:
+                            raise ValueError(
+                                "Padding is enabled, but the tokenizer is not configured with a padding token."
+                                " Explicitly set `tokenizer.pad_token` (e.g. `tokenizer.pad_token = tokenizer.eos_token`)"
+                                " before calling the trainer."
+                            )
+                        padding_value = self.pad_token_id
+                    elif k.endswith("_labels"):
+                        padding_value = self.label_pad_token_id
+                    elif k.endswith("_attention_mask"):
+                        padding_value = 0
+                    elif k.endswith("_pixel_values"):
+                        padding_value = 0  # TODO: check if this is correct
+                    else:
+                        raise ValueError(f"Unexpected key in batch '{k}'")
+
+                    # Set padding side based on the key
+                    if k in ["prompt_input_ids", "prompt_attention_mask"]:
+                        padding_side = "left"
+                    else:
+                        padding_side = "right"
+
+                    # Set the dtype
+                    if k.endswith("_pixel_values"):
+                        dtype = torch.float32  # will be downcasted if necessary by the Trainer
+                    else:
+                        dtype = torch.int64
+
+                    # Convert to tensor and pad
+                    to_pad = [torch.tensor(ex[k], dtype=dtype) for ex in features]
+                    padded_batch[k] = pad(to_pad, padding_value=padding_value, padding_side=padding_side)
+            elif k.endswith("_logps"):
+                # the cached reference model logprobs
+                padded_batch[k] = torch.tensor([ex[k] for ex in features])
+            else:
+                padded_batch[k] = [ex[k] for ex in features]
+
+        return padded_batch
+
+
+class ConstantLengthDataset(IterableDataset):
+    """
+    Iterable dataset that returns constant length chunks of tokens from stream of text files.
+    The dataset also formats the text before tokenization with a specific format that is provided
+    by the user.
+
+    Args:
+        tokenizer (`transformers.PreTrainedTokenizer`):
+            The processor used for processing the data.
+        dataset (`dataset.Dataset`):
+            Dataset with text files.
+        dataset_text_field (`str` or `None`, *optional*, defaults to `None`):
+            Name of the field in the dataset that contains the text. Only one of `dataset_text_field` and
+            `formatting_func` should be provided.
+        formatting_func (`Callable`, *optional*):
+            Function that formats the text before tokenization.
Usually it is recommended to follow a certain
+            pattern such as `"### Question: {question} ### Answer: {answer}"`. Only one of `dataset_text_field` and
+            `formatting_func` should be provided.
+        infinite (`bool`, *optional*, defaults to `False`):
+            If `True`, the iterator restarts from the beginning once the dataset is exhausted; otherwise, iteration
+            stops when the dataset ends.
+        seq_length (`int`, *optional*, defaults to `1024`):
+            Length of token sequences to return.
+        num_of_sequences (`int`, *optional*, defaults to `1024`):
+            Number of token sequences to keep in the buffer.
+        chars_per_token (`float`, *optional*, defaults to `3.6`):
+            Number of characters per token used to estimate the number of tokens in the text buffer.
+        eos_token_id (`int`, *optional*, defaults to `0`):
+            ID of the end-of-sequence token to use if the passed tokenizer does not have an EOS token.
+        shuffle (`bool`, *optional*, defaults to `True`):
+            Whether to shuffle the examples before they are returned.
+        append_concat_token (`bool`, *optional*, defaults to `True`):
+            If `True`, appends `eos_token_id` at the end of each sample being packed.
+        add_special_tokens (`bool`, *optional*, defaults to `True`):
+            If `True`, the tokenizer adds special tokens to each sample being packed.
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        dataset,
+        dataset_text_field=None,
+        formatting_func=None,
+        infinite=False,
+        seq_length=1024,
+        num_of_sequences=1024,
+        chars_per_token=3.6,
+        eos_token_id=0,
+        shuffle=True,
+        append_concat_token=True,
+        add_special_tokens=True,
+    ):
+        self.tokenizer = tokenizer
+        self.concat_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id else eos_token_id
+        self.dataset = dataset
+        self.seq_length = seq_length
+        self.infinite = infinite
+        self.current_size = 0
+        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
+        self.shuffle = shuffle
+        self.append_concat_token = append_concat_token
+        self.add_special_tokens = add_special_tokens
+
+        if dataset_text_field is not None and formatting_func is not None:
+            warnings.warn(
+                "Only one of `dataset_text_field` and `formatting_func` should be provided. 
" + "Ignoring `dataset_text_field` and using `formatting_func`.", + UserWarning, + ) + + if formatting_func is not None: + self.formatting_func = formatting_func + elif dataset_text_field is not None: + self.formatting_func = lambda x: x[dataset_text_field] + else: # neither is provided + raise ValueError("Either `dataset_text_field` or `formatting_func` should be provided.") + + self.pretokenized = False + column_names = ( + dataset.column_names if isinstance(dataset, (datasets.Dataset, datasets.IterableDataset)) else None + ) + if column_names is not None and "input_ids" in column_names: + self.pretokenized = True + # since the dataset is tokenized, the unit of buffer size should be tokens + self.max_buffer_size = seq_length * num_of_sequences + + def __len__(self): + return len(self.dataset) + + def __iter__(self): + iterator = iter(self.dataset) + more_examples = True + while more_examples: + buffer, buffer_len = [], 0 + while True: + if buffer_len >= self.max_buffer_size: + break + try: + buffer.append(self.formatting_func(next(iterator))) + buffer_len += len(buffer[-1]) + except StopIteration: + if self.infinite: + iterator = iter(self.dataset) + else: + more_examples = False + break + if self.shuffle: + random.shuffle(buffer) + if self.pretokenized: + tokenized_inputs = buffer + else: + tokenized_inputs = self.tokenizer( + buffer, add_special_tokens=self.add_special_tokens, truncation=False + )["input_ids"] + all_token_ids = [] + for tokenized_input in tokenized_inputs: + if self.append_concat_token: + tokenized_input = tokenized_input + [self.concat_token_id] + all_token_ids.extend(tokenized_input) + examples = [] + for i in range(0, len(all_token_ids), self.seq_length): + input_ids = all_token_ids[i : i + self.seq_length] + if len(input_ids) == self.seq_length: + examples.append(input_ids) + if self.shuffle: + # Shuffle again, otherwise split examples occur in consecutive tensors. + random.shuffle(examples) + for example in examples: + self.current_size += 1 + yield { + "input_ids": torch.LongTensor(example), + "labels": torch.LongTensor(example), + } + + +@dataclass +class RunningMoments: + """ + Calculates the running mean and standard deviation of a data stream. 
Reference: + https://github.com/OpenLMLab/MOSS-RLHF/blob/40b91eb2f2b71b16919addede0341d2bef70825d/utils.py#L75 + """ + + accelerator: Accelerator + mean: float = 0 + std: float = 1 + var: float = 1 + count: float = 1e-24 + + @torch.no_grad() + def update(self, xs: torch.Tensor) -> tuple[float, float]: + """ + Updates running moments from batch's moments computed across ranks + """ + if self.accelerator.use_distributed: + xs_mean, xs_var, xs_count = get_global_statistics(self.accelerator, xs) + else: + xs_count = xs.numel() + xs_var, xs_mean = torch.var_mean(xs, unbiased=False) + xs_mean, xs_var = xs_mean.float(), xs_var.float() + + delta = xs_mean - self.mean + tot_count = self.count + xs_count + + new_sum = xs_var * xs_count + # correct old_sum deviation accounting for the new mean + old_sum = self.var * self.count + delta**2 * self.count * xs_count / tot_count + tot_sum = old_sum + new_sum + + self.mean += (delta * xs_count / tot_count).item() + new_var = tot_sum / tot_count + self.std = (new_var * tot_count / (tot_count - 1)).float().sqrt().item() + self.var = new_var.item() + self.count = tot_count + + return xs_mean.item(), (xs_var * xs_count / (xs_count - 1)).float().sqrt().item() + + def save_to_json(self, json_path: str): + """Save the content of this instance in JSON format inside `json_path`.""" + # save everything except accelerator + if self.accelerator.is_main_process: + save_dict = dataclasses.asdict(self, dict_factory=lambda x: {k: v for (k, v) in x if k != "accelerator"}) + json_string = json.dumps(save_dict, indent=2, sort_keys=True) + "\n" + with open(json_path, "w", encoding="utf-8") as f: + f.write(json_string) + + @classmethod + def load_from_json(cls, accelerator: Accelerator, json_path: str): + """Create an instance from the content of `json_path`.""" + # load everything except accelerator + with open(json_path, encoding="utf-8") as f: + text = f.read() + return cls(accelerator=accelerator, **json.loads(text)) + + +@torch.no_grad() +def get_global_statistics( + accelerator, xs: torch.Tensor, mask=None, device="cpu" +) -> tuple[torch.Tensor, torch.Tensor, int]: + """ + Computes element-wise mean and variance of the tensor across processes. Reference: + https://github.com/OpenLMLab/MOSS-RLHF/blob/40b91eb2f2b71b16919addede0341d2bef70825d/utils.py#L57C1-L73C75 + """ + xs = xs.to(accelerator.device) + sum_and_count = torch.tensor([xs.sum(), (xs.numel() if mask is None else mask.sum())], device=xs.device) + sum_and_count = accelerator.reduce(sum_and_count) + global_sum, count = sum_and_count + global_mean = global_sum / count + + sum_var = torch.sum(((xs - global_mean) ** 2).mul(1 if mask is None else mask)) + sum_var = accelerator.reduce(sum_var) + global_var = sum_var / count + + return global_mean.to(device), global_var.to(device), count.item() + + +def compute_accuracy(eval_pred: EvalPrediction) -> dict[str, float]: + predictions, labels = eval_pred + if predictions.ndim == 3: + # Token classification task. Shapes are (batch_size, seq_len, num_labels) and (batch_size, seq_len) + # Used to compute the accuracy in the prm_trainer. + predictions = np.argmax(predictions, axis=2) + + # Flatten the predictions and labels to remove the ignored tokens. + predictions = np.array( + [p for prediction, label in zip(predictions, labels) for (p, lbl) in zip(prediction, label) if lbl != -100] + ) + labels = np.array([lbl for label in labels for lbl in label if lbl != -100]) + + else: + # Here, predictions is rewards_chosen and rewards_rejected. 
Shapes are (batch_size, 2) and (batch_size,)
+        # We want to see how much of the time rewards_chosen > rewards_rejected.
+        equal_mask = predictions[:, 0] == predictions[:, 1]
+        equal_predictions_count = int(equal_mask.sum())
+
+        if equal_predictions_count > 0:
+            warnings.warn(
+                f"There are {equal_predictions_count} out of {len(predictions[:, 0])} instances where the predictions "
+                "for both options are equal. These instances are ignored in the accuracy computation.",
+                UserWarning,
+            )
+
+        # Filter out equal predictions
+        predictions = predictions[~equal_mask]
+        labels = labels[~equal_mask]
+
+        # Use the remaining predictions for accuracy calculation
+        predictions = np.argmax(predictions, axis=1)
+
+    accuracy = np.array(predictions == labels, dtype=float).mean().item()
+    return {"accuracy": accuracy}
+
+
+def pad_to_length(tensor: torch.Tensor, length: int, pad_value: Union[int, float], dim: int = -1) -> torch.Tensor:
+    if tensor.size(dim) >= length:
+        return tensor
+    else:
+        pad_size = list(tensor.shape)
+        pad_size[dim] = length - tensor.size(dim)
+        return torch.cat(
+            [
+                tensor,
+                pad_value * torch.ones(*pad_size, dtype=tensor.dtype, device=tensor.device),
+            ],
+            dim=dim,
+        )
+
+
+def disable_dropout_in_model(model: torch.nn.Module) -> None:
+    for module in model.modules():
+        if isinstance(module, torch.nn.Dropout):
+            module.p = 0
+
+
+def exact_div(a, b, custom_error_message=""):
+    q = a // b
+    if a != q * b:
+        raise ValueError(f"{custom_error_message}, inexact division: {a} / {b} = {a / b}")
+    return q
+
+
+# copied from https://github.com/kvablack/ddpo-pytorch/blob/main/ddpo_pytorch/stat_tracking.py#L5
+class PerPromptStatTracker:
+    r"""
+    Class for tracking statistics per prompt. Mainly used to calculate advantages for the DDPO algorithm.
+
+    Args:
+        buffer_size (`int`):
+            Size of the buffer to keep for each prompt.
+        min_count (`int`):
+            Minimum number of samples to keep in the buffer before calculating the mean and std.
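+
+    Example (an illustrative sketch; the prompt strings and reward values are made up):
+        >>> tracker = PerPromptStatTracker(buffer_size=32, min_count=16)
+        >>> prompts = ["a photo of a cat", "a photo of a cat", "a photo of a dog"]
+        >>> rewards = [1.0, 2.0, 3.0]
+        >>> advantages = tracker.update(prompts, rewards)  # rewards standardized per prompt
+        >>> advantages.shape
+        (3,)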
+    """
+
+    def __init__(self, buffer_size, min_count):
+        self.buffer_size = buffer_size
+        self.min_count = min_count
+        self.stats = {}
+
+    def update(self, prompts, rewards):
+        prompts = np.array(prompts)
+        rewards = np.array(rewards)
+        unique = np.unique(prompts)
+        advantages = np.empty_like(rewards)
+        for prompt in unique:
+            prompt_rewards = rewards[prompts == prompt]
+            if prompt not in self.stats:
+                self.stats[prompt] = deque(maxlen=self.buffer_size)
+            self.stats[prompt].extend(prompt_rewards)
+
+            if len(self.stats[prompt]) < self.min_count:
+                mean = np.mean(rewards)
+                std = np.std(rewards) + 1e-6
+            else:
+                mean = np.mean(self.stats[prompt])
+                std = np.std(self.stats[prompt]) + 1e-6
+            advantages[prompts == prompt] = (prompt_rewards - mean) / std
+
+        return advantages
+
+    def get_stats(self):
+        return {k: {"mean": np.mean(v), "std": np.std(v), "count": len(v)} for k, v in self.stats.items()}
+
+
+def peft_module_casting_to_bf16(model):
+    for name, module in model.named_modules():
+        if isinstance(module, torch.nn.LayerNorm) or "norm" in name:
+            module = module.to(torch.float32)
+        elif any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]):
+            if hasattr(module, "weight"):
+                if module.weight.dtype == torch.float32:
+                    module = module.to(torch.bfloat16)
+
+
+def get_quantization_config(model_args: ModelConfig) -> Optional[BitsAndBytesConfig]:
+    if model_args.load_in_4bit:
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=model_args.torch_dtype,  # For consistency with model weights, we use the same value as `torch_dtype`
+            bnb_4bit_quant_type=model_args.bnb_4bit_quant_type,
+            bnb_4bit_use_double_quant=model_args.use_bnb_nested_quant,
+            bnb_4bit_quant_storage=model_args.torch_dtype,
+        )
+    elif model_args.load_in_8bit:
+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=True,
+        )
+    else:
+        quantization_config = None
+
+    return quantization_config
+
+
+def get_kbit_device_map() -> Optional[dict[str, int]]:
+    if is_torch_xpu_available():
+        return {"": f"xpu:{PartialState().local_process_index}"}
+    elif torch.cuda.is_available():
+        return {"": PartialState().local_process_index}
+    else:
+        return None
+
+
+def get_peft_config(model_args: ModelConfig) -> "Optional[PeftConfig]":
+    if model_args.use_peft is False:
+        return None
+
+    if not is_peft_available():
+        raise ValueError(
+            "You need to have the PEFT library installed in your environment. "
+            "Run `pip install -U peft` to install it."
+        )
+
+    peft_config = LoraConfig(
+        task_type=model_args.lora_task_type,
+        r=model_args.lora_r,
+        target_modules=model_args.lora_target_modules,
+        lora_alpha=model_args.lora_alpha,
+        lora_dropout=model_args.lora_dropout,
+        bias="none",
+        use_rslora=model_args.use_rslora,
+        modules_to_save=model_args.lora_modules_to_save,
+    )
+
+    return peft_config
+
+
+def get_exp_cap(value, decimal=4):
+    """
+    Get the exponent cap of a value. This is used to cap the exponent of a value to avoid overflow.
+    The cap is computed as `log(torch.finfo(value.dtype).max)`, rounded down to `decimal` decimal points.
+    For example, for the float32 data type the cap is 88.7228 (to 4 decimal points); calling
+    `exp(log(torch.finfo(torch.float32).max))` directly would overflow to `inf`, so the exponent is capped instead.
+
+    Args:
+        value (`torch.Tensor`):
+            The input tensor from which to obtain the data type.
+        decimal (`int`):
+            The number of decimal points of the output exponent cap.
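+
+    Example (a quick sanity check of the float32 cap described above):
+        >>> get_exp_cap(torch.tensor([1.0]))
+        tensor([88.7228])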
+    """
+    vdtype_max = torch.zeros([1]).to(value.dtype) + torch.finfo(value.dtype).max
+    vdtype_log_max = torch.log(vdtype_max).to(value.device)
+    return torch.floor(vdtype_log_max * 10**decimal) / 10**decimal if decimal > 0 else vdtype_log_max
+
+
+def cap_exp(value, cap=-1):
+    # Cap the exponent value below the upper-bound to avoid overflow, before calling torch.exp
+    cap = get_exp_cap(value) if cap < 0 else cap
+    return torch.exp(torch.clamp(value, max=cap))
+
+
+def print_rich_table(df: pd.DataFrame) -> None:
+    console = Console()
+    table = Table(show_lines=True)
+    for column in df.columns:
+        table.add_column(column)
+    for _, row in df.iterrows():
+        table.add_row(*row.astype(str).tolist())
+    console.print(table)
+
+
+SIMPLE_SFT_CHAT_TEMPLATE = "{% for message in messages %}{{' ' + message['content']}}{% endfor %}{{ eos_token }}"
+# SIMPLE_SFT_CHAT_TEMPLATE simply ends completions with an EOS token; this helps the SFT model learn to end its
+# completions with EOS tokens
+
+SIMPLE_CHAT_TEMPLATE = "{% for message in messages %}{{message['role'].capitalize() + ': ' + message['content'] + '\n\n'}}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}"
+
+
+@dataclass
+class OnlineTrainerState(TrainerState):
+    episode: int = 0
+
+
+@dataclass
+class OnPolicyConfig(TrainingArguments):
+    r"""
+    Base configuration class for on-policy trainers.
+
+    Using [`~transformers.HfArgumentParser`] we can turn this class into
+    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
+    command line.
+
+    Parameters:
+        run_name (`str` or `None`, *optional*, defaults to `None`):
+            Name of the run.
+        dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
+            Number of processes to use for processing the dataset.
+        num_mini_batches (`int`, *optional*, defaults to `1`):
+            Number of minibatches to split a batch into.
+        total_episodes (`int` or `None`, *optional*, defaults to `None`):
+            Total number of episodes in the dataset.
+        local_rollout_forward_batch_size (`int`, *optional*, defaults to `64`):
+            Per-rank batch size for the no-grad forward passes in the rollout phase.
+        num_sample_generations (`int`, *optional*, defaults to `10`):
+            Number of debugging sample generations (i.e., `generate_completions` calls) throughout training.
+        response_length (`int`, *optional*, defaults to `53`):
+            Length of the response.
+        stop_token (`str` or `None`, *optional*, defaults to `None`):
+            Specifies the stop token to use for text generation. This parameter is mutually exclusive with
+            `stop_token_id`.
+
+            - `None`: No stop token is applied, unless `stop_token_id` is specified.
+            - `'eos'`: Uses the tokenizer's `eos_token`.
+
+        stop_token_id (`int` or `None`, *optional*, defaults to `None`):
+            Specifies the ID of the stop token to use for text generation. If `None`, no stop token ID is applied,
+            unless `stop_token` is specified. This parameter is mutually exclusive with `stop_token`.
+        temperature (`float`, *optional*, defaults to `0.7`):
+            Sampling temperature.
+        missing_eos_penalty (`float` or `None`, *optional*, defaults to `None`):
+            Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage
+            the model to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be
+            a positive value.
+        sft_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
+            Path to the SFT model.
+        world_size (`int` or `None`, *optional*, defaults to `None`):
+            Number of processes (GPUs) to use for the training.
+        num_total_batches (`int` or `None`, *optional*, defaults to `None`):
+            Number of total batches to train.
+        micro_batch_size (`int` or `None`, *optional*, defaults to `None`):
+            Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`).
+        local_batch_size (`int` or `None`, *optional*, defaults to `None`):
+            Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`).
+        batch_size (`int` or `None`, *optional*, defaults to `None`):
+            Batch size across devices (HF's `per_device_train_batch_size` * `world_size` *
+            `gradient_accumulation_steps`).
+        local_mini_batch_size (`int` or `None`, *optional*, defaults to `None`):
+            Mini batch size per GPU.
+        mini_batch_size (`int` or `None`, *optional*, defaults to `None`):
+            Mini batch size across GPUs.
+        push_to_hub (`bool`, *optional*, defaults to `False`):
+            Whether to push the model to the Hub after training.
+    """
+
+    run_name: Optional[str] = field(
+        default=None,
+        metadata={"help": "Name of the run."},
+    )
+    dataset_num_proc: Optional[int] = field(
+        default=None,
+        metadata={"help": "Number of processes to use for processing the dataset."},
+    )
+    num_mini_batches: int = field(
+        default=1,
+        metadata={"help": "Number of minibatches to split a batch into."},
+    )
+    total_episodes: Optional[int] = field(
+        default=None,
+        metadata={"help": "Total number of episodes in the dataset."},
+    )
+    local_rollout_forward_batch_size: int = field(
+        default=64,
+        metadata={"help": "Per-rank batch size for the no-grad forward passes in the rollout phase."},
+    )
+    num_sample_generations: int = field(
+        default=10,
+        metadata={
+            "help": "Number of debugging sample generations (i.e., `generate_completions` calls) throughout training."
+        },
+    )
+    response_length: int = field(
+        default=53,
+        metadata={"help": "Length of the response."},
+    )
+    stop_token: Optional[Literal["eos"]] = field(
+        default=None,
+        metadata={
+            "help": "Specifies the stop token to use for text generation. This parameter is mutually exclusive with "
+            "`stop_token_id`."
+        },
+    )
+    stop_token_id: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Specifies the ID of the stop token to use for text generation. If `None`, no stop token ID is "
+            "applied, unless `stop_token` is specified. This parameter is mutually exclusive with `stop_token`."
+        },
+    )
+    temperature: float = field(
+        default=0.7,
+        metadata={"help": "Sampling temperature."},
+    )
+    missing_eos_penalty: Optional[float] = field(
+        default=None,
+        metadata={
+            "help": "Penalty applied to the score when the model fails to generate an EOS token. This is useful to "
+            "encourage the model to generate completions shorter than the maximum length (`max_new_tokens`). The "
+            "penalty must be a positive value."
+ }, + ) + sft_model_path: str = field( + default="EleutherAI/pythia-160m", + metadata={"help": "Path to the SFT model."}, + ) + world_size: Optional[int] = field( + default=None, + metadata={"help": "Number of processes (GPUs) to use for the training."}, + ) + num_total_batches: Optional[int] = field( + default=None, + metadata={"help": "Number of total batches to train."}, + ) + micro_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`)."}, + ) + local_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`)."}, + ) + batch_size: Optional[int] = field( + default=None, + metadata={ + "help": "Batch size across devices (HF's `per_device_train_batch_size` * `world_size` * " + "`gradient_accumulation_steps`)." + }, + ) + local_mini_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Mini batch size per GPU."}, + ) + mini_batch_size: Optional[int] = field( + default=None, + metadata={"help": "Mini batch size across GPUs."}, + ) + push_to_hub: bool = field( + default=False, + metadata={"help": "Whether to push the model to the Hub after training."}, + ) + + +def first_true_indices(bools: torch.Tensor, dtype=torch.long): + """ + Takes an N-dimensional bool tensor and returns an (N-1)-dimensional tensor of integers giving + the position of the first True in each "row". + + Returns the length of the rows (bools.size(-1)) if no element is True in a given row. + + Args: + bools (`torch.Tensor`): + An N-dimensional boolean tensor. + dtype (`torch.dtype`, optional): + The desired data type of the output tensor. Defaults to `torch.long`. + + Returns: + `torch.Tensor`: + An (N-1)-dimensional tensor of integers indicating the position of the first True + in each row. If no True value is found in a row, returns the length of the row. + """ + row_len = bools.size(-1) + zero_or_index = row_len * (~bools).type(dtype) + torch.arange(row_len, dtype=dtype, device=bools.device) + return torch.min(zero_or_index, dim=-1).values + + +def get_reward( + model: torch.nn.Module, query_responses: torch.Tensor, pad_token_id: int, context_length: int +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Computes the reward logits and the rewards for a given model and query responses. + + Args: + model (`torch.nn.Module`): + The model used to compute the reward logits. + query_responses (`torch.Tensor`): + The tensor containing the query responses. + pad_token_id (`int`): + The token ID representing the pad token. + context_length (`int`): + The length of the context in the query responses. + + Returns: + tuple: + - `reward_logits` (`torch.Tensor`): + The logits for the reward model. + - `final_rewards` (`torch.Tensor`): + The final rewards for each query response. + - `sequence_lengths` (`torch.Tensor`): + The lengths of the sequences in the query responses. 
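+
+    Note that the attention mask is inferred by comparing tokens against `pad_token_id`, so the pad token must not
+    appear as a regular token inside the query responses.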
+    """
+    attention_mask = query_responses != pad_token_id
+    position_ids = attention_mask.cumsum(1) - attention_mask.long()  # exclusive cumsum
+    lm_backbone = getattr(model, model.base_model_prefix)
+    input_ids = torch.masked_fill(query_responses, ~attention_mask, 0)
+    output = lm_backbone(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        return_dict=True,
+        output_hidden_states=True,
+        use_cache=False,  # otherwise mistral-based RM would error out
+    )
+    reward_logits = model.score(output.hidden_states[-1])
+    sequence_lengths = first_true_indices(query_responses[:, context_length:] == pad_token_id) - 1 + context_length
+    # https://github.com/huggingface/transformers/blob/dc68a39c8111217683bf49a4912d0c9018bab33d/src/transformers/models/gpt2/modeling_gpt2.py#L1454
+    return (
+        reward_logits,
+        reward_logits[
+            torch.arange(reward_logits.size(0), device=reward_logits.device),
+            sequence_lengths,
+        ].squeeze(-1),
+        sequence_lengths,
+    )
+
+
+def forward(
+    model: torch.nn.Module,
+    query_responses: torch.Tensor,
+    pad_token_id: int,
+):
+    """
+    Performs a forward pass through the model with the given query responses and pad token ID.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to perform the forward pass.
+        query_responses (`torch.Tensor`):
+            The tensor containing the query responses.
+        pad_token_id (`int`):
+            The token ID representing the pad token.
+
+    Returns:
+        `transformers.modeling_outputs.ModelOutput`:
+            The output of the model, including hidden states.
+    """
+    attention_mask = query_responses != pad_token_id
+    position_ids = attention_mask.cumsum(1) - attention_mask.long()
+    input_ids = torch.masked_fill(query_responses, ~attention_mask, 0)
+    return model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        return_dict=True,
+        output_hidden_states=True,
+    )
+
+
+def prepare_deepspeed(
+    model: torch.nn.Module, per_device_train_batch_size: int, fp16: bool = False, bf16: bool = False
+):
+    """
+    Prepares the model for training with DeepSpeed (both for stage 2 and 3), configuring the appropriate settings
+    based on the model and batch size.
+
+    Args:
+        model (`torch.nn.Module`):
+            The model to be prepared for DeepSpeed training.
+        per_device_train_batch_size (`int`):
+            The training batch size per device.
+        fp16 (`bool`, *optional*, defaults to `False`):
+            Whether to enable fp16 mixed precision in the DeepSpeed config.
+        bf16 (`bool`, *optional*, defaults to `False`):
+            Whether to enable bf16 mixed precision in the DeepSpeed config.
+
+    Returns:
+        `torch.nn.Module`:
+            The model initialized and configured with DeepSpeed for training.
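+
+    When neither `fp16` nor `bf16` is enabled, the ZeRO stage 3 bucket sizes are derived from the model's hidden
+    size (when it is available on the model config).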
+ """ + import deepspeed + + deepspeed_plugin = AcceleratorState().deepspeed_plugin + config_kwargs = deepspeed_plugin.deepspeed_config + if config_kwargs["zero_optimization"]["stage"] != 3: + config_kwargs["train_micro_batch_size_per_gpu"] = per_device_train_batch_size + config_kwargs = { + "train_micro_batch_size_per_gpu": config_kwargs["train_micro_batch_size_per_gpu"], + "prescale_gradients": False, + "wall_clock_breakdown": False, + } + if bf16: + config_kwargs["bf16"] = {"enabled": True} + elif fp16: + config_kwargs["fp16"] = {"enabled": True} + else: + if hasattr(model, "config"): + hidden_size = ( + max(model.config.hidden_sizes) + if getattr(model.config, "hidden_sizes", None) + else getattr(model.config, "hidden_size", None) + ) + if hidden_size is not None and config_kwargs["zero_optimization"]["stage"] == 3: + # Note that `stage3_prefetch_bucket_size` can produce DeepSpeed messages like: `Invalidate trace cache @ step 0: expected module 1, but got module 0` + # This is expected and is not an error, see: https://github.com/microsoft/DeepSpeed/discussions/4081 + config_kwargs.update( + { + "zero_optimization.reduce_bucket_size": hidden_size * hidden_size, + "zero_optimization.stage3_param_persistence_threshold": 10 * hidden_size, + "zero_optimization.stage3_prefetch_bucket_size": 0, + } + ) + model, *_ = deepspeed.initialize(model=model, config=config_kwargs) + model.eval() + return model + + +def truncate_response(stop_token_id: int, pad_token_id: int, responses: torch.Tensor): + """ + Truncates the responses at the first occurrence of the stop token, filling the rest with pad tokens. + + Args: + stop_token_id (`int`): + The token ID representing the stop token where truncation occurs. + pad_token_id (`int`): + The token ID representing the pad token used to fill the truncated responses. + responses (`torch.Tensor`): + The tensor containing the responses to be truncated. + + Returns: + `torch.Tensor`: + The truncated responses tensor with pad tokens filled after the stop token. + """ + trunc_idxs = first_true_indices(responses == stop_token_id).unsqueeze(-1) + new_size = [1] * (len(responses.size()) - 1) + [responses.shape[1]] + idxs = torch.arange(responses.shape[1], device=responses.device).view(*new_size) + postprocessed_responses = torch.masked_fill(responses, idxs > trunc_idxs, pad_token_id) + return postprocessed_responses + + +def generate( + lm_backbone: torch.nn.Module, queries: torch.Tensor, pad_token_id: int, generation_config: GenerationConfig +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Generates sequences from the language model backbone in a way that does not affect padding tokens. + + Args: + lm_backbone (`torch.nn.Module`): + The language model backbone used for generation. + queries (`torch.Tensor`): + The tensor containing the input queries. + pad_token_id (`int`): + The token ID representing the pad token. + generation_config (`GenerationConfig`): + The configuration for the generation process. + + Returns: + tuple: + - `generated_sequences` (`torch.Tensor`): + The concatenated tensor of input queries and generated sequences. + - `logits` (`torch.Tensor`): + The logits output from the generation process. 
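+
+    The returned logits are the raw generation scores stacked along the time dimension, i.e. a tensor of shape
+    `(batch_size, generated_length, vocab_size)`.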
+ """ + context_length = queries.shape[1] + attention_mask = queries != pad_token_id + input_ids = torch.masked_fill(queries, ~attention_mask, 0) + output = lm_backbone.generate( + input_ids=input_ids, + attention_mask=attention_mask, + # position_ids=attention_mask.cumsum(1) - attention_mask.long(), # not needed: already adjusted in generations + # https://github.com/huggingface/transformers/blob/ac33aeeeee2a7a89b89c93c2962e6feb90daef0a/src/transformers/models/gpt2/modeling_gpt2.py#L1227-L1250 + generation_config=generation_config, + return_dict_in_generate=True, + output_scores=True, + ) + logits = torch.stack(output.scores, 1) + return torch.cat((queries, output.sequences[:, context_length:]), dim=1), logits + + +@torch.no_grad() +def batch_generation( + model: torch.nn.Module, + queries: torch.Tensor, + local_rollout_forward_batch_size: int, + pad_token_id: int, + generation_config: GenerationConfig, +): + query_responses = [] + logitss = [] + batch_size = queries.shape[0] + for i in range(0, batch_size, local_rollout_forward_batch_size): + query = queries[i : i + local_rollout_forward_batch_size] + query_response, logits = generate( + model, + query, + pad_token_id, + generation_config, + ) + query_responses.append(query_response) + logitss.append(logits) + + # padding tensors + padded_query_responses = pad(query_responses, padding_value=pad_token_id, padding_side="right") + padded_logitss = pad(logitss, padding_value=0, padding_side="right") + + # reshaping + padded_query_responses = padded_query_responses.view(-1, padded_query_responses.shape[-1])[:batch_size] + padded_logitss = padded_logitss.view(-1, *padded_logitss.shape[2:])[:batch_size] + + return padded_query_responses, padded_logitss + + +def add_bos_token_if_needed( + bos_token_id: Optional[int], + prompt_len_input_ids: int, + prompt_tokens: dict[str, list[int]], + chosen_prompt_len_input_ids: int, + chosen_tokens: dict[str, list[int]], + rejected_prompt_len_input_ids: int, + rejected_tokens: dict[str, list[int]], +): + if bos_token_id is not None: + if prompt_len_input_ids == 0 or bos_token_id != prompt_tokens["prompt_input_ids"][0]: + prompt_tokens["prompt_input_ids"] = [bos_token_id] + prompt_tokens["prompt_input_ids"] + prompt_tokens["prompt_attention_mask"] = [1] + prompt_tokens["prompt_attention_mask"] + if chosen_prompt_len_input_ids == 0 or bos_token_id != chosen_tokens["prompt_input_ids"][0]: + chosen_tokens["prompt_input_ids"] = [bos_token_id] + chosen_tokens["prompt_input_ids"] + chosen_tokens["prompt_attention_mask"] = [1] + chosen_tokens["prompt_attention_mask"] + if rejected_prompt_len_input_ids == 0 or bos_token_id != rejected_tokens["prompt_input_ids"][0]: + rejected_tokens["prompt_input_ids"] = [bos_token_id] + rejected_tokens["prompt_input_ids"] + rejected_tokens["prompt_attention_mask"] = [1] + rejected_tokens["prompt_attention_mask"] + return prompt_tokens, chosen_tokens, rejected_tokens + + +def add_eos_token_if_needed( + eos_token_id: int, chosen_tokens: dict[str, list[int]], rejected_tokens: dict[str, list[int]] +): + if len(chosen_tokens["input_ids"]) == 0 or eos_token_id != chosen_tokens["input_ids"][-1]: + chosen_tokens["input_ids"].append(eos_token_id) + chosen_tokens["attention_mask"].append(1) + if len(rejected_tokens["input_ids"]) == 0 or eos_token_id != rejected_tokens["input_ids"][-1]: + rejected_tokens["input_ids"].append(eos_token_id) + rejected_tokens["attention_mask"].append(1) + return chosen_tokens, rejected_tokens + + +def truncate_right( + input_ids: torch.Tensor, stop_token_id: int, 
pad_token_id: int +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Truncates the input tensor from the right side after the first occurrence of the stop token. + + Args: + input_ids (`torch.Tensor`): + The tensor containing the responses to be truncated + stop_token_id (`int`): + The token ID representing the stop token where truncation occurs + pad_token_id (`int`): + The token ID representing the pad token used to fill the truncated responses + + Returns: + tuple: + - `output_ids` (`torch.Tensor`): + The truncated responses tensor with pad tokens filled after the stop token + - `mask` (`torch.Tensor`): + The mask tensor to indicate the padding tokens + """ + trunc_idxs = first_true_indices(input_ids == stop_token_id).unsqueeze(-1) + new_size = [1] * (len(input_ids.size()) - 1) + [input_ids.shape[1]] + idxs = torch.arange(input_ids.shape[1], device=input_ids.device).view(*new_size) + output_ids = torch.masked_fill(input_ids, idxs > trunc_idxs, pad_token_id) + mask = torch.masked_fill(torch.ones_like(input_ids), idxs > trunc_idxs, 0) + return output_ids, mask + + +def empty_cache() -> None: + """Empties the cache of the available torch device. + + This function checks for the availability of different torch devices (XPU, MLU, NPU, CUDA) + and empties the cache of the first available device it finds. + + If none of the specific devices are available, it defaults to emptying the CUDA cache. + """ + if is_torch_xpu_available(): + torch.xpu.empty_cache() + elif is_torch_mlu_available(): + torch.mlu.empty_cache() + elif is_torch_npu_available(): + torch.npu.empty_cache() + else: + torch.cuda.empty_cache() + + +def decode_and_strip_padding(inputs: torch.Tensor, tokenizer: PreTrainedTokenizerBase) -> list[str]: + """ + Decodes the input tensor and strips the padding tokens. + + Args: + inputs (`torch.Tensor`): + The input tensor to be decoded. + tokenizer (`transformers.PreTrainedTokenizerBase`): + The tokenizer used to decode the input tensor. + + Returns: + `list[str]`: + The list of decoded strings with padding tokens stripped. + """ + decoded = tokenizer.batch_decode(inputs, skip_special_tokens=False) + return [d.replace(tokenizer.pad_token, "") for d in decoded] + + +def generate_model_card( + base_model: Optional[str], + model_name: str, + hub_model_id: str, + dataset_name: Optional[str], + tags: list[str], + wandb_url: Optional[str], + trainer_name: str, + trainer_citation: Optional[str] = None, + paper_title: Optional[str] = None, + paper_id: Optional[str] = None, + comet_url: Optional[str] = None, +) -> ModelCard: + """ + Generate a `ModelCard` from a template. + + Args: + base_model (`str` or `None`): + Base model name. + model_name (`str`): + Model name. + hub_model_id (`str`): + Hub model ID as `username/model_id`. + dataset_name (`str` or `None`): + Dataset name. + tags (`list[str]`): + Tags. + wandb_url (`str` or `None`): + Weights & Biases run URL. + comet_url (`str` or `None`): + Comet experiment URL. + trainer_name (`str`): + Trainer name. + trainer_citation (`str` or `None`, defaults to `None`): + Trainer citation as a BibTeX entry. + paper_title (`str` or `None`, defaults to `None`): + Paper title. + paper_id (`str` or `None`, defaults to `None`): + ArXiv paper ID as `YYMM.NNNNN`. + + Returns: + `ModelCard`: + A ModelCard object. 
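+
+    The card is rendered from the `templates/lm_model_card.md` template bundled with the installed `trl` package.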
+    """
+    card_data = ModelCardData(
+        base_model=base_model,
+        datasets=dataset_name,
+        library_name="transformers",
+        licence="license",
+        model_name=model_name,
+        tags=["generated_from_trainer", *tags],
+    )
+    card = ModelCard.from_template(
+        card_data,
+        template_path=str(pkg_resources.files("trl").joinpath("templates/lm_model_card.md")),
+        base_model=base_model,
+        model_name=model_name,
+        hub_model_id=hub_model_id,
+        dataset_name=dataset_name,
+        wandb_url=wandb_url,
+        comet_url=comet_url,
+        trainer_name=trainer_name,
+        trainer_citation=trainer_citation,
+        paper_title=paper_title,
+        paper_id=paper_id,
+        trl_version=version("trl"),
+        transformers_version=version("transformers"),
+        pytorch_version=version("torch"),
+        datasets_version=version("datasets"),
+        tokenizers_version=version("tokenizers"),
+    )
+    return card
+
+
+def get_comet_experiment_url() -> Optional[str]:
+    """
+    If Comet integration is enabled, return the URL of the current Comet experiment; otherwise, return `None`.
+    """
+    if not is_comet_available():
+        return None
+
+    if comet_ml.get_running_experiment() is not None:
+        return comet_ml.get_running_experiment().url
+
+    return None
+
+
+def log_table_to_comet_experiment(name: str, table: pd.DataFrame) -> None:
+    """
+    If Comet integration is enabled, logs a table to the currently running Comet experiment.
+
+    Args:
+        name (`str`):
+            Table name.
+        table (`pd.DataFrame`):
+            The Pandas DataFrame containing the table to log.
+    """
+    if not is_comet_available():
+        raise ModuleNotFoundError("comet-ml is not installed. Please install it first: `pip install comet-ml`.")
+
+    experiment = comet_ml.get_running_experiment()
+    if experiment is not None:
+        experiment.log_table(tabular_data=table, filename=name)
+
+
+def flush_left(mask: torch.Tensor, *tensors: torch.Tensor) -> tuple[torch.Tensor, ...]:
+    """
+    Shift non-zero elements in the mask and corresponding tensors to the left.
+
+    This function operates on a binary mask and any number of additional tensors with the same dimensions as the mask.
+    For each row, non-zero values are shifted to the leftmost positions. Then, columns that contain only zeros across
+    all rows are truncated from the mask and tensors. Visually, this operation can be represented as follows:
+
+    ```
+    [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
+     [0, x, x, x, 0, 0]]       [x, x, x, 0]]
+    ```
+
+    Args:
+        mask (`torch.Tensor`):
+            2D tensor (binary mask) with shape `(N, M)`.
+        *tensors (`torch.Tensor`):
+            One or more 2D tensors with the same shape as `mask`. These tensors will be processed alongside `mask`,
+            with non-zero values shifted and excess zero columns truncated in the same manner.
+
+    Returns:
+        `torch.Tensor`:
+            Updated binary mask with non-zero values flushed to the left and trailing zero columns removed.
+        `*torch.Tensor`:
+            Updated tensors, processed in the same way as the mask.
+
+    Example:
+    ```python
+    >>> mask = torch.tensor([[0, 0, 1, 1, 1],
+    ...                      [0, 1, 1, 0, 0]])
+    >>> tensor = torch.tensor([[9, 9, 2, 3, 4],
+    ... 
[9, 5, 6, 9, 9]])
+    >>> new_mask, new_tensor = flush_left(mask, tensor)
+    >>> print(new_mask)
+    tensor([[1, 1, 1],
+            [1, 1, 0]])
+    >>> print(new_tensor)
+    tensor([[2, 3, 4],
+            [5, 6, 0]])
+    ```
+    """
+    # Create copy of mask and tensors
+    mask = mask.clone()
+    tensors = [t.clone() for t in tensors]
+
+    # Shift non-zero values to the left
+    for i in range(mask.size(0)):
+        first_one_idx = torch.nonzero(mask[i])[0].item()
+        mask[i] = torch.roll(mask[i], shifts=-first_one_idx)
+        for tensor in tensors:
+            tensor[i] = torch.roll(tensor[i], shifts=-first_one_idx)
+
+    # Get the first column idx that is all zeros and remove every column after that
+    empty_cols = torch.sum(mask, dim=0) == 0
+    first_empty_col = torch.nonzero(empty_cols)[0].item() if empty_cols.any() else mask.size(1)
+    mask = mask[:, :first_empty_col]
+    for i, tensor in enumerate(tensors):
+        tensors[i] = tensor[:, :first_empty_col]
+
+    if not tensors:
+        return mask
+    else:
+        return mask, *tensors
+
+
+def selective_log_softmax(logits, index):
+    """
+    A memory-efficient implementation of the common `log_softmax -> gather` operation.
+
+    This function is equivalent to the following naive implementation:
+    ```python
+    logps = torch.gather(logits.log_softmax(-1), dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
+    ```
+
+    Args:
+        logits (`torch.Tensor`):
+            Logits tensor of shape `(..., num_classes)`.
+        index (`torch.Tensor`):
+            Index tensor of shape `(...)`, specifying the positions to gather from the log-softmax output.
+
+    Returns:
+        `torch.Tensor`:
+            Gathered log probabilities with the same shape as `index`.
+    """
+    if logits.dtype in [torch.float32, torch.float64]:
+        selected_logits = torch.gather(logits, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
+        # loop to reduce peak mem consumption
+        logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits])
+        per_token_logps = selected_logits - logsumexp_values  # log_softmax(x_i) = x_i - logsumexp(x)
+    else:
+        # The logsumexp approach is unstable with bfloat16, so we fall back to a slightly less efficient approach
+        per_token_logps = []
+        for row_logits, row_labels in zip(logits, index):  # loop to reduce peak mem consumption
+            row_logps = F.log_softmax(row_logits, dim=-1)
+            row_per_token_logps = row_logps.gather(dim=-1, index=row_labels.unsqueeze(-1)).squeeze(-1)
+            per_token_logps.append(row_per_token_logps)
+        per_token_logps = torch.stack(per_token_logps)
+    return per_token_logps
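+
+
+# Illustrative usage of `selective_log_softmax` (the shapes and values below are made up for the example):
+# it computes per-token log-probabilities for the given token ids without materializing the full
+# log-softmax output, which keeps peak memory low for large vocabularies.
+#
+#     logits = torch.randn(2, 5, 100)                # (batch_size, seq_len, vocab_size)
+#     tokens = torch.randint(0, 100, (2, 5))         # token ids to gather, one per position
+#     logps = selective_log_softmax(logits, tokens)  # -> shape (2, 5)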