diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..9a86c18c56be3b5ad36f1928be3e4e8daef5c6bb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,28 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +static/autotrain_homepage.png filter=lfs diff=lfs merge=lfs -text +static/autotrain_model_choice.png filter=lfs diff=lfs merge=lfs -text +static/autotrain_space.png filter=lfs diff=lfs merge=lfs -text +static/autotrain_text_classification.png filter=lfs diff=lfs merge=lfs -text +static/cost.png filter=lfs diff=lfs merge=lfs -text +static/dreambooth1.jpeg filter=lfs diff=lfs merge=lfs -text +static/dreambooth2.png filter=lfs diff=lfs merge=lfs -text +static/duplicate_space.png filter=lfs diff=lfs merge=lfs -text +static/ext_qa.png filter=lfs diff=lfs merge=lfs -text +static/hub_model_choice.png filter=lfs diff=lfs merge=lfs -text +static/image_classification_1.png filter=lfs diff=lfs merge=lfs -text +static/img_reg_ui.png filter=lfs diff=lfs merge=lfs -text +static/llm_1.png filter=lfs diff=lfs merge=lfs -text +static/llm_2.png filter=lfs diff=lfs merge=lfs -text +static/llm_3.png filter=lfs diff=lfs merge=lfs -text +static/llm_orpo_example.png filter=lfs diff=lfs merge=lfs -text +static/model_choice_1.png filter=lfs diff=lfs merge=lfs -text +static/param_choice_1.png filter=lfs diff=lfs merge=lfs -text +static/param_choice_2.png filter=lfs diff=lfs merge=lfs -text +static/space_template_1.png filter=lfs diff=lfs merge=lfs -text +static/space_template_2.png filter=lfs diff=lfs merge=lfs -text +static/space_template_3.png filter=lfs diff=lfs merge=lfs -text +static/space_template_4.png filter=lfs diff=lfs merge=lfs -text +static/space_template_5.png filter=lfs diff=lfs merge=lfs -text +static/text_classification_1.png filter=lfs diff=lfs merge=lfs -text diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..99b92340ae2a135ff63c5c5a6476efda7fad3787 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,83 @@ +FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive \ + TZ=UTC \ + HF_HUB_ENABLE_HF_TRANSFER=1 + +ENV PATH="${HOME}/miniconda3/bin:${PATH}" +ARG PATH="${HOME}/miniconda3/bin:${PATH}" +ENV PATH="/app/ngc-cli:${PATH}" +ARG PATH="/app/ngc-cli:${PATH}" + +RUN mkdir -p /tmp/model && \ + chown -R 1000:1000 /tmp/model && \ + mkdir -p /tmp/data && \ + chown -R 1000:1000 /tmp/data + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y \ + build-essential \ + cmake \ + curl \ + ca-certificates \ + gcc \ + git \ + locales \ + net-tools \ + wget \ + libpq-dev \ + libsndfile1-dev \ + git \ + git-lfs \ + libgl1 \ + unzip \ + libjpeg-dev \ + libpng-dev \ + libgomp1 \ + && rm -rf /var/lib/apt/lists/* && \ + apt-get clean + + +RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ + git lfs install + +WORKDIR /app +RUN mkdir -p /app/.cache +ENV HF_HOME="/app/.cache" +RUN useradd -m -u 1000 user +RUN chown -R user:user /app +USER user +ENV HOME=/app + +ENV PYTHONPATH=$HOME/app \ + PYTHONUNBUFFERED=1 \ + GRADIO_ALLOW_FLAGGING=never \ + GRADIO_NUM_PORTS=1 \ + GRADIO_SERVER_NAME=0.0.0.0 \ + SYSTEM=spaces + + +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + && sh Miniconda3-latest-Linux-x86_64.sh -b -p /app/miniconda \ + && rm -f 
Miniconda3-latest-Linux-x86_64.sh +ENV PATH /app/miniconda/bin:$PATH + +RUN conda create -p /app/env -y python=3.10 + +SHELL ["conda", "run","--no-capture-output", "-p","/app/env", "/bin/bash", "-c"] + +RUN conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.1 -c pytorch -c nvidia && \ + conda clean -ya && \ + conda install -c "nvidia/label/cuda-12.1.1" cuda-nvcc && conda clean -ya && \ + conda install xformers -c xformers && conda clean -ya + +COPY --chown=1000:1000 . /app/ + +RUN pip install -e . && \ + python -m nltk.downloader punkt && \ + pip install -U ninja && \ + pip install -U flash-attn --no-build-isolation && \ + pip install -U deepspeed && \ + pip install --upgrade --force-reinstall --no-cache-dir "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git" --no-deps && \ + pip cache purge diff --git a/Dockerfile.api b/Dockerfile.api new file mode 100644 index 0000000000000000000000000000000000000000..097ea7ac62e2720b47cad89b6128ebe38518d6de --- /dev/null +++ b/Dockerfile.api @@ -0,0 +1,3 @@ +FROM huggingface/autotrain-advanced:latest + +CMD autotrain api --port 7860 --host 0.0.0.0 \ No newline at end of file diff --git a/Dockerfile.app b/Dockerfile.app new file mode 100644 index 0000000000000000000000000000000000000000..9834b4d9598ae9eab17bb7750171bf8c2ed892d9 --- /dev/null +++ b/Dockerfile.app @@ -0,0 +1,2 @@ +FROM huggingface/autotrain-advanced:latest +CMD uvicorn autotrain.app:app --host 0.0.0.0 --port 7860 --reload --workers 4 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..7a4a3ea2424c09fbe48d455aed1eaa94d9124835 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..6b67a6a4b273d7ecdb1e03e18574d4191848284f --- /dev/null +++ b/Makefile @@ -0,0 +1,39 @@ +.PHONY: quality style test + +# Check that source code meets quality standards + +quality: + black --check --line-length 119 --target-version py38 . + isort --check-only . + flake8 --max-line-length 119 + +# Format source code automatically + +style: + black --line-length 119 --target-version py38 . + isort . + +test: + pytest -sv ./src/ + +docker: + docker build -t autotrain-advanced:latest . + docker tag autotrain-advanced:latest huggingface/autotrain-advanced:latest + docker push huggingface/autotrain-advanced:latest + +api: + docker build -t autotrain-advanced-api:latest -f Dockerfile.api . 
+	docker tag autotrain-advanced-api:latest public.ecr.aws/z4c3o6n6/autotrain-api:latest + docker push public.ecr.aws/z4c3o6n6/autotrain-api:latest + +ngc: + docker build -t autotrain-advanced:latest . + docker tag autotrain-advanced:latest nvcr.io/ycymhzotssoi/autotrain-advanced:latest + docker push nvcr.io/ycymhzotssoi/autotrain-advanced:latest + +pip: + rm -rf build/ + rm -rf dist/ + make style && make quality + python setup.py sdist bdist_wheel + twine upload dist/* --verbose --repository autotrain-advanced \ No newline at end of file diff --git a/Manifest.in b/Manifest.in new file mode 100644 index 0000000000000000000000000000000000000000..ef04694c3f356e5d10b87bc63301d7e64783a387 --- /dev/null +++ b/Manifest.in @@ -0,0 +1,2 @@ +recursive-include src/autotrain/static * +recursive-include src/autotrain/templates * \ No newline at end of file diff --git a/README.md b/README.md index f48e1fcea38396f2fbaf50b5de50f8723d49043c..d0b153e762dbaf1e5f217d4d06398ce2daae8787 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,133 @@ ---- -title: Tensora Autotrain -emoji: 😻 -colorFrom: pink -colorTo: red -sdk: docker -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# 🤗 AutoTrain Advanced + +AutoTrain Advanced: faster and easier training and deployment of state-of-the-art machine learning models. AutoTrain Advanced is a no-code solution that allows you to train machine learning models in just a few clicks. Please note that you must upload data in the correct format for a project to be created. For help with the proper data format and pricing, check out the documentation. + +NOTE: AutoTrain is free! You only pay for the resources you use if you decide to run AutoTrain on Hugging Face Spaces. When running locally, you only pay for the resources you use on your own infrastructure.
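+As a quick, hedged illustration of the expected data format (see the documentation for the authoritative spec for each task): the LLM trainers accept a `train.csv` whose only required column is `text`, for example in the guanaco style used by several sample configs in this repository. The rows below are invented placeholders: + +```csv +text +"### Human: What is AutoTrain? ### Assistant: AutoTrain is a no-code tool for training machine learning models." +"### Human: How do I start a training job? ### Assistant: Upload a correctly formatted dataset, then pick a task, a base model and your parameters." +```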
+ +## Supported Tasks + +| Task | Status | Python Notebook | Example Configs | +| --- | --- | --- | --- | +| LLM SFT Finetuning | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/notebooks/llm_finetuning.ipynb) | [llm_sft_finetune.yaml](https://github.com/huggingface/autotrain-advanced/blob/main/configs/llm_finetuning/smollm2.yml) | +| LLM ORPO Finetuning | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/notebooks/llm_finetuning.ipynb) | [llm_orpo_finetune.yaml](https://github.com/huggingface/autotrain-advanced/blob/main/configs/llm_finetuning/llama3-8b-orpo.yml) | +| LLM DPO Finetuning | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/notebooks/llm_finetuning.ipynb) | [llm_dpo_finetune.yaml](https://github.com/huggingface/autotrain-advanced/blob/main/configs/llm_finetuning/llama3-8b-dpo-qlora.yml) | +| LLM Reward Finetuning | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/notebooks/llm_finetuning.ipynb) | [llm_reward_finetune.yaml](https://github.com/huggingface/autotrain-advanced/blob/main/configs/llm_finetuning/llama32-1b-sft.yml) | +| LLM Generic/Default Finetuning | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/notebooks/llm_finetuning.ipynb) | [llm_generic_finetune.yaml](https://github.com/huggingface/autotrain-advanced/blob/main/configs/llm_finetuning/gpt2_sft.yml) | +| Text Classification | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/notebooks/text_classification.ipynb) | [text_classification.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/text_classification) | +| Text Regression | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/notebooks/text_regression.ipynb) | [text_regression.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/text_regression) | +| Token Classification | ✅ | Coming Soon | [token_classification.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/token_classification) | +| Seq2Seq | ✅ | Coming Soon | [seq2seq.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/seq2seq) | +| Extractive Question Answering | ✅ | Coming Soon | [extractive_qa.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/extractive_question_answering) | +| Image Classification | ✅ | Coming Soon | [image_classification.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/image_classification) | +| Image Scoring/Regression | ✅ | Coming Soon | [image_regression.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/image_scoring) | +| VLM | 🟥 | Coming Soon | [vlm.yaml](https://github.com/huggingface/autotrain-advanced/tree/main/configs/vlm) | + + +## Running UI on Colab or Hugging Face Spaces + +- Deploy AutoTrain on Hugging Face Spaces: [![Deploy on 
Spaces](https://huggingface.co/datasets/huggingface/badges/resolve/main/deploy-on-spaces-md.svg)](https://huggingface.co/login?next=%2Fspaces%2Fautotrain-projects%2Fautotrain-advanced%3Fduplicate%3Dtrue) + + +- Run the AutoTrain UI on Colab via ngrok: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/colabs/AutoTrain_ngrok.ipynb) + + +## Local Installation + +You can install the AutoTrain Advanced Python package via pip. Please note that you need Python >= 3.10 for AutoTrain Advanced to work properly. + + pip install autotrain-advanced + +Please make sure that you have git-lfs installed. Check out the instructions here: https://github.com/git-lfs/git-lfs/wiki/Installation + +You also need to install torch, torchaudio and torchvision. + +The best way to run autotrain is in a conda environment. You can create a new conda environment with the following commands: + + conda create -n autotrain python=3.10 + conda activate autotrain + pip install autotrain-advanced + conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia + conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc + +Once done, you can start the application using: + + autotrain app --port 8080 --host 127.0.0.1 + + +If you are not fond of the UI, you can use AutoTrain config files to train from the command line, or simply use the AutoTrain CLI. + +To use a config file for training, you can use the following command: + + autotrain --config <path_to_config_file> + + +You can find sample config files in the `configs` directory of this repository. + +Example config file for finetuning SmolLM2: + +```yaml +task: llm-sft +base_model: HuggingFaceTB/SmolLM2-1.7B-Instruct +project_name: autotrain-smollm2-finetune +log: tensorboard +backend: local + +data: + path: HuggingFaceH4/no_robots + train_split: train + valid_split: null + chat_template: tokenizer + column_mapping: + text_column: messages + +params: + block_size: 2048 + model_max_length: 4096 + epochs: 2 + batch_size: 1 + lr: 1e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: paged_adamw_8bit + scheduler: linear + gradient_accumulation: 8 + mixed_precision: bf16 + merge_adapter: true + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true +``` + +To fine-tune a model using the config file above, you can use the following command: + +```bash +$ export HF_USERNAME=<your_hugging_face_username> +$ export HF_TOKEN=<your_hugging_face_write_token> +$ autotrain --config <path_to_config_file> +``` + + +## Documentation + +Documentation is available at https://hf.co/docs/autotrain/ + +## Citation + +``` +@inproceedings{thakur-2024-autotrain, + title = "{A}uto{T}rain: No-code training for state-of-the-art models", + author = "Thakur, Abhishek", + booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", + month = nov, + year = "2024", + address = "Miami, Florida, USA", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2024.emnlp-demo.44", + pages = "419--423", + abstract = "With the advancements in open-source models, training(or finetuning) models on custom datasets has become a crucial part of developing solutions which are tailored to specific industrial or open-source applications. 
Yet, there is no single tool which simplifies the process of training across different types of modalities or tasks.We introduce AutoTrain(aka AutoTrain Advanced){---}an open-source, no code tool/library which can be used to train (or finetune) models for different kinds of tasks such as: large language model (LLM) finetuning, text classification/regression, token classification, sequence-to-sequence task, finetuning of sentence transformers, visual language model (VLM) finetuning, image classification/regression and even classification and regression tasks on tabular data. AutoTrain Advanced is an open-source library providing best practices for training models on custom datasets. The library is available at https://github.com/huggingface/autotrain-advanced. AutoTrain can be used in fully local mode or on cloud machines and works with tens of thousands of models shared on Hugging Face Hub and their variations.", +} +``` diff --git a/colabs/AutoTrain.ipynb b/colabs/AutoTrain.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..84a141a33d88d1789be7562202e69781043fe542 --- /dev/null +++ b/colabs/AutoTrain.ipynb @@ -0,0 +1,50 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"AutoTrain\"\n", + "\n", + "- Attach proper hardware\n", + "- Click Runtime > Run all\n", + "- Read the [docs](https://hf.co/docs/autotrain) for data format, parameters and other questions\n", + "- GitHub Repo: https://github.com/huggingface/autotrain-advanced" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -U autotrain-advanced > install_logs.txt 2>&1\n", + "from IPython.display import display\n", + "from autotrain.app.colab import colab_app\n", + "elements = colab_app()\n", + "display(elements)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "autotrain", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.1.-1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/colabs/AutoTrain_LLM.ipynb b/colabs/AutoTrain_LLM.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..f1ea3ed000813b3b148bce142a9e4936634d6175 --- /dev/null +++ b/colabs/AutoTrain_LLM.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "collapsed": true, + "id": "JvMRbVLEJlZT" + }, + "outputs": [], + "source": [ + "#@title 🤗 AutoTrain LLM\n", + "#@markdown In order to use this colab\n", + "#@markdown - upload train.csv to a folder named `data/`\n", + "#@markdown - train.csv must contain a `text` column\n", + "#@markdown - choose a project name if you wish\n", + "#@markdown - change model if you wish, you can use most of the text-generation models from Hugging Face Hub\n", + "#@markdown - add huggingface information (token) if you wish to push trained model to huggingface hub\n", + "#@markdown - update hyperparameters if you wish\n", + "#@markdown - click `Runtime > Run all` or run each cell individually\n", + "#@markdown - report issues / feature requests here: https://github.com/huggingface/autotrain-advanced/issues\n", + "\n", + "\n", + "import os\n", + "!pip install -U autotrain-advanced > install_logs.txt 2>&1\n", + "!autotrain setup --colab > 
setup_logs.txt\n", + "from autotrain import __version__\n", + "print(f'AutoTrain version: {__version__}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "A2-_lkBS1WKA" + }, + "outputs": [], + "source": [ + "#@markdown ---\n", + "#@markdown #### Project Config\n", + "#@markdown Note: if you are using a restricted/private model, you need to enter your Hugging Face token in the next step.\n", + "project_name = 'my-autotrain-llm' # @param {type:\"string\"}\n", + "model_name = 'abhishek/llama-2-7b-hf-small-shards' # @param {type:\"string\"}\n", + "\n", + "#@markdown ---\n", + "#@markdown #### Push to Hub?\n", + "#@markdown Use these only if you want to push your trained model to a private repo in your Hugging Face Account\n", + "#@markdown If you dont use these, the model will be saved in Google Colab and you are required to download it manually.\n", + "#@markdown Please enter your Hugging Face write token. The trained model will be saved to your Hugging Face account.\n", + "#@markdown You can find your token here: https://huggingface.co/settings/tokens\n", + "push_to_hub = False # @param [\"False\", \"True\"] {type:\"raw\"}\n", + "hf_token = \"hf_XXX\" #@param {type:\"string\"}\n", + "hf_username = \"abc\" #@param {type:\"string\"}\n", + "\n", + "#@markdown ---\n", + "#@markdown #### Hyperparameters\n", + "unsloth = False # @param [\"False\", \"True\"] {type:\"raw\"}\n", + "learning_rate = 2e-4 # @param {type:\"number\"}\n", + "num_epochs = 1 #@param {type:\"number\"}\n", + "batch_size = 1 # @param {type:\"slider\", min:1, max:32, step:1}\n", + "block_size = 1024 # @param {type:\"number\"}\n", + "trainer = \"sft\" # @param [\"generic\", \"sft\"] {type:\"string\"}\n", + "warmup_ratio = 0.1 # @param {type:\"number\"}\n", + "weight_decay = 0.01 # @param {type:\"number\"}\n", + "gradient_accumulation = 4 # @param {type:\"number\"}\n", + "mixed_precision = \"fp16\" # @param [\"fp16\", \"bf16\", \"none\"] {type:\"string\"}\n", + "peft = True # @param [\"False\", \"True\"] {type:\"raw\"}\n", + "quantization = \"int4\" # @param [\"int4\", \"int8\", \"none\"] {type:\"string\"}\n", + "lora_r = 16 #@param {type:\"number\"}\n", + "lora_alpha = 32 #@param {type:\"number\"}\n", + "lora_dropout = 0.05 #@param {type:\"number\"}\n", + "\n", + "os.environ[\"HF_TOKEN\"] = hf_token\n", + "os.environ[\"HF_USERNAME\"] = hf_username\n", + "\n", + "conf = f\"\"\"\n", + "task: llm-{trainer}\n", + "base_model: {model_name}\n", + "project_name: {project_name}\n", + "log: tensorboard\n", + "backend: local\n", + "\n", + "data:\n", + " path: data/\n", + " train_split: train\n", + " valid_split: null\n", + " chat_template: null\n", + " column_mapping:\n", + " text_column: text\n", + "\n", + "params:\n", + " block_size: {block_size}\n", + " lr: {learning_rate}\n", + " warmup_ratio: {warmup_ratio}\n", + " weight_decay: {weight_decay}\n", + " epochs: {num_epochs}\n", + " batch_size: {batch_size}\n", + " gradient_accumulation: {gradient_accumulation}\n", + " mixed_precision: {mixed_precision}\n", + " peft: {peft}\n", + " quantization: {quantization}\n", + " lora_r: {lora_r}\n", + " lora_alpha: {lora_alpha}\n", + " lora_dropout: {lora_dropout}\n", + " unsloth: {unsloth}\n", + "\n", + "hub:\n", + " username: ${{HF_USERNAME}}\n", + " token: ${{HF_TOKEN}}\n", + " push_to_hub: {push_to_hub}\n", + "\"\"\"\n", + "\n", + "with open(\"conf.yaml\", \"w\") as f:\n", + " f.write(conf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": 
true, + "id": "g3cd_ED_yXXt" + }, + "outputs": [], + "source": [ + "!autotrain --config conf.yaml" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/colabs/AutoTrain_ngrok.ipynb b/colabs/AutoTrain_ngrok.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2d0965a0ded142e0cb741f44942840ae9ee1cc36 --- /dev/null +++ b/colabs/AutoTrain_ngrok.ipynb @@ -0,0 +1,52 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "II6F7ThkI10I" + }, + "outputs": [], + "source": [ + "#@title 🤗 AutoTrain\n", + "#@markdown In order to use this colab\n", + "#@markdown - Enter your [Hugging Face Write Token](https://huggingface.co/settings/tokens)\n", + "#@markdown - Enter your [ngrok auth token](https://dashboard.ngrok.com/get-started/your-authtoken)\n", + "huggingface_token = '' # @param {type:\"string\"}\n", + "ngrok_token = \"\" # @param {type:\"string\"}\n", + "\n", + "#@markdown\n", + "#@markdown - Attach appropriate accelerator `Runtime > Change runtime type > Hardware accelerator`\n", + "#@markdown - click `Runtime > Run all`\n", + "#@markdown - Follow the link to access the UI\n", + "#@markdown - Training happens inside this Google Colab\n", + "#@markdown - report issues / feature requests [here](https://github.com/huggingface/autotrain-advanced/issues)\n", + "\n", + "import os\n", + "os.environ[\"HF_TOKEN\"] = str(huggingface_token)\n", + "os.environ[\"NGROK_AUTH_TOKEN\"] = str(ngrok_token)\n", + "os.environ[\"AUTOTRAIN_LOCAL\"] = \"1\"\n", + "\n", + "!pip install -U autotrain-advanced > install_logs.txt 2>&1\n", + "!autotrain app --share" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/colabs/image_classification.ipynb b/colabs/image_classification.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..b0b58f50251c4ab39477fa5a11349f865fbb7366 --- /dev/null +++ b/colabs/image_classification.ipynb @@ -0,0 +1,63 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile config.yml\n", + "task: image_classification # do not change\n", + "base_model: google/vit-base-patch16-224 # the model to be used from hugging face hub\n", + "project_name: autotrain-image-classification-model # the name of the project, must be unique\n", + "log: tensorboard # do not change\n", + "backend: local # do not change\n", + "\n", + "data:\n", + " path: data/ # the path to the data folder, this folder consists of `train` and `valid` (if any) folders\n", + " train_split: train # this folder inside data/ will be used for training, it contains the images in subfolders.\n", + " valid_split: null # this folder inside data/ will be used for validation, it contains the images in subfolders. 
If not available, set it to null\n", + " column_mapping: # do not change\n", + " image_column: image\n", + " target_column: labels\n", + "\n", + "params:\n", + " epochs: 2\n", + " batch_size: 4\n", + " lr: 2e-5\n", + " optimizer: adamw_torch\n", + " scheduler: linear\n", + " gradient_accumulation: 1\n", + " mixed_precision: fp16\n", + "\n", + "hub:\n", + " username: ${HF_USERNAME} # please set HF_USERNAME in colab secrets\n", + " token: ${HF_TOKEN} # please set HF_TOKEN in colab secrets, must be valid hugging face write token\n", + " push_to_hub: true # set to true if you want to push the model to the hub" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from google.colab import userdata\n", + "HF_USERNAME = userdata.get('HF_USERNAME')\n", + "HF_TOKEN = userdata.get('HF_TOKEN')\n", + "os.environ['HF_USERNAME'] = HF_USERNAME\n", + "\n", + "os.environ['HF_TOKEN'] = HF_TOKEN\n", + "!autotrain --config config.yml" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/configs/extractive_question_answering/hub_dataset.yml b/configs/extractive_question_answering/hub_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..092d0b2d013a57cf2f9ffaf9461ce1457aba14ee --- /dev/null +++ b/configs/extractive_question_answering/hub_dataset.yml @@ -0,0 +1,30 @@ +task: extractive-qa +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-ex-qa1 +log: tensorboard +backend: local + +data: + path: lhoestq/squad + train_split: train + valid_split: validation + column_mapping: + text_column: context + question_column: question + answer_column: answers + +params: + max_seq_length: 512 + max_doc_stride: 128 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/extractive_question_answering/local_dataset.yml b/configs/extractive_question_answering/local_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..0b251c08405a222fa04372106d8acdefca3ef76a --- /dev/null +++ b/configs/extractive_question_answering/local_dataset.yml @@ -0,0 +1,30 @@ +task: extractive-qa +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-ex-qa2 +log: tensorboard +backend: local + +data: + path: data/ # this must be the path to the directory containing the train and valid files + train_split: train # this must be either train.csv or train.json + valid_split: valid # this must be either valid.csv or valid.json + column_mapping: + text_column: context + question_column: question + answer_column: answers + +params: + max_seq_length: 512 + max_doc_stride: 128 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/image_classification/hub_dataset.yml b/configs/image_classification/hub_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..fe7368e87b37767ee236a7c58c0101bd540f8352 --- /dev/null +++ b/configs/image_classification/hub_dataset.yml @@ -0,0 +1,27 @@ +task: image_classification +base_model: google/vit-base-patch16-224 +project_name: autotrain-cats-vs-dogs-finetuned 
+log: tensorboard +backend: local + +data: + path: cats_vs_dogs + train_split: train + valid_split: null + column_mapping: + image_column: image + target_column: labels + +params: + epochs: 2 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/image_classification/local.yml b/configs/image_classification/local.yml new file mode 100644 index 0000000000000000000000000000000000000000..a701c168998f13c5ec7715ce7f4080312e89b27a --- /dev/null +++ b/configs/image_classification/local.yml @@ -0,0 +1,27 @@ +task: image_classification +base_model: google/vit-base-patch16-224 +project_name: autotrain-image-classification-model +log: tensorboard +backend: local + +data: + path: data/ + train_split: train # this folder inside data/ will be used for training, it contains the images in subfolders. + valid_split: null + column_mapping: + image_column: image + target_column: label + +params: + epochs: 2 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/image_scoring/hub_dataset.yml b/configs/image_scoring/hub_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..bdd2d764736106ce4a9aee22a5a596e81054ea07 --- /dev/null +++ b/configs/image_scoring/hub_dataset.yml @@ -0,0 +1,27 @@ +task: image_regression +base_model: google/vit-base-patch16-224 +project_name: autotrain-cats-vs-dogs-finetuned +log: tensorboard +backend: local + +data: + path: cats_vs_dogs + train_split: train + valid_split: null + column_mapping: + image_column: image + target_column: labels + +params: + epochs: 2 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/image_scoring/image_quality.yml b/configs/image_scoring/image_quality.yml new file mode 100644 index 0000000000000000000000000000000000000000..1cf4221f034821ba05d55de9be8e0dbb36dcce62 --- /dev/null +++ b/configs/image_scoring/image_quality.yml @@ -0,0 +1,27 @@ +task: image_regression +base_model: microsoft/resnet-50 +project_name: autotrain-img-quality-resnet50 +log: tensorboard +backend: local + +data: + path: abhishek/img-quality-full + train_split: train + valid_split: null + column_mapping: + image_column: image + target_column: target + +params: + epochs: 10 + batch_size: 8 + lr: 2e-3 + optimizer: adamw_torch + scheduler: cosine + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/image_scoring/local.yml b/configs/image_scoring/local.yml new file mode 100644 index 0000000000000000000000000000000000000000..377cf227a7de409409516e1eebaef891d7b137e2 --- /dev/null +++ b/configs/image_scoring/local.yml @@ -0,0 +1,28 @@ +task: image_regression +base_model: google/vit-base-patch16-224 +project_name: autotrain-image-regression-model +log: tensorboard +backend: local + +data: + path: data/ + train_split: train # this folder inside data/ will be used for training, it contains the images and metadata.jsonl + valid_split: valid # this folder inside data/ will be used 
for validation, it contains the images and metadata.jsonl. can be set to null + # column mapping should not be changed for local datasets + column_mapping: + image_column: image + target_column: target + +params: + epochs: 2 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/llm_finetuning/gpt2_sft.yml b/configs/llm_finetuning/gpt2_sft.yml new file mode 100644 index 0000000000000000000000000000000000000000..50350948edcde50580ce391b4b4bdf14e9f7d126 --- /dev/null +++ b/configs/llm_finetuning/gpt2_sft.yml @@ -0,0 +1,32 @@ +task: llm-sft +base_model: openai-community/gpt2 +project_name: autotrain-gpt2-finetuned-guanaco +log: tensorboard +backend: local + +data: + path: timdettmers/openassistant-guanaco + train_split: train + valid_split: null + chat_template: null + column_mapping: + text_column: text + +params: + block_size: 1024 + model_max_length: 2048 + max_prompt_length: 512 + epochs: 3 + batch_size: 2 + lr: 3e-5 + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 4 + mixed_precision: fp16 + merge_adapter: true + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: false \ No newline at end of file diff --git a/configs/llm_finetuning/llama3-70b-orpo-v1.yml b/configs/llm_finetuning/llama3-70b-orpo-v1.yml new file mode 100644 index 0000000000000000000000000000000000000000..df10ba9739fd87138e37e0d7aad9145e3f58aafa --- /dev/null +++ b/configs/llm_finetuning/llama3-70b-orpo-v1.yml @@ -0,0 +1,36 @@ +task: llm-orpo +base_model: meta-llama/Meta-Llama-3-70B-Instruct +project_name: autotrain-llama3-70b-orpo-v1 +log: tensorboard +backend: local + +data: + path: argilla/distilabel-capybara-dpo-7k-binarized + train_split: train + valid_split: valid + chat_template: chatml + column_mapping: + text_column: chosen + rejected_text_column: rejected + prompt_text_column: prompt + +params: + block_size: 2048 + model_max_length: 8192 + max_prompt_length: 1024 + epochs: 3 + batch_size: 1 + lr: 1e-5 + peft: true + quantization: null + target_modules: all-linear + padding: right + optimizer: paged_adamw_8bit + scheduler: linear + gradient_accumulation: 4 + mixed_precision: bf16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/llm_finetuning/llama3-70b-sft.yml b/configs/llm_finetuning/llama3-70b-sft.yml new file mode 100644 index 0000000000000000000000000000000000000000..cbc89e04a95f5d54758239f4a1c43c9987fe167a --- /dev/null +++ b/configs/llm_finetuning/llama3-70b-sft.yml @@ -0,0 +1,33 @@ +task: llm-sft +base_model: meta-llama/Meta-Llama-3-70B-Instruct +project_name: autotrain-llama3-70b-math-v1 +log: tensorboard +backend: local + +data: + path: rishiraj/guanaco-style-metamath-40k + train_split: train + valid_split: null + chat_template: null + column_mapping: + text_column: text + +params: + block_size: 2048 + model_max_length: 8192 + epochs: 2 + batch_size: 1 + lr: 1e-5 + peft: true + quantization: null + target_modules: all-linear + padding: right + optimizer: paged_adamw_8bit + scheduler: linear + gradient_accumulation: 8 + mixed_precision: bf16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/llm_finetuning/llama3-8b-dpo-qlora.yml b/configs/llm_finetuning/llama3-8b-dpo-qlora.yml new file mode 100644 index 
0000000000000000000000000000000000000000..640f4154395dd045c2b62a38698c053dffd7aeaf --- /dev/null +++ b/configs/llm_finetuning/llama3-8b-dpo-qlora.yml @@ -0,0 +1,36 @@ +task: llm-dpo +base_model: meta-llama/Meta-Llama-3-8B-Instruct +project_name: autotrain-llama3-8b-dpo-qlora +log: tensorboard +backend: local + +data: + path: mlabonne/orpo-dpo-mix-40k + train_split: train + valid_split: null + chat_template: chatml + column_mapping: + text_column: chosen + rejected_text_column: rejected + prompt_text_column: prompt + +params: + block_size: 1024 + model_max_length: 2048 + max_prompt_length: 512 + epochs: 3 + batch_size: 2 + lr: 3e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 4 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: false \ No newline at end of file diff --git a/configs/llm_finetuning/llama3-8b-orpo-space.yml b/configs/llm_finetuning/llama3-8b-orpo-space.yml new file mode 100644 index 0000000000000000000000000000000000000000..29e07899e40d9b5eef1f0ca420e3c26c2e88138e --- /dev/null +++ b/configs/llm_finetuning/llama3-8b-orpo-space.yml @@ -0,0 +1,36 @@ +task: llm-orpo +base_model: meta-llama/Meta-Llama-3-8B-Instruct +project_name: autotrain-llama3-8b-orpo-t1 +log: tensorboard +backend: spaces-a10g-largex4 + +data: + path: argilla/distilabel-capybara-dpo-7k-binarized + train_split: train + valid_split: null + chat_template: chatml + column_mapping: + text_column: chosen + rejected_text_column: rejected + prompt_text_column: prompt + +params: + block_size: 1024 + model_max_length: 8192 + max_prompt_length: 512 + epochs: 3 + batch_size: 2 + lr: 3e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 4 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/llm_finetuning/llama3-8b-orpo.yml b/configs/llm_finetuning/llama3-8b-orpo.yml new file mode 100644 index 0000000000000000000000000000000000000000..d5d8a1c434e46ba23297979799efe4228f518f83 --- /dev/null +++ b/configs/llm_finetuning/llama3-8b-orpo.yml @@ -0,0 +1,36 @@ +task: llm-orpo +base_model: meta-llama/Meta-Llama-3-8B-Instruct +project_name: autotrain-llama3-8b-orpo +log: tensorboard +backend: local + +data: + path: argilla/distilabel-capybara-dpo-7k-binarized + train_split: train + valid_split: null + chat_template: chatml + column_mapping: + text_column: chosen + rejected_text_column: rejected + prompt_text_column: prompt + +params: + block_size: 1024 + model_max_length: 8192 + max_prompt_length: 512 + epochs: 3 + batch_size: 2 + lr: 3e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 4 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/llm_finetuning/llama3-8b-sft-unsloth.yml b/configs/llm_finetuning/llama3-8b-sft-unsloth.yml new file mode 100644 index 0000000000000000000000000000000000000000..ec797c9a2ef1f6896c78cc2191d15f4a5fa8099c --- /dev/null +++ b/configs/llm_finetuning/llama3-8b-sft-unsloth.yml @@ -0,0 +1,36 @@ +task: llm-sft +base_model: meta-llama/Meta-Llama-3-8B-Instruct +project_name: autotrain-llama3-8b-sft-unsloth +log: tensorboard +backend: local + +data: + path: 
rishiraj/guanaco-style-metamath-40k + train_split: train + valid_split: null + chat_template: null + column_mapping: + text_column: text + +params: + block_size: 1024 + model_max_length: 8192 + max_prompt_length: 512 + epochs: 3 + batch_size: 2 + lr: 3e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 4 + mixed_precision: fp16 + unsloth: true + lora_dropout: 0 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/llm_finetuning/llama32-1b-sft.yml b/configs/llm_finetuning/llama32-1b-sft.yml new file mode 100644 index 0000000000000000000000000000000000000000..d716498d92cad5780a92e884b3ac5b804a1ef384 --- /dev/null +++ b/configs/llm_finetuning/llama32-1b-sft.yml @@ -0,0 +1,34 @@ +task: llm-sft +base_model: meta-llama/Llama-3.2-1B +project_name: autotrain-llama32-1b-finetune +log: tensorboard +backend: local + +data: + path: HuggingFaceH4/no_robots + train_split: train + valid_split: null + chat_template: tokenizer + column_mapping: + text_column: messages + +params: + block_size: 2048 + model_max_length: 4096 + epochs: 2 + batch_size: 1 + lr: 1e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: paged_adamw_8bit + scheduler: linear + gradient_accumulation: 8 + mixed_precision: bf16 + merge_adapter: true + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true diff --git a/configs/llm_finetuning/qwen.yml b/configs/llm_finetuning/qwen.yml new file mode 100644 index 0000000000000000000000000000000000000000..24655ccfad7dfd8cb50488a24e59ff1823e49eff --- /dev/null +++ b/configs/llm_finetuning/qwen.yml @@ -0,0 +1,34 @@ +task: llm-sft +base_model: Qwen/Qwen2.5-Coder-7B-Instruct +project_name: autotrain-qwen-finetune +log: tensorboard +backend: local + +data: + path: HuggingFaceH4/no_robots + train_split: test + valid_split: null + chat_template: tokenizer + column_mapping: + text_column: messages + +params: + block_size: 2048 + model_max_length: 4096 + epochs: 1 + batch_size: 1 + lr: 1e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + merge_adapter: true + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true diff --git a/configs/llm_finetuning/smollm2.yml b/configs/llm_finetuning/smollm2.yml new file mode 100644 index 0000000000000000000000000000000000000000..a6e02ed73d28a8cdfb30b8b16a6918717668e413 --- /dev/null +++ b/configs/llm_finetuning/smollm2.yml @@ -0,0 +1,34 @@ +task: llm-sft +base_model: HuggingFaceTB/SmolLM2-1.7B-Instruct +project_name: autotrain-smollm2-finetune +log: tensorboard +backend: local + +data: + path: HuggingFaceH4/no_robots + train_split: train + valid_split: null + chat_template: tokenizer + column_mapping: + text_column: messages + +params: + block_size: 2048 + model_max_length: 4096 + epochs: 2 + batch_size: 1 + lr: 1e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: paged_adamw_8bit + scheduler: linear + gradient_accumulation: 8 + mixed_precision: bf16 + merge_adapter: true + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true diff --git a/configs/llm_finetuning/smollm2_guanaco.yml b/configs/llm_finetuning/smollm2_guanaco.yml new file mode 100644 index 
0000000000000000000000000000000000000000..ca0e5302b22f7d41273f009bb50c43cf911b51f3 --- /dev/null +++ b/configs/llm_finetuning/smollm2_guanaco.yml @@ -0,0 +1,34 @@ +task: llm-sft +base_model: HuggingFaceTB/SmolLM2-135M-Instruct +project_name: autotrain-smollm2-135m-finetune-guanaco +log: tensorboard +backend: local + +data: + path: timdettmers/openassistant-guanaco + train_split: train + valid_split: null + chat_template: null + column_mapping: + text_column: text + +params: + block_size: 1024 + model_max_length: 2048 + epochs: 1 + batch_size: 1 + lr: 1e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: paged_adamw_8bit + scheduler: linear + gradient_accumulation: 8 + mixed_precision: bf16 + merge_adapter: true + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/llm_finetuning/smollm2_orpo.yml b/configs/llm_finetuning/smollm2_orpo.yml new file mode 100644 index 0000000000000000000000000000000000000000..591771a2685e14f85ac1ffa41595de10b4c9bc97 --- /dev/null +++ b/configs/llm_finetuning/smollm2_orpo.yml @@ -0,0 +1,36 @@ +task: llm-orpo +base_model: HuggingFaceTB/SmolLM2-1.7B-Instruct +project_name: autotrain-smallm2-orpo +log: tensorboard +backend: local + +data: + path: argilla/distilabel-capybara-dpo-7k-binarized + train_split: train + valid_split: null + chat_template: chatml + column_mapping: + text_column: chosen + rejected_text_column: rejected + prompt_text_column: prompt + +params: + block_size: 1024 + model_max_length: 2048 + max_prompt_length: 512 + epochs: 3 + batch_size: 2 + lr: 3e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 4 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: false \ No newline at end of file diff --git a/configs/object_detection/hub_dataset.yml b/configs/object_detection/hub_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..b5f66a7db28fc728f6cb9aab26409a59f92db561 --- /dev/null +++ b/configs/object_detection/hub_dataset.yml @@ -0,0 +1,31 @@ +task: object_detection +base_model: facebook/detr-resnet-50 +project_name: autotrain-obj-det-cppe5-2 +log: tensorboard +backend: local + +data: + path: cppe-5 + train_split: train + valid_split: test + column_mapping: + image_column: image + objects_column: objects + +params: + image_square_size: 600 + epochs: 100 + batch_size: 8 + lr: 5e-5 + weight_decay: 1e-4 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + early_stopping_patience: 50 + early_stopping_threshold: 0.001 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/object_detection/local.yml b/configs/object_detection/local.yml new file mode 100644 index 0000000000000000000000000000000000000000..ac2877320dfd4c001047456f01f0a773fe3892b9 --- /dev/null +++ b/configs/object_detection/local.yml @@ -0,0 +1,31 @@ +task: object_detection +base_model: facebook/detr-resnet-50 +project_name: autotrain-obj-det-local-dataset +log: tensorboard +backend: local + +data: + path: data/ # this contains the train and validation folders + train_split: train # this is the folder name inside the data path, contains images and metadata.jsonl + valid_split: validation # this is the folder name inside the data path, contains images and metadata.jsonl, optional + column_mapping: + 
image_column: image + objects_column: objects + +params: + image_square_size: 600 + epochs: 100 + batch_size: 8 + lr: 5e-5 + weight_decay: 1e-4 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + early_stopping_patience: 50 + early_stopping_threshold: 0.001 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/sentence_transformers/local_dataset.yml b/configs/sentence_transformers/local_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..37313d498efa998d65d469b3dd6ea1ad5bf236ab --- /dev/null +++ b/configs/sentence_transformers/local_dataset.yml @@ -0,0 +1,29 @@ +task: sentence-transformers:pair_score +base_model: microsoft/mpnet-base +project_name: autotrain-st-pair-score-local-dataset +log: tensorboard +backend: local + +data: + path: /path/to/your/dataset # this must be the path to the directory containing the train and valid files + train_split: train # this is the name of the train file (csv or jsonl) + valid_split: null # this is the name of the valid file (csv or jsonl), optional + column_mapping: + sentence1_column: input_sentence + sentence2_column: target_sentence + target_column: score + +params: + max_seq_length: 512 + epochs: 5 + batch_size: 8 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/sentence_transformers/pair.yml b/configs/sentence_transformers/pair.yml new file mode 100644 index 0000000000000000000000000000000000000000..8095573be75823b9d218e12f0a7797f0504bf4c2 --- /dev/null +++ b/configs/sentence_transformers/pair.yml @@ -0,0 +1,28 @@ +task: sentence-transformers:pair +base_model: microsoft/mpnet-base +project_name: autotrain-st-pair +log: tensorboard +backend: local + +data: + path: sentence-transformers/all-nli + train_split: pair:train + valid_split: pair:dev + column_mapping: + sentence1_column: anchor + sentence2_column: positive + +params: + max_seq_length: 512 + epochs: 5 + batch_size: 8 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/sentence_transformers/pair_class.yml b/configs/sentence_transformers/pair_class.yml new file mode 100644 index 0000000000000000000000000000000000000000..ac5f75f1086b5ca7d4d114a536893e81405c0ee8 --- /dev/null +++ b/configs/sentence_transformers/pair_class.yml @@ -0,0 +1,29 @@ +task: sentence-transformers:pair_class +base_model: google-bert/bert-base-uncased +project_name: autotrain-st-pair-class +log: tensorboard +backend: local + +data: + path: sentence-transformers/all-nli + train_split: pair-class:train + valid_split: pair-class:test + column_mapping: + sentence1_column: premise + sentence2_column: hypothesis + target_column: label + +params: + max_seq_length: 512 + epochs: 5 + batch_size: 8 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/sentence_transformers/pair_score.yml b/configs/sentence_transformers/pair_score.yml new file mode 100644 index 0000000000000000000000000000000000000000..fdaba91c1bd0dc4f1894cdebe3feded3ba7c9e78 --- /dev/null 
+++ b/configs/sentence_transformers/pair_score.yml @@ -0,0 +1,29 @@ +task: sentence-transformers:pair_score +base_model: microsoft/mpnet-base +project_name: autotrain-st-pair-score +log: tensorboard +backend: local + +data: + path: sentence-transformers/all-nli + train_split: pair-score:train + valid_split: pair-score:dev + column_mapping: + sentence1_column: sentence1 + sentence2_column: sentence2 + target_column: score + +params: + max_seq_length: 512 + epochs: 5 + batch_size: 8 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/sentence_transformers/qa.yml b/configs/sentence_transformers/qa.yml new file mode 100644 index 0000000000000000000000000000000000000000..f37e5aa6553fdd8c2b3a0bd6a881cfdbeff28af2 --- /dev/null +++ b/configs/sentence_transformers/qa.yml @@ -0,0 +1,28 @@ +task: sentence-transformers:qa +base_model: microsoft/mpnet-base +project_name: autotrain-st-qa +log: tensorboard +backend: local + +data: + path: sentence-transformers/natural-questions + train_split: train + valid_split: null + column_mapping: + sentence1_column: query + sentence2_column: answer + +params: + max_seq_length: 512 + epochs: 5 + batch_size: 8 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/sentence_transformers/triplet.yml b/configs/sentence_transformers/triplet.yml new file mode 100644 index 0000000000000000000000000000000000000000..add2f0c07d987f2833a600b92761e7aa347e8299 --- /dev/null +++ b/configs/sentence_transformers/triplet.yml @@ -0,0 +1,29 @@ +task: sentence-transformers:triplet +base_model: microsoft/mpnet-base +project_name: autotrain-st-triplet +log: tensorboard +backend: local + +data: + path: sentence-transformers/all-nli + train_split: triplet:train + valid_split: triplet:dev + column_mapping: + sentence1_column: anchor + sentence2_column: positive + sentence3_column: negative + +params: + max_seq_length: 512 + epochs: 5 + batch_size: 8 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/seq2seq/hub_dataset.yml b/configs/seq2seq/hub_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..5df712162b45eaba852843caa000b3d99a03f471 --- /dev/null +++ b/configs/seq2seq/hub_dataset.yml @@ -0,0 +1,28 @@ +task: seq2seq +base_model: google/flan-t5-base +project_name: autotrain-seq2seq-hub-dataset +log: tensorboard +backend: local + +data: + path: samsum + train_split: train + valid_split: test + column_mapping: + text_column: dialogue + target_column: summary + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: none + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/seq2seq/local.yml b/configs/seq2seq/local.yml new file mode 100644 index 0000000000000000000000000000000000000000..16ba8ad69f7d25f758c5d2912d8277f0439737f5 --- /dev/null +++ b/configs/seq2seq/local.yml @@ -0,0 +1,29 @@ +task: seq2seq +base_model: google/flan-t5-base +project_name: 
autotrain-seq2seq-local +log: tensorboard +backend: local + +data: + path: path/to/your/dataset csv/jsonl files + train_split: train + valid_split: test + column_mapping: + text_column: text + target_column: target + + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: none + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/text_classification/hub_dataset.yml b/configs/text_classification/hub_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..08ba66ac00dfe403d9ac7ce2592dca1e23458ed3 --- /dev/null +++ b/configs/text_classification/hub_dataset.yml @@ -0,0 +1,28 @@ +task: text_classification +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-imdb-finetuned +log: tensorboard +backend: local + +data: + path: stanfordnlp/imdb + train_split: train + valid_split: test + column_mapping: + text_column: text + target_column: label + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/text_classification/local_dataset.yml b/configs/text_classification/local_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..76c56d8cb8eb05eeaf39eca0613e715933b29757 --- /dev/null +++ b/configs/text_classification/local_dataset.yml @@ -0,0 +1,28 @@ +task: text_classification +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-imdb-finetuned +log: tensorboard +backend: local + +data: + path: data/ # this must be the path to the directory containing the train and valid files + train_split: train # this must be either train.csv or train.json + valid_split: valid # this must be either valid.csv or valid.json + column_mapping: + text_column: text # this must be the name of the column containing the text + target_column: label # this must be the name of the column containing the target + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/text_regression/hub_dataset.yml b/configs/text_regression/hub_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..bee20ce5f3ade670785639bc294c2a44dc5380ae --- /dev/null +++ b/configs/text_regression/hub_dataset.yml @@ -0,0 +1,28 @@ +task: text_regression +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-sms-spam-finetuned +log: tensorboard +backend: local + +data: + path: sms_spam + train_split: train + valid_split: null + column_mapping: + text_column: sms + target_column: label + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/text_regression/local_dataset.yml b/configs/text_regression/local_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..9793489cb357b81c1f2ebc4dee3da5878b0be355 --- /dev/null +++ 
b/configs/text_regression/local_dataset.yml @@ -0,0 +1,28 @@ +task: text_regression +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-custom-finetuned +log: tensorboard +backend: local + +data: + path: data/ # this must be the path to the directory containing the train and valid files + train_split: train # this must be either train.csv or train.json + valid_split: valid # this must be either valid.csv or valid.json + column_mapping: + text_column: text # this must be the name of the column containing the text + target_column: label # this must be the name of the column containing the target + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/token_classification/hub_dataset.yml b/configs/token_classification/hub_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..82f44133c3309f3278e27cafcef518c02615c085 --- /dev/null +++ b/configs/token_classification/hub_dataset.yml @@ -0,0 +1,28 @@ +task: token_classification +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-conll2003-finetuned +log: tensorboard +backend: local + +data: + path: conll2003 + train_split: train + valid_split: validation + column_mapping: + tokens_column: tokens + tags_column: ner_tags + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/token_classification/local_dataset.yml b/configs/token_classification/local_dataset.yml new file mode 100644 index 0000000000000000000000000000000000000000..97ae844057ac135158461730c4524c833c8e594b --- /dev/null +++ b/configs/token_classification/local_dataset.yml @@ -0,0 +1,28 @@ +task: token_classification +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-custom-finetuned +log: tensorboard +backend: local + +data: + path: data/ # this must be the path to the directory containing the train and valid files + train_split: train # this must be either train.json + valid_split: test # this must be either valid.json, can also be set to null + column_mapping: + tokens_column: tokens # this must be the name of the column containing the text + tags_column: tags # this must be the name of the column containing the target + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/configs/vlm/paligemma_vqa.yml b/configs/vlm/paligemma_vqa.yml new file mode 100644 index 0000000000000000000000000000000000000000..484888d2fd96b80a1e6ec2cc8593ba07d0810793 --- /dev/null +++ b/configs/vlm/paligemma_vqa.yml @@ -0,0 +1,30 @@ +task: vlm:vqa +base_model: google/paligemma-3b-pt-224 +project_name: autotrain-paligemma-finetuned-vqa +log: tensorboard +backend: local + +data: + path: abhishek/vqa_small + train_split: train + valid_split: validation + column_mapping: + image_column: image + text_column: multiple_choice_answer + prompt_text_column: question + +params: + epochs: 3 + batch_size: 2 + lr: 2e-5 + optimizer: adamw_torch 
+ scheduler: linear + gradient_accumulation: 4 + mixed_precision: fp16 + peft: true + quantization: int4 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..942311cffcbeaf7bb091bf1d882c759a43df217d --- /dev/null +++ b/docs/README.md @@ -0,0 +1,58 @@ +# Generating the documentation + +To generate the documentation, you have to build it. Several packages are necessary to build the doc. + +First, you need to install the project itself by running the following command at the root of the code repository: + +```bash +pip install -e . +``` + +You also need to install 2 extra packages: + +```bash +# `hf-doc-builder` to build the docs +pip install git+https://github.com/huggingface/doc-builder@main +# `watchdog` for live reloads +pip install watchdog +``` + +--- +**NOTE** + +You only need to generate the documentation to inspect it locally (if you're planning changes and want to +check how they look before committing for instance). You don't have to commit the built documentation. + +--- + +## Building the documentation + +Once you have setup the `doc-builder` and additional packages with the pip install command above, +you can generate the documentation by typing the following command: + +```bash +doc-builder build autotrain docs/source/ --build_dir ~/tmp/test-build +``` + +You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate +the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite +Markdown editor. + +## Previewing the documentation + +To preview the docs, run the following command: + +```bash +doc-builder preview autotrain docs/source/ +``` + +The docs will be viewable at [http://localhost:5173](http://localhost:5173). You can also preview the docs once you +have opened a PR. You will see a bot add a comment to a link where the documentation with your changes lives. + +--- +**NOTE** + +The `preview` command only works with existing doc files. When you add a completely new file, you need to update +`_toctree.yml` & restart `preview` command (`ctrl-c` to stop it & call `doc-builder preview ...` again). + +--- \ No newline at end of file diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml new file mode 100644 index 0000000000000000000000000000000000000000..fbb26c0f87b5fc079da0a8fa2d77b45d73d9d932 --- /dev/null +++ b/docs/source/_toctree.yml @@ -0,0 +1,46 @@ +- sections: + - local: index + title: 🤗 AutoTrain + - local: cost + title: How much does it cost? 
+ - local: support + title: Get help and support + - local: faq + title: Frequently Asked Questions + title: Getting Started +- sections: + - local: quickstart_spaces + title: Train on Spaces + - local: quickstart_py + title: Python SDK + - local: quickstart + title: Train Locally + - local: config + title: Config File + title: Quickstart +- sections: + - local: tasks/llm_finetuning + title: LLM Finetuning + - local: tasks/text_classification_regression + title: Text Classification/Regression + - local: tasks/extractive_qa + title: Extractive QA + - local: tasks/sentence_transformer + title: Sentence Transformer + - local: tasks/image_classification_regression + title: Image Classification / Regression + - local: tasks/object_detection + title: Object Detection + - local: tasks/seq2seq + title: Seq2Seq + - local: tasks/token_classification + title: Token Classification + - local: tasks/tabular + title: Tabular + title: Tasks +- sections: + - local: col_map + title: Understanding Column Mapping + - local: autotrain_api + title: AutoTrain API + title: Miscellaneous \ No newline at end of file diff --git a/docs/source/autotrain_api.mdx b/docs/source/autotrain_api.mdx new file mode 100644 index 0000000000000000000000000000000000000000..601e5df7ecc03c82af80cc42d09e274f619e0b3d --- /dev/null +++ b/docs/source/autotrain_api.mdx @@ -0,0 +1,57 @@ +# AutoTrain API + +With AutoTrain API, you can run your own instance of AutoTrain and use it to +train models on Hugging Face Spaces infrastructure (local training coming soon). +This API is designed to be used with autotrain compatible models and datasets, and it provides a simple interface to +train models with minimal configuration. + +## Getting Started + +To get started with AutoTrain API, all you need to do is install `autotrain-advanced` +as discussed in running locally section and run the autotrain app command: + +```bash +$ autotrain app --port 8000 --host 127.0.0.1 +``` + +You can then access the API reference at `http://127.0.0.1:8000/docs`. + +## Example Usage + +```bash +curl -X POST "http://127.0.0.1:8000/api/create_project" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer hf_XXXXX" \ + -d '{ + "username": "abhishek", + "project_name": "my-autotrain-api-model", + "task": "llm:orpo", + "base_model": "meta-llama/Meta-Llama-3-8B-Instruct", + "hub_dataset": "argilla/distilabel-capybara-dpo-7k-binarized", + "train_split": "train", + "hardware": "spaces-a10g-large", + "column_mapping": { + "text_column": "chosen", + "rejected_text_column": "rejected", + "prompt_text_column": "prompt" + }, + "params": { + "block_size": 1024, + "model_max_length": 4096, + "max_prompt_length": 512, + "epochs": 1, + "batch_size": 2, + "lr": 0.00003, + "peft": true, + "quantization": "int4", + "target_modules": "all-linear", + "padding": "right", + "optimizer": "adamw_torch", + "scheduler": "linear", + "gradient_accumulation": 4, + "mixed_precision": "fp16", + "chat_template": "chatml" + } + }' +``` + diff --git a/docs/source/col_map.mdx b/docs/source/col_map.mdx new file mode 100644 index 0000000000000000000000000000000000000000..fa802e5afb4f5d64374ceaae9c6ff59a37e577c3 --- /dev/null +++ b/docs/source/col_map.mdx @@ -0,0 +1,205 @@ +# Understanding Column Mapping + +Column mapping is a critical setup process in AutoTrain that informs the system +about the roles of different columns in your dataset. 
Whether it's a tabular +dataset, text classification data, or another type, the need for precise +column mapping ensures that AutoTrain processes each dataset element correctly. + +## How Column Mapping Works + +AutoTrain has no way of knowing what the columns in your dataset represent. +AutoTrain requires a clear understanding of each column's function within +your dataset to train models effectively. This is managed through a +straightforward mapping system in the user interface, represented as a dictionary. +Here's a typical example: + +``` +{"text": "text", "label": "target"} +``` + +In this example, the `text` column in your dataset corresponds to the text data +AutoTrain uses for processing, and the `target` column is treated as the +label for training. + +But let's not get confused! AutoTrain has a way to understand what each column in your dataset represents. +If your data is already in AutoTrain format, you dont need to change column mappings. +If not, you can easily map the columns in your dataset to the correct AutoTrain format. + +In the UI, you will see column mapping as a dictionary: + +``` +{"text": "text", "label": "target"} +``` + +Here, the column `text` in your dataset is mapped to the AutoTrain column `text`, +and the column `target` in your dataset is mapped to the AutoTrain column `label`. + +Let's say you are training a text classification model and your dataset has the following columns: + +``` +full_text, target_sentiment +"this movie is great", positive +"this movie is bad", negative +``` + +You can map these columns to the AutoTrain format as follows: + +``` +{"text": "full_text", "label": "target_sentiment"} +``` + +If your dataset has the columns: `text` and `label`, you don't need to change the column mapping. + +Let's take a look at column mappings for each task: + +## LLM + +Note: For all LLM tasks, if the text column(s) is not formatted i.e. if contains samples in chat format (dict or json), then you +should use `chat_template` parameter. Read more about it in LLM Parameters Section. + + +### SFT / Generic Trainer + +``` +{"text": "text"} +``` + +`text`: The column in your dataset that contains the text data. + + +### Reward Trainer + +``` +{"text": "text", "rejected_text": "rejected_text"} +``` + +`text`: The column in your dataset that contains the text data. + +`rejected_text`: The column in your dataset that contains the rejected text data. + +### DPO / ORPO Trainer + +``` +{"prompt": "prompt", "text": "text", "rejected_text": "rejected_text"} +``` + +`prompt`: The column in your dataset that contains the prompt data. + +`text`: The column in your dataset that contains the text data. + +`rejected_text`: The column in your dataset that contains the rejected text data. + + +## Text Classification & Regression, Seq2Seq + +For text classification and regression, the column mapping should be as follows: + +``` +{"text": "dataset_text_column", "label": "dataset_target_column"} +``` + +`text`: The column in your dataset that contains the text data. + +`label`: The column in your dataset that contains the target variable. + + +## Token Classification + + +``` +{"text": "tokens", "label": "tags"} +``` + +`text`: The column in your dataset that contains the tokens. These tokens must be a list of strings. + +`label`: The column in your dataset that contains the tags. These tags must be a list of strings. + +For token classification, if you are using a CSV, make sure that the columns are stringified lists. 
+ +## Tabular Classification & Regression + +``` +{"id": "id", "label": ["target"]} +``` + +`id`: The column in your dataset that contains the unique identifier for each row. + +`label`: The column in your dataset that contains the target variable. This should be a list of strings. + +For a single target column, you can pass a list with a single element. + +For multiple target columns, e.g. a multi label classification task, you can pass a list with multiple elements. + + +# Image Classification + +For image classification, the column mapping should be as follows: + +``` +{"image": "image_column", "label": "label_column"} +``` + +Image classification requires column mapping only when you are using a dataset from Hugging Face Hub. +For uploaded datasets, leave column mapping as it is. + +# Sentence Transformers + +For all sentence transformers tasks, one needs to map columns to `sentence1_column`, `sentence2_column`, `sentence3_column` & `target_column` column. +Not all columns need to be mapped for all trainers of sentence transformers. + +## `pair`: + +``` +{"sentence1_column": "anchor", "sentence2_column": "positive"} +``` + +## `pair_class`: + +``` +{"sentence1_column": "premise", "sentence2_column": "hypothesis", "target_column": "label"} +``` + +## `pair_score`: + +``` +{"sentence1_column": "sentence1", "sentence2_column": "sentence2", "target_column": "score"} +``` + +## `triplet`: + +``` +{"sentence1_column": "anchor", "sentence2_column": "positive", "sentence3_column": "negative"} +``` + +## `qa`: + +``` +{"sentence1_column": "query", "sentence2_column": "answer"} +``` + + +# Extractive Question Answering + +For extractive question answering, the column mapping should be as follows: + +``` +{"text": "context", "question": "question", "answer": "answers"} +``` + +where `answer` is a dictionary with keys `text` and `answer_start`. + + +## Ensuring Accurate Mapping + +To ensure your model trains correctly: + +- Verify Column Names: Double-check that the names used in the mapping dictionary accurately reflect those in your dataset. + +- Format Appropriately: Especially in token classification, ensure your data format matches expectations (e.g., lists of strings). + +- Update Mappings for New Datasets: Each new dataset might require its unique mappings based on its structure and the task at hand. + +By following these guidelines and using the provided examples as templates, +you can effectively instruct AutoTrain on how to interpret and handle your +data for various machine learning tasks. This process is fundamental for +achieving optimal results from your model training endeavors. diff --git a/docs/source/config.mdx b/docs/source/config.mdx new file mode 100644 index 0000000000000000000000000000000000000000..e255229b6e8c55a4b650a5dbb000e73dc9043f16 --- /dev/null +++ b/docs/source/config.mdx @@ -0,0 +1,65 @@ +# AutoTrain Configs + +AutoTrain Configs are the way to use and train models using AutoTrain locally. + +Once you have installed AutoTrain Advanced, you can use the following command to train models using AutoTrain config files: + +```bash +$ export HF_USERNAME=your_hugging_face_username +$ export HF_TOKEN=your_hugging_face_write_token + +$ autotrain --config path/to/config.yaml +``` + +Example configurations for all tasks can be found in the `configs` directory of +the [AutoTrain Advanced GitHub repository](https://github.com/huggingface/autotrain-advanced). 
+ +Here is an example of an AutoTrain config file: + +```yaml +task: llm +base_model: meta-llama/Meta-Llama-3-8B-Instruct +project_name: autotrain-llama3-8b-orpo +log: tensorboard +backend: local + +data: + path: argilla/distilabel-capybara-dpo-7k-binarized + train_split: train + valid_split: null + chat_template: chatml + column_mapping: + text_column: chosen + rejected_text_column: rejected + +params: + trainer: orpo + block_size: 1024 + model_max_length: 2048 + max_prompt_length: 512 + epochs: 3 + batch_size: 2 + lr: 3e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 4 + mixed_precision: bf16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true +``` + +In this config, we are finetuning the `meta-llama/Meta-Llama-3-8B-Instruct` model +on the `argilla/distilabel-capybara-dpo-7k-binarized` dataset using the `orpo` +trainer for 3 epochs with a batch size of 2 and a learning rate of `3e-5`. +More information on the available parameters can be found in the *Data Formats and Parameters* section. + +In case you dont want to push the model to hub, you can set `push_to_hub` to `false` in the config file. +If not pushing the model to hub username and token are not required. Note: they may still be needed +if you are trying to access gated models or datasets. \ No newline at end of file diff --git a/docs/source/cost.mdx b/docs/source/cost.mdx new file mode 100644 index 0000000000000000000000000000000000000000..4c9b4ef17f53a67431d7fd4d226c9eecaf16143b --- /dev/null +++ b/docs/source/cost.mdx @@ -0,0 +1,40 @@ +# How much does it cost? + +AutoTrain offers an accessible approach to model training, providing deployable models +with just a few clicks. Understanding the cost involved is essential to planning and +executing your projects efficiently. + + +## Local Usage + +When you choose to use AutoTrain locally on your own hardware, there is no cost. +This option is ideal for those who prefer to manage their own infrastructure and +do not require the scalability that cloud resources offer. + +## Using AutoTrain on Hugging Face Spaces + +**Pay-As-You-Go**: Costs for using AutoTrain in Hugging Face Spaces are based on the +computing resources you consume. This flexible pricing structure ensures you only pay +for what you use, making it cost-effective and scalable for projects of any size. + + +**Ownership and Portability**: Unlike some other platforms, AutoTrain does not retain +ownership of your models. Once training is complete, you are free to download and +deploy your models wherever you choose, providing flexibility and control over your all your assets. + +### Pricing Details + +**Resource-Based Billing**: Charges are accrued per minute according to the type of hardware +utilized during training. This means you can scale your resource usage based on the +complexity and needs of your projects. + +For a detailed breakdown of the costs associated with using Hugging Face Spaces, +please refer to the [pricing](https://huggingface.co/pricing#spaces) section on our website. + +To access the paid features of AutoTrain, you must have a valid payment method on file. 
+You can manage your payment options and view your billing information in +the [billing section of your Hugging Face account settings.](https://huggingface.co/settings/billing) + +By offering both free and flexible paid options, AutoTrain ensures that users can choose +the most suitable model training solution for their needs, whether they are experimenting +on a local machine or scaling up operations on Hugging Face Spaces. diff --git a/docs/source/faq.mdx b/docs/source/faq.mdx new file mode 100644 index 0000000000000000000000000000000000000000..e5130de43906d5255aa260ebe1bdd084bd97d247 --- /dev/null +++ b/docs/source/faq.mdx @@ -0,0 +1,117 @@ +# Frequently Asked Questions + +## Are my data and models secure? + +Yes, your data and models are secure. AutoTrain uses the Hugging Face Hub to store your data and models. +All your data and models are uploaded to your Hugging Face account as private repositories and are only accessible by you. +Read more about security [here](https://huggingface.co/docs/hub/en/security). + +## Do you upload my data to the Hugging Face Hub? + +AutoTrain will not upload your dataset to the Hub if you are using the local backend or training in the same space. +AutoTrain will push your dataset to the Hub if you are using features like: DGX Cloud +or using local CLI to train on Hugging Face's infrastructure. + +You can safely remove the dataset from the Hub after training is complete. +If uploaded, the dataset will be stored in your Hugging Face account as a private repository and will only be accessible by you +and the training process. It is not used once the training is complete. + +## My training space paused for no reason mid-training + +AutoTrain Training Spaces will pause itself after training is done (or failed). This is done to save resources and costs. +If your training failed, you can still see the space logs and find out what went wrong. Note: you won't be able to retrive the logs if you restart the space. + +Another reason for the space to pause is if the space is space's sleep time kicking in. If you have a long running training job, you must set the sleep time to a much higher value. +The space will anyways pause itself after the training is done thus saving you costs. + +## I get error `Your installed package nvidia-ml-py is corrupted. Skip patch functions` + +This error can be safely ignored. It is a warning from the `nvitop` library and does not affect the functionality of AutoTrain. + +## I get 409 conflict error when using the UI + +This error occurs when you try to create a project with the same name as an existing project. +To resolve this error, you can either delete the existing project or create a new project +with a different name. + +This error can also occur when you are trying to train a model while a model is already training in the same space or locally. + + +## The model I want to use doesn't show up in the model selection dropdown. + +If the model you want to use is not available in the model selection dropdown, +you can add it in the environment variable `AUTOTRAIN_CUSTOM_MODELS` in the space settings. +For example, if you want to add the `xxx/yyy` model, go to space settings, create a variable named `AUTOTRAIN_CUSTOM_MODELS` +and set the value to `xxx/yyy`. + +You can also pass the model name as query parameter in the URL. For example, if you want to use the `xxx/yyy` model, +you can use the URL `https://huggingface.co/spaces/your_autotrain_space?custom_models=xxx/yyy`. + +## How do I use AutoTrain locally? 
+ +AutoTrain can be used locally by installing the AutoTrain Advanced pypi package. +You can read more in *Use AutoTrain Locally* section. + + +## Can I run AutoTrain on Colab? + +To start the UI on Colab, you can simply click on the following link: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/colabs/AutoTrain.ipynb) + +Please note, to run the app on Colab, you will need an ngrok token. You can get one by signing up for free on [ngrok](https://ngrok.com/). +This is because Colab does not allow exposing ports to the internet directly. + +To use the CLI instead on Colab, you can follow the same instructions as for using AutoTrain locally. + + +## Does AutoTrain have a docker image? + +Yes, AutoTrain has a docker image. +You can find the docker image on Docker Hub [here](https://hub.docker.com/r/huggingface/autotrain-advanced). + + +## Is windows supported? + +Unfortunately, AutoTrain does not officially support Windows at the moment. +You can try using WSL (Windows Subsystem for Linux) to run AutoTrain on Windows or the docker image. + +## "--project-name" argument can not be set as a directory + +`--project-name` argument should not be a path. it will be created where autotrain command is run. +This parameter must be alphanumeric and can contain hypens. + +## I am getting `config.json` not found error + +This means you have trained an adapter model (peft=true) which doesnt generate config.json. +It doesnt matter though, the model can still be loaded with AutoModelForCausalLM or with Inference endpoints. +If you want to merge weights with base models, you can use `autotrain tools`. Please read about it in miscelleneous section. + +## Does autotrain support multi-gpu training? + +Yes, autotrain supports multi-gpu training. +AutoTrain will determine on its own if the user is running the command on a multi-gpu setup and will use +multi-gpu ddp if number of gpus is greater than 1 and less than 4 and deepspeed if number of gpus is greater than or equal to 4. + + +## How can i use a hub dataset with multiple configs? + +If your hub dataset has multiple configs, you can use `train_split` parameter to specify the both the config and the split. +For example, in this dataset [here](https://huggingface.co/datasets/timdettmers/openassistant-guanaco), +there are multiple configs: `pair`, `pair-class`, `pair-score` and `triplet`. + +If i want to use `train` split of `pair-class` config, i can use write `pair-class:train` as `train_split` in the UI or the CLI / config. + +An example config is shown below: + +```yaml +data: + path: sentence-transformers/all-nli + train_split: pair-class:train + valid_split: pair-class:test + column_mapping: + sentence1_column: premise + sentence2_column: hypothesis + target_column: label + +``` \ No newline at end of file diff --git a/docs/source/getting_started.bck b/docs/source/getting_started.bck new file mode 100644 index 0000000000000000000000000000000000000000..5ca6df4131a1459136c85c78e4073376b1a6ced4 --- /dev/null +++ b/docs/source/getting_started.bck @@ -0,0 +1,25 @@ +# Installation + +There is no installation required! AutoTrain Advanced runs on Hugging Face Spaces. All you need to do is create a new space with the AutoTrain Advanced template: https://huggingface.co/new-space?template=autotrain-projects/autotrain-advanced. Please make sure you keep the space private. 
+ +![autotrain-space-template](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/space_template_1.png) + +Once you have selected Docker > AutoTrain template and an appropriate hardware, you can click on "Create Space" and you will be redirected to your new space. + +![autotrain-space-template](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/space_template_2.png) + +Make sure to use a write token and keep the space private for any unauthorized access. + +# Updating AutoTrain Advanced to Latest Version + +We are constantly adding new features and tasks to AutoTrain Advanced. Its always a good idea to update your space to the latest version before starting a new project. An up-to-date version of AutoTrain Advanced will have the latest tasks, features and bug fixes! Updating is as easy as clicking on the "Factory reboot" button in the setting page of your space. + +![autotrain-space-template](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/space_template_5.png) + +Please note that "restarting" a space will not update it to the latest version. You need to "Factory reboot" the space to update it to the latest version. + +And now we are all set and we can start with our first project! + +# Understanding the UI + +![autotrain-space-template](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/ui.png) diff --git a/docs/source/index.mdx b/docs/source/index.mdx new file mode 100644 index 0000000000000000000000000000000000000000..bd618508a6910c71a2122255e82703158d07d53a --- /dev/null +++ b/docs/source/index.mdx @@ -0,0 +1,61 @@ +# AutoTrain + +![autotrain-homepage](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/autotrain_homepage.png) + +🤗 AutoTrain Advanced (or simply AutoTrain), developed by Hugging Face, is a robust no-code +platform designed to simplify the process of training state-of-the-art models across +multiple domains: Natural Language Processing (NLP), Computer Vision (CV), +and even Tabular Data analysis. This tool leverages the powerful frameworks created by +various teams at Hugging Face, making advanced machine learning and artificial intelligence accessible to a broader +audience without requiring deep technical expertise. + +## Who should use AutoTrain? + +AutoTrain is the perfect tool for anyone eager to dive into the world of machine learning +without getting bogged down by the complexities of model training. +Whether you're a business professional, researcher, educator, or hobbyist, +AutoTrain offers the simplicity of a no-code interface while still providing the +capabilities necessary to develop sophisticated models tailored to your unique datasets. + +AutoTrain is for anyone who wants to train a state-of-the-art model for a NLP, CV, Speech or even Tabular task, +but doesn't want to spend time on the technical details of training a model. + +Our mission is to democratize machine learning technology, ensuring it is not only +accessible to data scientists and ML engineers but also to those without a technical +background. If you're looking to harness the power of AI for your projects, +AutoTrain is your answer. + + +## How to use AutoTrain? + +We offer several ways to use AutoTrain: + +- No code users can use `AutoTrain Advanced` by creating a new space with AutoTrain Docker image: +[Click here](https://huggingface.co/login?next=/spaces/autotrain-projects/autotrain-advanced?duplicate=true) to create AutoTrain Space. 
+Remember to keep your space private and ensure it is equipped with the necessary hardware resources (GPU) for optimal performance. + +- If you prefer a more hands-on approach, AutoTrain Advanced can also be run locally +through its intuitive UI or accessed via the Python API provided in the autotrain-advanced +package. This flexibility allows developers to integrate AutoTrain capabilities directly +into their projects, customize workflows, and enhance their toolsets with advanced machine +learning functionalities. + + +By bridging the gap between cutting-edge technology and practical usability, +AutoTrain Advanced empowers users to achieve remarkable results in AI without the need +for extensive programming knowledge. Start your journey with AutoTrain today and unlock +the potential of machine learning for your projects! + + +## Walkthroughs + +To get started with AutoTrain, check out our walkthroughs and tutorials: + +- [Extractive Question Answering with AutoTrain](https://huggingface.co/blog/abhishek/extractive-qa-autotrain) +- [Finetuning PaliGemma with AutoTrain](https://huggingface.co/blog/abhishek/paligemma-finetuning-autotrain) +- [Training an Object Detection Model with AutoTrain](https://huggingface.co/blog/abhishek/object-detection-autotrain) +- [How to Fine-Tune Custom Embedding Models Using AutoTrain](https://huggingface.co/blog/abhishek/finetune-custom-embeddings-autotrain) +- [Train Custom Models on Hugging Face Spaces with AutoTrain SpaceRunner](https://huggingface.co/blog/abhishek/autotrain-spacerunner) +- [How to Finetune phi-3 on MacBook Pro](https://huggingface.co/blog/abhishek/phi3-finetune-macbook) +- [Finetune Mixtral 8x7B with AutoTrain](https://huggingface.co/blog/abhishek/autotrain-mixtral-dgx-cloud-local) +- [Easily Train Models with H100 GPUs on NVIDIA DGX Cloud](https://huggingface.co/blog/train-dgx-cloud) diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx new file mode 100644 index 0000000000000000000000000000000000000000..048a25c959ad6c3545a9daccacd9306e9b1dd567 --- /dev/null +++ b/docs/source/quickstart.mdx @@ -0,0 +1,92 @@ +# Quickstart Guide for Local Training + +This quickstart is for local installation and usage. +If you want to use AutoTrain on Hugging Face Spaces, please refer to the *AutoTrain on Hugging Face Spaces* section. + +You can install AutoTrain Advanced using pip: + +```bash +$ pip install autotrain-advanced +``` + +It is advised to install autotrain-advanced in a virtual environment to avoid any conflicts with other packages. +Note: AutoTrain doesn't install pytorch, torchaudio, torchvision, or any other large dependencies. You will need to install them separately. + +```bash +$ conda create -n autotrain python=3.10 +$ conda activate autotrain +$ pip install autotrain-advanced +$ conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia +$ conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc +$ conda install xformers -c xformers +$ python -m nltk.downloader punkt +$ pip install flash-attn --no-build-isolation # if you want to use flash-attn +$ pip install deepspeed # if you want to use deepspeed +```` + +# Running AutoTrain User Interface (UI) + +To run the autotrain app locally, you can use the following command: + +```bash +$ export HF_TOKEN=your_hugging_face_write_token +$ autotrain app --host 127.0.0.1 --port 8000 +``` + +This will start the app on `http://127.0.0.1:8000`. 
+ + +# Using AutoTrain Command Line Interface (CLI) + +It is also possible to use the CLI: + +```bash +$ export HF_TOKEN=your_hugging_face_write_token +$ autotrain --help +``` + +This will show the CLI commands that can be used: + +```bash +usage: autotrain [] + +positional arguments: +{ + app, + llm, + setup, + api, + text-classification, + text-regression, + image-classification, + tabular, + spacerunner, + seq2seq, + token-classification +} + + commands + +options: + -h, --help show this help message and exit + --version, -v Display AutoTrain version + --config CONFIG Optional configuration file + +For more information about a command, run: `autotrain --help` +``` + +It is advised to use only the `autotrain --config CONFIG_FILE` command for training when using the CLI. + +The autotrain commands that end users will be interested in are: + +- `app`: Start the AutoTrain UI +- `llm`: Train a language model +- `text-classification`: Train a text classification model +- `text-regression`: Train a text regression model +- `image-classification`: Train an image classification model +- `tabular`: Train a tabular model +- `spacerunner`: Train any custom model using SpaceRunner +- `seq2seq`: Train a sequence-to-sequence model +- `token-classification`: Train a token classification model + +Note: above commands are not required if you use preferred `autotrain --config CONFIG_FILE` command to train the models. \ No newline at end of file diff --git a/docs/source/quickstart_py.mdx b/docs/source/quickstart_py.mdx new file mode 100644 index 0000000000000000000000000000000000000000..d76086e56f33b0fd34f83195ad0dd9c6148a4552 --- /dev/null +++ b/docs/source/quickstart_py.mdx @@ -0,0 +1,110 @@ +# Quickstart with Python + +AutoTrain is a library that allows you to train state of the art models on Hugging Face Spaces, or locally. +It provides a simple and easy-to-use interface to train models for various tasks like llm finetuning, text classification, +image classification, object detection, and more. + +In this quickstart guide, we will show you how to train a model using AutoTrain in Python. + +## Getting Started + +AutoTrain can be installed using pip: + +```bash +$ pip install autotrain-advanced +``` + +The example code below shows how to finetune an LLM model using AutoTrain in Python: + +```python +import os + +from autotrain.params import LLMTrainingParams +from autotrain.project import AutoTrainProject + + +params = LLMTrainingParams( + model="meta-llama/Llama-3.2-1B-Instruct", + data_path="HuggingFaceH4/no_robots", + chat_template="tokenizer", + text_column="messages", + train_split="train", + trainer="sft", + epochs=3, + batch_size=1, + lr=1e-5, + peft=True, + quantization="int4", + target_modules="all-linear", + padding="right", + optimizer="paged_adamw_8bit", + scheduler="cosine", + gradient_accumulation=8, + mixed_precision="bf16", + merge_adapter=True, + project_name="autotrain-llama32-1b-finetune", + log="tensorboard", + push_to_hub=True, + username=os.environ.get("HF_USERNAME"), + token=os.environ.get("HF_TOKEN"), +) + + +backend = "local" +project = AutoTrainProject(params=params, backend=backend, process=True) +project.create() +``` + +In this example, we are finetuning the `meta-llama/Llama-3.2-1B-Instruct` model on the `HuggingFaceH4/no_robots` dataset. +We are training the model for 3 epochs with a batch size of 1 and a learning rate of `1e-5`. +We are using the `paged_adamw_8bit` optimizer and the `cosine` scheduler. 
+We are also using mixed precision training with a gradient accumulation of 8. +The final model will be pushed to the Hugging Face Hub after training. + +To train the model, run the following command: + +```bash +$ export HF_USERNAME= +$ export HF_TOKEN= +$ python train.py +``` + +This will create a new project directory with the name `autotrain-llama32-1b-finetune` and start the training process. +Once the training is complete, the model will be pushed to the Hugging Face Hub. + +Your HF_TOKEN and HF_USERNAME are only required if you want to push the model or if you are accessing a gated model or dataset. + +## AutoTrainProject Class + +[[autodoc]] project.AutoTrainProject + +## Parameters + +### Text Tasks + +[[autodoc]] trainers.clm.params.LLMTrainingParams + +[[autodoc]] trainers.sent_transformers.params.SentenceTransformersParams + +[[autodoc]] trainers.seq2seq.params.Seq2SeqParams + +[[autodoc]] trainers.token_classification.params.TokenClassificationParams + +[[autodoc]] trainers.extractive_question_answering.params.ExtractiveQuestionAnsweringParams + +[[autodoc]] trainers.text_classification.params.TextClassificationParams + +[[autodoc]] trainers.text_regression.params.TextRegressionParams + +### Image Tasks + +[[autodoc]] trainers.image_classification.params.ImageClassificationParams + +[[autodoc]] trainers.image_regression.params.ImageRegressionParams + +[[autodoc]] trainers.object_detection.params.ObjectDetectionParams + + +### Tabular Tasks + +[[autodoc]] trainers.tabular.params.TabularParams \ No newline at end of file diff --git a/docs/source/quickstart_spaces.mdx b/docs/source/quickstart_spaces.mdx new file mode 100644 index 0000000000000000000000000000000000000000..52f8725e0af24614939c83ae2c05f86af11ad7c9 --- /dev/null +++ b/docs/source/quickstart_spaces.mdx @@ -0,0 +1,78 @@ +# Quickstart Guide to AutoTrain on Hugging Face Spaces + +AutoTrain on Hugging Face Spaces is the preferred choice for a streamlined experience in +model training. This platform is optimized for ease of use, with pre-installed dependencies +and managed hardware resources. AutoTrain on Hugging Face Spaces can be used both by +no-code users and developers, making it versatile for various levels of expertise. + + +## Creating a New AutoTrain Space + +Getting started with AutoTrain is straightforward. Here’s how you can create your new space: + +1. **Visit the AutoTrain Page**: To create a new space with AutoTrain Docker image, all you need to do is go +to [AutoTrain Homepage](https://hf.co/autotrain) and click on "Create new project". + +2. **Log In or View the Setup Screen**: If not logged in, you'll be prompted to do so. Then, you’ll see a screen similar to this: + +![autotrain-duplicate-space](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/duplicate_space.png) + +3. **Set Up Your Space**: + +- **Choose a Space Name**: Name your space something relevant to your project. + +- **Allocate Hardware Resources**: Select the necessary computational resources based on your project needs. + +- **Duplicate Space**: Click on "Duplicate Space" to initiate your AutoTrain space with the Docker image. + +4. **Configuration Options**: + +- PAUSE_ON_FAILURE: Set this to 0 if you prefer the space not to pause on training failures, useful for running continuous experiments. This option can also be used if you continuously want to perfom many experiments in the same space. + +5. 
**Launch and Train**: + +- Once done, in a few seconds, the AutoTrain Space will be up and running and you will be presented with the following screen: + +![autotrain-space](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/autotrain_space.png) + +- From here, you can select tasks, upload datasets, choose models, adjust hyperparameters (if needed), +and start the training process directly within the space. + +- The space will manage its own activity, shutting down post-training unless configured +otherwise based on the `PAUSE_ON_FAILURE` setting. + +6. **Monitoring Progress**: + +- All training logs and progress can be monitored via TensorBoard, accessible under +`username/project_name` on the Hugging Face Hub. + +- Once training concludes successfully, you’ll find the model files in the same repository. + +7. **Navigating the UI**: + +- If you need help understanding any UI elements, click on the small (i) information icons for detailed descriptions. + +If you are confused about the UI elements, click on the small (i) information icon to get more information about the UI element. + +For data formats and detailed parameter information, please see the Data Formats and Parameters section where we provide +example datasets and detailed information about the parameters for each task supported by AutoTrain. + +## Ensuring Your AutoTrain is Up-to-Date + +We are constantly adding new features and tasks to AutoTrain Advanced. To benefit from the latest features, tasks, and bug fixes, update your AutoTrain space regularly: + +- *Factory Reboot*: Navigate to the settings page of your space and click on "Factory reboot" to upgrade to the latest version of AutoTrain Advanced. + +![autotrain-space-template](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/space_template_5.png) + +- *Note*: Simply "restarting" the space does not update it; a factory reboot is necessary for a complete update. + + +For additional details on data formats and specific parameters, refer to the +'Data Formats and Parameters' section where we provide example datasets and extensive +parameter information for each supported task by AutoTrain. + + +With these steps, you can effortlessly initiate and manage your AutoTrain projects on +Hugging Face Spaces, leveraging the platform's robust capabilities for your machine learning and AI +needs. diff --git a/docs/source/starting_ui.bck b/docs/source/starting_ui.bck new file mode 100644 index 0000000000000000000000000000000000000000..27e413a3fca19dbc389e37b7dac29ab0b7c656d1 --- /dev/null +++ b/docs/source/starting_ui.bck @@ -0,0 +1,65 @@ +# Starting the UI + +The AutoTrain UI can be started in multiple ways depending on your needs. +We offer UI on Hugging Face Spaces, Colab and locally! + +## Hugging Face Spaces + +To start the UI on Hugging Face Spaces, you can simply click on the following link: + +[![Deploy on Spaces](https://huggingface.co/datasets/huggingface/badges/resolve/main/deploy-on-spaces-md.svg)](https://huggingface.co/login?next=/spaces/autotrain-projects/autotrain-advanced?duplicate=true) + +Please make sure you keep the space private and attach appropriate hardware to the space. +You can also read more about AutoTrain on the homepage and follow the link there to start your own training instance on +Hugging Face Spaces. [Click here](https://huggingface.co/autotrain) to visit the homepage. 
+ +## Colab + +To start the UI on Colab, you can simply click on the following link: + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/colabs/AutoTrain.ipynb) + +Please note, to run the app on Colab, you will need an ngrok token. You can get one by signing up for free on [ngrok](https://ngrok.com/). +This is because Colab does not allow exposing ports to the internet directly. + + +## Locally + +To run the autotrain app locally, install autotrain-advanced python package: + +```bash +$ pip install autotrain-advanced +``` + +and then run the following command: + +```bash +$ export HF_TOKEN=your_hugging_face_write_token +$ autotrain app --host 127.0.0.1 --port 8000 +``` + +This will start the app on `http://127.0.0.1:8000`. + +AutoTrain doesn't install pytorch, torchaudio, torchvision, or any other dependencies. You will need to install them separately. +It is thus recommended to use conda environment: + + +```bash +$ conda create -n autotrain python=3.10 +$ conda activate autotrain + +$ pip install autotrain-advanced + +$ conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia +$ conda install -c "nvidia/label/cuda-12.1.0" cuda-nvcc +$ conda install xformers -c xformers + +$ python -m nltk.downloader punkt +$ pip install flash-attn --no-build-isolation +$ pip install deepspeed + +$ export HF_TOKEN=your_hugging_face_write_token +$ autotrain app --host 127.0.0.1 --port 8000 +``` + +In case of any issues, please report on the [GitHub issues](https://github.com/huggingface/autotrain-advanced/). diff --git a/docs/source/support.mdx b/docs/source/support.mdx new file mode 100644 index 0000000000000000000000000000000000000000..d52733d21b016fd367b5dfa98e38c384d5de87d8 --- /dev/null +++ b/docs/source/support.mdx @@ -0,0 +1,31 @@ +# Help and Support + +If you need assistance with AutoTrain Advanced or have questions about your projects, +you can reach out through several dedicated support channels. We're here to help you +navigate any issues you encounter, from technical queries to billing concerns. +Below are the best ways to get support: + + +- For technical support or to report a bug, you can [create an issue](https://github.com/huggingface/autotrain-advanced/issues/new) +directly in the AutoTrain Advanced GitHub repository. GitHub repo is ideal for tracking bugs, +requesting features, or getting help with troubleshooting problems. When submitting an +issue, please include all the details in question to help us provide the most +relevant support quickly. + +- [Ask in the Hugging Face Forum](https://discuss.huggingface.co/c/autotrain/16). This space is perfect for asking questions, +sharing your experiences, or discussing AutoTrain with other users and the Hugging Face +team. The forum is a great resource for getting advice, learning best practices, and +connecting with other machine learning practitioners. + +- For enterprise users or specific inquiries related to billing, please [email us](mailto:autotrain@hf.co) directly. +This channel ensures that your more sensitive or account-specific issues are handled +appropriately and confidentially. When emailing, please provide your username and +project name so we can assist you efficiently. + +Please note: e-mail support is only available for pro/enterprise users or those with specific queries about billing. 
+ + +By utilizing these support channels, you can ensure that any hurdles you face while using +AutoTrain Advanced are addressed promptly, allowing you to focus on achieving your project +goals. Whether you're a beginner or an experienced user, we are here to support your +journey in AI model training. diff --git a/docs/source/tasks/extractive_qa.mdx b/docs/source/tasks/extractive_qa.mdx new file mode 100644 index 0000000000000000000000000000000000000000..7af6f11dfb894d8a4f5590ee54d5e5acfb3a9de1 --- /dev/null +++ b/docs/source/tasks/extractive_qa.mdx @@ -0,0 +1,92 @@ +# Extractive Question Answering with AutoTrain + +Extractive Question Answering (QA) enables AI models to find and extract precise answers from text passages. This guide shows you how to train custom QA models using AutoTrain, supporting popular architectures like BERT, RoBERTa, and DeBERTa. + +## What is Extractive Question Answering? + +Extractive QA models learn to: +- Locate exact answer spans within longer text passages +- Understand questions and match them to relevant context +- Extract precise answers rather than generating them +- Handle both simple and complex queries about the text + +## Preparing your Data + +Your dataset needs these essential columns: +- `text`: The passage containing potential answers (also called context) +- `question`: The query you want to answer +- `answer`: Answer span information including text and position + +Here is an example of how your dataset should look: + +``` +{"context":"Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.","question":"To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?","answers":{"text":["Saint Bernadette Soubirous"],"answer_start":[515]}} +{"context":"Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.","question":"What is in front of the Notre Dame Main Building?","answers":{"text":["a copper statue of Christ"],"answer_start":[188]}} +{"context":"Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". 
Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.","question":"The Basilica of the Sacred heart at Notre Dame is beside to which structure?","answers":{"text":["the Main Building"],"answer_start":[279]}} +``` + +Note: the preferred format for question answering is JSONL, if you want to use CSV, the `answer` column should be stringified JSON with the keys `text` and `answer_start`. + +Example dataset from Hugging Face Hub: [lhoestq/squad](https://huggingface.co/datasets/lhoestq/squad) + +P.S. You can use both squad and squad v2 data format with correct column mappings. + +## Training Options + +### Local Training +Train models on your own hardware with full control over the process. + +To train an Extractive QA model locally, you need a config file: + +```yaml +task: extractive-qa +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-ex-qa1 +log: tensorboard +backend: local + +data: + path: lhoestq/squad + train_split: train + valid_split: validation + column_mapping: + text_column: context + question_column: question + answer_column: answers + +params: + max_seq_length: 512 + max_doc_stride: 128 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true +``` + +To train the model, run the following command: + +```bash +$ autotrain --config config.yaml +``` + +Here, we are training a BERT model on the SQuAD dataset using the Extractive QA task. The model is trained for 3 epochs with a batch size of 4 and a learning rate of 2e-5. The training process is logged using TensorBoard. The model is trained locally and pushed to the Hugging Face Hub after training. + +### Cloud Training on Hugging Face +Train models using Hugging Face's cloud infrastructure for better scalability. + +![AutoTrain Extractive Question Answering on Hugging Face Spaces](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/ext_qa.png) + +As always, pay special attention to column mapping. + + +## Parameter Reference + +[[autodoc]] trainers.extractive_question_answering.params.ExtractiveQuestionAnsweringParams \ No newline at end of file diff --git a/docs/source/tasks/image_classification_regression.mdx b/docs/source/tasks/image_classification_regression.mdx new file mode 100644 index 0000000000000000000000000000000000000000..97d892727a3a1baa7abe679dff972d3c1dc8f996 --- /dev/null +++ b/docs/source/tasks/image_classification_regression.mdx @@ -0,0 +1,247 @@ +# Image Classification & Regression + +Image classification is a form of supervised learning where a model is trained to identify +and categorize objects within images. AutoTrain simplifies the process, enabling you to +train a state-of-the-art image classification model by simply uploading labeled example +images. + +Image regression/scoring is a form of supervised learning where a model is trained to predict a +score or value for an image. AutoTrain simplifies the process, enabling you to train a +state-of-the-art image scoring model by simply uploading labeled example images. 
+ + +## Preparing your data + +To ensure your image classification model trains effectively, follow these guidelines for preparing your data: + + +### Organizing Images For Image Classification + + +Prepare a zip file containing your categorized images. Each category should have its own +subfolder named after the class it represents. For example, to differentiate between +'cats' and 'dogs', your zip file structure should resemble the following: + + +``` +cats_and_dogs.zip +├── cats +│ ├── cat.1.jpg +│ ├── cat.2.jpg +│ ├── cat.3.jpg +│ └── ... +└── dogs + ├── dog.1.jpg + ├── dog.2.jpg + ├── dog.3.jpg + └── ... +``` + +You can also use a dataset from the Hugging Face Hub. Example dataset from Hugging Face Hub: [truepositive/hotdog_nothotdog](https://huggingface.co/datasets/truepositive/hotdog_nothotdog). + + +### Organizing Images for Image Regression/Scoring + + +Prepare a zip file containing your images and metadata.jsonl. + + +``` +Archive.zip +├── 0001.png +├── 0002.png +├── 0003.png +├── . +├── . +├── . +└── metadata.jsonl +``` + +Example for `metadata.jsonl`: + +``` +{"file_name": "0001.png", "target": 0.5} +{"file_name": "0002.png", "target": 0.7} +{"file_name": "0003.png", "target": 0.3} +``` + +Please note that metadata.jsonl should contain the `file_name` and the `target` value for each image. + +You can also use a dataset from the Hugging Face Hub. Example dataset from Hugging Face Hub: [abhishek/img-quality-full](https://huggingface.co/datasets/abhishek/img-quality-full). + +### Image Requirements + +- Format: Ensure all images are in JPEG, JPG, or PNG format. + +- Quantity: Include at least 5 images per class to provide the model with sufficient examples for learning. + +- Exclusivity: The zip file should exclusively contain folders named after the classes, +and these folders should only contain relevant images. No additional files or nested +folders should be included. + + +** Additional Tips** + +- Uniformity: While not required, having images of similar sizes and resolutions can help improve model performance. + +- Variability: Include a variety of images for each class to encompass the range of +appearances and contexts the model might encounter in real-world scenarios. + +Some points to keep in mind: + +- The zip file should contain multiple folders (the classes), each folder should contain images of a single class. +- The name of the folder should be the name of the class. +- The images must be jpeg, jpg or png. +- There should be at least 5 images per class. +- There must not be any other files in the zip file. +- There must not be any other folders inside the zip folder. + +When train.zip is decompressed, it creates two folders: cats and dogs. these are the two categories for classification. The images for both categories are in their respective folders. You can have as many categories as you want. + +## Column Mapping + +For image classification, if you are using a `zip` dataset format, the column mapping should be default and should not be changed. + +```yaml +data: + . + . + . + column_mapping: + image_column: image + target_column: label +``` + +For image regression, the column mapping must be as follows: + +```yaml +data: + . + . + . + column_mapping: + image_column: image + target_column: target +``` + +For image regression, `metadata.jsonl` should contain the `file_name` and the `target` value for each image. + +If you are using a dataset from the Hugging Face Hub, you should set appropriate column mappings based on the dataset. 
+ + +## Training + +### Local Training + +To train the model locally, create a configuration file (config.yaml) with the following content: + +```yaml +task: image_classification +base_model: google/vit-base-patch16-224 +project_name: autotrain-cats-vs-dogs-finetuned +log: tensorboard +backend: local + +data: + path: cats_vs_dogs + train_split: train + valid_split: null + column_mapping: + image_column: image + target_column: label + +params: + epochs: 2 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true +``` + +Here, we are using `cats_and_dogs` dataset from Hugging Face Hub. The model is trained for 2 epochs with a batch size of 4 and a learning rate of `2e-5`. We are using the `adamw_torch` optimizer and the `linear` scheduler. We are also using mixed precision training with a gradient accumulation of 1. + +In order to use a local dataset, you can change the `data` section to: + +```yaml +data: + path: data/ + train_split: train # this folder inside data/ will be used for training, it contains the images in subfolders. + valid_split: valid # this folder inside data/ will be used for validation, it contains the images in subfolders. can also be null. + column_mapping: + image_column: image + target_column: label +``` + +Similarly, for image regression, you can use the following configuration file: + +```yaml +task: image_regression +base_model: microsoft/resnet-50 +project_name: autotrain-img-quality-resnet50 +log: tensorboard +backend: local + +data: + path: abhishek/img-quality-full + train_split: train + valid_split: null + column_mapping: + image_column: image + target_column: target + +params: + epochs: 10 + batch_size: 8 + lr: 2e-3 + optimizer: adamw_torch + scheduler: cosine + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true +``` + +To train the model, run the following command: + +```bash +$ autotrain --config config.yaml +``` + +This will start the training process and save the model to the Hugging Face Hub after training is complete. In case you dont want to save the model to the hub, you can set `push_to_hub` to `false` in the configuration file. + +### Training on Hugging Face Spaces + +To train the model on Hugging Face Spaces, create a training space as described in `Quickstart` section. + +An example UI for training an image scoring model on Hugging Face Spaces is shown below: + +![llm-finetuning](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/img_reg_ui.png) + +In this example, we are training an image scoring model using the `microsoft/resnet-50` model on the `abhishek/img-quality-full` dataset. +We are training the model for 3 epochs with a batch size of 8 and a learning rate of `5e-5`. +We are using the `adamw_torch` optimizer and the `linear` scheduler. +We are also using mixed precision training with a gradient accumulation of 1. + +Note how the column mapping has now been changed and `target` points to `quality_mos` column in the dataset. + +To train the model, click on the `Start Training` button. This will start the training process and save the model to the Hugging Face Hub after training is complete. 
+ +## Parameters + +### Image Classification Parameters + +[[autodoc]] trainers.image_classification.params.ImageClassificationParams + +### Image Regression Parameters + +[[autodoc]] trainers.image_regression.params.ImageRegressionParams \ No newline at end of file diff --git a/docs/source/tasks/llm_finetuning.mdx b/docs/source/tasks/llm_finetuning.mdx new file mode 100644 index 0000000000000000000000000000000000000000..cb6041d49e6a209019e94788b9bfed39f2b42d59 --- /dev/null +++ b/docs/source/tasks/llm_finetuning.mdx @@ -0,0 +1,357 @@ +# LLM Finetuning with AutoTrain Advanced + +AutoTrain Advanced makes it easy to fine-tune large language models (LLMs) for your specific use cases. This guide covers everything you need to know about LLM fine-tuning. + +## Key Features +- Simple data preparation with CSV and JSONL formats +- Support for multiple training approaches (SFT, DPO, ORPO) +- Built-in chat templates +- Local and cloud training options +- Optimized training parameters + +## Supported Training Methods +AutoTrain supports multiple specialized trainers: +- `llm`: Generic LLM trainer +- `llm-sft`: Supervised Fine-Tuning trainer +- `llm-reward`: Reward modeling trainer +- `llm-dpo`: Direct Preference Optimization trainer +- `llm-orpo`: ORPO (Optimal Reward Policy Optimization) trainer + +## Data Preparation + +LLM finetuning accepts data in CSV and JSONL formats. JSONL is the preferred format. +How data is formatted depends on the task you are training the LLM for. + +### Classic Text Generation + +For text generation, the data should be in the following format: + +| text | +|---------------------------------------------------------------| +| wikipedia is a free online encyclopedia | +| it is a collaborative project | +| that anyone can edit | +| wikipedia is the largest and most popular general reference work on the internet | + +An example dataset for this format can be found here: [stas/openwebtext-10k](https://huggingface.co/datasets/stas/openwebtext-10k) + +Example tasks: +- Text generation +- Code completion + +Compatible trainers: +- SFT Trainer +- Generic Trainer + +### Chatbot / question-answering / code generation / function calling + +For this task, you can use CSV or JSONL data. If you are formatting the data yourself (adding start, end tokens, etc.), you can use CSV or JSONL format. +If you do not want to format the data yourself and want `--chat-template` parameter to format the data for you, you must use JSONL format. +In both cases, CSV and JSONL can be used interchangeably but JSONL is the most preferred format. + +To train a chatbot, your data will have `content` and `role`. Some models support `system` role as well. + +Here is an example of a chatbot dataset (single sample): + +``` +[{'content': 'Help write a letter of 100 -200 words to my future self for ' + 'Kyra, reflecting on her goals and aspirations.', + 'role': 'user'}, + {'content': 'Dear Future Self,\n' + '\n' + "I hope you're happy and proud of what you've achieved. As I " + "write this, I'm excited to think about our goals and how far " + "you've come. One goal was to be a machine learning engineer. I " + "hope you've worked hard and become skilled in this field. Keep " + 'learning and innovating. Traveling was important to us. I hope ' + "you've seen different places and enjoyed the beauty of our " + 'world. Remember the memories and lessons. Starting a family ' + 'mattered to us. If you have kids, treasure every moment. 
Be ' + 'patient, loving, and grateful for your family.\n' + '\n' + 'Take care of yourself. Rest, reflect, and cherish the time you ' + 'spend with loved ones. Remember your dreams and celebrate what ' + "you've achieved. Your determination brought you here. I'm " + "excited to see the person you've become, the impact you've made, " + 'and the love and joy in your life. Embrace opportunities and ' + 'keep dreaming big.\n' + '\n' + 'With love,\n' + 'Kyra', + 'role': 'assistant'}] +``` + +As you can see, the data has `content` and `role` columns. The `role` column can be `user` or `assistant` or `system`. +This data is, however, not formatted for training. You can use the `--chat-template` parameter to format the data during training. + +`--chat-template` supports the following kinds of templates: +- `none` (default) +- `zephyr` +- `chatml` +- `tokenizer`: use chat template mentioned in tokenizer config + +A multi-line sample is also shown below: + +```json +[{"content": "hello", "role": "user"}, {"content": "hi nice to meet you", "role": "assistant"}] +[{"content": "how are you", "role": "user"}, {"content": "I am fine", "role": "assistant"}] +[{"content": "What is your name?", "role": "user"}, {"content": "My name is Mary", "role": "assistant"}] +[{"content": "Which is the best programming language?", "role": "user"}, {"content": "Python", "role": "assistant"}] +. +. +. +``` + +An example dataset for this format can be found here: [HuggingFaceH4/no_robots](https://huggingface.co/datasets/HuggingFaceH4/no_robots) + +If you dont want to format the data using `--chat-template`, you can format the data yourself and use the following format: + +``` +<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 03 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHelp write a letter of 100 -200 words to my future self for Kyra, reflecting on her goals and aspirations.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nDear Future Self,\n\nI hope you're happy and proud of what you've achieved. As I write this, I'm excited to think about our goals and how far you've come. One goal was to be a machine learning engineer. I hope you've worked hard and become skilled in this field. Keep learning and innovating. Traveling was important to us. I hope you've seen different places and enjoyed the beauty of our world. Remember the memories and lessons. Starting a family mattered to us. If you have kids, treasure every moment. Be patient, loving, and grateful for your family.\n\nTake care of yourself. Rest, reflect, and cherish the time you spend with loved ones. Remember your dreams and celebrate what you've achieved. Your determination brought you here. I'm excited to see the person you've become, the impact you've made, and the love and joy in your life. 
Embrace opportunities and keep dreaming big.\n\nWith love,\nKyra<|eot_id|> +``` + +A sample multi-line dataset is shown below: + +```json +[{"text": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 03 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nhi nice to meet you<|eot_id|>"}] +[{"text": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 03 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nhow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am fine<|eot_id|>"}] +[{"text": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 03 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is your name?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy name is Mary<|eot_id|>"}] +[{"text": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 03 Oct 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhich is the best programming language?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nPython<|eot_id|>"}] +. +. +. +``` + +An example dataset for this format can be found here: [timdettmers/openassistant-guanaco](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) + +In the examples above, we have seen only two turns: one from the user and one from the assistant. However, you can have multiple turns from the user and assistant in a single sample. + +Chat models can be trained using the following trainers: + +- SFT Trainer: + - requires only `text` column + - example dataset: [HuggingFaceH4/no_robots](https://huggingface.co/datasets/HuggingFaceH4/no_robots) + +- Generic Trainer: + - requires only `text` column + - example dataset: [HuggingFaceH4/no_robots](https://huggingface.co/datasets/HuggingFaceH4/no_robots) + +- Reward Trainer: + - requires `text` and `rejected_text` columns + - example dataset: [trl-lib/ultrafeedback_binarized](https://huggingface.co/datasets/trl-lib/ultrafeedback_binarized) + +- DPO Trainer: + - requires `prompt`, `text`, and `rejected_text` columns + - example dataset: [trl-lib/ultrafeedback_binarized](https://huggingface.co/datasets/trl-lib/ultrafeedback_binarized) + +- ORPO Trainer: + - requires `prompt`, `text`, and `rejected_text` columns + - example dataset: [trl-lib/ultrafeedback_binarized](https://huggingface.co/datasets/trl-lib/ultrafeedback_binarized) + +The only difference between the data format for reward trainer and DPO/ORPO trainer is that the reward trainer requires only `text` and `rejected_text` columns, while the DPO/ORPO trainer requires an additional `prompt` column. 
+ +## Best Practices for LLM Fine-tuning + +### Memory Optimization +- Use appropriate `block_size` and `model_max_length` for your hardware +- Enable mixed precision training when possible +- Utilize PEFT techniques for large models + +### Data Quality +- Clean and validate your training data +- Ensure balanced conversation samples +- Use appropriate chat templates + +### Training Tips +- Start with small learning rates +- Monitor training metrics using tensorboard +- Validate model outputs during training + +### Related Resources +- [AutoTrain Documentation](https://huggingface.co/docs/autotrain) +- [Example Fine-tuned Models](https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads) +- [Training Datasets](https://huggingface.co/datasets?task_categories=task_categories:text-generation) + +## Training + +### Local Training + +Locally the training can be performed by using `autotrain --config config.yaml` command. The `config.yaml` file should contain the following parameters: + +```yaml +task: llm-orpo +base_model: meta-llama/Meta-Llama-3-8B-Instruct +project_name: autotrain-llama3-8b-orpo +log: tensorboard +backend: local + +data: + path: argilla/distilabel-capybara-dpo-7k-binarized + train_split: train + valid_split: null + chat_template: chatml + column_mapping: + text_column: chosen + rejected_text_column: rejected + prompt_text_column: prompt + +params: + block_size: 1024 + model_max_length: 8192 + max_prompt_length: 512 + epochs: 3 + batch_size: 2 + lr: 3e-5 + peft: true + quantization: int4 + target_modules: all-linear + padding: right + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 4 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true +``` + +In the above config file, we are training a model using the ORPO trainer. +The model is trained on the `meta-llama/Meta-Llama-3-8B-Instruct` model. +The data is `argilla/distilabel-capybara-dpo-7k-binarized` dataset. The `chat_template` parameter is set to `chatml`. +The `column_mapping` parameter is used to map the columns in the dataset to the required columns for the ORPO trainer. +The `params` section contains the training parameters such as `block_size`, `model_max_length`, `epochs`, `batch_size`, `lr`, `peft`, `quantization`, `target_modules`, `padding`, `optimizer`, `scheduler`, `gradient_accumulation`, and `mixed_precision`. +The `hub` section contains the username and token for the Hugging Face account and the `push_to_hub` parameter is set to `true` to push the trained model to the Hugging Face Hub. + +If you have training file locally, you can change data part to: + +```yaml +data: + path: path/to/training/file + train_split: train # name of the training file + valid_split: null + chat_template: chatml + column_mapping: + text_column: chosen + rejected_text_column: rejected + prompt_text_column: prompt +``` + +The above assumes you have `train.csv` or `train.jsonl` in the `path/to/training/file` directory and you will be applying `chatml` template to the data. + +You can run the training using the following command: + +```bash +$ autotrain --config config.yaml +``` + +More example config files for finetuning different types of lllm and different tasks can be found in the [here](https://github.com/huggingface/autotrain-advanced/tree/main/configs/llm_finetuning). 
+ +### Training in Hugging Face Spaces + +If you are training in Hugging Face Spaces, everything is the same as local training: + +![llm-finetuning](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/llm_orpo_example.png) + +In the UI, you need to make sure you select the right model, the dataset and the splits. Special care should be taken for `column_mapping`. + +Once you are happy with the parameters, you can click on the `Start Training` button to start the training process. + +## Parameters + +### LLM Fine Tuning Parameters + +[[autodoc]] trainers.clm.params.LLMTrainingParams + +### Task specific parameters + + +The length parameters used for different trainers can be different. Some require more context than others. + +- block_size: This is the maximum sequence length or length of one block of text. Setting to -1 determines block size automatically. Default is -1. +- model_max_length: Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. Default is 1024 +- max_prompt_length: Specify the maximum length for prompts used in training, particularly relevant for tasks requiring initial contextual input. Used only for `orpo` and `dpo` trainer. +- max_completion_length: Completion length to use, for orpo: encoder-decoder models only. For dpo, it is the length of the completion text. + +**NOTE**: + - block size cannot be greater than model_max_length! + - max_prompt_length cannot be greater than model_max_length! + - max_prompt_length cannot be greater than block_size! + - max_completion_length cannot be greater than model_max_length! + - max_completion_length cannot be greater than block_size! + +**NOTE**: Not following these constraints will result in an error / nan losses. + +#### Generic Trainer + +``` +--add_eos_token, --add-eos-token + Toggle whether to automatically add an End Of Sentence (EOS) token at the end of texts, which can be critical for certain + types of models like language models. Only used for `default` trainer +--block_size BLOCK_SIZE, --block-size BLOCK_SIZE + Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to + -1 determines block size automatically. Default is -1. +--model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH + Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. + Default is 1024 +``` + +#### SFT Trainer + +``` +--block_size BLOCK_SIZE, --block-size BLOCK_SIZE + Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to + -1 determines block size automatically. Default is -1. +--model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH + Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. + Default is 1024 +``` + +#### Reward Trainer + +``` +--block_size BLOCK_SIZE, --block-size BLOCK_SIZE + Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to + -1 determines block size automatically. Default is -1. +--model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH + Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. 
+ Default is 1024 +``` + +#### DPO Trainer + +``` +--dpo-beta DPO_BETA, --dpo-beta DPO_BETA + Beta for DPO trainer + +--model-ref MODEL_REF + Reference model to use for DPO when not using PEFT +--block_size BLOCK_SIZE, --block-size BLOCK_SIZE + Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to + -1 determines block size automatically. Default is -1. +--model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH + Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. + Default is 1024 +--max_prompt_length MAX_PROMPT_LENGTH, --max-prompt-length MAX_PROMPT_LENGTH + Specify the maximum length for prompts used in training, particularly relevant for tasks requiring initial contextual input. + Used only for `orpo` trainer. +--max_completion_length MAX_COMPLETION_LENGTH, --max-completion-length MAX_COMPLETION_LENGTH + Completion length to use, for orpo: encoder-decoder models only +``` + +#### ORPO Trainer + +``` +--block_size BLOCK_SIZE, --block-size BLOCK_SIZE + Specify the block size for processing sequences. This is maximum sequence length or length of one block of text. Setting to + -1 determines block size automatically. Default is -1. +--model_max_length MODEL_MAX_LENGTH, --model-max-length MODEL_MAX_LENGTH + Set the maximum length for the model to process in a single batch, which can affect both performance and memory usage. + Default is 1024 +--max_prompt_length MAX_PROMPT_LENGTH, --max-prompt-length MAX_PROMPT_LENGTH + Specify the maximum length for prompts used in training, particularly relevant for tasks requiring initial contextual input. + Used only for `orpo` trainer. +--max_completion_length MAX_COMPLETION_LENGTH, --max-completion-length MAX_COMPLETION_LENGTH + Completion length to use, for orpo: encoder-decoder models only +``` diff --git a/docs/source/tasks/object_detection.mdx b/docs/source/tasks/object_detection.mdx new file mode 100644 index 0000000000000000000000000000000000000000..3d1330978955c52355d1b129566fb1b78cbe4835 --- /dev/null +++ b/docs/source/tasks/object_detection.mdx @@ -0,0 +1,62 @@ +# Object Detection + +Object detection is a form of supervised learning where a model is trained to identify +and categorize objects within images. AutoTrain simplifies the process, enabling you to +train a state-of-the-art object detection model by simply uploading labeled example images. + + +## Preparing your data + +To ensure your object detection model trains effectively, follow these guidelines for preparing your data: + + +### Organizing Images + + +Prepare a zip file containing your images and metadata.jsonl. + + +``` +Archive.zip +├── 0001.png +├── 0002.png +├── 0003.png +├── . +├── . +├── . +└── metadata.jsonl +``` + +Example for `metadata.jsonl`: + +``` +{"file_name": "0001.png", "objects": {"bbox": [[302.0, 109.0, 73.0, 52.0]], "category": [0]}} +{"file_name": "0002.png", "objects": {"bbox": [[810.0, 100.0, 57.0, 28.0]], "category": [1]}} +{"file_name": "0003.png", "objects": {"bbox": [[160.0, 31.0, 248.0, 616.0], [741.0, 68.0, 202.0, 401.0]], "category": [2, 2]}} +``` + +Please note that bboxes need to be in COCO format `[x, y, width, height]`. + + +### Image Requirements + +- Format: Ensure all images are in JPEG, JPG, or PNG format. + +- Quantity: Include at least 5 images to provide the model with sufficient examples for learning. + +- Exclusivity: The zip file should exclusively contain images and metadata.jsonl. 
+No additional files or nested folders should be included. + + +Some points to keep in mind: + +- The images must be jpeg, jpg or png. +- There should be at least 5 images per split. +- There must not be any other files in the zip file. +- There must not be any other folders inside the zip folder. + +When train.zip is decompressed, it creates no folders: only images and metadata.jsonl. + +## Parameters + +[[autodoc]] trainers.object_detection.params.ObjectDetectionParams diff --git a/docs/source/tasks/sentence_transformer.mdx b/docs/source/tasks/sentence_transformer.mdx new file mode 100644 index 0000000000000000000000000000000000000000..ba7f68af01f3ccf42fa0ff1fffa636dc767379a5 --- /dev/null +++ b/docs/source/tasks/sentence_transformer.mdx @@ -0,0 +1,75 @@ +# Sentence Transformers + +This task lets you easily train or fine-tune a Sentence Transformer model on your own dataset. + +AutoTrain supports the following types of sentence transformer finetuning: + +- `pair`: dataset with two sentences: anchor and positive +- `pair_class`: dataset with two sentences: premise and hypothesis and a target label +- `pair_score`: dataset with two sentences: sentence1 and sentence2 and a target score +- `triplet`: dataset with three sentences: anchor, positive and negative +- `qa`: dataset with two sentences: query and answer + +## Data Format + +Sentence Transformers finetuning accepts data in CSV/JSONL format. You can also use a dataset from Hugging Face Hub. + +### `pair` + +For `pair` training, the data should be in the following format: + +| anchor | positive | +|--------|----------| +| hello | hi | +| how are you | I am fine | +| What is your name? | My name is Abhishek | +| Which is the best programming language? | Python | + +### `pair_class` + +For `pair_class` training, the data should be in the following format: + +| premise | hypothesis | label | +|---------|------------|-------| +| hello | hi | 1 | +| how are you | I am fine | 0 | +| What is your name? | My name is Abhishek | 1 | +| Which is the best programming language? | Python | 1 | + +### `pair_score` + +For `pair_score` training, the data should be in the following format: + +| sentence1 | sentence2 | score | +|-----------|-----------|-------| +| hello | hi | 0.8 | +| how are you | I am fine | 0.2 | +| What is your name? | My name is Abhishek | 0.9 | +| Which is the best programming language? | Python | 0.7 | + +### `triplet` + +For `triplet` training, the data should be in the following format: + +| anchor | positive | negative | +|--------|----------|----------| +| hello | hi | bye | +| how are you | I am fine | I am not fine | +| What is your name? | My name is Abhishek | Whats it to you? | +| Which is the best programming language? | Python | Javascript | + +### `qa` + +For `qa` training, the data should be in the following format: + +| query | answer | +|-------|--------| +| hello | hi | +| how are you | I am fine | +| What is your name? | My name is Abhishek | +| Which is the best programming language? | Python | + + +## Parameters + +[[autodoc]] trainers.sent_transformers.params.SentenceTransformersParams diff --git a/docs/source/tasks/seq2seq.mdx b/docs/source/tasks/seq2seq.mdx new file mode 100644 index 0000000000000000000000000000000000000000..f798ea160674bc27cab31f2c2d749dd762323c08 --- /dev/null +++ b/docs/source/tasks/seq2seq.mdx @@ -0,0 +1,37 @@ +# Seq2Seq + +Seq2Seq is a task that involves converting a sequence of words into another sequence of words. 
+It is used in machine translation, text summarization, and question answering. + +## Data Format + +You can have the dataset as a CSV file: + +```csv +text,target +"this movie is great","dieser Film ist großartig" +"this movie is bad","dieser Film ist schlecht" +. +. +. +``` + +Or as a JSONL file: + +```json +{"text": "this movie is great", "target": "dieser Film ist großartig"} +{"text": "this movie is bad", "target": "dieser Film ist schlecht"} +. +. +. +``` + + +## Columns + +Your CSV/JSONL dataset must have two columns: `text` and `target`. + + +## Parameters + +[[autodoc]] trainers.seq2seq.params.Seq2SeqParams diff --git a/docs/source/tasks/tabular.mdx b/docs/source/tasks/tabular.mdx new file mode 100644 index 0000000000000000000000000000000000000000..99bb9801841e6d221dcde388319c81735aa4fc17 --- /dev/null +++ b/docs/source/tasks/tabular.mdx @@ -0,0 +1,49 @@ +# Tabular Classification / Regression + +Using AutoTrain, you can train a model to classify or regress tabular data easily. +All you need to do is select from a list of models and upload your dataset. +Parameter tuning is done automatically. + +## Models + +The following models are available for tabular classification / regression. + +- xgboost +- random_forest +- ridge +- logistic_regression +- svm +- extra_trees +- gradient_boosting +- adaboost +- decision_tree +- knn + + +## Data Format + +```csv +id,category1,category2,feature1,target +1,A,X,0.3373961604172684,1 +2,B,Z,0.6481718720511972,0 +3,A,Y,0.36824153984054797,1 +4,B,Z,0.9571551589530464,1 +5,B,Z,0.14035078041264515,1 +6,C,X,0.8700872583584364,1 +7,A,Y,0.4736080452737105,0 +8,C,Y,0.8009107519796442,1 +9,A,Y,0.5204774795512048,0 +10,A,Y,0.6788795301189603,0 +. +. +. +``` + +## Columns + +Your CSV dataset must have two columns: `id` and `target`. + + +## Parameters + +[[autodoc]] trainers.tabular.params.TabularParams diff --git a/docs/source/tasks/text_classification_regression.mdx b/docs/source/tasks/text_classification_regression.mdx new file mode 100644 index 0000000000000000000000000000000000000000..503bfe2ae0047efbb4bb0984428255dbc62323fd --- /dev/null +++ b/docs/source/tasks/text_classification_regression.mdx @@ -0,0 +1,150 @@ +# Text Classification & Regression + +Training a text classification/regression model with AutoTrain is super-easy! Get your data ready in +proper format and then with just a few clicks, your state-of-the-art model will be ready to +be used in production. + +Config file task names: +- `text_classification` +- `text-classification` +- `text_regression` +- `text-regression` + +## Data Format + +Text classification/regression supports datasets in both CSV and JSONL formats. + +### CSV Format + +Let's train a model for classifying the sentiment of a movie review. The data should be +in the following CSV format: + +```csv +text,target +"this movie is great",positive +"this movie is bad",negative +. +. +. +``` + +As you can see, we have two columns in the CSV file. One column is the text and the other +is the label. The label can be any string. In this example, we have two labels: `positive` +and `negative`. You can have as many labels as you want. + +And if you would like to train a model for scoring a movie review on a scale of 1-5. The data can be as follows: + +```csv +text,target +"this movie is great",4.9 +"this movie is bad",1.5 +. +. +. +``` + +### JSONL Format +Instead of CSV you can also use JSONL format. 
The JSONL format should be as follows: + +```json +{"text": "this movie is great", "target": "positive"} +{"text": "this movie is bad", "target": "negative"} +. +. +. +``` + +and for regression: + +```json +{"text": "this movie is great", "target": 4.9} +{"text": "this movie is bad", "target": 1.5} +. +. +``` + +### Column Mapping / Names + +Your CSV dataset must have two columns: `text` and `target`. +If your column names are different than `text` and `target`, you can map the dataset column to AutoTrain column names. + +## Training + +### Local Training + +To train a text classification/regression model locally, you can use the `autotrain --config config.yaml` command. + +Here is an example of a `config.yaml` file for training a text classification model: + +```yaml +task: text_classification # or text_regression +base_model: google-bert/bert-base-uncased +project_name: autotrain-bert-imdb-finetuned +log: tensorboard +backend: local + +data: + path: stanfordnlp/imdb + train_split: train + valid_split: test + column_mapping: + text_column: text + target_column: label + +params: + max_seq_length: 512 + epochs: 3 + batch_size: 4 + lr: 2e-5 + optimizer: adamw_torch + scheduler: linear + gradient_accumulation: 1 + mixed_precision: fp16 + +hub: + username: ${HF_USERNAME} + token: ${HF_TOKEN} + push_to_hub: true +``` + +In this example, we are training a text classification model using the `google-bert/bert-base-uncased` model on the IMDB dataset. +We are using the `stanfordnlp/imdb` dataset, which is already available on Hugging Face Hub. +We are training the model for 3 epochs with a batch size of 4 and a learning rate of `2e-5`. +We are using the `adamw_torch` optimizer and the `linear` scheduler. +We are also using mixed precision training with a gradient accumulation of 1. + +If you want to use a local CSV/JSONL dataset, you can change the `data` section to: + +```yaml +data: + path: data/ # this must be the path to the directory containing the train and valid files + train_split: train # this must be either train.csv or train.json + valid_split: valid # this must be either valid.csv or valid.json + column_mapping: + text_column: text # this must be the name of the column containing the text + target_column: label # this must be the name of the column containing the target +``` + +To train the model, run the following command: + +```bash +$ autotrain --config config.yaml +``` + +You can find example config files for text classification and regression in the [here](https://github.com/huggingface/autotrain-advanced/tree/main/configs/text_classification) and [here](https://github.com/huggingface/autotrain-advanced/tree/main/configs/text_regression) respectively. + +### Training on Hugging Face Spaces + +The parameters for training on Hugging Face Spaces are the same as for local training. +If you are using your own dataset, select "Local" as dataset source and upload your dataset. +In the following screenshot, we are training a text classification model using the `google-bert/bert-base-uncased` model on the IMDB dataset. + +![AutoTrain Text Classification on Hugging Face Spaces](https://raw.githubusercontent.com/huggingface/autotrain-advanced/main/static/autotrain_text_classification.png) + +For text regression, all you need to do is select "Text Regression" as the task and everything else remains the same (except the data, of course). + +## Training Parameters + +Training parameters for text classification and regression are the same. 
+ +[[autodoc]] trainers.text_classification.params.TextClassificationParams diff --git a/docs/source/tasks/token_classification.mdx b/docs/source/tasks/token_classification.mdx new file mode 100644 index 0000000000000000000000000000000000000000..a4b064e16bebd5e092ab641d46f0c30c91be39a8 --- /dev/null +++ b/docs/source/tasks/token_classification.mdx @@ -0,0 +1,65 @@ +# Token Classification + +Token classification is the task of classifying each token in a sequence. This can be used +for Named Entity Recognition (NER), Part-of-Speech (POS) tagging, and more. Get your data ready in +proper format and then with just a few clicks, your state-of-the-art model will be ready to +be used in production. + +## Data Format + +The data should be in the following CSV format: + +```csv +tokens,tags +"['I', 'love', 'Paris']","['O', 'O', 'B-LOC']" +"['I', 'live', 'in', 'New', 'York']","['O', 'O', 'O', 'B-LOC', 'I-LOC']" +. +. +. +``` + +or you can also use JSONL format: + +```json +{"tokens": ["I", "love", "Paris"],"tags": ["O", "O", "B-LOC"]} +{"tokens": ["I", "live", "in", "New", "York"],"tags": ["O", "O", "O", "B-LOC", "I-LOC"]} +. +. +. +``` + +As you can see, we have two columns in the CSV file. One column is the tokens and the other +is the tags. Both the columns are stringified lists! The tokens column contains the tokens +of the sentence and the tags column contains the tags for each token. + +If your CSV is huge, you can divide it into multiple CSV files and upload them separately. +Please make sure that the column names are the same in all CSV files. + +One way to divide the CSV file using pandas is as follows: + +```python +import pandas as pd + +# Set the chunk size +chunk_size = 1000 +i = 1 + +# Open the CSV file and read it in chunks +for chunk in pd.read_csv('example.csv', chunksize=chunk_size): + # Save each chunk to a new file + chunk.to_csv(f'chunk_{i}.csv', index=False) + i += 1 +``` + + +Sample dataset from HuggingFace Hub: [conll2003](https://huggingface.co/datasets/eriktks/conll2003) + + +## Columns + +Your CSV/JSONL dataset must have two columns: `tokens` and `tags`. + + +## Parameters + +[[autodoc]] trainers.token_classification.params.TokenClassificationParams diff --git a/notebooks/llm_finetuning.ipynb b/notebooks/llm_finetuning.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..8c1b9c3a80236514d6a9e5f533308b70bf5cf987 --- /dev/null +++ b/notebooks/llm_finetuning.ipynb @@ -0,0 +1,120 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LLM Finetuning using AutoTrain Advanced\n", + "\n", + "In this notebook, we will finetune a llama-3.2-1b-instruct model using AutoTrain Advanced.\n", + "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n", + "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from autotrain.params import LLMTrainingParams\n", + "from autotrain.project import AutoTrainProject" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "HF_USERNAME = \"your_huggingface_username\"\n", + "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n", + "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n", + "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params = LLMTrainingParams(\n", + " model=\"meta-llama/Llama-3.2-1B-Instruct\",\n", + " data_path=\"HuggingFaceH4/no_robots\", # path to the dataset on huggingface hub\n", + " chat_template=\"tokenizer\", # using the chat template defined in the model's tokenizer\n", + " text_column=\"messages\", # the column in the dataset that contains the text\n", + " train_split=\"train\",\n", + " trainer=\"sft\", # using the SFT trainer, choose from sft, default, orpo, dpo and reward\n", + " epochs=3,\n", + " batch_size=1,\n", + " lr=1e-5,\n", + " peft=True, # training LoRA using PEFT\n", + " quantization=\"int4\", # using int4 quantization\n", + " target_modules=\"all-linear\",\n", + " padding=\"right\",\n", + " optimizer=\"paged_adamw_8bit\",\n", + " scheduler=\"cosine\",\n", + " gradient_accumulation=8,\n", + " mixed_precision=\"bf16\",\n", + " merge_adapter=True,\n", + " project_name=\"autotrain-llama32-1b-finetune\",\n", + " log=\"tensorboard\",\n", + " push_to_hub=True,\n", + " username=HF_USERNAME,\n", + " token=HF_TOKEN,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n", + "\n", + "```python\n", + "params = LLMTrainingParams(\n", + " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n", + " text_column=\"text\", # this is the column name in the CSV/JSONL file which contains the text\n", + " train_split = \"train\" # this is the filename without extension\n", + " .\n", + " .\n", + " .\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this will train the model locally\n", + "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n", + "project.create()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "autotrain", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/text_classification.ipynb b/notebooks/text_classification.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..32f41546eb1c17dc607c95d90143a5069d6b42ed --- /dev/null +++ b/notebooks/text_classification.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Text Classification using AutoTrain Advanced\n", + "\n", + "In this notebook, we will 
train a text classification model using AutoTrain Advanced.\n", + "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n", + "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from autotrain.params import TextClassificationParams\n", + "from autotrain.project import AutoTrainProject" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "HF_USERNAME = \"your_huggingface_username\"\n", + "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n", + "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n", + "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "params = TextClassificationParams(\n", + " model=\"google-bert/bert-base-uncased\",\n", + " data_path=\"stanfordnlp/imdb\", # path to the dataset on huggingface hub\n", + " text_column=\"text\", # the column in the dataset that contains the text\n", + " target_column=\"label\", # the column in the dataset that contains the labels\n", + " train_split=\"train\",\n", + " valid_split=\"test\",\n", + " epochs=3,\n", + " batch_size=8,\n", + " max_seq_length=512,\n", + " lr=1e-5,\n", + " optimizer=\"adamw_torch\",\n", + " scheduler=\"linear\",\n", + " gradient_accumulation=1,\n", + " mixed_precision=\"fp16\",\n", + " project_name=\"autotrain-model\",\n", + " log=\"tensorboard\",\n", + " push_to_hub=True,\n", + " username=HF_USERNAME,\n", + " token=HF_TOKEN,\n", + ")\n", + "# tip: you can use `?TextClassificationParams` to see the full list of allowed parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n", + "\n", + "```python\n", + "params = TextClassificationParams(\n", + " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n", + " text_column=\"text\", # this is the column name in the CSV/JSONL file which contains the text\n", + " train_split = \"train\" # this is the filename without extension\n", + " valid_split = \"valid\" # this is the filename without extension\n", + " .\n", + " .\n", + " .\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this will train the model locally\n", + "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n", + "project.create()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "autotrain", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/text_regression.ipynb b/notebooks/text_regression.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..64f340dd870bc6a4ee3176e6c5ff5a5e9c84ff77 --- /dev/null +++ 
b/notebooks/text_regression.ipynb @@ -0,0 +1,118 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Text Regression/Scoring using AutoTrain Advanced\n", + "\n", + "In this notebook, we will train a text regression/scoring model using AutoTrain Advanced.\n", + "You can replace the model with any Hugging Face transformers compatible model and dataset with any other dataset in proper formatting.\n", + "For dataset formatting, please take a look at [docs](https://huggingface.co/docs/autotrain/index)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from autotrain.params import TextRegressionParams\n", + "from autotrain.project import AutoTrainProject" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "HF_USERNAME = \"your_huggingface_username\"\n", + "HF_TOKEN = \"your_huggingface_write_token\" # get it from https://huggingface.co/settings/token\n", + "# It is recommended to use secrets or environment variables to store your HF_TOKEN\n", + "# your token is required if push_to_hub is set to True or if you are accessing a gated model/dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "params = TextRegressionParams(\n", + " model=\"google-bert/bert-base-uncased\",\n", + " data_path=\"lewtun/drug-reviews\", # path to the dataset on huggingface hub\n", + " text_column=\"review\", # the column in the dataset that contains the text\n", + " target_column=\"rating\", # the column in the dataset that contains the labels\n", + " train_split=\"train\",\n", + " valid_split=\"test\",\n", + " epochs=3,\n", + " batch_size=8,\n", + " max_seq_length=512,\n", + " lr=1e-5,\n", + " optimizer=\"adamw_torch\",\n", + " scheduler=\"linear\",\n", + " gradient_accumulation=1,\n", + " mixed_precision=\"fp16\",\n", + " project_name=\"autotrain-model\",\n", + " log=\"tensorboard\",\n", + " push_to_hub=True,\n", + " username=HF_USERNAME,\n", + " token=HF_TOKEN,\n", + ")\n", + "# tip: you can use `?TextClassificationParams` to see the full list of allowed parameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If your dataset is in CSV / JSONL format (JSONL is most preferred) and is stored locally, make the following changes to `params`:\n", + "\n", + "```python\n", + "params = TextRegressionParams(\n", + " data_path=\"data/\", # this is the path to folder where train.jsonl/train.csv is located\n", + " text_column=\"text\", # this is the column name in the CSV/JSONL file which contains the text\n", + " train_split = \"train\" # this is the filename without extension\n", + " valid_split = \"valid\" # this is the filename without extension\n", + " .\n", + " .\n", + " .\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this will train the model locally\n", + "project = AutoTrainProject(params=params, backend=\"local\", process=True)\n", + "project.create()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "autotrain", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff 
--git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..46cedd0ab57396ad7f7e9e95c63e75d8a4b9de57 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,49 @@ +albumentations==1.4.23 +datasets[vision]~=3.2.0 +evaluate==0.4.3 +ipadic==1.0.0 +jiwer==3.0.5 +joblib==1.4.2 +loguru==0.7.3 +pandas==2.2.3 +nltk==3.9.1 +optuna==4.1.0 +Pillow==11.0.0 +sacremoses==0.1.1 +scikit-learn==1.6.0 +sentencepiece==0.2.0 +tqdm==4.67.1 +werkzeug==3.1.3 +xgboost==2.1.3 +huggingface_hub==0.27.0 +requests==2.32.3 +einops==0.8.0 +packaging==24.2 +cryptography==44.0.0 +nvitop==1.3.2 +# latest versions +tensorboard==2.18.0 +peft==0.14.0 +trl==0.13.0 +tiktoken==0.8.0 +transformers==4.48.0 +accelerate==1.2.1 +bitsandbytes==0.45.0 +# extras +rouge_score==0.1.2 +py7zr==0.22.0 +fastapi==0.115.6 +uvicorn==0.34.0 +python-multipart==0.0.20 +pydantic==2.10.4 +hf-transfer +pyngrok==7.2.1 +authlib==1.4.0 +itsdangerous==2.2.0 +seqeval==1.2.2 +httpx==0.28.1 +pyyaml==6.0.2 +timm==1.0.12 +torchmetrics==1.6.0 +pycocotools==2.0.8 +sentence-transformers==3.3.1 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..d517262fc02ca47477dcccc48a9daae98c0d893f --- /dev/null +++ b/setup.cfg @@ -0,0 +1,26 @@ +[metadata] +license_files = LICENSE +version = attr: autotrain.__version__ + +[isort] +ensure_newline_before_comments = True +force_grid_wrap = 0 +include_trailing_comma = True +line_length = 119 +lines_after_imports = 2 +multi_line_output = 3 +use_parentheses = True + +[flake8] +ignore = E203, E501, W503 +max-line-length = 119 +per-file-ignores = + # imported but unused + __init__.py: F401, E402 + src/autotrain/params.py: F401 +exclude = + .git, + .venv, + __pycache__, + dist + build \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..fd5c30e706cfa5104c7f8af423c84ceee90b1de2 --- /dev/null +++ b/setup.py @@ -0,0 +1,106 @@ +# Lint as: python3 +""" +HuggingFace / AutoTrain Advanced +""" +import os + +from setuptools import find_packages, setup + + +DOCLINES = __doc__.split("\n") + +this_directory = os.path.abspath(os.path.dirname(__file__)) +with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f: + LONG_DESCRIPTION = f.read() + +# get INSTALL_REQUIRES from requirements.txt +INSTALL_REQUIRES = [] +requirements_path = os.path.join(this_directory, "requirements.txt") +with open(requirements_path, encoding="utf-8") as f: + for line in f: + # Exclude 'bitsandbytes' if installing on macOS + if "bitsandbytes" in line: + line = line.strip() + " ; sys_platform == 'linux'" + INSTALL_REQUIRES.append(line.strip()) + else: + INSTALL_REQUIRES.append(line.strip()) + +QUALITY_REQUIRE = [ + "black", + "isort", + "flake8==3.7.9", +] + +TESTS_REQUIRE = ["pytest"] + +CLIENT_REQUIRES = ["requests", "loguru"] + + +EXTRAS_REQUIRE = { + "base": INSTALL_REQUIRES, + "dev": INSTALL_REQUIRES + QUALITY_REQUIRE + TESTS_REQUIRE, + "quality": INSTALL_REQUIRES + QUALITY_REQUIRE, + "docs": INSTALL_REQUIRES + + [ + "recommonmark", + "sphinx==3.1.2", + "sphinx-markdown-tables", + "sphinx-rtd-theme==0.4.3", + "sphinx-copybutton", + ], + "client": CLIENT_REQUIRES, +} + +setup( + name="autotrain-advanced", + description=DOCLINES[0], + long_description=LONG_DESCRIPTION, + long_description_content_type="text/markdown", + author="HuggingFace Inc.", + author_email="autotrain@huggingface.co", + url="https://github.com/huggingface/autotrain-advanced", + 
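# (install_requires above is assembled straight from requirements.txt, with a
# platform marker added for bitsandbytes; the entry_points declared below
# register the `autotrain` console command.)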
download_url="https://github.com/huggingface/autotrain-advanced/tags", + license="Apache 2.0", + package_dir={"": "src"}, + packages=find_packages("src"), + extras_require=EXTRAS_REQUIRE, + install_requires=INSTALL_REQUIRES, + entry_points={"console_scripts": ["autotrain=autotrain.cli.autotrain:main"]}, + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + ], + keywords="automl autonlp autotrain huggingface", + data_files=[ + ( + "static", + [ + "src/autotrain/app/static/logo.png", + "src/autotrain/app/static/scripts/fetch_data_and_update_models.js", + "src/autotrain/app/static/scripts/listeners.js", + "src/autotrain/app/static/scripts/utils.js", + "src/autotrain/app/static/scripts/poll.js", + "src/autotrain/app/static/scripts/logs.js", + ], + ), + ( + "templates", + [ + "src/autotrain/app/templates/index.html", + "src/autotrain/app/templates/error.html", + "src/autotrain/app/templates/duplicate.html", + "src/autotrain/app/templates/login.html", + ], + ), + ], + include_package_data=True, +) diff --git a/src/autotrain/__init__.py b/src/autotrain/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..956c9bee340ccb07921eee7128fcb5a6b488a52c --- /dev/null +++ b/src/autotrain/__init__.py @@ -0,0 +1,68 @@ +# coding=utf-8 +# Copyright 2020-2023 The HuggingFace AutoTrain Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
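# A minimal sketch (illustrative, not part of this module) of how the PEP 508
# environment marker that setup.py appends to the bitsandbytes requirement is
# evaluated; `packaging` is the same library pip uses for this:
from packaging.requirements import Requirement

_req = Requirement("bitsandbytes==0.45.0 ; sys_platform == 'linux'")
assert _req.marker.evaluate({"sys_platform": "linux"}) is True    # installed on Linux
assert _req.marker.evaluate({"sys_platform": "darwin"}) is False  # skipped on macOS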
+ +# Lint as: python3 +# pylint: enable=line-too-long +import os + + +os.environ["BITSANDBYTES_NOWELCOME"] = "1" +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" +os.environ["TOKENIZERS_PARALLELISM"] = "false" + + +import warnings + + +try: + import torch._dynamo + + torch._dynamo.config.suppress_errors = True +except ImportError: + pass + +from autotrain.logging import Logger + + +warnings.filterwarnings("ignore", category=UserWarning, module="tensorflow") +warnings.filterwarnings("ignore", category=UserWarning, module="transformers") +warnings.filterwarnings("ignore", category=UserWarning, module="peft") +warnings.filterwarnings("ignore", category=UserWarning, module="accelerate") +warnings.filterwarnings("ignore", category=UserWarning, module="datasets") +warnings.filterwarnings("ignore", category=FutureWarning, module="accelerate") +warnings.filterwarnings("ignore", category=UserWarning, module="huggingface_hub") + +logger = Logger().get_logger() +__version__ = "0.8.37.dev0" + + +def is_colab(): + try: + import google.colab + + return True + except ImportError: + return False + + +def is_unsloth_available(): + try: + from unsloth import FastLanguageModel + + return True + except Exception as e: + logger.warning("Unsloth not available, continuing without it") + logger.warning(e) + return False diff --git a/src/autotrain/app/__init__.py b/src/autotrain/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/app/api_routes.py b/src/autotrain/app/api_routes.py new file mode 100644 index 0000000000000000000000000000000000000000..8563ab15b838fc930142080c86354003f847e49c --- /dev/null +++ b/src/autotrain/app/api_routes.py @@ -0,0 +1,783 @@ +import json +from typing import Any, Dict, List, Literal, Optional, Tuple, Union, get_type_hints + +from fastapi import APIRouter, Depends, HTTPException, Request, status +from fastapi.responses import JSONResponse +from huggingface_hub import HfApi, constants +from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status +from pydantic import BaseModel, create_model, model_validator + +from autotrain import __version__, logger +from autotrain.app.params import HIDDEN_PARAMS, PARAMS, AppParams +from autotrain.app.utils import token_verification +from autotrain.project import AutoTrainProject +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams +from autotrain.trainers.image_classification.params import ImageClassificationParams +from autotrain.trainers.image_regression.params import ImageRegressionParams +from autotrain.trainers.object_detection.params import ObjectDetectionParams +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams +from autotrain.trainers.seq2seq.params import Seq2SeqParams +from autotrain.trainers.tabular.params import TabularParams +from autotrain.trainers.text_classification.params import TextClassificationParams +from autotrain.trainers.text_regression.params import TextRegressionParams +from autotrain.trainers.token_classification.params import TokenClassificationParams +from autotrain.trainers.vlm.params import VLMTrainingParams + + +FIELDS_TO_EXCLUDE = HIDDEN_PARAMS + ["push_to_hub"] + + +def create_api_base_model(base_class, class_name): + """ + Creates a new Pydantic model based on a given base class and class name, + excluding specified fields. 
+ + Args: + base_class (Type): The base Pydantic model class to extend. + class_name (str): The name of the new model class to create. + + Returns: + Type: A new Pydantic model class with the specified modifications. + + Notes: + - The function uses type hints from the base class to define the new model's fields. + - Certain fields are excluded from the new model based on the class name. + - The function supports different sets of hidden parameters for different class names. + - The new model's configuration is set to have no protected namespaces. + """ + annotations = get_type_hints(base_class) + if class_name in ("LLMSFTTrainingParamsAPI", "LLMRewardTrainingParamsAPI"): + more_hidden_params = [ + "model_ref", + "dpo_beta", + "add_eos_token", + "max_prompt_length", + "max_completion_length", + ] + elif class_name == "LLMORPOTrainingParamsAPI": + more_hidden_params = [ + "model_ref", + "dpo_beta", + "add_eos_token", + ] + elif class_name == "LLMDPOTrainingParamsAPI": + more_hidden_params = [ + "add_eos_token", + ] + elif class_name == "LLMGenericTrainingParamsAPI": + more_hidden_params = [ + "model_ref", + "dpo_beta", + "max_prompt_length", + "max_completion_length", + ] + else: + more_hidden_params = [] + _excluded = FIELDS_TO_EXCLUDE + more_hidden_params + new_fields: Dict[str, Tuple[Any, Any]] = {} + for name, field in base_class.__fields__.items(): + if name not in _excluded: + field_type = annotations[name] + if field.default is not None: + field_default = field.default + elif field.default_factory is not None: + field_default = field.default_factory + else: + field_default = None + new_fields[name] = (field_type, field_default) + return create_model( + class_name, + **{key: (value[0], value[1]) for key, value in new_fields.items()}, + __config__=type("Config", (), {"protected_namespaces": ()}), + ) + + +LLMSFTTrainingParamsAPI = create_api_base_model(LLMTrainingParams, "LLMSFTTrainingParamsAPI") +LLMDPOTrainingParamsAPI = create_api_base_model(LLMTrainingParams, "LLMDPOTrainingParamsAPI") +LLMORPOTrainingParamsAPI = create_api_base_model(LLMTrainingParams, "LLMORPOTrainingParamsAPI") +LLMGenericTrainingParamsAPI = create_api_base_model(LLMTrainingParams, "LLMGenericTrainingParamsAPI") +LLMRewardTrainingParamsAPI = create_api_base_model(LLMTrainingParams, "LLMRewardTrainingParamsAPI") +ImageClassificationParamsAPI = create_api_base_model(ImageClassificationParams, "ImageClassificationParamsAPI") +Seq2SeqParamsAPI = create_api_base_model(Seq2SeqParams, "Seq2SeqParamsAPI") +TabularClassificationParamsAPI = create_api_base_model(TabularParams, "TabularClassificationParamsAPI") +TabularRegressionParamsAPI = create_api_base_model(TabularParams, "TabularRegressionParamsAPI") +TextClassificationParamsAPI = create_api_base_model(TextClassificationParams, "TextClassificationParamsAPI") +TextRegressionParamsAPI = create_api_base_model(TextRegressionParams, "TextRegressionParamsAPI") +TokenClassificationParamsAPI = create_api_base_model(TokenClassificationParams, "TokenClassificationParamsAPI") +SentenceTransformersParamsAPI = create_api_base_model(SentenceTransformersParams, "SentenceTransformersParamsAPI") +ImageRegressionParamsAPI = create_api_base_model(ImageRegressionParams, "ImageRegressionParamsAPI") +VLMTrainingParamsAPI = create_api_base_model(VLMTrainingParams, "VLMTrainingParamsAPI") +ExtractiveQuestionAnsweringParamsAPI = create_api_base_model( + ExtractiveQuestionAnsweringParams, "ExtractiveQuestionAnsweringParamsAPI" +) +ObjectDetectionParamsAPI = 
create_api_base_model(ObjectDetectionParams, "ObjectDetectionParamsAPI") + + +class LLMSFTColumnMapping(BaseModel): + text_column: str + + +class LLMDPOColumnMapping(BaseModel): + text_column: str + rejected_text_column: str + prompt_text_column: str + + +class LLMORPOColumnMapping(BaseModel): + text_column: str + rejected_text_column: str + prompt_text_column: str + + +class LLMGenericColumnMapping(BaseModel): + text_column: str + + +class LLMRewardColumnMapping(BaseModel): + text_column: str + rejected_text_column: str + + +class ImageClassificationColumnMapping(BaseModel): + image_column: str + target_column: str + + +class ImageRegressionColumnMapping(BaseModel): + image_column: str + target_column: str + + +class Seq2SeqColumnMapping(BaseModel): + text_column: str + target_column: str + + +class TabularClassificationColumnMapping(BaseModel): + id_column: str + target_columns: List[str] + + +class TabularRegressionColumnMapping(BaseModel): + id_column: str + target_columns: List[str] + + +class TextClassificationColumnMapping(BaseModel): + text_column: str + target_column: str + + +class TextRegressionColumnMapping(BaseModel): + text_column: str + target_column: str + + +class TokenClassificationColumnMapping(BaseModel): + tokens_column: str + tags_column: str + + +class STPairColumnMapping(BaseModel): + sentence1_column: str + sentence2_column: str + + +class STPairClassColumnMapping(BaseModel): + sentence1_column: str + sentence2_column: str + target_column: str + + +class STPairScoreColumnMapping(BaseModel): + sentence1_column: str + sentence2_column: str + target_column: str + + +class STTripletColumnMapping(BaseModel): + sentence1_column: str + sentence2_column: str + sentence3_column: str + + +class STQAColumnMapping(BaseModel): + sentence1_column: str + sentence2_column: str + + +class VLMColumnMapping(BaseModel): + image_column: str + text_column: str + prompt_text_column: str + + +class ExtractiveQuestionAnsweringColumnMapping(BaseModel): + text_column: str + question_column: str + answer_column: str + + +class ObjectDetectionColumnMapping(BaseModel): + image_column: str + objects_column: str + + +class APICreateProjectModel(BaseModel): + """ + APICreateProjectModel is a Pydantic model that defines the schema for creating a project. + + Attributes: + project_name (str): The name of the project. + task (Literal): The type of task for the project. Supported tasks include various LLM tasks, + image classification, seq2seq, token classification, text classification, + text regression, tabular classification, tabular regression, image regression, VLM tasks, + and extractive question answering. + base_model (str): The base model to be used for the project. + hardware (Literal): The type of hardware to be used for the project. Supported hardware options + include various configurations of spaces and local. + params (Union): The training parameters for the project. The type of parameters depends on the + task selected. + username (str): The username of the person creating the project. + column_mapping (Optional[Union]): The column mapping for the project. The type of column mapping + depends on the task selected. + hub_dataset (str): The dataset to be used for the project. + train_split (str): The training split of the dataset. + valid_split (Optional[str]): The validation split of the dataset. + + Methods: + validate_column_mapping(cls, values): Validates the column mapping based on the task selected. 
+ validate_params(cls, values): Validates the training parameters based on the task selected. + """ + + project_name: str + task: Literal[ + "llm:sft", + "llm:dpo", + "llm:orpo", + "llm:generic", + "llm:reward", + "st:pair", + "st:pair_class", + "st:pair_score", + "st:triplet", + "st:qa", + "image-classification", + "seq2seq", + "token-classification", + "text-classification", + "text-regression", + "tabular-classification", + "tabular-regression", + "image-regression", + "vlm:captioning", + "vlm:vqa", + "extractive-question-answering", + "image-object-detection", + ] + base_model: str + hardware: Literal[ + "spaces-a10g-large", + "spaces-a10g-small", + "spaces-a100-large", + "spaces-t4-medium", + "spaces-t4-small", + "spaces-cpu-upgrade", + "spaces-cpu-basic", + "spaces-l4x1", + "spaces-l4x4", + "spaces-l40sx1", + "spaces-l40sx4", + "spaces-l40sx8", + "spaces-a10g-largex2", + "spaces-a10g-largex4", + # "local", + ] + params: Union[ + LLMSFTTrainingParamsAPI, + LLMDPOTrainingParamsAPI, + LLMORPOTrainingParamsAPI, + LLMGenericTrainingParamsAPI, + LLMRewardTrainingParamsAPI, + SentenceTransformersParamsAPI, + ImageClassificationParamsAPI, + Seq2SeqParamsAPI, + TabularClassificationParamsAPI, + TabularRegressionParamsAPI, + TextClassificationParamsAPI, + TextRegressionParamsAPI, + TokenClassificationParamsAPI, + ImageRegressionParamsAPI, + VLMTrainingParamsAPI, + ExtractiveQuestionAnsweringParamsAPI, + ObjectDetectionParamsAPI, + ] + username: str + column_mapping: Optional[ + Union[ + LLMSFTColumnMapping, + LLMDPOColumnMapping, + LLMORPOColumnMapping, + LLMGenericColumnMapping, + LLMRewardColumnMapping, + ImageClassificationColumnMapping, + Seq2SeqColumnMapping, + TabularClassificationColumnMapping, + TabularRegressionColumnMapping, + TextClassificationColumnMapping, + TextRegressionColumnMapping, + TokenClassificationColumnMapping, + STPairColumnMapping, + STPairClassColumnMapping, + STPairScoreColumnMapping, + STTripletColumnMapping, + STQAColumnMapping, + ImageRegressionColumnMapping, + VLMColumnMapping, + ExtractiveQuestionAnsweringColumnMapping, + ObjectDetectionColumnMapping, + ] + ] = None + hub_dataset: str + train_split: str + valid_split: Optional[str] = None + + @model_validator(mode="before") + @classmethod + def validate_column_mapping(cls, values): + if values.get("task") == "llm:sft": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for llm:sft") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for llm:sft") + values["column_mapping"] = LLMSFTColumnMapping(**values["column_mapping"]) + elif values.get("task") == "llm:dpo": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for llm:dpo") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for llm:dpo") + if not values.get("column_mapping").get("rejected_text_column"): + raise ValueError("rejected_text_column is required for llm:dpo") + if not values.get("column_mapping").get("prompt_text_column"): + raise ValueError("prompt_text_column is required for llm:dpo") + values["column_mapping"] = LLMDPOColumnMapping(**values["column_mapping"]) + elif values.get("task") == "llm:orpo": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for llm:orpo") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for llm:orpo") + if not values.get("column_mapping").get("rejected_text_column"): 
+ raise ValueError("rejected_text_column is required for llm:orpo") + if not values.get("column_mapping").get("prompt_text_column"): + raise ValueError("prompt_text_column is required for llm:orpo") + values["column_mapping"] = LLMORPOColumnMapping(**values["column_mapping"]) + elif values.get("task") == "llm:generic": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for llm:generic") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for llm:generic") + values["column_mapping"] = LLMGenericColumnMapping(**values["column_mapping"]) + elif values.get("task") == "llm:reward": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for llm:reward") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for llm:reward") + if not values.get("column_mapping").get("rejected_text_column"): + raise ValueError("rejected_text_column is required for llm:reward") + values["column_mapping"] = LLMRewardColumnMapping(**values["column_mapping"]) + elif values.get("task") == "seq2seq": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for seq2seq") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for seq2seq") + if not values.get("column_mapping").get("target_column"): + raise ValueError("target_column is required for seq2seq") + values["column_mapping"] = Seq2SeqColumnMapping(**values["column_mapping"]) + elif values.get("task") == "image-classification": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for image-classification") + if not values.get("column_mapping").get("image_column"): + raise ValueError("image_column is required for image-classification") + if not values.get("column_mapping").get("target_column"): + raise ValueError("target_column is required for image-classification") + values["column_mapping"] = ImageClassificationColumnMapping(**values["column_mapping"]) + elif values.get("task") == "tabular-classification": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for tabular-classification") + if not values.get("column_mapping").get("id_column"): + raise ValueError("id_column is required for tabular-classification") + if not values.get("column_mapping").get("target_columns"): + raise ValueError("target_columns is required for tabular-classification") + values["column_mapping"] = TabularClassificationColumnMapping(**values["column_mapping"]) + elif values.get("task") == "tabular-regression": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for tabular-regression") + if not values.get("column_mapping").get("id_column"): + raise ValueError("id_column is required for tabular-regression") + if not values.get("column_mapping").get("target_columns"): + raise ValueError("target_columns is required for tabular-regression") + values["column_mapping"] = TabularRegressionColumnMapping(**values["column_mapping"]) + elif values.get("task") == "text-classification": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for text-classification") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for text-classification") + if not values.get("column_mapping").get("target_column"): + raise ValueError("target_column is required for text-classification") + 
values["column_mapping"] = TextClassificationColumnMapping(**values["column_mapping"]) + elif values.get("task") == "text-regression": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for text-regression") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for text-regression") + if not values.get("column_mapping").get("target_column"): + raise ValueError("target_column is required for text-regression") + values["column_mapping"] = TextRegressionColumnMapping(**values["column_mapping"]) + elif values.get("task") == "token-classification": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for token-classification") + if not values.get("column_mapping").get("tokens_column"): + raise ValueError("tokens_column is required for token-classification") + if not values.get("column_mapping").get("tags_column"): + raise ValueError("tags_column is required for token-classification") + values["column_mapping"] = TokenClassificationColumnMapping(**values["column_mapping"]) + elif values.get("task") == "st:pair": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for st:pair") + if not values.get("column_mapping").get("sentence1_column"): + raise ValueError("sentence1_column is required for st:pair") + if not values.get("column_mapping").get("sentence2_column"): + raise ValueError("sentence2_column is required for st:pair") + values["column_mapping"] = STPairColumnMapping(**values["column_mapping"]) + elif values.get("task") == "st:pair_class": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for st:pair_class") + if not values.get("column_mapping").get("sentence1_column"): + raise ValueError("sentence1_column is required for st:pair_class") + if not values.get("column_mapping").get("sentence2_column"): + raise ValueError("sentence2_column is required for st:pair_class") + if not values.get("column_mapping").get("target_column"): + raise ValueError("target_column is required for st:pair_class") + values["column_mapping"] = STPairClassColumnMapping(**values["column_mapping"]) + elif values.get("task") == "st:pair_score": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for st:pair_score") + if not values.get("column_mapping").get("sentence1_column"): + raise ValueError("sentence1_column is required for st:pair_score") + if not values.get("column_mapping").get("sentence2_column"): + raise ValueError("sentence2_column is required for st:pair_score") + if not values.get("column_mapping").get("target_column"): + raise ValueError("target_column is required for st:pair_score") + values["column_mapping"] = STPairScoreColumnMapping(**values["column_mapping"]) + elif values.get("task") == "st:triplet": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for st:triplet") + if not values.get("column_mapping").get("sentence1_column"): + raise ValueError("sentence1_column is required for st:triplet") + if not values.get("column_mapping").get("sentence2_column"): + raise ValueError("sentence2_column is required for st:triplet") + if not values.get("column_mapping").get("sentence3_column"): + raise ValueError("sentence3_column is required for st:triplet") + values["column_mapping"] = STTripletColumnMapping(**values["column_mapping"]) + elif values.get("task") == "st:qa": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required 
for st:qa") + if not values.get("column_mapping").get("sentence1_column"): + raise ValueError("sentence1_column is required for st:qa") + if not values.get("column_mapping").get("sentence2_column"): + raise ValueError("sentence2_column is required for st:qa") + values["column_mapping"] = STQAColumnMapping(**values["column_mapping"]) + elif values.get("task") == "image-regression": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for image-regression") + if not values.get("column_mapping").get("image_column"): + raise ValueError("image_column is required for image-regression") + if not values.get("column_mapping").get("target_column"): + raise ValueError("target_column is required for image-regression") + values["column_mapping"] = ImageRegressionColumnMapping(**values["column_mapping"]) + elif values.get("task") == "vlm:captioning": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for vlm:captioning") + if not values.get("column_mapping").get("image_column"): + raise ValueError("image_column is required for vlm:captioning") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for vlm:captioning") + if not values.get("column_mapping").get("prompt_text_column"): + raise ValueError("prompt_text_column is required for vlm:captioning") + values["column_mapping"] = VLMColumnMapping(**values["column_mapping"]) + elif values.get("task") == "vlm:vqa": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for vlm:vqa") + if not values.get("column_mapping").get("image_column"): + raise ValueError("image_column is required for vlm:vqa") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for vlm:vqa") + if not values.get("column_mapping").get("prompt_text_column"): + raise ValueError("prompt_text_column is required for vlm:vqa") + values["column_mapping"] = VLMColumnMapping(**values["column_mapping"]) + elif values.get("task") == "extractive-question-answering": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for extractive-question-answering") + if not values.get("column_mapping").get("text_column"): + raise ValueError("text_column is required for extractive-question-answering") + if not values.get("column_mapping").get("question_column"): + raise ValueError("question_column is required for extractive-question-answering") + if not values.get("column_mapping").get("answer_column"): + raise ValueError("answer_column is required for extractive-question-answering") + values["column_mapping"] = ExtractiveQuestionAnsweringColumnMapping(**values["column_mapping"]) + elif values.get("task") == "image-object-detection": + if not values.get("column_mapping"): + raise ValueError("column_mapping is required for image-object-detection") + if not values.get("column_mapping").get("image_column"): + raise ValueError("image_column is required for image-object-detection") + if not values.get("column_mapping").get("objects_column"): + raise ValueError("objects_column is required for image-object-detection") + values["column_mapping"] = ObjectDetectionColumnMapping(**values["column_mapping"]) + return values + + @model_validator(mode="before") + @classmethod + def validate_params(cls, values): + if values.get("task") == "llm:sft": + values["params"] = LLMSFTTrainingParamsAPI(**values["params"]) + elif values.get("task") == "llm:dpo": + values["params"] = 
LLMDPOTrainingParamsAPI(**values["params"]) + elif values.get("task") == "llm:orpo": + values["params"] = LLMORPOTrainingParamsAPI(**values["params"]) + elif values.get("task") == "llm:generic": + values["params"] = LLMGenericTrainingParamsAPI(**values["params"]) + elif values.get("task") == "llm:reward": + values["params"] = LLMRewardTrainingParamsAPI(**values["params"]) + elif values.get("task") == "seq2seq": + values["params"] = Seq2SeqParamsAPI(**values["params"]) + elif values.get("task") == "image-classification": + values["params"] = ImageClassificationParamsAPI(**values["params"]) + elif values.get("task") == "tabular-classification": + values["params"] = TabularClassificationParamsAPI(**values["params"]) + elif values.get("task") == "tabular-regression": + values["params"] = TabularRegressionParamsAPI(**values["params"]) + elif values.get("task") == "text-classification": + values["params"] = TextClassificationParamsAPI(**values["params"]) + elif values.get("task") == "text-regression": + values["params"] = TextRegressionParamsAPI(**values["params"]) + elif values.get("task") == "token-classification": + values["params"] = TokenClassificationParamsAPI(**values["params"]) + elif values.get("task").startswith("st:"): + values["params"] = SentenceTransformersParamsAPI(**values["params"]) + elif values.get("task") == "image-regression": + values["params"] = ImageRegressionParamsAPI(**values["params"]) + elif values.get("task").startswith("vlm:"): + values["params"] = VLMTrainingParamsAPI(**values["params"]) + elif values.get("task") == "extractive-question-answering": + values["params"] = ExtractiveQuestionAnsweringParamsAPI(**values["params"]) + elif values.get("task") == "image-object-detection": + values["params"] = ObjectDetectionParamsAPI(**values["params"]) + return values + + +class JobIDModel(BaseModel): + jid: str + + +api_router = APIRouter() + + +def api_auth(request: Request): + """ + Authenticates the API request using a Bearer token. + + Args: + request (Request): The incoming HTTP request object. + + Returns: + str: The verified Bearer token if authentication is successful. + + Raises: + HTTPException: If the token is invalid, expired, or missing. + """ + authorization = request.headers.get("Authorization") + if authorization: + schema, _, token = authorization.partition(" ") + if schema.lower() == "bearer": + token = token.strip() + try: + _ = token_verification(token=token) + return token + except Exception as e: + logger.error(f"Failed to verify token: {e}") + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired token: Bearer", + ) + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired token", + ) + + +@api_router.post("/create_project", response_class=JSONResponse) +async def api_create_project(project: APICreateProjectModel, token: bool = Depends(api_auth)): + """ + Asynchronously creates a new project based on the provided parameters. + + Args: + project (APICreateProjectModel): The model containing the project details and parameters. + token (bool, optional): The authentication token. Defaults to Depends(api_auth). + + Returns: + dict: A dictionary containing a success message, the job ID of the created project, and a success status. + + Raises: + HTTPException: If there is an error during project creation. + + Notes: + - The function determines the hardware type based on the project hardware attribute. + - It logs the provided parameters and column mapping. 
+ - It sets the appropriate parameters based on the task type. + - It updates the parameters with the provided ones and creates an AppParams instance. + - The function then creates an AutoTrainProject instance and initiates the project creation process. + """ + provided_params = project.params.model_dump() + if project.hardware == "local": + hardware = "local-ui" # local-ui has wait=False + else: + hardware = project.hardware + + logger.info(provided_params) + logger.info(project.column_mapping) + + task = project.task + if task.startswith("llm"): + params = PARAMS["llm"] + trainer = task.split(":")[1] + params.update({"trainer": trainer}) + elif task.startswith("st:"): + params = PARAMS["st"] + trainer = task.split(":")[1] + params.update({"trainer": trainer}) + elif task.startswith("vlm:"): + params = PARAMS["vlm"] + trainer = task.split(":")[1] + params.update({"trainer": trainer}) + elif task.startswith("tabular"): + params = PARAMS["tabular"] + else: + params = PARAMS[task] + + params.update(provided_params) + + app_params = AppParams( + job_params_json=json.dumps(params), + token=token, + project_name=project.project_name, + username=project.username, + task=task, + data_path=project.hub_dataset, + base_model=project.base_model, + column_mapping=project.column_mapping.model_dump() if project.column_mapping else None, + using_hub_dataset=True, + train_split=project.train_split, + valid_split=project.valid_split, + api=True, + ) + params = app_params.munge() + project = AutoTrainProject(params=params, backend=hardware) + job_id = project.create() + return {"message": "Project created", "job_id": job_id, "success": True} + + +@api_router.get("/version", response_class=JSONResponse) +async def api_version(): + """ + Returns the current version of the API. + + This asynchronous function retrieves the version of the API from the + __version__ variable and returns it in a dictionary. + + Returns: + dict: A dictionary containing the API version. + """ + return {"version": __version__} + + +@api_router.post("/stop_training", response_class=JSONResponse) +async def api_stop_training(job: JobIDModel, token: bool = Depends(api_auth)): + """ + Stops the training job with the given job ID. + + This asynchronous function pauses the training job identified by the provided job ID. + It uses the Hugging Face API to pause the space associated with the job. + + Args: + job (JobIDModel): The job model containing the job ID. + token (bool, optional): The authentication token, provided by dependency injection. + + Returns: + dict: A dictionary containing a message and a success flag. If the training job + was successfully stopped, the message indicates success and the success flag is True. + If there was an error, the message contains the error details and the success flag is False. + + Raises: + Exception: If there is an error while attempting to stop the training job. + """ + hf_api = HfApi(token=token) + job_id = job.jid + try: + hf_api.pause_space(repo_id=job_id) + except Exception as e: + logger.error(f"Failed to stop training: {e}") + return {"message": f"Failed to stop training for {job_id}: {e}", "success": False} + return {"message": f"Training stopped for {job_id}", "success": True} + + +@api_router.post("/logs", response_class=JSONResponse) +async def api_logs(job: JobIDModel, token: bool = Depends(api_auth)): + """ + Fetch logs for a given job. + + This endpoint retrieves logs for a specified job by its job ID. 
It first obtains a JWT token + to authenticate the request and then fetches the logs from the Hugging Face API. + + Args: + job (JobIDModel): The job model containing the job ID. + token (bool, optional): Dependency injection for API authentication. Defaults to Depends(api_auth). + + Returns: + JSONResponse: A JSON response containing the logs, success status, and a message. + + Raises: + Exception: If there is an error fetching the logs, the exception message is returned in the response. + """ + job_id = job.jid + jwt_url = f"{constants.ENDPOINT}/api/spaces/{job_id}/jwt" + response = get_session().get(jwt_url, headers=build_hf_headers(token=token)) + hf_raise_for_status(response) + jwt_token = response.json()["token"] # works for 24h (see "exp" field) + + # fetch the logs + logs_url = f"https://api.hf.space/v1/{job_id}/logs/run" + + _logs = [] + try: + with get_session().get( + logs_url, headers=build_hf_headers(token=jwt_token), stream=True, timeout=3 + ) as response: + hf_raise_for_status(response) + for line in response.iter_lines(): + if not line.startswith(b"data: "): + continue + line_data = line[len(b"data: ") :] + try: + event = json.loads(line_data.decode()) + except json.JSONDecodeError: + continue # ignore (for example, empty lines or `b': keep-alive'`) + _logs.append((event["timestamp"], event["data"])) + + _logs = "\n".join([f"{timestamp}: {data}" for timestamp, data in _logs]) + return {"logs": _logs, "success": True, "message": "Logs fetched successfully"} + except Exception as e: + if "Read timed out" in str(e): + _logs = "\n".join([f"{timestamp}: {data}" for timestamp, data in _logs]) + return {"logs": _logs, "success": True, "message": "Logs fetched successfully"} + return {"logs": str(e), "success": False, "message": "Failed to fetch logs"} diff --git a/src/autotrain/app/app.py b/src/autotrain/app/app.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e155545fd65331fa3d63f79f1afba09eae7c50 --- /dev/null +++ b/src/autotrain/app/app.py @@ -0,0 +1,43 @@ +import os + +from fastapi import FastAPI, Request +from fastapi.responses import RedirectResponse +from fastapi.staticfiles import StaticFiles + +from autotrain import __version__, logger +from autotrain.app.api_routes import api_router +from autotrain.app.oauth import attach_oauth +from autotrain.app.ui_routes import ui_router + + +logger.info("Starting AutoTrain...") +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +app = FastAPI() +if "SPACE_ID" in os.environ: + attach_oauth(app) + +app.include_router(ui_router, prefix="/ui", include_in_schema=False) +app.include_router(api_router, prefix="/api") +static_path = os.path.join(BASE_DIR, "static") +app.mount("/static", StaticFiles(directory=static_path), name="static") +logger.info(f"AutoTrain version: {__version__}") +logger.info("AutoTrain started successfully") + + +@app.get("/") +async def forward_to_ui(request: Request): + """ + Forwards the incoming request to the UI endpoint. + + Args: + request (Request): The incoming HTTP request. + + Returns: + RedirectResponse: A response object that redirects to the UI endpoint, + including any query parameters from the original request. 
+ """ + query_params = request.query_params + url = "/ui/" + if query_params: + url += f"?{query_params}" + return RedirectResponse(url=url) diff --git a/src/autotrain/app/colab.py b/src/autotrain/app/colab.py new file mode 100644 index 0000000000000000000000000000000000000000..2193ba048f5b176789eb763d93679cc831cdff69 --- /dev/null +++ b/src/autotrain/app/colab.py @@ -0,0 +1,402 @@ +import json +import os +import random +import string +import subprocess + +import ipywidgets as widgets +import yaml + +from autotrain.app.models import fetch_models +from autotrain.app.params import get_task_params + + +def generate_random_string(): + prefix = "autotrain" + part1 = "".join(random.choices(string.ascii_lowercase + string.digits, k=5)) + part2 = "".join(random.choices(string.ascii_lowercase + string.digits, k=5)) + return f"{prefix}-{part1}-{part2}" + + +def colab_app(): + if not os.path.exists("data"): + os.makedirs("data") + MODEL_CHOICES = fetch_models() + TASK_NAMES = [ + "LLM SFT", + "LLM ORPO", + "LLM Generic", + "LLM DPO", + "LLM Reward", + "Text Classification", + "Text Regression", + "Sequence to Sequence", + "Token Classification", + "Image Classification", + "Image Regression", + "Object Detection", + "Tabular Classification", + "Tabular Regression", + "ST Pair", + "ST Pair Classification", + "ST Pair Scoring", + "ST Triplet", + "ST Question Answering", + ] + + TASK_MAP = { + "LLM SFT": "llm:sft", + "LLM ORPO": "llm:orpo", + "LLM Generic": "llm:generic", + "LLM DPO": "llm:dpo", + "LLM Reward": "llm:reward", + "Text Classification": "text-classification", + "Text Regression": "text-regression", + "Sequence to Sequence": "seq2seq", + "Token Classification": "token-classification", + "Image Classification": "image-classification", + "Image Regression": "image-regression", + "Object Detection": "image-object-detection", + "Tabular Classification": "tabular:classification", + "Tabular Regression": "tabular:regression", + "ST Pair": "st:pair", + "ST Pair Classification": "st:pair_class", + "ST Pair Scoring": "st:pair_score", + "ST Triplet": "st:triplet", + "ST Question Answering": "st:qa", + } + + def _get_params(task, param_type): + _p = get_task_params(task, param_type=param_type) + _p["push_to_hub"] = True + _p = json.dumps(_p, indent=4) + return _p + + hf_token_label = widgets.HTML("
Hugging Face Write Token") + hf_token = widgets.Password( + value="", description="", disabled=False, layout=widgets.Layout(margin="0 0 0 0", width="200px") + ) + + hf_user_label = widgets.HTML("Hugging Face Username") + hf_user = widgets.Text( + value="", description="", disabled=False, layout=widgets.Layout(margin="0 0 0 0", width="200px") + ) + + base_model_label = widgets.HTML("Base Model") + base_model = widgets.Text(value=MODEL_CHOICES["llm"][0], disabled=False, layout=widgets.Layout(width="420px")) + + project_name_label = widgets.HTML("Project Name") + project_name = widgets.Text( + value=generate_random_string(), + description="", + disabled=False, + layout=widgets.Layout(margin="0 0 0 0", width="200px"), + ) + + task_dropdown_label = widgets.HTML("Task") + task_dropdown = widgets.Dropdown( + options=TASK_NAMES, + value=TASK_NAMES[0], + description="", + disabled=False, + layout=widgets.Layout(margin="0 0 0 0", width="200px"), + ) + + dataset_path_label = widgets.HTML("Path") + dataset_path = widgets.Text( + value="", description="", disabled=False, layout=widgets.Layout(margin="0 0 0 0", width="200px") + ) + + train_split_label = widgets.HTML("Train Split") + train_split = widgets.Text( + value="", description="", disabled=False, layout=widgets.Layout(margin="0 0 0 0", width="200px") + ) + + valid_split_label = widgets.HTML("Valid Split") + valid_split = widgets.Text( + value="", + placeholder="optional", + description="", + disabled=False, + layout=widgets.Layout(margin="0 0 0 0", width="200px"), + ) + + dataset_source_dropdown_label = widgets.HTML("Source") + dataset_source_dropdown = widgets.Dropdown( + options=["Hugging Face Hub", "Local"], + value="Hugging Face Hub", + description="", + disabled=False, + layout=widgets.Layout(margin="0 0 0 0", width="200px"), + ) + + col_mapping_label = widgets.HTML("Column Mapping") + col_mapping = widgets.Text( + value='{"text": "text"}', + placeholder="", + description="", + disabled=False, + layout=widgets.Layout(margin="0 0 0 0", width="420px"), + ) + + parameters_dropdown = widgets.Dropdown( + options=["Basic", "Full"], value="Basic", description="", disabled=False, layout=widgets.Layout(width="400px") + ) + + parameters = widgets.Textarea( + value=_get_params("llm:sft", "basic"), + description="", + disabled=False, + layout=widgets.Layout(height="400px", width="400px"), + ) + + start_training_button = widgets.Button( + description="Start Training", + layout=widgets.Layout(width="1000px"), + disabled=False, + button_style="", # 'success', 'info', 'warning', 'danger' or '' + tooltip="Click to start training", + icon="check", # (FontAwesome names without the `fa-` prefix) + ) + + spacer = widgets.Box(layout=widgets.Layout(width="20px")) + title_hbox0 = widgets.HTML("Hugging Face Credentials") + title_hbox1 = widgets.HTML("Project Details") + title_hbox2 = widgets.HTML("Dataset Details") + title_hbox3 = widgets.HTML("Parameters
") + + hbox0 = widgets.HBox( + [ + widgets.VBox([hf_token_label, hf_token]), + spacer, + widgets.VBox([hf_user_label, hf_user]), + ] + ) + hbox1 = widgets.HBox( + [ + widgets.VBox([project_name_label, project_name]), + spacer, + widgets.VBox([task_dropdown_label, task_dropdown]), + ] + ) + hbox2_1 = widgets.HBox( + [ + widgets.VBox([dataset_source_dropdown_label, dataset_source_dropdown]), + spacer, + widgets.VBox([dataset_path_label, dataset_path]), + ] + ) + hbox2_2 = widgets.HBox( + [ + widgets.VBox([train_split_label, train_split]), + spacer, + widgets.VBox([valid_split_label, valid_split]), + ] + ) + hbox2_3 = widgets.HBox( + [ + widgets.VBox([col_mapping_label, col_mapping]), + ] + ) + hbox3 = widgets.VBox([parameters_dropdown, parameters]) + + vbox0 = widgets.VBox([title_hbox0, hbox0]) + vbox1 = widgets.VBox([title_hbox1, base_model_label, base_model, hbox1]) + vbox2 = widgets.VBox([title_hbox2, hbox2_1, hbox2_2, hbox2_3]) + vbox3 = widgets.VBox([title_hbox3, hbox3]) + + left_column = widgets.VBox([vbox0, vbox1, vbox2], layout=widgets.Layout(width="500px")) + right_column = widgets.VBox([vbox3], layout=widgets.Layout(width="500px", align_items="flex-end")) + + separator = widgets.HTML('
') + + _main_layout = widgets.HBox([left_column, separator, right_column]) + main_layout = widgets.VBox([_main_layout, start_training_button]) + + def on_dataset_change(change): + if change["new"] == "Local": + dataset_path.value = "data/" + train_split.value = "train" + valid_split.value = "" + else: + dataset_path.value = "" + train_split.value = "" + valid_split.value = "" + + def update_parameters(*args): + task = TASK_MAP[task_dropdown.value] + param_type = parameters_dropdown.value.lower() + parameters.value = _get_params(task, param_type) + + def update_col_mapping(*args): + task = TASK_MAP[task_dropdown.value] + if task in ["llm:sft", "llm:generic"]: + col_mapping.value = '{"text": "text"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = True + elif task in ["llm:dpo", "llm:orpo"]: + col_mapping.value = '{"prompt": "prompt", "text": "text", "rejected_text": "rejected_text"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = True + elif task == "llm:reward": + col_mapping.value = '{"text": "text", "rejected_text": "rejected_text"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = True + elif task == "text-classification": + col_mapping.value = '{"text": "text", "label": "target"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "text-regression": + col_mapping.value = '{"text": "text", "label": "target"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "token-classification": + col_mapping.value = '{"text": "tokens", "label": "tags"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "seq2seq": + col_mapping.value = '{"text": "text", "label": "target"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "image-classification": + col_mapping.value = '{"image": "image", "label": "label"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "image-regression": + col_mapping.value = '{"image": "image", "label": "target"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "image-object-detection": + col_mapping.value = '{"image": "image", "objects": "objects"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "tabular:classification": + col_mapping.value = '{"id": "id", "label": ["target"]}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "tabular:regression": + col_mapping.value = '{"id": "id", "label": ["target"]}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "st:pair": + col_mapping.value = '{"sentence1": "anchor", "sentence2": "positive"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "st:pair_class": + col_mapping.value = '{"sentence1": "premise", "sentence2": "hypothesis", "target": "label"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "st:pair_score": + col_mapping.value = '{"sentence1": "sentence1", "sentence2": "sentence2", "target": "score"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "st:triplet": + col_mapping.value = '{"sentence1": "anchor", "sentence2": "positive", "sentence3": "negative"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + elif task == "st:qa": + col_mapping.value = '{"sentence1": "query", 
"sentence1": "answer"}' + dataset_source_dropdown.disabled = False + valid_split.disabled = False + else: + col_mapping.value = "Enter column mapping..." + + def update_base_model(*args): + if TASK_MAP[task_dropdown.value] == "text-classification": + base_model.value = MODEL_CHOICES["text-classification"][0] + elif TASK_MAP[task_dropdown.value].startswith("llm"): + base_model.value = MODEL_CHOICES["llm"][0] + elif TASK_MAP[task_dropdown.value] == "image-classification": + base_model.value = MODEL_CHOICES["image-classification"][0] + elif TASK_MAP[task_dropdown.value] == "seq2seq": + base_model.value = MODEL_CHOICES["seq2seq"][0] + elif TASK_MAP[task_dropdown.value] == "tabular:classification": + base_model.value = MODEL_CHOICES["tabular-classification"][0] + elif TASK_MAP[task_dropdown.value] == "tabular:regression": + base_model.value = MODEL_CHOICES["tabular-regression"][0] + elif TASK_MAP[task_dropdown.value] == "token-classification": + base_model.value = MODEL_CHOICES["token-classification"][0] + elif TASK_MAP[task_dropdown.value] == "text-regression": + base_model.value = MODEL_CHOICES["text-regression"][0] + elif TASK_MAP[task_dropdown.value] == "image-object-detection": + base_model.value = MODEL_CHOICES["image-object-detection"][0] + elif TASK_MAP[task_dropdown.value].startswith("st:"): + base_model.value = MODEL_CHOICES["sentence-transformers"][0] + else: + base_model.value = "Enter base model..." + + def start_training(b): + start_training_button.disabled = True + try: + print("Training is starting... Please wait!") + os.environ["HF_USERNAME"] = hf_user.value + os.environ["HF_TOKEN"] = hf_token.value + train_split_value = train_split.value.strip() if train_split.value.strip() != "" else None + valid_split_value = valid_split.value.strip() if valid_split.value.strip() != "" else None + params_val = json.loads(parameters.value) + if task_dropdown.value.startswith("llm") or task_dropdown.value.startswith("sentence-transformers"): + params_val["trainer"] = task_dropdown.value.split(":")[1] + # params_val = {k: v for k, v in params_val.items() if k != "trainer"} + + chat_template = params_val.get("chat_template") + if chat_template is not None: + params_val = {k: v for k, v in params_val.items() if k != "chat_template"} + + push_to_hub = params_val.get("push_to_hub", True) + if "push_to_hub" in params_val: + params_val = {k: v for k, v in params_val.items() if k != "push_to_hub"} + + config = { + "task": TASK_MAP[task_dropdown.value].split(":")[0], + "base_model": base_model.value, + "project_name": project_name.value, + "log": "tensorboard", + "backend": "local", + "data": { + "path": dataset_path.value, + "train_split": train_split_value, + "valid_split": valid_split_value, + "column_mapping": json.loads(col_mapping.value), + }, + "params": params_val, + "hub": { + "username": "${{HF_USERNAME}}", + "token": "${{HF_TOKEN}}", + "push_to_hub": push_to_hub, + }, + } + if TASK_MAP[task_dropdown.value].startswith("llm"): + config["data"]["chat_template"] = chat_template + if config["data"]["chat_template"] == "none": + config["data"]["chat_template"] = None + + with open("config.yml", "w") as f: + yaml.dump(config, f) + + cmd = "autotrain --config config.yml" + process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + while True: + output = process.stdout.readline() + if output == "" and process.poll() is not None: + break + if output: + print(output.strip()) + + poll_res = process.poll() + if poll_res != 0: + 
start_training_button.disabled = False + raise Exception(f"Training failed with exit code: {poll_res}") + print("Training completed successfully!") + start_training_button.disabled = False + except Exception as e: + print("An error occurred while starting training!") + print(f"Error: {e}") + start_training_button.disabled = False + + start_training_button.on_click(start_training) + dataset_source_dropdown.observe(on_dataset_change, names="value") + task_dropdown.observe(update_col_mapping, names="value") + task_dropdown.observe(update_parameters, names="value") + task_dropdown.observe(update_base_model, names="value") + parameters_dropdown.observe(update_parameters, names="value") + return main_layout diff --git a/src/autotrain/app/db.py b/src/autotrain/app/db.py new file mode 100644 index 0000000000000000000000000000000000000000..793327353723e5f96b5f1fc27dcff324c9a94089 --- /dev/null +++ b/src/autotrain/app/db.py @@ -0,0 +1,62 @@ +import sqlite3 + + +class AutoTrainDB: + """ + A class to manage job records in a SQLite database. + + Attributes: + ----------- + db_path : str + The path to the SQLite database file. + conn : sqlite3.Connection + The SQLite database connection object. + c : sqlite3.Cursor + The SQLite database cursor object. + + Methods: + -------- + __init__(db_path): + Initializes the database connection and creates the jobs table if it does not exist. + + create_jobs_table(): + Creates the jobs table in the database if it does not exist. + + add_job(pid): + Adds a new job with the given process ID (pid) to the jobs table. + + get_running_jobs(): + Retrieves a list of all running job process IDs (pids) from the jobs table. + + delete_job(pid): + Deletes the job with the given process ID (pid) from the jobs table. + """ + + def __init__(self, db_path): + self.db_path = db_path + self.conn = sqlite3.connect(db_path) + self.c = self.conn.cursor() + self.create_jobs_table() + + def create_jobs_table(self): + self.c.execute( + """CREATE TABLE IF NOT EXISTS jobs + (id INTEGER PRIMARY KEY, pid INTEGER)""" + ) + self.conn.commit() + + def add_job(self, pid): + sql = f"INSERT INTO jobs (pid) VALUES ({pid})" + self.c.execute(sql) + self.conn.commit() + + def get_running_jobs(self): + self.c.execute("""SELECT pid FROM jobs""") + running_pids = self.c.fetchall() + running_pids = [pid[0] for pid in running_pids] + return running_pids + + def delete_job(self, pid): + sql = f"DELETE FROM jobs WHERE pid={pid}" + self.c.execute(sql) + self.conn.commit() diff --git a/src/autotrain/app/models.py b/src/autotrain/app/models.py new file mode 100644 index 0000000000000000000000000000000000000000..1d1f65811325bea94b2af799d559e33324bec7a6 --- /dev/null +++ b/src/autotrain/app/models.py @@ -0,0 +1,374 @@ +import collections + +from huggingface_hub import list_models + + +def get_sorted_models(hub_models): + """ + Filters and sorts a list of models based on their download count. + + Args: + hub_models (list): A list of model objects. Each model object must have the attributes 'id', 'downloads', and 'private'. + + Returns: + list: A list of model IDs sorted by their download count in descending order. Only includes models that are not private. 
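Example (illustrative):
    >>> from types import SimpleNamespace
    >>> models = [
    ...     SimpleNamespace(id="org/a", downloads=10, private=False),
    ...     SimpleNamespace(id="org/b", downloads=25, private=False),
    ...     SimpleNamespace(id="org/c", downloads=99, private=True),
    ... ]
    >>> get_sorted_models(models)
    ['org/b', 'org/a']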
+ """ + hub_models = [{"id": m.id, "downloads": m.downloads} for m in hub_models if m.private is False] + hub_models = sorted(hub_models, key=lambda x: x["downloads"], reverse=True) + hub_models = [m["id"] for m in hub_models] + return hub_models + + +def _fetch_text_classification_models(): + """ + Fetches and sorts text classification models from the Hugging Face model hub. + + This function retrieves models for the tasks "fill-mask" and "text-classification" + from the Hugging Face model hub, sorts them by the number of downloads, and combines + them into a single list. Additionally, it fetches trending models based on the number + of likes in the past 7 days, sorts them, and places them at the beginning of the list + if they are not already included. + + Returns: + list: A sorted list of model identifiers from the Hugging Face model hub. + """ + hub_models1 = list( + list_models( + task="fill-mask", + library="transformers", + sort="downloads", + direction=-1, + limit=100, + full=False, + ) + ) + hub_models2 = list( + list_models( + task="text-classification", + library="transformers", + sort="downloads", + direction=-1, + limit=100, + full=False, + ) + ) + hub_models = list(hub_models1) + list(hub_models2) + hub_models = get_sorted_models(hub_models) + + trending_models = list( + list_models( + task="fill-mask", + library="transformers", + sort="likes7d", + direction=-1, + limit=30, + full=False, + ) + ) + if len(trending_models) > 0: + trending_models = get_sorted_models(trending_models) + hub_models = [m for m in hub_models if m not in trending_models] + hub_models = trending_models + hub_models + + return hub_models + + +def _fetch_llm_models(): + hub_models = list( + list_models( + task="text-generation", + library="transformers", + sort="downloads", + direction=-1, + limit=100, + full=False, + ) + ) + hub_models = get_sorted_models(hub_models) + trending_models = list( + list_models( + task="text-generation", + library="transformers", + sort="likes7d", + direction=-1, + limit=30, + full=False, + ) + ) + if len(trending_models) > 0: + trending_models = get_sorted_models(trending_models) + hub_models = [m for m in hub_models if m not in trending_models] + hub_models = trending_models + hub_models + return hub_models + + +def _fetch_image_classification_models(): + hub_models = list( + list_models( + task="image-classification", + library="transformers", + sort="downloads", + direction=-1, + limit=100, + full=False, + ) + ) + hub_models = get_sorted_models(hub_models) + + trending_models = list( + list_models( + task="image-classification", + library="transformers", + sort="likes7d", + direction=-1, + limit=30, + full=False, + ) + ) + if len(trending_models) > 0: + trending_models = get_sorted_models(trending_models) + hub_models = [m for m in hub_models if m not in trending_models] + hub_models = trending_models + hub_models + + return hub_models + + +def _fetch_image_object_detection_models(): + hub_models = list( + list_models( + task="object-detection", + library="transformers", + sort="downloads", + direction=-1, + limit=100, + full=False, + pipeline_tag="object-detection", + ) + ) + hub_models = get_sorted_models(hub_models) + + trending_models = list( + list_models( + task="object-detection", + library="transformers", + sort="likes7d", + direction=-1, + limit=30, + full=False, + pipeline_tag="object-detection", + ) + ) + if len(trending_models) > 0: + trending_models = get_sorted_models(trending_models) + hub_models = [m for m in hub_models if m not in trending_models] + 
hub_models = trending_models + hub_models + + return hub_models + + +def _fetch_seq2seq_models(): + hub_models = list( + list_models( + task="text2text-generation", + library="transformers", + sort="downloads", + direction=-1, + limit=100, + full=False, + ) + ) + hub_models = get_sorted_models(hub_models) + trending_models = list( + list_models( + task="text2text-generation", + library="transformers", + sort="likes7d", + direction=-1, + limit=30, + full=False, + ) + ) + if len(trending_models) > 0: + trending_models = get_sorted_models(trending_models) + hub_models = [m for m in hub_models if m not in trending_models] + hub_models = trending_models + hub_models + return hub_models + + +def _fetch_token_classification_models(): + hub_models1 = list( + list_models( + task="fill-mask", + library="transformers", + sort="downloads", + direction=-1, + limit=100, + full=False, + ) + ) + hub_models2 = list( + list_models( + task="token-classification", + library="transformers", + sort="downloads", + direction=-1, + limit=100, + full=False, + ) + ) + hub_models = list(hub_models1) + list(hub_models2) + hub_models = get_sorted_models(hub_models) + + trending_models = list( + list_models( + task="fill-mask", + library="transformers", + sort="likes7d", + direction=-1, + limit=30, + full=False, + ) + ) + if len(trending_models) > 0: + trending_models = get_sorted_models(trending_models) + hub_models = [m for m in hub_models if m not in trending_models] + hub_models = trending_models + hub_models + + return hub_models + + +def _fetch_st_models(): + hub_models1 = list( + list_models( + task="sentence-similarity", + library="sentence-transformers", + sort="downloads", + direction=-1, + limit=30, + full=False, + ) + ) + hub_models2 = list( + list_models( + task="fill-mask", + library="transformers", + sort="downloads", + direction=-1, + limit=30, + full=False, + ) + ) + + hub_models = list(hub_models1) + list(hub_models2) + hub_models = get_sorted_models(hub_models) + + trending_models = list( + list_models( + task="sentence-similarity", + library="sentence-transformers", + sort="likes7d", + direction=-1, + limit=30, + full=False, + ) + ) + if len(trending_models) > 0: + trending_models = get_sorted_models(trending_models) + hub_models = [m for m in hub_models if m not in trending_models] + hub_models = trending_models + hub_models + return hub_models + + +def _fetch_vlm_models(): + hub_models1 = list( + list_models( + task="image-text-to-text", + sort="downloads", + direction=-1, + limit=100, + full=False, + filter=["paligemma"], + ) + ) + # hub_models2 = list( + # list_models( + # task="image-text-to-text", + # sort="downloads", + # direction=-1, + # limit=100, + # full=False, + # filter=["florence2"], + # ) + # ) + hub_models2 = [] + hub_models = list(hub_models1) + list(hub_models2) + hub_models = get_sorted_models(hub_models) + + trending_models1 = list( + list_models( + task="image-text-to-text", + sort="likes7d", + direction=-1, + limit=30, + full=False, + filter=["paligemma"], + ) + ) + # trending_models2 = list( + # list_models( + # task="image-text-to-text", + # sort="likes7d", + # direction=-1, + # limit=30, + # full=False, + # filter=["florence2"], + # ) + # ) + trending_models2 = [] + trending_models = list(trending_models1) + list(trending_models2) + if len(trending_models) > 0: + trending_models = get_sorted_models(trending_models) + hub_models = [m for m in hub_models if m not in trending_models] + hub_models = trending_models + hub_models + return hub_models + + +def fetch_models(): + _mc 
= collections.defaultdict(list) + _mc["text-classification"] = _fetch_text_classification_models() + _mc["llm"] = _fetch_llm_models() + _mc["image-classification"] = _fetch_image_classification_models() + _mc["image-regression"] = _fetch_image_classification_models() + _mc["seq2seq"] = _fetch_seq2seq_models() + _mc["token-classification"] = _fetch_token_classification_models() + _mc["text-regression"] = _fetch_text_classification_models() + _mc["image-object-detection"] = _fetch_image_object_detection_models() + _mc["sentence-transformers"] = _fetch_st_models() + _mc["vlm"] = _fetch_vlm_models() + _mc["extractive-qa"] = _fetch_text_classification_models() + + # tabular-classification + _mc["tabular-classification"] = [ + "xgboost", + "random_forest", + "ridge", + "logistic_regression", + "svm", + "extra_trees", + "adaboost", + "decision_tree", + "knn", + ] + + # tabular-regression + _mc["tabular-regression"] = [ + "xgboost", + "random_forest", + "ridge", + "svm", + "extra_trees", + "adaboost", + "decision_tree", + "knn", + ] + return _mc diff --git a/src/autotrain/app/oauth.py b/src/autotrain/app/oauth.py new file mode 100644 index 0000000000000000000000000000000000000000..c03266e35180c7bce556ebdd016176e55c34aafa --- /dev/null +++ b/src/autotrain/app/oauth.py @@ -0,0 +1,172 @@ +"""OAuth support for AutoTrain. +Taken from: https://github.com/gradio-app/gradio/blob/main/gradio/oauth.py +""" + +from __future__ import annotations + +import hashlib +import os +import urllib.parse + +import fastapi +from authlib.integrations.base_client.errors import MismatchingStateError +from authlib.integrations.starlette_client import OAuth +from fastapi.responses import RedirectResponse +from starlette.middleware.sessions import SessionMiddleware + + +OAUTH_CLIENT_ID = os.environ.get("OAUTH_CLIENT_ID") +OAUTH_CLIENT_SECRET = os.environ.get("OAUTH_CLIENT_SECRET") +OAUTH_SCOPES = os.environ.get("OAUTH_SCOPES") +OPENID_PROVIDER_URL = os.environ.get("OPENID_PROVIDER_URL") + + +def attach_oauth(app: fastapi.FastAPI): + """ + Attaches OAuth functionality to a FastAPI application by adding OAuth routes and session middleware. + + Args: + app (fastapi.FastAPI): The FastAPI application instance to which OAuth routes and middleware will be attached. + + Notes: + - The session middleware requires a secret key to sign the cookies. A hash of the OAuth secret key is used to + make it unique to the Space and to ensure it is updated if the OAuth configuration changes. + - The session secret includes a version identifier ("-autotrain-v2") to allow for future changes in the session + cookie format. If the format changes, the version can be bumped to invalidate old cookies and prevent HTTP 500 errors. + """ + _add_oauth_routes(app) + # Session Middleware requires a secret key to sign the cookies. Let's use a hash + # of the OAuth secret key to make it unique to the Space + updated in case OAuth + # config gets updated. + session_secret = OAUTH_CLIENT_SECRET + "-autotrain-v2" + # ^ if we change the session cookie format in the future, we can bump the version of the session secret to make + # sure cookies are invalidated. Otherwise some users with an old cookie format might get a HTTP 500 error. + app.add_middleware( + SessionMiddleware, + secret_key=hashlib.sha256(session_secret.encode()).hexdigest(), + https_only=True, + same_site="none", + ) + + +def _add_oauth_routes(app: fastapi.FastAPI) -> None: + """ + Add OAuth routes to the FastAPI app (login, callback handler, and logout). 
+ + This function performs the following tasks: + 1. Checks for required environment variables and raises a ValueError if any are missing. + 2. Registers the OAuth server with the provided client ID, client secret, scopes, and OpenID provider URL. + 3. Defines the following OAuth routes: + - `/login/huggingface`: Redirects to the Hugging Face OAuth page. + - `/auth`: Handles the OAuth callback and manages the OAuth state. + + Args: + app (fastapi.FastAPI): The FastAPI application instance to which the OAuth routes will be added. + + Raises: + ValueError: If any of the required environment variables (OAUTH_CLIENT_ID, OAUTH_CLIENT_SECRET, + OAUTH_SCOPES, OPENID_PROVIDER_URL) are not set. + """ + """Add OAuth routes to the FastAPI app (login, callback handler and logout).""" + # Check environment variables + msg = ( + "OAuth is required but {} environment variable is not set. Make sure you've enabled OAuth in your Space by" + " setting `hf_oauth: true` in the Space metadata." + ) + if OAUTH_CLIENT_ID is None: + raise ValueError(msg.format("OAUTH_CLIENT_ID")) + if OAUTH_CLIENT_SECRET is None: + raise ValueError(msg.format("OAUTH_CLIENT_SECRET")) + if OAUTH_SCOPES is None: + raise ValueError(msg.format("OAUTH_SCOPES")) + if OPENID_PROVIDER_URL is None: + raise ValueError(msg.format("OPENID_PROVIDER_URL")) + + # Register OAuth server + oauth = OAuth() + oauth.register( + name="huggingface", + client_id=OAUTH_CLIENT_ID, + client_secret=OAUTH_CLIENT_SECRET, + client_kwargs={"scope": OAUTH_SCOPES}, + server_metadata_url=OPENID_PROVIDER_URL + "/.well-known/openid-configuration", + ) + + # Define OAuth routes + @app.get("/login/huggingface") + async def oauth_login(request: fastapi.Request): + """ + Handles the OAuth login process by redirecting to the Hugging Face OAuth page. + + Args: + request (fastapi.Request): The incoming HTTP request. + + Returns: + Response: A redirection response to the Hugging Face OAuth authorization page. + """ + """Endpoint that redirects to HF OAuth page.""" + redirect_uri = request.url_for("auth") + redirect_uri_as_str = str(redirect_uri) + if redirect_uri.netloc.endswith(".hf.space"): + redirect_uri_as_str = redirect_uri_as_str.replace("http://", "https://") + return await oauth.huggingface.authorize_redirect(request, redirect_uri_as_str) # type: ignore + + @app.get("/auth") + async def auth(request: fastapi.Request) -> RedirectResponse: + """ + Handles the OAuth callback for Hugging Face authentication. + + Args: + request (fastapi.Request): The incoming request object. + + Returns: + RedirectResponse: A response object that redirects the user to the appropriate page. + + Raises: + MismatchingStateError: If there is a state mismatch, likely due to a corrupted cookie. + In this case, the user is redirected to the login page after clearing the relevant session keys. + + Notes: + - If the state mismatch occurs, it is likely due to a bug in authlib that causes the token to grow indefinitely + if the user tries to login repeatedly. Since cookies cannot exceed 4kb, the token will be truncated at some point, + resulting in a lost state. The workaround is to delete the cookie and redirect the user to the login page again. + - See https://github.com/lepture/authlib/issues/622 for more details. + """ + """Endpoint that handles the OAuth callback.""" + try: + oauth_info = await oauth.huggingface.authorize_access_token(request) # type: ignore + except MismatchingStateError: + # If the state mismatch, it is very likely that the cookie is corrupted. 
+ # There is a bug reported in authlib that causes the token to grow indefinitely if the user tries to login + # repeatedly. Since cookies cannot get bigger than 4kb, the token will be truncated at some point - hence + # losing the state. A workaround is to delete the cookie and redirect the user to the login page again. + # See https://github.com/lepture/authlib/issues/622 for more details. + login_uri = "/login/huggingface" + if "_target_url" in request.query_params: + login_uri += "?" + urllib.parse.urlencode( # Keep same _target_url as before + {"_target_url": request.query_params["_target_url"]} + ) + for key in list(request.session.keys()): + # Delete all keys that are related to the OAuth state + if key.startswith("_state_huggingface"): + request.session.pop(key) + return RedirectResponse(login_uri) + + request.session["oauth_info"] = oauth_info + return _redirect_to_target(request) + + +def _redirect_to_target(request: fastapi.Request, default_target: str = "/") -> RedirectResponse: + """ + Redirects the incoming request to a target URL specified in the query parameters. + + Args: + request (fastapi.Request): The incoming HTTP request. + default_target (str, optional): The default URL to redirect to if no target URL is specified in the query parameters. Defaults to "/". + + Returns: + RedirectResponse: A response object that redirects the client to the target URL. + """ + target = request.query_params.get("_target_url", default_target) + # target = "https://huggingface.co/spaces/" + os.environ.get("SPACE_ID") + return RedirectResponse(target) diff --git a/src/autotrain/app/params.py b/src/autotrain/app/params.py new file mode 100644 index 0000000000000000000000000000000000000000..a6f4addbc53343dee8c5e36892fe35bd8cf1f061 --- /dev/null +++ b/src/autotrain/app/params.py @@ -0,0 +1,739 @@ +import json +from dataclasses import dataclass +from typing import Optional + +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams +from autotrain.trainers.image_classification.params import ImageClassificationParams +from autotrain.trainers.image_regression.params import ImageRegressionParams +from autotrain.trainers.object_detection.params import ObjectDetectionParams +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams +from autotrain.trainers.seq2seq.params import Seq2SeqParams +from autotrain.trainers.tabular.params import TabularParams +from autotrain.trainers.text_classification.params import TextClassificationParams +from autotrain.trainers.text_regression.params import TextRegressionParams +from autotrain.trainers.token_classification.params import TokenClassificationParams +from autotrain.trainers.vlm.params import VLMTrainingParams + + +HIDDEN_PARAMS = [ + "token", + "project_name", + "username", + "task", + "backend", + "train_split", + "valid_split", + "text_column", + "rejected_text_column", + "prompt_text_column", + "push_to_hub", + "trainer", + "model", + "data_path", + "image_path", + "class_image_path", + "revision", + "tokenizer", + "class_prompt", + "num_class_images", + "class_labels_conditioning", + "resume_from_checkpoint", + "dataloader_num_workers", + "allow_tf32", + "prior_generation_precision", + "local_rank", + "tokenizer_max_length", + "rank", + "xl", + "checkpoints_total_limit", + "validation_images", + "validation_epochs", + "num_validation_images", + "validation_prompt", + "sample_batch_size", + "log", + "image_column", + 
"target_column", + "id_column", + "target_columns", + "tokens_column", + "tags_column", + "objects_column", + "sentence1_column", + "sentence2_column", + "sentence3_column", + "question_column", + "answer_column", +] + + +PARAMS = {} +PARAMS["llm"] = LLMTrainingParams( + target_modules="all-linear", + log="tensorboard", + mixed_precision="fp16", + quantization="int4", + peft=True, + block_size=1024, + epochs=3, + padding="right", + chat_template="none", + max_completion_length=128, + distributed_backend="ddp", +).model_dump() + +PARAMS["text-classification"] = TextClassificationParams( + mixed_precision="fp16", + log="tensorboard", +).model_dump() +PARAMS["st"] = SentenceTransformersParams( + mixed_precision="fp16", + log="tensorboard", +).model_dump() +PARAMS["image-classification"] = ImageClassificationParams( + mixed_precision="fp16", + log="tensorboard", +).model_dump() +PARAMS["image-object-detection"] = ObjectDetectionParams( + mixed_precision="fp16", + log="tensorboard", +).model_dump() +PARAMS["seq2seq"] = Seq2SeqParams( + mixed_precision="fp16", + target_modules="all-linear", + log="tensorboard", +).model_dump() +PARAMS["tabular"] = TabularParams( + categorical_imputer="most_frequent", + numerical_imputer="median", + numeric_scaler="robust", +).model_dump() +PARAMS["token-classification"] = TokenClassificationParams( + mixed_precision="fp16", + log="tensorboard", +).model_dump() +PARAMS["text-regression"] = TextRegressionParams( + mixed_precision="fp16", + log="tensorboard", +).model_dump() +PARAMS["image-regression"] = ImageRegressionParams( + mixed_precision="fp16", + log="tensorboard", +).model_dump() +PARAMS["vlm"] = VLMTrainingParams( + mixed_precision="fp16", + target_modules="all-linear", + log="tensorboard", + quantization="int4", + peft=True, + epochs=3, +).model_dump() +PARAMS["extractive-qa"] = ExtractiveQuestionAnsweringParams( + mixed_precision="fp16", + log="tensorboard", + max_seq_length=512, + max_doc_stride=128, +).model_dump() + + +@dataclass +class AppParams: + """ + AppParams class is responsible for managing and processing parameters for various machine learning tasks. + + Attributes: + job_params_json (str): JSON string containing job parameters. + token (str): Authentication token. + project_name (str): Name of the project. + username (str): Username of the project owner. + task (str): Type of task to be performed. + data_path (str): Path to the dataset. + base_model (str): Base model to be used. + column_mapping (dict): Mapping of columns for the dataset. + train_split (Optional[str]): Name of the training split. Default is None. + valid_split (Optional[str]): Name of the validation split. Default is None. + using_hub_dataset (Optional[bool]): Flag indicating if a hub dataset is used. Default is False. + api (Optional[bool]): Flag indicating if API is used. Default is False. + + Methods: + __post_init__(): Validates the parameters after initialization. + munge(): Processes the parameters based on the task type. + _munge_common_params(): Processes common parameters for all tasks. + _munge_params_sent_transformers(): Processes parameters for sentence transformers task. + _munge_params_llm(): Processes parameters for large language model task. + _munge_params_vlm(): Processes parameters for vision-language model task. + _munge_params_text_clf(): Processes parameters for text classification task. + _munge_params_extractive_qa(): Processes parameters for extractive question answering task. 
+ _munge_params_text_reg(): Processes parameters for text regression task. + _munge_params_token_clf(): Processes parameters for token classification task. + _munge_params_seq2seq(): Processes parameters for sequence-to-sequence task. + _munge_params_img_clf(): Processes parameters for image classification task. + _munge_params_img_reg(): Processes parameters for image regression task. + _munge_params_img_obj_det(): Processes parameters for image object detection task. + _munge_params_tabular(): Processes parameters for tabular data task. + """ + + job_params_json: str + token: str + project_name: str + username: str + task: str + data_path: str + base_model: str + column_mapping: dict + train_split: Optional[str] = None + valid_split: Optional[str] = None + using_hub_dataset: Optional[bool] = False + api: Optional[bool] = False + + def __post_init__(self): + if self.using_hub_dataset and not self.train_split: + raise ValueError("train_split is required when using a hub dataset") + + def munge(self): + if self.task == "text-classification": + return self._munge_params_text_clf() + elif self.task == "seq2seq": + return self._munge_params_seq2seq() + elif self.task == "image-classification": + return self._munge_params_img_clf() + elif self.task == "image-object-detection": + return self._munge_params_img_obj_det() + elif self.task.startswith("tabular"): + return self._munge_params_tabular() + elif self.task.startswith("llm"): + return self._munge_params_llm() + elif self.task == "token-classification": + return self._munge_params_token_clf() + elif self.task == "text-regression": + return self._munge_params_text_reg() + elif self.task.startswith("st:"): + return self._munge_params_sent_transformers() + elif self.task == "image-regression": + return self._munge_params_img_reg() + elif self.task.startswith("vlm"): + return self._munge_params_vlm() + elif self.task == "extractive-qa": + return self._munge_params_extractive_qa() + else: + raise ValueError(f"Unknown task: {self.task}") + + def _munge_common_params(self): + _params = json.loads(self.job_params_json) + _params["token"] = self.token + _params["project_name"] = f"{self.project_name}" + if "push_to_hub" not in _params: + _params["push_to_hub"] = True + _params["data_path"] = self.data_path + _params["username"] = self.username + return _params + + def _munge_params_sent_transformers(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if "log" not in _params: + _params["log"] = "tensorboard" + if not self.using_hub_dataset: + _params["sentence1_column"] = "autotrain_sentence1" + _params["sentence2_column"] = "autotrain_sentence2" + _params["sentence3_column"] = "autotrain_sentence3" + _params["target_column"] = "autotrain_target" + _params["valid_split"] = "validation" + else: + _params["sentence1_column"] = self.column_mapping.get( + "sentence1" if not self.api else "sentence1_column", "sentence1" + ) + _params["sentence2_column"] = self.column_mapping.get( + "sentence2" if not self.api else "sentence2_column", "sentence2" + ) + _params["sentence3_column"] = self.column_mapping.get( + "sentence3" if not self.api else "sentence3_column", "sentence3" + ) + _params["target_column"] = self.column_mapping.get("target" if not self.api else "target_column", "target") + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + + trainer = self.task.split(":")[1] + _params["trainer"] = trainer.lower() + return SentenceTransformersParams(**_params) + + def _munge_params_llm(self): 
+ _params = self._munge_common_params() + _params["model"] = self.base_model + if not self.using_hub_dataset: + _params["text_column"] = "autotrain_text" + _params["prompt_text_column"] = "autotrain_prompt" + _params["rejected_text_column"] = "autotrain_rejected_text" + else: + _params["text_column"] = self.column_mapping.get("text" if not self.api else "text_column", "text") + _params["prompt_text_column"] = self.column_mapping.get( + "prompt" if not self.api else "prompt_text_column", "prompt" + ) + _params["rejected_text_column"] = self.column_mapping.get( + "rejected_text" if not self.api else "rejected_text_column", "rejected_text" + ) + _params["train_split"] = self.train_split + if "log" not in _params: + _params["log"] = "tensorboard" + + trainer = self.task.split(":")[1] + if trainer != "generic": + _params["trainer"] = trainer.lower() + + if "quantization" in _params: + if _params["quantization"] in ("none", "no"): + _params["quantization"] = None + + return LLMTrainingParams(**_params) + + def _munge_params_vlm(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if not self.using_hub_dataset: + _params["text_column"] = "autotrain_text" + _params["prompt_text_column"] = "autotrain_prompt" + _params["image_column"] = "autotrain_image" + _params["valid_split"] = "validation" + else: + _params["text_column"] = self.column_mapping.get("text" if not self.api else "text_column", "text") + _params["prompt_text_column"] = self.column_mapping.get( + "prompt" if not self.api else "prompt_text_column", "prompt" + ) + _params["image_column"] = self.column_mapping.get( + "image" if not self.api else "rejected_text_column", "image" + ) + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + if "log" not in _params: + _params["log"] = "tensorboard" + + trainer = self.task.split(":")[1] + _params["trainer"] = trainer.lower() + + if "quantization" in _params: + if _params["quantization"] in ("none", "no"): + _params["quantization"] = None + + return VLMTrainingParams(**_params) + + def _munge_params_text_clf(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if "log" not in _params: + _params["log"] = "tensorboard" + if not self.using_hub_dataset: + _params["text_column"] = "autotrain_text" + _params["target_column"] = "autotrain_label" + _params["valid_split"] = "validation" + else: + _params["text_column"] = self.column_mapping.get("text" if not self.api else "text_column", "text") + _params["target_column"] = self.column_mapping.get("label" if not self.api else "target_column", "label") + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + return TextClassificationParams(**_params) + + def _munge_params_extractive_qa(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if "log" not in _params: + _params["log"] = "tensorboard" + if not self.using_hub_dataset: + _params["text_column"] = "autotrain_text" + _params["question_column"] = "autotrain_question" + _params["answer_column"] = "autotrain_answer" + _params["valid_split"] = "validation" + else: + _params["text_column"] = self.column_mapping.get("text" if not self.api else "text_column", "text") + _params["question_column"] = self.column_mapping.get( + "question" if not self.api else "question_column", "question" + ) + _params["answer_column"] = self.column_mapping.get("answer" if not self.api else "answer_column", "answer") + _params["train_split"] = self.train_split + 
_params["valid_split"] = self.valid_split + return ExtractiveQuestionAnsweringParams(**_params) + + def _munge_params_text_reg(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if "log" not in _params: + _params["log"] = "tensorboard" + if not self.using_hub_dataset: + _params["text_column"] = "autotrain_text" + _params["target_column"] = "autotrain_label" + _params["valid_split"] = "validation" + else: + _params["text_column"] = self.column_mapping.get("text" if not self.api else "text_column", "text") + _params["target_column"] = self.column_mapping.get("label" if not self.api else "target_column", "label") + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + return TextRegressionParams(**_params) + + def _munge_params_token_clf(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if "log" not in _params: + _params["log"] = "tensorboard" + if not self.using_hub_dataset: + _params["tokens_column"] = "autotrain_text" + _params["tags_column"] = "autotrain_label" + _params["valid_split"] = "validation" + else: + _params["tokens_column"] = self.column_mapping.get("tokens" if not self.api else "tokens_column", "tokens") + _params["tags_column"] = self.column_mapping.get("tags" if not self.api else "tags_column", "tags") + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + + return TokenClassificationParams(**_params) + + def _munge_params_seq2seq(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if "log" not in _params: + _params["log"] = "tensorboard" + if not self.using_hub_dataset: + _params["text_column"] = "autotrain_text" + _params["target_column"] = "autotrain_label" + _params["valid_split"] = "validation" + else: + _params["text_column"] = self.column_mapping.get("text" if not self.api else "text_column", "text") + _params["target_column"] = self.column_mapping.get("label" if not self.api else "target_column", "label") + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + + return Seq2SeqParams(**_params) + + def _munge_params_img_clf(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if "log" not in _params: + _params["log"] = "tensorboard" + if not self.using_hub_dataset: + _params["image_column"] = "autotrain_image" + _params["target_column"] = "autotrain_label" + _params["valid_split"] = "validation" + else: + _params["image_column"] = self.column_mapping.get("image" if not self.api else "image_column", "image") + _params["target_column"] = self.column_mapping.get("label" if not self.api else "target_column", "label") + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + + return ImageClassificationParams(**_params) + + def _munge_params_img_reg(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if "log" not in _params: + _params["log"] = "tensorboard" + if not self.using_hub_dataset: + _params["image_column"] = "autotrain_image" + _params["target_column"] = "autotrain_label" + _params["valid_split"] = "validation" + else: + _params["image_column"] = self.column_mapping.get("image" if not self.api else "image_column", "image") + _params["target_column"] = self.column_mapping.get("target" if not self.api else "target_column", "target") + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + + return ImageRegressionParams(**_params) + 
+ def _munge_params_img_obj_det(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if "log" not in _params: + _params["log"] = "tensorboard" + if not self.using_hub_dataset: + _params["image_column"] = "autotrain_image" + _params["objects_column"] = "autotrain_objects" + _params["valid_split"] = "validation" + else: + _params["image_column"] = self.column_mapping.get("image" if not self.api else "image_column", "image") + _params["objects_column"] = self.column_mapping.get( + "objects" if not self.api else "objects_column", "objects" + ) + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + + return ObjectDetectionParams(**_params) + + def _munge_params_tabular(self): + _params = self._munge_common_params() + _params["model"] = self.base_model + if not self.using_hub_dataset: + _params["id_column"] = "autotrain_id" + _params["valid_split"] = "validation" + if len(self.column_mapping["label"]) == 1: + _params["target_columns"] = ["autotrain_label"] + else: + _params["target_columns"] = [ + "autotrain_label_" + str(i) for i in range(len(self.column_mapping["label"])) + ] + else: + _params["id_column"] = self.column_mapping.get("id" if not self.api else "id_column", "id") + _params["train_split"] = self.train_split + _params["valid_split"] = self.valid_split + _params["target_columns"] = self.column_mapping.get("label" if not self.api else "target_columns", "label") + + if len(_params["categorical_imputer"].strip()) == 0 or _params["categorical_imputer"].lower() == "none": + _params["categorical_imputer"] = None + if len(_params["numerical_imputer"].strip()) == 0 or _params["numerical_imputer"].lower() == "none": + _params["numerical_imputer"] = None + if len(_params["numeric_scaler"].strip()) == 0 or _params["numeric_scaler"].lower() == "none": + _params["numeric_scaler"] = None + + if "classification" in self.task: + _params["task"] = "classification" + else: + _params["task"] = "regression" + + return TabularParams(**_params) + + +def get_task_params(task, param_type): + """ + Retrieve task-specific parameters while filtering out hidden parameters based on the task and parameter type. + + Args: + task (str): The task identifier, which can include prefixes like "llm", "st:", "vlm:", etc. + param_type (str): The type of parameters to retrieve, typically "basic" or other types. + + Returns: + dict: A dictionary of task-specific parameters with hidden parameters filtered out. + + Notes: + - The function handles various task prefixes and adjusts the task and trainer variables accordingly. + - Hidden parameters are filtered out based on the task and parameter type. + - Additional hidden parameters are defined for specific tasks and trainers. 
+ """ + if task.startswith("llm"): + trainer = task.split(":")[1].lower() + task = task.split(":")[0].lower() + + if task.startswith("st:"): + trainer = task.split(":")[1].lower() + task = task.split(":")[0].lower() + + if task.startswith("vlm:"): + trainer = task.split(":")[1].lower() + task = task.split(":")[0].lower() + + if task.startswith("tabular"): + task = "tabular" + + if task not in PARAMS: + return {} + + task_params = PARAMS[task] + task_params = {k: v for k, v in task_params.items() if k not in HIDDEN_PARAMS} + if task == "llm": + more_hidden_params = [] + if trainer == "sft": + more_hidden_params = [ + "model_ref", + "dpo_beta", + "add_eos_token", + "max_prompt_length", + "max_completion_length", + ] + elif trainer == "reward": + more_hidden_params = [ + "model_ref", + "dpo_beta", + "add_eos_token", + "max_prompt_length", + "max_completion_length", + "unsloth", + ] + elif trainer == "orpo": + more_hidden_params = [ + "model_ref", + "dpo_beta", + "add_eos_token", + "unsloth", + ] + elif trainer == "generic": + more_hidden_params = [ + "model_ref", + "dpo_beta", + "max_prompt_length", + "max_completion_length", + ] + elif trainer == "dpo": + more_hidden_params = [ + "add_eos_token", + "unsloth", + ] + if param_type == "basic": + more_hidden_params.extend( + [ + "padding", + "use_flash_attention_2", + "disable_gradient_checkpointing", + "logging_steps", + "eval_strategy", + "save_total_limit", + "auto_find_batch_size", + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "quantization", + "merge_adapter", + "lora_r", + "lora_alpha", + "lora_dropout", + "max_completion_length", + ] + ) + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "text-classification" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + "early_stopping_patience", + "early_stopping_threshold", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "extractive-qa" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + "early_stopping_patience", + "early_stopping_threshold", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "st" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + "early_stopping_patience", + "early_stopping_threshold", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "vlm" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + "early_stopping_patience", + "early_stopping_threshold", + "quantization", + "lora_r", + "lora_alpha", + "lora_dropout", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "text-regression" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + 
"early_stopping_patience", + "early_stopping_threshold", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "image-classification" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + "early_stopping_patience", + "early_stopping_threshold", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "image-regression" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + "early_stopping_patience", + "early_stopping_threshold", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "image-object-detection" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + "early_stopping_patience", + "early_stopping_threshold", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "seq2seq" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + "quantization", + "lora_r", + "lora_alpha", + "lora_dropout", + "target_modules", + "early_stopping_patience", + "early_stopping_threshold", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + if task == "token-classification" and param_type == "basic": + more_hidden_params = [ + "warmup_ratio", + "weight_decay", + "max_grad_norm", + "seed", + "logging_steps", + "auto_find_batch_size", + "save_total_limit", + "eval_strategy", + "early_stopping_patience", + "early_stopping_threshold", + ] + task_params = {k: v for k, v in task_params.items() if k not in more_hidden_params} + + return task_params diff --git a/src/autotrain/app/static/logo.png b/src/autotrain/app/static/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..02cbc2b4cefdd07c8d1593a48d2b518757320443 Binary files /dev/null and b/src/autotrain/app/static/logo.png differ diff --git a/src/autotrain/app/static/scripts/fetch_data_and_update_models.js b/src/autotrain/app/static/scripts/fetch_data_and_update_models.js new file mode 100644 index 0000000000000000000000000000000000000000..d20b16e5878bc26d740deafc0236ddb0f835df4e --- /dev/null +++ b/src/autotrain/app/static/scripts/fetch_data_and_update_models.js @@ -0,0 +1,34 @@ +document.addEventListener('DOMContentLoaded', function () { + function fetchDataAndUpdateModels() { + const taskValue = document.getElementById('task').value; + const baseModelSelect = document.getElementById('base_model'); + const queryParams = new URLSearchParams(window.location.search); + const customModelsValue = queryParams.get('custom_models'); + const baseModelInput = document.getElementById('base_model_input'); + const baseModelCheckbox = document.getElementById('base_model_checkbox'); + + let fetchURL = `/ui/model_choices/${taskValue}`; + if (customModelsValue) { + fetchURL += `?custom_models=${customModelsValue}`; + } + baseModelSelect.innerHTML = 'Fetching models...'; + fetch(fetchURL) + .then(response => response.json()) + 
.then(data => { + const baseModelSelect = document.getElementById('base_model'); + baseModelCheckbox.checked = false; + baseModelSelect.classList.remove('hidden'); + baseModelInput.classList.add('hidden'); + baseModelSelect.innerHTML = ''; // Clear existing options + data.forEach(model => { + let option = document.createElement('option'); + option.value = model.id; // Assuming each model has an 'id' + option.textContent = model.name; // Assuming each model has a 'name' + baseModelSelect.appendChild(option); + }); + }) + .catch(error => console.error('Error:', error)); + } + document.getElementById('task').addEventListener('change', fetchDataAndUpdateModels); + fetchDataAndUpdateModels(); +}); \ No newline at end of file diff --git a/src/autotrain/app/static/scripts/listeners.js b/src/autotrain/app/static/scripts/listeners.js new file mode 100644 index 0000000000000000000000000000000000000000..9638342aea68a7c26f6efed566cc57d0689436ca --- /dev/null +++ b/src/autotrain/app/static/scripts/listeners.js @@ -0,0 +1,190 @@ +document.addEventListener('DOMContentLoaded', function () { + const dataSource = document.getElementById("dataset_source"); + const uploadDataTabContent = document.getElementById("upload-data-tab-content"); + const hubDataTabContent = document.getElementById("hub-data-tab-content"); + const uploadDataTabs = document.getElementById("upload-data-tabs"); + + const jsonCheckbox = document.getElementById('show-json-parameters'); + const jsonParametersDiv = document.getElementById('json-parameters'); + const dynamicUiDiv = document.getElementById('dynamic-ui'); + + const paramsTextarea = document.getElementById('params_json'); + + const updateTextarea = () => { + const paramElements = document.querySelectorAll('[id^="param_"]'); + const params = {}; + paramElements.forEach(el => { + const key = el.id.replace('param_', ''); + params[key] = el.value; + }); + paramsTextarea.value = JSON.stringify(params, null, 2); + //paramsTextarea.className = 'p-2.5 w-full text-sm text-gray-600 border-white border-transparent focus:border-transparent focus:ring-0' + paramsTextarea.style.height = '600px'; + }; + const observeParamChanges = () => { + const paramElements = document.querySelectorAll('[id^="param_"]'); + paramElements.forEach(el => { + el.addEventListener('input', updateTextarea); + }); + }; + const updateParamsFromTextarea = () => { + try { + const params = JSON.parse(paramsTextarea.value); + Object.keys(params).forEach(key => { + const el = document.getElementById('param_' + key); + if (el) { + el.value = params[key]; + } + }); + } catch (e) { + console.error('Invalid JSON:', e); + } + }; + function switchToJSON() { + if (jsonCheckbox.checked) { + dynamicUiDiv.style.display = 'none'; + jsonParametersDiv.style.display = 'block'; + } else { + dynamicUiDiv.style.display = 'block'; + jsonParametersDiv.style.display = 'none'; + } + } + + function handleDataSource() { + if (dataSource.value === "hub") { + uploadDataTabContent.style.display = "none"; + uploadDataTabs.style.display = "none"; + hubDataTabContent.style.display = "block"; + } else if (dataSource.value === "local") { + uploadDataTabContent.style.display = "block"; + uploadDataTabs.style.display = "block"; + hubDataTabContent.style.display = "none"; + } + } + + async function fetchParams() { + const taskValue = document.getElementById('task').value; + const parameterMode = document.getElementById('parameter_mode').value; + const response = await fetch(`/ui/params/${taskValue}/${parameterMode}`); + const params = await 
response.json(); + return params; + } + + function createElement(param, config) { + let element = ''; + switch (config.type) { + case 'number': + element = `
+ + +
`; + break; + case 'dropdown': + let options = config.options.map(option => ``).join(''); + element = `
+ + +
`; + break; + case 'checkbox': + element = `
+ + +
`; + break; + case 'string': + element = `
+ + +
`; + break; + } + return element; + } + + function renderUI(params) { + const uiContainer = document.getElementById('dynamic-ui'); + let rowDiv = null; + let rowIndex = 0; + let lastType = null; + + Object.keys(params).forEach((param, index) => { + const config = params[param]; + if (lastType !== config.type || rowIndex >= 3) { + if (rowDiv) uiContainer.appendChild(rowDiv); + rowDiv = document.createElement('div'); + rowDiv.className = 'grid grid-cols-3 gap-2 mb-2'; + rowIndex = 0; + } + rowDiv.innerHTML += createElement(param, config); + rowIndex++; + lastType = config.type; + }); + if (rowDiv) uiContainer.appendChild(rowDiv); + } + + + fetchParams().then(params => renderUI(params)); + document.getElementById('task').addEventListener('change', function () { + fetchParams().then(params => { + document.getElementById('dynamic-ui').innerHTML = ''; + let jsonCheckBoxFlag = false; + if (jsonCheckbox.checked) { + jsonCheckbox.checked = false; + jsonCheckBoxFlag = true; + + } + renderUI(params); + if (jsonCheckBoxFlag) { + jsonCheckbox.checked = true; + updateTextarea(); + observeParamChanges(); + } + }); + }); + document.getElementById('parameter_mode').addEventListener('change', function () { + fetchParams().then(params => { + document.getElementById('dynamic-ui').innerHTML = ''; + let jsonCheckBoxFlag = false; + if (jsonCheckbox.checked) { + jsonCheckbox.checked = false; + jsonCheckBoxFlag = true; + + } + renderUI(params); + if (jsonCheckBoxFlag) { + jsonCheckbox.checked = true; + updateTextarea(); + observeParamChanges(); + } + }); + }); + + jsonCheckbox.addEventListener('change', function () { + if (jsonCheckbox.checked) { + updateTextarea(); + observeParamChanges(); + } + }); + document.getElementById('task').addEventListener('change', function () { + if (jsonCheckbox.checked) { + updateTextarea(); + observeParamChanges(); + } + }); + // Attach event listeners to dataset_source dropdown + dataSource.addEventListener("change", handleDataSource); + jsonCheckbox.addEventListener('change', switchToJSON); + paramsTextarea.addEventListener('input', updateParamsFromTextarea); + + // Trigger the event listener to set the initial state + handleDataSource(); + observeParamChanges(); + updateTextarea(); +}); \ No newline at end of file diff --git a/src/autotrain/app/static/scripts/logs.js b/src/autotrain/app/static/scripts/logs.js new file mode 100644 index 0000000000000000000000000000000000000000..11e3eedf9849e1fc1b0573cf8684d4fb86d05c26 --- /dev/null +++ b/src/autotrain/app/static/scripts/logs.js @@ -0,0 +1,62 @@ +document.addEventListener('DOMContentLoaded', function () { + var fetchLogsInterval; + + // Function to check the modal's display property and fetch logs if visible + function fetchAndDisplayLogs() { + var modal = document.getElementById('logs-modal'); + var displayStyle = window.getComputedStyle(modal).display; + + // Check if the modal display property is 'flex' + if (displayStyle === 'flex') { + fetchLogs(); // Initial fetch when the modal is opened + + // Clear any existing interval to avoid duplicates + clearInterval(fetchLogsInterval); + + // Set up the interval to fetch logs every 5 seconds + fetchLogsInterval = setInterval(fetchLogs, 5000); + } else { + // Clear the interval when the modal is not displayed as 'flex' + clearInterval(fetchLogsInterval); + } + } + + // Function to fetch logs from the server + function fetchLogs() { + fetch('/ui/logs') + .then(response => response.json()) + .then(data => { + var logContainer = document.getElementById('logContent'); + 
logContainer.innerHTML = ''; // Clear previous logs + + // Handling the case when logs are only available in local mode or no logs available + if (typeof data.logs === 'string') { + logContainer.textContent = data.logs; + } else { + // Assuming data.logs is an array of log entries + data.logs.forEach(log => { + if (log.trim().length > 0) { + var p = document.createElement('p'); + p.textContent = log; + logContainer.appendChild(p); // Appends logs in order received + } + }); + } + }) + .catch(error => console.error('Error fetching logs:', error)); + } + + // Set up an observer to detect when the modal becomes visible or hidden + var observer = new MutationObserver(function (mutations) { + mutations.forEach(function (mutation) { + if (mutation.attributeName === 'class') { + fetchAndDisplayLogs(); + } + }); + }); + + var modal = document.getElementById('logs-modal'); + observer.observe(modal, { + attributes: true //configure it to listen to attribute changes + }); +}); \ No newline at end of file diff --git a/src/autotrain/app/static/scripts/poll.js b/src/autotrain/app/static/scripts/poll.js new file mode 100644 index 0000000000000000000000000000000000000000..900b22c986e62021efb759818ba3deb9f2cb30da --- /dev/null +++ b/src/autotrain/app/static/scripts/poll.js @@ -0,0 +1,70 @@ +document.addEventListener('DOMContentLoaded', (event) => { + function pollAccelerators() { + const numAcceleratorsElement = document.getElementById('num_accelerators'); + if (autotrain_local_value === 0) { + numAcceleratorsElement.innerText = 'Accelerators: Only available in local mode.'; + numAcceleratorsElement.style.display = 'block'; // Ensure the element is visible + return; + } + + // Send a request to the /accelerators endpoint + fetch('/ui/accelerators') + .then(response => response.json()) // Assuming the response is in JSON format + .then(data => { + // Update the paragraph with the number of accelerators + document.getElementById('num_accelerators').innerText = `Accelerators: ${data.accelerators}`; + }) + .catch(error => { + console.error('Error:', error); + // Update the paragraph to show an error message + document.getElementById('num_accelerators').innerText = 'Accelerators: Error fetching data'; + }); + } + function pollModelTrainingStatus() { + // Send a request to the /is_model_training endpoint + + if (autotrain_local_value === 0) { + const statusParagraph = document.getElementById('is_model_training'); + statusParagraph.innerText = 'Running jobs: Only available in local mode.'; + statusParagraph.style.display = 'block'; + return; + } + fetch('/ui/is_model_training') + .then(response => response.json()) // Assuming the response is in JSON format + .then(data => { + // Construct the message to display + let message = data.model_training ? 
'Running job PID(s): ' + data.pids.join(', ') : 'No running jobs'; + + // Update the paragraph with the status of model training + let statusParagraph = document.getElementById('is_model_training'); + statusParagraph.innerText = message; + let stopTrainingButton = document.getElementById('stop-training-button'); + let startTrainingButton = document.getElementById('start-training-button'); + + // Change the text color based on the model training status + if (data.model_training) { + // Set text color to red if jobs are running + statusParagraph.style.color = 'red'; + stopTrainingButton.style.display = 'block'; + startTrainingButton.style.display = 'none'; + } else { + // Set text color to green if no jobs are running + statusParagraph.style.color = 'green'; + stopTrainingButton.style.display = 'none'; + startTrainingButton.style.display = 'block'; + } + }) + .catch(error => { + console.error('Error:', error); + // Update the paragraph to show an error message + let statusParagraph = document.getElementById('is_model_training'); + statusParagraph.innerText = 'Error fetching training status'; + statusParagraph.style.color = 'red'; // Set error message color to red + }); + } + + setInterval(pollAccelerators, 10000); + setInterval(pollModelTrainingStatus, 5000); + pollAccelerators(); + pollModelTrainingStatus(); +}); \ No newline at end of file diff --git a/src/autotrain/app/static/scripts/utils.js b/src/autotrain/app/static/scripts/utils.js new file mode 100644 index 0000000000000000000000000000000000000000..a81267b807a1eea558cdd539eda7c47607e0f3a4 --- /dev/null +++ b/src/autotrain/app/static/scripts/utils.js @@ -0,0 +1,182 @@ +document.addEventListener('DOMContentLoaded', function () { + + const loadingSpinner = document.getElementById('loadingSpinner'); + + function generateRandomString(length) { + let result = ''; + const characters = 'abcdefghijklmnopqrstuvwxyz0123456789'; + const charactersLength = characters.length; + for (let i = 0; i < length; i++) { + result += characters.charAt(Math.floor(Math.random() * charactersLength)); + } + return result; + } + + function setRandomProjectName() { + const part1 = generateRandomString(5); + const part2 = generateRandomString(5); + const randomName = `autotrain-${part1}-${part2}`; + document.getElementById('project_name').value = randomName; + } + + function showFinalModal() { + const modal = document.getElementById('final-modal'); + modal.classList.add('flex'); + modal.classList.remove('hidden'); + } + + function hideFinalModal() { + const modal = document.getElementById('final-modal'); + modal.classList.remove('flex'); + modal.classList.add('hidden'); + } + + function showModal() { + const modal = document.getElementById('confirmation-modal'); + modal.classList.add('flex'); + modal.classList.remove('hidden'); + } + + function showLogsModal() { + const modal = document.getElementById('logs-modal'); + modal.classList.add('flex'); + modal.classList.remove('hidden'); + } + + function hideLogsModal() { + const modal = document.getElementById('logs-modal'); + modal.classList.remove('flex'); + modal.classList.add('hidden'); + } + + function hideModal() { + const modal = document.getElementById('confirmation-modal'); + modal.classList.remove('flex'); + modal.classList.add('hidden'); + } + + document.getElementById('start-training-button').addEventListener('click', function () { + showModal(); + }); + + document.querySelector('#confirmation-modal .confirm').addEventListener('click', async function () { + hideModal(); + 
loadingSpinner.classList.remove('hidden'); + console.log(document.getElementById('params_json').value) + + var formData = new FormData(); + var columnMapping = {}; + var params; + var paramsJsonElement = document.getElementById('params_json'); + document.querySelectorAll('[id^="col_map_"]').forEach(function (element) { + var key = element.id.replace('col_map_', ''); + columnMapping[key] = element.value; + }); + + if (paramsJsonElement.value == '{}' || paramsJsonElement.value == '') { + var paramsDict = {}; + document.querySelectorAll('[id^="param_"]').forEach(function (element) { + var key = element.id.replace('param_', ''); + paramsDict[key] = element.value; + }); + params = JSON.stringify(paramsDict); + } else { + params = paramsJsonElement.value; + } + const baseModelValue = document.getElementById('base_model_checkbox').checked + ? document.getElementById('base_model_input').value + : document.getElementById('base_model').value; + + formData.append('base_model', baseModelValue); + formData.append('project_name', document.getElementById('project_name').value); + formData.append('task', document.getElementById('task').value); + formData.append('hardware', document.getElementById('hardware').value); + formData.append('params', params); + formData.append('autotrain_user', document.getElementById('autotrain_user').value); + formData.append('column_mapping', JSON.stringify(columnMapping)); + formData.append('hub_dataset', document.getElementById('hub_dataset').value); + formData.append('train_split', document.getElementById('train_split').value); + formData.append('valid_split', document.getElementById('valid_split').value); + + var trainingFiles = document.getElementById('data_files_training').files; + for (var i = 0; i < trainingFiles.length; i++) { + formData.append('data_files_training', trainingFiles[i]); + } + + var validationFiles = document.getElementById('data_files_valid').files; + for (var i = 0; i < validationFiles.length; i++) { + formData.append('data_files_valid', validationFiles[i]); + } + + const xhr = new XMLHttpRequest(); + xhr.open('POST', '/ui/create_project', true); + + xhr.onload = function () { + loadingSpinner.classList.add('hidden'); + var finalModalContent = document.querySelector('#final-modal .text-center'); + + if (xhr.status === 200) { + var responseObj = JSON.parse(xhr.responseText); + var monitorURL = responseObj.monitor_url; + if (monitorURL.startsWith('http')) { + finalModalContent.innerHTML = '

Success! You can check the progress of your training here: ' + monitorURL;
+                } else {
+                    finalModalContent.innerHTML = 'Success! ' + monitorURL;
+                }
+
+                showFinalModal();
+            } else {
+                finalModalContent.innerHTML = 'Error: ' + xhr.status + ' ' + xhr.statusText +
+                    '. Please check the logs for more information.
'; + console.error('Error:', xhr.status, xhr.statusText); + showFinalModal(); + } + }; + + xhr.send(formData); + }); + + document.querySelector('#confirmation-modal .cancel').addEventListener('click', function () { + hideModal(); + }); + + document.querySelector('#final-modal button').addEventListener('click', function () { + hideFinalModal(); + }); + + document.querySelector('#button_logs').addEventListener('click', function () { + showLogsModal(); + }); + + document.querySelector('[data-modal-hide="logs-modal"]').addEventListener('click', function () { + hideLogsModal(); + }); + document.getElementById('success-message').textContent = ''; + document.getElementById('error-message').textContent = ''; + + document.getElementById('data_files_training').addEventListener('change', function () { + var fileContainer = document.getElementById('file-container-training'); + var files = this.files; + var fileText = ''; + + for (var i = 0; i < files.length; i++) { + fileText += files[i].name + ' '; + } + + fileContainer.innerHTML = fileText; + }); + document.getElementById('data_files_valid').addEventListener('change', function () { + var fileContainer = document.getElementById('file-container-valid'); + var files = this.files; + var fileText = ''; + + for (var i = 0; i < files.length; i++) { + fileText += files[i].name + ' '; + } + + fileContainer.innerHTML = fileText; + }); + + window.onload = setRandomProjectName; +}); diff --git a/src/autotrain/app/static/tensoralabs.jpg b/src/autotrain/app/static/tensoralabs.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0b8e2387a2ad4065ebdf49fd30985d7be7a0d8a5 Binary files /dev/null and b/src/autotrain/app/static/tensoralabs.jpg differ diff --git a/src/autotrain/app/templates/duplicate.html b/src/autotrain/app/templates/duplicate.html new file mode 100644 index 0000000000000000000000000000000000000000..2702c751f1496d604fc5c5083a13b3ed66f182b6 --- /dev/null +++ b/src/autotrain/app/templates/duplicate.html @@ -0,0 +1,52 @@ + + + + + + Error – Tensora AutoTrainer + + + + + + +
+    <!-- [markup stripped in extraction] duplicate.html body: logo "Tensora Logo"; -->
+    <!-- heading "Access Error"; message "You need to duplicate this space to use -->
+    <!-- Tensora AutoTrainer."; link "Duplicate on Hugging Face". -->
diff --git a/src/autotrain/app/templates/error.html b/src/autotrain/app/templates/error.html
new file mode 100644
index 0000000000000000000000000000000000000000..daf1aa2d7a8c47a65c2b662504c94d9837048b9c
--- /dev/null
+++ b/src/autotrain/app/templates/error.html
@@ -0,0 +1,64 @@
+    <!-- [markup stripped in extraction] error.html (title "Tensora AutoTrainer – Error"): -->
+    <!-- logo "Tensora Logo"; heading "Environment Error"; message "HF_TOKEN environment -->
+    <!-- variable is not set."; link "Go Back to Home". -->
diff --git a/src/autotrain/app/templates/index.html b/src/autotrain/app/templates/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..40f111ee2835465a56e584771a8876e5bbb6ddd2
--- /dev/null
+++ b/src/autotrain/app/templates/index.html
@@ -0,0 +1,1106 @@
+    <!-- [markup stripped in extraction] index.html head (title "TensoraLabs Trainer") -->
+    <!-- [markup stripped in extraction] index.html body: sidebar "Training Dashboard" -->
+    <!-- with "Accelerators: Fetching..." and "Status: Fetching..." status lines and the -->
+    <!-- start/stop training buttons referenced by the scripts above; sections "Project -->
+    <!-- Setup", "Data Configuration", "Column Mapping", "Parameters"; plus the -->
+    <!-- confirmation, logs, and final modals and script includes used by utils.js. -->
diff --git a/src/autotrain/app/templates/login.html b/src/autotrain/app/templates/login.html
new file mode 100644
index 0000000000000000000000000000000000000000..743bd0ed4acbefb2af294abde3026ae730d9c425
--- /dev/null
+++ b/src/autotrain/app/templates/login.html
@@ -0,0 +1,88 @@
+    <!-- [markup stripped in extraction] login.html head (title "Tensora AutoTrainer – Login") -->
+    <!-- [markup stripped in extraction] login.html body: logo "Tensora Logo"; heading -->
+    <!-- "Welcome to Tensora AutoTrainer"; message "Sign in using your Hugging Face -->
+    <!-- account to begin training."; button "Login with Hugging Face"; help text -->
+    <!-- "Can't login? Add your HF_TOKEN as a secret in your Space settings. Use a -->
+    <!-- write token from your Hugging Face account."; footer links "Docs" | "GitHub". -->
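The login help above asks for a write token in the HF_TOKEN secret. As a small sketch of how one might sanity-check a token before adding it, using the huggingface_hub client that other files in this diff already import (the token value is a placeholder):

from huggingface_hub import HfApi

# whoami() raises for an invalid or expired token and returns account
# details when the token works; verify the write role in the Hub UI.
info = HfApi(token="hf_xxx").whoami()
print(info["name"])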
+ + + + diff --git a/src/autotrain/app/training_api.py b/src/autotrain/app/training_api.py new file mode 100644 index 0000000000000000000000000000000000000000..3fe19b8685680539a113771df0a7691228e5f32b --- /dev/null +++ b/src/autotrain/app/training_api.py @@ -0,0 +1,109 @@ +import asyncio +import os +import signal +import sys +from contextlib import asynccontextmanager + +from fastapi import FastAPI + +from autotrain import logger +from autotrain.app.db import AutoTrainDB +from autotrain.app.utils import get_running_jobs, kill_process_by_pid +from autotrain.utils import run_training + + +HF_TOKEN = os.environ.get("HF_TOKEN") +AUTOTRAIN_USERNAME = os.environ.get("AUTOTRAIN_USERNAME") +PROJECT_NAME = os.environ.get("PROJECT_NAME") +TASK_ID = int(os.environ.get("TASK_ID")) +PARAMS = os.environ.get("PARAMS") +DATA_PATH = os.environ.get("DATA_PATH") +MODEL = os.environ.get("MODEL") +DB = AutoTrainDB("autotrain.db") + + +def graceful_exit(signum, frame): + """ + Handles the SIGTERM signal to perform cleanup and exit the program gracefully. + + Args: + signum (int): The signal number. + frame (FrameType): The current stack frame (or None). + + Logs a message indicating that SIGTERM was received and then exits the program with status code 0. + """ + logger.info("SIGTERM received. Performing cleanup...") + sys.exit(0) + + +signal.signal(signal.SIGTERM, graceful_exit) + + +class BackgroundRunner: + """ + A class to handle background running tasks. + + Methods + ------- + run_main(): + Continuously checks for running jobs and shuts down the server if no jobs are found. + """ + + async def run_main(self): + while True: + running_jobs = get_running_jobs(DB) + if not running_jobs: + logger.info("No running jobs found. Shutting down the server.") + kill_process_by_pid(os.getpid()) + await asyncio.sleep(30) + + +runner = BackgroundRunner() + + +@asynccontextmanager +async def lifespan(app: FastAPI): + """ + Manages the lifespan of the FastAPI application. + + This function is responsible for starting the training process and + managing a background task runner. It logs the process ID of the + training job, adds the job to the database, and ensures the background + task is properly cancelled when the application shuts down. + + Args: + app (FastAPI): The FastAPI application instance. + + Yields: + None: This function is a generator that yields control back to the + FastAPI application lifecycle. + """ + process_pid = run_training(params=PARAMS, task_id=TASK_ID) + logger.info(f"Started training with PID {process_pid}") + DB.add_job(process_pid) + task = asyncio.create_task(runner.run_main()) + yield + + task.cancel() + try: + await task + except asyncio.CancelledError: + logger.info("Background runner task cancelled.") + + +api = FastAPI(lifespan=lifespan) +logger.info(f"AUTOTRAIN_USERNAME: {AUTOTRAIN_USERNAME}") +logger.info(f"PROJECT_NAME: {PROJECT_NAME}") +logger.info(f"TASK_ID: {TASK_ID}") +logger.info(f"DATA_PATH: {DATA_PATH}") +logger.info(f"MODEL: {MODEL}") + + +@api.get("/") +async def root(): + return "Your model is being trained..." 
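A note on the two routes in training_api.py: the BackgroundRunner watchdog above shuts the server down as soon as no running jobs remain, so an external supervisor only needs these endpoints for liveness. A minimal sketch of such a probe (the host and port are assumptions carried over from the `--port 7860 --host 0.0.0.0` invocations elsewhere in this diff, not part of this file):

import requests


def training_alive(base_url="http://0.0.0.0:7860"):
    # "/health" returns the JSON string "OK" while training is in progress;
    # once the watchdog stops the process, the connection is refused.
    try:
        return requests.get(f"{base_url}/health", timeout=5).json() == "OK"
    except requests.ConnectionError:
        return False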
+ + +@api.get("/health") +async def health(): + return "OK" diff --git a/src/autotrain/app/ui_routes.py b/src/autotrain/app/ui_routes.py new file mode 100644 index 0000000000000000000000000000000000000000..78aa04b781a31f1baf90cd4303f63f27717a009d --- /dev/null +++ b/src/autotrain/app/ui_routes.py @@ -0,0 +1,796 @@ +import json +import os +import signal +import sys +import time +from typing import List + +import torch +from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, Request, UploadFile, status +from fastapi.responses import HTMLResponse, JSONResponse, RedirectResponse +from fastapi.templating import Jinja2Templates +from huggingface_hub import repo_exists +from nvitop import Device + +from autotrain import __version__, logger +from autotrain.app.db import AutoTrainDB +from autotrain.app.models import fetch_models +from autotrain.app.params import AppParams, get_task_params +from autotrain.app.utils import get_running_jobs, get_user_and_orgs, kill_process_by_pid, token_verification +from autotrain.dataset import ( + AutoTrainDataset, + AutoTrainImageClassificationDataset, + AutoTrainImageRegressionDataset, + AutoTrainObjectDetectionDataset, + AutoTrainVLMDataset, +) +from autotrain.help import get_app_help +from autotrain.project import AutoTrainProject + + +logger.info("Starting AutoTrain...") +HF_TOKEN = os.environ.get("HF_TOKEN", None) +IS_RUNNING_IN_SPACE = "SPACE_ID" in os.environ +ENABLE_NGC = int(os.environ.get("ENABLE_NGC", 0)) +ENABLE_NVCF = int(os.environ.get("ENABLE_NVCF", 0)) +AUTOTRAIN_LOCAL = int(os.environ.get("AUTOTRAIN_LOCAL", 1)) +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +DB = AutoTrainDB("autotrain.db") +MODEL_CHOICE = fetch_models() + +ui_router = APIRouter() +templates_path = os.path.join(BASE_DIR, "templates") +templates = Jinja2Templates(directory=templates_path) + +UI_PARAMS = { + "mixed_precision": { + "type": "dropdown", + "label": "Mixed precision", + "options": ["fp16", "bf16", "none"], + }, + "optimizer": { + "type": "dropdown", + "label": "Optimizer", + "options": ["adamw_torch", "adamw", "adam", "sgd"], + }, + "scheduler": { + "type": "dropdown", + "label": "Scheduler", + "options": ["linear", "cosine", "cosine_warmup", "constant"], + }, + "eval_strategy": { + "type": "dropdown", + "label": "Evaluation strategy", + "options": ["epoch", "steps"], + }, + "logging_steps": { + "type": "number", + "label": "Logging steps", + }, + "save_total_limit": { + "type": "number", + "label": "Save total limit", + }, + "auto_find_batch_size": { + "type": "dropdown", + "label": "Auto find batch size", + "options": [True, False], + }, + "warmup_ratio": { + "type": "number", + "label": "Warmup proportion", + }, + "max_grad_norm": { + "type": "number", + "label": "Max grad norm", + }, + "weight_decay": { + "type": "number", + "label": "Weight decay", + }, + "epochs": { + "type": "number", + "label": "Epochs", + }, + "batch_size": { + "type": "number", + "label": "Batch size", + }, + "lr": { + "type": "number", + "label": "Learning rate", + }, + "seed": { + "type": "number", + "label": "Seed", + }, + "gradient_accumulation": { + "type": "number", + "label": "Gradient accumulation", + }, + "block_size": { + "type": "number", + "label": "Block size", + }, + "model_max_length": { + "type": "number", + "label": "Model max length", + }, + "add_eos_token": { + "type": "dropdown", + "label": "Add EOS token", + "options": [True, False], + }, + "disable_gradient_checkpointing": { + "type": "dropdown", + "label": "Disable GC", + "options": [True, 
False], + }, + "use_flash_attention_2": { + "type": "dropdown", + "label": "Use flash attention", + "options": [True, False], + }, + "log": { + "type": "dropdown", + "label": "Logging", + "options": ["tensorboard", "none"], + }, + "quantization": { + "type": "dropdown", + "label": "Quantization", + "options": ["int4", "int8", "none"], + }, + "target_modules": { + "type": "string", + "label": "Target modules", + }, + "merge_adapter": { + "type": "dropdown", + "label": "Merge adapter", + "options": [True, False], + }, + "peft": { + "type": "dropdown", + "label": "PEFT/LoRA", + "options": [True, False], + }, + "lora_r": { + "type": "number", + "label": "Lora r", + }, + "lora_alpha": { + "type": "number", + "label": "Lora alpha", + }, + "lora_dropout": { + "type": "number", + "label": "Lora dropout", + }, + "model_ref": { + "type": "string", + "label": "Reference model", + }, + "dpo_beta": { + "type": "number", + "label": "DPO beta", + }, + "max_prompt_length": { + "type": "number", + "label": "Prompt length", + }, + "max_completion_length": { + "type": "number", + "label": "Completion length", + }, + "chat_template": { + "type": "dropdown", + "label": "Chat template", + "options": ["none", "zephyr", "chatml", "tokenizer"], + }, + "padding": { + "type": "dropdown", + "label": "Padding side", + "options": ["right", "left", "none"], + }, + "max_seq_length": { + "type": "number", + "label": "Max sequence length", + }, + "early_stopping_patience": { + "type": "number", + "label": "Early stopping patience", + }, + "early_stopping_threshold": { + "type": "number", + "label": "Early stopping threshold", + }, + "max_target_length": { + "type": "number", + "label": "Max target length", + }, + "categorical_columns": { + "type": "string", + "label": "Categorical columns", + }, + "numerical_columns": { + "type": "string", + "label": "Numerical columns", + }, + "num_trials": { + "type": "number", + "label": "Number of trials", + }, + "time_limit": { + "type": "number", + "label": "Time limit", + }, + "categorical_imputer": { + "type": "dropdown", + "label": "Categorical imputer", + "options": ["most_frequent", "none"], + }, + "numerical_imputer": { + "type": "dropdown", + "label": "Numerical imputer", + "options": ["mean", "median", "none"], + }, + "numeric_scaler": { + "type": "dropdown", + "label": "Numeric scaler", + "options": ["standard", "minmax", "maxabs", "robust", "none"], + }, + "vae_model": { + "type": "string", + "label": "VAE model", + }, + "prompt": { + "type": "string", + "label": "Prompt", + }, + "resolution": { + "type": "number", + "label": "Resolution", + }, + "num_steps": { + "type": "number", + "label": "Number of steps", + }, + "checkpointing_steps": { + "type": "number", + "label": "Checkpointing steps", + }, + "use_8bit_adam": { + "type": "dropdown", + "label": "Use 8-bit Adam", + "options": [True, False], + }, + "xformers": { + "type": "dropdown", + "label": "xFormers", + "options": [True, False], + }, + "image_square_size": { + "type": "number", + "label": "Image square size", + }, + "unsloth": { + "type": "dropdown", + "label": "Unsloth", + "options": [True, False], + }, + "max_doc_stride": { + "type": "number", + "label": "Max doc stride", + }, + "distributed_backend": { + "type": "dropdown", + "label": "Distributed backend", + "options": ["ddp", "deepspeed"], + }, +} + + +def graceful_exit(signum, frame): + """ + Handles the SIGTERM signal to perform cleanup and exit the program gracefully. + + Args: + signum (int): The signal number. 
+ frame (FrameType): The current stack frame (or None). + + Logs: + Logs the receipt of the SIGTERM signal and the initiation of cleanup. + + Exits: + Exits the program with status code 0. + """ + logger.info("SIGTERM received. Performing cleanup...") + sys.exit(0) + + +signal.signal(signal.SIGTERM, graceful_exit) + + +logger.info("AutoTrain started successfully") + + +def user_authentication(request: Request): + """ + Authenticates the user based on the following priority: + 1. HF_TOKEN environment variable + 2. OAuth information in session + 3. Token in bearer header (not implemented in the given code) + + Args: + request (Request): The incoming HTTP request object. + + Returns: + str: The authenticated token if verification is successful. + + Raises: + HTTPException: If the token is invalid or expired and the application is not running in a space. + + If the application is running in a space and authentication fails, it returns a login template response. + """ + # priority: hf_token env var > oauth_info in session > token in bearer header + # if "oauth_info" in request.session: + if HF_TOKEN is not None: + try: + _ = token_verification(token=os.environ.get("HF_TOKEN")) + return HF_TOKEN + except Exception as e: + logger.error(f"Failed to verify token: {e}") + if IS_RUNNING_IN_SPACE: + return templates.TemplateResponse("login.html", {"request": request}) + else: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired token: HF_TOKEN", + ) + + if IS_RUNNING_IN_SPACE and "oauth_info" in request.session: + try: + _ = token_verification(token=request.session["oauth_info"]["access_token"]) + return request.session["oauth_info"]["access_token"] + except Exception as e: + request.session.pop("oauth_info", None) + logger.error(f"Failed to verify token: {e}") + return templates.TemplateResponse("login.html", {"request": request}) + + if IS_RUNNING_IN_SPACE: + return templates.TemplateResponse("login.html", {"request": request}) + + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid or expired token", + ) + + +@ui_router.get("/", response_class=HTMLResponse) +async def load_index(request: Request, token: str = Depends(user_authentication)): + """ + This function is used to load the index page + :return: HTMLResponse + """ + if os.environ.get("SPACE_ID") == "autotrain-projects/autotrain-advanced": + return templates.TemplateResponse("duplicate.html", {"request": request}) + try: + _users = get_user_and_orgs(user_token=token) + except Exception as e: + logger.error(f"Failed to get user and orgs: {e}") + if "oauth_info" in request.session: + request.session.pop("oauth_info", None) + return templates.TemplateResponse("login.html", {"request": request}) + context = { + "request": request, + "valid_users": _users, + "enable_ngc": ENABLE_NGC, + "enable_nvcf": ENABLE_NVCF, + "enable_local": AUTOTRAIN_LOCAL, + "version": __version__, + "time": time.strftime("%Y-%m-%d %H:%M:%S"), + } + return templates.TemplateResponse("index.html", context) + + +@ui_router.get("/logout", response_class=HTMLResponse) +async def oauth_logout(request: Request, authenticated: bool = Depends(user_authentication)): + """ + This function is used to logout the oauth user + :return: HTMLResponse + """ + request.session.pop("oauth_info", None) + return RedirectResponse("/") + + +@ui_router.get("/params/{task}/{param_type}", response_class=JSONResponse) +async def fetch_params(task: str, param_type: str, authenticated: bool = Depends(user_authentication)): + """ 
+ This function is used to fetch the parameters for a given task + :param task: str + :param param_type: str (basic, full) + :return: JSONResponse + """ + logger.info(f"Task: {task}") + task_params = get_task_params(task, param_type) + if len(task_params) == 0: + return {"error": "Task not found"} + ui_params = {} + for param in task_params: + if param in UI_PARAMS: + ui_params[param] = UI_PARAMS[param] + ui_params[param]["default"] = task_params[param] + else: + logger.info(f"Param {param} not found in UI_PARAMS") + + ui_params = dict(sorted(ui_params.items(), key=lambda x: (x[1]["type"], x[1]["label"]))) + return ui_params + + +@ui_router.get("/model_choices/{task}", response_class=JSONResponse) +async def fetch_model_choices( + task: str, + custom_models: str = Query(None), + authenticated: bool = Depends(user_authentication), +): + """ + This function is used to fetch the model choices for a given task + :param task: str + :param custom_models: str (optional, comma separated list of custom models, query parameter) + :return: JSONResponse + """ + resp = [] + + if custom_models is not None: + custom_models = custom_models.split(",") + for custom_model in custom_models: + custom_model = custom_model.strip() + resp.append({"id": custom_model, "name": custom_model}) + + if os.environ.get("AUTOTRAIN_CUSTOM_MODELS", None) is not None: + custom_models = os.environ.get("AUTOTRAIN_CUSTOM_MODELS") + custom_models = custom_models.split(",") + for custom_model in custom_models: + custom_model = custom_model.strip() + resp.append({"id": custom_model, "name": custom_model}) + + if task == "text-classification": + hub_models = MODEL_CHOICE["text-classification"] + elif task.startswith("llm"): + hub_models = MODEL_CHOICE["llm"] + elif task.startswith("st:"): + hub_models = MODEL_CHOICE["sentence-transformers"] + elif task == "image-classification": + hub_models = MODEL_CHOICE["image-classification"] + elif task == "seq2seq": + hub_models = MODEL_CHOICE["seq2seq"] + elif task == "tabular:classification": + hub_models = MODEL_CHOICE["tabular-classification"] + elif task == "tabular:regression": + hub_models = MODEL_CHOICE["tabular-regression"] + elif task == "token-classification": + hub_models = MODEL_CHOICE["token-classification"] + elif task == "text-regression": + hub_models = MODEL_CHOICE["text-regression"] + elif task == "image-object-detection": + hub_models = MODEL_CHOICE["image-object-detection"] + elif task == "image-regression": + hub_models = MODEL_CHOICE["image-regression"] + elif task.startswith("vlm:"): + hub_models = MODEL_CHOICE["vlm"] + elif task == "extractive-qa": + hub_models = MODEL_CHOICE["extractive-qa"] + else: + raise NotImplementedError + + for hub_model in hub_models: + resp.append({"id": hub_model, "name": hub_model}) + return resp + + +@ui_router.post("/create_project", response_class=JSONResponse) +async def handle_form( + project_name: str = Form(...), + task: str = Form(...), + base_model: str = Form(...), + hardware: str = Form(...), + params: str = Form(...), + autotrain_user: str = Form(...), + column_mapping: str = Form('{"default": "value"}'), + data_files_training: List[UploadFile] = File(None), + data_files_valid: List[UploadFile] = File(None), + hub_dataset: str = Form(""), + train_split: str = Form(""), + valid_split: str = Form(""), + token: str = Depends(user_authentication), +): + """ + Handle form submission for creating and managing AutoTrain projects. + + Args: + project_name (str): The name of the project. 
+ task (str): The task type (e.g., "image-classification", "text-classification"). + base_model (str): The base model to use for training. + hardware (str): The hardware configuration (e.g., "local-ui"). + params (str): JSON string of additional parameters. + autotrain_user (str): The username of the AutoTrain user. + column_mapping (str): JSON string mapping columns to their roles. + data_files_training (List[UploadFile]): List of training data files. + data_files_valid (List[UploadFile]): List of validation data files. + hub_dataset (str): The Hugging Face Hub dataset identifier. + train_split (str): The training split identifier. + valid_split (str): The validation split identifier. + token (str): The authentication token. + + Returns: + dict: A dictionary containing the success status and monitor URL. + + Raises: + HTTPException: If there are conflicts or validation errors in the form submission. + """ + train_split = train_split.strip() + if len(train_split) == 0: + train_split = None + + valid_split = valid_split.strip() + if len(valid_split) == 0: + valid_split = None + + logger.info(f"hardware: {hardware}") + if hardware == "local-ui": + running_jobs = get_running_jobs(DB) + if running_jobs: + raise HTTPException( + status_code=409, detail="Another job is already running. Please wait for it to finish." + ) + + if repo_exists(f"{autotrain_user}/{project_name}", token=token): + raise HTTPException( + status_code=409, + detail=f"Project {project_name} already exists. Please choose a different name.", + ) + + params = json.loads(params) + # convert "null" to None + for key in params: + if params[key] == "null": + params[key] = None + column_mapping = json.loads(column_mapping) + + training_files = [f.file for f in data_files_training if f.filename != ""] if data_files_training else [] + validation_files = [f.file for f in data_files_valid if f.filename != ""] if data_files_valid else [] + + if len(training_files) > 0 and len(hub_dataset) > 0: + raise HTTPException( + status_code=400, detail="Please either upload a dataset or choose a dataset from the Hugging Face Hub." + ) + + if len(training_files) == 0 and len(hub_dataset) == 0: + raise HTTPException( + status_code=400, detail="Please upload a dataset or choose a dataset from the Hugging Face Hub." 
+ ) + + if len(hub_dataset) > 0: + if not train_split: + raise HTTPException(status_code=400, detail="Please enter a training split.") + + if len(hub_dataset) == 0: + file_extension = os.path.splitext(data_files_training[0].filename)[1] + file_extension = file_extension[1:] if file_extension.startswith(".") else file_extension + if task == "image-classification": + dset = AutoTrainImageClassificationDataset( + train_data=training_files[0], + token=token, + project_name=project_name, + username=autotrain_user, + valid_data=validation_files[0] if validation_files else None, + percent_valid=None, # TODO: add to UI + local=hardware.lower() == "local-ui", + ) + elif task == "image-regression": + dset = AutoTrainImageRegressionDataset( + train_data=training_files[0], + token=token, + project_name=project_name, + username=autotrain_user, + valid_data=validation_files[0] if validation_files else None, + percent_valid=None, # TODO: add to UI + local=hardware.lower() == "local-ui", + ) + elif task == "image-object-detection": + dset = AutoTrainObjectDetectionDataset( + train_data=training_files[0], + token=token, + project_name=project_name, + username=autotrain_user, + valid_data=validation_files[0] if validation_files else None, + percent_valid=None, # TODO: add to UI + local=hardware.lower() == "local-ui", + ) + elif task.startswith("vlm:"): + dset = AutoTrainVLMDataset( + train_data=training_files[0], + token=token, + project_name=project_name, + username=autotrain_user, + column_mapping=column_mapping, + valid_data=validation_files[0] if validation_files else None, + percent_valid=None, # TODO: add to UI + local=hardware.lower() == "local-ui", + ) + else: + if task.startswith("llm"): + dset_task = "lm_training" + elif task.startswith("st:"): + dset_task = "sentence_transformers" + elif task == "text-classification": + dset_task = "text_multi_class_classification" + elif task == "text-regression": + dset_task = "text_single_column_regression" + elif task == "seq2seq": + dset_task = "seq2seq" + elif task.startswith("tabular"): + if "," in column_mapping["label"]: + column_mapping["label"] = column_mapping["label"].split(",") + else: + column_mapping["label"] = [column_mapping["label"]] + column_mapping["label"] = [col.strip() for col in column_mapping["label"]] + subtask = task.split(":")[-1].lower() + if len(column_mapping["label"]) > 1 and subtask == "classification": + dset_task = "tabular_multi_label_classification" + elif len(column_mapping["label"]) == 1 and subtask == "classification": + dset_task = "tabular_multi_class_classification" + elif len(column_mapping["label"]) > 1 and subtask == "regression": + dset_task = "tabular_multi_column_regression" + elif len(column_mapping["label"]) == 1 and subtask == "regression": + dset_task = "tabular_single_column_regression" + else: + raise NotImplementedError + elif task == "token-classification": + dset_task = "text_token_classification" + elif task == "extractive-qa": + dset_task = "text_extractive_question_answering" + else: + raise NotImplementedError + logger.info(f"Task: {dset_task}") + logger.info(f"Column mapping: {column_mapping}") + dset_args = dict( + train_data=training_files, + task=dset_task, + token=token, + project_name=project_name, + username=autotrain_user, + column_mapping=column_mapping, + valid_data=validation_files, + percent_valid=None, # TODO: add to UI + local=hardware.lower() == "local-ui", + ext=file_extension, + ) + if task in ("text-classification", "token-classification", "st:pair_class"): + 
dset_args["convert_to_class_label"] = True + dset = AutoTrainDataset(**dset_args) + data_path = dset.prepare() + else: + data_path = hub_dataset + app_params = AppParams( + job_params_json=json.dumps(params), + token=token, + project_name=project_name, + username=autotrain_user, + task=task, + data_path=data_path, + base_model=base_model, + column_mapping=column_mapping, + using_hub_dataset=len(hub_dataset) > 0, + train_split=None if len(hub_dataset) == 0 else train_split, + valid_split=None if len(hub_dataset) == 0 else valid_split, + ) + params = app_params.munge() + project = AutoTrainProject(params=params, backend=hardware) + job_id = project.create() + monitor_url = "" + if hardware == "local-ui": + DB.add_job(job_id) + monitor_url = "Monitor your job locally / in logs" + elif hardware.startswith("ep-"): + monitor_url = f"https://ui.endpoints.huggingface.co/{autotrain_user}/endpoints/{job_id}" + elif hardware.startswith("spaces-"): + monitor_url = f"https://hf.co/spaces/{job_id}" + else: + monitor_url = f"Success! Monitor your job in logs. Job ID: {job_id}" + + return {"success": "true", "monitor_url": monitor_url} + + +@ui_router.get("/help/{element_id}", response_class=JSONResponse) +async def fetch_help(element_id: str, authenticated: bool = Depends(user_authentication)): + """ + This function is used to fetch the help text for a given element + :param element_id: str + :return: JSONResponse + """ + msg = get_app_help(element_id) + return {"message": msg} + + +@ui_router.get("/accelerators", response_class=JSONResponse) +async def available_accelerators(authenticated: bool = Depends(user_authentication)): + """ + This function is used to fetch the number of available accelerators + :return: JSONResponse + """ + if AUTOTRAIN_LOCAL == 0: + return {"accelerators": "Not available in cloud mode."} + cuda_available = torch.cuda.is_available() + mps_available = torch.backends.mps.is_available() + if cuda_available: + num_gpus = torch.cuda.device_count() + elif mps_available: + num_gpus = 1 + else: + num_gpus = 0 + return {"accelerators": num_gpus} + + +@ui_router.get("/is_model_training", response_class=JSONResponse) +async def is_model_training(authenticated: bool = Depends(user_authentication)): + """ + This function is used to fetch the number of running jobs + :return: JSONResponse + """ + if AUTOTRAIN_LOCAL == 0: + return {"model_training": "Not available in cloud mode."} + running_jobs = get_running_jobs(DB) + if running_jobs: + return {"model_training": True, "pids": running_jobs} + return {"model_training": False, "pids": []} + + +@ui_router.get("/logs", response_class=JSONResponse) +async def fetch_logs(authenticated: bool = Depends(user_authentication)): + """ + This function is used to fetch the logs + :return: JSONResponse + """ + if not AUTOTRAIN_LOCAL: + return {"logs": "Logs are only available in local mode."} + log_file = "autotrain.log" + with open(log_file, "r", encoding="utf-8") as f: + logs = f.read() + if len(str(logs).strip()) == 0: + logs = "No logs available." 
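+    # Newest entries first; the filter below also drops the UI's own polling noise.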
+ + logs = logs.split("\n") + logs = logs[::-1] + # remove lines containing /is_model_training & /accelerators + logs = [log for log in logs if "/ui/" not in log and "/static/" not in log and "nvidia-ml-py" not in log] + + cuda_available = torch.cuda.is_available() + if cuda_available: + devices = Device.all() + device_logs = [] + for device in devices: + device_logs.append( + f"Device {device.index}: {device.name()} - {device.memory_used_human()}/{device.memory_total_human()}" + ) + device_logs.append("-----------------") + logs = device_logs + logs + return {"logs": logs} + + +@ui_router.get("/stop_training", response_class=JSONResponse) +async def stop_training(authenticated: bool = Depends(user_authentication)): + """ + This function is used to stop the training + :return: JSONResponse + """ + running_jobs = get_running_jobs(DB) + if running_jobs: + for _pid in running_jobs: + try: + kill_process_by_pid(_pid) + except Exception: + logger.info(f"Process {_pid} is already completed. Skipping...") + return {"success": True} + return {"success": False} diff --git a/src/autotrain/app/utils.py b/src/autotrain/app/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..55f6d6a6ffc1509429bece76615a2f8b37bb1198 --- /dev/null +++ b/src/autotrain/app/utils.py @@ -0,0 +1,180 @@ +import os +import signal +import sys + +import psutil +import requests + +from autotrain import config, logger + + +def graceful_exit(signum, frame): + logger.info("SIGTERM received. Performing cleanup...") + sys.exit(0) + + +signal.signal(signal.SIGTERM, graceful_exit) + + +def get_running_jobs(db): + """ + Retrieves and manages running jobs from the database. + + This function fetches the list of running jobs from the provided database object. + For each running job, it checks the process status. If the status is "completed", + "error", or "zombie", it attempts to kill the process and remove the job from the + database. After processing, it fetches and returns the updated list of running jobs. + + Args: + db: A database object that provides methods to get and delete running jobs. + + Returns: + list: An updated list of running jobs from the database. + """ + running_jobs = db.get_running_jobs() + if running_jobs: + for _pid in running_jobs: + proc_status = get_process_status(_pid) + proc_status = proc_status.strip().lower() + if proc_status in ("completed", "error", "zombie"): + logger.info(f"Killing PID: {_pid}") + try: + kill_process_by_pid(_pid) + except Exception as e: + logger.info(f"Error while killing process: {e}") + logger.info(f"Process {_pid} is already completed. Skipping...") + db.delete_job(_pid) + + running_jobs = db.get_running_jobs() + return running_jobs + + +def get_process_status(pid): + """ + Retrieve the status of a process given its PID. + + Args: + pid (int): The process ID of the process to check. + + Returns: + str: The status of the process. If the process does not exist, returns "Completed". + + Raises: + psutil.NoSuchProcess: If no process with the given PID is found. + """ + try: + process = psutil.Process(pid) + proc_status = process.status() + return proc_status + except psutil.NoSuchProcess: + logger.info(f"No process found with PID: {pid}") + return "Completed" + + +def kill_process_by_pid(pid): + """ + Kill a process by its PID (Process ID). + + This function attempts to terminate a process with the given PID using the SIGTERM signal. + It logs the outcome of the operation, whether successful or not. 
+ + Args: + pid (int): The Process ID of the process to be terminated. + + Raises: + ProcessLookupError: If no process with the given PID is found. + Exception: If an error occurs while attempting to send the SIGTERM signal. + """ + try: + os.kill(pid, signal.SIGTERM) + logger.info(f"Sent SIGTERM to process with PID {pid}") + except ProcessLookupError: + logger.error(f"No process found with PID {pid}") + except Exception as e: + logger.error(f"Failed to send SIGTERM to process with PID {pid}: {e}") + + +def token_verification(token): + """ + Verifies the provided token with the Hugging Face API and retrieves user information. + + Args: + token (str): The token to be verified. It can be either an OAuth token (starting with "hf_oauth") + or a regular token (starting with "hf_"). + + Returns: + dict: A dictionary containing user information with the following keys: + - id (str): The user ID. + - name (str): The user's preferred username. + - orgs (list): A list of organizations the user belongs to. + + Raises: + Exception: If the Hugging Face Hub is unreachable or the token is invalid. + """ + if token.startswith("hf_oauth"): + _api_url = config.HF_API + "/oauth/userinfo" + _err_msg = "/oauth/userinfo" + else: + _api_url = config.HF_API + "/api/whoami-v2" + _err_msg = "/api/whoami-v2" + headers = {} + cookies = {} + if token.startswith("hf_"): + headers["Authorization"] = f"Bearer {token}" + else: + cookies = {"token": token} + try: + response = requests.get( + _api_url, + headers=headers, + cookies=cookies, + timeout=3, + ) + except (requests.Timeout, ConnectionError) as err: + logger.error(f"Failed to request {_err_msg} - {repr(err)}") + raise Exception(f"Hugging Face Hub ({_err_msg}) is unreachable, please try again later.") + + if response.status_code != 200: + logger.error(f"Failed to request {_err_msg} - {response.status_code}") + raise Exception(f"Invalid token ({_err_msg}). Please login with a write token.") + + resp = response.json() + user_info = {} + + if token.startswith("hf_oauth"): + user_info["id"] = resp["sub"] + user_info["name"] = resp["preferred_username"] + user_info["orgs"] = [resp["orgs"][k]["preferred_username"] for k in range(len(resp["orgs"]))] + else: + user_info["id"] = resp["id"] + user_info["name"] = resp["name"] + user_info["orgs"] = [resp["orgs"][k]["name"] for k in range(len(resp["orgs"]))] + return user_info + + +def get_user_and_orgs(user_token): + """ + Retrieve the username and organizations associated with the provided user token. + + Args: + user_token (str): The token used to authenticate the user. Must be a valid write token. + + Returns: + list: A list containing the username followed by the organizations the user belongs to. + + Raises: + Exception: If the user token is None or an empty string. + """ + if user_token is None: + raise Exception("Please login with a write token.") + + if user_token is None or len(user_token) == 0: + raise Exception("Invalid token. 
Please login with a write token.") + + user_info = token_verification(token=user_token) + username = user_info["name"] + orgs = user_info["orgs"] + + who_is_training = [username] + orgs + + return who_is_training diff --git a/src/autotrain/backends/__init__.py b/src/autotrain/backends/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/backends/base.py b/src/autotrain/backends/base.py new file mode 100644 index 0000000000000000000000000000000000000000..01aac4e30c12ed5fe6b6534b09e790ac55fc9e61 --- /dev/null +++ b/src/autotrain/backends/base.py @@ -0,0 +1,163 @@ +import json +from dataclasses import dataclass +from typing import Union + +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams +from autotrain.trainers.generic.params import GenericParams +from autotrain.trainers.image_classification.params import ImageClassificationParams +from autotrain.trainers.image_regression.params import ImageRegressionParams +from autotrain.trainers.object_detection.params import ObjectDetectionParams +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams +from autotrain.trainers.seq2seq.params import Seq2SeqParams +from autotrain.trainers.tabular.params import TabularParams +from autotrain.trainers.text_classification.params import TextClassificationParams +from autotrain.trainers.text_regression.params import TextRegressionParams +from autotrain.trainers.token_classification.params import TokenClassificationParams +from autotrain.trainers.vlm.params import VLMTrainingParams + + +AVAILABLE_HARDWARE = { + # hugging face spaces + "spaces-a10g-large": "a10g-large", + "spaces-a10g-small": "a10g-small", + "spaces-a100-large": "a100-large", + "spaces-t4-medium": "t4-medium", + "spaces-t4-small": "t4-small", + "spaces-cpu-upgrade": "cpu-upgrade", + "spaces-cpu-basic": "cpu-basic", + "spaces-l4x1": "l4x1", + "spaces-l4x4": "l4x4", + "spaces-l40sx1": "l40sx1", + "spaces-l40sx4": "l40sx4", + "spaces-l40sx8": "l40sx8", + "spaces-a10g-largex2": "a10g-largex2", + "spaces-a10g-largex4": "a10g-largex4", + # ngc + "dgx-a100": "dgxa100.80g.1.norm", + "dgx-2a100": "dgxa100.80g.2.norm", + "dgx-4a100": "dgxa100.80g.4.norm", + "dgx-8a100": "dgxa100.80g.8.norm", + # hugging face endpoints + "ep-aws-useast1-s": "aws_us-east-1_gpu_small_g4dn.xlarge", + "ep-aws-useast1-m": "aws_us-east-1_gpu_medium_g5.2xlarge", + "ep-aws-useast1-l": "aws_us-east-1_gpu_large_g4dn.12xlarge", + "ep-aws-useast1-xl": "aws_us-east-1_gpu_xlarge_p4de", + "ep-aws-useast1-2xl": "aws_us-east-1_gpu_2xlarge_p4de", + "ep-aws-useast1-4xl": "aws_us-east-1_gpu_4xlarge_p4de", + "ep-aws-useast1-8xl": "aws_us-east-1_gpu_8xlarge_p4de", + # nvcf + "nvcf-l40sx1": {"id": "67bb8939-c932-429a-a446-8ae898311856"}, + "nvcf-h100x1": {"id": "848348f8-a4e2-4242-bce9-6baa1bd70a66"}, + "nvcf-h100x2": {"id": "fb006a89-451e-4d9c-82b5-33eff257e0bf"}, + "nvcf-h100x4": {"id": "21bae5af-87e5-4132-8fc0-bf3084e59a57"}, + "nvcf-h100x8": {"id": "6e0c2af6-5368-47e0-b15e-c070c2c92018"}, + # local + "local-ui": "local", + "local": "local", + "local-cli": "local", +} + + +@dataclass +class BaseBackend: + """ + BaseBackend class is responsible for initializing and validating backend configurations + for various training parameters. It supports multiple types of training parameters + including text classification, image classification, LLM training, and more. 
+ + Attributes: + params (Union[TextClassificationParams, ImageClassificationParams, LLMTrainingParams, + GenericParams, TabularParams, Seq2SeqParams, + TokenClassificationParams, TextRegressionParams, ObjectDetectionParams, + SentenceTransformersParams, ImageRegressionParams, VLMTrainingParams, + ExtractiveQuestionAnsweringParams]): Training parameters. + backend (str): Backend type. + + Methods: + __post_init__(): Initializes the backend configuration, validates parameters, + sets task IDs, and prepares environment variables. + """ + + params: Union[ + TextClassificationParams, + ImageClassificationParams, + LLMTrainingParams, + GenericParams, + TabularParams, + Seq2SeqParams, + TokenClassificationParams, + TextRegressionParams, + ObjectDetectionParams, + SentenceTransformersParams, + ImageRegressionParams, + VLMTrainingParams, + ExtractiveQuestionAnsweringParams, + ] + backend: str + + def __post_init__(self): + self.username = None + + if isinstance(self.params, GenericParams) and self.backend.startswith("local"): + raise ValueError("Local backend is not supported for GenericParams") + + if ( + self.backend.startswith("spaces-") + or self.backend.startswith("ep-") + or self.backend.startswith("ngc-") + or self.backend.startswith("nvcf-") + ): + if self.params.username is not None: + self.username = self.params.username + else: + raise ValueError("Must provide username") + + if isinstance(self.params, LLMTrainingParams): + self.task_id = 9 + elif isinstance(self.params, TextClassificationParams): + self.task_id = 2 + elif isinstance(self.params, TabularParams): + self.task_id = 26 + elif isinstance(self.params, GenericParams): + self.task_id = 27 + elif isinstance(self.params, Seq2SeqParams): + self.task_id = 28 + elif isinstance(self.params, ImageClassificationParams): + self.task_id = 18 + elif isinstance(self.params, TokenClassificationParams): + self.task_id = 4 + elif isinstance(self.params, TextRegressionParams): + self.task_id = 10 + elif isinstance(self.params, ObjectDetectionParams): + self.task_id = 29 + elif isinstance(self.params, SentenceTransformersParams): + self.task_id = 30 + elif isinstance(self.params, ImageRegressionParams): + self.task_id = 24 + elif isinstance(self.params, VLMTrainingParams): + self.task_id = 31 + elif isinstance(self.params, ExtractiveQuestionAnsweringParams): + self.task_id = 5 + else: + raise NotImplementedError + + self.available_hardware = AVAILABLE_HARDWARE + + self.wait = False + if self.backend == "local-ui": + self.wait = False + if self.backend in ("local", "local-cli"): + self.wait = True + + self.env_vars = { + "HF_TOKEN": self.params.token, + "AUTOTRAIN_USERNAME": self.username, + "PROJECT_NAME": self.params.project_name, + "TASK_ID": str(self.task_id), + "PARAMS": json.dumps(self.params.model_dump_json()), + } + self.env_vars["DATA_PATH"] = self.params.data_path + + if not isinstance(self.params, GenericParams): + self.env_vars["MODEL"] = self.params.model diff --git a/src/autotrain/backends/endpoints.py b/src/autotrain/backends/endpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..84078f01e43a19c0841b22a2ad6ca26380faace9 --- /dev/null +++ b/src/autotrain/backends/endpoints.py @@ -0,0 +1,86 @@ +import requests + +from autotrain.backends.base import BaseBackend + + +ENDPOINTS_URL = "https://api.endpoints.huggingface.cloud/v2/endpoint/" + + +class EndpointsRunner(BaseBackend): + """ + EndpointsRunner is responsible for creating and managing endpoint instances. 
+ + Methods + ------- + create(): + Creates an endpoint instance with the specified hardware and model parameters. + + create() Method + --------------- + Creates an endpoint instance with the specified hardware and model parameters. + + Parameters + ---------- + None + + Returns + ------- + str + The name of the created endpoint instance. + + Raises + ------ + requests.exceptions.RequestException + If there is an issue with the HTTP request. + """ + + def create(self): + hardware = self.available_hardware[self.backend] + accelerator = hardware.split("_")[2] + instance_size = hardware.split("_")[3] + region = hardware.split("_")[1] + vendor = hardware.split("_")[0] + instance_type = hardware.split("_")[4] + payload = { + "accountId": self.username, + "compute": { + "accelerator": accelerator, + "instanceSize": instance_size, + "instanceType": instance_type, + "scaling": {"maxReplica": 1, "minReplica": 1}, + }, + "model": { + "framework": "custom", + "image": { + "custom": { + "env": { + "HF_TOKEN": self.params.token, + "AUTOTRAIN_USERNAME": self.username, + "PROJECT_NAME": self.params.project_name, + "PARAMS": self.params.model_dump_json(), + "DATA_PATH": self.params.data_path, + "TASK_ID": str(self.task_id), + "MODEL": self.params.model, + "ENDPOINT_ID": f"{self.username}/{self.params.project_name}", + }, + "health_route": "/", + "port": 7860, + "url": "public.ecr.aws/z4c3o6n6/autotrain-api:latest", + } + }, + "repository": "autotrain-projects/autotrain-advanced", + "revision": "main", + "task": "custom", + }, + "name": self.params.project_name, + "provider": {"region": region, "vendor": vendor}, + "type": "protected", + } + headers = {"Authorization": f"Bearer {self.params.token}"} + r = requests.post( + ENDPOINTS_URL + self.username, + json=payload, + headers=headers, + timeout=120, + ) + return r.json()["name"] diff --git a/src/autotrain/backends/local.py b/src/autotrain/backends/local.py new file mode 100644 index 0000000000000000000000000000000000000000..442398141a319e4d21535891b20c751074e240b8 --- /dev/null +++ b/src/autotrain/backends/local.py @@ -0,0 +1,26 @@ +from autotrain import logger +from autotrain.backends.base import BaseBackend +from autotrain.utils import run_training + + +class LocalRunner(BaseBackend): + """ + LocalRunner is a class that inherits from BaseBackend and is responsible for managing local training tasks. + + Methods: + create(): + Starts the local training process by retrieving parameters and task ID from environment variables. + Logs the start of the training process. + Runs the training with the specified parameters and task ID. + If the `wait` attribute is False, logs the training process ID (PID). + Returns the training process ID (PID). 
+ """ + + def create(self): + logger.info("Starting local training...") + params = self.env_vars["PARAMS"] + task_id = int(self.env_vars["TASK_ID"]) + training_pid = run_training(params, task_id, local=True, wait=self.wait) + if not self.wait: + logger.info(f"Training PID: {training_pid}") + return training_pid diff --git a/src/autotrain/backends/ngc.py b/src/autotrain/backends/ngc.py new file mode 100644 index 0000000000000000000000000000000000000000..35cb49d5e9f3b3de20a240537add90262683c715 --- /dev/null +++ b/src/autotrain/backends/ngc.py @@ -0,0 +1,107 @@ +import base64 +import json +import os + +import requests +from requests.exceptions import HTTPError + +from autotrain import logger +from autotrain.backends.base import BaseBackend + + +NGC_API = os.environ.get("NGC_API", "https://api.ngc.nvidia.com/v2/org") +NGC_AUTH = os.environ.get("NGC_AUTH", "https://authn.nvidia.com") +NGC_ACE = os.environ.get("NGC_ACE") +NGC_ORG = os.environ.get("NGC_ORG") +NGC_API_KEY = os.environ.get("NGC_CLI_API_KEY") +NGC_TEAM = os.environ.get("NGC_TEAM") + + +class NGCRunner(BaseBackend): + """ + NGCRunner class for managing NGC backend trainings. + + Methods: + _user_authentication_ngc(): + Authenticates the user with NGC and retrieves an authentication token. + Returns: + str: The authentication token. + Raises: + Exception: If an HTTP error or connection error occurs during the request. + + _create_ngc_job(token, url, payload): + Creates a job on NGC using the provided token, URL, and payload. + Args: + token (str): The authentication token. + url (str): The URL for the NGC API endpoint. + payload (dict): The payload containing job details. + Returns: + str: The ID of the created job. + Raises: + Exception: If an HTTP error or connection error occurs during the request. + + create(): + Creates a job on NGC with the specified parameters. + Returns: + str: The ID of the created job. + """ + + def _user_authentication_ngc(self): + logger.info("Authenticating NGC user...") + scope = "group/ngc" + + querystring = {"service": "ngc", "scope": scope} + auth = f"$oauthtoken:{NGC_API_KEY}" + headers = { + "Authorization": f"Basic {base64.b64encode(auth.encode('utf-8')).decode('utf-8')}", + "Content-Type": "application/json", + "Cache-Control": "no-cache", + } + try: + response = requests.get(NGC_AUTH + "/token", headers=headers, params=querystring, timeout=30) + except HTTPError as http_err: + logger.error(f"HTTP error occurred: {http_err}") + raise Exception("HTTP Error %d: from '%s'" % (response.status_code, NGC_AUTH)) + except (requests.Timeout, ConnectionError) as err: + logger.error(f"Failed to request NGC token - {repr(err)}") + raise Exception("%s is unreachable, please try again later." 
% NGC_AUTH) + return json.loads(response.text.encode("utf8"))["token"] + + def _create_ngc_job(self, token, url, payload): + logger.info("Creating NGC Job") + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"} + try: + response = requests.post(NGC_API + url + "/jobs", headers=headers, json=payload, timeout=30) + result = response.json() + logger.info( + f"NGC Job ID: {result.get('job', {}).get('id')}, Job Status History: {result.get('jobStatusHistory')}" + ) + return result.get("job", {}).get("id") + except HTTPError as http_err: + logger.error(f"HTTP error occurred: {http_err}") + raise Exception(f"HTTP Error {response.status_code}: {http_err}") + except (requests.Timeout, ConnectionError) as err: + logger.error(f"Failed to create NGC job - {repr(err)}") + raise Exception(f"Unreachable, please try again later: {err}") + + def create(self): + job_name = f"{self.username}-{self.params.project_name}" + ngc_url = f"/{NGC_ORG}/team/{NGC_TEAM}" + ngc_cmd = "set -x; conda run --no-capture-output -p /app/env autotrain api --port 7860 --host 0.0.0.0" + ngc_payload = { + "name": job_name, + "aceName": NGC_ACE, + "aceInstance": self.available_hardware[self.backend], + "dockerImageName": f"{NGC_ORG}/autotrain-advanced:latest", + "command": ngc_cmd, + "envs": [{"name": key, "value": value} for key, value in self.env_vars.items()], + "jobOrder": 50, + "jobPriority": "NORMAL", + "portMappings": [{"containerPort": 7860, "protocol": "HTTPS"}], + "resultContainerMountPoint": "/results", + "runPolicy": {"preemptClass": "RUNONCE", "totalRuntimeSeconds": 259200}, + } + + ngc_token = self._user_authentication_ngc() + job_id = self._create_ngc_job(ngc_token, ngc_url, ngc_payload) + return job_id diff --git a/src/autotrain/backends/nvcf.py b/src/autotrain/backends/nvcf.py new file mode 100644 index 0000000000000000000000000000000000000000..fc0bb445cac8b9be439f96b24ca5b56238de9257 --- /dev/null +++ b/src/autotrain/backends/nvcf.py @@ -0,0 +1,203 @@ +import os +import threading +import time +from types import SimpleNamespace + +import requests + +from autotrain import logger +from autotrain.backends.base import BaseBackend + + +NVCF_API = "https://huggingface.co/api/integrations/dgx/v1" + + +class NVCFRunner(BaseBackend): + """ + NVCFRunner is a backend class responsible for managing and executing NVIDIA NVCF jobs. + + Methods + ------- + _convert_dict_to_object(dictionary): + Recursively converts a dictionary to an object using SimpleNamespace. + + _conf_nvcf(token, nvcf_type, url, job_name, method="POST", payload=None): + Configures and submits an NVCF job using the specified parameters. + + _poll_nvcf(url, token, job_name, method="get", timeout=86400, interval=30, op="poll"): + Polls the status of an NVCF job until completion or timeout. + + create(): + Initiates the creation and polling of an NVCF job. 
+ """ + + def _convert_dict_to_object(self, dictionary): + if isinstance(dictionary, dict): + for key, value in dictionary.items(): + dictionary[key] = self._convert_dict_to_object(value) + return SimpleNamespace(**dictionary) + elif isinstance(dictionary, list): + return [self._convert_dict_to_object(item) for item in dictionary] + else: + return dictionary + + def _conf_nvcf(self, token, nvcf_type, url, job_name, method="POST", payload=None): + logger.info(f"{job_name}: {method} - Configuring NVCF {nvcf_type}.") + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"} + try: + if method.upper() == "POST": + response = requests.post(url, headers=headers, json=payload, timeout=30) + else: + raise ValueError(f"Unsupported HTTP method: {method}") + + response.raise_for_status() + + if response.status_code == 202: + logger.info(f"{job_name}: {method} - Successfully submitted NVCF job. Polling reqId for completion") + response_data = response.json() + nvcf_reqid = response_data.get("nvcfRequestId") + if nvcf_reqid: + logger.info(f"{job_name}: nvcfRequestId: {nvcf_reqid}") + return nvcf_reqid + logger.warning(f"{job_name}: nvcfRequestId key is missing in the response body") + return None + + result = response.json() + result_obj = self._convert_dict_to_object(result) + logger.info(f"{job_name}: {method} - Successfully processed NVCF {nvcf_type}.") + return result_obj + + except requests.HTTPError as http_err: + # Log the response body for more context + error_message = http_err.response.text if http_err.response else "No additional error information." + logger.error( + f"{job_name}: HTTP error occurred processing NVCF {nvcf_type} with {method} request: {http_err}. " + f"Error details: {error_message}" + ) + raise Exception(f"HTTP Error {http_err.response.status_code}: {http_err}. 
Details: {error_message}") + + except (requests.Timeout, ConnectionError) as err: + logger.error(f"{job_name}: Failed to process NVCF {nvcf_type} with {method} request - {repr(err)}") + raise Exception(f"Unreachable, please try again later: {err}") + + def _poll_nvcf(self, url, token, job_name, method="get", timeout=86400, interval=30, op="poll"): + timeout = float(timeout) + interval = float(interval) + start_time = time.time() + success = False + last_full_log = "" + + while time.time() - start_time < timeout: + try: + headers = {"Content-Type": "application/json", "Authorization": f"Bearer {token}"} + if method.upper() == "GET": + response = requests.get(url, headers=headers) + else: + raise ValueError(f"Unsupported HTTP method: {method}") + + if response.status_code == 404 and success: + break + + response.raise_for_status() + + try: + data = response.json() + except ValueError: + logger.error("Failed to parse JSON from response") + continue + + if response.status_code == 500: + logger.error("Training failed") + if "detail" in data: + detail_message = data["detail"] + for line in detail_message.split("\n"): + if line.strip(): + print(line) + break + + if response.status_code in [200, 202]: + logger.info( + f"{job_name}: {method} - {response.status_code} - {'Polling completed' if response.status_code == 200 else 'Polling reqId for completion'}" + ) + + if "log" in data: + current_full_log = data["log"] + if current_full_log != last_full_log: + new_log_content = current_full_log[len(last_full_log) :] + for line in new_log_content.split("\n"): + if line.strip(): + print(line) + last_full_log = current_full_log + + if response.status_code == 200: + success = True + + except requests.HTTPError as http_err: + if not (http_err.response.status_code == 404 and success): + logger.error(f"HTTP error occurred: {http_err}") + except (requests.ConnectionError, ValueError) as err: + logger.error(f"Error while handling request: {err}") + + time.sleep(interval) + + if not success: + raise TimeoutError(f"Operation '{op}' did not complete successfully within the timeout period.") + + def create(self): + hf_token = self.env_vars["HF_TOKEN"] + job_name = f"{self.username}-{self.params.project_name}" + + logger.info("Starting NVCF training") + logger.info(f"job_name: {job_name}") + logger.info(f"backend: {self.backend}") + + nvcf_url_submit = f"{NVCF_API}/invoke/{self.available_hardware[self.backend]['id']}" + org_name = os.environ.get("SPACE_ID") + if org_name is None: + raise ValueError("SPACE_ID environment variable is not set") + org_name = org_name.split("/")[0] + nvcf_fr_payload = { + "cmd": [ + "conda", + "run", + "--no-capture-output", + "-p", + "/app/env", + "python", + "-u", + "-m", + "uvicorn", + "autotrain.app.training_api:api", + "--host", + "0.0.0.0", + "--port", + "7860", + ], + "env": {key: value for key, value in self.env_vars.items()}, + "ORG_NAME": org_name, + } + + nvcf_fn_req = self._conf_nvcf( + token=hf_token, + nvcf_type="job_submit", + url=nvcf_url_submit, + job_name=job_name, + method="POST", + payload=nvcf_fr_payload, + ) + + nvcf_url_reqpoll = f"{NVCF_API}/status/{nvcf_fn_req}" + logger.info(f"{job_name}: Polling : {nvcf_url_reqpoll}") + poll_thread = threading.Thread( + target=self._poll_nvcf, + kwargs={ + "url": nvcf_url_reqpoll, + "token": hf_token, + "job_name": job_name, + "method": "GET", + "timeout": 172800, + "interval": 20, + }, + ) + poll_thread.start() + return nvcf_fn_req diff --git a/src/autotrain/backends/spaces.py b/src/autotrain/backends/spaces.py new file mode 
100644 index 0000000000000000000000000000000000000000..a19976c510604f0de71dd18b3eebc8d694f73752 --- /dev/null +++ b/src/autotrain/backends/spaces.py @@ -0,0 +1,93 @@ +import io + +from huggingface_hub import HfApi + +from autotrain.backends.base import BaseBackend +from autotrain.trainers.generic.params import GenericParams + + +_DOCKERFILE = """ +FROM huggingface/autotrain-advanced:latest + +CMD pip uninstall -y autotrain-advanced && pip install -U autotrain-advanced && autotrain api --port 7860 --host 0.0.0.0 +""" + +# format _DOCKERFILE: collapse the triple-quoted string to one line, then restore newlines at the double-space markers +_DOCKERFILE = _DOCKERFILE.replace("\n", " ").replace("  ", "\n").strip() + + +class SpaceRunner(BaseBackend): + """ + SpaceRunner is a backend class responsible for creating and managing training jobs on Hugging Face Spaces. + + Methods + ------- + _create_readme(): + Creates a README.md file content for the space. + + _add_secrets(api, space_id): + Adds necessary secrets to the space repository. + + create(): + Creates a new space repository, adds secrets, and uploads necessary files. + """ + + def _create_readme(self): + _readme = "---\n" + _readme += f"title: {self.params.project_name}\n" + _readme += "emoji: 🚀\n" + _readme += "colorFrom: green\n" + _readme += "colorTo: indigo\n" + _readme += "sdk: docker\n" + _readme += "pinned: false\n" + _readme += "tags:\n" + _readme += "- autotrain\n" + _readme += "duplicated_from: autotrain-projects/autotrain-advanced\n" + _readme += "---\n" + _readme = io.BytesIO(_readme.encode()) + return _readme + + def _add_secrets(self, api, space_id): + if isinstance(self.params, GenericParams): + for k, v in self.params.env.items(): + api.add_space_secret(repo_id=space_id, key=k, value=v) + self.params.env = {} + + api.add_space_secret(repo_id=space_id, key="HF_TOKEN", value=self.params.token) + api.add_space_secret(repo_id=space_id, key="AUTOTRAIN_USERNAME", value=self.username) + api.add_space_secret(repo_id=space_id, key="PROJECT_NAME", value=self.params.project_name) + api.add_space_secret(repo_id=space_id, key="TASK_ID", value=str(self.task_id)) + api.add_space_secret(repo_id=space_id, key="PARAMS", value=self.params.model_dump_json()) + api.add_space_secret(repo_id=space_id, key="DATA_PATH", value=self.params.data_path) + + if not isinstance(self.params, GenericParams): + api.add_space_secret(repo_id=space_id, key="MODEL", value=self.params.model) + + def create(self): + api = HfApi(token=self.params.token) + space_id = f"{self.username}/autotrain-{self.params.project_name}" + api.create_repo( + repo_id=space_id, + repo_type="space", + space_sdk="docker", + space_hardware=self.available_hardware[self.backend], + private=True, + ) + self._add_secrets(api, space_id) + api.set_space_sleep_time(repo_id=space_id, sleep_time=604800) + readme = self._create_readme() + api.upload_file( + path_or_fileobj=readme, + path_in_repo="README.md", + repo_id=space_id, + repo_type="space", + ) + + _dockerfile = io.BytesIO(_DOCKERFILE.encode()) + api.upload_file( + path_or_fileobj=_dockerfile, + path_in_repo="Dockerfile", + repo_id=space_id, + repo_type="space", + ) + return space_id diff --git a/src/autotrain/cli/__init__.py b/src/autotrain/cli/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..02b80c7e1fdd1e09b445b058195a9722b3fbb978 --- /dev/null +++ b/src/autotrain/cli/__init__.py @@ -0,0 +1,13 @@ +from abc import ABC, abstractmethod +from argparse import ArgumentParser + + +class BaseAutoTrainCommand(ABC): + @staticmethod + @abstractmethod + def register_subcommand(parser: ArgumentParser): + raise
NotImplementedError() + + @abstractmethod + def run(self): + raise NotImplementedError() diff --git a/src/autotrain/cli/autotrain.py b/src/autotrain/cli/autotrain.py new file mode 100644 index 0000000000000000000000000000000000000000..fcd85b9828b597ee6408a6ac75f2a1af515694e5 --- /dev/null +++ b/src/autotrain/cli/autotrain.py @@ -0,0 +1,72 @@ +import argparse + +from autotrain import __version__, logger +from autotrain.cli.run_api import RunAutoTrainAPICommand +from autotrain.cli.run_app import RunAutoTrainAppCommand +from autotrain.cli.run_extractive_qa import RunAutoTrainExtractiveQACommand +from autotrain.cli.run_image_classification import RunAutoTrainImageClassificationCommand +from autotrain.cli.run_image_regression import RunAutoTrainImageRegressionCommand +from autotrain.cli.run_llm import RunAutoTrainLLMCommand +from autotrain.cli.run_object_detection import RunAutoTrainObjectDetectionCommand +from autotrain.cli.run_sent_tranformers import RunAutoTrainSentenceTransformersCommand +from autotrain.cli.run_seq2seq import RunAutoTrainSeq2SeqCommand +from autotrain.cli.run_setup import RunSetupCommand +from autotrain.cli.run_spacerunner import RunAutoTrainSpaceRunnerCommand +from autotrain.cli.run_tabular import RunAutoTrainTabularCommand +from autotrain.cli.run_text_classification import RunAutoTrainTextClassificationCommand +from autotrain.cli.run_text_regression import RunAutoTrainTextRegressionCommand +from autotrain.cli.run_token_classification import RunAutoTrainTokenClassificationCommand +from autotrain.cli.run_tools import RunAutoTrainToolsCommand +from autotrain.parser import AutoTrainConfigParser + + +def main(): + parser = argparse.ArgumentParser( + "AutoTrain advanced CLI", + usage="autotrain <command> [<args>]", + epilog="For more information about a command, run: `autotrain <command> --help`", + ) + parser.add_argument("--version", "-v", help="Display AutoTrain version", action="store_true") + parser.add_argument("--config", help="Optional configuration file", type=str) + commands_parser = parser.add_subparsers(help="commands") + + # Register commands + RunAutoTrainAppCommand.register_subcommand(commands_parser) + RunAutoTrainLLMCommand.register_subcommand(commands_parser) + RunSetupCommand.register_subcommand(commands_parser) + RunAutoTrainAPICommand.register_subcommand(commands_parser) + RunAutoTrainTextClassificationCommand.register_subcommand(commands_parser) + RunAutoTrainImageClassificationCommand.register_subcommand(commands_parser) + RunAutoTrainTabularCommand.register_subcommand(commands_parser) + RunAutoTrainSpaceRunnerCommand.register_subcommand(commands_parser) + RunAutoTrainSeq2SeqCommand.register_subcommand(commands_parser) + RunAutoTrainTokenClassificationCommand.register_subcommand(commands_parser) + RunAutoTrainToolsCommand.register_subcommand(commands_parser) + RunAutoTrainTextRegressionCommand.register_subcommand(commands_parser) + RunAutoTrainObjectDetectionCommand.register_subcommand(commands_parser) + RunAutoTrainSentenceTransformersCommand.register_subcommand(commands_parser) + RunAutoTrainImageRegressionCommand.register_subcommand(commands_parser) + RunAutoTrainExtractiveQACommand.register_subcommand(commands_parser) + + args = parser.parse_args() + + if args.version: + print(__version__) + exit(0) + + if args.config: + logger.info(f"Using AutoTrain configuration: {args.config}") + cp = AutoTrainConfigParser(args.config) + cp.run() + exit(0) + + if not hasattr(args, "func"): + parser.print_help() + exit(1) + + command = args.func(args) + command.run() + + +if __name__ ==
"__main__": + main() diff --git a/src/autotrain/cli/run_api.py b/src/autotrain/cli/run_api.py new file mode 100644 index 0000000000000000000000000000000000000000..6edfd5d6bc2da44699ad1417a1962b4ca905af00 --- /dev/null +++ b/src/autotrain/cli/run_api.py @@ -0,0 +1,70 @@ +from argparse import ArgumentParser + +from . import BaseAutoTrainCommand + + +def run_api_command_factory(args): + return RunAutoTrainAPICommand( + args.port, + args.host, + args.task, + ) + + +class RunAutoTrainAPICommand(BaseAutoTrainCommand): + """ + Command to run the AutoTrain API. + + This command sets up and runs the AutoTrain API using the specified host and port. + + Methods + ------- + register_subcommand(parser: ArgumentParser) + Registers the 'api' subcommand and its arguments to the provided parser. + + __init__(port: int, host: str, task: str) + Initializes the command with the specified port, host, and task. + + run() + Runs the AutoTrain API using the uvicorn server. + """ + + @staticmethod + def register_subcommand(parser: ArgumentParser): + run_api_parser = parser.add_parser( + "api", + description="✨ Run AutoTrain API", + ) + run_api_parser.add_argument( + "--port", + type=int, + default=7860, + help="Port to run the api on", + required=False, + ) + run_api_parser.add_argument( + "--host", + type=str, + default="127.0.0.1", + help="Host to run the api on", + required=False, + ) + run_api_parser.add_argument( + "--task", + type=str, + required=False, + help="Task to run", + ) + run_api_parser.set_defaults(func=run_api_command_factory) + + def __init__(self, port, host, task): + self.port = port + self.host = host + self.task = task + + def run(self): + import uvicorn + + from autotrain.app.training_api import api + + uvicorn.run(api, host=self.host, port=self.port) diff --git a/src/autotrain/cli/run_app.py b/src/autotrain/cli/run_app.py new file mode 100644 index 0000000000000000000000000000000000000000..bdf0ea196912c6b85f7ad44a86dc4c528533b2b8 --- /dev/null +++ b/src/autotrain/cli/run_app.py @@ -0,0 +1,169 @@ +import os +import signal +import subprocess +import sys +import threading +from argparse import ArgumentParser + +from autotrain import logger + +from . import BaseAutoTrainCommand + + +def handle_output(stream, log_file): + """ + Continuously reads lines from a given stream and writes them to both + standard output and a log file until the stream is exhausted. + + Args: + stream (io.TextIOBase): The input stream to read lines from. + log_file (io.TextIOBase): The log file to write lines to. + + Returns: + None + """ + while True: + line = stream.readline() + if not line: + break + sys.stdout.write(line) + sys.stdout.flush() + log_file.write(line) + log_file.flush() + + +def run_app_command_factory(args): + return RunAutoTrainAppCommand(args.port, args.host, args.share, args.workers, args.colab) + + +class RunAutoTrainAppCommand(BaseAutoTrainCommand): + """ + Command to run the AutoTrain application. + + This command sets up and runs the AutoTrain application with the specified + configuration options such as port, host, number of workers, and sharing options. + + Methods + ------- + register_subcommand(parser: ArgumentParser): + Registers the subcommand and its arguments to the provided parser. + + __init__(port: int, host: str, share: bool, workers: int, colab: bool): + Initializes the command with the specified parameters. + + run(): + Executes the command to run the AutoTrain application. Handles different + modes such as running in Colab or sharing via ngrok. 
+ """ + + @staticmethod + def register_subcommand(parser: ArgumentParser): + run_app_parser = parser.add_parser( + "app", + description="✨ Run AutoTrain app", + ) + run_app_parser.add_argument( + "--port", + type=int, + default=7860, + help="Port to run the app on", + required=False, + ) + run_app_parser.add_argument( + "--host", + type=str, + default="127.0.0.1", + help="Host to run the app on", + required=False, + ) + run_app_parser.add_argument( + "--workers", + type=int, + default=1, + help="Number of workers to run the app with", + required=False, + ) + run_app_parser.add_argument( + "--share", + action="store_true", + help="Share the app on ngrok", + required=False, + ) + run_app_parser.add_argument( + "--colab", + action="store_true", + help="Use app in colab", + required=False, + ) + run_app_parser.set_defaults(func=run_app_command_factory) + + def __init__(self, port, host, share, workers, colab): + self.port = port + self.host = host + self.share = share + self.workers = workers + self.colab = colab + + def run(self): + if self.colab: + from IPython.display import display + + from autotrain.app.colab import colab_app + + elements = colab_app() + display(elements) + return + + if self.share: + from pyngrok import ngrok + + os.system(f"fuser -n tcp -k {self.port}") + authtoken = os.environ.get("NGROK_AUTH_TOKEN", "") + if authtoken.strip() == "": + logger.info("NGROK_AUTH_TOKEN not set") + raise ValueError("NGROK_AUTH_TOKEN not set. Please set it!") + + ngrok.set_auth_token(authtoken) + active_tunnels = ngrok.get_tunnels() + for tunnel in active_tunnels: + public_url = tunnel.public_url + ngrok.disconnect(public_url) + url = ngrok.connect(addr=self.port, bind_tls=True) + logger.info(f"AutoTrain Public URL: {url}") + logger.info("Please wait for the app to load...") + + command = f"uvicorn autotrain.app.app:app --host {self.host} --port {self.port}" + command += f" --workers {self.workers}" + + with open("autotrain.log", "w", encoding="utf-8") as log_file: + if sys.platform == "win32": + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, text=True, bufsize=1 + ) + + else: + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + shell=True, + text=True, + bufsize=1, + preexec_fn=os.setsid, + ) + + output_thread = threading.Thread(target=handle_output, args=(process.stdout, log_file)) + output_thread.start() + + try: + process.wait() + output_thread.join() + except KeyboardInterrupt: + logger.warning("Attempting to terminate the process...") + if sys.platform == "win32": + process.terminate() + else: + # If user cancels (Ctrl+C), terminate the subprocess + # Use os.killpg to send SIGTERM to the process group, ensuring all child processes are killed + os.killpg(os.getpgid(process.pid), signal.SIGTERM) + logger.info("Process terminated by user") diff --git a/src/autotrain/cli/run_extractive_qa.py b/src/autotrain/cli/run_extractive_qa.py new file mode 100644 index 0000000000000000000000000000000000000000..6062fbb345f55b59a014f13cacfdee60641ca11a --- /dev/null +++ b/src/autotrain/cli/run_extractive_qa.py @@ -0,0 +1,105 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams + +from . 
import BaseAutoTrainCommand + + +def run_extractive_qa_command_factory(args): + return RunAutoTrainExtractiveQACommand(args) + + +class RunAutoTrainExtractiveQACommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(ExtractiveQuestionAnsweringParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend to use for training", + "required": False, + "default": "local", + }, + ] + arg_list + arg_list = [arg for arg in arg_list if arg["arg"] != "--disable-gradient-checkpointing"] + run_extractive_qa_parser = parser.add_parser( + "extractive-qa", description="✨ Run AutoTrain Extractive Question Answering" + ) + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_extractive_qa_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_extractive_qa_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_extractive_qa_parser.set_defaults(func=run_extractive_qa_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "auto_find_batch_size", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + def run(self): + logger.info("Running Extractive Question Answering") + if self.args.train: + params = ExtractiveQuestionAnsweringParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_image_classification.py b/src/autotrain/cli/run_image_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..64430f7a2a4a2c8aa8082d6095f63dc516e3cea3 --- /dev/null +++ b/src/autotrain/cli/run_image_classification.py @@ -0,0 +1,113 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.image_classification.params import ImageClassificationParams + +from . 
import BaseAutoTrainCommand + + +def run_image_classification_command_factory(args): + return RunAutoTrainImageClassificationCommand(args) + + +class RunAutoTrainImageClassificationCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(ImageClassificationParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + run_image_classification_parser = parser.add_parser( + "image-classification", description="✨ Run AutoTrain Image Classification" + ) + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_image_classification_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_image_classification_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_image_classification_parser.set_defaults(func=run_image_classification_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "auto_find_batch_size", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + if self.args.backend.startswith("spaces") or self.args.backend.startswith("ep-"): + if not self.args.push_to_hub: + raise ValueError("Push to hub must be specified for spaces backend") + if self.args.username is None: + raise ValueError("Username must be specified for spaces backend") + if self.args.token is None: + raise ValueError("Token must be specified for spaces backend") + + def run(self): + logger.info("Running Image Classification") + if self.args.train: + params = ImageClassificationParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_image_regression.py b/src/autotrain/cli/run_image_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..713a5b2c590c1f5822d50d70669144bcfe8950f7 --- /dev/null +++ b/src/autotrain/cli/run_image_regression.py @@ -0,0 +1,113 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from 
autotrain.project import AutoTrainProject +from autotrain.trainers.image_regression.params import ImageRegressionParams + +from . import BaseAutoTrainCommand + + +def run_image_regression_command_factory(args): + return RunAutoTrainImageRegressionCommand(args) + + +class RunAutoTrainImageRegressionCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(ImageRegressionParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + run_image_regression_parser = parser.add_parser( + "image-regression", description="✨ Run AutoTrain Image Regression" + ) + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_image_regression_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_image_regression_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_image_regression_parser.set_defaults(func=run_image_regression_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "auto_find_batch_size", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + if self.args.backend.startswith("spaces") or self.args.backend.startswith("ep-"): + if not self.args.push_to_hub: + raise ValueError("Push to hub must be specified for spaces backend") + if self.args.username is None: + raise ValueError("Username must be specified for spaces backend") + if self.args.token is None: + raise ValueError("Token must be specified for spaces backend") + + def run(self): + logger.info("Running Image Regression") + if self.args.train: + params = ImageRegressionParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_llm.py b/src/autotrain/cli/run_llm.py new file mode 100644 index 0000000000000000000000000000000000000000..c2d3236cb783b332875e2a6f9701ed23abafcce5 --- /dev/null +++ b/src/autotrain/cli/run_llm.py @@ -0,0 +1,141 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils 
import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.clm.params import LLMTrainingParams + +from . import BaseAutoTrainCommand + + +def run_llm_command_factory(args): + return RunAutoTrainLLMCommand(args) + + +class RunAutoTrainLLMCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(LLMTrainingParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + arg_list = [arg for arg in arg_list if arg["arg"] != "--block-size"] + arg_list.append( + { + "arg": "--block_size", + "help": "Block size", + "required": False, + "type": str, + "default": "1024", + "alias": ["--block-size"], + } + ) + run_llm_parser = parser.add_parser("llm", description="✨ Run AutoTrain LLM") + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_llm_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_llm_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_llm_parser.set_defaults(func=run_llm_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "add_eos_token", + "peft", + "auto_find_batch_size", + "push_to_hub", + "merge_adapter", + "use_flash_attention_2", + "disable_gradient_checkpointing", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + block_size_split = self.args.block_size.strip().split(",") + if len(block_size_split) == 1: + self.args.block_size = int(block_size_split[0]) + elif len(block_size_split) > 1: + self.args.block_size = [int(x.strip()) for x in block_size_split] + else: + raise ValueError("Invalid block size") + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + # must have project_name, username and token OR project_name, token + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + if self.args.token is None: + raise ValueError("Token must be specified for push to hub") + + if self.args.backend.startswith("spaces") or self.args.backend.startswith("ep-"): + if not self.args.push_to_hub: + raise ValueError("Push to hub must be specified for spaces backend") + if self.args.username is None: + raise ValueError("Username must be specified for spaces backend") + if self.args.token is None: + raise ValueError("Token must be specified for spaces backend") + + if self.args.deploy:
+ raise NotImplementedError("Deploy is not implemented yet") + if self.args.inference: + raise NotImplementedError("Inference is not implemented yet") + + def run(self): + logger.info("Running LLM") + if self.args.train: + params = LLMTrainingParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_object_detection.py b/src/autotrain/cli/run_object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..3fd63fbc4a226ff493fbd42be507675c2403bd41 --- /dev/null +++ b/src/autotrain/cli/run_object_detection.py @@ -0,0 +1,113 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.object_detection.params import ObjectDetectionParams + +from . import BaseAutoTrainCommand + + +def run_object_detection_command_factory(args): + return RunAutoTrainObjectDetectionCommand(args) + + +class RunAutoTrainObjectDetectionCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(ObjectDetectionParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + run_object_detection_parser = parser.add_parser( + "object-detection", description="✨ Run AutoTrain Object Detection" + ) + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_object_detection_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_object_detection_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_object_detection_parser.set_defaults(func=run_object_detection_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "auto_find_batch_size", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + if self.args.backend.startswith("spaces") or self.args.backend.startswith("ep-"): + if not self.args.push_to_hub: + raise ValueError("Push to hub must be specified for spaces 
backend") + if self.args.username is None: + raise ValueError("Username must be specified for spaces backend") + if self.args.token is None: + raise ValueError("Token must be specified for spaces backend") + + def run(self): + logger.info("Running Object Detection") + if self.args.train: + params = ObjectDetectionParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_sent_tranformers.py b/src/autotrain/cli/run_sent_tranformers.py new file mode 100644 index 0000000000000000000000000000000000000000..a6858dee2f63637a9fa8d92ea95e0f070d572ab1 --- /dev/null +++ b/src/autotrain/cli/run_sent_tranformers.py @@ -0,0 +1,113 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams + +from . import BaseAutoTrainCommand + + +def run_sentence_transformers_command_factory(args): + return RunAutoTrainSentenceTransformersCommand(args) + + +class RunAutoTrainSentenceTransformersCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(SentenceTransformersParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + run_sentence_transformers_parser = parser.add_parser( + "sentence-transformers", description="✨ Run AutoTrain Sentence Transformers" + ) + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_sentence_transformers_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_sentence_transformers_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_sentence_transformers_parser.set_defaults(func=run_sentence_transformers_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "auto_find_batch_size", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + if self.args.backend.startswith("spaces") or 
self.args.backend.startswith("ep-"): + if not self.args.push_to_hub: + raise ValueError("Push to hub must be specified for spaces backend") + if self.args.username is None: + raise ValueError("Username must be specified for spaces backend") + if self.args.token is None: + raise ValueError("Token must be specified for spaces backend") + + def run(self): + logger.info("Running Sentence Transformers...") + if self.args.train: + params = SentenceTransformersParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_seq2seq.py b/src/autotrain/cli/run_seq2seq.py new file mode 100644 index 0000000000000000000000000000000000000000..0a7aaef0d556296c3364c8a9f3567bd9ce37cd84 --- /dev/null +++ b/src/autotrain/cli/run_seq2seq.py @@ -0,0 +1,97 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.seq2seq.params import Seq2SeqParams + +from . import BaseAutoTrainCommand + + +def run_seq2seq_command_factory(args): + return RunAutoTrainSeq2SeqCommand(args) + + +class RunAutoTrainSeq2SeqCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(Seq2SeqParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + run_seq2seq_parser = parser.add_parser("seq2seq", description="✨ Run AutoTrain Seq2Seq") + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_seq2seq_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_seq2seq_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_seq2seq_parser.set_defaults(func=run_seq2seq_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = ["train", "deploy", "inference", "auto_find_batch_size", "push_to_hub", "peft"] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + def run(self): + logger.info("Running Seq2Seq Classification") + if self.args.train: + params = 
Seq2SeqParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_setup.py b/src/autotrain/cli/run_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..f64727a99cc819b95ce5e8c194f556a014fa6b97 --- /dev/null +++ b/src/autotrain/cli/run_setup.py @@ -0,0 +1,53 @@ +import subprocess +from argparse import ArgumentParser + +from autotrain import logger + +from . import BaseAutoTrainCommand + + +def run_app_command_factory(args): + return RunSetupCommand(args.update_torch, args.colab) + + +class RunSetupCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + run_setup_parser = parser.add_parser( + "setup", + description="✨ Run AutoTrain setup", + ) + run_setup_parser.add_argument( + "--update-torch", + action="store_true", + help="Update PyTorch to latest version", + ) + run_setup_parser.add_argument( + "--colab", + action="store_true", + help="Run setup for Google Colab", + ) + run_setup_parser.set_defaults(func=run_app_command_factory) + + def __init__(self, update_torch: bool, colab: bool = False): + self.update_torch = update_torch + self.colab = colab + + def run(self): + if self.colab: + cmd = "pip install -U xformers==0.0.24" + else: + cmd = "pip uninstall -y xformers" + cmd = cmd.split() + pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.info("Installing latest xformers") + _, _ = pipe.communicate() + logger.info("Successfully installed latest xformers") + + if self.update_torch: + cmd = "pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121" + cmd = cmd.split() + pipe = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + logger.info("Installing latest PyTorch") + _, _ = pipe.communicate() + logger.info("Successfully installed latest PyTorch") diff --git a/src/autotrain/cli/run_spacerunner.py b/src/autotrain/cli/run_spacerunner.py new file mode 100644 index 0000000000000000000000000000000000000000..7b814a426840520f5e99170c352ecd14dea028e5 --- /dev/null +++ b/src/autotrain/cli/run_spacerunner.py @@ -0,0 +1,143 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.backends.base import AVAILABLE_HARDWARE +from autotrain.backends.spaces import SpaceRunner +from autotrain.trainers.generic.params import GenericParams +from autotrain.trainers.generic.utils import create_dataset_repo + +from . import BaseAutoTrainCommand + + +BACKEND_CHOICES = list(AVAILABLE_HARDWARE.keys()) +BACKEND_CHOICES = [b for b in BACKEND_CHOICES if b.startswith("spaces-")] + + +def run_spacerunner_command_factory(args): + return RunAutoTrainSpaceRunnerCommand(args) + + +class RunAutoTrainSpaceRunnerCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = [ + { + "arg": "--project-name", + "help": "Name of the project. 
Must be unique.", + "required": True, + "type": str, + }, + { + "arg": "--script-path", + "help": "Path to the script", + "required": True, + "type": str, + }, + { + "arg": "--username", + "help": "Hugging Face Username, can also be an organization name", + "required": True, + "type": str, + }, + { + "arg": "--token", + "help": "Hugging Face API Token", + "required": True, + "type": str, + }, + { + "arg": "--backend", + "help": "Hugging Face backend to use", + "required": True, + "type": str, + "choices": BACKEND_CHOICES, + }, + { + "arg": "--env", + "help": "Environment variables, e.g. --env FOO=bar;FOO2=bar2;FOO3=bar3", + "required": False, + "type": str, + }, + { + "arg": "--args", + "help": "Arguments to pass to the script, e.g. --args foo=bar;foo2=bar2;foo3=bar3;store_true_arg", + "required": False, + "type": str, + }, + ] + run_spacerunner_parser = parser.add_parser("spacerunner", description="✨ Run AutoTrain SpaceRunner") + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_spacerunner_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + else: + run_spacerunner_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_spacerunner_parser.set_defaults(func=run_spacerunner_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + env_vars = {} + if self.args.env: + for env_name_value in self.args.env.split(";"): + if len(env_name_value.split("=")) == 2: + env_vars[env_name_value.split("=")[0]] = env_name_value.split("=")[1] + else: + raise ValueError("Invalid environment variable format.") + self.args.env = env_vars + + app_args = {} + store_true_args = [] + if self.args.args: + for arg_name_value in self.args.args.split(";"): + if len(arg_name_value.split("=")) == 1: + store_true_args.append(arg_name_value) + elif len(arg_name_value.split("=")) == 2: + app_args[arg_name_value.split("=")[0]] = arg_name_value.split("=")[1] + else: + raise ValueError("Invalid argument format.") + + for arg_name in store_true_args: + app_args[arg_name] = "" + self.args.args = app_args + + def run(self): + dataset_id = create_dataset_repo( + username=self.args.username, + project_name=self.args.project_name, + script_path=self.args.script_path, + token=self.args.token, + ) + params = GenericParams( + project_name=self.args.project_name, + data_path=dataset_id, + username=self.args.username, + token=self.args.token, + script_path=self.args.script_path, + env=self.args.env, + args=self.args.args, + ) + project = SpaceRunner(params=params, backend=self.args.backend) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_tabular.py b/src/autotrain/cli/run_tabular.py new file mode 100644 index 0000000000000000000000000000000000000000..8b1b72ee8e5f878c7ed924dd91bca64972bccf38 --- /dev/null +++ b/src/autotrain/cli/run_tabular.py @@ -0,0 +1,106 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject 
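The spacerunner command above packs environment variables and script arguments into single CLI flags (`--env FOO=bar;FOO2=bar2`, `--args foo=bar;store_true_arg`) and splits them back into dicts, treating a bare name in `--args` as a store-true flag. A small sketch of that parsing convention, modeling the `--args` behavior (the helper name is made up for illustration; the real `--env` parser rejects bare names instead of accepting them):

def parse_kv_flag(raw: str) -> dict:
    # "FOO=bar;FOO2=bar2" -> {"FOO": "bar", "FOO2": "bar2"};
    # a bare name such as "store_true_arg" maps to "" (flag semantics).
    out = {}
    for item in raw.split(";"):
        parts = item.split("=")
        if len(parts) == 1:
            out[parts[0]] = ""
        elif len(parts) == 2:
            out[parts[0]] = parts[1]
        else:
            raise ValueError(f"Invalid key=value pair: {item}")
    return out

print(parse_kv_flag("epochs=3;push_to_hub"))  # {'epochs': '3', 'push_to_hub': ''}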
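Nearly every `run_*` command in this diff, including `run_tabular.py` whose imports continue below, follows the same registration recipe: derive an argument list from the task's params class via `get_field_info`, prepend the common `--train`/`--deploy`/`--inference`/`--backend` flags, then register each entry with a normalized `dest` (strip the leading `--`, map `-` to `_`). A condensed, runnable sketch of that loop, assuming a hand-written spec list in place of the derived one (each spec dict carries `arg`, `help`, and optionally `type`/`action`/`default`/`alias`, as in the files above):

from argparse import ArgumentParser

# Hypothetical spec list; the real code derives it from a params class.
arg_list = [
    {"arg": "--train", "help": "Command to train the model", "action": "store_true"},
    {"arg": "--project-name", "help": "Project name", "type": str, "default": None},
]

parser = ArgumentParser("demo")
for arg in arg_list:
    names = [arg["arg"]] + arg.get("alias", [])
    dest = arg["arg"].replace("--", "").replace("-", "_")  # "--project-name" -> "project_name"
    if "action" in arg:
        parser.add_argument(*names, dest=dest, help=arg["help"], action=arg["action"])
    else:
        parser.add_argument(*names, dest=dest, help=arg["help"], type=arg.get("type"), default=arg.get("default"))

args = parser.parse_args(["--train", "--project-name", "my-proj"])
assert args.train is True and args.project_name == "my-proj"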
+from autotrain.trainers.tabular.params import TabularParams + +from . import BaseAutoTrainCommand + + +def run_tabular_command_factory(args): + return RunAutoTrainTabularCommand(args) + + +class RunAutoTrainTabularCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(TabularParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + remove_args = ["--disable_gradient_checkpointing", "--gradient_accumulation", "--epochs", "--log", "--lr"] + arg_list = [arg for arg in arg_list if arg["arg"] not in remove_args] + run_tabular_parser = parser.add_parser("tabular", description="✨ Run AutoTrain Tabular Data Training") + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_tabular_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_tabular_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_tabular_parser.set_defaults(func=run_tabular_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + self.args.target_columns = [k.strip() for k in self.args.target_columns.split(",")] + + def run(self): + logger.info("Running Tabular Training") + if self.args.train: + params = TabularParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_text_classification.py b/src/autotrain/cli/run_text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..79a1a6f4af41454681c1471790f53c230eeffc32 --- /dev/null +++ b/src/autotrain/cli/run_text_classification.py @@ -0,0 +1,106 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.text_classification.params import TextClassificationParams + +from . 
import BaseAutoTrainCommand + + +def run_text_classification_command_factory(args): + return RunAutoTrainTextClassificationCommand(args) + + +class RunAutoTrainTextClassificationCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(TextClassificationParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + arg_list = [arg for arg in arg_list if arg["arg"] != "--disable-gradient-checkpointing"] + run_text_classification_parser = parser.add_parser( + "text-classification", description="✨ Run AutoTrain Text Classification" + ) + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_text_classification_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_text_classification_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_text_classification_parser.set_defaults(func=run_text_classification_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "auto_find_batch_size", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + def run(self): + logger.info("Running Text Classification") + if self.args.train: + params = TextClassificationParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_text_regression.py b/src/autotrain/cli/run_text_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..a49c5ec070d7e9d13bf571a612145e89bdb46daa --- /dev/null +++ b/src/autotrain/cli/run_text_regression.py @@ -0,0 +1,106 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.text_regression.params import TextRegressionParams + +from . 
import BaseAutoTrainCommand + + +def run_text_regression_command_factory(args): + return RunAutoTrainTextRegressionCommand(args) + + +class RunAutoTrainTextRegressionCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(TextRegressionParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + arg_list = [arg for arg in arg_list if arg["arg"] != "--disable-gradient-checkpointing"] + run_text_regression_parser = parser.add_parser( + "text-regression", description="✨ Run AutoTrain Text Regression" + ) + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_text_regression_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_text_regression_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_text_regression_parser.set_defaults(func=run_text_regression_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "auto_find_batch_size", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + def run(self): + logger.info("Running Text Regression") + if self.args.train: + params = TextRegressionParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_token_classification.py b/src/autotrain/cli/run_token_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..15f5cb243815ffdbcdcd32d8bd618c8ce25f957b --- /dev/null +++ b/src/autotrain/cli/run_token_classification.py @@ -0,0 +1,106 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.token_classification.params import TokenClassificationParams + +from . 
import BaseAutoTrainCommand + + +def run_token_classification_command_factory(args): + return RunAutoTrainTokenClassificationCommand(args) + + +class RunAutoTrainTokenClassificationCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(TokenClassificationParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + arg_list = [arg for arg in arg_list if arg["arg"] != "--disable-gradient-checkpointing"] + run_token_classification_parser = parser.add_parser( + "token-classification", description="✨ Run AutoTrain Token Classification" + ) + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_token_classification_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_token_classification_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_token_classification_parser.set_defaults(func=run_token_classification_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "auto_find_batch_size", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + def run(self): + logger.info("Running Token Classification") + if self.args.train: + params = TokenClassificationParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/run_tools.py b/src/autotrain/cli/run_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..c8fe1369f17664b9b6f9409758ba7d4b11674239 --- /dev/null +++ b/src/autotrain/cli/run_tools.py @@ -0,0 +1,99 @@ +from argparse import ArgumentParser + +from . 
import BaseAutoTrainCommand + + +def run_tools_command_factory(args): + return RunAutoTrainToolsCommand(args) + + +class RunAutoTrainToolsCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + run_app_parser = parser.add_parser("tools", help="Run AutoTrain tools") + subparsers = run_app_parser.add_subparsers(title="tools", dest="tool_name") + + merge_llm_parser = subparsers.add_parser( + "merge-llm-adapter", + help="Merge LLM Adapter tool", + ) + merge_llm_parser.add_argument( + "--base-model-path", + type=str, + help="Base model path", + ) + merge_llm_parser.add_argument( + "--adapter-path", + type=str, + help="Adapter path", + ) + merge_llm_parser.add_argument( + "--token", + type=str, + help="Token", + default=None, + required=False, + ) + merge_llm_parser.add_argument( + "--pad-to-multiple-of", + type=int, + help="Pad to multiple of", + default=None, + required=False, + ) + merge_llm_parser.add_argument( + "--output-folder", + type=str, + help="Output folder", + required=False, + default=None, + ) + merge_llm_parser.add_argument( + "--push-to-hub", + action="store_true", + help="Push to Hugging Face Hub", + required=False, + ) + merge_llm_parser.set_defaults(func=run_tools_command_factory, merge_llm_adapter=True) + + convert_to_kohya_parser = subparsers.add_parser("convert_to_kohya", help="Convert to Kohya tool") + convert_to_kohya_parser.add_argument( + "--input-path", + type=str, + help="Input path", + ) + convert_to_kohya_parser.add_argument( + "--output-path", + type=str, + help="Output path", + ) + convert_to_kohya_parser.set_defaults(func=run_tools_command_factory, convert_to_kohya=True) + + def __init__(self, args): + self.args = args + + def run(self): + if getattr(self.args, "merge_llm_adapter", False): + self.run_merge_llm_adapter() + if getattr(self.args, "convert_to_kohya", False): + self.run_convert_to_kohya() + + def run_merge_llm_adapter(self): + from autotrain.tools.merge_adapter import merge_llm_adapter + + merge_llm_adapter( + base_model_path=self.args.base_model_path, + adapter_path=self.args.adapter_path, + token=self.args.token, + output_folder=self.args.output_folder, + pad_to_multiple_of=self.args.pad_to_multiple_of, + push_to_hub=self.args.push_to_hub, + ) + + def run_convert_to_kohya(self): + from autotrain.tools.convert_to_kohya import convert_to_kohya + + convert_to_kohya( + input_path=self.args.input_path, + output_path=self.args.output_path, + ) diff --git a/src/autotrain/cli/run_vlm.py b/src/autotrain/cli/run_vlm.py new file mode 100644 index 0000000000000000000000000000000000000000..5f7a93e28da55c07630a078924f0d00e832774f9 --- /dev/null +++ b/src/autotrain/cli/run_vlm.py @@ -0,0 +1,111 @@ +from argparse import ArgumentParser + +from autotrain import logger +from autotrain.cli.utils import get_field_info +from autotrain.project import AutoTrainProject +from autotrain.trainers.vlm.params import VLMTrainingParams + +from . 
import BaseAutoTrainCommand + + +def run_vlm_command_factory(args): + return RunAutoTrainVLMCommand(args) + + +class RunAutoTrainVLMCommand(BaseAutoTrainCommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + arg_list = get_field_info(VLMTrainingParams) + arg_list = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--backend", + "help": "Backend", + "required": False, + "type": str, + "default": "local", + }, + ] + arg_list + run_vlm_parser = parser.add_parser("vlm", description="✨ Run AutoTrain VLM") + for arg in arg_list: + names = [arg["arg"]] + arg.get("alias", []) + if "action" in arg: + run_vlm_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + action=arg.get("action"), + default=arg.get("default"), + ) + else: + run_vlm_parser.add_argument( + *names, + dest=arg["arg"].replace("--", "").replace("-", "_"), + help=arg["help"], + required=arg.get("required", False), + type=arg.get("type"), + default=arg.get("default"), + choices=arg.get("choices"), + ) + run_vlm_parser.set_defaults(func=run_vlm_command_factory) + + def __init__(self, args): + self.args = args + + store_true_arg_names = [ + "train", + "deploy", + "inference", + "auto_find_batch_size", + "push_to_hub", + ] + for arg_name in store_true_arg_names: + if getattr(self.args, arg_name) is None: + setattr(self.args, arg_name, False) + + if self.args.train: + if self.args.project_name is None: + raise ValueError("Project name must be specified") + if self.args.data_path is None: + raise ValueError("Data path must be specified") + if self.args.model is None: + raise ValueError("Model must be specified") + if self.args.push_to_hub: + if self.args.username is None: + raise ValueError("Username must be specified for push to hub") + else: + raise ValueError("Must specify --train, --deploy or --inference") + + if self.args.backend.startswith("spaces") or self.args.backend.startswith("ep-"): + if not self.args.push_to_hub: + raise ValueError("Push to hub must be specified for spaces backend") + if self.args.username is None: + raise ValueError("Username must be specified for spaces backend") + if self.args.token is None: + raise ValueError("Token must be specified for spaces backend") + + def run(self): + logger.info("Running VLM") + if self.args.train: + params = VLMTrainingParams(**vars(self.args)) + project = AutoTrainProject(params=params, backend=self.args.backend, process=True) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/cli/utils.py b/src/autotrain/cli/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d95cbb8860ee0c7ed03cd82f908c11d90ee8e56f --- /dev/null +++ b/src/autotrain/cli/utils.py @@ -0,0 +1,178 @@ +from typing import Any, Type + +from autotrain.backends.base import AVAILABLE_HARDWARE + + +def common_args(): + args = [ + { + "arg": "--train", + "help": "Command to train the model", + "required": False, + "action": "store_true", + }, + { + "arg": "--deploy", + "help": "Command to deploy the model (limited 
availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--inference", + "help": "Command to run inference (limited availability)", + "required": False, + "action": "store_true", + }, + { + "arg": "--username", + "help": "Hugging Face Hub Username", + "required": False, + "type": str, + }, + { + "arg": "--backend", + "help": "Backend to use: default or spaces. Spaces backend requires push_to_hub & username. Advanced users only.", + "required": False, + "type": str, + "default": "local", + "choices": AVAILABLE_HARDWARE.keys(), + }, + { + "arg": "--token", + "help": "Your Hugging Face API token. Token must have write access to the model hub.", + "required": False, + "type": str, + }, + { + "arg": "--push-to-hub", + "help": "Push to hub after training will push the trained model to the Hugging Face model hub.", + "required": False, + "action": "store_true", + }, + { + "arg": "--model", + "help": "Base model to use for training", + "required": True, + "type": str, + }, + { + "arg": "--project-name", + "help": "Output directory / repo id for trained model (must be unique on hub)", + "required": True, + "type": str, + }, + { + "arg": "--data-path", + "help": "Train dataset to use. When using cli, this should be a directory path containing training and validation data in appropriate formats", + "required": False, + "type": str, + }, + { + "arg": "--train-split", + "help": "Train dataset split to use", + "required": False, + "type": str, + "default": "train", + }, + { + "arg": "--valid-split", + "help": "Validation dataset split to use", + "required": False, + "type": str, + "default": None, + }, + { + "arg": "--batch-size", + "help": "Training batch size to use", + "required": False, + "type": int, + "default": 2, + "alias": ["--train-batch-size"], + }, + { + "arg": "--seed", + "help": "Random seed for reproducibility", + "required": False, + "default": 42, + "type": int, + }, + { + "arg": "--epochs", + "help": "Number of training epochs", + "required": False, + "default": 1, + "type": int, + }, + { + "arg": "--gradient-accumulation", + "help": "Gradient accumulation steps", + "required": False, + "default": 1, + "type": int, + "alias": ["--gradient-accumulation"], + }, + { + "arg": "--disable-gradient-checkpointing", + "help": "Disable gradient checkpointing", + "required": False, + "action": "store_true", + "alias": ["--disable-gradient-checkpointing", "--disable-gc"], + }, + { + "arg": "--lr", + "help": "Learning rate", + "required": False, + "default": 5e-4, + "type": float, + }, + { + "arg": "--log", + "help": "Use experiment tracking", + "required": False, + "type": str, + "default": "none", + "choices": ["none", "wandb", "tensorboard"], + }, + ] + return args + + +def python_type_from_schema_field(field_data: dict) -> Type: + """Converts JSON schema field types to Python types.""" + type_map = { + "string": str, + "number": float, + "integer": int, + "boolean": bool, + } + field_type = field_data.get("type") + if field_type: + return type_map.get(field_type, str) + elif "anyOf" in field_data: + for type_option in field_data["anyOf"]: + if type_option["type"] != "null": + return type_map.get(type_option["type"], str) + return str + + +def get_default_value(field_data: dict) -> Any: + return field_data["default"] + + +def get_field_info(params_class): + schema = params_class.model_json_schema() + properties = schema.get("properties", {}) + field_info = [] + for field_name, field_data in properties.items(): + temp_info = { + "arg": f"--{field_name.replace('_', '-')}", 
+ "alias": [f"--{field_name}", f"--{field_name.replace('_', '-')}"], + "type": python_type_from_schema_field(field_data), + "help": field_data.get("title", ""), + "default": get_default_value(field_data), + } + if temp_info["type"] == bool: + temp_info["action"] = "store_true" + + field_info.append(temp_info) + return field_info diff --git a/src/autotrain/client.py b/src/autotrain/client.py new file mode 100644 index 0000000000000000000000000000000000000000..ea3734c0e94e82af0bfae770abb103dd2221c601 --- /dev/null +++ b/src/autotrain/client.py @@ -0,0 +1,294 @@ +import os +from dataclasses import dataclass +from typing import Optional + +import requests + +from autotrain import logger + + +AUTOTRAIN_API = os.environ.get("AUTOTRAIN_API", "https://autotrain-projects-autotrain-advanced.hf.space/") + +BACKENDS = { + "spaces-a10g-large": "a10g-large", + "spaces-a10g-small": "a10g-small", + "spaces-a100-large": "a100-large", + "spaces-t4-medium": "t4-medium", + "spaces-t4-small": "t4-small", + "spaces-cpu-upgrade": "cpu-upgrade", + "spaces-cpu-basic": "cpu-basic", + "spaces-l4x1": "l4x1", + "spaces-l4x4": "l4x4", + "spaces-l40sx1": "l40sx1", + "spaces-l40sx4": "l40sx4", + "spaces-l40sx8": "l40sx8", + "spaces-a10g-largex2": "a10g-largex2", + "spaces-a10g-largex4": "a10g-largex4", +} + + +PARAMS = {} +PARAMS["llm"] = { + "target_modules": "all-linear", + "log": "tensorboard", + "mixed_precision": "fp16", + "quantization": "int4", + "peft": True, + "block_size": 1024, + "epochs": 3, + "padding": "right", + "chat_template": "none", + "max_completion_length": 128, + "distributed_backend": "ddp", + "scheduler": "linear", + "merge_adapter": True, +} + +PARAMS["text-classification"] = { + "mixed_precision": "fp16", + "log": "tensorboard", +} + +PARAMS["st"] = { + "mixed_precision": "fp16", + "log": "tensorboard", +} + +PARAMS["image-classification"] = { + "mixed_precision": "fp16", + "log": "tensorboard", +} + +PARAMS["image-object-detection"] = { + "mixed_precision": "fp16", + "log": "tensorboard", +} + +PARAMS["seq2seq"] = { + "mixed_precision": "fp16", + "target_modules": "all-linear", + "log": "tensorboard", +} + +PARAMS["tabular"] = { + "categorical_imputer": "most_frequent", + "numerical_imputer": "median", + "numeric_scaler": "robust", +} + +PARAMS["token-classification"] = { + "mixed_precision": "fp16", + "log": "tensorboard", +} + +PARAMS["text-regression"] = { + "mixed_precision": "fp16", + "log": "tensorboard", +} + +PARAMS["image-regression"] = { + "mixed_precision": "fp16", + "log": "tensorboard", +} + +PARAMS["vlm"] = { + "mixed_precision": "fp16", + "target_modules": "all-linear", + "log": "tensorboard", + "quantization": "int4", + "peft": True, + "epochs": 3, +} + +PARAMS["extractive-qa"] = { + "mixed_precision": "fp16", + "log": "tensorboard", + "max_seq_length": 512, + "max_doc_stride": 128, +} + +DEFAULT_COLUMN_MAPPING = {} +DEFAULT_COLUMN_MAPPING["llm:sft"] = {"text_column": "text"} +DEFAULT_COLUMN_MAPPING["llm:generic"] = {"text_column": "text"} +DEFAULT_COLUMN_MAPPING["llm:default"] = {"text_column": "text"} +DEFAULT_COLUMN_MAPPING["llm:dpo"] = { + "prompt_column": "prompt", + "text_column": "chosen", + "rejected_text_column": "rejected", +} +DEFAULT_COLUMN_MAPPING["llm:orpo"] = { + "prompt_column": "prompt", + "text_column": "chosen", + "rejected_text_column": "rejected", +} +DEFAULT_COLUMN_MAPPING["llm:reward"] = {"text_column": "chosen", "rejected_text_column": "rejected"} +DEFAULT_COLUMN_MAPPING["vlm:captioning"] = {"image_column": "image", "text_column": "caption"} 
+DEFAULT_COLUMN_MAPPING["vlm:vqa"] = { + "image_column": "image", + "prompt_text_column": "question", + "text_column": "answer", +} +DEFAULT_COLUMN_MAPPING["st:pair"] = {"sentence1": "anchor", "sentence2": "positive"} +DEFAULT_COLUMN_MAPPING["st:pair_class"] = { + "sentence1_column": "premise", + "sentence2_column": "hypothesis", + "target_column": "label", +} +DEFAULT_COLUMN_MAPPING["st:pair_score"] = { + "sentence1_column": "sentence1", + "sentence2_column": "sentence2", + "target_column": "score", +} +DEFAULT_COLUMN_MAPPING["st:triplet"] = { + "sentence1_column": "anchor", + "sentence2_column": "positive", + "sentence3_column": "negative", +} +DEFAULT_COLUMN_MAPPING["st:qa"] = {"sentence1_column": "query", "sentence2_column": "answer"} +DEFAULT_COLUMN_MAPPING["text-classification"] = {"text_column": "text", "target_column": "target"} +DEFAULT_COLUMN_MAPPING["seq2seq"] = {"text_column": "text", "target_column": "target"} +DEFAULT_COLUMN_MAPPING["text-regression"] = {"text_column": "text", "target_column": "target"} +DEFAULT_COLUMN_MAPPING["token-classification"] = {"text_column": "tokens", "target_column": "tags"} +DEFAULT_COLUMN_MAPPING["image-classification"] = {"image_column": "image", "target_column": "label"} +DEFAULT_COLUMN_MAPPING["image-regression"] = {"image_column": "image", "target_column": "target"} +DEFAULT_COLUMN_MAPPING["image-object-detection"] = {"image_column": "image", "objects_column": "objects"} +DEFAULT_COLUMN_MAPPING["tabular:classification"] = {"id_column": "id", "target__columns": ["target"]} +DEFAULT_COLUMN_MAPPING["tabular:regression"] = {"id_column": "id", "target_columns": ["target"]} +DEFAULT_COLUMN_MAPPING["extractive-qa"] = { + "text_column": "context", + "question_column": "question", + "answer_column": "answers", +} + +VALID_TASKS = [k for k in DEFAULT_COLUMN_MAPPING.keys()] + + +@dataclass +class Client: + """ + A client to interact with the AutoTrain API. + Attributes: + host (Optional[str]): The host URL for the AutoTrain API. + token (Optional[str]): The authentication token for the API. + username (Optional[str]): The username for the API. + Methods: + __post_init__(): + Initializes the client with default values if not provided and sets up headers. + __str__(): + Returns a string representation of the client with masked token. + __repr__(): + Returns a string representation of the client with masked token. + create(project_name: str, task: str, base_model: str, hardware: str, dataset: str, train_split: str, column_mapping: Optional[dict] = None, params: Optional[dict] = None, valid_split: Optional[str] = None): + Creates a new project on the AutoTrain platform. + get_logs(job_id: str): + Retrieves logs for a given job ID. + stop_training(job_id: str): + Stops the training for a given job ID. 
+ """ + + host: Optional[str] = None + token: Optional[str] = None + username: Optional[str] = None + + def __post_init__(self): + if self.host is None: + self.host = AUTOTRAIN_API + + if self.token is None: + self.token = os.environ.get("HF_TOKEN") + + if self.username is None: + self.username = os.environ.get("HF_USERNAME") + + if self.token is None or self.username is None: + raise ValueError("Please provide a valid username and token") + + self.headers = {"Authorization": f"Bearer {self.token}", "Content-Type": "application/json"} + + def __str__(self): + return f"Client(host={self.host}, token=****, username={self.username})" + + def __repr__(self): + return self.__str__() + + def create( + self, + project_name: str, + task: str, + base_model: str, + backend: str, + dataset: str, + train_split: str, + column_mapping: Optional[dict] = None, + params: Optional[dict] = None, + valid_split: Optional[str] = None, + ): + + if task not in VALID_TASKS: + raise ValueError(f"Invalid task. Valid tasks are: {VALID_TASKS}") + + if backend not in BACKENDS: + raise ValueError(f"Invalid backend. Valid backends are: {list(BACKENDS.keys())}") + + url = f"{self.host}/api/create_project" + + if task == "llm:defaut": + task = "llm:generic" + + if params is None: + params = {} + + if task.startswith("llm"): + params = {k: v for k, v in PARAMS["llm"].items() if k not in params} + elif task.startswith("st"): + params = {k: v for k, v in PARAMS["st"].items() if k not in params} + else: + params = {k: v for k, v in PARAMS[task].items() if k not in params} + + if column_mapping is None: + column_mapping = DEFAULT_COLUMN_MAPPING[task] + + # check if column_mapping is valid for the task + default_col_map = DEFAULT_COLUMN_MAPPING[task] + missing_cols = [] + for k, _ in default_col_map.items(): + if k not in column_mapping.keys(): + missing_cols.append(k) + + if missing_cols: + raise ValueError(f"Missing columns in column_mapping: {missing_cols}") + + data = { + "project_name": project_name, + "task": task, + "base_model": base_model, + "hardware": backend, + "params": params, + "username": self.username, + "column_mapping": column_mapping, + "hub_dataset": dataset, + "train_split": train_split, + "valid_split": valid_split, + } + response = requests.post(url, headers=self.headers, json=data) + if response.status_code == 200: + resp = response.json() + logger.info( + f"Project created successfully. Job ID: {resp['job_id']}. 
View logs at: https://hf.co/spaces/{resp['job_id']}" + ) + return resp + else: + logger.error(f"Error creating project: {response.json()}") + return response.json() + + def get_logs(self, job_id: str): + url = f"{self.host}/api/logs" + data = {"jid": job_id} + response = requests.post(url, headers=self.headers, json=data) + return response.json() + + def stop_training(self, job_id: str): + url = f"{self.host}/api/stop_training/{job_id}" + data = {"jid": job_id} + response = requests.post(url, headers=self.headers, json=data) + return response.json() diff --git a/src/autotrain/commands.py b/src/autotrain/commands.py new file mode 100644 index 0000000000000000000000000000000000000000..23182c26c5461d81da9c0678c82c2d240edea9a0 --- /dev/null +++ b/src/autotrain/commands.py @@ -0,0 +1,516 @@ +import os +import shlex + +import torch + +from autotrain import logger +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams +from autotrain.trainers.generic.params import GenericParams +from autotrain.trainers.image_classification.params import ImageClassificationParams +from autotrain.trainers.image_regression.params import ImageRegressionParams +from autotrain.trainers.object_detection.params import ObjectDetectionParams +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams +from autotrain.trainers.seq2seq.params import Seq2SeqParams +from autotrain.trainers.tabular.params import TabularParams +from autotrain.trainers.text_classification.params import TextClassificationParams +from autotrain.trainers.text_regression.params import TextRegressionParams +from autotrain.trainers.token_classification.params import TokenClassificationParams +from autotrain.trainers.vlm.params import VLMTrainingParams + + +CPU_COMMAND = [ + "accelerate", + "launch", + "--cpu", +] + +SINGLE_GPU_COMMAND = [ + "accelerate", + "launch", + "--num_machines", + "1", + "--num_processes", + "1", +] + + +def get_accelerate_command(num_gpus, gradient_accumulation_steps=1, distributed_backend=None): + """ + Generates the appropriate command to launch a training job using the `accelerate` library based on the number of GPUs + and the specified distributed backend. + + Args: + num_gpus (int): The number of GPUs available for training. If 0, training will be forced on CPU. + gradient_accumulation_steps (int, optional): The number of gradient accumulation steps. Defaults to 1. + distributed_backend (str, optional): The distributed backend to use. Can be "ddp" (Distributed Data Parallel), + "deepspeed", or None. Defaults to None. + + Returns: + list or str: The command to be executed as a list of strings. If no GPU is found, returns a CPU command string. + If a single GPU is found, returns a single GPU command string. Otherwise, returns a list of + command arguments for multi-GPU or DeepSpeed training. + + Raises: + ValueError: If an unsupported distributed backend is specified. + """ + if num_gpus == 0: + logger.warning("No GPU found. Forcing training on CPU. 
This will be super slow!") + return CPU_COMMAND + + if num_gpus == 1: + return SINGLE_GPU_COMMAND + + if distributed_backend in ("ddp", None): + return [ + "accelerate", + "launch", + "--multi_gpu", + "--num_machines", + "1", + "--num_processes", + str(num_gpus), + ] + elif distributed_backend == "deepspeed": + return [ + "accelerate", + "launch", + "--use_deepspeed", + "--zero_stage", + "3", + "--offload_optimizer_device", + "none", + "--offload_param_device", + "none", + "--zero3_save_16bit_model", + "true", + "--zero3_init_flag", + "true", + "--deepspeed_multinode_launcher", + "standard", + "--gradient_accumulation_steps", + str(gradient_accumulation_steps), + ] + else: + raise ValueError("Unsupported distributed backend") + + +def launch_command(params): + """ + Launches the appropriate training command based on the type of training parameters provided. + + Args: + params (object): An instance of one of the training parameter classes. This can be one of the following: + - LLMTrainingParams + - GenericParams + - TabularParams + - TextClassificationParams + - TextRegressionParams + - SentenceTransformersParams + - ExtractiveQuestionAnsweringParams + - TokenClassificationParams + - ImageClassificationParams + - ObjectDetectionParams + - ImageRegressionParams + - Seq2SeqParams + - VLMTrainingParams + + Returns: + list: A list of command line arguments to be executed for training. + + Raises: + ValueError: If the provided params type is unsupported. + """ + + params.project_name = shlex.split(params.project_name)[0] + cuda_available = torch.cuda.is_available() + mps_available = torch.backends.mps.is_available() + if cuda_available: + num_gpus = torch.cuda.device_count() + elif mps_available: + num_gpus = 1 + else: + num_gpus = 0 + if isinstance(params, LLMTrainingParams): + cmd = get_accelerate_command(num_gpus, params.gradient_accumulation, params.distributed_backend) + if num_gpus > 0: + cmd.append("--mixed_precision") + if params.mixed_precision == "fp16": + cmd.append("fp16") + elif params.mixed_precision == "bf16": + cmd.append("bf16") + else: + cmd.append("no") + + cmd.extend( + [ + "-m", + "autotrain.trainers.clm", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + + elif isinstance(params, GenericParams): + cmd = [ + "python", + "-m", + "autotrain.trainers.generic", + "--config", + os.path.join(params.project_name, "training_params.json"), + ] + elif isinstance(params, TabularParams): + cmd = [ + "python", + "-m", + "autotrain.trainers.tabular", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + elif ( + isinstance(params, TextClassificationParams) + or isinstance(params, TextRegressionParams) + or isinstance(params, SentenceTransformersParams) + or isinstance(params, ExtractiveQuestionAnsweringParams) + ): + if num_gpus == 0: + cmd = [ + "accelerate", + "launch", + "--cpu", + ] + elif num_gpus == 1: + cmd = [ + "accelerate", + "launch", + "--num_machines", + "1", + "--num_processes", + "1", + ] + else: + cmd = [ + "accelerate", + "launch", + "--multi_gpu", + "--num_machines", + "1", + "--num_processes", + str(num_gpus), + ] + + if num_gpus > 0: + cmd.append("--mixed_precision") + if params.mixed_precision == "fp16": + cmd.append("fp16") + elif params.mixed_precision == "bf16": + cmd.append("bf16") + else: + cmd.append("no") + + if isinstance(params, TextRegressionParams): + cmd.extend( + [ + "-m", + "autotrain.trainers.text_regression", + "--training_config", + os.path.join(params.project_name, 
"training_params.json"), + ] + ) + elif isinstance(params, SentenceTransformersParams): + cmd.extend( + [ + "-m", + "autotrain.trainers.sent_transformers", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + elif isinstance(params, ExtractiveQuestionAnsweringParams): + cmd.extend( + [ + "-m", + "autotrain.trainers.extractive_question_answering", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + else: + cmd.extend( + [ + "-m", + "autotrain.trainers.text_classification", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + elif isinstance(params, TokenClassificationParams): + if num_gpus == 0: + cmd = [ + "accelerate", + "launch", + "--cpu", + ] + elif num_gpus == 1: + cmd = [ + "accelerate", + "launch", + "--num_machines", + "1", + "--num_processes", + "1", + ] + else: + cmd = [ + "accelerate", + "launch", + "--multi_gpu", + "--num_machines", + "1", + "--num_processes", + str(num_gpus), + ] + + if num_gpus > 0: + cmd.append("--mixed_precision") + if params.mixed_precision == "fp16": + cmd.append("fp16") + elif params.mixed_precision == "bf16": + cmd.append("bf16") + else: + cmd.append("no") + + cmd.extend( + [ + "-m", + "autotrain.trainers.token_classification", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + elif ( + isinstance(params, ImageClassificationParams) + or isinstance(params, ObjectDetectionParams) + or isinstance(params, ImageRegressionParams) + ): + if num_gpus == 0: + cmd = [ + "accelerate", + "launch", + "--cpu", + ] + elif num_gpus == 1: + cmd = [ + "accelerate", + "launch", + "--num_machines", + "1", + "--num_processes", + "1", + ] + else: + cmd = [ + "accelerate", + "launch", + "--multi_gpu", + "--num_machines", + "1", + "--num_processes", + str(num_gpus), + ] + + if num_gpus > 0: + cmd.append("--mixed_precision") + if params.mixed_precision == "fp16": + cmd.append("fp16") + elif params.mixed_precision == "bf16": + cmd.append("bf16") + else: + cmd.append("no") + + if isinstance(params, ObjectDetectionParams): + cmd.extend( + [ + "-m", + "autotrain.trainers.object_detection", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + elif isinstance(params, ImageRegressionParams): + cmd.extend( + [ + "-m", + "autotrain.trainers.image_regression", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + else: + cmd.extend( + [ + "-m", + "autotrain.trainers.image_classification", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + elif isinstance(params, Seq2SeqParams): + if num_gpus == 0: + logger.warning("No GPU found. Forcing training on CPU. 
This will be super slow!") + cmd = [ + "accelerate", + "launch", + "--cpu", + ] + elif num_gpus == 1: + cmd = [ + "accelerate", + "launch", + "--num_machines", + "1", + "--num_processes", + "1", + ] + elif num_gpus == 2: + cmd = [ + "accelerate", + "launch", + "--multi_gpu", + "--num_machines", + "1", + "--num_processes", + "2", + ] + else: + if params.quantization in ("int8", "int4") and params.peft and params.mixed_precision == "bf16": + cmd = [ + "accelerate", + "launch", + "--multi_gpu", + "--num_machines", + "1", + "--num_processes", + str(num_gpus), + ] + else: + cmd = [ + "accelerate", + "launch", + "--use_deepspeed", + "--zero_stage", + "3", + "--offload_optimizer_device", + "none", + "--offload_param_device", + "none", + "--zero3_save_16bit_model", + "true", + "--zero3_init_flag", + "true", + "--deepspeed_multinode_launcher", + "standard", + "--gradient_accumulation_steps", + str(params.gradient_accumulation), + ] + if num_gpus > 0: + cmd.append("--mixed_precision") + if params.mixed_precision == "fp16": + cmd.append("fp16") + elif params.mixed_precision == "bf16": + cmd.append("bf16") + else: + cmd.append("no") + + cmd.extend( + [ + "-m", + "autotrain.trainers.seq2seq", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + + elif isinstance(params, VLMTrainingParams): + if num_gpus == 0: + logger.warning("No GPU found. Forcing training on CPU. This will be super slow!") + cmd = [ + "accelerate", + "launch", + "--cpu", + ] + elif num_gpus == 1: + cmd = [ + "accelerate", + "launch", + "--num_machines", + "1", + "--num_processes", + "1", + ] + elif num_gpus == 2: + cmd = [ + "accelerate", + "launch", + "--multi_gpu", + "--num_machines", + "1", + "--num_processes", + "2", + ] + else: + if params.quantization in ("int8", "int4") and params.peft and params.mixed_precision == "bf16": + cmd = [ + "accelerate", + "launch", + "--multi_gpu", + "--num_machines", + "1", + "--num_processes", + str(num_gpus), + ] + else: + cmd = [ + "accelerate", + "launch", + "--use_deepspeed", + "--zero_stage", + "3", + "--offload_optimizer_device", + "none", + "--offload_param_device", + "none", + "--zero3_save_16bit_model", + "true", + "--zero3_init_flag", + "true", + "--deepspeed_multinode_launcher", + "standard", + "--gradient_accumulation_steps", + str(params.gradient_accumulation), + ] + + if num_gpus > 0: + cmd.append("--mixed_precision") + if params.mixed_precision == "fp16": + cmd.append("fp16") + elif params.mixed_precision == "bf16": + cmd.append("bf16") + else: + cmd.append("no") + + cmd.extend( + [ + "-m", + "autotrain.trainers.vlm", + "--training_config", + os.path.join(params.project_name, "training_params.json"), + ] + ) + + else: + raise ValueError("Unsupported params type") + + logger.info(cmd) + logger.info(params) + return cmd diff --git a/src/autotrain/config.py b/src/autotrain/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4b7b4d9c22d94dce3f47180d2a4d63b01183346a --- /dev/null +++ b/src/autotrain/config.py @@ -0,0 +1,4 @@ +import os + + +HF_API = os.getenv("HF_API", "https://huggingface.co") diff --git a/src/autotrain/dataset.py b/src/autotrain/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..5da820985f6aed9dc080fc7f7d99c09972798588 --- /dev/null +++ b/src/autotrain/dataset.py @@ -0,0 +1,812 @@ +import io +import os +import uuid +import zipfile +from dataclasses import dataclass +from typing import Dict, List, Optional + +import pandas as pd + +from autotrain.preprocessor.tabular import 
( + TabularBinaryClassificationPreprocessor, + TabularMultiClassClassificationPreprocessor, + TabularMultiColumnRegressionPreprocessor, + TabularMultiLabelClassificationPreprocessor, + TabularSingleColumnRegressionPreprocessor, +) +from autotrain.preprocessor.text import ( + LLMPreprocessor, + SentenceTransformersPreprocessor, + Seq2SeqPreprocessor, + TextBinaryClassificationPreprocessor, + TextExtractiveQuestionAnsweringPreprocessor, + TextMultiClassClassificationPreprocessor, + TextSingleColumnRegressionPreprocessor, + TextTokenClassificationPreprocessor, +) +from autotrain.preprocessor.vision import ( + ImageClassificationPreprocessor, + ImageRegressionPreprocessor, + ObjectDetectionPreprocessor, +) +from autotrain.preprocessor.vlm import VLMPreprocessor + + +def remove_non_image_files(folder): + """ + Remove non-image files from a specified folder and its subfolders. + + This function iterates through all files in the given folder and its subfolders, + and removes any file that does not have an allowed image file extension. The allowed + extensions are: .jpg, .jpeg, .png, .JPG, .JPEG, .PNG, and .jsonl. + + Args: + folder (str): The path to the folder from which non-image files should be removed. + + Returns: + None + """ + # Define allowed image file extensions + allowed_extensions = {".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG", ".jsonl"} + + # Iterate through all files in the folder + for root, dirs, files in os.walk(folder): + for file in files: + # Get the file extension + file_extension = os.path.splitext(file)[1] + + # If the file extension is not in the allowed list, remove the file + if file_extension.lower() not in allowed_extensions: + file_path = os.path.join(root, file) + os.remove(file_path) + print(f"Removed file: {file_path}") + + # Recursively call the function on each subfolder + for subfolder in dirs: + remove_non_image_files(os.path.join(root, subfolder)) + + +@dataclass +class AutoTrainImageClassificationDataset: + """ + A class to handle image classification datasets for AutoTrain. + + Attributes: + train_data (str): Path to the training data. + token (str): Authentication token. + project_name (str): Name of the project. + username (str): Username of the project owner. + valid_data (Optional[str]): Path to the validation data. Default is None. + percent_valid (Optional[float]): Percentage of training data to use for validation. Default is None. + local (bool): Flag to indicate if the data is local. Default is False. + + Methods: + __str__() -> str: + Returns a string representation of the dataset. + + __post_init__(): + Initializes the dataset and sets default values for validation data. + + prepare(): + Prepares the dataset for training by extracting and preprocessing the data. 
+ """ + + train_data: str + token: str + project_name: str + username: str + valid_data: Optional[str] = None + percent_valid: Optional[float] = None + local: bool = False + + def __str__(self) -> str: + info = f"Dataset: {self.project_name} ({self.task})\n" + info += f"Train data: {self.train_data}\n" + info += f"Valid data: {self.valid_data}\n" + return info + + def __post_init__(self): + self.task = "image_multi_class_classification" + if not self.valid_data and self.percent_valid is None: + self.percent_valid = 0.2 + elif self.valid_data and self.percent_valid is not None: + raise ValueError("You can only specify one of valid_data or percent_valid") + elif self.valid_data: + self.percent_valid = 0.0 + + def prepare(self): + valid_dir = None + if not isinstance(self.train_data, str): + cache_dir = os.environ.get("HF_HOME") + if not cache_dir: + cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface") + + random_uuid = uuid.uuid4() + train_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + os.makedirs(train_dir, exist_ok=True) + self.train_data.seek(0) + content = self.train_data.read() + bytes_io = io.BytesIO(content) + + zip_ref = zipfile.ZipFile(bytes_io, "r") + zip_ref.extractall(train_dir) + # remove the __MACOSX directory + macosx_dir = os.path.join(train_dir, "__MACOSX") + if os.path.exists(macosx_dir): + os.system(f"rm -rf {macosx_dir}") + remove_non_image_files(train_dir) + if self.valid_data: + random_uuid = uuid.uuid4() + valid_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + os.makedirs(valid_dir, exist_ok=True) + self.valid_data.seek(0) + content = self.valid_data.read() + bytes_io = io.BytesIO(content) + zip_ref = zipfile.ZipFile(bytes_io, "r") + zip_ref.extractall(valid_dir) + # remove the __MACOSX directory + macosx_dir = os.path.join(valid_dir, "__MACOSX") + if os.path.exists(macosx_dir): + os.system(f"rm -rf {macosx_dir}") + remove_non_image_files(valid_dir) + else: + train_dir = self.train_data + if self.valid_data: + valid_dir = self.valid_data + + preprocessor = ImageClassificationPreprocessor( + train_data=train_dir, + valid_data=valid_dir, + token=self.token, + project_name=self.project_name, + username=self.username, + local=self.local, + ) + return preprocessor.prepare() + + +@dataclass +class AutoTrainObjectDetectionDataset: + """ + A dataset class for AutoTrain object detection tasks. + + Attributes: + train_data (str): Path to the training data. + token (str): Authentication token. + project_name (str): Name of the project. + username (str): Username of the project owner. + valid_data (Optional[str]): Path to the validation data. Default is None. + percent_valid (Optional[float]): Percentage of training data to be used for validation. Default is None. + local (bool): Flag indicating if the data is local. Default is False. + + Methods: + __str__() -> str: + Returns a string representation of the dataset. + + __post_init__(): + Initializes the dataset and sets default values for validation data. + + prepare(): + Prepares the dataset for training by extracting and preprocessing the data. 
+ """ + + train_data: str + token: str + project_name: str + username: str + valid_data: Optional[str] = None + percent_valid: Optional[float] = None + local: bool = False + + def __str__(self) -> str: + info = f"Dataset: {self.project_name} ({self.task})\n" + info += f"Train data: {self.train_data}\n" + info += f"Valid data: {self.valid_data}\n" + return info + + def __post_init__(self): + self.task = "image_object_detection" + if not self.valid_data and self.percent_valid is None: + self.percent_valid = 0.2 + elif self.valid_data and self.percent_valid is not None: + raise ValueError("You can only specify one of valid_data or percent_valid") + elif self.valid_data: + self.percent_valid = 0.0 + + def prepare(self): + valid_dir = None + if not isinstance(self.train_data, str): + cache_dir = os.environ.get("HF_HOME") + if not cache_dir: + cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface") + + random_uuid = uuid.uuid4() + train_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + os.makedirs(train_dir, exist_ok=True) + self.train_data.seek(0) + content = self.train_data.read() + bytes_io = io.BytesIO(content) + + zip_ref = zipfile.ZipFile(bytes_io, "r") + zip_ref.extractall(train_dir) + # remove the __MACOSX directory + macosx_dir = os.path.join(train_dir, "__MACOSX") + if os.path.exists(macosx_dir): + os.system(f"rm -rf {macosx_dir}") + remove_non_image_files(train_dir) + if self.valid_data: + random_uuid = uuid.uuid4() + valid_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + os.makedirs(valid_dir, exist_ok=True) + self.valid_data.seek(0) + content = self.valid_data.read() + bytes_io = io.BytesIO(content) + zip_ref = zipfile.ZipFile(bytes_io, "r") + zip_ref.extractall(valid_dir) + # remove the __MACOSX directory + macosx_dir = os.path.join(valid_dir, "__MACOSX") + if os.path.exists(macosx_dir): + os.system(f"rm -rf {macosx_dir}") + remove_non_image_files(valid_dir) + else: + train_dir = self.train_data + if self.valid_data: + valid_dir = self.valid_data + + preprocessor = ObjectDetectionPreprocessor( + train_data=train_dir, + valid_data=valid_dir, + token=self.token, + project_name=self.project_name, + username=self.username, + local=self.local, + ) + return preprocessor.prepare() + + +@dataclass +class AutoTrainVLMDataset: + """ + A class to handle dataset for AutoTrain Vision-Language Model (VLM) task. + + Attributes: + ----------- + train_data : str + Path to the training data or a file-like object containing the training data. + token : str + Authentication token for accessing the dataset. + project_name : str + Name of the project. + username : str + Username of the project owner. + column_mapping : Dict[str, str] + Mapping of columns in the dataset. + valid_data : Optional[str], default=None + Path to the validation data or a file-like object containing the validation data. + percent_valid : Optional[float], default=None + Percentage of the training data to be used for validation if `valid_data` is not provided. + local : bool, default=False + Flag indicating whether the dataset is stored locally. + + Methods: + -------- + __str__() -> str: + Returns a string representation of the dataset. + + __post_init__(): + Initializes the dataset and sets default values for validation data percentage. + + prepare(): + Prepares the dataset for training by extracting and processing the data. 
+ """ + + train_data: str + token: str + project_name: str + username: str + column_mapping: Dict[str, str] + valid_data: Optional[str] = None + percent_valid: Optional[float] = None + local: bool = False + + def __str__(self) -> str: + info = f"Dataset: {self.project_name} ({self.task})\n" + info += f"Train data: {self.train_data}\n" + info += f"Valid data: {self.valid_data}\n" + return info + + def __post_init__(self): + self.task = "vlm" + if not self.valid_data and self.percent_valid is None: + self.percent_valid = 0.2 + elif self.valid_data and self.percent_valid is not None: + raise ValueError("You can only specify one of valid_data or percent_valid") + elif self.valid_data: + self.percent_valid = 0.0 + + def prepare(self): + valid_dir = None + if not isinstance(self.train_data, str): + cache_dir = os.environ.get("HF_HOME") + if not cache_dir: + cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface") + + random_uuid = uuid.uuid4() + train_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + os.makedirs(train_dir, exist_ok=True) + self.train_data.seek(0) + content = self.train_data.read() + bytes_io = io.BytesIO(content) + + zip_ref = zipfile.ZipFile(bytes_io, "r") + zip_ref.extractall(train_dir) + # remove the __MACOSX directory + macosx_dir = os.path.join(train_dir, "__MACOSX") + if os.path.exists(macosx_dir): + os.system(f"rm -rf {macosx_dir}") + remove_non_image_files(train_dir) + if self.valid_data: + random_uuid = uuid.uuid4() + valid_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + os.makedirs(valid_dir, exist_ok=True) + self.valid_data.seek(0) + content = self.valid_data.read() + bytes_io = io.BytesIO(content) + zip_ref = zipfile.ZipFile(bytes_io, "r") + zip_ref.extractall(valid_dir) + # remove the __MACOSX directory + macosx_dir = os.path.join(valid_dir, "__MACOSX") + if os.path.exists(macosx_dir): + os.system(f"rm -rf {macosx_dir}") + remove_non_image_files(valid_dir) + else: + train_dir = self.train_data + if self.valid_data: + valid_dir = self.valid_data + + preprocessor = VLMPreprocessor( + train_data=train_dir, + valid_data=valid_dir, + token=self.token, + project_name=self.project_name, + username=self.username, + local=self.local, + column_mapping=self.column_mapping, + ) + return preprocessor.prepare() + + +@dataclass +class AutoTrainImageRegressionDataset: + """ + AutoTrainImageRegressionDataset is a class designed for handling image regression datasets in the AutoTrain framework. + + Attributes: + train_data (str): Path to the training data. + token (str): Authentication token. + project_name (str): Name of the project. + username (str): Username of the project owner. + valid_data (Optional[str]): Path to the validation data. Default is None. + percent_valid (Optional[float]): Percentage of training data to be used for validation if valid_data is not provided. Default is None. + local (bool): Flag indicating if the data is local. Default is False. + + Methods: + __str__() -> str: + Returns a string representation of the dataset information. + + __post_init__(): + Initializes the task attribute and sets the percent_valid attribute based on the presence of valid_data. + + prepare(): + Prepares the dataset for training by extracting and organizing the data, and returns a preprocessor object. 
+ """ + + train_data: str + token: str + project_name: str + username: str + valid_data: Optional[str] = None + percent_valid: Optional[float] = None + local: bool = False + + def __str__(self) -> str: + info = f"Dataset: {self.project_name} ({self.task})\n" + info += f"Train data: {self.train_data}\n" + info += f"Valid data: {self.valid_data}\n" + return info + + def __post_init__(self): + self.task = "image_single_column_regression" + if not self.valid_data and self.percent_valid is None: + self.percent_valid = 0.2 + elif self.valid_data and self.percent_valid is not None: + raise ValueError("You can only specify one of valid_data or percent_valid") + elif self.valid_data: + self.percent_valid = 0.0 + + def prepare(self): + valid_dir = None + if not isinstance(self.train_data, str): + cache_dir = os.environ.get("HF_HOME") + if not cache_dir: + cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface") + + random_uuid = uuid.uuid4() + train_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + os.makedirs(train_dir, exist_ok=True) + self.train_data.seek(0) + content = self.train_data.read() + bytes_io = io.BytesIO(content) + + zip_ref = zipfile.ZipFile(bytes_io, "r") + zip_ref.extractall(train_dir) + # remove the __MACOSX directory + macosx_dir = os.path.join(train_dir, "__MACOSX") + if os.path.exists(macosx_dir): + os.system(f"rm -rf {macosx_dir}") + remove_non_image_files(train_dir) + if self.valid_data: + random_uuid = uuid.uuid4() + valid_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + os.makedirs(valid_dir, exist_ok=True) + self.valid_data.seek(0) + content = self.valid_data.read() + bytes_io = io.BytesIO(content) + zip_ref = zipfile.ZipFile(bytes_io, "r") + zip_ref.extractall(valid_dir) + # remove the __MACOSX directory + macosx_dir = os.path.join(valid_dir, "__MACOSX") + if os.path.exists(macosx_dir): + os.system(f"rm -rf {macosx_dir}") + remove_non_image_files(valid_dir) + else: + train_dir = self.train_data + if self.valid_data: + valid_dir = self.valid_data + + preprocessor = ImageRegressionPreprocessor( + train_data=train_dir, + valid_data=valid_dir, + token=self.token, + project_name=self.project_name, + username=self.username, + local=self.local, + ) + return preprocessor.prepare() + + +@dataclass +class AutoTrainDataset: + """ + AutoTrainDataset class for handling various types of datasets and preprocessing tasks. + + Attributes: + train_data (List[str]): List of file paths or DataFrames for training data. + task (str): The type of task to perform (e.g., "text_binary_classification"). + token (str): Authentication token. + project_name (str): Name of the project. + username (Optional[str]): Username of the project owner. Defaults to None. + column_mapping (Optional[Dict[str, str]]): Mapping of column names. Defaults to None. + valid_data (Optional[List[str]]): List of file paths or DataFrames for validation data. Defaults to None. + percent_valid (Optional[float]): Percentage of training data to use for validation. Defaults to None. + convert_to_class_label (Optional[bool]): Whether to convert labels to class labels. Defaults to False. + local (bool): Whether the data is local. Defaults to False. + ext (Optional[str]): File extension of the data files. Defaults to "csv". + + Methods: + __str__(): Returns a string representation of the dataset. + __post_init__(): Initializes validation data and preprocesses the data. + _preprocess_data(): Preprocesses the training and validation data. 
+ num_samples(): Returns the total number of samples in the dataset. + prepare(): Prepares the dataset for the specified task using the appropriate preprocessor. + """ + + train_data: List[str] + task: str + token: str + project_name: str + username: Optional[str] = None + column_mapping: Optional[Dict[str, str]] = None + valid_data: Optional[List[str]] = None + percent_valid: Optional[float] = None + convert_to_class_label: Optional[bool] = False + local: bool = False + ext: Optional[str] = "csv" + + def __str__(self) -> str: + info = f"Dataset: {self.project_name} ({self.task})\n" + info += f"Train data: {self.train_data}\n" + info += f"Valid data: {self.valid_data}\n" + info += f"Column mapping: {self.column_mapping}\n" + return info + + def __post_init__(self): + if self.valid_data is None: + self.valid_data = [] + if not self.valid_data and self.percent_valid is None: + self.percent_valid = 0.2 + elif self.valid_data and self.percent_valid is not None: + raise ValueError("You can only specify one of valid_data or percent_valid") + elif self.valid_data: + self.percent_valid = 0.0 + + self.train_df, self.valid_df = self._preprocess_data() + + def _preprocess_data(self): + train_df = [] + for file in self.train_data: + if isinstance(file, pd.DataFrame): + train_df.append(file) + else: + if self.ext == "jsonl": + train_df.append(pd.read_json(file, lines=True)) + else: + train_df.append(pd.read_csv(file)) + if len(train_df) > 1: + train_df = pd.concat(train_df) + else: + train_df = train_df[0] + + valid_df = None + if len(self.valid_data) > 0: + valid_df = [] + for file in self.valid_data: + if isinstance(file, pd.DataFrame): + valid_df.append(file) + else: + if self.ext == "jsonl": + valid_df.append(pd.read_json(file, lines=True)) + else: + valid_df.append(pd.read_csv(file)) + if len(valid_df) > 1: + valid_df = pd.concat(valid_df) + else: + valid_df = valid_df[0] + return train_df, valid_df + + @property + def num_samples(self): + return len(self.train_df) + len(self.valid_df) if self.valid_df is not None else len(self.train_df) + + def prepare(self): + if self.task == "text_binary_classification": + text_column = self.column_mapping["text"] + label_column = self.column_mapping["label"] + preprocessor = TextBinaryClassificationPreprocessor( + train_data=self.train_df, + text_column=text_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + convert_to_class_label=self.convert_to_class_label, + local=self.local, + ) + return preprocessor.prepare() + + elif self.task == "text_multi_class_classification": + text_column = self.column_mapping["text"] + label_column = self.column_mapping["label"] + preprocessor = TextMultiClassClassificationPreprocessor( + train_data=self.train_df, + text_column=text_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + convert_to_class_label=self.convert_to_class_label, + local=self.local, + ) + return preprocessor.prepare() + + elif self.task == "text_token_classification": + text_column = self.column_mapping["text"] + label_column = self.column_mapping["label"] + preprocessor = TextTokenClassificationPreprocessor( + train_data=self.train_df, + text_column=text_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + 
test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + convert_to_class_label=self.convert_to_class_label, + ) + return preprocessor.prepare() + + elif self.task == "text_single_column_regression": + text_column = self.column_mapping["text"] + label_column = self.column_mapping["label"] + preprocessor = TextSingleColumnRegressionPreprocessor( + train_data=self.train_df, + text_column=text_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + ) + return preprocessor.prepare() + + elif self.task == "seq2seq": + text_column = self.column_mapping["text"] + label_column = self.column_mapping["label"] + preprocessor = Seq2SeqPreprocessor( + train_data=self.train_df, + text_column=text_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + ) + return preprocessor.prepare() + + elif self.task == "lm_training": + text_column = self.column_mapping["text"] + prompt_column = self.column_mapping.get("prompt") + rejected_text_column = self.column_mapping.get("rejected_text") + preprocessor = LLMPreprocessor( + train_data=self.train_df, + text_column=text_column, + prompt_column=prompt_column, + rejected_text_column=rejected_text_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + ) + return preprocessor.prepare() + + elif self.task == "sentence_transformers": + sentence1_column = self.column_mapping["sentence1"] + sentence2_column = self.column_mapping["sentence2"] + sentence3_column = self.column_mapping.get("sentence3") + target_column = self.column_mapping.get("target") + + preprocessor = SentenceTransformersPreprocessor( + train_data=self.train_df, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + sentence1_column=sentence1_column, + sentence2_column=sentence2_column, + sentence3_column=sentence3_column, + target_column=target_column, + convert_to_class_label=self.convert_to_class_label, + ) + return preprocessor.prepare() + + elif self.task == "text_extractive_question_answering": + text_column = self.column_mapping["text"] + question_column = self.column_mapping["question"] + answer_column = self.column_mapping["answer"] + preprocessor = TextExtractiveQuestionAnsweringPreprocessor( + train_data=self.train_df, + text_column=text_column, + question_column=question_column, + answer_column=answer_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + ) + return preprocessor.prepare() + + elif self.task == "tabular_binary_classification": + id_column = self.column_mapping["id"] + label_column = self.column_mapping["label"][0] + if len(id_column.strip()) == 0: + id_column = None + preprocessor = TabularBinaryClassificationPreprocessor( + train_data=self.train_df, + id_column=id_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + ) + return 
preprocessor.prepare() + elif self.task == "tabular_multi_class_classification": + id_column = self.column_mapping["id"] + label_column = self.column_mapping["label"][0] + if len(id_column.strip()) == 0: + id_column = None + preprocessor = TabularMultiClassClassificationPreprocessor( + train_data=self.train_df, + id_column=id_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + ) + return preprocessor.prepare() + elif self.task == "tabular_single_column_regression": + id_column = self.column_mapping["id"] + label_column = self.column_mapping["label"][0] + if len(id_column.strip()) == 0: + id_column = None + preprocessor = TabularSingleColumnRegressionPreprocessor( + train_data=self.train_df, + id_column=id_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + ) + return preprocessor.prepare() + elif self.task == "tabular_multi_column_regression": + id_column = self.column_mapping["id"] + label_column = self.column_mapping["label"] + if len(id_column.strip()) == 0: + id_column = None + preprocessor = TabularMultiColumnRegressionPreprocessor( + train_data=self.train_df, + id_column=id_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + ) + return preprocessor.prepare() + elif self.task == "tabular_multi_label_classification": + id_column = self.column_mapping["id"] + label_column = self.column_mapping["label"] + if len(id_column.strip()) == 0: + id_column = None + preprocessor = TabularMultiLabelClassificationPreprocessor( + train_data=self.train_df, + id_column=id_column, + label_column=label_column, + username=self.username, + project_name=self.project_name, + valid_data=self.valid_df, + test_size=self.percent_valid, + token=self.token, + seed=42, + local=self.local, + ) + return preprocessor.prepare() + else: + raise ValueError(f"Task {self.task} not supported") diff --git a/src/autotrain/help.py b/src/autotrain/help.py new file mode 100644 index 0000000000000000000000000000000000000000..162997ef92fe880d62b9401593786189bbad948d --- /dev/null +++ b/src/autotrain/help.py @@ -0,0 +1,81 @@ +autotrain_user_info = """ +

+<p>Please choose the user or organization who is creating the AutoTrain Project.</p>
+<p>In case of a non-free tier, this user or organization will be billed.</p>

+""" + +project_name_info = """A unique name for the AutoTrain Project. +This name will be used to identify the project in the AutoTrain dashboard.""" + +column_mapping_info = """ +

+<p>Column Mapping is used to map the columns in the dataset to the columns in the AutoTrain Project.</p>
+<p>For example, if your dataset has a column named "input" and you want to use it as the input for the model,
+you can map it to the "text" column in the AutoTrain Project.</p>
+<p>Similarly, if your dataset has a column named "label" and you want to use it as the label for the model,
+you can map it to the "target" column in the AutoTrain Project.</p>
+<p>Column mapping keys are AutoTrain Project column names and values are your dataset column names.</p>
+<p>For tabular datasets, you can map multiple targets to the "label" column. This will enable the multi-label task.
+The column names must be a comma-separated list.</p>
+<p>For other tasks, mappings are one-to-one.</p>
+<p>Note: column names are case-sensitive.</p>

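+<p>For instance, combining the examples above, a mapping of {"text": "input", "target": "label"}
+reads the model input from your dataset's "input" column and the label from its "label" column.</p>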
+""" + +base_model_info = """ +

+<p>Base Model is the model that will be used for fine-tuning.</p>
+<p>For example, if you are training a text classification model, you can choose a base model like "bert-base-uncased".</p>
+<p>For a list of available models, please see the HuggingFace Model Hub.</p>
+<p>Note: not all models listed here are going to be compatible with
+your data and parameters. You should select a model that is compatible with your task, data and parameters.</p>

+Don't see your favorite model? You can also use a custom model by providing the model name in an environment variable: AUTOTRAIN_CUSTOM_MODELS.
+For example, go to settings and add a new environment variable with the key AUTOTRAIN_CUSTOM_MODELS and the value set to the model name (e.g. google/gemma-7b).
+"""
+
+hardware_info = """
+

+<p>Hardware is the machine that will be used for training.</p>
+<p>Please choose hardware that is compatible with your task, data and parameters.</p>

+""" + +task_info = """ +

+<p>Task is the type of model you want to train.</p>
+<p>Please choose a task that is compatible with your data and parameters.</p>
+<p>For example, if you are training a text classification model, you can choose the "Text Classification" task.</p>

+""" + + +APP_IMAGE_CLASSIFICATION_DATA_HELP = """The data for the Image Classification task should be in the following format: +- The data should be in a zip file. +- The zip file should contain multiple folders (the classes), each folder should contain images of a single class. +- The name of the folder should be the name of the class. +- The images must be jpeg, jpg or png. +- There should be at least 5 images per class. +- There should not be any other files in the zip file. +- There should not be any other folders inside the zip folder. +""" + +APP_LM_TRAINING_TYPE = """There are two types of Language Model Training: +- generic +- chat + +In the generic mode, you provide a CSV with a text column which has already been formatted by you for training a language model. +In the chat mode, you provide a CSV with two or three text columns: prompt, context (optional) and response. +Context column can be empty for samples if not needed. You can also have a "prompt start" column. If provided, "prompt start" will be prepended before the prompt column. + +Please see [this](https://huggingface.co/datasets/tatsu-lab/alpaca) dataset which has both formats in the same dataset. +""" + + +def get_app_help(element_id): + if element_id == "autotrain_user_info": + return autotrain_user_info + elif element_id == "project_name_info": + return project_name_info + elif element_id == "column_mapping_info": + return column_mapping_info + elif element_id == "base_model_info": + return base_model_info + elif element_id == "hardware_info": + return hardware_info + elif element_id == "task_info": + return task_info + else: + return "No help available for this element." diff --git a/src/autotrain/logging.py b/src/autotrain/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..0e7ea7305d6ce6014396830dc175a5eb79b875d8 --- /dev/null +++ b/src/autotrain/logging.py @@ -0,0 +1,61 @@ +import sys +from dataclasses import dataclass + +from loguru import logger + + +IS_ACCELERATE_AVAILABLE = False + +try: + from accelerate.state import PartialState + + IS_ACCELERATE_AVAILABLE = True +except ImportError: + pass + + +@dataclass +class Logger: + """ + A custom logger class that sets up and manages logging configuration. + + Methods + ------- + __post_init__(): + Initializes the logger with a specific format and sets up the logger. + + _should_log(record): + Determines if a log record should be logged based on the process state. + + setup_logger(): + Configures the logger to output to stdout with the specified format and filter. + + get_logger(): + Returns the configured logger instance. 
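+
+    Example (illustrative):
+        logger = Logger().get_logger()
+        logger.info("starting training")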
+ """ + + def __post_init__(self): + self.log_format = ( + "{level: <8} | " + "{time:YYYY-MM-DD HH:mm:ss} | " + "{name}:{function}:{line} - " + "{message}" + ) + self.logger = logger + self.setup_logger() + + def _should_log(self, record): + if not IS_ACCELERATE_AVAILABLE: + return None + return PartialState().is_main_process + + def setup_logger(self): + self.logger.remove() + self.logger.add( + sys.stdout, + format=self.log_format, + filter=lambda x: self._should_log(x) if IS_ACCELERATE_AVAILABLE else None, + ) + + def get_logger(self): + return self.logger diff --git a/src/autotrain/params.py b/src/autotrain/params.py new file mode 100644 index 0000000000000000000000000000000000000000..5ab8f8f5694477e94844fc51f7ba8fed04889963 --- /dev/null +++ b/src/autotrain/params.py @@ -0,0 +1,12 @@ +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams +from autotrain.trainers.image_classification.params import ImageClassificationParams +from autotrain.trainers.image_regression.params import ImageRegressionParams +from autotrain.trainers.object_detection.params import ObjectDetectionParams +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams +from autotrain.trainers.seq2seq.params import Seq2SeqParams +from autotrain.trainers.tabular.params import TabularParams +from autotrain.trainers.text_classification.params import TextClassificationParams +from autotrain.trainers.text_regression.params import TextRegressionParams +from autotrain.trainers.token_classification.params import TokenClassificationParams +from autotrain.trainers.vlm.params import VLMTrainingParams diff --git a/src/autotrain/parser.py b/src/autotrain/parser.py new file mode 100644 index 0000000000000000000000000000000000000000..fd7327e904bbaaa6571240cf9c019031baa27aa0 --- /dev/null +++ b/src/autotrain/parser.py @@ -0,0 +1,229 @@ +import os +from dataclasses import dataclass + +import requests +import yaml + +from autotrain import logger +from autotrain.project import ( + AutoTrainProject, + ext_qa_munge_data, + img_clf_munge_data, + img_obj_detect_munge_data, + img_reg_munge_data, + llm_munge_data, + sent_transformers_munge_data, + seq2seq_munge_data, + tabular_munge_data, + text_clf_munge_data, + text_reg_munge_data, + token_clf_munge_data, + vlm_munge_data, +) +from autotrain.tasks import TASKS +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams +from autotrain.trainers.image_classification.params import ImageClassificationParams +from autotrain.trainers.image_regression.params import ImageRegressionParams +from autotrain.trainers.object_detection.params import ObjectDetectionParams +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams +from autotrain.trainers.seq2seq.params import Seq2SeqParams +from autotrain.trainers.tabular.params import TabularParams +from autotrain.trainers.text_classification.params import TextClassificationParams +from autotrain.trainers.text_regression.params import TextRegressionParams +from autotrain.trainers.token_classification.params import TokenClassificationParams +from autotrain.trainers.vlm.params import VLMTrainingParams + + +@dataclass +class AutoTrainConfigParser: + """ + AutoTrainConfigParser is a class responsible for parsing and validating the yaml configuration + required to run various tasks in the AutoTrain 
framework. It supports loading configurations + from both local files and remote URLs, and maps task aliases to their respective parameters + and data munging functions. + + Attributes: + config_path (str): Path or URL to the configuration file. + config (dict): Parsed configuration data. + task_param_map (dict): Mapping of task names to their parameter classes. + munge_data_map (dict): Mapping of task names to their data munging functions. + task_aliases (dict): Mapping of task aliases to their canonical task names. + task (str): The resolved task name from the configuration. + backend (str): The backend specified in the configuration. + parsed_config (dict): The parsed configuration parameters. + + Methods: + __post_init__(): Initializes the parser, loads the configuration, and validates required fields. + _parse_config(): Parses the configuration and extracts relevant parameters based on the task. + run(): Executes the task with the parsed configuration. + """ + + config_path: str + + def __post_init__(self): + if self.config_path.startswith("http"): + response = requests.get(self.config_path) + if response.status_code == 200: + self.config = yaml.safe_load(response.content) + else: + raise ValueError("Failed to retrieve YAML file.") + else: + with open(self.config_path, "r") as f: + self.config = yaml.safe_load(f) + + self.task_param_map = { + "lm_training": LLMTrainingParams, + "image_binary_classification": ImageClassificationParams, + "image_multi_class_classification": ImageClassificationParams, + "image_object_detection": ObjectDetectionParams, + "seq2seq": Seq2SeqParams, + "tabular": TabularParams, + "text_binary_classification": TextClassificationParams, + "text_multi_class_classification": TextClassificationParams, + "text_single_column_regression": TextRegressionParams, + "text_token_classification": TokenClassificationParams, + "sentence_transformers": SentenceTransformersParams, + "image_single_column_regression": ImageRegressionParams, + "vlm": VLMTrainingParams, + "text_extractive_question_answering": ExtractiveQuestionAnsweringParams, + } + self.munge_data_map = { + "lm_training": llm_munge_data, + "tabular": tabular_munge_data, + "seq2seq": seq2seq_munge_data, + "image_multi_class_classification": img_clf_munge_data, + "image_object_detection": img_obj_detect_munge_data, + "text_multi_class_classification": text_clf_munge_data, + "text_token_classification": token_clf_munge_data, + "text_single_column_regression": text_reg_munge_data, + "sentence_transformers": sent_transformers_munge_data, + "image_single_column_regression": img_reg_munge_data, + "vlm": vlm_munge_data, + "text_extractive_question_answering": ext_qa_munge_data, + } + self.task_aliases = { + "llm": "lm_training", + "llm-sft": "lm_training", + "llm-orpo": "lm_training", + "llm-generic": "lm_training", + "llm-dpo": "lm_training", + "llm-reward": "lm_training", + "image_binary_classification": "image_multi_class_classification", + "image-binary-classification": "image_multi_class_classification", + "image_classification": "image_multi_class_classification", + "image-classification": "image_multi_class_classification", + "seq2seq": "seq2seq", + "tabular": "tabular", + "text_binary_classification": "text_multi_class_classification", + "text-binary-classification": "text_multi_class_classification", + "text_classification": "text_multi_class_classification", + "text-classification": "text_multi_class_classification", + "text_single_column_regression": "text_single_column_regression", + 
"text-single-column-regression": "text_single_column_regression", + "text_regression": "text_single_column_regression", + "text-regression": "text_single_column_regression", + "token_classification": "text_token_classification", + "token-classification": "text_token_classification", + "image_object_detection": "image_object_detection", + "image-object-detection": "image_object_detection", + "object_detection": "image_object_detection", + "object-detection": "image_object_detection", + "st": "sentence_transformers", + "st:pair": "sentence_transformers", + "st:pair_class": "sentence_transformers", + "st:pair_score": "sentence_transformers", + "st:triplet": "sentence_transformers", + "st:qa": "sentence_transformers", + "sentence-transformers:pair": "sentence_transformers", + "sentence-transformers:pair_class": "sentence_transformers", + "sentence-transformers:pair_score": "sentence_transformers", + "sentence-transformers:triplet": "sentence_transformers", + "sentence-transformers:qa": "sentence_transformers", + "image_single_column_regression": "image_single_column_regression", + "image-single-column-regression": "image_single_column_regression", + "image_regression": "image_single_column_regression", + "image-regression": "image_single_column_regression", + "image-scoring": "image_single_column_regression", + "vlm:captioning": "vlm", + "vlm:vqa": "vlm", + "extractive_question_answering": "text_extractive_question_answering", + "ext_qa": "text_extractive_question_answering", + "ext-qa": "text_extractive_question_answering", + "extractive-qa": "text_extractive_question_answering", + } + task = self.config.get("task") + self.task = self.task_aliases.get(task, task) + if self.task is None: + raise ValueError("Task is required in the configuration file") + if self.task not in TASKS: + raise ValueError(f"Task `{self.task}` is not supported") + self.backend = self.config.get("backend") + if self.backend is None: + raise ValueError("Backend is required in the configuration file") + + logger.info(f"Running task: {self.task}") + logger.info(f"Using backend: {self.backend}") + + self.parsed_config = self._parse_config() + + def _parse_config(self): + params = { + "model": self.config["base_model"], + "project_name": self.config["project_name"], + } + + params["data_path"] = self.config["data"]["path"] + + if self.task == "lm_training": + params["chat_template"] = self.config["data"]["chat_template"] + if "-" in self.config["task"]: + params["trainer"] = self.config["task"].split("-")[1] + if params["trainer"] == "generic": + params["trainer"] = "default" + if params["trainer"] not in ["sft", "orpo", "dpo", "reward", "default"]: + raise ValueError("Invalid LLM training task") + + if self.task == "sentence_transformers": + params["trainer"] = self.config["task"].split(":")[1] + + if self.task == "vlm": + params["trainer"] = self.config["task"].split(":")[1] + + for k, v in self.config["data"]["column_mapping"].items(): + params[k] = v + params["train_split"] = self.config["data"]["train_split"] + params["valid_split"] = self.config["data"]["valid_split"] + params["log"] = self.config["log"] + + if "hub" in self.config: + params["username"] = self.config["hub"]["username"] + params["token"] = self.config["hub"]["token"] + params["push_to_hub"] = self.config["hub"]["push_to_hub"] + else: + params["username"] = None + params["token"] = None + params["push_to_hub"] = False + + if params["username"]: + if params["username"].startswith("${"): + params["username"] = os.environ.get(params["username"][2:-1]) + + 
if params["token"]: + if params["token"].startswith("${"): + params["token"] = os.environ.get(params["token"][2:-1]) + + other_params = self.config.get("params") + if other_params: + params.update(other_params) + + return params + + def run(self): + _params = self.task_param_map[self.task](**self.parsed_config) + logger.info(_params) + _munge_fn = self.munge_data_map[self.task] + _munge_fn(_params, local=self.backend.startswith("local")) + project = AutoTrainProject(params=_params, backend=self.backend) + job_id = project.create() + logger.info(f"Job ID: {job_id}") diff --git a/src/autotrain/preprocessor/__init__.py b/src/autotrain/preprocessor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/preprocessor/tabular.py b/src/autotrain/preprocessor/tabular.py new file mode 100644 index 0000000000000000000000000000000000000000..defcb3c4a7647a39891a113a681b8794287f2ea3 --- /dev/null +++ b/src/autotrain/preprocessor/tabular.py @@ -0,0 +1,273 @@ +from dataclasses import dataclass +from typing import List, Optional + +import pandas as pd +from datasets import Dataset, DatasetDict +from sklearn.model_selection import train_test_split + + +RESERVED_COLUMNS = ["autotrain_id", "autotrain_label"] + + +@dataclass +class TabularBinaryClassificationPreprocessor: + """ + A preprocessor class for tabular binary classification tasks. + + Attributes: + train_data (pd.DataFrame): The training data. + label_column (str): The name of the label column in the training data. + username (str): The username for the Hugging Face Hub. + project_name (str): The name of the project. + token (str): The authentication token for the Hugging Face Hub. + id_column (Optional[str]): The name of the ID column in the training data. Default is None. + valid_data (Optional[pd.DataFrame]): The validation data. Default is None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): The random seed for splitting the data. Default is 42. + local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False. + + Methods: + __post_init__(): Validates the presence of required columns in the training and validation data. + split(): Splits the training data into training and validation sets if validation data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns by adding 'autotrain_id' and 'autotrain_label', and drops the original ID and label columns. + prepare(): Prepares the dataset by splitting, processing columns, and saving or pushing the dataset to the Hugging Face Hub. 
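+
+    Example (illustrative; the dataframe, names and token are placeholders):
+        preprocessor = TabularBinaryClassificationPreprocessor(
+            train_data=train_df,
+            label_column="label",
+            username="user",
+            project_name="my-project",
+            token="hf_...",
+            local=True,
+        )
+        data_path = preprocessor.prepare()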
+ """ + + train_data: pd.DataFrame + label_column: str + username: str + project_name: str + token: str + id_column: Optional[str] = None + valid_data: Optional[pd.DataFrame] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + local: Optional[bool] = False + + def __post_init__(self): + # check if id_column and label_column are in train_data + if self.id_column is not None: + if self.id_column not in self.train_data.columns: + raise ValueError(f"{self.id_column} not in train data") + + if self.label_column not in self.train_data.columns: + raise ValueError(f"{self.label_column} not in train data") + + # check if id_column and label_column are in valid_data + if self.valid_data is not None: + if self.id_column is not None: + if self.id_column not in self.valid_data.columns: + raise ValueError(f"{self.id_column} not in valid data") + if self.label_column not in self.valid_data.columns: + raise ValueError(f"{self.label_column} not in valid data") + + # make sure no reserved columns are in train_data or valid_data + for column in RESERVED_COLUMNS: + if column in self.train_data.columns: + raise ValueError(f"{column} is a reserved column name") + if self.valid_data is not None: + if column in self.valid_data.columns: + raise ValueError(f"{column} is a reserved column name") + + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + else: + train_df, valid_df = train_test_split( + self.train_data, + test_size=self.test_size, + random_state=self.seed, + stratify=self.train_data[self.label_column], + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare_columns(self, train_df, valid_df): + train_df.loc[:, "autotrain_id"] = train_df[self.id_column] if self.id_column else list(range(len(train_df))) + train_df.loc[:, "autotrain_label"] = train_df[self.label_column] + valid_df.loc[:, "autotrain_id"] = valid_df[self.id_column] if self.id_column else list(range(len(valid_df))) + valid_df.loc[:, "autotrain_label"] = valid_df[self.label_column] + + # drop id_column and label_column + drop_cols = [self.id_column, self.label_column] if self.id_column else [self.label_column] + train_df = train_df.drop(columns=drop_cols) + valid_df = valid_df.drop(columns=drop_cols) + return train_df, valid_df + + def prepare(self): + train_df, valid_df = self.split() + train_df, valid_df = self.prepare_columns(train_df, valid_df) + train_df = Dataset.from_pandas(train_df) + valid_df = Dataset.from_pandas(valid_df) + if self.local: + dataset = DatasetDict( + { + "train": train_df, + "validation": valid_df, + } + ) + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + train_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="train", + private=True, + token=self.token, + ) + valid_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="validation", + private=True, + token=self.token, + ) + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" + + +class TabularMultiClassClassificationPreprocessor(TabularBinaryClassificationPreprocessor): + pass + + +class TabularSingleColumnRegressionPreprocessor(TabularBinaryClassificationPreprocessor): + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + else: + train_df, valid_df = train_test_split( + self.train_data, + test_size=self.test_size, + 
random_state=self.seed, + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + +@dataclass +class TabularMultiLabelClassificationPreprocessor: + """ + TabularMultiLabelClassificationPreprocessor is a class for preprocessing tabular data for multi-label classification tasks. + + Attributes: + train_data (pd.DataFrame): The training data. + label_column (List[str]): List of columns to be used as labels. + username (str): The username for the Hugging Face Hub. + project_name (str): The project name for the Hugging Face Hub. + id_column (Optional[str]): The column to be used as an identifier. Defaults to None. + valid_data (Optional[pd.DataFrame]): The validation data. Defaults to None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Defaults to 0.2. + seed (Optional[int]): The random seed for splitting the data. Defaults to 42. + token (Optional[str]): The token for authentication with the Hugging Face Hub. Defaults to None. + local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Defaults to False. + + Methods: + __post_init__(): Validates the presence of id_column and label_column in train_data and valid_data, and checks for reserved column names. + split(): Splits the train_data into training and validation sets if valid_data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns by adding autotrain_id and autotrain_label columns, and drops the original id_column and label_column. + prepare(): Prepares the dataset by splitting the data, preparing the columns, and converting to Hugging Face Dataset format. Saves the dataset locally or pushes to the Hugging Face Hub. + """ + + train_data: pd.DataFrame + label_column: List[str] + username: str + project_name: str + id_column: Optional[str] = None + valid_data: Optional[pd.DataFrame] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + token: Optional[str] = None + local: Optional[bool] = False + + def __post_init__(self): + # check if id_column and label_column are in train_data + if self.id_column is not None: + if self.id_column not in self.train_data.columns: + raise ValueError(f"{self.id_column} not in train data") + + for label in self.label_column: + if label not in self.train_data.columns: + raise ValueError(f"{label} not in train data") + + # check if id_column and label_column are in valid_data + if self.valid_data is not None: + if self.id_column is not None: + if self.id_column not in self.valid_data.columns: + raise ValueError(f"{self.id_column} not in valid data") + for label in self.label_column: + if label not in self.valid_data.columns: + raise ValueError(f"{label} not in valid data") + + # make sure no reserved columns are in train_data or valid_data + for column in RESERVED_COLUMNS: + if column in self.train_data.columns: + raise ValueError(f"{column} is a reserved column name") + if self.valid_data is not None: + if column in self.valid_data.columns: + raise ValueError(f"{column} is a reserved column name") + + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + else: + train_df, valid_df = train_test_split( + self.train_data, + test_size=self.test_size, + random_state=self.seed, + stratify=self.train_data[self.label_column], + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare_columns(self, train_df, 
valid_df): + train_df.loc[:, "autotrain_id"] = train_df[self.id_column] if self.id_column else list(range(len(train_df))) + + for label in range(len(self.label_column)): + train_df.loc[:, f"autotrain_label_{label}"] = train_df[self.label_column[label]] + + valid_df.loc[:, "autotrain_id"] = valid_df[self.id_column] if self.id_column else list(range(len(valid_df))) + + for label in range(len(self.label_column)): + valid_df.loc[:, f"autotrain_label_{label}"] = valid_df[self.label_column[label]] + + # drop id_column and label_column + drop_cols = [self.id_column] + self.label_column if self.id_column else self.label_column + train_df = train_df.drop(columns=drop_cols) + valid_df = valid_df.drop(columns=drop_cols) + return train_df, valid_df + + def prepare(self): + train_df, valid_df = self.split() + train_df, valid_df = self.prepare_columns(train_df, valid_df) + train_df = Dataset.from_pandas(train_df) + valid_df = Dataset.from_pandas(valid_df) + if self.local: + dataset = DatasetDict( + { + "train": train_df, + "validation": valid_df, + } + ) + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + train_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="train", + private=True, + token=self.token, + ) + valid_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="validation", + private=True, + token=self.token, + ) + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" + + +class TabularMultiColumnRegressionPreprocessor(TabularMultiLabelClassificationPreprocessor): + pass diff --git a/src/autotrain/preprocessor/text.py b/src/autotrain/preprocessor/text.py new file mode 100644 index 0000000000000000000000000000000000000000..0c2a991818d8ac2edddaf90eff7c790f76ab979f --- /dev/null +++ b/src/autotrain/preprocessor/text.py @@ -0,0 +1,828 @@ +import ast +from dataclasses import dataclass +from typing import Optional + +import pandas as pd +from datasets import ClassLabel, Dataset, DatasetDict, Sequence +from sklearn.model_selection import train_test_split + +from autotrain import logger + + +RESERVED_COLUMNS = ["autotrain_text", "autotrain_label", "autotrain_question", "autotrain_answer"] +LLM_RESERVED_COLUMNS = [ + "autotrain_prompt", + "autotrain_context", + "autotrain_rejected_text", + "autotrain_prompt_start", +] + + +@dataclass +class TextBinaryClassificationPreprocessor: + """ + A preprocessor class for binary text classification tasks. + + Attributes: + train_data (pd.DataFrame): The training data. + text_column (str): The name of the column containing text data. + label_column (str): The name of the column containing label data. + username (str): The username for the Hugging Face Hub. + project_name (str): The project name for saving datasets. + token (str): The authentication token for the Hugging Face Hub. + valid_data (Optional[pd.DataFrame]): The validation data. Defaults to None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Defaults to 0.2. + seed (Optional[int]): The random seed for splitting the data. Defaults to 42. + convert_to_class_label (Optional[bool]): Whether to convert labels to class labels. Defaults to False. + local (Optional[bool]): Whether to save the dataset locally. Defaults to False. + + Methods: + __post_init__(): Validates the presence of required columns in the dataframes and checks for reserved column names. 
+ split(): Splits the training data into training and validation sets if validation data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns for training and validation dataframes. + prepare(): Prepares the datasets for training and validation, converts labels if required, and saves or uploads the datasets. + """ + + train_data: pd.DataFrame + text_column: str + label_column: str + username: str + project_name: str + token: str + valid_data: Optional[pd.DataFrame] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + convert_to_class_label: Optional[bool] = False + local: Optional[bool] = False + + def __post_init__(self): + # check if text_column and label_column are in train_data + if self.text_column not in self.train_data.columns: + raise ValueError(f"{self.text_column} not in train data") + if self.label_column not in self.train_data.columns: + raise ValueError(f"{self.label_column} not in train data") + # check if text_column and label_column are in valid_data + if self.valid_data is not None: + if self.text_column not in self.valid_data.columns: + raise ValueError(f"{self.text_column} not in valid data") + if self.label_column not in self.valid_data.columns: + raise ValueError(f"{self.label_column} not in valid data") + + # make sure no reserved columns are in train_data or valid_data + for column in RESERVED_COLUMNS: + if column in self.train_data.columns: + raise ValueError(f"{column} is a reserved column name") + if self.valid_data is not None: + if column in self.valid_data.columns: + raise ValueError(f"{column} is a reserved column name") + + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + else: + train_df, valid_df = train_test_split( + self.train_data, + test_size=self.test_size, + random_state=self.seed, + stratify=self.train_data[self.label_column], + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare_columns(self, train_df, valid_df): + train_df.loc[:, "autotrain_text"] = train_df[self.text_column] + train_df.loc[:, "autotrain_label"] = train_df[self.label_column] + valid_df.loc[:, "autotrain_text"] = valid_df[self.text_column] + valid_df.loc[:, "autotrain_label"] = valid_df[self.label_column] + + # drop text_column and label_column + train_df = train_df.drop(columns=[self.text_column, self.label_column]) + valid_df = valid_df.drop(columns=[self.text_column, self.label_column]) + return train_df, valid_df + + def prepare(self): + train_df, valid_df = self.split() + train_df, valid_df = self.prepare_columns(train_df, valid_df) + + train_df.loc[:, "autotrain_label"] = train_df["autotrain_label"].astype(str) + valid_df.loc[:, "autotrain_label"] = valid_df["autotrain_label"].astype(str) + + label_names = sorted(set(train_df["autotrain_label"].unique().tolist())) + + train_df = Dataset.from_pandas(train_df) + valid_df = Dataset.from_pandas(valid_df) + + if self.convert_to_class_label: + train_df = train_df.cast_column("autotrain_label", ClassLabel(names=label_names)) + valid_df = valid_df.cast_column("autotrain_label", ClassLabel(names=label_names)) + + if self.local: + dataset = DatasetDict( + { + "train": train_df, + "validation": valid_df, + } + ) + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + train_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="train", + private=True, + token=self.token, + ) + valid_df.push_to_hub( + 
f"{self.username}/autotrain-data-{self.project_name}", + split="validation", + private=True, + token=self.token, + ) + + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" + + +class TextMultiClassClassificationPreprocessor(TextBinaryClassificationPreprocessor): + """ + TextMultiClassClassificationPreprocessor is a class for preprocessing text data for multi-class classification tasks. + + This class inherits from TextBinaryClassificationPreprocessor and is designed to handle scenarios where the text data + needs to be classified into more than two categories. + + Methods: + Inherits all methods from TextBinaryClassificationPreprocessor. + + Attributes: + Inherits all attributes from TextBinaryClassificationPreprocessor. + """ + + pass + + +class TextSingleColumnRegressionPreprocessor(TextBinaryClassificationPreprocessor): + """ + A preprocessor class for single-column regression tasks, inheriting from TextBinaryClassificationPreprocessor. + + Methods + ------- + split(): + Splits the training data into training and validation sets. If validation data is already provided, it returns + the training and validation data as is. Otherwise, it performs a train-test split on the training data. + + prepare(): + Prepares the training and validation datasets by splitting the data, preparing the columns, and converting + them to Hugging Face Datasets. The datasets are then either saved locally or pushed to the Hugging Face Hub, + depending on the `local` attribute. + """ + + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + else: + train_df, valid_df = train_test_split( + self.train_data, + test_size=self.test_size, + random_state=self.seed, + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare(self): + train_df, valid_df = self.split() + train_df, valid_df = self.prepare_columns(train_df, valid_df) + + train_df = Dataset.from_pandas(train_df) + valid_df = Dataset.from_pandas(valid_df) + + if self.local: + dataset = DatasetDict( + { + "train": train_df, + "validation": valid_df, + } + ) + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + train_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="train", + private=True, + token=self.token, + ) + valid_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="validation", + private=True, + token=self.token, + ) + + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" + + +class TextTokenClassificationPreprocessor(TextBinaryClassificationPreprocessor): + """ + A preprocessor class for text token classification tasks, inheriting from TextBinaryClassificationPreprocessor. + + Methods + ------- + split(): + Splits the training data into training and validation sets. If validation data is already provided, it returns + the training and validation data as is. Otherwise, it splits the training data based on the test size and seed. + + prepare(): + Prepares the training and validation data for token classification. This includes splitting the data, preparing + columns, evaluating text and label columns, and converting them to datasets. The datasets are then either saved + locally or pushed to the Hugging Face Hub based on the configuration. 
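+
+    Note: the text and label columns are expected to hold lists of tokens and tags
+    (or their string representations, e.g. "['EU', 'rejects']" and "['B-ORG', 'O']"),
+    which prepare() parses with ast.literal_eval when needed.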
+ """ + + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + else: + train_df, valid_df = train_test_split( + self.train_data, + test_size=self.test_size, + random_state=self.seed, + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare(self): + train_df, valid_df = self.split() + train_df, valid_df = self.prepare_columns(train_df, valid_df) + try: + train_df.loc[:, "autotrain_text"] = train_df["autotrain_text"].apply(lambda x: ast.literal_eval(x)) + valid_df.loc[:, "autotrain_text"] = valid_df["autotrain_text"].apply(lambda x: ast.literal_eval(x)) + except ValueError: + logger.warning("Unable to do ast.literal_eval on train_df['autotrain_text']") + logger.warning("assuming autotrain_text is already a list") + try: + train_df.loc[:, "autotrain_label"] = train_df["autotrain_label"].apply(lambda x: ast.literal_eval(x)) + valid_df.loc[:, "autotrain_label"] = valid_df["autotrain_label"].apply(lambda x: ast.literal_eval(x)) + except ValueError: + logger.warning("Unable to do ast.literal_eval on train_df['autotrain_label']") + logger.warning("assuming autotrain_label is already a list") + + label_names_train = sorted(set(train_df["autotrain_label"].explode().unique().tolist())) + label_names_valid = sorted(set(valid_df["autotrain_label"].explode().unique().tolist())) + label_names = sorted(set(label_names_train + label_names_valid)) + + train_df = Dataset.from_pandas(train_df) + valid_df = Dataset.from_pandas(valid_df) + + if self.convert_to_class_label: + train_df = train_df.cast_column("autotrain_label", Sequence(ClassLabel(names=label_names))) + valid_df = valid_df.cast_column("autotrain_label", Sequence(ClassLabel(names=label_names))) + + if self.local: + dataset = DatasetDict( + { + "train": train_df, + "validation": valid_df, + } + ) + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + train_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="train", + private=True, + token=self.token, + ) + valid_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="validation", + private=True, + token=self.token, + ) + + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" + + +@dataclass +class LLMPreprocessor: + """ + A class used to preprocess data for large language model (LLM) training. + + Attributes + ---------- + train_data : pd.DataFrame + The training data. + username : str + The username for the Hugging Face Hub. + project_name : str + The name of the project. + token : str + The token for authentication. + valid_data : Optional[pd.DataFrame], optional + The validation data, by default None. + test_size : Optional[float], optional + The size of the test split, by default 0.2. + seed : Optional[int], optional + The random seed, by default 42. + text_column : Optional[str], optional + The name of the text column, by default None. + prompt_column : Optional[str], optional + The name of the prompt column, by default None. + rejected_text_column : Optional[str], optional + The name of the rejected text column, by default None. + local : Optional[bool], optional + Whether to save the dataset locally, by default False. + + Methods + ------- + __post_init__() + Validates the provided columns and checks for reserved column names. + split() + Splits the data into training and validation sets. 
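+        (If no validation data is provided, the training data itself is returned
+        as the validation set; no split is performed for LLM training.)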
+ prepare_columns(train_df, valid_df) + Prepares the columns for training and validation datasets. + prepare() + Prepares the datasets and pushes them to the Hugging Face Hub or saves them locally. + """ + + train_data: pd.DataFrame + username: str + project_name: str + token: str + valid_data: Optional[pd.DataFrame] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + text_column: Optional[str] = None + prompt_column: Optional[str] = None + rejected_text_column: Optional[str] = None + local: Optional[bool] = False + + def __post_init__(self): + if self.text_column is None: + raise ValueError("text_column must be provided") + + # check if text_column and rejected_text_column are in train_data + if self.prompt_column is not None and self.prompt_column not in self.train_data.columns: + self.prompt_column = None + if self.rejected_text_column is not None and self.rejected_text_column not in self.train_data.columns: + self.rejected_text_column = None + + # make sure no reserved columns are in train_data or valid_data + for column in RESERVED_COLUMNS + LLM_RESERVED_COLUMNS: + if column in self.train_data.columns: + raise ValueError(f"{column} is a reserved column name") + if self.valid_data is not None: + if column in self.valid_data.columns: + raise ValueError(f"{column} is a reserved column name") + + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + # no validation is done in llm training if validation data is not provided + return self.train_data, self.train_data + # else: + # train_df, valid_df = train_test_split( + # self.train_data, + # test_size=self.test_size, + # random_state=self.seed, + # ) + # train_df = train_df.reset_index(drop=True) + # valid_df = valid_df.reset_index(drop=True) + # return train_df, valid_df + + def prepare_columns(self, train_df, valid_df): + drop_cols = [self.text_column] + train_df.loc[:, "autotrain_text"] = train_df[self.text_column] + valid_df.loc[:, "autotrain_text"] = valid_df[self.text_column] + if self.prompt_column is not None: + drop_cols.append(self.prompt_column) + train_df.loc[:, "autotrain_prompt"] = train_df[self.prompt_column] + valid_df.loc[:, "autotrain_prompt"] = valid_df[self.prompt_column] + if self.rejected_text_column is not None: + drop_cols.append(self.rejected_text_column) + train_df.loc[:, "autotrain_rejected_text"] = train_df[self.rejected_text_column] + valid_df.loc[:, "autotrain_rejected_text"] = valid_df[self.rejected_text_column] + + # drop drop_cols + train_df = train_df.drop(columns=drop_cols) + valid_df = valid_df.drop(columns=drop_cols) + return train_df, valid_df + + def prepare(self): + train_df, valid_df = self.split() + train_df, valid_df = self.prepare_columns(train_df, valid_df) + train_df = Dataset.from_pandas(train_df) + valid_df = Dataset.from_pandas(valid_df) + if self.local: + dataset = DatasetDict( + { + "train": train_df, + "validation": valid_df, + } + ) + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + train_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="train", + private=True, + token=self.token, + ) + valid_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="validation", + private=True, + token=self.token, + ) + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" + + +@dataclass +class Seq2SeqPreprocessor: + """ + Seq2SeqPreprocessor is a class for preprocessing sequence-to-sequence 
training data. + + Attributes: + train_data (pd.DataFrame): The training data. + text_column (str): The name of the column containing the input text. + label_column (str): The name of the column containing the labels. + username (str): The username for pushing data to the hub. + project_name (str): The name of the project. + token (str): The token for authentication. + valid_data (Optional[pd.DataFrame]): The validation data. Default is None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): The random seed for splitting the data. Default is 42. + local (Optional[bool]): Whether to save the dataset locally or push to the hub. Default is False. + + Methods: + __post_init__(): Validates the presence of required columns in the training and validation data. + split(): Splits the training data into training and validation sets if validation data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns for training and validation data. + prepare(): Prepares the dataset for training by splitting, preparing columns, and converting to Dataset objects. + """ + + train_data: pd.DataFrame + text_column: str + label_column: str + username: str + project_name: str + token: str + valid_data: Optional[pd.DataFrame] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + local: Optional[bool] = False + + def __post_init__(self): + # check if text_column and label_column are in train_data + if self.text_column not in self.train_data.columns: + raise ValueError(f"{self.text_column} not in train data") + if self.label_column not in self.train_data.columns: + raise ValueError(f"{self.label_column} not in train data") + # check if text_column and label_column are in valid_data + if self.valid_data is not None: + if self.text_column not in self.valid_data.columns: + raise ValueError(f"{self.text_column} not in valid data") + if self.label_column not in self.valid_data.columns: + raise ValueError(f"{self.label_column} not in valid data") + + # make sure no reserved columns are in train_data or valid_data + for column in RESERVED_COLUMNS: + if column in self.train_data.columns: + raise ValueError(f"{column} is a reserved column name") + if self.valid_data is not None: + if column in self.valid_data.columns: + raise ValueError(f"{column} is a reserved column name") + + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + else: + train_df, valid_df = train_test_split( + self.train_data, + test_size=self.test_size, + random_state=self.seed, + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare_columns(self, train_df, valid_df): + train_df.loc[:, "autotrain_text"] = train_df[self.text_column] + train_df.loc[:, "autotrain_label"] = train_df[self.label_column] + valid_df.loc[:, "autotrain_text"] = valid_df[self.text_column] + valid_df.loc[:, "autotrain_label"] = valid_df[self.label_column] + + # drop text_column and label_column + train_df = train_df.drop(columns=[self.text_column, self.label_column]) + valid_df = valid_df.drop(columns=[self.text_column, self.label_column]) + return train_df, valid_df + + def prepare(self): + train_df, valid_df = self.split() + train_df, valid_df = self.prepare_columns(train_df, valid_df) + + train_df = Dataset.from_pandas(train_df) + valid_df = Dataset.from_pandas(valid_df) + + if self.local: + dataset = DatasetDict( + { + "train": train_df, 
+ "validation": valid_df, + } + ) + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + train_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="train", + private=True, + token=self.token, + ) + valid_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="validation", + private=True, + token=self.token, + ) + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" + + +@dataclass +class SentenceTransformersPreprocessor: + """ + A preprocessor class for preparing datasets for sentence transformers. + + Attributes: + train_data (pd.DataFrame): The training data. + username (str): The username for the Hugging Face Hub. + project_name (str): The project name for the Hugging Face Hub. + token (str): The token for authentication with the Hugging Face Hub. + valid_data (Optional[pd.DataFrame]): The validation data. Default is None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): The random seed for splitting the data. Default is 42. + local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False. + sentence1_column (Optional[str]): The name of the first sentence column. Default is "sentence1". + sentence2_column (Optional[str]): The name of the second sentence column. Default is "sentence2". + sentence3_column (Optional[str]): The name of the third sentence column. Default is "sentence3". + target_column (Optional[str]): The name of the target column. Default is "target". + convert_to_class_label (Optional[bool]): Whether to convert the target column to class labels. Default is False. + + Methods: + __post_init__(): Ensures no reserved columns are in train_data or valid_data. + split(): Splits the train_data into training and validation sets if valid_data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns for training and validation datasets. + prepare(): Prepares the datasets and either saves them locally or pushes them to the Hugging Face Hub. 
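+
+    Example (illustrative, for a triplet-style dataset; names and token are placeholders):
+        preprocessor = SentenceTransformersPreprocessor(
+            train_data=train_df,
+            username="user",
+            project_name="my-project",
+            token="hf_...",
+            sentence1_column="anchor",
+            sentence2_column="positive",
+            sentence3_column="negative",
+            target_column=None,
+            local=True,
+        )
+        data_path = preprocessor.prepare()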
+ """ + + train_data: pd.DataFrame + username: str + project_name: str + token: str + valid_data: Optional[pd.DataFrame] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + local: Optional[bool] = False + sentence1_column: Optional[str] = "sentence1" + sentence2_column: Optional[str] = "sentence2" + sentence3_column: Optional[str] = "sentence3" + target_column: Optional[str] = "target" + convert_to_class_label: Optional[bool] = False + + def __post_init__(self): + # make sure no reserved columns are in train_data or valid_data + for column in RESERVED_COLUMNS + LLM_RESERVED_COLUMNS: + if column in self.train_data.columns: + raise ValueError(f"{column} is a reserved column name") + if self.valid_data is not None: + if column in self.valid_data.columns: + raise ValueError(f"{column} is a reserved column name") + + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + else: + train_df, valid_df = train_test_split( + self.train_data, + test_size=self.test_size, + random_state=self.seed, + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare_columns(self, train_df, valid_df): + train_df.loc[:, "autotrain_sentence1"] = train_df[self.sentence1_column] + train_df.loc[:, "autotrain_sentence2"] = train_df[self.sentence2_column] + valid_df.loc[:, "autotrain_sentence1"] = valid_df[self.sentence1_column] + valid_df.loc[:, "autotrain_sentence2"] = valid_df[self.sentence2_column] + keep_cols = ["autotrain_sentence1", "autotrain_sentence2"] + + if self.sentence3_column is not None: + train_df.loc[:, "autotrain_sentence3"] = train_df[self.sentence3_column] + valid_df.loc[:, "autotrain_sentence3"] = valid_df[self.sentence3_column] + keep_cols.append("autotrain_sentence3") + + if self.target_column is not None: + train_df.loc[:, "autotrain_target"] = train_df[self.target_column] + valid_df.loc[:, "autotrain_target"] = valid_df[self.target_column] + keep_cols.append("autotrain_target") + + train_df = train_df[keep_cols] + valid_df = valid_df[keep_cols] + + return train_df, valid_df + + def prepare(self): + train_df, valid_df = self.split() + train_df, valid_df = self.prepare_columns(train_df, valid_df) + + if self.convert_to_class_label: + label_names = sorted(set(train_df["autotrain_target"].unique().tolist())) + + train_df = Dataset.from_pandas(train_df) + valid_df = Dataset.from_pandas(valid_df) + + if self.convert_to_class_label: + train_df = train_df.cast_column("autotrain_target", ClassLabel(names=label_names)) + valid_df = valid_df.cast_column("autotrain_target", ClassLabel(names=label_names)) + + if self.local: + dataset = DatasetDict( + { + "train": train_df, + "validation": valid_df, + } + ) + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + train_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="train", + private=True, + token=self.token, + ) + valid_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="validation", + private=True, + token=self.token, + ) + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" + + +@dataclass +class TextExtractiveQuestionAnsweringPreprocessor: + """ + Preprocessor for text extractive question answering tasks. + + Attributes: + train_data (pd.DataFrame): The training data. + text_column (str): The name of the text column in the data. 
+ question_column (str): The name of the question column in the data. + answer_column (str): The name of the answer column in the data. + username (str): The username for the Hugging Face Hub. + project_name (str): The project name for the Hugging Face Hub. + token (str): The token for authentication with the Hugging Face Hub. + valid_data (Optional[pd.DataFrame]): The validation data. Default is None. + test_size (Optional[float]): The proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): The random seed for splitting the data. Default is 42. + local (Optional[bool]): Whether to save the dataset locally or push to the Hugging Face Hub. Default is False. + + Methods: + __post_init__(): Validates the columns in the training and validation data and converts the answer column to a dictionary. + split(): Splits the training data into training and validation sets if validation data is not provided. + prepare_columns(train_df, valid_df): Prepares the columns for training and validation data. + prepare(): Prepares the dataset for training by splitting, preparing columns, and converting to Hugging Face Dataset format. + """ + + train_data: pd.DataFrame + text_column: str + question_column: str + answer_column: str + username: str + project_name: str + token: str + valid_data: Optional[pd.DataFrame] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + local: Optional[bool] = False + + def __post_init__(self): + # check if text_column, question_column, and answer_column are in train_data + if self.text_column not in self.train_data.columns: + raise ValueError(f"{self.text_column} not in train data") + if self.question_column not in self.train_data.columns: + raise ValueError(f"{self.question_column} not in train data") + if self.answer_column not in self.train_data.columns: + raise ValueError(f"{self.answer_column} not in train data") + # check if text_column, question_column, and answer_column are in valid_data + if self.valid_data is not None: + if self.text_column not in self.valid_data.columns: + raise ValueError(f"{self.text_column} not in valid data") + if self.question_column not in self.valid_data.columns: + raise ValueError(f"{self.question_column} not in valid data") + if self.answer_column not in self.valid_data.columns: + raise ValueError(f"{self.answer_column} not in valid data") + + # make sure no reserved columns are in train_data or valid_data + for column in RESERVED_COLUMNS: + if column in self.train_data.columns: + raise ValueError(f"{column} is a reserved column name") + if self.valid_data is not None: + if column in self.valid_data.columns: + raise ValueError(f"{column} is a reserved column name") + + # convert answer_column to dict + try: + self.train_data.loc[:, self.answer_column] = self.train_data[self.answer_column].apply( + lambda x: ast.literal_eval(x) + ) + except ValueError: + logger.warning("Unable to do ast.literal_eval on train_data[answer_column]") + logger.warning("assuming answer_column is already a dict") + + if self.valid_data is not None: + try: + self.valid_data.loc[:, self.answer_column] = self.valid_data[self.answer_column].apply( + lambda x: ast.literal_eval(x) + ) + except ValueError: + logger.warning("Unable to do ast.literal_eval on valid_data[answer_column]") + logger.warning("assuming answer_column is already a dict") + + def split(self): + if self.valid_data is not None: + return self.train_data, self.valid_data + else: + train_df, valid_df = train_test_split( + self.train_data, + 
test_size=self.test_size, + random_state=self.seed, + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare_columns(self, train_df, valid_df): + train_df.loc[:, "autotrain_text"] = train_df[self.text_column] + train_df.loc[:, "autotrain_question"] = train_df[self.question_column] + train_df.loc[:, "autotrain_answer"] = train_df[self.answer_column] + valid_df.loc[:, "autotrain_text"] = valid_df[self.text_column] + valid_df.loc[:, "autotrain_question"] = valid_df[self.question_column] + valid_df.loc[:, "autotrain_answer"] = valid_df[self.answer_column] + + # drop all other columns + train_df = train_df.drop( + columns=[ + x for x in train_df.columns if x not in ["autotrain_text", "autotrain_question", "autotrain_answer"] + ] + ) + valid_df = valid_df.drop( + columns=[ + x for x in valid_df.columns if x not in ["autotrain_text", "autotrain_question", "autotrain_answer"] + ] + ) + return train_df, valid_df + + def prepare(self): + train_df, valid_df = self.split() + train_df, valid_df = self.prepare_columns(train_df, valid_df) + + train_df = Dataset.from_pandas(train_df) + valid_df = Dataset.from_pandas(valid_df) + + if self.local: + dataset = DatasetDict( + { + "train": train_df, + "validation": valid_df, + } + ) + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + train_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="train", + private=True, + token=self.token, + ) + valid_df.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + split="validation", + private=True, + token=self.token, + ) + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" diff --git a/src/autotrain/preprocessor/vision.py b/src/autotrain/preprocessor/vision.py new file mode 100644 index 0000000000000000000000000000000000000000..b1075888a57d9456e3a305051014c96d5cf617fd --- /dev/null +++ b/src/autotrain/preprocessor/vision.py @@ -0,0 +1,565 @@ +import os +import shutil +import uuid +from dataclasses import dataclass +from typing import Optional + +import pandas as pd +from datasets import ClassLabel, Features, Image, Sequence, Value, load_dataset +from sklearn.model_selection import train_test_split + + +ALLOWED_EXTENSIONS = ("jpeg", "png", "jpg", "JPG", "JPEG", "PNG") + + +@dataclass +class ImageClassificationPreprocessor: + """ + A class used to preprocess image data for classification tasks. + + Attributes + ---------- + train_data : str + Path to the training data directory. + username : str + Username for the Hugging Face Hub. + project_name : str + Name of the project. + token : str + Authentication token for the Hugging Face Hub. + valid_data : Optional[str], optional + Path to the validation data directory, by default None. + test_size : Optional[float], optional + Proportion of the dataset to include in the validation split, by default 0.2. + seed : Optional[int], optional + Random seed for reproducibility, by default 42. + local : Optional[bool], optional + Whether to save the dataset locally or push to the Hugging Face Hub, by default False. + + Methods + ------- + __post_init__(): + Validates the structure and contents of the training and validation data directories. + split(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]: + Splits the dataframe into training and validation sets. 
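+        (The split is stratified by the class subfolder, so class proportions are
+        preserved between the train and validation sets.)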
+ prepare() -> str: + Prepares the dataset for training and either saves it locally or pushes it to the Hugging Face Hub. + """ + + train_data: str + username: str + project_name: str + token: str + valid_data: Optional[str] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + local: Optional[bool] = False + + def __post_init__(self): + # Check if train data path exists + if not os.path.exists(self.train_data): + raise ValueError(f"{self.train_data} does not exist.") + + # Check if train data path contains at least 2 folders + subfolders = [f.path for f in os.scandir(self.train_data) if f.is_dir()] + # list subfolders + if len(subfolders) < 2: + raise ValueError(f"{self.train_data} should contain at least 2 subfolders.") + + # Check if each subfolder contains at least 5 image files in jpeg, png or jpg format only + for subfolder in subfolders: + image_files = [f for f in os.listdir(subfolder) if f.endswith(ALLOWED_EXTENSIONS)] + if len(image_files) < 5: + raise ValueError(f"{subfolder} should contain at least 5 jpeg, png or jpg files.") + # Check if there are no other files except image files in the subfolder + if len(image_files) != len(os.listdir(subfolder)): + raise ValueError(f"{subfolder} should not contain any other files except image files.") + + # Check if there are no subfolders inside subfolders + subfolders_in_subfolder = [f.path for f in os.scandir(subfolder) if f.is_dir()] + if len(subfolders_in_subfolder) > 0: + raise ValueError(f"{subfolder} should not contain any subfolders.") + + if self.valid_data: + # Check if valid data path exists + if not os.path.exists(self.valid_data): + raise ValueError(f"{self.valid_data} does not exist.") + + # Check if valid data path contains at least 2 folders + subfolders = [f.path for f in os.scandir(self.valid_data) if f.is_dir()] + + # make sure that the subfolders in train and valid data are the same + train_subfolders = set(os.path.basename(f.path) for f in os.scandir(self.train_data) if f.is_dir()) + valid_subfolders = set(os.path.basename(f.path) for f in os.scandir(self.valid_data) if f.is_dir()) + if train_subfolders != valid_subfolders: + raise ValueError(f"{self.valid_data} should have the same subfolders as {self.train_data}.") + + if len(subfolders) < 2: + raise ValueError(f"{self.valid_data} should contain at least 2 subfolders.") + + # Check if each subfolder contains at least 5 image files in jpeg, png or jpg format only + for subfolder in subfolders: + image_files = [f for f in os.listdir(subfolder) if f.endswith(ALLOWED_EXTENSIONS)] + if len(image_files) < 5: + raise ValueError(f"{subfolder} should contain at least 5 jpeg, png or jpg files.") + + # Check if there are no other files except image files in the subfolder + if len(image_files) != len(os.listdir(subfolder)): + raise ValueError(f"{subfolder} should not contain any other files except image files.") + + # Check if there are no subfolders inside subfolders + subfolders_in_subfolder = [f.path for f in os.scandir(subfolder) if f.is_dir()] + if len(subfolders_in_subfolder) > 0: + raise ValueError(f"{subfolder} should not contain any subfolders.") + + def split(self, df): + train_df, valid_df = train_test_split( + df, + test_size=self.test_size, + random_state=self.seed, + stratify=df["subfolder"], + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare(self): + random_uuid = uuid.uuid4() + cache_dir = os.environ.get("HF_HOME") + if not cache_dir: + cache_dir = 
os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
+        data_dir = os.path.join(cache_dir, "autotrain", str(random_uuid))
+
+        if self.valid_data:
+            shutil.copytree(self.train_data, os.path.join(data_dir, "train"))
+            shutil.copytree(self.valid_data, os.path.join(data_dir, "validation"))
+
+            dataset = load_dataset("imagefolder", data_dir=data_dir)
+            dataset = dataset.rename_columns({"image": "autotrain_image", "label": "autotrain_label"})
+            if self.local:
+                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
+            else:
+                dataset.push_to_hub(
+                    f"{self.username}/autotrain-data-{self.project_name}",
+                    private=True,
+                    token=self.token,
+                )
+
+        else:
+            subfolders = [f.path for f in os.scandir(self.train_data) if f.is_dir()]
+
+            image_filenames = []
+            subfolder_names = []
+
+            for subfolder in subfolders:
+                for filename in os.listdir(subfolder):
+                    # use the same extension whitelist as __post_init__ so that
+                    # files with uppercase extensions are not silently dropped here
+                    if filename.endswith(ALLOWED_EXTENSIONS):
+                        image_filenames.append(filename)
+                        subfolder_names.append(os.path.basename(subfolder))
+
+            df = pd.DataFrame({"image_filename": image_filenames, "subfolder": subfolder_names})
+            train_df, valid_df = self.split(df)
+
+            for row in train_df.itertuples():
+                os.makedirs(os.path.join(data_dir, "train", row.subfolder), exist_ok=True)
+                shutil.copy(
+                    os.path.join(self.train_data, row.subfolder, row.image_filename),
+                    os.path.join(data_dir, "train", row.subfolder, row.image_filename),
+                )
+
+            for row in valid_df.itertuples():
+                os.makedirs(os.path.join(data_dir, "validation", row.subfolder), exist_ok=True)
+                shutil.copy(
+                    os.path.join(self.train_data, row.subfolder, row.image_filename),
+                    os.path.join(data_dir, "validation", row.subfolder, row.image_filename),
+                )
+
+            dataset = load_dataset("imagefolder", data_dir=data_dir)
+            dataset = dataset.rename_columns({"image": "autotrain_image", "label": "autotrain_label"})
+            if self.local:
+                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
+            else:
+                dataset.push_to_hub(
+                    f"{self.username}/autotrain-data-{self.project_name}",
+                    private=True,
+                    token=self.token,
+                )
+
+        if self.local:
+            return f"{self.project_name}/autotrain-data"
+        return f"{self.username}/autotrain-data-{self.project_name}"
+
+
+@dataclass
+class ObjectDetectionPreprocessor:
+    """
+    A class to preprocess data for object detection tasks.
+
+    Attributes:
+    -----------
+    train_data : str
+        Path to the training data directory.
+    username : str
+        Username for the Hugging Face Hub.
+    project_name : str
+        Name of the project.
+    token : str
+        Authentication token for the Hugging Face Hub.
+    valid_data : Optional[str], default=None
+        Path to the validation data directory.
+    test_size : Optional[float], default=0.2
+        Proportion of the dataset to include in the validation split.
+    seed : Optional[int], default=42
+        Random seed for reproducibility.
+    local : Optional[bool], default=False
+        Whether to save the dataset locally or push to the Hugging Face Hub.
+
+    Methods:
+    --------
+    _process_metadata(data_path):
+        Processes the metadata.jsonl file and extracts required columns and categories.
+    __post_init__():
+        Validates the existence and content of the training and validation data directories.
+    split(df):
+        Splits the dataframe into training and validation sets.
+    prepare():
+        Prepares the dataset for training by processing metadata, splitting data, and saving or pushing the dataset.
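+
+    Example metadata.jsonl record (illustrative values only):
+        {"file_name": "0001.jpg", "objects": {"bbox": [[10.0, 20.0, 30.0, 40.0]], "category": ["helmet"]}}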
+ """ + + train_data: str + username: str + project_name: str + token: str + valid_data: Optional[str] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + local: Optional[bool] = False + + @staticmethod + def _process_metadata(data_path): + metadata = pd.read_json(os.path.join(data_path, "metadata.jsonl"), lines=True) + # make sure that the metadata.jsonl file contains the required columns: file_name, objects + if "file_name" not in metadata.columns or "objects" not in metadata.columns: + raise ValueError(f"{data_path}/metadata.jsonl should contain 'file_name' and 'objects' columns.") + + # keeo only file_name and objects columns + metadata = metadata[["file_name", "objects"]] + # inside metadata objects column, values should be bbox, area and category + # if area does not exist, it should be created by multiplying bbox width and height + categories = [] + for _, row in metadata.iterrows(): + obj = row["objects"] + if "bbox" not in obj or "category" not in obj: + raise ValueError(f"{data_path}/metadata.jsonl should contain 'bbox' and 'category' keys in 'objects'.") + # keep only bbox, area and category keys + obj = {k: obj[k] for k in ["bbox", "category"]} + categories.extend(obj["category"]) + + categories = set(categories) + + return metadata, categories + + def __post_init__(self): + # Check if train data path exists + if not os.path.exists(self.train_data): + raise ValueError(f"{self.train_data} does not exist.") + + # check if self.train_data contains at least 5 image files in jpeg, png or jpg format only + train_image_files = [f for f in os.listdir(self.train_data) if f.endswith(ALLOWED_EXTENSIONS)] + if len(train_image_files) < 5: + raise ValueError(f"{self.train_data} should contain at least 5 jpeg, png or jpg files.") + + # check if self.train_data contains a metadata.jsonl file + if "metadata.jsonl" not in os.listdir(self.train_data): + raise ValueError(f"{self.train_data} should contain a metadata.jsonl file.") + + # Check if valid data path exists + if self.valid_data: + if not os.path.exists(self.valid_data): + raise ValueError(f"{self.valid_data} does not exist.") + + # check if self.valid_data contains at least 5 image files in jpeg, png or jpg format only + valid_image_files = [f for f in os.listdir(self.valid_data) if f.endswith(ALLOWED_EXTENSIONS)] + if len(valid_image_files) < 5: + raise ValueError(f"{self.valid_data} should contain at least 5 jpeg, png or jpg files.") + + # check if self.valid_data contains a metadata.jsonl file + if "metadata.jsonl" not in os.listdir(self.valid_data): + raise ValueError(f"{self.valid_data} should contain a metadata.jsonl file.") + + def split(self, df): + train_df, valid_df = train_test_split( + df, + test_size=self.test_size, + random_state=self.seed, + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare(self): + random_uuid = uuid.uuid4() + cache_dir = os.environ.get("HF_HOME") + if not cache_dir: + cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface") + data_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + + if self.valid_data: + shutil.copytree(self.train_data, os.path.join(data_dir, "train")) + shutil.copytree(self.valid_data, os.path.join(data_dir, "validation")) + + train_metadata, train_categories = self._process_metadata(os.path.join(data_dir, "train")) + valid_metadata, valid_categories = self._process_metadata(os.path.join(data_dir, "validation")) + + 
+            train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
+            valid_metadata.to_json(
+                os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True
+            )
+
+            all_categories = train_categories.union(valid_categories)
+
+            features = Features(
+                {
+                    "image": Image(),
+                    "objects": Sequence(
+                        {
+                            "bbox": Sequence(Value("float32"), length=4),
+                            "category": ClassLabel(names=list(all_categories)),
+                        }
+                    ),
+                }
+            )
+
+            dataset = load_dataset("imagefolder", data_dir=data_dir, features=features)
+            dataset = dataset.rename_columns(
+                {
+                    "image": "autotrain_image",
+                    "objects": "autotrain_objects",
+                }
+            )
+
+            if self.local:
+                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
+            else:
+                dataset.push_to_hub(
+                    f"{self.username}/autotrain-data-{self.project_name}",
+                    private=True,
+                    token=self.token,
+                )
+        else:
+            metadata = pd.read_json(os.path.join(self.train_data, "metadata.jsonl"), lines=True)
+            train_df, valid_df = self.split(metadata)
+
+            # create train and validation folders
+            os.makedirs(os.path.join(data_dir, "train"), exist_ok=True)
+            os.makedirs(os.path.join(data_dir, "validation"), exist_ok=True)
+
+            # copy images into the train and validation folders
+            for row in train_df.iterrows():
+                shutil.copy(
+                    os.path.join(self.train_data, row[1]["file_name"]),
+                    os.path.join(data_dir, "train", row[1]["file_name"]),
+                )
+
+            for row in valid_df.iterrows():
+                shutil.copy(
+                    os.path.join(self.train_data, row[1]["file_name"]),
+                    os.path.join(data_dir, "validation", row[1]["file_name"]),
+                )
+
+            # save metadata.jsonl file to train and validation folders
+            train_df.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
+            valid_df.to_json(os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True)
+
+            train_metadata, train_categories = self._process_metadata(os.path.join(data_dir, "train"))
+            valid_metadata, valid_categories = self._process_metadata(os.path.join(data_dir, "validation"))
+
+            train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
+            valid_metadata.to_json(
+                os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True
+            )
+
+            all_categories = train_categories.union(valid_categories)
+
+            features = Features(
+                {
+                    "image": Image(),
+                    "objects": Sequence(
+                        {
+                            "bbox": Sequence(Value("float32"), length=4),
+                            "category": ClassLabel(names=list(all_categories)),
+                        }
+                    ),
+                }
+            )
+
+            dataset = load_dataset("imagefolder", data_dir=data_dir, features=features)
+            dataset = dataset.rename_columns(
+                {
+                    "image": "autotrain_image",
+                    "objects": "autotrain_objects",
+                }
+            )
+
+            if self.local:
+                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
+            else:
+                dataset.push_to_hub(
+                    f"{self.username}/autotrain-data-{self.project_name}",
+                    private=True,
+                    token=self.token,
+                )
+
+        if self.local:
+            return f"{self.project_name}/autotrain-data"
+        return f"{self.username}/autotrain-data-{self.project_name}"
+
+
+@dataclass
+class ImageRegressionPreprocessor:
+    """
+    A class to preprocess image data for regression tasks.
+
+    Expects a directory of image files plus a metadata.jsonl file with
+    'file_name' and 'target' columns; attributes and behaviour otherwise
+    mirror ImageClassificationPreprocessor.
+    """
+
+    train_data: str
+    username: str
+    project_name: str
+    token: str
+    valid_data: Optional[str] = None
+    test_size: Optional[float] = 0.2
+    seed: Optional[int] = 42
+    local: Optional[bool] = False
+
+    @staticmethod
+    def _process_metadata(data_path):
+        metadata = pd.read_json(os.path.join(data_path, "metadata.jsonl"), lines=True)
+        # make sure that the metadata.jsonl file contains the required columns: file_name, target
+        if "file_name" not in metadata.columns or "target" not in metadata.columns:
metadata.columns or "target" not in metadata.columns: + raise ValueError(f"{data_path}/metadata.jsonl should contain 'file_name' and 'target' columns.") + + # keep only file_name and target columns + metadata = metadata[["file_name", "target"]] + return metadata + + def __post_init__(self): + # Check if train data path exists + if not os.path.exists(self.train_data): + raise ValueError(f"{self.train_data} does not exist.") + + # check if self.train_data contains at least 5 image files in jpeg, png or jpg format only + train_image_files = [f for f in os.listdir(self.train_data) if f.endswith(ALLOWED_EXTENSIONS)] + if len(train_image_files) < 5: + raise ValueError(f"{self.train_data} should contain at least 5 jpeg, png or jpg files.") + + # check if self.train_data contains a metadata.jsonl file + if "metadata.jsonl" not in os.listdir(self.train_data): + raise ValueError(f"{self.train_data} should contain a metadata.jsonl file.") + + # Check if valid data path exists + if self.valid_data: + if not os.path.exists(self.valid_data): + raise ValueError(f"{self.valid_data} does not exist.") + + # check if self.valid_data contains at least 5 image files in jpeg, png or jpg format only + valid_image_files = [f for f in os.listdir(self.valid_data) if f.endswith(ALLOWED_EXTENSIONS)] + if len(valid_image_files) < 5: + raise ValueError(f"{self.valid_data} should contain at least 5 jpeg, png or jpg files.") + + # check if self.valid_data contains a metadata.jsonl file + if "metadata.jsonl" not in os.listdir(self.valid_data): + raise ValueError(f"{self.valid_data} should contain a metadata.jsonl file.") + + def split(self, df): + train_df, valid_df = train_test_split( + df, + test_size=self.test_size, + random_state=self.seed, + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare(self): + random_uuid = uuid.uuid4() + cache_dir = os.environ.get("HF_HOME") + if not cache_dir: + cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface") + data_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + + if self.valid_data: + shutil.copytree(self.train_data, os.path.join(data_dir, "train")) + shutil.copytree(self.valid_data, os.path.join(data_dir, "validation")) + + train_metadata = self._process_metadata(os.path.join(data_dir, "train")) + valid_metadata = self._process_metadata(os.path.join(data_dir, "validation")) + + train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True) + valid_metadata.to_json( + os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True + ) + + dataset = load_dataset("imagefolder", data_dir=data_dir) + dataset = dataset.rename_columns( + { + "image": "autotrain_image", + "target": "autotrain_label", + } + ) + + if self.local: + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + dataset.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + private=True, + token=self.token, + ) + else: + metadata = pd.read_json(os.path.join(self.train_data, "metadata.jsonl"), lines=True) + train_df, valid_df = self.split(metadata) + + # create train and validation folders + os.makedirs(os.path.join(data_dir, "train"), exist_ok=True) + os.makedirs(os.path.join(data_dir, "validation"), exist_ok=True) + + # move images to train and validation folders + for row in train_df.iterrows(): + shutil.copy( + os.path.join(self.train_data, row[1]["file_name"]), + os.path.join(data_dir, "train", 
row[1]["file_name"]), + ) + + for row in valid_df.iterrows(): + shutil.copy( + os.path.join(self.train_data, row[1]["file_name"]), + os.path.join(data_dir, "validation", row[1]["file_name"]), + ) + + # save metadata.jsonl file to train and validation folders + train_df.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True) + valid_df.to_json(os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True) + + train_metadata = self._process_metadata(os.path.join(data_dir, "train")) + valid_metadata = self._process_metadata(os.path.join(data_dir, "validation")) + + train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True) + valid_metadata.to_json( + os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True + ) + + dataset = load_dataset("imagefolder", data_dir=data_dir) + dataset = dataset.rename_columns( + { + "image": "autotrain_image", + "target": "autotrain_label", + } + ) + + if self.local: + dataset.save_to_disk(f"{self.project_name}/autotrain-data") + else: + dataset.push_to_hub( + f"{self.username}/autotrain-data-{self.project_name}", + private=True, + token=self.token, + ) + + if self.local: + return f"{self.project_name}/autotrain-data" + return f"{self.username}/autotrain-data-{self.project_name}" diff --git a/src/autotrain/preprocessor/vlm.py b/src/autotrain/preprocessor/vlm.py new file mode 100644 index 0000000000000000000000000000000000000000..1f5edf1fd904837ea45e46cd617df36db04740f3 --- /dev/null +++ b/src/autotrain/preprocessor/vlm.py @@ -0,0 +1,224 @@ +import os +import shutil +import uuid +from dataclasses import dataclass +from typing import Optional + +import pandas as pd +from datasets import Features, Image, Value, load_dataset +from sklearn.model_selection import train_test_split + + +ALLOWED_EXTENSIONS = ("jpeg", "png", "jpg", "JPG", "JPEG", "PNG") + + +@dataclass +class VLMPreprocessor: + """ + VLMPreprocessor is a class for preprocessing visual language model (VLM) datasets. It handles tasks such as + validating data paths, ensuring the presence of required files, splitting datasets, and preparing data for + training and validation. + + Attributes: + train_data (str): Path to the training data directory. + username (str): Username for the Hugging Face Hub. + project_name (str): Name of the project. + token (str): Authentication token for the Hugging Face Hub. + column_mapping (dict): Mapping of column names. + valid_data (Optional[str]): Path to the validation data directory. Default is None. + test_size (Optional[float]): Proportion of the dataset to include in the validation split. Default is 0.2. + seed (Optional[int]): Random seed for dataset splitting. Default is 42. + local (Optional[bool]): Flag indicating whether to save data locally or push to the Hugging Face Hub. Default is False. + + Methods: + _process_metadata(data_path): + Processes the metadata.jsonl file in the given data path and ensures it contains the required columns. + + __post_init__(): + Validates the existence of training and validation data paths, checks for required files, and ensures + the presence of a minimum number of image files. + + split(df): + Splits the given DataFrame into training and validation sets based on the specified test size and seed. + + prepare(): + Prepares the dataset for training and validation by copying data to a cache directory, processing metadata, + and either saving the dataset locally or pushing it to the Hugging Face Hub. 
+ """ + + train_data: str + username: str + project_name: str + token: str + column_mapping: dict + valid_data: Optional[str] = None + test_size: Optional[float] = 0.2 + seed: Optional[int] = 42 + local: Optional[bool] = False + + def _process_metadata(self, data_path): + metadata = pd.read_json(os.path.join(data_path, "metadata.jsonl"), lines=True) + # make sure that the metadata.jsonl file contains the required columns: file_name, objects + if "file_name" not in metadata.columns: + raise ValueError(f"{data_path}/metadata.jsonl should contain 'file_name' column.") + + col_names = list(self.column_mapping.values()) + + for col in col_names: + if col not in metadata.columns: + raise ValueError(f"{data_path}/metadata.jsonl should contain '{col}' column.") + + return metadata + + def __post_init__(self): + # Check if train data path exists + if not os.path.exists(self.train_data): + raise ValueError(f"{self.train_data} does not exist.") + + # check if self.train_data contains at least 5 image files in jpeg, png or jpg format only + train_image_files = [f for f in os.listdir(self.train_data) if f.endswith(ALLOWED_EXTENSIONS)] + if len(train_image_files) < 5: + raise ValueError(f"{self.train_data} should contain at least 5 jpeg, png or jpg files.") + + # check if self.train_data contains a metadata.jsonl file + if "metadata.jsonl" not in os.listdir(self.train_data): + raise ValueError(f"{self.train_data} should contain a metadata.jsonl file.") + + # Check if valid data path exists + if self.valid_data: + if not os.path.exists(self.valid_data): + raise ValueError(f"{self.valid_data} does not exist.") + + # check if self.valid_data contains at least 5 image files in jpeg, png or jpg format only + valid_image_files = [f for f in os.listdir(self.valid_data) if f.endswith(ALLOWED_EXTENSIONS)] + if len(valid_image_files) < 5: + raise ValueError(f"{self.valid_data} should contain at least 5 jpeg, png or jpg files.") + + # check if self.valid_data contains a metadata.jsonl file + if "metadata.jsonl" not in os.listdir(self.valid_data): + raise ValueError(f"{self.valid_data} should contain a metadata.jsonl file.") + + def split(self, df): + train_df, valid_df = train_test_split( + df, + test_size=self.test_size, + random_state=self.seed, + ) + train_df = train_df.reset_index(drop=True) + valid_df = valid_df.reset_index(drop=True) + return train_df, valid_df + + def prepare(self): + random_uuid = uuid.uuid4() + cache_dir = os.environ.get("HF_HOME") + if not cache_dir: + cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface") + data_dir = os.path.join(cache_dir, "autotrain", str(random_uuid)) + + if self.valid_data: + shutil.copytree(self.train_data, os.path.join(data_dir, "train")) + shutil.copytree(self.valid_data, os.path.join(data_dir, "validation")) + + train_metadata = self._process_metadata(os.path.join(data_dir, "train")) + valid_metadata = self._process_metadata(os.path.join(data_dir, "validation")) + + train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True) + valid_metadata.to_json( + os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True + ) + + features = Features( + { + "image": Image(), + } + ) + for _, col_map in self.column_mapping.items(): + features[col_map] = Value(dtype="string") + + dataset = load_dataset("imagefolder", data_dir=data_dir, features=features) + + rename_dict = { + "image": "autotrain_image", + } + for col, col_map in self.column_mapping.items(): + if col == "text_column": + 
+                    rename_dict[col_map] = "autotrain_text"
+                elif col == "prompt_text_column":
+                    rename_dict[col_map] = "autotrain_prompt"
+
+            dataset = dataset.rename_columns(rename_dict)
+
+            if self.local:
+                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
+            else:
+                dataset.push_to_hub(
+                    f"{self.username}/autotrain-data-{self.project_name}",
+                    private=True,
+                    token=self.token,
+                )
+        else:
+            metadata = pd.read_json(os.path.join(self.train_data, "metadata.jsonl"), lines=True)
+            train_df, valid_df = self.split(metadata)
+
+            # create train and validation folders
+            os.makedirs(os.path.join(data_dir, "train"), exist_ok=True)
+            os.makedirs(os.path.join(data_dir, "validation"), exist_ok=True)
+
+            # copy images into the train and validation folders
+            for row in train_df.iterrows():
+                shutil.copy(
+                    os.path.join(self.train_data, row[1]["file_name"]),
+                    os.path.join(data_dir, "train", row[1]["file_name"]),
+                )
+
+            for row in valid_df.iterrows():
+                shutil.copy(
+                    os.path.join(self.train_data, row[1]["file_name"]),
+                    os.path.join(data_dir, "validation", row[1]["file_name"]),
+                )
+
+            # save metadata.jsonl file to train and validation folders
+            train_df.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
+            valid_df.to_json(os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True)
+
+            train_metadata = self._process_metadata(os.path.join(data_dir, "train"))
+            valid_metadata = self._process_metadata(os.path.join(data_dir, "validation"))
+
+            train_metadata.to_json(os.path.join(data_dir, "train", "metadata.jsonl"), orient="records", lines=True)
+            valid_metadata.to_json(
+                os.path.join(data_dir, "validation", "metadata.jsonl"), orient="records", lines=True
+            )
+
+            features = Features(
+                {
+                    "image": Image(),
+                }
+            )
+            for _, col_map in self.column_mapping.items():
+                features[col_map] = Value(dtype="string")
+
+            dataset = load_dataset("imagefolder", data_dir=data_dir, features=features)
+
+            rename_dict = {
+                "image": "autotrain_image",
+            }
+            for col, col_map in self.column_mapping.items():
+                if col == "text_column":
+                    rename_dict[col_map] = "autotrain_text"
+                elif col == "prompt_text_column":
+                    rename_dict[col_map] = "autotrain_prompt"
+
+            dataset = dataset.rename_columns(rename_dict)
+
+            if self.local:
+                dataset.save_to_disk(f"{self.project_name}/autotrain-data")
+            else:
+                dataset.push_to_hub(
+                    f"{self.username}/autotrain-data-{self.project_name}",
+                    private=True,
+                    token=self.token,
+                )
+
+        if self.local:
+            return f"{self.project_name}/autotrain-data"
+        return f"{self.username}/autotrain-data-{self.project_name}" diff --git a/src/autotrain/project.py b/src/autotrain/project.py
new file mode 100644
index 0000000000000000000000000000000000000000..86d5933f02ef8605db868a19e56e762bbdf6d414
--- /dev/null
+++ b/src/autotrain/project.py
@@ -0,0 +1,563 @@
+"""
+Copyright 2023 The HuggingFace Team
+"""
+
+import os
+from dataclasses import dataclass
+from typing import Union
+
+from autotrain.backends.base import AVAILABLE_HARDWARE
+from autotrain.backends.endpoints import EndpointsRunner
+from autotrain.backends.local import LocalRunner
+from autotrain.backends.ngc import NGCRunner
+from autotrain.backends.nvcf import NVCFRunner
+from autotrain.backends.spaces import SpaceRunner
+from autotrain.dataset import (
+    AutoTrainDataset,
+    AutoTrainImageClassificationDataset,
+    AutoTrainImageRegressionDataset,
+    AutoTrainObjectDetectionDataset,
+    AutoTrainVLMDataset,
+)
+from autotrain.trainers.clm.params import LLMTrainingParams
+from
autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams +from autotrain.trainers.image_classification.params import ImageClassificationParams +from autotrain.trainers.image_regression.params import ImageRegressionParams +from autotrain.trainers.object_detection.params import ObjectDetectionParams +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams +from autotrain.trainers.seq2seq.params import Seq2SeqParams +from autotrain.trainers.tabular.params import TabularParams +from autotrain.trainers.text_classification.params import TextClassificationParams +from autotrain.trainers.text_regression.params import TextRegressionParams +from autotrain.trainers.token_classification.params import TokenClassificationParams +from autotrain.trainers.vlm.params import VLMTrainingParams + + +def tabular_munge_data(params, local): + if isinstance(params.target_columns, str): + col_map_label = [params.target_columns] + else: + col_map_label = params.target_columns + task = params.task + if task == "classification" and len(col_map_label) > 1: + task = "tabular_multi_label_classification" + elif task == "classification" and len(col_map_label) == 1: + task = "tabular_multi_class_classification" + elif task == "regression" and len(col_map_label) > 1: + task = "tabular_multi_column_regression" + elif task == "regression" and len(col_map_label) == 1: + task = "tabular_single_column_regression" + else: + raise Exception("Please select a valid task.") + + exts = ["csv", "jsonl"] + ext_to_use = None + for ext in exts: + path = f"{params.data_path}/{params.train_split}.{ext}" + if os.path.exists(path): + ext_to_use = ext + break + + train_data_path = f"{params.data_path}/{params.train_split}.{ext_to_use}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}.{ext_to_use}" + else: + valid_data_path = None + if os.path.exists(train_data_path): + dset = AutoTrainDataset( + train_data=[train_data_path], + task=task, + token=params.token, + project_name=params.project_name, + username=params.username, + column_mapping={"id": params.id_column, "label": col_map_label}, + valid_data=[valid_data_path] if valid_data_path is not None else None, + percent_valid=None, # TODO: add to UI + local=local, + ext=ext_to_use, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.id_column = "autotrain_id" + if len(col_map_label) == 1: + params.target_columns = ["autotrain_label"] + else: + params.target_columns = [f"autotrain_label_{i}" for i in range(len(col_map_label))] + return params + + +def llm_munge_data(params, local): + exts = ["csv", "jsonl"] + ext_to_use = None + for ext in exts: + path = f"{params.data_path}/{params.train_split}.{ext}" + if os.path.exists(path): + ext_to_use = ext + break + + train_data_path = f"{params.data_path}/{params.train_split}.{ext_to_use}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}.{ext_to_use}" + else: + valid_data_path = None + if os.path.exists(train_data_path): + col_map = {"text": params.text_column} + if params.rejected_text_column is not None: + col_map["rejected_text"] = params.rejected_text_column + if params.prompt_text_column is not None: + col_map["prompt"] = params.prompt_text_column + dset = AutoTrainDataset( + train_data=[train_data_path], + task="lm_training", + token=params.token, + project_name=params.project_name, + username=params.username, + column_mapping=col_map, + 
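+            # rejected_text / prompt mappings are present in col_map only when the
+            # params define them, e.g. for preference-style trainers such as dpo and orpo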
valid_data=[valid_data_path] if valid_data_path is not None else None, + percent_valid=None, # TODO: add to UI + local=local, + ext=ext_to_use, + ) + params.data_path = dset.prepare() + params.valid_split = None + params.text_column = "autotrain_text" + params.rejected_text_column = "autotrain_rejected_text" + params.prompt_text_column = "autotrain_prompt" + return params + + +def seq2seq_munge_data(params, local): + exts = ["csv", "jsonl"] + ext_to_use = None + for ext in exts: + path = f"{params.data_path}/{params.train_split}.{ext}" + if os.path.exists(path): + ext_to_use = ext + break + + train_data_path = f"{params.data_path}/{params.train_split}.{ext_to_use}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}.{ext_to_use}" + else: + valid_data_path = None + if os.path.exists(train_data_path): + dset = AutoTrainDataset( + train_data=[train_data_path], + task="seq2seq", + token=params.token, + project_name=params.project_name, + username=params.username, + column_mapping={"text": params.text_column, "label": params.target_column}, + valid_data=[valid_data_path] if valid_data_path is not None else None, + percent_valid=None, # TODO: add to UI + local=local, + ext=ext_to_use, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.text_column = "autotrain_text" + params.target_column = "autotrain_label" + return params + + +def text_clf_munge_data(params, local): + exts = ["csv", "jsonl"] + ext_to_use = None + for ext in exts: + path = f"{params.data_path}/{params.train_split}.{ext}" + if os.path.exists(path): + ext_to_use = ext + break + + train_data_path = f"{params.data_path}/{params.train_split}.{ext_to_use}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}.{ext_to_use}" + else: + valid_data_path = None + if os.path.exists(train_data_path): + dset = AutoTrainDataset( + train_data=[train_data_path], + valid_data=[valid_data_path] if valid_data_path is not None else None, + task="text_multi_class_classification", + token=params.token, + project_name=params.project_name, + username=params.username, + column_mapping={"text": params.text_column, "label": params.target_column}, + percent_valid=None, # TODO: add to UI + local=local, + convert_to_class_label=True, + ext=ext_to_use, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.text_column = "autotrain_text" + params.target_column = "autotrain_label" + return params + + +def text_reg_munge_data(params, local): + exts = ["csv", "jsonl"] + ext_to_use = None + for ext in exts: + path = f"{params.data_path}/{params.train_split}.{ext}" + if os.path.exists(path): + ext_to_use = ext + break + + train_data_path = f"{params.data_path}/{params.train_split}.{ext_to_use}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}.{ext_to_use}" + else: + valid_data_path = None + if os.path.exists(train_data_path): + dset = AutoTrainDataset( + train_data=[train_data_path], + valid_data=[valid_data_path] if valid_data_path is not None else None, + task="text_single_column_regression", + token=params.token, + project_name=params.project_name, + username=params.username, + column_mapping={"text": params.text_column, "label": params.target_column}, + percent_valid=None, # TODO: add to UI + local=local, + convert_to_class_label=False, + ext=ext_to_use, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.text_column = 
"autotrain_text" + params.target_column = "autotrain_label" + return params + + +def token_clf_munge_data(params, local): + exts = ["csv", "jsonl"] + ext_to_use = None + for ext in exts: + path = f"{params.data_path}/{params.train_split}.{ext}" + if os.path.exists(path): + ext_to_use = ext + break + + train_data_path = f"{params.data_path}/{params.train_split}.{ext_to_use}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}.{ext_to_use}" + else: + valid_data_path = None + if os.path.exists(train_data_path): + dset = AutoTrainDataset( + train_data=[train_data_path], + valid_data=[valid_data_path] if valid_data_path is not None else None, + task="text_token_classification", + token=params.token, + project_name=params.project_name, + username=params.username, + column_mapping={"text": params.tokens_column, "label": params.tags_column}, + percent_valid=None, # TODO: add to UI + local=local, + convert_to_class_label=True, + ext=ext_to_use, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.tokens_column = "autotrain_text" + params.tags_column = "autotrain_label" + return params + + +def img_clf_munge_data(params, local): + train_data_path = f"{params.data_path}/{params.train_split}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}" + else: + valid_data_path = None + if os.path.isdir(train_data_path): + dset = AutoTrainImageClassificationDataset( + train_data=train_data_path, + valid_data=valid_data_path, + token=params.token, + project_name=params.project_name, + username=params.username, + local=local, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.image_column = "autotrain_image" + params.target_column = "autotrain_label" + return params + + +def img_obj_detect_munge_data(params, local): + train_data_path = f"{params.data_path}/{params.train_split}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}" + else: + valid_data_path = None + if os.path.isdir(train_data_path): + dset = AutoTrainObjectDetectionDataset( + train_data=train_data_path, + valid_data=valid_data_path, + token=params.token, + project_name=params.project_name, + username=params.username, + local=local, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.image_column = "autotrain_image" + params.objects_column = "autotrain_objects" + return params + + +def sent_transformers_munge_data(params, local): + exts = ["csv", "jsonl"] + ext_to_use = None + for ext in exts: + path = f"{params.data_path}/{params.train_split}.{ext}" + if os.path.exists(path): + ext_to_use = ext + break + + train_data_path = f"{params.data_path}/{params.train_split}.{ext_to_use}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}.{ext_to_use}" + else: + valid_data_path = None + if os.path.exists(train_data_path): + dset = AutoTrainDataset( + train_data=[train_data_path], + valid_data=[valid_data_path] if valid_data_path is not None else None, + task="sentence_transformers", + token=params.token, + project_name=params.project_name, + username=params.username, + column_mapping={ + "sentence1": params.sentence1_column, + "sentence2": params.sentence2_column, + "sentence3": params.sentence3_column, + "target": params.target_column, + }, + percent_valid=None, # TODO: add to UI + local=local, + convert_to_class_label=True if params.trainer == "pair_class" else False, + 
ext=ext_to_use, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.sentence1_column = "autotrain_sentence1" + params.sentence2_column = "autotrain_sentence2" + params.sentence3_column = "autotrain_sentence3" + params.target_column = "autotrain_target" + return params + + +def img_reg_munge_data(params, local): + train_data_path = f"{params.data_path}/{params.train_split}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}" + else: + valid_data_path = None + if os.path.isdir(train_data_path): + dset = AutoTrainImageRegressionDataset( + train_data=train_data_path, + valid_data=valid_data_path, + token=params.token, + project_name=params.project_name, + username=params.username, + local=local, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.image_column = "autotrain_image" + params.target_column = "autotrain_label" + return params + + +def vlm_munge_data(params, local): + train_data_path = f"{params.data_path}/{params.train_split}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}" + else: + valid_data_path = None + if os.path.exists(train_data_path): + col_map = {"text": params.text_column} + if params.prompt_text_column is not None: + col_map["prompt"] = params.prompt_text_column + dset = AutoTrainVLMDataset( + train_data=train_data_path, + token=params.token, + project_name=params.project_name, + username=params.username, + column_mapping=col_map, + valid_data=valid_data_path if valid_data_path is not None else None, + percent_valid=None, # TODO: add to UI + local=local, + ) + params.data_path = dset.prepare() + params.text_column = "autotrain_text" + params.image_column = "autotrain_image" + params.prompt_text_column = "autotrain_prompt" + return params + + +def ext_qa_munge_data(params, local): + exts = ["csv", "jsonl"] + ext_to_use = None + for ext in exts: + path = f"{params.data_path}/{params.train_split}.{ext}" + if os.path.exists(path): + ext_to_use = ext + break + + train_data_path = f"{params.data_path}/{params.train_split}.{ext_to_use}" + if params.valid_split is not None: + valid_data_path = f"{params.data_path}/{params.valid_split}.{ext_to_use}" + else: + valid_data_path = None + if os.path.exists(train_data_path): + dset = AutoTrainDataset( + train_data=[train_data_path], + valid_data=[valid_data_path] if valid_data_path is not None else None, + task="text_extractive_question_answering", + token=params.token, + project_name=params.project_name, + username=params.username, + column_mapping={ + "text": params.text_column, + "question": params.question_column, + "answer": params.answer_column, + }, + percent_valid=None, # TODO: add to UI + local=local, + convert_to_class_label=True, + ext=ext_to_use, + ) + params.data_path = dset.prepare() + params.valid_split = "validation" + params.text_column = "autotrain_text" + params.question_column = "autotrain_question" + params.answer_column = "autotrain_answer" + return params + + +@dataclass +class AutoTrainProject: + """ + A class to train an AutoTrain project + + Attributes + ---------- + params : Union[ + LLMTrainingParams, + TextClassificationParams, + TabularParams, + Seq2SeqParams, + ImageClassificationParams, + TextRegressionParams, + ObjectDetectionParams, + TokenClassificationParams, + SentenceTransformersParams, + ImageRegressionParams, + ExtractiveQuestionAnsweringParams, + VLMTrainingParams, + ] + The parameters for the AutoTrain project. 
+ backend : str + The backend to be used for the AutoTrain project. It should be one of the following: + - local + - spaces-a10g-large + - spaces-a10g-small + - spaces-a100-large + - spaces-t4-medium + - spaces-t4-small + - spaces-cpu-upgrade + - spaces-cpu-basic + - spaces-l4x1 + - spaces-l4x4 + - spaces-l40sx1 + - spaces-l40sx4 + - spaces-l40sx8 + - spaces-a10g-largex2 + - spaces-a10g-largex4 + process : bool + Flag to indicate if the params and dataset should be processed. If your data format is not AutoTrain-readable, set it to True. Set it to True when in doubt. Defaults to False. + + Methods + ------- + __post_init__(): + Validates the backend attribute. + create(): + Creates a runner based on the backend and initializes the AutoTrain project. + """ + + params: Union[ + LLMTrainingParams, + TextClassificationParams, + TabularParams, + Seq2SeqParams, + ImageClassificationParams, + TextRegressionParams, + ObjectDetectionParams, + TokenClassificationParams, + SentenceTransformersParams, + ImageRegressionParams, + ExtractiveQuestionAnsweringParams, + VLMTrainingParams, + ] + backend: str + process: bool = False + + def __post_init__(self): + self.local = self.backend.startswith("local") + if self.backend not in AVAILABLE_HARDWARE: + raise ValueError(f"Invalid backend: {self.backend}") + + def _process_params_data(self): + if isinstance(self.params, LLMTrainingParams): + return llm_munge_data(self.params, self.local) + elif isinstance(self.params, ExtractiveQuestionAnsweringParams): + return ext_qa_munge_data(self.params, self.local) + elif isinstance(self.params, ImageClassificationParams): + return img_clf_munge_data(self.params, self.local) + elif isinstance(self.params, ImageRegressionParams): + return img_reg_munge_data(self.params, self.local) + elif isinstance(self.params, ObjectDetectionParams): + return img_obj_detect_munge_data(self.params, self.local) + elif isinstance(self.params, SentenceTransformersParams): + return sent_transformers_munge_data(self.params, self.local) + elif isinstance(self.params, Seq2SeqParams): + return seq2seq_munge_data(self.params, self.local) + elif isinstance(self.params, TabularParams): + return tabular_munge_data(self.params, self.local) + elif isinstance(self.params, TextClassificationParams): + return text_clf_munge_data(self.params, self.local) + elif isinstance(self.params, TextRegressionParams): + return text_reg_munge_data(self.params, self.local) + elif isinstance(self.params, TokenClassificationParams): + return token_clf_munge_data(self.params, self.local) + elif isinstance(self.params, VLMTrainingParams): + return vlm_munge_data(self.params, self.local) + else: + raise Exception("Invalid params class") + + def create(self): + if self.process: + self.params = self._process_params_data() + + if self.backend.startswith("local"): + runner = LocalRunner(params=self.params, backend=self.backend) + return runner.create() + elif self.backend.startswith("spaces-"): + runner = SpaceRunner(params=self.params, backend=self.backend) + return runner.create() + elif self.backend.startswith("ep-"): + runner = EndpointsRunner(params=self.params, backend=self.backend) + return runner.create() + elif self.backend.startswith("ngc-"): + runner = NGCRunner(params=self.params, backend=self.backend) + return runner.create() + elif self.backend.startswith("nvcf-"): + runner = NVCFRunner(params=self.params, backend=self.backend) + return runner.create() + else: + raise NotImplementedError diff --git a/src/autotrain/tasks.py b/src/autotrain/tasks.py new file mode 
100644 index 0000000000000000000000000000000000000000..05c1fed9425237fc4cc7e9503a977e75b8c4a4cf --- /dev/null +++ b/src/autotrain/tasks.py @@ -0,0 +1,36 @@ +NLP_TASKS = { + "text_binary_classification": 1, + "text_multi_class_classification": 2, + "text_token_classification": 4, + "text_extractive_question_answering": 5, + "text_summarization": 8, + "text_single_column_regression": 10, + "speech_recognition": 11, + "natural_language_inference": 22, + "lm_training": 9, + "seq2seq": 28, # 27 is reserved for generic training + "sentence_transformers": 30, + "vlm": 31, +} + +VISION_TASKS = { + "image_binary_classification": 17, + "image_multi_class_classification": 18, + "image_single_column_regression": 24, + "image_object_detection": 29, +} + +TABULAR_TASKS = { + "tabular_binary_classification": 13, + "tabular_multi_class_classification": 14, + "tabular_multi_label_classification": 15, + "tabular_single_column_regression": 16, + "tabular": 26, +} + + +TASKS = { + **NLP_TASKS, + **VISION_TASKS, + **TABULAR_TASKS, +} diff --git a/src/autotrain/tests/test_cli.py b/src/autotrain/tests/test_cli.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/tests/test_dummy.py b/src/autotrain/tests/test_dummy.py new file mode 100644 index 0000000000000000000000000000000000000000..97144ebc823f20f6bd2a4910f531b994ab39d988 --- /dev/null +++ b/src/autotrain/tests/test_dummy.py @@ -0,0 +1,2 @@ +def test_dummy(): + assert 1 + 1 == 2 diff --git a/src/autotrain/tools/__init__.py b/src/autotrain/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/tools/convert_to_kohya.py b/src/autotrain/tools/convert_to_kohya.py new file mode 100644 index 0000000000000000000000000000000000000000..970aa1247a2b225c1ce0c20e940bde4fac94bcf9 --- /dev/null +++ b/src/autotrain/tools/convert_to_kohya.py @@ -0,0 +1,23 @@ +from diffusers.utils import convert_all_state_dict_to_peft, convert_state_dict_to_kohya +from safetensors.torch import load_file, save_file + +from autotrain import logger + + +def convert_to_kohya(input_path, output_path): + """ + Converts a Lora state dictionary to a Kohya state dictionary and saves it to the specified output path. + + Args: + input_path (str): The file path to the input Lora state dictionary. + output_path (str): The file path where the converted Kohya state dictionary will be saved. 
+ + Returns: + None + """ + logger.info(f"Converting Lora state dict from {input_path} to Kohya state dict at {output_path}") + lora_state_dict = load_file(input_path) + peft_state_dict = convert_all_state_dict_to_peft(lora_state_dict) + kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict) + save_file(kohya_state_dict, output_path) + logger.info(f"Kohya state dict saved at {output_path}") diff --git a/src/autotrain/tools/merge_adapter.py b/src/autotrain/tools/merge_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..df4d1a772a2887bda3eef37d042948e07a186c78 --- /dev/null +++ b/src/autotrain/tools/merge_adapter.py @@ -0,0 +1,68 @@ +import torch +from peft import PeftModel +from transformers import AutoModelForCausalLM, AutoTokenizer + +from autotrain import logger +from autotrain.trainers.common import ALLOW_REMOTE_CODE + + +def merge_llm_adapter( + base_model_path, adapter_path, token, output_folder=None, pad_to_multiple_of=None, push_to_hub=False +): + """ + Merges a language model adapter into a base model and optionally saves or pushes the merged model. + + Args: + base_model_path (str): Path to the base model. + adapter_path (str): Path to the adapter model. + token (str): Authentication token for accessing the models. + output_folder (str, optional): Directory to save the merged model. Defaults to None. + pad_to_multiple_of (int, optional): If specified, pad the token embeddings to a multiple of this value. Defaults to None. + push_to_hub (bool, optional): If True, push the merged model to the Hugging Face Hub. Defaults to False. + + Raises: + ValueError: If neither `output_folder` nor `push_to_hub` is specified. + + Returns: + None + """ + if output_folder is None and push_to_hub is False: + raise ValueError("You must specify either --output_folder or --push_to_hub") + + logger.info("Loading adapter...") + base_model = AutoModelForCausalLM.from_pretrained( + base_model_path, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + trust_remote_code=ALLOW_REMOTE_CODE, + token=token, + ) + + tokenizer = AutoTokenizer.from_pretrained( + adapter_path, + trust_remote_code=ALLOW_REMOTE_CODE, + token=token, + ) + if pad_to_multiple_of: + base_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=pad_to_multiple_of) + else: + base_model.resize_token_embeddings(len(tokenizer)) + + model = PeftModel.from_pretrained( + base_model, + adapter_path, + token=token, + ) + model = model.merge_and_unload() + + if output_folder is not None: + logger.info("Saving target model...") + model.save_pretrained(output_folder) + tokenizer.save_pretrained(output_folder) + logger.info(f"Model saved to {output_folder}") + + if push_to_hub: + logger.info("Pushing model to Hugging Face Hub...") + model.push_to_hub(adapter_path) + tokenizer.push_to_hub(adapter_path) + logger.info(f"Model pushed to Hugging Face Hub as {adapter_path}") diff --git a/src/autotrain/trainers/__init__.py b/src/autotrain/trainers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/clm/__init__.py b/src/autotrain/trainers/clm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/clm/__main__.py b/src/autotrain/trainers/clm/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..29d6bd773d3378026e07714633790aa43561e3f3 --- /dev/null +++ 
b/src/autotrain/trainers/clm/__main__.py @@ -0,0 +1,53 @@ +import argparse +import json + +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.common import monitor + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = LLMTrainingParams(**config) + + if config.trainer == "default": + from autotrain.trainers.clm.train_clm_default import train as train_default + + train_default(config) + + elif config.trainer == "sft": + from autotrain.trainers.clm.train_clm_sft import train as train_sft + + train_sft(config) + + elif config.trainer == "reward": + from autotrain.trainers.clm.train_clm_reward import train as train_reward + + train_reward(config) + + elif config.trainer == "dpo": + from autotrain.trainers.clm.train_clm_dpo import train as train_dpo + + train_dpo(config) + + elif config.trainer == "orpo": + from autotrain.trainers.clm.train_clm_orpo import train as train_orpo + + train_orpo(config) + + else: + raise ValueError(f"trainer `{config.trainer}` not supported") + + +if __name__ == "__main__": + _args = parse_args() + training_config = json.load(open(_args.training_config)) + _config = LLMTrainingParams(**training_config) + train(_config) diff --git a/src/autotrain/trainers/clm/callbacks.py b/src/autotrain/trainers/clm/callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..112a468982ec3a16168df9ef777568b44c08189a --- /dev/null +++ b/src/autotrain/trainers/clm/callbacks.py @@ -0,0 +1,61 @@ +import os + +import torch +from peft import set_peft_model_state_dict +from transformers import TrainerCallback, TrainerControl, TrainerState, TrainingArguments +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR + + +class SavePeftModelCallback(TrainerCallback): + def on_save( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + checkpoint_folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}") + + kwargs["model"].save_pretrained(checkpoint_folder) + + pytorch_model_path = os.path.join(checkpoint_folder, "pytorch_model.bin") + torch.save({}, pytorch_model_path) + return control + + +class LoadBestPeftModelCallback(TrainerCallback): + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + print(f"Loading best peft model from {state.best_model_checkpoint} (score: {state.best_metric}).") + best_model_path = os.path.join(state.best_model_checkpoint, "adapter_model.bin") + adapters_weights = torch.load(best_model_path) + model = kwargs["model"] + set_peft_model_state_dict(model, adapters_weights) + return control + + +class SaveDeepSpeedPeftModelCallback(TrainerCallback): + def __init__(self, trainer, save_steps=500): + self.trainer = trainer + self.save_steps = save_steps + + def on_step_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + if (state.global_step + 1) % self.save_steps == 0: + self.trainer.accelerator.wait_for_everyone() + state_dict = self.trainer.accelerator.get_state_dict(self.trainer.deepspeed) + unwrapped_model = self.trainer.accelerator.unwrap_model(self.trainer.deepspeed) + if self.trainer.accelerator.is_main_process: + unwrapped_model.save_pretrained(args.output_dir, state_dict=state_dict) 
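+            # barrier: every rank waits here until the main process has finished
+            # writing the checkpoint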
+ self.trainer.accelerator.wait_for_everyone() + return control diff --git a/src/autotrain/trainers/clm/params.py b/src/autotrain/trainers/clm/params.py new file mode 100644 index 0000000000000000000000000000000000000000..1a65b9007a93ddd86b5290d2cce75665717d6872 --- /dev/null +++ b/src/autotrain/trainers/clm/params.py @@ -0,0 +1,140 @@ +from typing import List, Optional, Union + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class LLMTrainingParams(AutoTrainParams): + """ + LLMTrainingParams: Parameters for training a language model using the autotrain library. + + Attributes: + model (str): Model name to be used for training. Default is "gpt2". + project_name (str): Name of the project and output directory. Default is "project-name". + + data_path (str): Path to the dataset. Default is "data". + train_split (str): Configuration for the training data split. Default is "train". + valid_split (Optional[str]): Configuration for the validation data split. Default is None. + add_eos_token (bool): Whether to add an EOS token at the end of sequences. Default is True. + block_size (Union[int, List[int]]): Size of the blocks for training, can be a single integer or a list of integers. Default is -1. + model_max_length (int): Maximum length of the model input. Default is 2048. + padding (Optional[str]): Side on which to pad sequences (left or right). Default is "right". + + trainer (str): Type of trainer to use. Default is "default". + use_flash_attention_2 (bool): Whether to use flash attention version 2. Default is False. + log (str): Logging method for experiment tracking. Default is "none". + disable_gradient_checkpointing (bool): Whether to disable gradient checkpointing. Default is False. + logging_steps (int): Number of steps between logging events. Default is -1. + eval_strategy (str): Strategy for evaluation (e.g., 'epoch'). Default is "epoch". + save_total_limit (int): Maximum number of checkpoints to keep. Default is 1. + auto_find_batch_size (bool): Whether to automatically find the optimal batch size. Default is False. + mixed_precision (Optional[str]): Type of mixed precision to use (e.g., 'fp16', 'bf16', or None). Default is None. + lr (float): Learning rate for training. Default is 3e-5. + epochs (int): Number of training epochs. Default is 1. + batch_size (int): Batch size for training. Default is 2. + warmup_ratio (float): Proportion of training to perform learning rate warmup. Default is 0.1. + gradient_accumulation (int): Number of steps to accumulate gradients before updating. Default is 4. + optimizer (str): Optimizer to use for training. Default is "adamw_torch". + scheduler (str): Learning rate scheduler to use. Default is "linear". + weight_decay (float): Weight decay to apply to the optimizer. Default is 0.0. + max_grad_norm (float): Maximum norm for gradient clipping. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + chat_template (Optional[str]): Template for chat-based models, options include: None, zephyr, chatml, or tokenizer. Default is None. + + quantization (Optional[str]): Quantization method to use (e.g., 'int4', 'int8', or None). Default is "int4". + target_modules (Optional[str]): Target modules for quantization or fine-tuning. Default is "all-linear". + merge_adapter (bool): Whether to merge the adapter layers. Default is False. + peft (bool): Whether to use Parameter-Efficient Fine-Tuning (PEFT). Default is False. + lora_r (int): Rank of the LoRA matrices. Default is 16. 
+ lora_alpha (int): Alpha parameter for LoRA. Default is 32. + lora_dropout (float): Dropout rate for LoRA. Default is 0.05. + + model_ref (Optional[str]): Reference model for DPO trainer. Default is None. + dpo_beta (float): Beta parameter for DPO trainer. Default is 0.1. + + max_prompt_length (int): Maximum length of the prompt. Default is 128. + max_completion_length (Optional[int]): Maximum length of the completion. Default is None. + + prompt_text_column (Optional[str]): Column name for the prompt text. Default is None. + text_column (str): Column name for the text data. Default is "text". + rejected_text_column (Optional[str]): Column name for the rejected text data. Default is None. + + push_to_hub (bool): Whether to push the model to the Hugging Face Hub. Default is False. + username (Optional[str]): Hugging Face username for authentication. Default is None. + token (Optional[str]): Hugging Face token for authentication. Default is None. + + unsloth (bool): Whether to use the unsloth library. Default is False. + distributed_backend (Optional[str]): Backend to use for distributed training. Default is None. + """ + + model: str = Field("gpt2", title="Model name to be used for training") + project_name: str = Field("project-name", title="Name of the project and output directory") + + # data params + data_path: str = Field("data", title="Path to the dataset") + train_split: str = Field("train", title="Configuration for the training data split") + valid_split: Optional[str] = Field(None, title="Configuration for the validation data split") + add_eos_token: bool = Field(True, title="Whether to add an EOS token at the end of sequences") + block_size: Union[int, List[int]] = Field( + -1, title="Size of the blocks for training, can be a single integer or a list of integers" + ) + model_max_length: int = Field(2048, title="Maximum length of the model input") + padding: Optional[str] = Field("right", title="Side on which to pad sequences (left or right)") + + # trainer params + trainer: str = Field("default", title="Type of trainer to use") + use_flash_attention_2: bool = Field(False, title="Whether to use flash attention version 2") + log: str = Field("none", title="Logging method for experiment tracking") + disable_gradient_checkpointing: bool = Field(False, title="Whether to disable gradient checkpointing") + logging_steps: int = Field(-1, title="Number of steps between logging events") + eval_strategy: str = Field("epoch", title="Strategy for evaluation (e.g., 'epoch')") + save_total_limit: int = Field(1, title="Maximum number of checkpoints to keep") + auto_find_batch_size: bool = Field(False, title="Whether to automatically find the optimal batch size") + mixed_precision: Optional[str] = Field( + None, title="Type of mixed precision to use (e.g., 'fp16', 'bf16', or None)" + ) + lr: float = Field(3e-5, title="Learning rate for training") + epochs: int = Field(1, title="Number of training epochs") + batch_size: int = Field(2, title="Batch size for training") + warmup_ratio: float = Field(0.1, title="Proportion of training to perform learning rate warmup") + gradient_accumulation: int = Field(4, title="Number of steps to accumulate gradients before updating") + optimizer: str = Field("adamw_torch", title="Optimizer to use for training") + scheduler: str = Field("linear", title="Learning rate scheduler to use") + weight_decay: float = Field(0.0, title="Weight decay to apply to the optimizer") + max_grad_norm: float = Field(1.0, title="Maximum norm for gradient clipping") + seed: int = 
Field(42, title="Random seed for reproducibility") + chat_template: Optional[str] = Field( + None, title="Template for chat-based models, options include: None, zephyr, chatml, or tokenizer" + ) + + # peft + quantization: Optional[str] = Field("int4", title="Quantization method to use (e.g., 'int4', 'int8', or None)") + target_modules: Optional[str] = Field("all-linear", title="Target modules for quantization or fine-tuning") + merge_adapter: bool = Field(False, title="Whether to merge the adapter layers") + peft: bool = Field(False, title="Whether to use Parameter-Efficient Fine-Tuning (PEFT)") + lora_r: int = Field(16, title="Rank of the LoRA matrices") + lora_alpha: int = Field(32, title="Alpha parameter for LoRA") + lora_dropout: float = Field(0.05, title="Dropout rate for LoRA") + + # dpo + model_ref: Optional[str] = Field(None, title="Reference model for DPO trainer") + dpo_beta: float = Field(0.1, title="Beta parameter for DPO trainer") + + # orpo + dpo + max_prompt_length: int = Field(128, title="Maximum length of the prompt") + max_completion_length: Optional[int] = Field(None, title="Maximum length of the completion") + + # column mappings + prompt_text_column: Optional[str] = Field(None, title="Column name for the prompt text") + text_column: str = Field("text", title="Column name for the text data") + rejected_text_column: Optional[str] = Field(None, title="Column name for the rejected text data") + + # push to hub + push_to_hub: bool = Field(False, title="Whether to push the model to the Hugging Face Hub") + username: Optional[str] = Field(None, title="Hugging Face username for authentication") + token: Optional[str] = Field(None, title="Hugging Face token for authentication") + + # unsloth + unsloth: bool = Field(False, title="Whether to use the unsloth library") + distributed_backend: Optional[str] = Field(None, title="Backend to use for distributed training") diff --git a/src/autotrain/trainers/clm/train_clm_default.py b/src/autotrain/trainers/clm/train_clm_default.py new file mode 100644 index 0000000000000000000000000000000000000000..b2e36b09b760ddbae07583bf24d03a4a0f83ded5 --- /dev/null +++ b/src/autotrain/trainers/clm/train_clm_default.py @@ -0,0 +1,114 @@ +from functools import partial + +import torch +from datasets import Dataset +from peft.tuners.lora import LoraLayer +from transformers import Trainer, TrainingArguments, default_data_collator +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.clm import utils +from autotrain.trainers.clm.params import LLMTrainingParams + + +def process_data(data, tokenizer, config): + data = data.to_pandas() + data = data.fillna("") + + data = data[[config.text_column]] + if config.add_eos_token: + data[config.text_column] = data[config.text_column] + tokenizer.eos_token + data = Dataset.from_pandas(data) + return data + + +def train(config): + logger.info("Starting default/generic CLM training...") + if isinstance(config, dict): + config = LLMTrainingParams(**config) + train_data, valid_data = utils.process_input_data(config) + tokenizer = utils.get_tokenizer(config) + train_data, valid_data = utils.process_data_with_chat_template(config, tokenizer, train_data, valid_data) + + train_data = process_data( + data=train_data, + tokenizer=tokenizer, + config=config, + ) + if config.valid_split is not None: + valid_data = process_data( + data=valid_data, + tokenizer=tokenizer, + config=config, + ) + + logging_steps = utils.configure_logging_steps(config, train_data, 
valid_data) + training_args = utils.configure_training_args(config, logging_steps) + config = utils.configure_block_size(config, tokenizer) + args = TrainingArguments(**training_args) + + model = utils.get_model(config, tokenizer) + + tokenize_fn = partial(utils.tokenize, tokenizer=tokenizer, config=config) + group_texts_fn = partial(utils.group_texts, config=config) + + train_data = train_data.map( + tokenize_fn, + batched=True, + num_proc=1, + remove_columns=list(train_data.features), + desc="Running tokenizer on train dataset", + ) + + if config.valid_split is not None: + valid_data = valid_data.map( + tokenize_fn, + batched=True, + num_proc=1, + remove_columns=list(valid_data.features), + desc="Running tokenizer on validation dataset", + ) + + train_data = train_data.map( + group_texts_fn, + batched=True, + num_proc=4, + desc=f"Grouping texts in chunks of {config.block_size}", + ) + + if config.valid_split is not None: + valid_data = valid_data.map( + group_texts_fn, + batched=True, + num_proc=4, + desc=f"Grouping texts in chunks of {config.block_size}", + ) + + logger.info("creating trainer") + callbacks = utils.get_callbacks(config) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks, + ) + trainer = Trainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data if config.valid_split is not None else None, + tokenizer=tokenizer, + data_collator=default_data_collator, + ) + for name, module in trainer.model.named_modules(): + if isinstance(module, LoraLayer): + if config.mixed_precision == "bf16": + module = module.to(torch.bfloat16) + if "norm" in name: + module = module.to(torch.float32) + if any(x in name for x in ["lm_head", "embed_tokens", "wte", "wpe"]): + if hasattr(module, "weight"): + if config.mixed_precision == "bf16" and module.weight.dtype == torch.float32: + module = module.to(torch.bfloat16) + + trainer.remove_callback(PrinterCallback) + trainer.train() + utils.post_training_steps(config, trainer) diff --git a/src/autotrain/trainers/clm/train_clm_dpo.py b/src/autotrain/trainers/clm/train_clm_dpo.py new file mode 100644 index 0000000000000000000000000000000000000000..959c3ed50edad25e0a52c50021b55f78601cae1c --- /dev/null +++ b/src/autotrain/trainers/clm/train_clm_dpo.py @@ -0,0 +1,118 @@ +import torch +from peft import LoraConfig +from transformers import AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig +from transformers.trainer_callback import PrinterCallback +from trl import DPOConfig, DPOTrainer + +from autotrain import logger +from autotrain.trainers.clm import utils +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.common import ALLOW_REMOTE_CODE + + +def train(config): + logger.info("Starting DPO training...") + if isinstance(config, dict): + config = LLMTrainingParams(**config) + train_data, valid_data = utils.process_input_data(config) + tokenizer = utils.get_tokenizer(config) + train_data, valid_data = utils.process_data_with_chat_template(config, tokenizer, train_data, valid_data) + + logging_steps = utils.configure_logging_steps(config, train_data, valid_data) + training_args = utils.configure_training_args(config, logging_steps) + config = utils.configure_block_size(config, tokenizer) + + training_args["max_length"] = config.block_size + training_args["max_prompt_length"] = config.max_prompt_length + training_args["max_target_length"] = config.max_completion_length + training_args["beta"] = config.dpo_beta + args = DPOConfig(**training_args) + + logger.info("loading model 
config...") + model_config = AutoConfig.from_pretrained( + config.model, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + use_cache=config.disable_gradient_checkpointing, + ) + + logger.info("loading model...") + if config.peft: + if config.quantization == "int4": + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=False, + ) + elif config.quantization == "int8": + bnb_config = BitsAndBytesConfig(load_in_8bit=True) + else: + bnb_config = None + + model = AutoModelForCausalLM.from_pretrained( + config.model, + config=model_config, + token=config.token, + quantization_config=bnb_config, + trust_remote_code=ALLOW_REMOTE_CODE, + use_flash_attention_2=config.use_flash_attention_2, + ) + logger.info("Using PEFT, model_ref will be set to None") + model_ref = None + else: + model = AutoModelForCausalLM.from_pretrained( + config.model, + config=model_config, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + use_flash_attention_2=config.use_flash_attention_2, + ) + if config.model_ref is not None: + model_ref = AutoModelForCausalLM.from_pretrained( + config.model_ref, + config=model_config, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + use_flash_attention_2=config.use_flash_attention_2, + ) + else: + model_ref = None + + logger.info(f"model dtype: {model.dtype}") + model.resize_token_embeddings(len(tokenizer)) + + if model_ref is not None: + logger.info(f"model_ref dtype: {model_ref.dtype}") + model_ref.resize_token_embeddings(len(tokenizer)) + + if config.peft: + peft_config = LoraConfig( + r=config.lora_r, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + bias="none", + task_type="CAUSAL_LM", + target_modules=utils.get_target_modules(config), + ) + + logger.info("creating trainer") + callbacks = utils.get_callbacks(config) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks, + ) + + trainer = DPOTrainer( + **trainer_args, + ref_model=model_ref, + train_dataset=train_data, + eval_dataset=valid_data if config.valid_split is not None else None, + processing_class=tokenizer, + peft_config=peft_config if config.peft else None, + ) + + trainer.remove_callback(PrinterCallback) + trainer.train() + utils.post_training_steps(config, trainer) diff --git a/src/autotrain/trainers/clm/train_clm_orpo.py b/src/autotrain/trainers/clm/train_clm_orpo.py new file mode 100644 index 0000000000000000000000000000000000000000..498816197c7e40358de3591725c21ed37973e8cb --- /dev/null +++ b/src/autotrain/trainers/clm/train_clm_orpo.py @@ -0,0 +1,57 @@ +from peft import LoraConfig +from transformers.trainer_callback import PrinterCallback +from trl import ORPOConfig, ORPOTrainer + +from autotrain import logger +from autotrain.trainers.clm import utils +from autotrain.trainers.clm.params import LLMTrainingParams + + +def train(config): + logger.info("Starting ORPO training...") + if isinstance(config, dict): + config = LLMTrainingParams(**config) + train_data, valid_data = utils.process_input_data(config) + tokenizer = utils.get_tokenizer(config) + train_data, valid_data = utils.process_data_with_chat_template(config, tokenizer, train_data, valid_data) + + logging_steps = utils.configure_logging_steps(config, train_data, valid_data) + training_args = utils.configure_training_args(config, logging_steps) + config = utils.configure_block_size(config, tokenizer) + + training_args["max_length"] = config.block_size + training_args["max_prompt_length"] 
= config.max_prompt_length + training_args["max_completion_length"] = config.max_completion_length + args = ORPOConfig(**training_args) + + model = utils.get_model(config, tokenizer) + + if config.peft: + peft_config = LoraConfig( + r=config.lora_r, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + bias="none", + task_type="CAUSAL_LM", + target_modules=utils.get_target_modules(config), + ) + + logger.info("creating trainer") + callbacks = utils.get_callbacks(config) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks, + ) + + trainer = ORPOTrainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data if config.valid_split is not None else None, + processing_class=tokenizer, + peft_config=peft_config if config.peft else None, + ) + + trainer.remove_callback(PrinterCallback) + trainer.train() + utils.post_training_steps(config, trainer) diff --git a/src/autotrain/trainers/clm/train_clm_reward.py b/src/autotrain/trainers/clm/train_clm_reward.py new file mode 100644 index 0000000000000000000000000000000000000000..9faadf205bcd50e8ca6da1a8faf06a6cbc6d7750 --- /dev/null +++ b/src/autotrain/trainers/clm/train_clm_reward.py @@ -0,0 +1,124 @@ +from functools import partial + +import torch +from peft import LoraConfig +from transformers import AutoConfig, AutoModelForSequenceClassification, BitsAndBytesConfig +from transformers.trainer_callback import PrinterCallback +from trl import RewardConfig, RewardTrainer + +from autotrain import logger +from autotrain.trainers.clm import utils +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.common import ALLOW_REMOTE_CODE + + +def train(config): + logger.info("Starting Reward training...") + if isinstance(config, dict): + config = LLMTrainingParams(**config) + train_data, valid_data = utils.process_input_data(config) + tokenizer = utils.get_tokenizer(config) + train_data, valid_data = utils.process_data_with_chat_template(config, tokenizer, train_data, valid_data) + + logging_steps = utils.configure_logging_steps(config, train_data, valid_data) + training_args = utils.configure_training_args(config, logging_steps) + config = utils.configure_block_size(config, tokenizer) + training_args["max_length"] = config.block_size + args = RewardConfig(**training_args) + + logger.info("loading model config...") + model_config = AutoConfig.from_pretrained( + config.model, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + use_cache=config.disable_gradient_checkpointing, + ) + + model_config.num_labels = 1 + model_config.pad_token_id = tokenizer.pad_token_id + model_config.pad_token = tokenizer.pad_token + + logger.info("loading model...") + if config.peft: + if config.quantization == "int4": + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=False, + ) + elif config.quantization == "int8": + bnb_config = BitsAndBytesConfig(load_in_8bit=True) + else: + bnb_config = None + + model = AutoModelForSequenceClassification.from_pretrained( + config.model, + config=model_config, + token=config.token, + quantization_config=bnb_config, + trust_remote_code=ALLOW_REMOTE_CODE, + use_flash_attention_2=config.use_flash_attention_2, + ) + else: + model = AutoModelForSequenceClassification.from_pretrained( + config.model, + config=model_config, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + use_flash_attention_2=config.use_flash_attention_2, + ) + + 
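+    # num_labels=1 above turns the base model into a single-logit sequence classifier,
+    # which is the shape RewardTrainer expects; the embedding resize below accounts for
+    # any special tokens added by the chat template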
logger.info(f"model dtype: {model.dtype}") + model.resize_token_embeddings(len(tokenizer)) + + if config.peft: + peft_config = LoraConfig( + r=config.lora_r, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + bias="none", + task_type="SEQ_CLS", + target_modules=utils.get_target_modules(config), + ) + + reward_proc = partial(utils.preprocess_reward, tokenizer=tokenizer) + train_data = train_data.map( + reward_proc, + batched=True, + num_proc=4, + desc="Running tokenizer on train dataset", + ) + train_data = train_data.filter( + lambda x: len(x["input_ids_chosen"]) <= config.block_size and len(x["input_ids_rejected"]) <= config.block_size + ) + if config.valid_split is not None: + valid_data = valid_data.map( + reward_proc, + batched=True, + num_proc=4, + desc="Running tokenizer on validation dataset", + ) + valid_data = valid_data.filter( + lambda x: len(x["input_ids_chosen"]) <= config.block_size + and len(x["input_ids_rejected"]) <= config.block_size + ) + + logger.info("creating trainer") + callbacks = utils.get_callbacks(config) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks, + ) + trainer = RewardTrainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data if config.valid_split is not None else None, + peft_config=peft_config if config.peft else None, + processing_class=tokenizer, + ) + + trainer.remove_callback(PrinterCallback) + trainer.train() + utils.post_training_steps(config, trainer) diff --git a/src/autotrain/trainers/clm/train_clm_sft.py b/src/autotrain/trainers/clm/train_clm_sft.py new file mode 100644 index 0000000000000000000000000000000000000000..0530534ffacc63a76323bcbc0d4f171ae51e750a --- /dev/null +++ b/src/autotrain/trainers/clm/train_clm_sft.py @@ -0,0 +1,56 @@ +from peft import LoraConfig +from transformers.trainer_callback import PrinterCallback +from trl import SFTConfig, SFTTrainer + +from autotrain import logger +from autotrain.trainers.clm import utils +from autotrain.trainers.clm.params import LLMTrainingParams + + +def train(config): + logger.info("Starting SFT training...") + if isinstance(config, dict): + config = LLMTrainingParams(**config) + train_data, valid_data = utils.process_input_data(config) + tokenizer = utils.get_tokenizer(config) + train_data, valid_data = utils.process_data_with_chat_template(config, tokenizer, train_data, valid_data) + + logging_steps = utils.configure_logging_steps(config, train_data, valid_data) + training_args = utils.configure_training_args(config, logging_steps) + config = utils.configure_block_size(config, tokenizer) + + training_args["dataset_text_field"] = config.text_column + training_args["max_seq_length"] = config.block_size + training_args["packing"] = True + args = SFTConfig(**training_args) + + model = utils.get_model(config, tokenizer) + + if config.peft: + peft_config = LoraConfig( + r=config.lora_r, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + bias="none", + task_type="CAUSAL_LM", + target_modules=utils.get_target_modules(config), + ) + + logger.info("creating trainer") + callbacks = utils.get_callbacks(config) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks, + ) + trainer = SFTTrainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data if config.valid_split is not None else None, + peft_config=peft_config if config.peft else None, + processing_class=tokenizer, + ) + + trainer.remove_callback(PrinterCallback) + trainer.train() + utils.post_training_steps(config, trainer) diff 
--git a/src/autotrain/trainers/clm/utils.py b/src/autotrain/trainers/clm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5d000889a83ef39395be497a02aa0be2f391eeb3 --- /dev/null +++ b/src/autotrain/trainers/clm/utils.py @@ -0,0 +1,993 @@
+import ast
+import gc
+import os
+from enum import Enum
+from itertools import chain
+
+import requests
+import torch
+from accelerate.state import PartialState
+from datasets import load_dataset, load_from_disk
+from huggingface_hub import HfApi
+from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
+from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+from autotrain import is_unsloth_available, logger
+from autotrain.trainers.clm.callbacks import LoadBestPeftModelCallback, SavePeftModelCallback
+from autotrain.trainers.common import (
+    ALLOW_REMOTE_CODE,
+    LossLoggingCallback,
+    TrainStartCallback,
+    UploadLogs,
+    pause_space,
+    remove_autotrain_data,
+    save_training_params,
+)
+
+
+DEFAULT_CHAT_TEMPLATE = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+CHATML_CHAT_TEMPLATE = "{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% if loop.last and add_generation_prompt %}{{'<|im_start|>assistant\n' }}{% endif %}{% endfor %}"
+ZEPHYR_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
+
+
+IGNORE_INDEX = -100
+DEFAULT_PAD_TOKEN = "[PAD]"
+DEFAULT_EOS_TOKEN = "</s>"
+DEFAULT_BOS_TOKEN = "<s>"
+DEFAULT_UNK_TOKEN = "<unk>"
+TARGET_MODULES = {
+    "Salesforce/codegen25-7b-multi": "q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj",
+}
+
+MODEL_CARD = """
+---
+tags:
+- autotrain
+- text-generation-inference
+- text-generation{peft}
+library_name: transformers{base_model}
+widget:
+  - messages:
+      - role: user
+        content: What is your favorite condiment?
+license: other{dataset_tag}
+---
+
+# Model Trained Using AutoTrain
+
+This model was trained using AutoTrain. For more information, please visit [AutoTrain](https://hf.co/docs/autotrain).
+
+# Usage
+
+```python
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_path = "PATH_TO_THIS_REPO"
+
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    device_map="auto",
+    torch_dtype='auto'
+).eval()
+
+# Prompt content: "hi"
+messages = [
+    {{"role": "user", "content": "hi"}}
+]
+
+input_ids = tokenizer.apply_chat_template(conversation=messages, tokenize=True, add_generation_prompt=True, return_tensors='pt')
+output_ids = model.generate(input_ids.to('cuda'))
+response = tokenizer.decode(output_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
+
+# Model response: "Hello! How can I assist you today?"
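+# the slice output_ids[0][input_ids.shape[1]:] above drops the prompt tokens, so only
+# the newly generated reply is decoded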
+print(response)
+```
+
+"""
+
+
+class ZephyrSpecialTokens(str, Enum):
+    USER = "<|user|>"
+    ASSISTANT = "<|assistant|>"
+    SYSTEM = "<|system|>"
+    EOS_TOKEN = "</s>"
+    BOS_TOKEN = "<s>"
+    PAD_TOKEN = "</s>"
+
+    @classmethod
+    def list(cls):
+        return [c.value for c in cls]
+
+
+class ChatmlSpecialTokens(str, Enum):
+    USER = "<|im_start|>user"
+    ASSISTANT = "<|im_start|>assistant"
+    SYSTEM = "<|im_start|>system"
+    EOS_TOKEN = "<|im_end|>"
+    BOS_TOKEN = "<s>"
+    PAD_TOKEN = "<pad>"
+
+    @classmethod
+    def list(cls):
+        return [c.value for c in cls]
+
+
+def preprocess_reward(examples, tokenizer):
+    """
+    Preprocesses the reward data by tokenizing the chosen and rejected examples.
+
+    Args:
+        examples (dict): A dictionary containing two keys, "chosen" and "rejected", each mapping to a list of text examples.
+        tokenizer (PreTrainedTokenizer): A tokenizer instance from the Hugging Face library used to tokenize the text examples.
+
+    Returns:
+        dict: A dictionary with the following keys:
+            - "input_ids_chosen": List of tokenized input IDs for the chosen examples.
+            - "attention_mask_chosen": List of attention masks for the chosen examples.
+            - "input_ids_rejected": List of tokenized input IDs for the rejected examples.
+            - "attention_mask_rejected": List of attention masks for the rejected examples.
+    """
+    new_examples = {
+        "input_ids_chosen": [],
+        "attention_mask_chosen": [],
+        "input_ids_rejected": [],
+        "attention_mask_rejected": [],
+    }
+    for chosen, rejected in zip(examples["chosen"], examples["rejected"]):
+        tokenized_chosen = tokenizer(chosen, truncation=True)
+        tokenized_rejected = tokenizer(rejected, truncation=True)
+
+        new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"])
+        new_examples["attention_mask_chosen"].append(tokenized_chosen["attention_mask"])
+        new_examples["input_ids_rejected"].append(tokenized_rejected["input_ids"])
+        new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])
+
+    return new_examples
+
+
+def get_target_modules(config):
+    """
+    Determines the target modules based on the provided configuration.
+
+    Args:
+        config (object): Configuration object that contains the following attributes:
+            - target_modules (str or None): Specifies the target modules. It can be:
+                - None: Returns the default target modules for the model specified in the config.
+                - An empty string: Returns the default target modules for the model specified in the config.
+                - "all-linear": Returns the string "all-linear".
+                - A comma-separated string: Returns a list of target modules split by commas.
+
+    Returns:
+        list or str: A list of target modules or a specific string ("all-linear") based on the configuration.
+    """
+    if config.target_modules is None:
+        return TARGET_MODULES.get(config.model)
+    if config.target_modules.strip() == "":
+        return TARGET_MODULES.get(config.model)
+    if config.target_modules.strip().lower() == "all-linear":
+        return "all-linear"
+    return config.target_modules.split(",")
+
+
+def group_texts(examples, config):
+    """
+    Groups texts into chunks of a specified block size.
+
+    Args:
+        examples (dict): A dictionary where keys are feature names and values are lists of lists containing text data.
+        config (object): A configuration object that contains the block_size attribute.
+
+    Returns:
+        dict: A dictionary with the same keys as the input examples, where each value is a list of chunks of text data.
+            Additionally, a "labels" key is added with the same value as the "input_ids" key.
+    """
+    # Concatenate all texts.
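+    # e.g. with block_size=4 and examples["input_ids"] == [[1, 2, 3], [4, 5, 6, 7, 8]]:
+    # the ids are chained into [1, 2, 3, 4, 5, 6, 7, 8], total_length stays 8, and the
+    # result is [[1, 2, 3, 4], [5, 6, 7, 8]], with "labels" copied from "input_ids"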
+ concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= config.block_size: + total_length = (total_length // config.block_size) * config.block_size + else: + total_length = 0 + # Split by chunks of max_len. + result = { + k: [t[i : i + config.block_size] for i in range(0, total_length, config.block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + +def tokenize(examples, tokenizer, config): + """ + Tokenizes the input examples using the provided tokenizer and configuration. + + Args: + examples (dict): A dictionary containing the input examples to be tokenized. + tokenizer (PreTrainedTokenizer): The tokenizer to be used for tokenizing the examples. + config (object): Configuration object that contains the text column name. + + Returns: + dict: A dictionary containing the tokenized output. + """ + output = tokenizer(examples[config.text_column]) + return output + + +def merge_adapter(base_model_path, target_model_path, adapter_path): + """ + Merges an adapter into a base model and saves the resulting model and tokenizer. + + Args: + base_model_path (str): Path to the base model directory. + target_model_path (str): Path to the directory where the merged model and tokenizer will be saved. + adapter_path (str): Path to the adapter model directory. + + Raises: + RuntimeError: If resizing token embeddings fails without padding to a multiple of 8. + """ + logger.info("Loading adapter...") + model = AutoModelForCausalLM.from_pretrained( + base_model_path, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + tokenizer = AutoTokenizer.from_pretrained( + target_model_path, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + try: + model.resize_token_embeddings(len(tokenizer)) + model = PeftModel.from_pretrained(model, adapter_path) + except RuntimeError: + model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8) + model = PeftModel.from_pretrained(model, adapter_path) + model = model.merge_and_unload() + + logger.info("Saving target model...") + model.save_pretrained(target_model_path) + tokenizer.save_pretrained(target_model_path) + + +def create_model_card(config): + """ + Generates a model card string based on the provided configuration. + + Args: + config (object): Configuration object with the following attributes: + - peft (bool): Indicates if PEFT (Parameter-Efficient Fine-Tuning) is used. + - data_path (str): Path to the dataset. + - project_name (str): Name of the project. + - model (str): Path or identifier of the model. + + Returns: + str: A formatted model card string. + """ + if config.peft: + peft = "\n- peft" + else: + peft = "" + + if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path): + dataset_tag = "" + else: + dataset_tag = f"\ndatasets:\n- {config.data_path}" + + if os.path.isdir(config.model): + base_model = "" + else: + base_model = f"\nbase_model: {config.model}" + + model_card = MODEL_CARD.format( + dataset_tag=dataset_tag, + peft=peft, + base_model=base_model, + ) + return model_card.strip() + + +def pause_endpoint(params): + """ + Pauses a Hugging Face endpoint using the provided parameters. 
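+    The endpoint to pause is read from the `ENDPOINT_ID` environment variable, which is
+    expected in the form "username/project_name".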
+ + Args: + params (object): An object containing the necessary parameters, including: + - token (str): The authorization token to access the Hugging Face API. + + Returns: + dict: The JSON response from the API call. + + Raises: + KeyError: If the "ENDPOINT_ID" environment variable is not set. + requests.exceptions.RequestException: If there is an issue with the API request. + """ + endpoint_id = os.environ["ENDPOINT_ID"] + username = endpoint_id.split("/")[0] + project_name = endpoint_id.split("/")[1] + api_url = f"https://api.endpoints.huggingface.cloud/v2/endpoint/{username}/{project_name}/pause" + headers = {"Authorization": f"Bearer {params.token}"} + r = requests.post(api_url, headers=headers, timeout=30) + return r.json() + + +def apply_chat_template( + example, + tokenizer, + config, +): + """ + Applies a chat template to the given example based on the specified configuration. + + Args: + example (dict): The input example containing the text data to be processed. + tokenizer (object): The tokenizer to be used for applying the chat template. + config (object): Configuration object containing the following attributes: + - trainer (str): Specifies the type of trainer. Can be "default", "sft", "reward", "dpo", or "orpo". + - text_column (str): The key in the example dict that contains the text data. + - chat_template (str): Specifies the chat template to be used. Relevant for "reward" and "dpo" trainers. + + Returns: + dict: The modified example with the chat template applied. + + Raises: + ValueError: If the required keys are not found in the example for "reward", "dpo", or "orpo" trainers. + """ + # kudos to Hugging Face H4 Team for this snippet + if config.trainer in ("default", "sft"): + messages = example[config.text_column] + if isinstance(messages, str): + messages = ast.literal_eval(messages) + example[config.text_column] = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=False + ) + + elif config.trainer == "reward": + if all(k in example.keys() for k in ("chosen", "rejected")): + chosen_messages = example["chosen"] + rejected_messages = example["rejected"] + if isinstance(chosen_messages, str): + chosen_messages = ast.literal_eval(chosen_messages) + if isinstance(rejected_messages, str): + rejected_messages = ast.literal_eval(rejected_messages) + + if config.chat_template == "zephyr" and chosen_messages[0]["role"] != "system": + chosen_messages.insert(0, {"role": "system", "content": ""}) + if config.chat_template == "zephyr" and rejected_messages[0]["role"] != "system": + rejected_messages.insert(0, {"role": "system", "content": ""}) + + example["chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False) + example["rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False) + else: + raise ValueError( + f"Could not format example as dialogue for `rm/orpo` task! 
Require `[chosen, rejected]` keys but found {list(example.keys())}" + ) + elif config.trainer in ("dpo", "orpo"): + if all(k in example.keys() for k in ("chosen", "rejected")): + # For DPO, the inputs are triples of (prompt, chosen, rejected), where `chosen` and `rejected` are the final turn of a dialogue + # We therefore need to extract the N-1 turns to form the prompt + if isinstance(example["chosen"], str): + example["chosen"] = ast.literal_eval(example["chosen"]) + if isinstance(example["rejected"], str): + example["rejected"] = ast.literal_eval(example["rejected"]) + prompt_messages = example["chosen"][:-1] + if config.chat_template == "zephyr" and example["chosen"][0]["role"] != "system": + prompt_messages.insert(0, {"role": "system", "content": ""}) + chosen_messages = example["chosen"][-1:] + rejected_messages = example["rejected"][-1:] + example["chosen"] = tokenizer.apply_chat_template(chosen_messages, tokenize=False) + example["rejected"] = tokenizer.apply_chat_template(rejected_messages, tokenize=False) + example["prompt"] = tokenizer.apply_chat_template(prompt_messages, tokenize=False) + else: + raise ValueError( + f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}" + ) + return example + + +def post_training_steps(config, trainer): + """ + Perform post-training steps including saving the model, creating a model card, merging adapter weights, + and optionally pushing the model to the Hugging Face Hub. + + Args: + config (object): Configuration object containing various settings and parameters. + trainer (object): Trainer object used for training the model. + + Steps: + 1. Save the trained model and set `use_cache` to True. + 2. Create a model card and save it as README.md in the output directory. + 3. If PEFT (Parameter-Efficient Fine-Tuning) and adapter merging are enabled: + - Delete the trainer object and clear CUDA cache. + - Merge adapter weights into the base model. + - Remove adapter weight files from the output directory. + 4. If pushing to the Hugging Face Hub is enabled: + - Remove training data folder. + - Push the model to the Hugging Face Hub repository. + 5. Pause the space if the process index is 0. + + Raises: + Exception: If merging adapter weights fails. + """ + logger.info("Finished training, saving model...") + trainer.model.config.use_cache = True + trainer.save_model(config.project_name) + + model_card = create_model_card(config) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w", encoding="utf-8") as f: + f.write(model_card) + + if config.peft and config.merge_adapter: + del trainer + gc.collect() + torch.cuda.empty_cache() + logger.info("Merging adapter weights...") + try: + merge_adapter( + base_model_path=config.model, + target_model_path=config.project_name, + adapter_path=config.project_name, + ) + # remove adapter weights: adapter_* + for file in os.listdir(config.project_name): + if file.startswith("adapter_"): + os.remove(f"{config.project_name}/{file}") + except Exception as e: + logger.warning(f"Failed to merge adapter weights: {e}") + logger.warning("Skipping adapter merge. 
Only adapter weights will be saved.") + + if config.push_to_hub: + if PartialState().process_index == 0: + # remove data folder + remove_autotrain_data(config) + logger.info("Pushing model to hub...") + save_training_params(config) + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, + repo_id=f"{config.username}/{config.project_name}", + repo_type="model", + ) + + if PartialState().process_index == 0: + pause_space(config) + + +def process_input_data(config): + """ + Processes input data based on the provided configuration. + + Args: + config (object): Configuration object containing the following attributes: + - data_path (str): Path to the dataset. + - project_name (str): Name of the project. + - train_split (str): Split name for training data. + - valid_split (str, optional): Split name for validation data. + - token (str, optional): Token for accessing the dataset. + - text_column (str): Name of the text column. + - rejected_text_column (str): Name of the rejected text column. + - prompt_text_column (str): Name of the prompt text column. + - trainer (str): Type of trainer (e.g., "dpo", "reward", "orpo"). + + Returns: + tuple: A tuple containing: + - train_data (Dataset): Processed training dataset. + - valid_data (Dataset or None): Processed validation dataset if valid_split is provided, otherwise None. + """ + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + # rename columns for reward trainer + if config.trainer in ("dpo", "reward", "orpo"): + if not (config.text_column == "chosen" and config.text_column in train_data.column_names): + train_data = train_data.rename_column(config.text_column, "chosen") + if not (config.rejected_text_column == "rejected" and config.rejected_text_column in train_data.column_names): + train_data = train_data.rename_column(config.rejected_text_column, "rejected") + if config.trainer in ("dpo", "orpo"): + if not (config.prompt_text_column == "prompt" and config.prompt_text_column in train_data.column_names): + train_data = train_data.rename_column(config.prompt_text_column, "prompt") + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.trainer in ("dpo", "reward", "orpo"): + if not (config.text_column == "chosen" and config.text_column in valid_data.column_names): + valid_data = valid_data.rename_column(config.text_column, "chosen") + if 
not (
+                    config.rejected_text_column == "rejected" and config.rejected_text_column in valid_data.column_names
+                ):
+                    valid_data = valid_data.rename_column(config.rejected_text_column, "rejected")
+                if config.trainer in ("dpo", "orpo"):
+                    if not (config.prompt_text_column == "prompt" and config.prompt_text_column in valid_data.column_names):
+                        valid_data = valid_data.rename_column(config.prompt_text_column, "prompt")
+    else:
+        valid_data = None
+
+    logger.info(f"Train data: {train_data}")
+    logger.info(f"Valid data: {valid_data}")
+
+    return train_data, valid_data
+
+
+def get_tokenizer(config):
+    """
+    Initializes and returns a tokenizer based on the provided configuration.
+
+    Args:
+        config (object): Configuration object containing the following attributes:
+            - chat_template (str): The chat template type, either "chatml" or "zephyr".
+            - model (str): The model identifier to load the tokenizer from.
+            - token (str): The token to use for the tokenizer.
+            - model_max_length (int): The maximum length of the model.
+            - padding (str): The padding side, either "left" or "right".
+
+    Returns:
+        tokenizer (PreTrainedTokenizer): The initialized tokenizer with the specified configuration.
+    """
+    special_tokens = None
+    chat_template = None
+    if config.chat_template == "chatml":
+        special_tokens = ChatmlSpecialTokens
+        chat_template = CHATML_CHAT_TEMPLATE
+    elif config.chat_template == "zephyr":
+        special_tokens = ZephyrSpecialTokens
+        chat_template = ZEPHYR_CHAT_TEMPLATE
+
+    if special_tokens is not None:
+        tokenizer = AutoTokenizer.from_pretrained(
+            config.model,
+            pad_token=special_tokens.PAD_TOKEN.value,
+            bos_token=special_tokens.BOS_TOKEN.value,
+            eos_token=special_tokens.EOS_TOKEN.value,
+            additional_special_tokens=special_tokens.list(),
+            token=config.token,
+            trust_remote_code=ALLOW_REMOTE_CODE,
+        )
+        tokenizer.chat_template = chat_template
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE
+        )
+        if tokenizer.chat_template is None:
+            tokenizer.chat_template = DEFAULT_CHAT_TEMPLATE
+
+    if tokenizer.model_max_length > 2048:
+        tokenizer.model_max_length = config.model_max_length
+
+    if getattr(tokenizer, "pad_token", None) is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    if getattr(tokenizer, "pad_token_id", None) is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+
+    if config.padding in ("left", "right"):
+        tokenizer.padding_side = config.padding
+
+    return tokenizer
+
+
+def process_data_with_chat_template(config, tokenizer, train_data, valid_data):
+    """
+    Processes training and validation data using a specified chat template.
+
+    Args:
+        config (object): Configuration object containing settings and parameters.
+        tokenizer (object): Tokenizer object used for tokenizing the data.
+        train_data (Dataset): Training dataset to be processed.
+        valid_data (Dataset): Validation dataset to be processed.
+
+    Returns:
+        tuple: A tuple containing the processed training and validation datasets.
+
+    Notes:
+        - If `config.chat_template` is one of ("chatml", "zephyr", "tokenizer"), the chat template will be applied.
+        - Logs information about the application of the chat template.
+        - For ORPO/DPO, the `prompt` will be extracted from chosen messages.
+        - If `config.valid_split` is not None, the validation data will also be processed.
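+        - If no chat template is configured, both datasets are returned unchanged.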
+ """ + valid_data = None + if config.chat_template in ("chatml", "zephyr", "tokenizer"): + logger.info("Applying chat template") + logger.info("For ORPO/DPO, `prompt` will be extracted from chosen messages") + train_data = train_data.map( + apply_chat_template, + fn_kwargs={ + "tokenizer": tokenizer, + "config": config, + }, + ) + if config.valid_split is not None: + valid_data = valid_data.map( + apply_chat_template, + fn_kwargs={ + "tokenizer": tokenizer, + "config": config, + }, + ) + return train_data, valid_data + + +def configure_logging_steps(config, train_data, valid_data): + """ + Configures the logging steps for training based on the provided configuration and data. + + Parameters: + config (object): Configuration object containing training parameters, including `logging_steps`, `valid_split`, and `batch_size`. + train_data (iterable): Training dataset. + valid_data (iterable): Validation dataset. + + Returns: + int: The number of logging steps to be used during training. + + Notes: + - If `config.logging_steps` is set to -1, the function calculates logging steps based on 20% of the length of the validation data (if `valid_split` is provided) or the training data. + - The calculated logging steps are constrained to be between 1 and 25. + - If `config.logging_steps` is not -1, the function uses the provided value. + """ + logger.info("configuring logging steps") + if config.logging_steps == -1: + if config.valid_split is not None: + logging_steps = int(0.2 * len(valid_data) / config.batch_size) + else: + logging_steps = int(0.2 * len(train_data) / config.batch_size) + if logging_steps == 0: + logging_steps = 1 + if logging_steps > 25: + logging_steps = 25 + config.logging_steps = logging_steps + else: + logging_steps = config.logging_steps + logger.info(f"Logging steps: {logging_steps}") + return logging_steps + + +def configure_training_args(config, logging_steps): + """ + Configures the training arguments for a language model based on the provided configuration. + + Args: + config (object): Configuration object containing various training parameters. + logging_steps (int): Number of steps between logging events. + + Returns: + dict: A dictionary containing the configured training arguments. + + The configuration object `config` should have the following attributes: + - project_name (str): The name of the project, used as the output directory. + - batch_size (int): Batch size for both training and evaluation. + - lr (float): Learning rate. + - epochs (int): Number of training epochs. + - eval_strategy (str): Evaluation strategy, e.g., "steps" or "epoch". + - valid_split (float or None): Validation split ratio. If None, evaluation is disabled. + - save_total_limit (int): Maximum number of checkpoints to save. + - gradient_accumulation (int): Number of gradient accumulation steps. + - log (str): Logging destination, e.g., "tensorboard". + - auto_find_batch_size (bool): Whether to automatically find the optimal batch size. + - scheduler (str): Learning rate scheduler type. + - optimizer (str): Optimizer type. + - warmup_ratio (float): Warmup ratio for learning rate scheduling. + - weight_decay (float): Weight decay for the optimizer. + - max_grad_norm (float): Maximum gradient norm for clipping. + - disable_gradient_checkpointing (bool): Whether to disable gradient checkpointing. + - peft (bool): Whether to use Parameter-Efficient Fine-Tuning (PEFT). + - quantization (str): Quantization type, e.g., "int4" or "int8". 
+ - mixed_precision (str): Mixed precision type, e.g., "fp16" or "bf16". + + The function also sets additional training arguments based on the provided configuration, + such as enabling gradient checkpointing and mixed precision training. + """ + logger.info("configuring training args") + training_args = dict( + output_dir=config.project_name, + per_device_train_batch_size=config.batch_size, + per_device_eval_batch_size=config.batch_size, + learning_rate=config.lr, + num_train_epochs=config.epochs, + eval_strategy=config.eval_strategy if config.valid_split is not None else "no", + logging_steps=logging_steps, + save_total_limit=config.save_total_limit, + save_strategy=config.eval_strategy if config.valid_split is not None else "no", + gradient_accumulation_steps=config.gradient_accumulation, + report_to=config.log, + auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, + gradient_checkpointing=not config.disable_gradient_checkpointing, + remove_unused_columns=False, + ) + + if not config.disable_gradient_checkpointing: + if config.peft and config.quantization in ("int4", "int8"): + training_args["gradient_checkpointing_kwargs"] = {"use_reentrant": True} + else: + training_args["gradient_checkpointing_kwargs"] = {"use_reentrant": False} + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + return training_args + + +def configure_block_size(config, tokenizer): + """ + Configures the block size for the given configuration and tokenizer. + + This function sets the `block_size` attribute in the `config` object based on the `tokenizer`'s maximum model length. + If `config.block_size` is -1, it is set to None. If `config.block_size` is None, it defaults to the tokenizer's + `model_max_length` but not exceeding 1024. If `config.block_size` is specified and exceeds the tokenizer's + `model_max_length`, a warning is logged and the block size is set to the tokenizer's `model_max_length`. + + Args: + config (object): Configuration object that contains the `block_size` attribute. + tokenizer (object): Tokenizer object that contains the `model_max_length` attribute. + + Returns: + object: The updated configuration object with the `block_size` attribute set. + """ + if config.block_size == -1: + config.block_size = None + + if config.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warning( + "The chosen tokenizer supports a `model_max_length` that is longer than the default `block_size` value" + " of 1024. If you would like to use a longer `block_size` up to `tokenizer.model_max_length` you can" + " override this default with `--block_size xxx`." + ) + block_size = 1024 + else: + if config.block_size > tokenizer.model_max_length: + logger.warning( + f"The block_size passed ({config.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." 
+ ) + block_size = min(config.block_size, tokenizer.model_max_length) + + config.block_size = block_size + + logger.info(f"Using block size {block_size}") + return config + + +def get_callbacks(config): + """ + Generate a list of callback instances based on the provided configuration. + + This function creates a list of callback instances that are used during the training process. + It includes default callbacks for logging and training start, and conditionally adds callbacks + for saving and loading PEFT models based on the configuration and environment settings. + + Args: + config (object): Configuration object containing training settings and parameters. + + Returns: + list: A list of callback instances to be used during training. + """ + is_deepspeed_enabled = os.environ.get("ACCELERATE_USE_DEEPSPEED", "False").lower() == "true" + callbacks = [UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()] + if config.peft and not is_deepspeed_enabled: + callbacks.append(SavePeftModelCallback) + if config.valid_split is not None: + callbacks.append(LoadBestPeftModelCallback) + return callbacks + + +def get_model(config, tokenizer): + """ + Loads and configures a language model based on the provided configuration and tokenizer. + + Args: + config (Namespace): Configuration object containing model parameters and settings. + - model (str): The model name or path. + - token (str): Token for accessing the model. + - unsloth (bool): Flag to determine if unsloth is used. + - trainer (str): Type of trainer to use. + - target_modules (str): Target modules for unsloth. + - peft (bool): Flag to determine if PEFT (Parameter-Efficient Fine-Tuning) is used. + - quantization (str): Quantization type, either "int4" or "int8". + - mixed_precision (str): Mixed precision type, either "fp16" or "bf16". + - block_size (int): Maximum sequence length. + - lora_r (int): LoRA rank. + - lora_alpha (int): LoRA alpha. + - lora_dropout (float): LoRA dropout rate. + - seed (int): Random seed. + - disable_gradient_checkpointing (bool): Flag to disable gradient checkpointing. + - use_flash_attention_2 (bool): Flag to use flash attention 2. + tokenizer (PreTrainedTokenizer): Tokenizer to use with the model. + + Returns: + PreTrainedModel: The configured language model. + + Raises: + ImportError: If unsloth is not available when required. 
+ """ + model_config = AutoConfig.from_pretrained( + config.model, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + model_type = model_config.model_type + unsloth_target_modules = None + can_use_unloth = False + + if config.unsloth and is_unsloth_available() and config.trainer in ("default", "sft"): + can_use_unloth = True + + if model_type in ("llama", "mistral", "gemma", "qwen2") and config.unsloth: + if config.target_modules.strip().lower() == "all-linear": + unsloth_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + else: + unsloth_target_modules = get_target_modules(config) + else: + can_use_unloth = False + + logger.info(f"Can use unsloth: {can_use_unloth}") + if can_use_unloth: + from unsloth import FastLanguageModel + + load_in_4bit = False + load_in_8bit = False + if config.peft and config.quantization == "int4": + load_in_4bit = True + elif config.peft and config.quantization == "int8": + load_in_8bit = True + + dtype = None + if config.mixed_precision == "fp16": + dtype = torch.float16 + elif config.mixed_precision == "bf16": + dtype = torch.bfloat16 + + model, _ = FastLanguageModel.from_pretrained( + model_name=config.model, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + load_in_4bit=load_in_4bit, + load_in_8bit=load_in_8bit, + max_seq_length=config.block_size, + dtype=dtype, + ) + if config.peft: + model = FastLanguageModel.get_peft_model( + model, + r=config.lora_r, + target_modules=unsloth_target_modules, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + bias="none", + use_gradient_checkpointing="unsloth", + random_state=config.seed, + max_seq_length=config.block_size, + use_rslora=False, + loftq_config=None, + ) + return model + else: + logger.warning("Unsloth not available, continuing without it...") + + logger.info("loading model config...") + model_config = AutoConfig.from_pretrained( + config.model, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + use_cache=config.disable_gradient_checkpointing, + ) + + logger.info("loading model...") + if config.peft: + if config.quantization == "int4": + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=False, + ) + elif config.quantization == "int8": + bnb_config = BitsAndBytesConfig(load_in_8bit=True) + else: + bnb_config = None + + model = AutoModelForCausalLM.from_pretrained( + config.model, + config=model_config, + token=config.token, + quantization_config=bnb_config, + trust_remote_code=ALLOW_REMOTE_CODE, + use_flash_attention_2=config.use_flash_attention_2, + ) + else: + model = AutoModelForCausalLM.from_pretrained( + config.model, + config=model_config, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + use_flash_attention_2=config.use_flash_attention_2, + ) + + logger.info(f"model dtype: {model.dtype}") + model.resize_token_embeddings(len(tokenizer)) + + if config.trainer != "default": + return model + + if config.peft: + logger.info("preparing peft model...") + if config.quantization is not None: + gradient_checkpointing_kwargs = {} + if not config.disable_gradient_checkpointing: + if config.quantization in ("int4", "int8"): + gradient_checkpointing_kwargs = {"use_reentrant": True} + else: + gradient_checkpointing_kwargs = {"use_reentrant": False} + model = prepare_model_for_kbit_training( + model, + use_gradient_checkpointing=not config.disable_gradient_checkpointing, + 
gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + else: + model.enable_input_require_grads() + + peft_config = LoraConfig( + r=config.lora_r, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + bias="none", + task_type="CAUSAL_LM", + target_modules=get_target_modules(config), + ) + model = get_peft_model(model, peft_config) + + return model diff --git a/src/autotrain/trainers/common.py b/src/autotrain/trainers/common.py new file mode 100644 index 0000000000000000000000000000000000000000..1db188015ffdacf3c9bc032dbeb6a7b270bd8331 --- /dev/null +++ b/src/autotrain/trainers/common.py @@ -0,0 +1,386 @@ +""" +Common classes and functions for all trainers. +""" + +import json +import os +import shutil +import time +import traceback + +import requests +from accelerate import PartialState +from huggingface_hub import HfApi +from pydantic import BaseModel +from transformers import TrainerCallback, TrainerControl, TrainerState, TrainingArguments + +from autotrain import is_colab, logger + + +ALLOW_REMOTE_CODE = os.environ.get("ALLOW_REMOTE_CODE", "true").lower() == "true" + + +def get_file_sizes(directory): + """ + Calculate the sizes of all files in a given directory and its subdirectories. + + Args: + directory (str): The path to the directory to scan for files. + + Returns: + dict: A dictionary where the keys are the file paths and the values are the file sizes in gigabytes (GB). + """ + file_sizes = {} + for root, _, files in os.walk(directory): + for file in files: + file_path = os.path.join(root, file) + file_size = os.path.getsize(file_path) + file_size_gb = file_size / (1024**3) # Convert bytes to GB + file_sizes[file_path] = file_size_gb + return file_sizes + + +def remove_global_step(directory): + """ + Removes directories that start with 'global_step' within the specified directory. + + This function traverses the given directory and its subdirectories in a bottom-up manner. + If it finds any directory whose name starts with 'global_step', it deletes that directory + and all its contents. + + Args: + directory (str): The path to the directory to be traversed and cleaned. + + Returns: + None + """ + for root, dirs, _ in os.walk(directory, topdown=False): + for name in dirs: + if name.startswith("global_step"): + folder_path = os.path.join(root, name) + print(f"Removing folder: {folder_path}") + shutil.rmtree(folder_path) + + +def remove_autotrain_data(config): + """ + Removes the AutoTrain data directory and global step for a given project. + + Args: + config (object): Configuration object that contains the project name. + + Raises: + OSError: If the removal of the directory fails. + """ + os.system(f"rm -rf {config.project_name}/autotrain-data") + remove_global_step(config.project_name) + + +def save_training_params(config): + """ + Saves the training parameters to a JSON file, excluding the "token" key if it exists. + + Args: + config (object): Configuration object that contains the project name. + + The function checks if a file named 'training_params.json' exists in the directory + specified by `config.project_name`. If the file exists, it loads the JSON content, + removes the "token" key if present, and then writes the updated content back to the file. 
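+
+    Example:
+        Illustrative only; assumes `config.project_name` points at a directory where a
+        `training_params.json` was previously written (e.g. by `AutoTrainParams.save`):
+
+        >>> save_training_params(config)
+        >>> # training_params.json is rewritten in place without the "token" field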
+ """ + if os.path.exists(f"{config.project_name}/training_params.json"): + training_params = json.load(open(f"{config.project_name}/training_params.json")) + if "token" in training_params: + training_params.pop("token") + json.dump( + training_params, + open(f"{config.project_name}/training_params.json", "w"), + indent=4, + ) + + +def pause_endpoint(params): + """ + Pauses a Hugging Face endpoint using the provided parameters. + + Args: + params (dict or object): Parameters containing the token required for authorization. + If a dictionary is provided, it should have a key "token" with the authorization token. + If an object is provided, it should have an attribute `token` with the authorization token. + + Returns: + dict: The JSON response from the API call to pause the endpoint. + + Raises: + KeyError: If the "token" key is missing in the params dictionary. + requests.exceptions.RequestException: If there is an issue with the API request. + + Environment Variables: + ENDPOINT_ID: Should be set to the endpoint identifier in the format "username/project_name". + """ + if isinstance(params, dict): + token = params["token"] + else: + token = params.token + endpoint_id = os.environ["ENDPOINT_ID"] + username = endpoint_id.split("/")[0] + project_name = endpoint_id.split("/")[1] + api_url = f"https://api.endpoints.huggingface.cloud/v2/endpoint/{username}/{project_name}/pause" + headers = {"Authorization": f"Bearer {token}"} + r = requests.post(api_url, headers=headers, timeout=120) + return r.json() + + +def pause_space(params, is_failure=False): + """ + Pauses the Hugging Face space and optionally shuts down the endpoint. + + This function checks for the presence of "SPACE_ID" and "ENDPOINT_ID" in the environment variables. + If "SPACE_ID" is found, it pauses the space and creates a discussion on the Hugging Face platform + to notify the user about the status of the training run (success or failure). + If "ENDPOINT_ID" is found, it pauses the endpoint. + + Args: + params (object): An object containing the necessary parameters, including the token, username, and project name. + is_failure (bool, optional): A flag indicating whether the training run failed. Defaults to False. + + Raises: + Exception: If there is an error while creating the discussion on the Hugging Face platform. + + Logs: + Info: Logs the status of pausing the space and endpoint. + Warning: Logs any issues encountered while creating the discussion. + Error: Logs if the model failed to train and the discussion was not created. + """ + if "SPACE_ID" in os.environ: + # shut down the space + logger.info("Pausing space...") + api = HfApi(token=params.token) + + if is_failure: + msg = "Your training run has failed! Please check the logs for more details" + title = "Your training has failed ❌" + else: + msg = "Your training run was successful! 
[Check out your trained model here]" + msg += f"(https://huggingface.co/{params.username}/{params.project_name})" + title = "Your training has finished successfully ✅" + + if not params.token.startswith("hf_oauth_"): + try: + api.create_discussion( + repo_id=os.environ["SPACE_ID"], + title=title, + description=msg, + repo_type="space", + ) + except Exception as e: + logger.warning(f"Failed to create discussion: {e}") + if is_failure: + logger.error("Model failed to train and discussion was not created.") + else: + logger.warning("Model trained successfully but discussion was not created.") + + api.pause_space(repo_id=os.environ["SPACE_ID"]) + if "ENDPOINT_ID" in os.environ: + # shut down the endpoint + logger.info("Pausing endpoint...") + pause_endpoint(params) + + +def monitor(func): + """ + A decorator that wraps a function to monitor its execution and handle exceptions. + + This decorator performs the following actions: + 1. Retrieves the 'config' parameter from the function's keyword arguments or positional arguments. + 2. Executes the wrapped function. + 3. If an exception occurs during the execution of the wrapped function, logs the error message and stack trace. + 4. Optionally pauses the execution if the environment variable 'PAUSE_ON_FAILURE' is set to 1. + + Args: + func (callable): The function to be wrapped by the decorator. + + Returns: + callable: The wrapped function with monitoring capabilities. + """ + + def wrapper(*args, **kwargs): + config = kwargs.get("config", None) + if config is None and len(args) > 0: + config = args[0] + + try: + return func(*args, **kwargs) + except Exception as e: + error_message = f"""{func.__name__} has failed due to an exception: {traceback.format_exc()}""" + logger.error(error_message) + logger.error(str(e)) + if int(os.environ.get("PAUSE_ON_FAILURE", 1)) == 1: + pause_space(config, is_failure=True) + + return wrapper + + +class AutoTrainParams(BaseModel): + """ + AutoTrainParams is a base class for all AutoTrain parameters. + Attributes: + Config (class): Configuration class for Pydantic model. + protected_namespaces (tuple): Protected namespaces for the model. + Methods: + save(output_dir): + Save parameters to a JSON file in the specified output directory. + __str__(): + Return a string representation of the parameters, masking the token if present. + __init__(**data): + Initialize the parameters, check for unused/extra parameters, and warn the user if necessary. + Raises ValueError if project_name is not alphanumeric (with hyphens allowed) or exceeds 50 characters. + """ + + class Config: + protected_namespaces = () + + def save(self, output_dir): + """ + Save parameters to a json file. + """ + os.makedirs(output_dir, exist_ok=True) + path = os.path.join(output_dir, "training_params.json") + # save formatted json + with open(path, "w", encoding="utf-8") as f: + f.write(self.model_dump_json(indent=4)) + + def __str__(self): + """ + String representation of the parameters. + """ + data = self.model_dump() + data["token"] = "*****" if data.get("token") else None + return str(data) + + def __init__(self, **data): + """ + Initialize the parameters, check for unused/extra parameters and warn the user. + """ + super().__init__(**data) + + if len(self.project_name) > 0: + # make sure project_name is always alphanumeric but can have hyphens. 
if not, raise ValueError + if not self.project_name.replace("-", "").isalnum(): + raise ValueError("project_name must be alphanumeric but can contain hyphens") + + # project name cannot be more than 50 characters + if len(self.project_name) > 50: + raise ValueError("project_name cannot be more than 50 characters") + + # Parameters not supplied by the user + defaults = set(self.model_fields.keys()) + supplied = set(data.keys()) + not_supplied = defaults - supplied + if not_supplied and not is_colab: + logger.warning(f"Parameters not supplied by user and set to default: {', '.join(not_supplied)}") + + # Parameters that were supplied but not used + # This is a naive implementation. It might catch some internal Pydantic params. + unused = supplied - set(self.model_fields) + if unused: + logger.warning(f"Parameters supplied but not used: {', '.join(unused)}") + + +class UploadLogs(TrainerCallback): + """ + A callback to upload training logs to the Hugging Face Hub. + + Args: + config (object): Configuration object containing necessary parameters. + + Attributes: + config (object): Configuration object containing necessary parameters. + api (HfApi or None): Instance of HfApi for interacting with the Hugging Face Hub. + last_upload_time (float): Timestamp of the last upload. + + Methods: + on_step_end(args, state, control, **kwargs): + Called at the end of each training step. Uploads logs to the Hugging Face Hub if conditions are met. + """ + + def __init__(self, config): + self.config = config + self.api = None + self.last_upload_time = 0 + + if self.config.push_to_hub: + if PartialState().process_index == 0: + self.api = HfApi(token=config.token) + self.api.create_repo( + repo_id=f"{self.config.username}/{self.config.project_name}", repo_type="model", private=True + ) + + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + if self.config.push_to_hub is False: + return control + + if not os.path.exists(os.path.join(self.config.project_name, "runs")): + return control + + if (state.global_step + 1) % self.config.logging_steps == 0 and self.config.log == "tensorboard": + if PartialState().process_index == 0: + current_time = time.time() + if current_time - self.last_upload_time >= 600: + try: + self.api.upload_folder( + folder_path=os.path.join(self.config.project_name, "runs"), + repo_id=f"{self.config.username}/{self.config.project_name}", + path_in_repo="runs", + ) + except Exception as e: + logger.warning(f"Failed to upload logs: {e}") + logger.warning("Continuing training...") + + self.last_upload_time = current_time + return control + + +class LossLoggingCallback(TrainerCallback): + """ + LossLoggingCallback is a custom callback for logging loss during training. + + This callback inherits from `TrainerCallback` and overrides the `on_log` method + to remove the "total_flos" key from the logs and log the remaining information + if the current process is the local process zero. + + Methods: + on_log(args, state, control, logs=None, **kwargs): + Called when the logs are updated. Removes the "total_flos" key from the logs + and logs the remaining information if the current process is the local process zero. + + Args: + args: The training arguments. + state: The current state of the Trainer. + control: The control object for the Trainer. + logs (dict, optional): The logs dictionary containing the training metrics. + **kwargs: Additional keyword arguments. 
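+
+    Example:
+        A typical entry written to the logs (values are illustrative):
+
+        {'loss': 1.2345, 'learning_rate': 4.5e-05, 'epoch': 0.5}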
+ """ + + def on_log(self, args, state, control, logs=None, **kwargs): + _ = logs.pop("total_flos", None) + if state.is_local_process_zero: + logger.info(logs) + + +class TrainStartCallback(TrainerCallback): + """ + TrainStartCallback is a custom callback for the Trainer class that logs a message when training begins. + + Methods: + on_train_begin(args, state, control, **kwargs): + Logs a message indicating that training is starting. + + Args: + args: The training arguments. + state: The current state of the Trainer. + control: The control object for the Trainer. + **kwargs: Additional keyword arguments. + """ + + def on_train_begin(self, args, state, control, **kwargs): + logger.info("Starting to train...") diff --git a/src/autotrain/trainers/extractive_question_answering/__init__.py b/src/autotrain/trainers/extractive_question_answering/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/extractive_question_answering/__main__.py b/src/autotrain/trainers/extractive_question_answering/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..42c3646949eabc791be4dc9f8c3201be4c77a634 --- /dev/null +++ b/src/autotrain/trainers/extractive_question_answering/__main__.py @@ -0,0 +1,263 @@ +import argparse +import copy +import json +from functools import partial + +from accelerate.state import PartialState +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from transformers import ( + AutoConfig, + AutoModelForQuestionAnswering, + AutoTokenizer, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.extractive_question_answering import utils +from autotrain.trainers.extractive_question_answering.dataset import ExtractiveQuestionAnsweringDataset +from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = ExtractiveQuestionAnsweringParams(**config) + + train_data = None + valid_data = None + # check if config.train_split.csv exists in config.data_path + if config.train_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in 
config.valid_split:
+                dataset_config_name, split = config.valid_split.split(":")
+                valid_data = load_dataset(
+                    config.data_path,
+                    name=dataset_config_name,
+                    split=split,
+                    token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
+                )
+            else:
+                valid_data = load_dataset(
+                    config.data_path,
+                    split=config.valid_split,
+                    token=config.token,
+                    trust_remote_code=ALLOW_REMOTE_CODE,
+                )
+
+    logger.info(train_data)
+    if config.valid_split is not None:
+        logger.info(valid_data)
+
+    model_config = AutoConfig.from_pretrained(config.model, trust_remote_code=ALLOW_REMOTE_CODE, token=config.token)
+
+    try:
+        model = AutoModelForQuestionAnswering.from_pretrained(
+            config.model,
+            config=model_config,
+            trust_remote_code=ALLOW_REMOTE_CODE,
+            token=config.token,
+            ignore_mismatched_sizes=True,
+        )
+    except OSError:
+        model = AutoModelForQuestionAnswering.from_pretrained(
+            config.model,
+            config=model_config,
+            from_tf=True,
+            trust_remote_code=ALLOW_REMOTE_CODE,
+            token=config.token,
+            ignore_mismatched_sizes=True,
+        )
+
+    tokenizer = AutoTokenizer.from_pretrained(config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE)
+
+    use_v2 = False
+    if config.valid_split is not None:
+        id_column = list(range(len(valid_data)))
+        for data in valid_data:
+            if -1 in data[config.answer_column]["answer_start"]:
+                use_v2 = True
+                break
+
+        valid_data = valid_data.add_column("id", id_column)
+        column_names = valid_data.column_names
+        partial_process = partial(
+            utils.prepare_qa_validation_features,
+            tokenizer=tokenizer,
+            config=config,
+        )
+        processed_eval_dataset = valid_data.map(
+            partial_process,
+            batched=True,
+            remove_columns=column_names,
+            num_proc=2,
+            desc="Running tokenizer on validation dataset",
+        )
+        orig_valid_data = copy.deepcopy(valid_data)
+
+    train_data = ExtractiveQuestionAnsweringDataset(data=train_data, tokenizer=tokenizer, config=config)
+    if config.valid_split is not None:
+        valid_data = ExtractiveQuestionAnsweringDataset(data=valid_data, tokenizer=tokenizer, config=config)
+
+    if config.logging_steps == -1:
+        if config.valid_split is not None:
+            logging_steps = int(0.2 * len(valid_data) / config.batch_size)
+        else:
+            logging_steps = int(0.2 * len(train_data) / config.batch_size)
+        if logging_steps == 0:
+            logging_steps = 1
+        if logging_steps > 25:
+            logging_steps = 25
+        config.logging_steps = logging_steps
+    else:
+        logging_steps = config.logging_steps
+
+    logger.info(f"Logging steps: {logging_steps}")
+
+    training_args = dict(
+        output_dir=config.project_name,
+        per_device_train_batch_size=config.batch_size,
+        per_device_eval_batch_size=2 * config.batch_size,
+        learning_rate=config.lr,
+        num_train_epochs=config.epochs,
+        eval_strategy=config.eval_strategy if config.valid_split is not None else "no",
+        logging_steps=logging_steps,
+        save_total_limit=config.save_total_limit,
+        save_strategy=config.eval_strategy if config.valid_split is not None else "no",
+        gradient_accumulation_steps=config.gradient_accumulation,
+        report_to=config.log,
+        auto_find_batch_size=config.auto_find_batch_size,
+        lr_scheduler_type=config.scheduler,
+        optim=config.optimizer,
+        warmup_ratio=config.warmup_ratio,
+        weight_decay=config.weight_decay,
+        max_grad_norm=config.max_grad_norm,
+        push_to_hub=False,
+        load_best_model_at_end=True if config.valid_split is not None else False,
+        ddp_find_unused_parameters=False,
+    )
+
+    if config.mixed_precision == "fp16":
+        training_args["fp16"] = True
+    if config.mixed_precision == "bf16":
+        training_args["bf16"] = True
+
+    if config.valid_split is not None:
+
early_stop = EarlyStoppingCallback( + early_stopping_patience=config.early_stopping_patience, + early_stopping_threshold=config.early_stopping_threshold, + ) + callbacks_to_use = [early_stop] + else: + callbacks_to_use = [] + + callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()]) + + if config.valid_split is not None: + logger.info(processed_eval_dataset) + compute_metrics = partial( + utils.compute_metrics, + eval_dataset=processed_eval_dataset, + eval_examples=orig_valid_data, + config=config, + use_v2=use_v2, + ) + else: + compute_metrics = None + + args = TrainingArguments(**training_args) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks_to_use, + compute_metrics=compute_metrics, + ) + + trainer = Trainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data, + ) + trainer.remove_callback(PrinterCallback) + trainer.train() + + logger.info("Finished training, saving model...") + trainer.save_model(config.project_name) + tokenizer.save_pretrained(config.project_name) + + model_card = utils.create_model_card(config, trainer) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w") as f: + f.write(model_card) + + if config.push_to_hub: + if PartialState().process_index == 0: + remove_autotrain_data(config) + save_training_params(config) + logger.info("Pushing model to hub...") + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, + repo_id=f"{config.username}/{config.project_name}", + repo_type="model", + ) + + if PartialState().process_index == 0: + pause_space(config) + + +if __name__ == "__main__": + args = parse_args() + training_config = json.load(open(args.training_config)) + config = ExtractiveQuestionAnsweringParams(**training_config) + train(config) diff --git a/src/autotrain/trainers/extractive_question_answering/dataset.py b/src/autotrain/trainers/extractive_question_answering/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..c9429f6242513f6c4f6d80d6c018d0d7b94fc00c --- /dev/null +++ b/src/autotrain/trainers/extractive_question_answering/dataset.py @@ -0,0 +1,121 @@ +from functools import partial + +from autotrain import logger + + +def _prepare_dataset(examples, tokenizer, config): + # taken from: + # https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/run_qa.py + # and modified for AutoTrain + pad_on_right = tokenizer.padding_side == "right" + tokenized_examples = tokenizer( + examples[config.question_column if pad_on_right else config.text_column], + examples[config.text_column if pad_on_right else config.question_column], + truncation="only_second" if pad_on_right else "only_first", + max_length=config.max_seq_length, + stride=config.max_doc_stride, + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length", + ) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. 
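+    # For example, a context "AutoTrain" tokenized as ["Auto", "Train"] yields offsets
+    # [(0, 4), (4, 9)], i.e. (start_char, end_char) spans into the original string.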
+ offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + if tokenizer.cls_token_id in input_ids: + cls_index = input_ids.index(tokenizer.cls_token_id) + elif tokenizer.bos_token_id in input_ids: + cls_index = input_ids.index(tokenizer.bos_token_id) + else: + cls_index = 0 + + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples[config.answer_column][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). + while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + +class ExtractiveQuestionAnsweringDataset: + """ + A dataset class for extractive question answering tasks. + + Args: + data (Dataset): The dataset to be processed. + tokenizer (PreTrainedTokenizer): The tokenizer to be used for processing the data. + config (dict): Configuration parameters for processing the dataset. + + Attributes: + data (Dataset): The original dataset. + tokenizer (PreTrainedTokenizer): The tokenizer used for processing the data. + config (dict): Configuration parameters for processing the dataset. + tokenized_data (Dataset): The tokenized dataset after applying the mapping function. + + Methods: + __len__(): Returns the length of the tokenized dataset. + __getitem__(item): Returns the tokenized data at the specified index. 
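+
+    Example:
+        A minimal sketch (names are illustrative, not part of this module):
+
+        >>> dataset = ExtractiveQuestionAnsweringDataset(data=raw_data, tokenizer=tokenizer, config=config)
+        >>> len(dataset)  # number of tokenized features, which can exceed the number of raw examples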
+ """ + + def __init__(self, data, tokenizer, config): + self.data = data + self.tokenizer = tokenizer + self.config = config + logger.info("Processing data for Extractive QA") + mapping_function = partial(_prepare_dataset, tokenizer=self.tokenizer, config=self.config) + self.tokenized_data = self.data.map( + mapping_function, + batched=True, + remove_columns=self.data.column_names, + ) + + def __len__(self): + return len(self.tokenized_data) + + def __getitem__(self, item): + return self.tokenized_data[item] diff --git a/src/autotrain/trainers/extractive_question_answering/params.py b/src/autotrain/trainers/extractive_question_answering/params.py new file mode 100644 index 0000000000000000000000000000000000000000..61e1f8a68692d1994e805a823f519d51b5dcbd3b --- /dev/null +++ b/src/autotrain/trainers/extractive_question_answering/params.py @@ -0,0 +1,76 @@ +from typing import Optional + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class ExtractiveQuestionAnsweringParams(AutoTrainParams): + """ + ExtractiveQuestionAnsweringParams + + Parameters: + data_path (str): Path to the dataset. + model (str): Pre-trained model name. Default is "bert-base-uncased". + lr (float): Learning rate for the optimizer. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length for inputs. Default is 128. + max_doc_stride (int): Maximum document stride for splitting context. Default is 128. + batch_size (int): Batch size for training. Default is 8. + warmup_ratio (float): Warmup proportion for learning rate scheduler. Default is 0.1. + gradient_accumulation (int): Number of gradient accumulation steps. Default is 1. + optimizer (str): Optimizer type. Default is "adamw_torch". + scheduler (str): Learning rate scheduler type. Default is "linear". + weight_decay (float): Weight decay for the optimizer. Default is 0.0. + max_grad_norm (float): Maximum gradient norm for clipping. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. Default is None. + text_column (str): Column name for context/text. Default is "context". + question_column (str): Column name for questions. Default is "question". + answer_column (str): Column name for answers. Default is "answers". + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project for output directory. Default is "project-name". + auto_find_batch_size (bool): Automatically find optimal batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). Default is None. + save_total_limit (int): Maximum number of checkpoints to save. Default is 1. + token (Optional[str]): Authentication token for Hugging Face Hub. Default is None. + push_to_hub (bool): Whether to push the model to Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy during training. Default is "epoch". + username (Optional[str]): Hugging Face username for authentication. Default is None. + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement for early stopping. Default is 5. + early_stopping_threshold (float): Threshold for early stopping improvement. Default is 0.01. 
+ """ + + data_path: str = Field(None, title="Path to the dataset") + model: str = Field("bert-base-uncased", title="Pre-trained model name") + lr: float = Field(5e-5, title="Learning rate for the optimizer") + epochs: int = Field(3, title="Number of training epochs") + max_seq_length: int = Field(128, title="Maximum sequence length for inputs") + max_doc_stride: int = Field(128, title="Maximum document stride for splitting context") + batch_size: int = Field(8, title="Batch size for training") + warmup_ratio: float = Field(0.1, title="Warmup proportion for learning rate scheduler") + gradient_accumulation: int = Field(1, title="Number of gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer type") + scheduler: str = Field("linear", title="Learning rate scheduler type") + weight_decay: float = Field(0.0, title="Weight decay for the optimizer") + max_grad_norm: float = Field(1.0, title="Maximum gradient norm for clipping") + seed: int = Field(42, title="Random seed for reproducibility") + train_split: str = Field("train", title="Name of the training data split") + valid_split: Optional[str] = Field(None, title="Name of the validation data split") + text_column: str = Field("context", title="Column name for context/text") + question_column: str = Field("question", title="Column name for questions") + answer_column: str = Field("answers", title="Column name for answers") + logging_steps: int = Field(-1, title="Number of steps between logging") + project_name: str = Field("project-name", title="Name of the project for output directory") + auto_find_batch_size: bool = Field(False, title="Automatically find optimal batch size") + mixed_precision: Optional[str] = Field(None, title="Mixed precision training mode (fp16, bf16, or None)") + save_total_limit: int = Field(1, title="Maximum number of checkpoints to save") + token: Optional[str] = Field(None, title="Authentication token for Hugging Face Hub") + push_to_hub: bool = Field(False, title="Whether to push the model to Hugging Face Hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy during training") + username: Optional[str] = Field(None, title="Hugging Face username for authentication") + log: str = Field("none", title="Logging method for experiment tracking") + early_stopping_patience: int = Field(5, title="Number of epochs with no improvement for early stopping") + early_stopping_threshold: float = Field(0.01, title="Threshold for early stopping improvement") diff --git a/src/autotrain/trainers/extractive_question_answering/utils.py b/src/autotrain/trainers/extractive_question_answering/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d3deea7d8fb63a6f12da9b4d19faac3f7c979cb6 --- /dev/null +++ b/src/autotrain/trainers/extractive_question_answering/utils.py @@ -0,0 +1,396 @@ +import collections +import json +import os + +import numpy as np +from datasets import load_metric +from transformers import EvalPrediction + +from autotrain import logger + + +MODEL_CARD = """ +--- +library_name: transformers +tags: +- autotrain +- question-answering{base_model} +widget: +- text: "Who loves AutoTrain?" + context: "Everyone loves AutoTrain"{dataset_tag} +--- + +# Model Trained Using AutoTrain + +- Problem type: Extractive Question Answering + +## Validation Metrics + +{validation_metrics} + +## Usage + + +```python +import torch + +from transformers import AutoModelForQuestionAnswering, AutoTokenizer + +model = AutoModelForQuestionAnswering.from_pretrained(...) 
+
+tokenizer = AutoTokenizer.from_pretrained(...)
+
+question, text = "Who loves AutoTrain?", "Everyone loves AutoTrain"
+
+inputs = tokenizer(question, text, return_tensors='pt')
+
+start_positions = torch.tensor([1])
+
+end_positions = torch.tensor([3])
+
+outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
+
+loss = outputs.loss
+
+start_scores = outputs.start_logits
+
+end_scores = outputs.end_logits
+```
+"""
+
+SQUAD_METRIC = load_metric("squad")
+SQUAD_V2_METRIC = load_metric("squad_v2")
+
+
+def postprocess_qa_predictions(
+    examples,
+    features,
+    predictions,
+    config,
+    version_2_with_negative=False,
+    n_best_size=20,
+    max_answer_length=30,
+    null_score_diff_threshold=0.0,
+    output_dir=None,
+    prefix=None,
+):
+    # This function is taken from: https://github.com/huggingface/transformers/blob/dcec4c4387850dff8123d5752aab8c1b5431465b/examples/pytorch/question-answering/run_qa.py#L470
+    """
+    Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
+    original contexts. This is the base postprocessing function for models that only return start and end logits.
+
+    Args:
+        examples: The non-preprocessed dataset (see the main script for more information).
+        features: The processed dataset (see the main script for more information).
+        predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
+            The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
+            first dimension must match the number of elements of :obj:`features`.
+        version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not the underlying dataset contains examples with no answers.
+        n_best_size (:obj:`int`, `optional`, defaults to 20):
+            The total number of n-best predictions to generate when looking for an answer.
+        max_answer_length (:obj:`int`, `optional`, defaults to 30):
+            The maximum length of an answer that can be generated. This is needed because the start and end predictions
+            are not conditioned on one another.
+        null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
+            The threshold used to select the null answer: if the best answer has a score that is less than the score of
+            the null answer minus this threshold, the null answer is selected for this example (note that the score of
+            the null answer for an example giving several features is the minimum of the scores for the null answer on
+            each feature: all features must be aligned on the fact they `want` to predict a null answer).
+
+            Only useful when :obj:`version_2_with_negative` is :obj:`True`.
+        output_dir (:obj:`str`, `optional`):
+            If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
+            :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
+            answers, are saved in `output_dir`.
+        prefix (:obj:`str`, `optional`):
+            If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
+    """
+    if len(predictions) != 2:
+        raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).")
+    all_start_logits, all_end_logits = predictions
+
+    if len(predictions[0]) != len(features):
+        raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.")
+
+    # Build a map from each example to its corresponding features.
+    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
+    features_per_example = collections.defaultdict(list)
+    for i, feature in enumerate(features):
+        features_per_example[example_id_to_index[feature["example_id"]]].append(i)
+
+    # The dictionaries we have to fill.
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    if version_2_with_negative:
+        scores_diff_json = collections.OrderedDict()
+
+    # Logging.
+    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
+
+    # Let's loop over all the examples!
+    for example_index, example in enumerate(examples):
+        # Those are the indices of the features associated with the current example.
+        feature_indices = features_per_example[example_index]
+
+        min_null_prediction = None
+        prelim_predictions = []
+
+        # Looping through all the features associated with the current example.
+        for feature_index in feature_indices:
+            # We grab the predictions of the model for this feature.
+            start_logits = all_start_logits[feature_index]
+            end_logits = all_end_logits[feature_index]
+            # This is what will allow us to map some of the positions in our logits to spans of text in the original
+            # context.
+            offset_mapping = features[feature_index]["offset_mapping"]
+            # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
+            # available in the current feature.
+            token_is_max_context = features[feature_index].get("token_is_max_context", None)
+
+            # Update minimum null prediction.
+            feature_null_score = start_logits[0] + end_logits[0]
+            if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
+                min_null_prediction = {
+                    "offsets": (0, 0),
+                    "score": feature_null_score,
+                    "start_logit": start_logits[0],
+                    "end_logit": end_logits[0],
+                }
+
+            # Go through all possibilities for the `n_best_size` largest start and end logits.
+            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
+            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
+                    # to part of the input_ids that are not in the context.
+                    if (
+                        start_index >= len(offset_mapping)
+                        or end_index >= len(offset_mapping)
+                        or offset_mapping[start_index] is None
+                        or len(offset_mapping[start_index]) < 2
+                        or offset_mapping[end_index] is None
+                        or len(offset_mapping[end_index]) < 2
+                    ):
+                        continue
+                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
+                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
+                        continue
+                    # Don't consider answers that don't have the maximum context available (if such information is
+                    # provided).
+ if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative and min_null_prediction is not None: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if ( + version_2_with_negative + and min_null_prediction is not None + and not any(p["offsets"] == (0, 0) for p in predictions) + ): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example[config.text_column] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): + predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction. If the null answer is not possible, this is easy. + if not version_2_with_negative: + all_predictions[example["id"]] = predictions[0]["text"] + else: + # Otherwise we first need to find the best non-empty prediction. + i = 0 + while predictions[i]["text"] == "": + i += 1 + best_non_null_pred = predictions[i] + + # Then we compare to the null prediction using the threshold. + score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] + scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. + if score_diff > null_score_diff_threshold: + all_predictions[example["id"]] = "" + else: + all_predictions[example["id"]] = best_non_null_pred["text"] + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. 
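+    # (predictions.json, nbest_predictions.json and, for datasets with unanswerable
+    # questions, null_odds.json; the exact filenames are constructed just below)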
+    if output_dir is not None:
+        if not os.path.isdir(output_dir):
+            raise EnvironmentError(f"{output_dir} is not a directory.")
+
+        prediction_file = os.path.join(
+            output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
+        )
+        nbest_file = os.path.join(
+            output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
+        )
+        if version_2_with_negative:
+            null_odds_file = os.path.join(
+                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
+            )
+
+        logger.info(f"Saving predictions to {prediction_file}.")
+        with open(prediction_file, "w") as writer:
+            writer.write(json.dumps(all_predictions, indent=4) + "\n")
+        logger.info(f"Saving nbest_preds to {nbest_file}.")
+        with open(nbest_file, "w") as writer:
+            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+        if version_2_with_negative:
+            logger.info(f"Saving null_odds to {null_odds_file}.")
+            with open(null_odds_file, "w") as writer:
+                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions
+
+
+def post_processing_function_qa(examples, features, predictions, version_2_with_negative, config, stage="eval"):
+    # Post-processing: we match the start logits and end logits to answers in the original context.
+    predictions = postprocess_qa_predictions(
+        examples=examples,
+        features=features,
+        predictions=predictions,
+        version_2_with_negative=version_2_with_negative,
+        n_best_size=20,
+        max_answer_length=30,
+        null_score_diff_threshold=0.0,
+        output_dir=None,
+        prefix=stage,
+        config=config,
+    )
+    # Format the result to the format the metric expects.
+    if version_2_with_negative:
+        formatted_predictions = [
+            {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
+        ]
+    else:
+        formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
+
+    references = [{"id": str(ex["id"]), "answers": ex[config.answer_column]} for ex in examples]
+    return EvalPrediction(predictions=formatted_predictions, label_ids=references)
+
+
+def compute_metrics(pred, eval_dataset, eval_examples, use_v2, config):
+    preds, label_ids = post_processing_function_qa(eval_examples, eval_dataset, pred.predictions, use_v2, config)
+
+    if use_v2:
+        result = SQUAD_V2_METRIC.compute(predictions=preds, references=label_ids)
+    else:
+        result = SQUAD_METRIC.compute(predictions=preds, references=label_ids)
+    return {k: round(v, 4) for k, v in result.items()}
+
+
+def create_model_card(config, trainer):
+    if config.valid_split is not None:
+        eval_scores = trainer.evaluate()
+        eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items()]
+        eval_scores = "\n\n".join(eval_scores)
+    else:
+        eval_scores = "No validation metrics available"
+
+    if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path):
+        dataset_tag = ""
+    else:
+        dataset_tag = f"\ndatasets:\n- {config.data_path}"
+
+    if os.path.isdir(config.model):
+        base_model = ""
+    else:
+        base_model = f"\nbase_model: {config.model}"
+
+    model_card = MODEL_CARD.format(
+        dataset_tag=dataset_tag,
+        validation_metrics=eval_scores,
+        base_model=base_model,
+    )
+    return model_card
+
+
+def prepare_qa_validation_features(examples, tokenizer, config):
+    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
+    # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
+    # left whitespace.
+    pad_on_right = tokenizer.padding_side == "right"
+    examples[config.question_column] = [q.lstrip() for q in examples[config.question_column]]
+
+    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
+    # in one example possibly giving several features when a context is long, each of those features having a
+    # context that overlaps a bit with the context of the previous feature.
+    tokenized_examples = tokenizer(
+        examples[config.question_column if pad_on_right else config.text_column],
+        examples[config.text_column if pad_on_right else config.question_column],
+        truncation="only_second" if pad_on_right else "only_first",
+        max_length=config.max_seq_length,
+        stride=config.max_doc_stride,
+        return_overflowing_tokens=True,
+        return_offsets_mapping=True,
+        padding="max_length",
+    )
+
+    # Since one example might give us several features if it has a long context, we need a map from a feature to
+    # its corresponding example. This key gives us just that.
+    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+
+    # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
+    # corresponding example_id and we will store the offset mappings.
+    tokenized_examples["example_id"] = []
+
+    for i in range(len(tokenized_examples["input_ids"])):
+        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
+        sequence_ids = tokenized_examples.sequence_ids(i)
+        context_index = 1 if pad_on_right else 0
+
+        # One example can give several spans; this is the index of the example containing this span of text.
+        sample_index = sample_mapping[i]
+        tokenized_examples["example_id"].append(examples["id"][sample_index])
+
+        # Set to None the offset_mapping entries that are not part of the context, so it's easy to determine
+        # whether a token position is part of the context or not.
+        tokenized_examples["offset_mapping"][i] = [
+            (o if sequence_ids[k] == context_index else None)
+            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
+        ]
+
+    return tokenized_examples
diff --git a/src/autotrain/trainers/generic/__init__.py b/src/autotrain/trainers/generic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/src/autotrain/trainers/generic/__main__.py b/src/autotrain/trainers/generic/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2dc4c07b7820967779d884f4255e6e7b44a75540
--- /dev/null
+++ b/src/autotrain/trainers/generic/__main__.py
@@ -0,0 +1,58 @@
+import argparse
+import json
+
+from autotrain import logger
+from autotrain.trainers.common import monitor, pause_space
+from autotrain.trainers.generic import utils
+from autotrain.trainers.generic.params import GenericParams
+
+
+def parse_args():
+    # get training_config.json from the end user
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", type=str, required=True)
+    return parser.parse_args()
+
+
+@monitor
+def run(config):
+    """
+    Executes a series of operations based on the provided configuration.
+
+    This function performs the following steps:
+    1. Converts the configuration dictionary to a GenericParams object if necessary.
+    2. Downloads the data repository specified in the configuration.
+    3. Uninstalls any existing requirements specified in the configuration.
+    4. Installs the necessary requirements specified in the configuration.
+    5. Runs a command specified in the configuration.
+    6. Pauses the space as specified in the configuration.
+
+    Args:
+        config (dict or GenericParams): The configuration for the operations to be performed.
+    """
+    if isinstance(config, dict):
+        config = GenericParams(**config)
+
+    # download the data repo
+    logger.info("Downloading data repo...")
+    utils.pull_dataset_repo(config)
+
+    logger.info("Uninstalling requirements...")
+    utils.uninstall_requirements(config)
+
+    # install the requirements
+    logger.info("Installing requirements...")
+    utils.install_requirements(config)
+
+    # run the command
+    logger.info("Running command...")
+    utils.run_command(config)
+
+    pause_space(config)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    _config = json.load(open(args.config))
+    _config = GenericParams(**_config)
+    run(_config)
diff --git a/src/autotrain/trainers/generic/params.py b/src/autotrain/trainers/generic/params.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d826a4bec89f907ae999b80bd83043eac0c4780
--- /dev/null
+++ b/src/autotrain/trainers/generic/params.py
@@ -0,0 +1,36 @@
+from typing import Dict, Optional
+
+from pydantic import Field
+
+from autotrain.trainers.common import AutoTrainParams
+
+
+class GenericParams(AutoTrainParams):
+    """
+    GenericParams is a class that holds configuration parameters for an AutoTrain SpaceRunner project.
+
+    Attributes:
+        username (str): The username for your Hugging Face account.
+        project_name (str): The name of the project.
+        data_path (str): The file path to the dataset.
+        token (str): The authentication token for accessing Hugging Face Hub.
+        script_path (str): The file path to the script to be executed. Path to script.py.
+        env (Optional[Dict[str, str]]): A dictionary of environment variables to be set.
+        args (Optional[Dict[str, str]]): A dictionary of arguments to be passed to the script.
+    """
+
+    username: str = Field(
+        None, title="Hugging Face Username", description="The username for your Hugging Face account."
+    )
+    project_name: str = Field("project-name", title="Project Name", description="The name of the project.")
+    data_path: str = Field(None, title="Data Path", description="The file path to the dataset.")
+    token: str = Field(None, title="Hub Token", description="The authentication token for accessing Hugging Face Hub.")
+    script_path: str = Field(
+        None, title="Script Path", description="The file path to the script to be executed. Path to script.py"
+    )
+    env: Optional[Dict[str, str]] = Field(
+        None, title="Environment Variables", description="A dictionary of environment variables to be set."
+    )
+    args: Optional[Dict[str, str]] = Field(
+        None, title="Arguments", description="A dictionary of arguments to be passed to the script."
+    )
diff --git a/src/autotrain/trainers/generic/utils.py b/src/autotrain/trainers/generic/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1290631432784e0b059c9deaa79e360b682b5148
--- /dev/null
+++ b/src/autotrain/trainers/generic/utils.py
@@ -0,0 +1,201 @@
+import os
+import subprocess
+
+import requests
+from huggingface_hub import HfApi, snapshot_download
+
+from autotrain import logger
+
+
+def create_dataset_repo(username, project_name, script_path, token):
+    """
+    Creates a new dataset repository on Hugging Face and uploads the specified dataset.
+
+    Args:
+        username (str): The username of the Hugging Face account.
+        project_name (str): The name of the project for which the dataset repository is being created.
+ script_path (str): The local path to the dataset folder that needs to be uploaded. + token (str): The authentication token for the Hugging Face API. + + Returns: + str: The repository ID of the newly created dataset repository. + """ + logger.info("Creating dataset repo...") + api = HfApi(token=token) + repo_id = f"{username}/autotrain-{project_name}" + api.create_repo( + repo_id=repo_id, + repo_type="dataset", + private=True, + ) + logger.info("Uploading dataset...") + api.upload_folder( + folder_path=script_path, + repo_id=repo_id, + repo_type="dataset", + ) + logger.info("Dataset uploaded.") + return repo_id + + +def pull_dataset_repo(params): + """ + Downloads a dataset repository from Hugging Face Hub. + + Args: + params (object): An object containing the following attributes: + - data_path (str): The repository ID of the dataset. + - project_name (str): The local directory where the dataset will be downloaded. + - token (str): The authentication token for accessing the repository. + + Returns: + None + """ + snapshot_download( + repo_id=params.data_path, + local_dir=params.project_name, + token=params.token, + repo_type="dataset", + ) + + +def uninstall_requirements(params): + """ + Uninstalls the requirements specified in the requirements.txt file of a given project. + + This function reads the requirements.txt file located in the project's directory, + extracts the packages to be uninstalled, writes them to an uninstall.txt file, + and then uses pip to uninstall those packages. + + Args: + params (object): An object containing the project_name attribute, which specifies + the directory of the project. + + Returns: + None + """ + if os.path.exists(f"{params.project_name}/requirements.txt"): + # read the requirements.txt + uninstall_list = [] + with open(f"{params.project_name}/requirements.txt", "r", encoding="utf-8") as f: + for line in f: + if line.startswith("-"): + uninstall_list.append(line[1:]) + + # create an uninstall.txt + with open(f"{params.project_name}/uninstall.txt", "w", encoding="utf-8") as f: + for line in uninstall_list: + f.write(line) + + pipe = subprocess.Popen( + [ + "pip", + "uninstall", + "-r", + "uninstall.txt", + "-y", + ], + cwd=params.project_name, + ) + pipe.wait() + logger.info("Requirements uninstalled.") + return + + +def install_requirements(params): + """ + Installs the Python packages listed in the requirements.txt file located in the specified project directory. + + Args: + params: An object containing the project_name attribute, which specifies the directory of the project. + + Behavior: + - Checks if a requirements.txt file exists in the project directory. + - Reads the requirements.txt file and filters out lines starting with a hyphen. + - Rewrites the filtered requirements back to the requirements.txt file. + - Uses subprocess to run the pip install command on the requirements.txt file. + - Logs the installation status. 
+ + Returns: + None + """ + # check if params.project_name has a requirements.txt + if os.path.exists(f"{params.project_name}/requirements.txt"): + # install the requirements using subprocess, wait for it to finish + install_list = [] + + with open(f"{params.project_name}/requirements.txt", "r", encoding="utf-8") as f: + for line in f: + if not line.startswith("-"): + install_list.append(line) + + with open(f"{params.project_name}/requirements.txt", "w", encoding="utf-8") as f: + for line in install_list: + f.write(line) + + pipe = subprocess.Popen( + [ + "pip", + "install", + "-r", + "requirements.txt", + ], + cwd=params.project_name, + ) + pipe.wait() + logger.info("Requirements installed.") + return + logger.info("No requirements.txt found. Skipping requirements installation.") + return + + +def run_command(params): + """ + Executes a Python script with optional arguments in a specified project directory. + + Args: + params (object): An object containing the following attributes: + - project_name (str): The name of the project directory where the script is located. + - args (dict): A dictionary of arguments to pass to the script. Keys are argument names, and values are argument values. + + Raises: + ValueError: If the script.py file is not found in the specified project directory. + + Returns: + None + """ + if os.path.exists(f"{params.project_name}/script.py"): + cmd = ["python", "script.py"] + if params.args: + for arg in params.args: + cmd.append(f"--{arg}") + if params.args[arg] != "": + cmd.append(params.args[arg]) + pipe = subprocess.Popen(cmd, cwd=params.project_name) + pipe.wait() + logger.info("Command finished.") + return + raise ValueError("No script.py found.") + + +def pause_endpoint(params): + """ + Pauses a specific endpoint using the Hugging Face API. + + This function retrieves the endpoint ID from the environment variables, + extracts the username and project name from the endpoint ID, constructs + the API URL, and sends a POST request to pause the endpoint. + + Args: + params (object): An object containing the token attribute for authorization. + + Returns: + dict: The JSON response from the API call. 
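+
+    Example:
+        Illustrative only; assumes ENDPOINT_ID is set to "<username>/<project-name>"
+        and that `params.token` holds a valid Hugging Face token:
+
+        >>> os.environ["ENDPOINT_ID"] = "username/my-project"
+        >>> pause_endpoint(params)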
+ """ + endpoint_id = os.environ["ENDPOINT_ID"] + username = endpoint_id.split("/")[0] + project_name = endpoint_id.split("/")[1] + api_url = f"https://api.endpoints.huggingface.cloud/v2/endpoint/{username}/{project_name}/pause" + headers = {"Authorization": f"Bearer {params.token}"} + r = requests.post(api_url, headers=headers, timeout=120) + return r.json() diff --git a/src/autotrain/trainers/image_classification/__init__.py b/src/autotrain/trainers/image_classification/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/image_classification/__main__.py b/src/autotrain/trainers/image_classification/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..8d29c1c74a51e4e3ace4291b86d4db11b002a8b2 --- /dev/null +++ b/src/autotrain/trainers/image_classification/__main__.py @@ -0,0 +1,242 @@ +import argparse +import json + +from accelerate.state import PartialState +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from transformers import ( + AutoConfig, + AutoImageProcessor, + AutoModelForImageClassification, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.image_classification import utils +from autotrain.trainers.image_classification.params import ImageClassificationParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = ImageClassificationParams(**config) + + valid_data = None + if config.data_path == f"{config.project_name}/autotrain-data": + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + logger.info(f"Train data: {train_data}") + logger.info(f"Valid data: {valid_data}") + + classes = train_data.features[config.target_column].names + logger.info(f"Classes: {classes}") + label2id = {c: i for i, c in enumerate(classes)} + num_classes = len(classes) + + if num_classes < 2: + raise ValueError("Invalid number of classes. 
Must be greater than 1.") + + if config.valid_split is not None: + num_classes_valid = len(valid_data.unique(config.target_column)) + if num_classes_valid != num_classes: + raise ValueError( + f"Number of classes in train and valid are not the same. Training has {num_classes} and valid has {num_classes_valid}" + ) + + model_config = AutoConfig.from_pretrained( + config.model, + num_labels=num_classes, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ) + model_config._num_labels = len(label2id) + model_config.label2id = label2id + model_config.id2label = {v: k for k, v in label2id.items()} + + try: + model = AutoModelForImageClassification.from_pretrained( + config.model, + config=model_config, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + except OSError: + model = AutoModelForImageClassification.from_pretrained( + config.model, + config=model_config, + from_tf=True, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + + image_processor = AutoImageProcessor.from_pretrained( + config.model, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + train_data, valid_data = utils.process_data(train_data, valid_data, image_processor, config) + + if config.logging_steps == -1: + if config.valid_split is not None: + logging_steps = int(0.2 * len(valid_data) / config.batch_size) + else: + logging_steps = int(0.2 * len(train_data) / config.batch_size) + if logging_steps == 0: + logging_steps = 1 + if logging_steps > 25: + logging_steps = 25 + config.logging_steps = logging_steps + else: + logging_steps = config.logging_steps + + logger.info(f"Logging steps: {logging_steps}") + + training_args = dict( + output_dir=config.project_name, + per_device_train_batch_size=config.batch_size, + per_device_eval_batch_size=2 * config.batch_size, + learning_rate=config.lr, + num_train_epochs=config.epochs, + eval_strategy=config.eval_strategy if config.valid_split is not None else "no", + logging_steps=logging_steps, + save_total_limit=config.save_total_limit, + save_strategy=config.eval_strategy if config.valid_split is not None else "no", + gradient_accumulation_steps=config.gradient_accumulation, + report_to=config.log, + auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, + ) + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + if config.valid_split is not None: + early_stop = EarlyStoppingCallback( + early_stopping_patience=config.early_stopping_patience, + early_stopping_threshold=config.early_stopping_threshold, + ) + callbacks_to_use = [early_stop] + else: + callbacks_to_use = [] + + callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()]) + + args = TrainingArguments(**training_args) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks_to_use, + compute_metrics=( + utils._binary_classification_metrics if num_classes == 2 else utils._multi_class_classification_metrics + ), + ) + + trainer = Trainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data, + ) + trainer.remove_callback(PrinterCallback) + trainer.train() 
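+
+    # Illustrative arithmetic for the logging-steps heuristic above (the numbers
+    # are hypothetical): with a 500-example validation split and batch_size=8,
+    # logging_steps = int(0.2 * 500 / 8) = 12. The clamps keep the interval
+    # between 1 and 25, so progress is logged at least once every 25 steps.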
+ + logger.info("Finished training, saving model...") + trainer.save_model(config.project_name) + image_processor.save_pretrained(config.project_name) + + model_card = utils.create_model_card(config, trainer, num_classes) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w") as f: + f.write(model_card) + + if config.push_to_hub: + if PartialState().process_index == 0: + remove_autotrain_data(config) + save_training_params(config) + logger.info("Pushing model to hub...") + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, repo_id=f"{config.username}/{config.project_name}", repo_type="model" + ) + + if PartialState().process_index == 0: + pause_space(config) + + +if __name__ == "__main__": + _args = parse_args() + training_config = json.load(open(_args.training_config)) + _config = ImageClassificationParams(**training_config) + train(_config) diff --git a/src/autotrain/trainers/image_classification/dataset.py b/src/autotrain/trainers/image_classification/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..10d7e6c1afdbaf089c49b3e03b635bb82b9afa2c --- /dev/null +++ b/src/autotrain/trainers/image_classification/dataset.py @@ -0,0 +1,46 @@ +import numpy as np +import torch + + +class ImageClassificationDataset: + """ + A custom dataset class for image classification tasks. + + Args: + data (list): A list of data samples, where each sample is a dictionary containing image and target information. + transforms (callable): A function/transform that takes in an image and returns a transformed version. + config (object): A configuration object containing the column names for images and targets. + + Attributes: + data (list): The dataset containing image and target information. + transforms (callable): The transformation function to be applied to the images. + config (object): The configuration object with image and target column names. + + Methods: + __len__(): Returns the number of samples in the dataset. + __getitem__(item): Retrieves the image and target at the specified index, applies transformations, and returns them as tensors. 
+ + Example: + dataset = ImageClassificationDataset(data, transforms, config) + image, target = dataset[0] + """ + + def __init__(self, data, transforms, config): + self.data = data + self.transforms = transforms + self.config = config + + def __len__(self): + return len(self.data) + + def __getitem__(self, item): + image = self.data[item][self.config.image_column] + target = int(self.data[item][self.config.target_column]) + + image = self.transforms(image=np.array(image.convert("RGB")))["image"] + image = np.transpose(image, (2, 0, 1)).astype(np.float32) + + return { + "pixel_values": torch.tensor(image, dtype=torch.float), + "labels": torch.tensor(target, dtype=torch.long), + } diff --git a/src/autotrain/trainers/image_classification/params.py b/src/autotrain/trainers/image_classification/params.py new file mode 100644 index 0000000000000000000000000000000000000000..c213972066725ed05292f6828859084acdd00387 --- /dev/null +++ b/src/autotrain/trainers/image_classification/params.py @@ -0,0 +1,70 @@ +from typing import Optional + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class ImageClassificationParams(AutoTrainParams): + """ + ImageClassificationParams is a configuration class for image classification training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Pre-trained model name or path. Default is "google/vit-base-patch16-224". + username (Optional[str]): Hugging Face account username. + lr (float): Learning rate for the optimizer. Default is 5e-5. + epochs (int): Number of epochs for training. Default is 3. + batch_size (int): Batch size for training. Default is 8. + warmup_ratio (float): Warmup ratio for learning rate scheduler. Default is 0.1. + gradient_accumulation (int): Number of gradient accumulation steps. Default is 1. + optimizer (str): Optimizer type. Default is "adamw_torch". + scheduler (str): Learning rate scheduler type. Default is "linear". + weight_decay (float): Weight decay for the optimizer. Default is 0.0. + max_grad_norm (float): Maximum gradient norm for clipping. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project for output directory. Default is "project-name". + auto_find_batch_size (bool): Automatically find optimal batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). + save_total_limit (int): Maximum number of checkpoints to keep. Default is 1. + token (Optional[str]): Hugging Face Hub token for authentication. + push_to_hub (bool): Whether to push the model to Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy during training. Default is "epoch". + image_column (str): Column name for images in the dataset. Default is "image". + target_column (str): Column name for target labels in the dataset. Default is "target". + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement for early stopping. Default is 5. + early_stopping_threshold (float): Threshold for early stopping. Default is 0.01. 
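+
+    Example (a minimal sketch; the dataset id and project name are assumptions):
+        params = ImageClassificationParams(
+            data_path="username/my-image-dataset",
+            model="google/vit-base-patch16-224",
+            project_name="my-image-classifier",
+            epochs=3,
+            batch_size=8,
+        )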
+ """ + + data_path: str = Field(None, title="Path to the dataset") + model: str = Field("google/vit-base-patch16-224", title="Pre-trained model name or path") + username: Optional[str] = Field(None, title="Hugging Face account username") + lr: float = Field(5e-5, title="Learning rate for the optimizer") + epochs: int = Field(3, title="Number of epochs for training") + batch_size: int = Field(8, title="Batch size for training") + warmup_ratio: float = Field(0.1, title="Warmup ratio for learning rate scheduler") + gradient_accumulation: int = Field(1, title="Number of gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer type") + scheduler: str = Field("linear", title="Learning rate scheduler type") + weight_decay: float = Field(0.0, title="Weight decay for the optimizer") + max_grad_norm: float = Field(1.0, title="Maximum gradient norm for clipping") + seed: int = Field(42, title="Random seed for reproducibility") + train_split: str = Field("train", title="Name of the training data split") + valid_split: Optional[str] = Field(None, title="Name of the validation data split") + logging_steps: int = Field(-1, title="Number of steps between logging") + project_name: str = Field("project-name", title="Name of the project for output directory") + auto_find_batch_size: bool = Field(False, title="Automatically find optimal batch size") + mixed_precision: Optional[str] = Field(None, title="Mixed precision training mode (fp16, bf16, or None)") + save_total_limit: int = Field(1, title="Maximum number of checkpoints to keep") + token: Optional[str] = Field(None, title="Hugging Face Hub token for authentication") + push_to_hub: bool = Field(False, title="Whether to push the model to Hugging Face Hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy during training") + image_column: str = Field("image", title="Column name for images in the dataset") + target_column: str = Field("target", title="Column name for target labels in the dataset") + log: str = Field("none", title="Logging method for experiment tracking") + early_stopping_patience: int = Field(5, title="Number of epochs with no improvement for early stopping") + early_stopping_threshold: float = Field(0.01, title="Threshold for early stopping") diff --git a/src/autotrain/trainers/image_classification/utils.py b/src/autotrain/trainers/image_classification/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6b8b2421ab6b41d4ed571c7dbd79f984633e207c --- /dev/null +++ b/src/autotrain/trainers/image_classification/utils.py @@ -0,0 +1,218 @@ +import os + +import albumentations as A +import numpy as np +from sklearn import metrics + +from autotrain.trainers.image_classification.dataset import ImageClassificationDataset + + +BINARY_CLASSIFICATION_EVAL_METRICS = ( + "eval_loss", + "eval_accuracy", + "eval_f1", + "eval_auc", + "eval_precision", + "eval_recall", +) + +MULTI_CLASS_CLASSIFICATION_EVAL_METRICS = ( + "eval_loss", + "eval_accuracy", + "eval_f1_macro", + "eval_f1_micro", + "eval_f1_weighted", + "eval_precision_macro", + "eval_precision_micro", + "eval_precision_weighted", + "eval_recall_macro", + "eval_recall_micro", + "eval_recall_weighted", +) + +MODEL_CARD = """ +--- +tags: +- autotrain +- transformers +- image-classification{base_model} +widget: +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg + example_title: Tiger +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/teapot.jpg + example_title: Teapot +- src: 
https://huggingface.co/datasets/mishig/sample_images/resolve/main/palace.jpg + example_title: Palace{dataset_tag} +--- + +# Model Trained Using AutoTrain + +- Problem type: Image Classification + +## Validation Metrics +{validation_metrics} +""" + + +def _binary_classification_metrics(pred): + """ + Computes various binary classification metrics given the predictions and labels. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + raw_predictions (numpy.ndarray): The raw prediction scores from the model. + labels (numpy.ndarray): The true labels. + + Returns: + dict: A dictionary containing the following metrics: + - f1 (float): The F1 score. + - precision (float): The precision score. + - recall (float): The recall score. + - auc (float): The Area Under the ROC Curve (AUC) score. + - accuracy (float): The accuracy score. + """ + raw_predictions, labels = pred + predictions = np.argmax(raw_predictions, axis=1) + result = { + "f1": metrics.f1_score(labels, predictions), + "precision": metrics.precision_score(labels, predictions), + "recall": metrics.recall_score(labels, predictions), + "auc": metrics.roc_auc_score(labels, raw_predictions[:, 1]), + "accuracy": metrics.accuracy_score(labels, predictions), + } + return result + + +def _multi_class_classification_metrics(pred): + """ + Compute various classification metrics for multi-class classification. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + - raw_predictions (numpy.ndarray): The raw prediction scores for each class. + - labels (numpy.ndarray): The true labels. + + Returns: + dict: A dictionary containing the following metrics: + - "f1_macro": F1 score with macro averaging. + - "f1_micro": F1 score with micro averaging. + - "f1_weighted": F1 score with weighted averaging. + - "precision_macro": Precision score with macro averaging. + - "precision_micro": Precision score with micro averaging. + - "precision_weighted": Precision score with weighted averaging. + - "recall_macro": Recall score with macro averaging. + - "recall_micro": Recall score with micro averaging. + - "recall_weighted": Recall score with weighted averaging. + - "accuracy": Accuracy score. + """ + raw_predictions, labels = pred + predictions = np.argmax(raw_predictions, axis=1) + results = { + "f1_macro": metrics.f1_score(labels, predictions, average="macro"), + "f1_micro": metrics.f1_score(labels, predictions, average="micro"), + "f1_weighted": metrics.f1_score(labels, predictions, average="weighted"), + "precision_macro": metrics.precision_score(labels, predictions, average="macro"), + "precision_micro": metrics.precision_score(labels, predictions, average="micro"), + "precision_weighted": metrics.precision_score(labels, predictions, average="weighted"), + "recall_macro": metrics.recall_score(labels, predictions, average="macro"), + "recall_micro": metrics.recall_score(labels, predictions, average="micro"), + "recall_weighted": metrics.recall_score(labels, predictions, average="weighted"), + "accuracy": metrics.accuracy_score(labels, predictions), + } + return results + + +def process_data(train_data, valid_data, image_processor, config): + """ + Processes training and validation data for image classification. + + Args: + train_data (Dataset): The training dataset. + valid_data (Dataset or None): The validation dataset. Can be None if no validation data is provided. + image_processor (ImageProcessor): An object containing image processing parameters such as size, mean, and std. 
+ config (dict): Configuration dictionary containing additional parameters for dataset processing. + + Returns: + tuple: A tuple containing the processed training dataset and the processed validation dataset (or None if no validation data is provided). + """ + if "shortest_edge" in image_processor.size: + size = image_processor.size["shortest_edge"] + else: + size = (image_processor.size["height"], image_processor.size["width"]) + try: + height, width = size + except TypeError: + height = size + width = size + + train_transforms = A.Compose( + [ + A.RandomResizedCrop(height=height, width=width), + A.RandomRotate90(), + A.HorizontalFlip(p=0.5), + A.RandomBrightnessContrast(p=0.2), + A.Normalize(mean=image_processor.image_mean, std=image_processor.image_std), + ] + ) + + val_transforms = A.Compose( + [ + A.Resize(height=height, width=width), + A.Normalize(mean=image_processor.image_mean, std=image_processor.image_std), + ] + ) + train_data = ImageClassificationDataset(train_data, train_transforms, config) + if valid_data is not None: + valid_data = ImageClassificationDataset(valid_data, val_transforms, config) + return train_data, valid_data + return train_data, None + + +def create_model_card(config, trainer, num_classes): + """ + Generates a model card for the given configuration and trainer. + + Args: + config (object): Configuration object containing various settings. + trainer (object): Trainer object used for model training and evaluation. + num_classes (int): Number of classes in the classification task. + + Returns: + str: A formatted string representing the model card. + + The function evaluates the model if a validation split is provided in the config. + It then formats the evaluation scores based on whether the task is binary or multi-class classification. + If no validation split is provided, it notes that no validation metrics are available. + + The function also checks the data path and model path in the config to determine if they are directories. + Based on these checks, it formats the dataset tag and base model information accordingly. + + Finally, it uses the formatted information to create and return the model card string. 
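+
+    Example (illustrative; assumes training has finished and that config.valid_split
+        was set so that validation metrics exist):
+        card = create_model_card(config, trainer, num_classes=2)
+        with open(f"{config.project_name}/README.md", "w") as f:
+            f.write(card)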
+ """ + if config.valid_split is not None: + eval_scores = trainer.evaluate() + valid_metrics = ( + BINARY_CLASSIFICATION_EVAL_METRICS if num_classes == 2 else MULTI_CLASS_CLASSIFICATION_EVAL_METRICS + ) + eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items() if k in valid_metrics] + eval_scores = "\n\n".join(eval_scores) + + else: + eval_scores = "No validation metrics available" + + if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path): + dataset_tag = "" + else: + dataset_tag = f"\ndatasets:\n- {config.data_path}" + + if os.path.isdir(config.model): + base_model = "" + else: + base_model = f"\nbase_model: {config.model}" + + model_card = MODEL_CARD.format( + dataset_tag=dataset_tag, + validation_metrics=eval_scores, + base_model=base_model, + ) + return model_card diff --git a/src/autotrain/trainers/image_regression/__init__.py b/src/autotrain/trainers/image_regression/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/image_regression/__main__.py b/src/autotrain/trainers/image_regression/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..a4db2e06dec91e5de6c0cb3fd88c0088ec8ecff9 --- /dev/null +++ b/src/autotrain/trainers/image_regression/__main__.py @@ -0,0 +1,226 @@ +import argparse +import json + +from accelerate.state import PartialState +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from transformers import ( + AutoConfig, + AutoImageProcessor, + AutoModelForImageClassification, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.image_regression import utils +from autotrain.trainers.image_regression.params import ImageRegressionParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = ImageRegressionParams(**config) + + valid_data = None + if config.data_path == f"{config.project_name}/autotrain-data": + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + 
trust_remote_code=ALLOW_REMOTE_CODE, + ) + + logger.info(f"Train data: {train_data}") + logger.info(f"Valid data: {valid_data}") + + model_config = AutoConfig.from_pretrained( + config.model, + num_labels=1, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ) + model_config._num_labels = 1 + label2id = {"target": 0} + model_config.label2id = label2id + model_config.id2label = {v: k for k, v in label2id.items()} + + try: + model = AutoModelForImageClassification.from_pretrained( + config.model, + config=model_config, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + except OSError: + model = AutoModelForImageClassification.from_pretrained( + config.model, + config=model_config, + from_tf=True, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + + image_processor = AutoImageProcessor.from_pretrained( + config.model, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + train_data, valid_data = utils.process_data(train_data, valid_data, image_processor, config) + + if config.logging_steps == -1: + if config.valid_split is not None: + logging_steps = int(0.2 * len(valid_data) / config.batch_size) + else: + logging_steps = int(0.2 * len(train_data) / config.batch_size) + if logging_steps == 0: + logging_steps = 1 + if logging_steps > 25: + logging_steps = 25 + config.logging_steps = logging_steps + else: + logging_steps = config.logging_steps + + logger.info(f"Logging steps: {logging_steps}") + + training_args = dict( + output_dir=config.project_name, + per_device_train_batch_size=config.batch_size, + per_device_eval_batch_size=2 * config.batch_size, + learning_rate=config.lr, + num_train_epochs=config.epochs, + eval_strategy=config.eval_strategy if config.valid_split is not None else "no", + logging_steps=logging_steps, + save_total_limit=config.save_total_limit, + save_strategy=config.eval_strategy if config.valid_split is not None else "no", + gradient_accumulation_steps=config.gradient_accumulation, + report_to=config.log, + auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, + ) + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + if config.valid_split is not None: + early_stop = EarlyStoppingCallback( + early_stopping_patience=config.early_stopping_patience, + early_stopping_threshold=config.early_stopping_threshold, + ) + callbacks_to_use = [early_stop] + else: + callbacks_to_use = [] + + callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()]) + + args = TrainingArguments(**training_args) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks_to_use, + compute_metrics=utils.image_regression_metrics, + ) + + trainer = Trainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data, + ) + trainer.remove_callback(PrinterCallback) + trainer.train() + + logger.info("Finished training, saving model...") + trainer.save_model(config.project_name) + image_processor.save_pretrained(config.project_name) + + model_card = utils.create_model_card(config, trainer) + + # save model card to output directory as 
README.md + with open(f"{config.project_name}/README.md", "w") as f: + f.write(model_card) + + if config.push_to_hub: + if PartialState().process_index == 0: + remove_autotrain_data(config) + save_training_params(config) + logger.info("Pushing model to hub...") + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, repo_id=f"{config.username}/{config.project_name}", repo_type="model" + ) + + if PartialState().process_index == 0: + pause_space(config) + + +if __name__ == "__main__": + _args = parse_args() + training_config = json.load(open(_args.training_config)) + _config = ImageRegressionParams(**training_config) + train(_config) diff --git a/src/autotrain/trainers/image_regression/dataset.py b/src/autotrain/trainers/image_regression/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..43c01a13e170604be56e3e951f280b7ac8fbdeef --- /dev/null +++ b/src/autotrain/trainers/image_regression/dataset.py @@ -0,0 +1,42 @@ +import numpy as np +import torch + + +class ImageRegressionDataset: + """ + A dataset class for image regression tasks. + + Args: + data (list): A list of data points where each data point is a dictionary containing image and target information. + transforms (callable): A function/transform that takes in an image and returns a transformed version. + config (object): A configuration object that contains the column names for images and targets. + + Attributes: + data (list): The input data. + transforms (callable): The transformation function. + config (object): The configuration object. + + Methods: + __len__(): Returns the number of data points in the dataset. + __getitem__(item): Returns a dictionary containing the transformed image and the target value for the given index. + """ + + def __init__(self, data, transforms, config): + self.data = data + self.transforms = transforms + self.config = config + + def __len__(self): + return len(self.data) + + def __getitem__(self, item): + image = self.data[item][self.config.image_column] + target = self.data[item][self.config.target_column] + + image = self.transforms(image=np.array(image.convert("RGB")))["image"] + image = np.transpose(image, (2, 0, 1)).astype(np.float32) + + return { + "pixel_values": torch.tensor(image, dtype=torch.float), + "labels": torch.tensor(target, dtype=torch.float), + } diff --git a/src/autotrain/trainers/image_regression/params.py b/src/autotrain/trainers/image_regression/params.py new file mode 100644 index 0000000000000000000000000000000000000000..c47b1eb7e85bfc03cc0e83be8182032fe4da86b5 --- /dev/null +++ b/src/autotrain/trainers/image_regression/params.py @@ -0,0 +1,70 @@ +from typing import Optional + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class ImageRegressionParams(AutoTrainParams): + """ + ImageRegressionParams is a configuration class for image regression training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to use. Default is "google/vit-base-patch16-224". + username (Optional[str]): Hugging Face Username. + lr (float): Learning rate. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + batch_size (int): Training batch size. Default is 8. + warmup_ratio (float): Warmup proportion. Default is 0.1. + gradient_accumulation (int): Gradient accumulation steps. Default is 1. 
+ optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Scheduler to use. Default is "linear". + weight_decay (float): Weight decay. Default is 0.0. + max_grad_norm (float): Max gradient norm. Default is 1.0. + seed (int): Random seed. Default is 42. + train_split (str): Train split name. Default is "train". + valid_split (Optional[str]): Validation split name. + logging_steps (int): Logging steps. Default is -1. + project_name (str): Output directory name. Default is "project-name". + auto_find_batch_size (bool): Whether to auto find batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision type (fp16, bf16, or None). + save_total_limit (int): Save total limit. Default is 1. + token (Optional[str]): Hub Token. + push_to_hub (bool): Whether to push to hub. Default is False. + eval_strategy (str): Evaluation strategy. Default is "epoch". + image_column (str): Image column name. Default is "image". + target_column (str): Target column name. Default is "target". + log (str): Logging using experiment tracking. Default is "none". + early_stopping_patience (int): Early stopping patience. Default is 5. + early_stopping_threshold (float): Early stopping threshold. Default is 0.01. + """ + + data_path: str = Field(None, title="Data path") + model: str = Field("google/vit-base-patch16-224", title="Model name") + username: Optional[str] = Field(None, title="Hugging Face Username") + lr: float = Field(5e-5, title="Learning rate") + epochs: int = Field(3, title="Number of training epochs") + batch_size: int = Field(8, title="Training batch size") + warmup_ratio: float = Field(0.1, title="Warmup proportion") + gradient_accumulation: int = Field(1, title="Gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer") + scheduler: str = Field("linear", title="Scheduler") + weight_decay: float = Field(0.0, title="Weight decay") + max_grad_norm: float = Field(1.0, title="Max gradient norm") + seed: int = Field(42, title="Seed") + train_split: str = Field("train", title="Train split") + valid_split: Optional[str] = Field(None, title="Validation split") + logging_steps: int = Field(-1, title="Logging steps") + project_name: str = Field("project-name", title="Output directory") + auto_find_batch_size: bool = Field(False, title="Auto find batch size") + mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") + save_total_limit: int = Field(1, title="Save total limit") + token: Optional[str] = Field(None, title="Hub Token") + push_to_hub: bool = Field(False, title="Push to hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy") + image_column: str = Field("image", title="Image column") + target_column: str = Field("target", title="Target column") + log: str = Field("none", title="Logging using experiment tracking") + early_stopping_patience: int = Field(5, title="Early stopping patience") + early_stopping_threshold: float = Field(0.01, title="Early stopping threshold") diff --git a/src/autotrain/trainers/image_regression/utils.py b/src/autotrain/trainers/image_regression/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..66d94761ebcae7f20271995681c1174f1fcbf5d3 --- /dev/null +++ b/src/autotrain/trainers/image_regression/utils.py @@ -0,0 +1,174 @@ +import os + +import albumentations as A +import numpy as np +from sklearn import metrics + +from autotrain.trainers.image_regression.dataset import ImageRegressionDataset + + +VALID_METRICS = [ + "eval_loss", + "eval_mse", + 
"eval_mae", + "eval_r2", + "eval_rmse", + "eval_explained_variance", +] + +MODEL_CARD = """ +--- +library_name: transformers +tags: +- autotrain +- vision +- image-classification +- image-regression{base_model} +widget: +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg + example_title: Tiger +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/teapot.jpg + example_title: Teapot +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/palace.jpg + example_title: Palace{dataset_tag} +--- + +# Model Trained Using AutoTrain + +- Problem type: Image Regression + +## Validation Metrics + +{validation_metrics} +""" + + +def image_regression_metrics(pred): + """ + Calculate various regression metrics for image regression tasks. + + Args: + pred (tuple): A tuple containing raw predictions and labels. + raw_predictions should be a list of lists or a list of numpy.float32 values. + labels should be a list of true values. + + Returns: + dict: A dictionary containing the calculated metrics: + - 'mse': Mean Squared Error + - 'mae': Mean Absolute Error + - 'r2': R^2 Score + - 'rmse': Root Mean Squared Error + - 'explained_variance': Explained Variance Score + + If an error occurs during the calculation of a metric, the value for that metric will be -999. + """ + raw_predictions, labels = pred + + try: + raw_predictions = [r for preds in raw_predictions for r in preds] + except TypeError as err: + if "numpy.float32" not in str(err): + raise Exception(err) + + pred_dict = {} + metrics_to_calculate = { + "mse": metrics.mean_squared_error, + "mae": metrics.mean_absolute_error, + "r2": metrics.r2_score, + "rmse": lambda y_true, y_pred: np.sqrt(metrics.mean_squared_error(y_true, y_pred)), + "explained_variance": metrics.explained_variance_score, + } + + for key, func in metrics_to_calculate.items(): + try: + pred_dict[key] = float(func(labels, raw_predictions)) + except Exception: + pred_dict[key] = -999 + + return pred_dict + + +def process_data(train_data, valid_data, image_processor, config): + """ + Processes training and validation data by applying image transformations. + + Args: + train_data (Dataset): The training dataset. + valid_data (Dataset or None): The validation dataset. If None, only training data is processed. + image_processor (ImageProcessor): An object containing image processing parameters such as size, mean, and std. + config (dict): Configuration dictionary containing additional parameters for the dataset. + + Returns: + tuple: A tuple containing the processed training dataset and the processed validation dataset (or None if valid_data is None). 
+ """ + if "shortest_edge" in image_processor.size: + size = image_processor.size["shortest_edge"] + else: + size = (image_processor.size["height"], image_processor.size["width"]) + try: + height, width = size + except TypeError: + height = size + width = size + + train_transforms = A.Compose( + [ + A.RandomResizedCrop(height=height, width=width), + A.RandomRotate90(), + A.HorizontalFlip(p=0.5), + A.RandomBrightnessContrast(p=0.2), + A.Normalize(mean=image_processor.image_mean, std=image_processor.image_std), + ] + ) + + val_transforms = A.Compose( + [ + A.Resize(height=height, width=width), + A.Normalize(mean=image_processor.image_mean, std=image_processor.image_std), + ] + ) + train_data = ImageRegressionDataset(train_data, train_transforms, config) + if valid_data is not None: + valid_data = ImageRegressionDataset(valid_data, val_transforms, config) + return train_data, valid_data + return train_data, None + + +def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing various settings such as + valid_split, data_path, project_name, and model. + trainer (object): Trainer object used to evaluate the model if validation + split is provided. + + Returns: + str: A formatted model card string containing dataset information, + validation metrics, and base model details. + """ + if config.valid_split is not None: + eval_scores = trainer.evaluate() + eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items() if k in VALID_METRICS] + eval_scores = "\n\n".join(eval_scores) + + else: + eval_scores = "No validation metrics available" + + if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path): + dataset_tag = "" + else: + dataset_tag = f"\ndatasets:\n- {config.data_path}" + + if os.path.isdir(config.model): + base_model = "" + else: + base_model = f"\nbase_model: {config.model}" + + model_card = MODEL_CARD.format( + dataset_tag=dataset_tag, + validation_metrics=eval_scores, + base_model=base_model, + ) + return model_card diff --git a/src/autotrain/trainers/object_detection/__init__.py b/src/autotrain/trainers/object_detection/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/object_detection/__main__.py b/src/autotrain/trainers/object_detection/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..bd6a7b46054b5b2f15fd3cc3513138989caf10b7 --- /dev/null +++ b/src/autotrain/trainers/object_detection/__main__.py @@ -0,0 +1,236 @@ +import argparse +import json +from functools import partial + +from accelerate.state import PartialState +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from transformers import ( + AutoConfig, + AutoImageProcessor, + AutoModelForObjectDetection, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.object_detection import utils +from autotrain.trainers.object_detection.params import ObjectDetectionParams + + +def parse_args(): + # get training_config.json from the end user + parser = 
argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = ObjectDetectionParams(**config) + + valid_data = None + if config.data_path == f"{config.project_name}/autotrain-data": + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + logger.info(f"Train data: {train_data}") + logger.info(f"Valid data: {valid_data}") + + categories = train_data.features[config.objects_column].feature["category"].names + id2label = dict(enumerate(categories)) + label2id = {v: k for k, v in id2label.items()} + + model_config = AutoConfig.from_pretrained( + config.model, + label2id=label2id, + id2label=id2label, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ) + try: + model = AutoModelForObjectDetection.from_pretrained( + config.model, + config=model_config, + ignore_mismatched_sizes=True, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ) + except OSError: + model = AutoModelForObjectDetection.from_pretrained( + config.model, + config=model_config, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + from_tf=True, + ) + image_processor = AutoImageProcessor.from_pretrained( + config.model, + token=config.token, + do_pad=False, + do_resize=False, + size={"longest_edge": config.image_square_size}, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + train_data, valid_data = utils.process_data(train_data, valid_data, image_processor, config) + + if config.logging_steps == -1: + if config.valid_split is not None: + logging_steps = int(0.2 * len(valid_data) / config.batch_size) + else: + logging_steps = int(0.2 * len(train_data) / config.batch_size) + if logging_steps == 0: + logging_steps = 1 + if logging_steps > 25: + logging_steps = 25 + config.logging_steps = logging_steps + else: + logging_steps = config.logging_steps + + logger.info(f"Logging steps: {logging_steps}") + + training_args = dict( + output_dir=config.project_name, + per_device_train_batch_size=config.batch_size, + per_device_eval_batch_size=2 * config.batch_size, + learning_rate=config.lr, + num_train_epochs=config.epochs, + eval_strategy=config.eval_strategy if config.valid_split is not None else "no", + logging_steps=logging_steps, + save_total_limit=config.save_total_limit, + save_strategy=config.eval_strategy if config.valid_split is not None else "no", + gradient_accumulation_steps=config.gradient_accumulation, + report_to=config.log, + 
auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, + ) + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + if config.valid_split is not None: + training_args["eval_do_concat_batches"] = False + early_stop = EarlyStoppingCallback( + early_stopping_patience=config.early_stopping_patience, + early_stopping_threshold=config.early_stopping_threshold, + ) + callbacks_to_use = [early_stop] + else: + callbacks_to_use = [] + + callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()]) + + _compute_metrics_fn = partial( + utils.object_detection_metrics, image_processor=image_processor, id2label=id2label, threshold=0.0 + ) + + args = TrainingArguments(**training_args) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks_to_use, + data_collator=utils.collate_fn, + tokenizer=image_processor, + compute_metrics=_compute_metrics_fn, + ) + + trainer = Trainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data, + ) + trainer.remove_callback(PrinterCallback) + trainer.train() + + logger.info("Finished training, saving model...") + trainer.save_model(config.project_name) + image_processor.save_pretrained(config.project_name) + + model_card = utils.create_model_card(config, trainer) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w") as f: + f.write(model_card) + + if config.push_to_hub: + if PartialState().process_index == 0: + remove_autotrain_data(config) + save_training_params(config) + logger.info("Pushing model to hub...") + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, repo_id=f"{config.username}/{config.project_name}", repo_type="model" + ) + + if PartialState().process_index == 0: + pause_space(config) + + +if __name__ == "__main__": + _args = parse_args() + training_config = json.load(open(_args.training_config)) + _config = ObjectDetectionParams(**training_config) + train(_config) diff --git a/src/autotrain/trainers/object_detection/dataset.py b/src/autotrain/trainers/object_detection/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..6d9315f60ae94d8206cf3f62f9b0aac8f34b6f80 --- /dev/null +++ b/src/autotrain/trainers/object_detection/dataset.py @@ -0,0 +1,60 @@ +import numpy as np + + +class ObjectDetectionDataset: + """ + A dataset class for object detection tasks. + + Args: + data (list): A list of data entries where each entry is a dictionary containing image and object information. + transforms (callable): A function or transform to apply to the images and bounding boxes. + image_processor (callable): A function or processor to convert images and annotations into the desired format. + config (object): A configuration object containing column names for images and objects. + + Attributes: + data (list): The dataset containing image and object information. + transforms (callable): The transform function to apply to the images and bounding boxes. 
+        image_processor (callable): The processor to convert images and annotations into the desired format.
+        config (object): The configuration object with column names for images and objects.
+
+    Methods:
+        __len__(): Returns the number of items in the dataset.
+        __getitem__(item): Retrieves and processes the image and annotations for the given index.
+
+    Example:
+        dataset = ObjectDetectionDataset(data, transforms, image_processor, config)
+        image_data = dataset[0]
+    """
+
+    def __init__(self, data, transforms, image_processor, config):
+        self.data = data
+        self.transforms = transforms
+        self.image_processor = image_processor
+        self.config = config
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, item):
+        image = self.data[item][self.config.image_column]
+        objects = self.data[item][self.config.objects_column]
+        output = self.transforms(
+            image=np.array(image.convert("RGB")), bboxes=objects["bbox"], category=objects["category"]
+        )
+        image = output["image"]
+        annotations = []
+        for j in range(len(output["bboxes"])):
+            annotations.append(
+                {
+                    "image_id": str(item),
+                    "category_id": output["category"][j],
+                    "iscrowd": 0,
+                    "area": objects["bbox"][j][2] * objects["bbox"][j][3],  # bbox format: [x, y, w, h]
+                    "bbox": output["bboxes"][j],
+                }
+            )
+        annotations = {"annotations": annotations, "image_id": str(item)}
+        result = self.image_processor(images=image, annotations=annotations, return_tensors="pt")
+        result["pixel_values"] = result["pixel_values"][0]
+        result["labels"] = result["labels"][0]
+        return result
diff --git a/src/autotrain/trainers/object_detection/params.py b/src/autotrain/trainers/object_detection/params.py
new file mode 100644
index 0000000000000000000000000000000000000000..19cca756017bd622b2c222a6995280e9d2cbd925
--- /dev/null
+++ b/src/autotrain/trainers/object_detection/params.py
@@ -0,0 +1,74 @@
+from typing import Optional
+
+from pydantic import Field
+
+from autotrain.trainers.common import AutoTrainParams
+
+
+class ObjectDetectionParams(AutoTrainParams):
+    """
+    ObjectDetectionParams is a configuration class for object detection training parameters.
+
+    Attributes:
+        data_path (str): Path to the dataset.
+        model (str): Name of the model to be used. Default is "google/vit-base-patch16-224".
+        username (Optional[str]): Hugging Face Username.
+        lr (float): Learning rate. Default is 5e-5.
+        epochs (int): Number of training epochs. Default is 3.
+        batch_size (int): Training batch size. Default is 8.
+        warmup_ratio (float): Warmup proportion. Default is 0.1.
+        gradient_accumulation (int): Gradient accumulation steps. Default is 1.
+        optimizer (str): Optimizer to be used. Default is "adamw_torch".
+        scheduler (str): Scheduler to be used. Default is "linear".
+        weight_decay (float): Weight decay. Default is 0.0.
+        max_grad_norm (float): Max gradient norm. Default is 1.0.
+        seed (int): Random seed. Default is 42.
+        train_split (str): Name of the training data split. Default is "train".
+        valid_split (Optional[str]): Name of the validation data split.
+        logging_steps (int): Number of steps between logging. Default is -1.
+        project_name (str): Name of the project for output directory. Default is "project-name".
+        auto_find_batch_size (bool): Whether to automatically find batch size. Default is False.
+        mixed_precision (Optional[str]): Mixed precision type (fp16, bf16, or None).
+        save_total_limit (int): Total number of checkpoints to save. Default is 1.
+        token (Optional[str]): Hub Token for authentication.
+ push_to_hub (bool): Whether to push the model to the Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy. Default is "epoch". + image_column (str): Name of the image column in the dataset. Default is "image". + objects_column (str): Name of the target column in the dataset. Default is "objects". + log (str): Logging method for experiment tracking. Default is "none". + image_square_size (Optional[int]): Longest size to which the image will be resized, then padded to square. Default is 600. + early_stopping_patience (int): Number of epochs with no improvement after which training will be stopped. Default is 5. + early_stopping_threshold (float): Minimum change to qualify as an improvement. Default is 0.01. + """ + + data_path: str = Field(None, title="Data path") + model: str = Field("google/vit-base-patch16-224", title="Model name") + username: Optional[str] = Field(None, title="Hugging Face Username") + lr: float = Field(5e-5, title="Learning rate") + epochs: int = Field(3, title="Number of training epochs") + batch_size: int = Field(8, title="Training batch size") + warmup_ratio: float = Field(0.1, title="Warmup proportion") + gradient_accumulation: int = Field(1, title="Gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer") + scheduler: str = Field("linear", title="Scheduler") + weight_decay: float = Field(0.0, title="Weight decay") + max_grad_norm: float = Field(1.0, title="Max gradient norm") + seed: int = Field(42, title="Seed") + train_split: str = Field("train", title="Train split") + valid_split: Optional[str] = Field(None, title="Validation split") + logging_steps: int = Field(-1, title="Logging steps") + project_name: str = Field("project-name", title="Output directory") + auto_find_batch_size: bool = Field(False, title="Auto find batch size") + mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") + save_total_limit: int = Field(1, title="Save total limit") + token: Optional[str] = Field(None, title="Hub Token") + push_to_hub: bool = Field(False, title="Push to hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy") + image_column: str = Field("image", title="Image column") + objects_column: str = Field("objects", title="Target column") + log: str = Field("none", title="Logging using experiment tracking") + image_square_size: Optional[int] = Field( + 600, title="Image longest size will be resized to this value, then image will be padded to square." 
+ ) + early_stopping_patience: int = Field(5, title="Early stopping patience") + early_stopping_threshold: float = Field(0.01, title="Early stopping threshold") diff --git a/src/autotrain/trainers/object_detection/utils.py b/src/autotrain/trainers/object_detection/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..902500050005f9edbe3a8907ffbfaa20866a8bbb --- /dev/null +++ b/src/autotrain/trainers/object_detection/utils.py @@ -0,0 +1,270 @@ +import os +from dataclasses import dataclass + +import albumentations as A +import torch +from torchmetrics.detection.mean_ap import MeanAveragePrecision +from transformers.image_transforms import center_to_corners_format + +from autotrain.trainers.object_detection.dataset import ObjectDetectionDataset + + +VALID_METRICS = ( + "eval_loss", + "eval_map", + "eval_map_50", + "eval_map_75", + "eval_map_small", + "eval_map_medium", + "eval_map_large", + "eval_mar_1", + "eval_mar_10", + "eval_mar_100", + "eval_mar_small", + "eval_mar_medium", + "eval_mar_large", +) + +MODEL_CARD = """ +--- +library_name: transformers +tags: +- autotrain +- object-detection +- vision{base_model} +widget: +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/tiger.jpg + example_title: Tiger +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/teapot.jpg + example_title: Teapot +- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/palace.jpg + example_title: Palace{dataset_tag} +--- + +# Model Trained Using AutoTrain + +- Problem type: Object Detection + +## Validation Metrics +{validation_metrics} +""" + + +def collate_fn(batch): + """ + Collates a batch of data for object detection training. + + Args: + batch (list): A list of dictionaries, where each dictionary contains + 'pixel_values', 'labels', and optionally 'pixel_mask'. + + Returns: + dict: A dictionary with the following keys: + - 'pixel_values' (torch.Tensor): A tensor containing stacked pixel values from the batch. + - 'labels' (list): A list of labels from the batch. + - 'pixel_mask' (torch.Tensor, optional): A tensor containing stacked pixel masks from the batch, + if 'pixel_mask' is present in the input batch. + """ + data = {} + data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch]) + data["labels"] = [x["labels"] for x in batch] + if "pixel_mask" in batch[0]: + data["pixel_mask"] = torch.stack([x["pixel_mask"] for x in batch]) + return data + + +def process_data(train_data, valid_data, image_processor, config): + """ + Processes training and validation data for object detection. + + Args: + train_data (list): List of training data samples. + valid_data (list or None): List of validation data samples. If None, only training data is processed. + image_processor (object): An image processor object that contains image processing configurations. + config (dict): Configuration dictionary containing various settings for data processing. + + Returns: + tuple: A tuple containing processed training data and validation data (if provided). If validation data is not provided, the second element of the tuple is None. 
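+
+    Example (illustrative; raw_train and config are assumptions; bounding boxes are
+        expected in the COCO convention [x_min, y_min, width, height] used by
+        A.BboxParams(format="coco") below):
+        train_ds, valid_ds = process_data(raw_train, None, image_processor, config)
+        sample = train_ds[0]  # dict with "pixel_values" and "labels"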
+ """ + max_size = image_processor.size["longest_edge"] + basic_transforms = [ + A.LongestMaxSize(max_size=max_size), + A.PadIfNeeded(max_size, max_size, border_mode=0, value=(128, 128, 128), position="top_left"), + ] + train_transforms = A.Compose( + [ + A.Compose( + [ + A.SmallestMaxSize(max_size=max_size, p=1.0), + A.RandomSizedBBoxSafeCrop(height=max_size, width=max_size, p=1.0), + ], + p=0.2, + ), + A.OneOf( + [ + A.Blur(blur_limit=7, p=0.5), + A.MotionBlur(blur_limit=7, p=0.5), + A.Defocus(radius=(1, 5), alias_blur=(0.1, 0.25), p=0.1), + ], + p=0.1, + ), + A.Perspective(p=0.1), + A.HorizontalFlip(p=0.5), + A.RandomBrightnessContrast(p=0.5), + A.HueSaturationValue(p=0.1), + *basic_transforms, + ], + bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True, min_area=25), + ) + val_transforms = A.Compose( + basic_transforms, + bbox_params=A.BboxParams(format="coco", label_fields=["category"], clip=True), + ) + + train_data = ObjectDetectionDataset(train_data, train_transforms, image_processor, config) + if valid_data is not None: + valid_data = ObjectDetectionDataset(valid_data, val_transforms, image_processor, config) + return train_data, valid_data + return train_data, None + + +def convert_bbox_yolo_to_pascal(boxes, image_size): + """ + Convert bounding boxes from YOLO format (x_center, y_center, width, height) in range [0, 1] + to Pascal VOC format (x_min, y_min, x_max, y_max) in absolute coordinates. + + Args: + boxes (torch.Tensor): Bounding boxes in YOLO format + image_size (Tuple[int, int]): Image size in format (height, width) + + Returns: + torch.Tensor: Bounding boxes in Pascal VOC format (x_min, y_min, x_max, y_max) + """ + # convert center to corners format + boxes = center_to_corners_format(boxes) + + # convert to absolute coordinates + height, width = image_size + boxes = boxes * torch.tensor([[width, height, width, height]]) + + return boxes + + +@torch.no_grad() +def object_detection_metrics(evaluation_results, image_processor, threshold=0.0, id2label=None): + """ + Compute mean average mAP, mAR and their variants for the object detection task. + + Args: + evaluation_results (EvalPrediction): Predictions and targets from evaluation. + threshold (float, optional): Threshold to filter predicted boxes by confidence. Defaults to 0.0. + id2label (Optional[dict], optional): Mapping from class id to class name. Defaults to None. 
+
+    Returns:
+        Mapping[str, float]: Metrics as a dictionary {<metric_name>: <metric_value>}
+    """
+
+    @dataclass
+    class ModelOutput:
+        logits: torch.Tensor
+        pred_boxes: torch.Tensor
+
+    predictions, targets = evaluation_results.predictions, evaluation_results.label_ids
+
+    # For metric computation we need to provide:
+    #  - targets in a form of list of dictionaries with keys "boxes", "labels"
+    #  - predictions in a form of list of dictionaries with keys "boxes", "scores", "labels"
+
+    image_sizes = []
+    post_processed_targets = []
+    post_processed_predictions = []
+
+    # Collect targets in the required format for metric computation
+    for batch in targets:
+        # collect image sizes; they are needed for prediction post-processing
+        batch_image_sizes = torch.tensor([x["orig_size"] for x in batch])
+        image_sizes.append(batch_image_sizes)
+        # boxes were converted to YOLO format for model training;
+        # convert them back to Pascal VOC format (x_min, y_min, x_max, y_max)
+        for image_target in batch:
+            boxes = torch.tensor(image_target["boxes"])
+            boxes = convert_bbox_yolo_to_pascal(boxes, image_target["orig_size"])
+            labels = torch.tensor(image_target["class_labels"])
+            post_processed_targets.append({"boxes": boxes, "labels": labels})
+
+    # Collect predictions in the required format for metric computation:
+    # the model produces boxes in YOLO format, then image_processor converts them to Pascal VOC format
+    for batch, target_sizes in zip(predictions, image_sizes):
+        batch_logits, batch_boxes = batch[1], batch[2]
+        output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
+        post_processed_output = image_processor.post_process_object_detection(
+            output, threshold=threshold, target_sizes=target_sizes
+        )
+        post_processed_predictions.extend(post_processed_output)
+
+    # Compute metrics
+    metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
+    metric.update(post_processed_predictions, post_processed_targets)
+    metrics = metric.compute()
+
+    # Replace the list of per-class metrics with a separate metric for each class
+    classes = metrics.pop("classes")
+    try:
+        len(classes)
+        calc_map_per_class = True
+    except TypeError:
+        calc_map_per_class = False
+
+    if calc_map_per_class:
+        map_per_class = metrics.pop("map_per_class")
+        mar_100_per_class = metrics.pop("mar_100_per_class")
+        for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
+            class_name = id2label[class_id.item()] if id2label is not None else class_id.item()
+            metrics[f"map_{class_name}"] = class_map
+            metrics[f"mar_100_{class_name}"] = class_mar
+
+    metrics = {k: round(v.item(), 4) for k, v in metrics.items()}
+
+    return metrics
+
+
+def create_model_card(config, trainer):
+    """
+    Generates a model card string based on the provided configuration and trainer.
+
+    Args:
+        config (object): Configuration object containing the following attributes:
+            - valid_split (optional): Validation split information.
+            - data_path (str): Path to the dataset.
+            - project_name (str): Name of the project.
+            - model (str): Path or identifier of the model.
+        trainer (object): Trainer object with an `evaluate` method that returns evaluation metrics.
+
+    Returns:
+        str: A formatted model card string containing dataset information, validation metrics, and base model details.
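+
+    Example:
+        Illustrative only; ``cfg`` and ``trainer`` are assumed to exist:
+
+            card = create_model_card(cfg, trainer)
+            with open(f"{cfg.project_name}/README.md", "w") as f:
+                f.write(card)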
+ """ + if config.valid_split is not None: + eval_scores = trainer.evaluate() + eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items() if k in VALID_METRICS] + eval_scores = "\n\n".join(eval_scores) + + else: + eval_scores = "No validation metrics available" + + if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path): + dataset_tag = "" + else: + dataset_tag = f"\ndatasets:\n- {config.data_path}" + + if os.path.isdir(config.model): + base_model = "" + else: + base_model = f"\nbase_model: {config.model}" + + model_card = MODEL_CARD.format( + dataset_tag=dataset_tag, + validation_metrics=eval_scores, + base_model=base_model, + ) + return model_card diff --git a/src/autotrain/trainers/sent_transformers/__init__.py b/src/autotrain/trainers/sent_transformers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/sent_transformers/__main__.py b/src/autotrain/trainers/sent_transformers/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..2b3832a5403cb2231899411577984f3daed40b2e --- /dev/null +++ b/src/autotrain/trainers/sent_transformers/__main__.py @@ -0,0 +1,251 @@ +import argparse +import json +from functools import partial + +from accelerate import PartialState +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer +from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, TripletEvaluator +from sentence_transformers.losses import CoSENTLoss, MultipleNegativesRankingLoss, SoftmaxLoss +from sentence_transformers.training_args import SentenceTransformerTrainingArguments +from transformers import EarlyStoppingCallback +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.sent_transformers import utils +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = SentenceTransformersParams(**config) + + train_data = None + valid_data = None + # check if config.train_split.csv exists in config.data_path + if config.train_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + valid_data = load_from_disk(config.data_path)[config.valid_split] + 
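        # otherwise, load from the Hub; a "config_name:split" value selects a dataset config
+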
else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + num_classes = None + if config.trainer == "pair_class": + classes = train_data.features[config.target_column].names + # label2id = {c: i for i, c in enumerate(classes)} + num_classes = len(classes) + + if num_classes < 2: + raise ValueError("Invalid number of classes. Must be greater than 1.") + + if config.valid_split is not None: + num_classes_valid = len(valid_data.unique(config.target_column)) + if num_classes_valid != num_classes: + raise ValueError( + f"Number of classes in train and valid are not the same. Training has {num_classes} and valid has {num_classes_valid}" + ) + + if config.logging_steps == -1: + logging_steps = int(0.2 * len(train_data) / config.batch_size) + if logging_steps == 0: + logging_steps = 1 + if logging_steps > 25: + logging_steps = 25 + config.logging_steps = logging_steps + else: + logging_steps = config.logging_steps + + logger.info(f"Logging steps: {logging_steps}") + + train_data = utils.process_columns(train_data, config) + logger.info(f"Train data: {train_data}") + if config.valid_split is not None: + valid_data = utils.process_columns(valid_data, config) + logger.info(f"Valid data: {valid_data}") + + training_args = dict( + output_dir=config.project_name, + per_device_train_batch_size=config.batch_size, + per_device_eval_batch_size=2 * config.batch_size, + learning_rate=config.lr, + num_train_epochs=config.epochs, + eval_strategy=config.eval_strategy if config.valid_split is not None else "no", + logging_steps=logging_steps, + save_total_limit=config.save_total_limit, + save_strategy=config.eval_strategy if config.valid_split is not None else "no", + gradient_accumulation_steps=config.gradient_accumulation, + report_to=config.log, + auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, + ) + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + if config.valid_split is not None: + early_stop = EarlyStoppingCallback( + early_stopping_patience=config.early_stopping_patience, + early_stopping_threshold=config.early_stopping_threshold, + ) + callbacks_to_use = [early_stop] + else: + callbacks_to_use = [] + + callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()]) + + model = SentenceTransformer( + config.model, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + model_kwargs={ + "ignore_mismatched_sizes": True, + }, + ) + + loss_mapping = { + "pair": MultipleNegativesRankingLoss, + "pair_class": partial( + SoftmaxLoss, + sentence_embedding_dimension=model.get_sentence_embedding_dimension(), + num_labels=num_classes, + ), + "pair_score": CoSENTLoss, + "triplet": MultipleNegativesRankingLoss, + "qa": MultipleNegativesRankingLoss, + } + + evaluator = None + if config.valid_split is not None: + if config.trainer == 
"pair_score": + evaluator = EmbeddingSimilarityEvaluator( + sentences1=valid_data["sentence1"], + sentences2=valid_data["sentence2"], + scores=valid_data["score"], + name=config.valid_split, + ) + elif config.trainer == "triplet": + evaluator = TripletEvaluator( + anchors=valid_data["anchor"], + positives=valid_data["positive"], + negatives=valid_data["negative"], + ) + + logger.info("Setting up training arguments...") + args = SentenceTransformerTrainingArguments(**training_args) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks_to_use, + ) + + logger.info("Setting up trainer...") + trainer = SentenceTransformerTrainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data, + loss=loss_mapping[config.trainer], + evaluator=evaluator, + ) + trainer.remove_callback(PrinterCallback) + logger.info("Starting training...") + trainer.train() + + logger.info("Finished training, saving model...") + trainer.save_model(config.project_name) + + model_card = utils.create_model_card(config, trainer) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w") as f: + f.write(model_card) + + if config.push_to_hub: + if PartialState().process_index == 0: + remove_autotrain_data(config) + save_training_params(config) + logger.info("Pushing model to hub...") + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, + repo_id=f"{config.username}/{config.project_name}", + repo_type="model", + ) + + if PartialState().process_index == 0: + pause_space(config) + + +if __name__ == "__main__": + _args = parse_args() + training_config = json.load(open(_args.training_config)) + _config = SentenceTransformersParams(**training_config) + train(_config) diff --git a/src/autotrain/trainers/sent_transformers/params.py b/src/autotrain/trainers/sent_transformers/params.py new file mode 100644 index 0000000000000000000000000000000000000000..10d8c5f378537d386f03ae30a046491fa494bf78 --- /dev/null +++ b/src/autotrain/trainers/sent_transformers/params.py @@ -0,0 +1,84 @@ +from typing import Optional + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class SentenceTransformersParams(AutoTrainParams): + """ + SentenceTransformersParams is a configuration class for setting up parameters for training sentence transformers. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the pre-trained model to use. Default is "microsoft/mpnet-base". + lr (float): Learning rate for training. Default is 3e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length for the input. Default is 128. + batch_size (int): Batch size for training. Default is 8. + warmup_ratio (float): Proportion of training to perform learning rate warmup. Default is 0.1. + gradient_accumulation (int): Number of steps to accumulate gradients before updating. Default is 1. + optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Learning rate scheduler to use. Default is "linear". + weight_decay (float): Weight decay to apply. Default is 0.0. + max_grad_norm (float): Maximum gradient norm for clipping. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". 
+ valid_split (Optional[str]): Name of the validation data split. Default is None. + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project for output directory. Default is "project-name". + auto_find_batch_size (bool): Whether to automatically find the optimal batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). Default is None. + save_total_limit (int): Maximum number of checkpoints to save. Default is 1. + token (Optional[str]): Token for accessing Hugging Face Hub. Default is None. + push_to_hub (bool): Whether to push the model to Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy to use. Default is "epoch". + username (Optional[str]): Hugging Face username. Default is None. + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement after which training will be stopped. Default is 5. + early_stopping_threshold (float): Threshold for measuring the new optimum, to qualify as an improvement. Default is 0.01. + trainer (str): Name of the trainer to use. Default is "pair_score". + sentence1_column (str): Name of the column containing the first sentence. Default is "sentence1". + sentence2_column (str): Name of the column containing the second sentence. Default is "sentence2". + sentence3_column (Optional[str]): Name of the column containing the third sentence (if applicable). Default is None. + target_column (Optional[str]): Name of the column containing the target variable. Default is None. + """ + + data_path: str = Field(None, title="Data path") + model: str = Field("microsoft/mpnet-base", title="Model name") + lr: float = Field(3e-5, title="Learning rate") + epochs: int = Field(3, title="Number of training epochs") + max_seq_length: int = Field(128, title="Max sequence length") + batch_size: int = Field(8, title="Training batch size") + warmup_ratio: float = Field(0.1, title="Warmup proportion") + gradient_accumulation: int = Field(1, title="Gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer") + scheduler: str = Field("linear", title="Scheduler") + weight_decay: float = Field(0.0, title="Weight decay") + max_grad_norm: float = Field(1.0, title="Max gradient norm") + seed: int = Field(42, title="Seed") + train_split: str = Field("train", title="Train split") + valid_split: Optional[str] = Field(None, title="Validation split") + logging_steps: int = Field(-1, title="Logging steps") + project_name: str = Field("project-name", title="Output directory") + auto_find_batch_size: bool = Field(False, title="Auto find batch size") + mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") + save_total_limit: int = Field(1, title="Save total limit") + token: Optional[str] = Field(None, title="Hub Token") + push_to_hub: bool = Field(False, title="Push to hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy") + username: Optional[str] = Field(None, title="Hugging Face Username") + log: str = Field("none", title="Logging using experiment tracking") + early_stopping_patience: int = Field(5, title="Early stopping patience") + early_stopping_threshold: float = Field(0.01, title="Early stopping threshold") + # trainers: pair, pair_class, pair_score, triplet, qa + # pair: sentence1, sentence2 + # pair_class: sentence1, sentence2, target + # pair_score: sentence1, sentence2, target + # triplet: sentence1, 
sentence2, sentence3 + # qa: sentence1, sentence2 + trainer: str = Field("pair_score", title="Trainer name") + sentence1_column: str = Field("sentence1", title="Sentence 1 column") + sentence2_column: str = Field("sentence2", title="Sentence 2 column") + sentence3_column: Optional[str] = Field(None, title="Sentence 3 column") + target_column: Optional[str] = Field(None, title="Target column") diff --git a/src/autotrain/trainers/sent_transformers/utils.py b/src/autotrain/trainers/sent_transformers/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..57cc626d44d249122c245ef3a623ca67f1f0877f --- /dev/null +++ b/src/autotrain/trainers/sent_transformers/utils.py @@ -0,0 +1,159 @@ +import os + +from autotrain import logger + + +MODEL_CARD = """ +--- +library_name: sentence-transformers +tags: +- sentence-transformers +- sentence-similarity +- feature-extraction +- autotrain{base_model} +widget: +- source_sentence: 'search_query: i love autotrain' + sentences: + - 'search_query: huggingface auto train' + - 'search_query: hugging face auto train' + - 'search_query: i love autotrain' +pipeline_tag: sentence-similarity{dataset_tag} +--- + +# Model Trained Using AutoTrain + +- Problem type: Sentence Transformers + +## Validation Metrics +{validation_metrics} + +## Usage + +### Direct Usage (Sentence Transformers) + +First install the Sentence Transformers library: + +```bash +pip install -U sentence-transformers +``` + +Then you can load this model and run inference. +```python +from sentence_transformers import SentenceTransformer + +# Download from the Hugging Face Hub +model = SentenceTransformer("sentence_transformers_model_id") +# Run inference +sentences = [ + 'search_query: autotrain', + 'search_query: auto train', + 'search_query: i love autotrain', +] +embeddings = model.encode(sentences) +print(embeddings.shape) + +# Get the similarity scores for the embeddings +similarities = model.similarity(embeddings, embeddings) +print(similarities.shape) +``` +""" + + +def process_columns(data, config): + """ + Processes and renames columns in the dataset based on the trainer type specified in the configuration. + + Args: + data (Dataset): The dataset containing the columns to be processed. + config (Config): Configuration object containing the trainer type and column names. + + Returns: + Dataset: The dataset with renamed columns as per the trainer type. + + Raises: + ValueError: If the trainer type specified in the configuration is invalid. + + Trainer Types and Corresponding Columns: + - "pair": Renames columns to "anchor" and "positive". + - "pair_class": Renames columns to "premise", "hypothesis", and "label". + - "pair_score": Renames columns to "sentence1", "sentence2", and "score". + - "triplet": Renames columns to "anchor", "positive", and "negative". + - "qa": Renames columns to "query" and "answer". 
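+
+    Example:
+        A minimal sketch, assuming a ``triplet`` config whose raw columns are named
+        ``query``/``pos``/``neg`` (hypothetical names):
+
+            data = process_columns(data, config)
+            # data.column_names now includes "anchor", "positive" and "negative"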
+ """ + # trainers: pair, pair_class, pair_score, triplet, qa + # pair: anchor, positive + # pair_class: premise, hypothesis, label + # pair_score: sentence1, sentence2, score + # triplet: anchor, positive, negative + # qa: query, answer + if config.trainer == "pair": + if not (config.sentence1_column == "anchor" and config.sentence1_column in data.column_names): + data = data.rename_column(config.sentence1_column, "anchor") + if not (config.sentence2_column == "positive" and config.sentence2_column in data.column_names): + data = data.rename_column(config.sentence2_column, "positive") + elif config.trainer == "pair_class": + if not (config.sentence1_column == "premise" and config.sentence1_column in data.column_names): + data = data.rename_column(config.sentence1_column, "premise") + if not (config.sentence2_column == "hypothesis" and config.sentence2_column in data.column_names): + data = data.rename_column(config.sentence2_column, "hypothesis") + if not (config.target_column == "label" and config.target_column in data.column_names): + data = data.rename_column(config.target_column, "label") + elif config.trainer == "pair_score": + if not (config.sentence1_column == "sentence1" and config.sentence1_column in data.column_names): + data = data.rename_column(config.sentence1_column, "sentence1") + if not (config.sentence2_column == "sentence2" and config.sentence2_column in data.column_names): + data = data.rename_column(config.sentence2_column, "sentence2") + if not (config.target_column == "score" and config.target_column in data.column_names): + data = data.rename_column(config.target_column, "score") + elif config.trainer == "triplet": + if not (config.sentence1_column == "anchor" and config.sentence1_column in data.column_names): + data = data.rename_column(config.sentence1_column, "anchor") + if not (config.sentence2_column == "positive" and config.sentence2_column in data.column_names): + data = data.rename_column(config.sentence2_column, "positive") + if not (config.sentence3_column == "negative" and config.sentence3_column in data.column_names): + data = data.rename_column(config.sentence3_column, "negative") + elif config.trainer == "qa": + if not (config.sentence1_column == "query" and config.sentence1_column in data.column_names): + data = data.rename_column(config.sentence1_column, "query") + if not (config.sentence2_column == "answer" and config.sentence2_column in data.column_names): + data = data.rename_column(config.sentence2_column, "answer") + else: + raise ValueError(f"Invalid trainer: {config.trainer}") + return data + + +def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing model and dataset details. + trainer (object): Trainer object used to evaluate the model. + + Returns: + str: A formatted model card string containing dataset information, validation metrics, and base model details. 
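+
+    Note:
+        Unlike the object-detection trainer, every key returned by
+        ``trainer.evaluate()`` is listed here; no metric whitelist is applied.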
+ """ + if config.valid_split is not None: + eval_scores = trainer.evaluate() + logger.info(eval_scores) + eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items()] + eval_scores = "\n\n".join(eval_scores) + else: + eval_scores = "No validation metrics available" + + if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path): + dataset_tag = "" + else: + dataset_tag = f"\ndatasets:\n- {config.data_path}" + + if os.path.isdir(config.model): + base_model = "" + else: + base_model = f"\nbase_model: {config.model}" + + model_card = MODEL_CARD.format( + dataset_tag=dataset_tag, + validation_metrics=eval_scores, + base_model=base_model, + ) + return model_card diff --git a/src/autotrain/trainers/seq2seq/__init__.py b/src/autotrain/trainers/seq2seq/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/seq2seq/__main__.py b/src/autotrain/trainers/seq2seq/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..390e70927f320d5dec13dbdf85849e9435492c3c --- /dev/null +++ b/src/autotrain/trainers/seq2seq/__main__.py @@ -0,0 +1,279 @@ +import argparse +import json +from functools import partial + +import torch +from accelerate.state import PartialState +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForSeq2Seq, + EarlyStoppingCallback, + Seq2SeqTrainer, + Seq2SeqTrainingArguments, +) +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.seq2seq import utils +from autotrain.trainers.seq2seq.dataset import Seq2SeqDataset +from autotrain.trainers.seq2seq.params import Seq2SeqParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = Seq2SeqParams(**config) + + train_data = None + valid_data = None + # check if config.train_split.csv exists in config.data_path + if config.train_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = 
load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + tokenizer = AutoTokenizer.from_pretrained(config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE) + + train_data = Seq2SeqDataset(data=train_data, tokenizer=tokenizer, config=config) + if config.valid_split is not None: + valid_data = Seq2SeqDataset(data=valid_data, tokenizer=tokenizer, config=config) + + if config.logging_steps == -1: + if config.valid_split is not None: + logging_steps = int(0.2 * len(valid_data) / config.batch_size) + else: + logging_steps = int(0.2 * len(train_data) / config.batch_size) + if logging_steps == 0: + logging_steps = 1 + if logging_steps > 25: + logging_steps = 25 + config.logging_steps = logging_steps + else: + logging_steps = config.logging_steps + + logger.info(f"Logging steps: {logging_steps}") + + training_args = dict( + output_dir=config.project_name, + per_device_train_batch_size=config.batch_size, + per_device_eval_batch_size=2 * config.batch_size, + learning_rate=config.lr, + num_train_epochs=config.epochs, + eval_strategy=config.eval_strategy if config.valid_split is not None else "no", + logging_steps=logging_steps, + save_total_limit=config.save_total_limit, + save_strategy=config.eval_strategy if config.valid_split is not None else "no", + gradient_accumulation_steps=config.gradient_accumulation, + report_to=config.log, + auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, + predict_with_generate=True, + seed=config.seed, + ) + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + if config.valid_split is not None: + early_stop = EarlyStoppingCallback( + early_stopping_patience=config.early_stopping_patience, + early_stopping_threshold=config.early_stopping_threshold, + ) + callbacks_to_use = [early_stop] + else: + callbacks_to_use = [] + + callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()]) + + args = Seq2SeqTrainingArguments(**training_args) + + model_config = AutoConfig.from_pretrained( + config.model, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + use_cache=False, + ) + + if config.peft: + if config.quantization == "int4": + raise NotImplementedError("int4 quantization is not supported") + # if config.use_int4: + # bnb_config = BitsAndBytesConfig( + # load_in_4bit=config.use_int4, + # bnb_4bit_quant_type="nf4", + # bnb_4bit_compute_dtype=torch.float16, + # bnb_4bit_use_double_quant=False, + # ) + # config.fp16 = True + if config.quantization == "int8": + bnb_config = BitsAndBytesConfig(load_in_8bit=True) + else: + bnb_config = None + + model = AutoModelForSeq2SeqLM.from_pretrained( + config.model, + config=model_config, + token=config.token, + quantization_config=bnb_config, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + model = AutoModelForSeq2SeqLM.from_pretrained( + config.model, + config=model_config, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + 
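    # If the tokenizer vocabulary outgrew the checkpoint's embedding matrix, grow the
+    # embeddings (never shrink them) so the pretrained rows are preserved.
+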
embedding_size = model.get_input_embeddings().weight.shape[0] + if len(tokenizer) > embedding_size: + model.resize_token_embeddings(len(tokenizer)) + + if config.peft: + target_modules = config.target_modules.split(",") if config.target_modules is not None else None + if target_modules: + target_modules = [module.strip() for module in target_modules] + if len(target_modules) == 1 and target_modules[0] == "all-linear": + target_modules = "all-linear" + lora_config = LoraConfig( + r=config.lora_r, + lora_alpha=config.lora_alpha, + target_modules=target_modules, + lora_dropout=config.lora_dropout, + bias="none", + task_type=TaskType.SEQ_2_SEQ_LM, + ) + if config.quantization is not None: + model = prepare_model_for_kbit_training(model) + + model = get_peft_model(model, lora_config) + + _s2s_metrics = partial(utils._seq2seq_metrics, tokenizer=tokenizer) + + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks_to_use, + compute_metrics=_s2s_metrics, + ) + data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) + trainer = Seq2SeqTrainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data, + data_collator=data_collator, + tokenizer=tokenizer, + ) + + for name, module in trainer.model.named_modules(): + if "norm" in name: + module = module.to(torch.float32) + + trainer.remove_callback(PrinterCallback) + trainer.train() + + logger.info("Finished training, saving model...") + trainer.model.config.use_cache = True + trainer.save_model(config.project_name) + + model_card = utils.create_model_card(config, trainer) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w", encoding="utf-8") as f: + f.write(model_card) + + if config.push_to_hub: + if PartialState().process_index == 0: + remove_autotrain_data(config) + save_training_params(config) + logger.info("Pushing model to hub...") + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, + repo_id=f"{config.username}/{config.project_name}", + repo_type="model", + ) + + if PartialState().process_index == 0: + pause_space(config) + + +if __name__ == "__main__": + _args = parse_args() + training_config = json.load(open(_args.training_config)) + config = Seq2SeqParams(**training_config) + train(config) diff --git a/src/autotrain/trainers/seq2seq/dataset.py b/src/autotrain/trainers/seq2seq/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..15186873b06bce993bf80658c4c8ff035425b873 --- /dev/null +++ b/src/autotrain/trainers/seq2seq/dataset.py @@ -0,0 +1,41 @@ +class Seq2SeqDataset: + """ + A dataset class for sequence-to-sequence tasks. + + Args: + data (list): The dataset containing input and target sequences. + tokenizer (PreTrainedTokenizer): The tokenizer to process the text data. + config (object): Configuration object containing dataset parameters. + + Attributes: + data (list): The dataset containing input and target sequences. + tokenizer (PreTrainedTokenizer): The tokenizer to process the text data. + config (object): Configuration object containing dataset parameters. + max_len_input (int): Maximum length for input sequences. + max_len_target (int): Maximum length for target sequences. + + Methods: + __len__(): Returns the number of samples in the dataset. + __getitem__(item): Returns the tokenized input and target sequences for a given index. 
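+
+    Example:
+        A minimal sketch; ``raw_ds``, ``tok`` and ``cfg`` are assumed to be a loaded
+        dataset, a tokenizer and a ``Seq2SeqParams`` instance (illustrative names):
+
+            ds = Seq2SeqDataset(data=raw_ds, tokenizer=tok, config=cfg)
+            item = ds[0]  # dict with input_ids, attention_mask and labels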
+ """ + + def __init__(self, data, tokenizer, config): + self.data = data + self.tokenizer = tokenizer + self.config = config + self.max_len_input = self.config.max_seq_length + self.max_len_target = self.config.max_target_length + + def __len__(self): + return len(self.data) + + def __getitem__(self, item): + text = str(self.data[item][self.config.text_column]) + target = str(self.data[item][self.config.target_column]) + + model_inputs = self.tokenizer(text, max_length=self.max_len_input, truncation=True) + + labels = self.tokenizer(text_target=target, max_length=self.max_len_target, truncation=True) + + model_inputs["labels"] = labels["input_ids"] + return model_inputs diff --git a/src/autotrain/trainers/seq2seq/params.py b/src/autotrain/trainers/seq2seq/params.py new file mode 100644 index 0000000000000000000000000000000000000000..06832309465ac7496c2d9a0bdbad289f670709e8 --- /dev/null +++ b/src/autotrain/trainers/seq2seq/params.py @@ -0,0 +1,88 @@ +from typing import Optional + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class Seq2SeqParams(AutoTrainParams): + """ + Seq2SeqParams is a configuration class for sequence-to-sequence training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to be used. Default is "google/flan-t5-base". + username (Optional[str]): Hugging Face Username. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. + project_name (str): Name of the project or output directory. Default is "project-name". + token (Optional[str]): Hub Token for authentication. + push_to_hub (bool): Whether to push the model to the Hugging Face Hub. Default is False. + text_column (str): Name of the text column in the dataset. Default is "text". + target_column (str): Name of the target text column in the dataset. Default is "target". + lr (float): Learning rate for training. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length for input text. Default is 128. + max_target_length (int): Maximum sequence length for target text. Default is 128. + batch_size (int): Training batch size. Default is 2. + warmup_ratio (float): Proportion of warmup steps. Default is 0.1. + gradient_accumulation (int): Number of gradient accumulation steps. Default is 1. + optimizer (str): Optimizer to be used. Default is "adamw_torch". + scheduler (str): Learning rate scheduler to be used. Default is "linear". + weight_decay (float): Weight decay for the optimizer. Default is 0.0. + max_grad_norm (float): Maximum gradient norm for clipping. Default is 1.0. + logging_steps (int): Number of steps between logging. Default is -1 (disabled). + eval_strategy (str): Evaluation strategy. Default is "epoch". + auto_find_batch_size (bool): Whether to automatically find the batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). + save_total_limit (int): Maximum number of checkpoints to save. Default is 1. + peft (bool): Whether to use Parameter-Efficient Fine-Tuning (PEFT). Default is False. + quantization (Optional[str]): Quantization mode (int4, int8, or None). Default is "int8". + lora_r (int): LoRA-R parameter for PEFT. Default is 16. + lora_alpha (int): LoRA-Alpha parameter for PEFT. Default is 32. 
+        lora_dropout (float): LoRA-Dropout parameter for PEFT. Default is 0.05.
+        target_modules (str): Target modules for PEFT. Default is "all-linear".
+        log (str): Logging method for experiment tracking. Default is "none".
+        early_stopping_patience (int): Patience for early stopping. Default is 5.
+        early_stopping_threshold (float): Threshold for early stopping. Default is 0.01.
+    """
+
+    data_path: str = Field(None, title="Data path")
+    model: str = Field("google/flan-t5-base", title="Model name")
+    username: Optional[str] = Field(None, title="Hugging Face Username")
+    seed: int = Field(42, title="Seed")
+    train_split: str = Field("train", title="Train split")
+    valid_split: Optional[str] = Field(None, title="Validation split")
+    project_name: str = Field("project-name", title="Output directory")
+    token: Optional[str] = Field(None, title="Hub Token")
+    push_to_hub: bool = Field(False, title="Push to hub")
+    text_column: str = Field("text", title="Text column")
+    target_column: str = Field("target", title="Target text column")
+    lr: float = Field(5e-5, title="Learning rate")
+    epochs: int = Field(3, title="Number of training epochs")
+    max_seq_length: int = Field(128, title="Max sequence length")
+    max_target_length: int = Field(128, title="Max target sequence length")
+    batch_size: int = Field(2, title="Training batch size")
+    warmup_ratio: float = Field(0.1, title="Warmup proportion")
+    gradient_accumulation: int = Field(1, title="Gradient accumulation steps")
+    optimizer: str = Field("adamw_torch", title="Optimizer")
+    scheduler: str = Field("linear", title="Scheduler")
+    weight_decay: float = Field(0.0, title="Weight decay")
+    max_grad_norm: float = Field(1.0, title="Max gradient norm")
+    logging_steps: int = Field(-1, title="Logging steps")
+    eval_strategy: str = Field("epoch", title="Evaluation strategy")
+    auto_find_batch_size: bool = Field(False, title="Auto find batch size")
+    mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None")
+    save_total_limit: int = Field(1, title="Save total limit")
+    peft: bool = Field(False, title="Use PEFT")
+    quantization: Optional[str] = Field("int8", title="int4, int8, or None")
+    lora_r: int = Field(16, title="LoRA-R")
+    lora_alpha: int = Field(32, title="LoRA-Alpha")
+    lora_dropout: float = Field(0.05, title="LoRA-Dropout")
+    target_modules: str = Field("all-linear", title="Target modules for PEFT")
+    log: str = Field("none", title="Logging using experiment tracking")
+    early_stopping_patience: int = Field(5, title="Early stopping patience")
+    early_stopping_threshold: float = Field(0.01, title="Early stopping threshold")
diff --git a/src/autotrain/trainers/seq2seq/utils.py b/src/autotrain/trainers/seq2seq/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c49148174ece1701ddb2111aa7310c190bf09b68
--- /dev/null
+++ b/src/autotrain/trainers/seq2seq/utils.py
@@ -0,0 +1,98 @@
+import os
+
+import evaluate
+import nltk
+import numpy as np
+
+
+ROUGE_METRIC = evaluate.load("rouge")
+
+MODEL_CARD = """
+---
+library_name: transformers
+tags:
+- autotrain
+- text2text-generation{base_model}
+widget:
+- text: "I love AutoTrain"{dataset_tag}
+---
+
+# Model Trained Using AutoTrain
+
+- Problem type: Seq2Seq
+
+## Validation Metrics
+{validation_metrics}
+"""
+
+
+def _seq2seq_metrics(pred, tokenizer):
+    """
+    Compute sequence-to-sequence metrics for predictions and labels.
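+    Predictions and references are sentence-segmented with NLTK and joined with
+    newlines before scoring, which is the input format ROUGE-Lsum expects.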
+ + Args: + pred (tuple): A tuple containing predictions and labels. + Predictions and labels are expected to be token IDs. + tokenizer (PreTrainedTokenizer): The tokenizer used for decoding the predictions and labels. + + Returns: + dict: A dictionary containing the computed ROUGE metrics and the average length of the generated sequences. + The keys are the metric names and the values are the corresponding scores rounded to four decimal places. + """ + predictions, labels = pred + decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) + + labels = np.where(labels != -100, labels, tokenizer.pad_token_id) + decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) + + decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds] + decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels] + + result = ROUGE_METRIC.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) + result = {key: value * 100 for key, value in result.items()} + + prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions] + result["gen_len"] = np.mean(prediction_lens) + + return {k: round(v, 4) for k, v in result.items()} + + +def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing the following attributes: + - valid_split (optional): If not None, the function will include evaluation scores. + - data_path (str): Path to the dataset. + - project_name (str): Name of the project. + - model (str): Path or identifier of the model. + trainer (object): Trainer object with an `evaluate` method that returns evaluation metrics. + + Returns: + str: A formatted model card string containing dataset information, validation metrics, and base model details. 
+ """ + if config.valid_split is not None: + eval_scores = trainer.evaluate() + eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items()] + eval_scores = "\n\n".join(eval_scores) + + else: + eval_scores = "No validation metrics available" + + if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path): + dataset_tag = "" + else: + dataset_tag = f"\ndatasets:\n- {config.data_path}" + + if os.path.isdir(config.model): + base_model = "" + else: + base_model = f"\nbase_model: {config.model}" + + model_card = MODEL_CARD.format( + dataset_tag=dataset_tag, + validation_metrics=eval_scores, + base_model=base_model, + ) + return model_card diff --git a/src/autotrain/trainers/tabular/__init__.py b/src/autotrain/trainers/tabular/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/tabular/__main__.py b/src/autotrain/trainers/tabular/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..4474f8780e897d32cd17db5c27761bc98447c4ab --- /dev/null +++ b/src/autotrain/trainers/tabular/__main__.py @@ -0,0 +1,409 @@ +import argparse +import json +import os +from functools import partial + +import joblib +import numpy as np +import optuna +import pandas as pd +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from sklearn import pipeline, preprocessing +from sklearn.compose import ColumnTransformer + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.tabular import utils +from autotrain.trainers.tabular.params import TabularParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +def optimize(trial, model_name, xtrain, xvalid, ytrain, yvalid, eval_metric, task, preprocessor): + """ + Optimize the model based on the given trial and parameters. + + Parameters: + trial (dict or optuna.trial.Trial): The trial object or dictionary containing hyperparameters. + model_name (str): The name of the model to be used (e.g., "xgboost"). + xtrain (pd.DataFrame or np.ndarray): Training features. + xvalid (pd.DataFrame or np.ndarray): Validation features. + ytrain (pd.Series or np.ndarray): Training labels. + yvalid (pd.Series or np.ndarray): Validation labels. + eval_metric (str): The evaluation metric to be used for optimization. + task (str): The type of task (e.g., "binary_classification", "multi_class_classification", "single_column_regression"). + preprocessor (object): The preprocessor object to be applied to the data. + + Returns: + float or tuple: If trial is a dictionary, returns a tuple containing the models, preprocessor, and metric dictionary. + Otherwise, returns the loss value based on the evaluation metric. 
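+
+    Example:
+        A sketch of the dual usage inside ``train`` (``args`` is the keyword dict
+        built there; see below in this module):
+
+            study.optimize(partial(optimize, **args), n_trials=config.num_trials)
+            models, prep, metrics = optimize(study.best_params, **args)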
+ """ + if isinstance(trial, dict): + params = trial + else: + params = utils.get_params(trial, model_name, task) + labels = None + if task == "multi_class_classification": + labels = np.unique(ytrain) + metrics = utils.TabularMetrics(sub_task=task, labels=labels) + + if task in ("binary_classification", "multi_class_classification", "single_column_regression"): + ytrain = ytrain.ravel() + yvalid = yvalid.ravel() + + if preprocessor is not None: + try: + xtrain = preprocessor.fit_transform(xtrain) + xvalid = preprocessor.transform(xvalid) + except ValueError: + logger.info("Preprocessing failed, using nan_to_num") + train_cols = xtrain.columns.tolist() + valid_cols = xvalid.columns.tolist() + xtrain = np.nan_to_num(xtrain) + xvalid = np.nan_to_num(xvalid) + # convert back to dataframe + xtrain = pd.DataFrame(xtrain, columns=train_cols) + xvalid = pd.DataFrame(xvalid, columns=valid_cols) + xtrain = preprocessor.fit_transform(xtrain) + xvalid = preprocessor.transform(xvalid) + + if model_name == "xgboost": + params["eval_metric"] = eval_metric + + _model = utils.TabularModel(model_name, preprocessor=None, sub_task=task, params=params) + model = _model.pipeline + models = [] + if task in ("multi_label_classification", "multi_column_regression"): + # also multi_column_regression + ypred = [] + models = [model] * ytrain.shape[1] + for idx, _m in enumerate(models): + if model_name == "xgboost": + _m.fit( + xtrain, + ytrain[:, idx], + model__eval_set=[(xvalid, yvalid[:, idx])], + model__verbose=False, + ) + else: + _m.fit(xtrain, ytrain[:, idx]) + if task == "multi_column_regression": + ypred_temp = _m.predict(xvalid) + else: + if _model.use_predict_proba: + ypred_temp = _m.predict_proba(xvalid)[:, 1] + else: + ypred_temp = _m.predict(xvalid) + ypred.append(ypred_temp) + ypred = np.column_stack(ypred) + + else: + models = [model] + if model_name == "xgboost": + model.fit( + xtrain, + ytrain, + model__eval_set=[(xvalid, yvalid)], + model__verbose=False, + ) + else: + models[0].fit(xtrain, ytrain) + + if _model.use_predict_proba: + ypred = models[0].predict_proba(xvalid) + else: + ypred = models[0].predict(xvalid) + + if task == "multi_class_classification": + if ypred.reshape(xvalid.shape[0], -1).shape[1] != len(labels): + ypred_ohe = np.zeros((xvalid.shape[0], len(labels))) + ypred_ohe[np.arange(xvalid.shape[0]), ypred] = 1 + ypred = ypred_ohe + + if task == "binary_classification": + if ypred.reshape(xvalid.shape[0], -1).shape[1] != 2: + ypred = np.column_stack([1 - ypred, ypred]) + + # calculate metric + metric_dict = metrics.calculate(yvalid, ypred) + + # change eval_metric key to loss + if eval_metric in metric_dict: + metric_dict["loss"] = metric_dict[eval_metric] + + logger.info(f"Metrics: {metric_dict}") + if isinstance(trial, dict): + return models, preprocessor, metric_dict + return metric_dict["loss"] + + +@monitor +def train(config): + """ + Train a tabular model based on the provided configuration. + + Args: + config (dict or TabularParams): Configuration parameters for training. If a dictionary is provided, it will be converted to a TabularParams object. + + Raises: + Exception: If `valid_data` is None, indicating that a valid split for tabular training was not provided. + + The function performs the following steps: + 1. Loads the training and validation datasets from disk or a specified data path. + 2. Identifies and processes categorical and numerical columns. + 3. Encodes target columns for classification tasks. + 4. 
Constructs preprocessing pipelines for numerical and categorical data. + 5. Determines the sub-task (e.g., binary classification, multi-class classification, regression). + 6. Optimizes the model using Optuna for hyperparameter tuning. + 7. Saves the best model and target encoders to disk. + 8. Creates and saves a model card. + 9. Optionally pushes the model to the Hugging Face Hub. + + Note: + The function expects the configuration to contain various parameters such as `data_path`, `train_split`, `valid_split`, `categorical_columns`, `numerical_columns`, `model`, `task`, `num_trials`, `time_limit`, `project_name`, `token`, `username`, and `push_to_hub`. + """ + if isinstance(config, dict): + config = TabularParams(**config) + + logger.info("Starting training...") + logger.info(f"Training config: {config}") + + train_data = None + valid_data = None + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + train_data = train_data.to_pandas() + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + valid_data = valid_data.to_pandas() + + if valid_data is None: + raise Exception("valid_data is None. 
Please provide a valid_split for tabular training.") + + # determine which columns are categorical + if config.categorical_columns is None: + config.categorical_columns = utils.get_categorical_columns(train_data) + if config.numerical_columns is None: + config.numerical_columns = utils.get_numerical_columns(train_data) + + _id_target_cols = ( + [config.id_column] + config.target_columns if config.id_column is not None else config.target_columns + ) + config.numerical_columns = [c for c in config.numerical_columns if c not in _id_target_cols] + config.categorical_columns = [c for c in config.categorical_columns if c not in _id_target_cols] + + useful_columns = config.categorical_columns + config.numerical_columns + + logger.info(f"Categorical columns: {config.categorical_columns}") + logger.info(f"Numerical columns: {config.numerical_columns}") + + # convert object columns to categorical + for col in config.categorical_columns: + train_data[col] = train_data[col].astype("category") + valid_data[col] = valid_data[col].astype("category") + + logger.info(f"Useful columns: {useful_columns}") + + target_encoders = {} + if config.task == "classification": + for target_column in config.target_columns: + target_encoder = preprocessing.LabelEncoder() + target_encoder.fit(train_data[target_column]) + target_encoders[target_column] = target_encoder + + # encode target columns in train and valid data + for k, v in target_encoders.items(): + train_data.loc[:, k] = v.transform(train_data[k]) + valid_data.loc[:, k] = v.transform(valid_data[k]) + + numeric_transformer = "passthrough" + categorical_transformer = "passthrough" + transformers = [] + preprocessor = None + + numeric_steps = [] + imputer = utils.get_imputer(config.numerical_imputer) + scaler = utils.get_scaler(config.numeric_scaler) + if imputer is not None: + numeric_steps.append(("num_imputer", imputer)) + if scaler is not None: + numeric_steps.append(("num_scaler", scaler)) + + if len(numeric_steps) > 0: + numeric_transformer = pipeline.Pipeline(numeric_steps) + transformers.append(("numeric", numeric_transformer, config.numerical_columns)) + + categorical_steps = [] + imputer = utils.get_imputer(config.categorical_imputer) + if imputer is not None: + categorical_steps.append(("cat_imputer", imputer)) + + if len(config.categorical_columns) > 0: + if config.model in ("xgboost", "lightgbm", "randomforest", "catboost", "extratrees"): + categorical_steps.append( + ( + "cat_encoder", + preprocessing.OrdinalEncoder( + handle_unknown="use_encoded_value", + categories="auto", + unknown_value=np.nan, + ), + ) + ) + else: + categorical_steps.append( + ( + "cat_encoder", + preprocessing.OneHotEncoder(handle_unknown="ignore"), + ) + ) + + if len(categorical_steps) > 0: + categorical_transformer = pipeline.Pipeline(categorical_steps) + transformers.append(("categorical", categorical_transformer, config.categorical_columns)) + + if len(transformers) > 0: + preprocessor = ColumnTransformer(transformers=transformers, verbose=True, n_jobs=-1) + logger.info(f"Preprocessor: {preprocessor}") + + xtrain = train_data[useful_columns].reset_index(drop=True) + xvalid = valid_data[useful_columns].reset_index(drop=True) + + ytrain = train_data[config.target_columns].values + yvalid = valid_data[config.target_columns].values + + # determine sub_task + if config.task == "classification": + if len(target_encoders) == 1: + if len(target_encoders[config.target_columns[0]].classes_) == 2: + sub_task = "binary_classification" + else: + sub_task = "multi_class_classification" 
+        else:
+            sub_task = "multi_label_classification"
+    else:
+        if len(config.target_columns) > 1:
+            sub_task = "multi_column_regression"
+        else:
+            sub_task = "single_column_regression"
+
+    eval_metric, direction = utils.get_metric_direction(sub_task)
+
+    logger.info(f"Sub task: {sub_task}")
+
+    args = {
+        "model_name": config.model,
+        "xtrain": xtrain,
+        "xvalid": xvalid,
+        "ytrain": ytrain,
+        "yvalid": yvalid,
+        "eval_metric": eval_metric,
+        "task": sub_task,
+        "preprocessor": preprocessor,
+    }
+
+    optimize_func = partial(optimize, **args)
+    study = optuna.create_study(direction=direction, study_name="AutoTrain")
+    study.optimize(optimize_func, n_trials=config.num_trials, timeout=config.time_limit)
+    best_params = study.best_params
+
+    logger.info(f"Best params: {best_params}")
+    best_models, best_preprocessors, best_metrics = optimize(best_params, **args)
+
+    models = (
+        [pipeline.Pipeline([("preprocessor", best_preprocessors), ("model", m)]) for m in best_models]
+        if best_preprocessors is not None
+        else best_models
+    )
+
+    joblib.dump(
+        models[0] if len(models) == 1 else models,
+        os.path.join(config.project_name, "model.joblib"),
+    )
+    joblib.dump(target_encoders, os.path.join(config.project_name, "target_encoders.joblib"))
+
+    model_card = utils.create_model_card(config, sub_task, best_params, best_metrics)
+
+    # remove token key from training_params.json located in output directory
+    # first check if file exists
+    if os.path.exists(f"{config.project_name}/training_params.json"):
+        training_params = json.load(open(f"{config.project_name}/training_params.json"))
+        training_params.pop("token", None)
+        json.dump(training_params, open(f"{config.project_name}/training_params.json", "w"))
+
+    # save model card to output directory as README.md
+    with open(os.path.join(config.project_name, "README.md"), "w") as f:
+        f.write(model_card)
+
+    if config.push_to_hub:
+        remove_autotrain_data(config)
+        save_training_params(config)
+        logger.info("Pushing model to hub...")
+        api = HfApi(token=config.token)
+        api.create_repo(repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True)
+        api.upload_folder(
+            folder_path=config.project_name, repo_id=f"{config.username}/{config.project_name}", repo_type="model"
+        )
+
+    pause_space(config)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    training_config = json.load(open(args.training_config))
+    config = TabularParams(**training_config)
+    train(config)
diff --git a/src/autotrain/trainers/tabular/params.py b/src/autotrain/trainers/tabular/params.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed553cba31141bd5c502065c7bf7f87d0c0fce1d
--- /dev/null
+++ b/src/autotrain/trainers/tabular/params.py
@@ -0,0 +1,52 @@
+from typing import List, Optional, Union
+
+from pydantic import Field
+
+from autotrain.trainers.common import AutoTrainParams
+
+
+class TabularParams(AutoTrainParams):
+    """
+    TabularParams is a configuration class for tabular data training parameters.
+
+    Attributes:
+        data_path (str): Path to the dataset.
+        model (str): Name of the model to use. Default is "xgboost".
+        username (Optional[str]): Hugging Face Username.
+        seed (int): Random seed for reproducibility. Default is 42.
+        train_split (str): Name of the training data split. Default is "train".
+        valid_split (Optional[str]): Name of the validation data split.
+        project_name (str): Name of the output directory.
Default is "project-name". + token (Optional[str]): Hub Token for authentication. + push_to_hub (bool): Whether to push the model to the hub. Default is False. + id_column (str): Name of the ID column. Default is "id". + target_columns (Union[List[str], str]): Target column(s) in the dataset. Default is ["target"]. + categorical_columns (Optional[List[str]]): List of categorical columns. + numerical_columns (Optional[List[str]]): List of numerical columns. + task (str): Type of task (e.g., "classification"). Default is "classification". + num_trials (int): Number of trials for hyperparameter optimization. Default is 10. + time_limit (int): Time limit for training in seconds. Default is 600. + categorical_imputer (Optional[str]): Imputer strategy for categorical columns. + numerical_imputer (Optional[str]): Imputer strategy for numerical columns. + numeric_scaler (Optional[str]): Scaler strategy for numerical columns. + """ + + data_path: str = Field(None, title="Data path") + model: str = Field("xgboost", title="Model name") + username: Optional[str] = Field(None, title="Hugging Face Username") + seed: int = Field(42, title="Seed") + train_split: str = Field("train", title="Train split") + valid_split: Optional[str] = Field(None, title="Validation split") + project_name: str = Field("project-name", title="Output directory") + token: Optional[str] = Field(None, title="Hub Token") + push_to_hub: bool = Field(False, title="Push to hub") + id_column: str = Field("id", title="ID column") + target_columns: Union[List[str], str] = Field(["target"], title="Target column(s)") + categorical_columns: Optional[List[str]] = Field(None, title="Categorical columns") + numerical_columns: Optional[List[str]] = Field(None, title="Numerical columns") + task: str = Field("classification", title="Task") + num_trials: int = Field(10, title="Number of trials") + time_limit: int = Field(600, title="Time limit") + categorical_imputer: Optional[str] = Field(None, title="Categorical imputer") + numerical_imputer: Optional[str] = Field(None, title="Numerical imputer") + numeric_scaler: Optional[str] = Field(None, title="Numeric scaler") diff --git a/src/autotrain/trainers/tabular/utils.py b/src/autotrain/trainers/tabular/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..11e7d87c13306945599d90c82ab222a61e5f3341 --- /dev/null +++ b/src/autotrain/trainers/tabular/utils.py @@ -0,0 +1,546 @@ +import copy +from collections import defaultdict +from dataclasses import dataclass +from functools import partial +from typing import List, Optional + +import numpy as np +from sklearn import ensemble, impute, linear_model +from sklearn import metrics as skmetrics +from sklearn import naive_bayes, neighbors, pipeline, preprocessing, svm, tree +from xgboost import XGBClassifier, XGBRegressor + + +MARKDOWN = """ +--- +tags: +- autotrain +- tabular +- {task} +- tabular-{task} +datasets: +- {dataset} +--- + +# Model Trained Using AutoTrain + +- Problem type: Tabular {task} + +## Validation Metrics + +{metrics} + +## Best Params + +{params} + +## Usage + +```python +import json +import joblib +import pandas as pd + +model = joblib.load('model.joblib') +config = json.load(open('config.json')) + +features = config['features'] + +# data = pd.read_csv("data.csv") +data = data[features] + +predictions = model.predict(data) # or model.predict_proba(data) + +# predictions can be converted to original labels using label_encoders.pkl + +``` +""" + +_MODELS: dict = defaultdict(dict) +_MODELS["xgboost"]["classification"] 
= XGBClassifier +_MODELS["xgboost"]["regression"] = XGBRegressor +_MODELS["logistic_regression"]["classification"] = linear_model.LogisticRegression +_MODELS["logistic_regression"]["regression"] = linear_model.LogisticRegression +_MODELS["random_forest"]["classification"] = ensemble.RandomForestClassifier +_MODELS["random_forest"]["regression"] = ensemble.RandomForestRegressor +_MODELS["extra_trees"]["classification"] = ensemble.ExtraTreesClassifier +_MODELS["extra_trees"]["regression"] = ensemble.ExtraTreesRegressor +_MODELS["gradient_boosting"]["classification"] = ensemble.GradientBoostingClassifier +_MODELS["gradient_boosting"]["regression"] = ensemble.GradientBoostingRegressor +_MODELS["adaboost"]["classification"] = ensemble.AdaBoostClassifier +_MODELS["adaboost"]["regression"] = ensemble.AdaBoostRegressor +_MODELS["ridge"]["classification"] = linear_model.RidgeClassifier +_MODELS["ridge"]["regression"] = linear_model.Ridge +_MODELS["svm"]["classification"] = svm.LinearSVC +_MODELS["svm"]["regression"] = svm.LinearSVR +_MODELS["decision_tree"]["classification"] = tree.DecisionTreeClassifier +_MODELS["decision_tree"]["regression"] = tree.DecisionTreeRegressor +_MODELS["lasso"]["regression"] = linear_model.Lasso +_MODELS["linear_regression"]["regression"] = linear_model.LinearRegression +_MODELS["naive_bayes"]["classification"] = naive_bayes.GaussianNB +_MODELS["knn"]["classification"] = neighbors.KNeighborsClassifier +_MODELS["knn"]["regression"] = neighbors.KNeighborsRegressor + +CLASSIFICATION_TASKS = ("binary_classification", "multi_class_classification", "multi_label_classification") +REGRESSION_TASKS = ("single_column_regression", "multi_column_regression") + + +@dataclass +class TabularMetrics: + """ + A class to calculate various metrics for different types of tabular tasks. + + Attributes: + ----------- + sub_task : str + The type of sub-task. It can be one of the following: + - "binary_classification" + - "multi_class_classification" + - "single_column_regression" + - "multi_column_regression" + - "multi_label_classification" + labels : Optional[List], optional + The list of labels for multi-class classification tasks (default is None). + + Methods: + -------- + __post_init__(): + Initializes the valid metrics based on the sub-task type. + + calculate(y_true, y_pred): + Calculates the metrics based on the true and predicted values. + + Parameters: + ----------- + y_true : array-like + True labels or values. + y_pred : array-like + Predicted labels or values. + + Returns: + -------- + dict + A dictionary with metric names as keys and their calculated values as values. 
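+
+    Example (illustrative; assumes standard scikit-learn metric behaviour):
+        >>> import numpy as np
+        >>> m = TabularMetrics(sub_task="binary_classification")
+        >>> y_true = np.array([0, 1, 1, 0])
+        >>> y_proba = np.array([[0.9, 0.1], [0.2, 0.8], [0.3, 0.7], [0.6, 0.4]])
+        >>> scores = m.calculate(y_true, y_proba)  # dict with auc, logloss, f1, ...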
+ """ + + sub_task: str + labels: Optional[List] = None + + def __post_init__(self): + if self.sub_task == "binary_classification": + self.valid_metrics = { + "auc": skmetrics.roc_auc_score, + "logloss": skmetrics.log_loss, + "f1": skmetrics.f1_score, + "accuracy": skmetrics.accuracy_score, + "precision": skmetrics.precision_score, + "recall": skmetrics.recall_score, + } + elif self.sub_task == "multi_class_classification": + self.valid_metrics = { + "logloss": partial(skmetrics.log_loss, labels=self.labels), + "accuracy": skmetrics.accuracy_score, + "mlogloss": partial(skmetrics.log_loss, labels=self.labels), + "f1_macro": partial(skmetrics.f1_score, average="macro", labels=self.labels), + "f1_micro": partial(skmetrics.f1_score, average="micro", labels=self.labels), + "f1_weighted": partial(skmetrics.f1_score, average="weighted", labels=self.labels), + "precision_macro": partial(skmetrics.precision_score, average="macro", labels=self.labels), + "precision_micro": partial(skmetrics.precision_score, average="micro", labels=self.labels), + "precision_weighted": partial(skmetrics.precision_score, average="weighted", labels=self.labels), + "recall_macro": partial(skmetrics.recall_score, average="macro", labels=self.labels), + "recall_micro": partial(skmetrics.recall_score, average="micro", labels=self.labels), + "recall_weighted": partial(skmetrics.recall_score, average="weighted", labels=self.labels), + } + elif self.sub_task in ("single_column_regression", "multi_column_regression"): + self.valid_metrics = { + "r2": skmetrics.r2_score, + "mse": skmetrics.mean_squared_error, + "mae": skmetrics.mean_absolute_error, + "rmse": partial(skmetrics.mean_squared_error, squared=False), + "rmsle": partial(skmetrics.mean_squared_log_error, squared=False), + } + elif self.sub_task == "multi_label_classification": + self.valid_metrics = { + "logloss": skmetrics.log_loss, + } + else: + raise ValueError("Invalid problem type") + + def calculate(self, y_true, y_pred): + metrics = {} + for metric_name, metric_func in self.valid_metrics.items(): + if self.sub_task == "binary_classification": + if metric_name == "auc": + metrics[metric_name] = metric_func(y_true, y_pred[:, 1]) + elif metric_name == "logloss": + metrics[metric_name] = metric_func(y_true, y_pred) + else: + metrics[metric_name] = metric_func(y_true, y_pred[:, 1] >= 0.5) + elif self.sub_task == "multi_class_classification": + if metric_name in ( + "accuracy", + "f1_macro", + "f1_micro", + "f1_weighted", + "precision_macro", + "precision_micro", + "precision_weighted", + "recall_macro", + "recall_micro", + "recall_weighted", + ): + metrics[metric_name] = metric_func(y_true, np.argmax(y_pred, axis=1)) + else: + metrics[metric_name] = metric_func(y_true, y_pred) + else: + if metric_name == "rmsle": + temp_pred = copy.deepcopy(y_pred) + temp_pred = np.clip(temp_pred, 0, None) + metrics[metric_name] = metric_func(y_true, temp_pred) + else: + metrics[metric_name] = metric_func(y_true, y_pred) + return metrics + + +class TabularModel: + """ + A class used to represent a Tabular Model for AutoTrain training. + + Attributes + ---------- + model : str + The name of the model to be used. + preprocessor : object + The preprocessor to be applied to the data. + sub_task : str + The sub-task type, either classification or regression. + params : dict + The parameters to be passed to the model. + use_predict_proba : bool + A flag indicating whether to use the predict_proba method. 
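+        (Regressors and the margin-based "svm"/"ridge" classifiers expose no
+        predict_proba, so _get_model() switches this flag off for them.)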
+ + Methods + ------- + _get_model(): + Retrieves the appropriate model based on the sub-task and model name. + """ + + def __init__(self, model, preprocessor, sub_task, params): + self.model = model + self.preprocessor = preprocessor + self.sub_task = sub_task + self.params = params + self.use_predict_proba = True + + _model = self._get_model() + if self.preprocessor is not None: + self.pipeline = pipeline.Pipeline([("preprocessor", self.preprocessor), ("model", _model)]) + else: + self.pipeline = pipeline.Pipeline([("model", _model)]) + + def _get_model(self): + if self.model in _MODELS: + if self.sub_task in CLASSIFICATION_TASKS: + if self.model in ("svm", "ridge"): + self.use_predict_proba = False + return _MODELS[self.model]["classification"](**self.params) + elif self.sub_task in REGRESSION_TASKS: + self.use_predict_proba = False + return _MODELS[self.model]["regression"](**self.params) + else: + raise ValueError("Invalid task") + else: + raise ValueError("Invalid model") + + +def get_params(trial, model, task): + if model == "xgboost": + params = { + "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True), + "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True), + "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True), + "subsample": trial.suggest_float("subsample", 0.1, 1.0), + "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0), + "max_depth": trial.suggest_int("max_depth", 1, 9), + "early_stopping_rounds": trial.suggest_int("early_stopping_rounds", 100, 500), + "n_estimators": trial.suggest_categorical("n_estimators", [7000, 15000, 20000]), + "tree_method": "hist", + "random_state": 42, + } + + return params + + if model == "logistic_regression": + if task in CLASSIFICATION_TASKS: + params = { + "C": trial.suggest_float("C", 1e-8, 1e3, log=True), + "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]), + "solver": trial.suggest_categorical("solver", ["liblinear", "saga"]), + "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]), + "n_jobs": -1, + } + return params + + raise ValueError("Task not supported") + + if model == "random_forest": + params = { + "n_estimators": trial.suggest_int("n_estimators", 10, 10000), + "max_depth": trial.suggest_int("max_depth", 2, 15), + "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2", None]), + "min_samples_split": trial.suggest_int("min_samples_split", 2, 20), + "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20), + "bootstrap": trial.suggest_categorical("bootstrap", [True, False]), + "n_jobs": -1, + } + if task in CLASSIFICATION_TASKS: + params["criterion"] = trial.suggest_categorical("criterion", ["gini", "entropy"]) + return params + if task in REGRESSION_TASKS: + params["criterion"] = trial.suggest_categorical( + "criterion", ["squared_error", "absolute_error", "poisson"] + ) + return params + raise ValueError("Task not supported") + + if model == "extra_trees": + params = { + "n_estimators": trial.suggest_int("n_estimators", 10, 10000), + "max_depth": trial.suggest_int("max_depth", 2, 15), + "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2", None]), + "min_samples_split": trial.suggest_int("min_samples_split", 2, 20), + "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20), + "bootstrap": trial.suggest_categorical("bootstrap", [True, False]), + "n_jobs": -1, + } + if task in CLASSIFICATION_TASKS: + params["criterion"] = 
trial.suggest_categorical("criterion", ["gini", "entropy"])
+            return params
+        if task in REGRESSION_TASKS:
+            params["criterion"] = trial.suggest_categorical("criterion", ["squared_error", "absolute_error"])
+            return params
+        raise ValueError("Task not supported")
+
+    if model == "decision_tree":
+        params = {
+            "max_depth": trial.suggest_int("max_depth", 1, 15),
+            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
+            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
+            "max_features": trial.suggest_categorical("max_features", ["auto", "sqrt", "log2", None]),
+            "splitter": trial.suggest_categorical("splitter", ["best", "random"]),
+        }
+        if task in CLASSIFICATION_TASKS:
+            params["criterion"] = trial.suggest_categorical("criterion", ["gini", "entropy"])
+            return params
+        if task in REGRESSION_TASKS:
+            params["criterion"] = trial.suggest_categorical(
+                "criterion", ["squared_error", "absolute_error", "friedman_mse", "poisson"]
+            )
+            return params
+        raise ValueError("Task not supported")
+
+    if model == "linear_regression":
+        if task in REGRESSION_TASKS:
+            params = {
+                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
+            }
+            return params
+        raise ValueError("Task not supported")
+
+    if model == "svm":
+        if task in CLASSIFICATION_TASKS:
+            params = {
+                "C": trial.suggest_float("C", 1e-8, 1e3, log=True),
+                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
+                "penalty": "l2",
+                "max_iter": trial.suggest_int("max_iter", 1000, 10000),
+            }
+            return params
+
+        if task in REGRESSION_TASKS:
+            params = {
+                "C": trial.suggest_float("C", 1e-8, 1e3, log=True),
+                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
+                "loss": trial.suggest_categorical("loss", ["epsilon_insensitive", "squared_epsilon_insensitive"]),
+                "epsilon": trial.suggest_float("epsilon", 1e-8, 1e-1, log=True),
+                "max_iter": trial.suggest_int("max_iter", 1000, 10000),
+            }
+            return params
+        raise ValueError("Task not supported")
+
+    if model == "ridge":
+        params = {
+            "alpha": trial.suggest_float("alpha", 1e-8, 1e3, log=True),
+            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
+            "max_iter": trial.suggest_int("max_iter", 1000, 10000),
+        }
+        if task in CLASSIFICATION_TASKS:
+            return params
+        if task in REGRESSION_TASKS:
+            return params
+        raise ValueError("Task not supported")
+
+    if model == "lasso":
+        if task in REGRESSION_TASKS:
+            params = {
+                "alpha": trial.suggest_float("alpha", 1e-8, 1e3, log=True),
+                "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
+                "max_iter": trial.suggest_int("max_iter", 1000, 10000),
+            }
+            return params
+        raise ValueError("Task not supported")
+
+    if model == "knn":
+        params = {
+            "n_neighbors": trial.suggest_int("n_neighbors", 1, 25),
+            "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
+            "algorithm": trial.suggest_categorical("algorithm", ["ball_tree", "kd_tree", "brute"]),
+            "leaf_size": trial.suggest_int("leaf_size", 1, 100),
+            "p": trial.suggest_categorical("p", [1, 2]),
+            "metric": trial.suggest_categorical("metric", ["minkowski", "euclidean", "manhattan"]),
+        }
+        if task in CLASSIFICATION_TASKS or task in REGRESSION_TASKS:
+            return params
+        raise ValueError("Task not supported")
+
+    raise ValueError("Invalid model")
+
+
+def get_imputer(imputer_name):
+    """
+    Returns an imputer object based on the specified imputer name.
+
+    Parameters:
+        imputer_name (str): The name of the imputer to use.
Can be one of the following: + - "median": Uses the median value for imputation. + - "mean": Uses the mean value for imputation. + - "most_frequent": Uses the most frequent value for imputation. + If None, returns None. + + Returns: + impute.SimpleImputer or None: An instance of SimpleImputer with the specified strategy, + or None if imputer_name is None. + + Raises: + ValueError: If an invalid imputer_name is provided. + """ + if imputer_name is None: + return None + if imputer_name == "median": + return impute.SimpleImputer(strategy="median") + if imputer_name == "mean": + return impute.SimpleImputer(strategy="mean") + if imputer_name == "most_frequent": + return impute.SimpleImputer(strategy="most_frequent") + raise ValueError("Invalid imputer") + + +def get_scaler(scaler_name): + """ + Returns a scaler object based on the provided scaler name. + + Parameters: + scaler_name (str): The name of the scaler to be returned. + Possible values are "standard", "minmax", "robust", and "normal". + If None, returns None. + + Returns: + scaler: An instance of the corresponding scaler from sklearn.preprocessing. + If the scaler_name is None, returns None. + + Raises: + ValueError: If the scaler_name is not one of the expected values. + """ + if scaler_name is None: + return None + if scaler_name == "standard": + return preprocessing.StandardScaler() + if scaler_name == "minmax": + return preprocessing.MinMaxScaler() + if scaler_name == "robust": + return preprocessing.RobustScaler() + if scaler_name == "normal": + return preprocessing.Normalizer() + raise ValueError("Invalid scaler") + + +def get_metric_direction(sub_task): + """ + Determines the appropriate metric and its optimization direction based on the given sub-task. + + Parameters: + sub_task (str): The type of sub-task. Must be one of the following: + - "binary_classification" + - "multi_class_classification" + - "single_column_regression" + - "multi_label_classification" + - "multi_column_regression" + + Returns: + tuple: A tuple containing: + - str: The metric to be used (e.g., "logloss", "mlogloss", "rmse"). + - str: The direction of optimization ("minimize"). + + Raises: + ValueError: If the provided sub_task is not one of the recognized types. + """ + if sub_task == "binary_classification": + return "logloss", "minimize" + if sub_task == "multi_class_classification": + return "mlogloss", "minimize" + if sub_task == "single_column_regression": + return "rmse", "minimize" + if sub_task == "multi_label_classification": + return "logloss", "minimize" + if sub_task == "multi_column_regression": + return "rmse", "minimize" + raise ValueError("Invalid sub_task") + + +def get_categorical_columns(df): + """ + Extracts the names of categorical columns from a DataFrame. + + Parameters: + df (pandas.DataFrame): The DataFrame from which to extract categorical columns. + + Returns: + list: A list of column names that are of categorical data type (either 'category' or 'object'). + """ + return list(df.select_dtypes(include=["category", "object"]).columns) + + +def get_numerical_columns(df): + """ + Extracts and returns a list of numerical column names from a given DataFrame. + + Args: + df (pandas.DataFrame): The DataFrame from which to extract numerical columns. + + Returns: + list: A list of column names that have numerical data types. 
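+
+    Example (illustrative):
+        >>> import pandas as pd
+        >>> get_numerical_columns(pd.DataFrame({"age": [1, 2], "city": ["a", "b"]}))
+        ['age']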
+ """ + return list(df.select_dtypes(include=["number"]).columns) + + +def create_model_card(config, sub_task, best_params, best_metrics): + """ + Generates a markdown formatted model card with the given configuration, sub-task, best parameters, and best metrics. + + Args: + config (object): Configuration object containing task and data path information. + sub_task (str): The specific sub-task for which the model card is being created. + best_params (dict): Dictionary containing the best hyperparameters for the model. + best_metrics (dict): Dictionary containing the best performance metrics for the model. + + Returns: + str: A string containing the formatted model card in markdown. + """ + best_metrics = "\n".join([f"- {k}: {v}" for k, v in best_metrics.items()]) + best_params = "\n".join([f"- {k}: {v}" for k, v in best_params.items()]) + return MARKDOWN.format( + task=config.task, + dataset=config.data_path, + metrics=best_metrics, + params=best_params, + ) diff --git a/src/autotrain/trainers/text_classification/__init__.py b/src/autotrain/trainers/text_classification/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/text_classification/__main__.py b/src/autotrain/trainers/text_classification/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..5b2cf67af5a072856f4677e1546dba476606e952 --- /dev/null +++ b/src/autotrain/trainers/text_classification/__main__.py @@ -0,0 +1,239 @@ +import argparse +import json + +from accelerate.state import PartialState +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.text_classification import utils +from autotrain.trainers.text_classification.dataset import TextClassificationDataset +from autotrain.trainers.text_classification.params import TextClassificationParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = TextClassificationParams(**config) + + train_data = None + valid_data = None + # check if config.train_split.csv exists in config.data_path + if config.train_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from 
disk") + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + classes = train_data.features[config.target_column].names + label2id = {c: i for i, c in enumerate(classes)} + num_classes = len(classes) + + if num_classes < 2: + raise ValueError("Invalid number of classes. Must be greater than 1.") + + if config.valid_split is not None: + num_classes_valid = len(valid_data.unique(config.target_column)) + if num_classes_valid != num_classes: + raise ValueError( + f"Number of classes in train and valid are not the same. Training has {num_classes} and valid has {num_classes_valid}" + ) + + model_config = AutoConfig.from_pretrained(config.model, num_labels=num_classes) + model_config._num_labels = len(label2id) + model_config.label2id = label2id + model_config.id2label = {v: k for k, v in label2id.items()} + + try: + model = AutoModelForSequenceClassification.from_pretrained( + config.model, + config=model_config, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + except OSError: + model = AutoModelForSequenceClassification.from_pretrained( + config.model, + config=model_config, + from_tf=True, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + + tokenizer = AutoTokenizer.from_pretrained(config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE) + train_data = TextClassificationDataset(data=train_data, tokenizer=tokenizer, config=config) + if config.valid_split is not None: + valid_data = TextClassificationDataset(data=valid_data, tokenizer=tokenizer, config=config) + + if config.logging_steps == -1: + if config.valid_split is not None: + logging_steps = int(0.2 * len(valid_data) / config.batch_size) + else: + logging_steps = int(0.2 * len(train_data) / config.batch_size) + if logging_steps == 0: + logging_steps = 1 + if logging_steps > 25: + logging_steps = 25 + config.logging_steps = logging_steps + else: + logging_steps = config.logging_steps + + logger.info(f"Logging steps: {logging_steps}") + + training_args = dict( + output_dir=config.project_name, + per_device_train_batch_size=config.batch_size, + per_device_eval_batch_size=2 * config.batch_size, + learning_rate=config.lr, + num_train_epochs=config.epochs, + eval_strategy=config.eval_strategy if config.valid_split is not None else "no", + logging_steps=logging_steps, + save_total_limit=config.save_total_limit, + save_strategy=config.eval_strategy if config.valid_split is not None else "no", + gradient_accumulation_steps=config.gradient_accumulation, + report_to=config.log, + auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, + ) + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + if config.valid_split is not None: + 
early_stop = EarlyStoppingCallback( + early_stopping_patience=config.early_stopping_patience, + early_stopping_threshold=config.early_stopping_threshold, + ) + callbacks_to_use = [early_stop] + else: + callbacks_to_use = [] + + callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()]) + + args = TrainingArguments(**training_args) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks_to_use, + compute_metrics=( + utils._binary_classification_metrics if num_classes == 2 else utils._multi_class_classification_metrics + ), + ) + + trainer = Trainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data, + ) + trainer.remove_callback(PrinterCallback) + trainer.train() + + logger.info("Finished training, saving model...") + trainer.save_model(config.project_name) + tokenizer.save_pretrained(config.project_name) + + model_card = utils.create_model_card(config, trainer, num_classes) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w") as f: + f.write(model_card) + + if config.push_to_hub: + if PartialState().process_index == 0: + remove_autotrain_data(config) + save_training_params(config) + logger.info("Pushing model to hub...") + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, + repo_id=f"{config.username}/{config.project_name}", + repo_type="model", + ) + + if PartialState().process_index == 0: + pause_space(config) + + +if __name__ == "__main__": + args = parse_args() + training_config = json.load(open(args.training_config)) + config = TextClassificationParams(**training_config) + train(config) diff --git a/src/autotrain/trainers/text_classification/dataset.py b/src/autotrain/trainers/text_classification/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b14303e741dcc51a6eb5877c39c8da3dfd45ec13 --- /dev/null +++ b/src/autotrain/trainers/text_classification/dataset.py @@ -0,0 +1,65 @@ +import torch + + +class TextClassificationDataset: + """ + A dataset class for text classification tasks. + + Args: + data (list): The dataset containing text and target columns. + tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data. + config (object): Configuration object containing dataset parameters. + + Attributes: + data (list): The dataset containing text and target columns. + tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data. + config (object): Configuration object containing dataset parameters. + text_column (str): The name of the column containing text data. + target_column (str): The name of the column containing target labels. + + Methods: + __len__(): Returns the number of samples in the dataset. + __getitem__(item): Returns a dictionary containing tokenized input ids, attention mask, token type ids (if available), and target labels for the given item index. 
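+
+    Example (illustrative; `hf_split`, `tok` and `cfg` are placeholders):
+        >>> ds = TextClassificationDataset(data=hf_split, tokenizer=tok, config=cfg)
+        >>> sample = ds[0]  # dict of input_ids / attention_mask / labels tensors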
+ """ + + def __init__(self, data, tokenizer, config): + self.data = data + self.tokenizer = tokenizer + self.config = config + self.text_column = self.config.text_column + self.target_column = self.config.target_column + + def __len__(self): + return len(self.data) + + def __getitem__(self, item): + text = str(self.data[item][self.text_column]) + target = self.data[item][self.target_column] + target = int(target) + inputs = self.tokenizer( + text, + max_length=self.config.max_seq_length, + padding="max_length", + truncation=True, + ) + + ids = inputs["input_ids"] + mask = inputs["attention_mask"] + + if "token_type_ids" in inputs: + token_type_ids = inputs["token_type_ids"] + else: + token_type_ids = None + + if token_type_ids is not None: + return { + "input_ids": torch.tensor(ids, dtype=torch.long), + "attention_mask": torch.tensor(mask, dtype=torch.long), + "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long), + "labels": torch.tensor(target, dtype=torch.long), + } + return { + "input_ids": torch.tensor(ids, dtype=torch.long), + "attention_mask": torch.tensor(mask, dtype=torch.long), + "labels": torch.tensor(target, dtype=torch.long), + } diff --git a/src/autotrain/trainers/text_classification/params.py b/src/autotrain/trainers/text_classification/params.py new file mode 100644 index 0000000000000000000000000000000000000000..b03758adad9e0faf286ba81d5c3c4613f80d08b3 --- /dev/null +++ b/src/autotrain/trainers/text_classification/params.py @@ -0,0 +1,72 @@ +from typing import Optional + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class TextClassificationParams(AutoTrainParams): + """ + [`TextClassificationParams`] is a configuration class for text classification training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to use. Default is "bert-base-uncased". + lr (float): Learning rate. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length. Default is 128. + batch_size (int): Training batch size. Default is 8. + warmup_ratio (float): Warmup proportion. Default is 0.1. + gradient_accumulation (int): Number of gradient accumulation steps. Default is 1. + optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Scheduler to use. Default is "linear". + weight_decay (float): Weight decay. Default is 0.0. + max_grad_norm (float): Maximum gradient norm. Default is 1.0. + seed (int): Random seed. Default is 42. + train_split (str): Name of the training split. Default is "train". + valid_split (Optional[str]): Name of the validation split. Default is None. + text_column (str): Name of the text column in the dataset. Default is "text". + target_column (str): Name of the target column in the dataset. Default is "target". + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project. Default is "project-name". + auto_find_batch_size (bool): Whether to automatically find the batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision setting (fp16, bf16, or None). Default is None. + save_total_limit (int): Total number of checkpoints to save. Default is 1. + token (Optional[str]): Hub token for authentication. Default is None. + push_to_hub (bool): Whether to push the model to the hub. Default is False. + eval_strategy (str): Evaluation strategy. Default is "epoch". + username (Optional[str]): Hugging Face username. Default is None. 
+ log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement after which training will be stopped. Default is 5. + early_stopping_threshold (float): Threshold for measuring the new optimum to continue training. Default is 0.01. + """ + + data_path: str = Field(None, title="Data path") + model: str = Field("bert-base-uncased", title="Model name") + lr: float = Field(5e-5, title="Learning rate") + epochs: int = Field(3, title="Number of training epochs") + max_seq_length: int = Field(128, title="Max sequence length") + batch_size: int = Field(8, title="Training batch size") + warmup_ratio: float = Field(0.1, title="Warmup proportion") + gradient_accumulation: int = Field(1, title="Gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer") + scheduler: str = Field("linear", title="Scheduler") + weight_decay: float = Field(0.0, title="Weight decay") + max_grad_norm: float = Field(1.0, title="Max gradient norm") + seed: int = Field(42, title="Seed") + train_split: str = Field("train", title="Train split") + valid_split: Optional[str] = Field(None, title="Validation split") + text_column: str = Field("text", title="Text column") + target_column: str = Field("target", title="Target column") + logging_steps: int = Field(-1, title="Logging steps") + project_name: str = Field("project-name", title="Output directory") + auto_find_batch_size: bool = Field(False, title="Auto find batch size") + mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") + save_total_limit: int = Field(1, title="Save total limit") + token: Optional[str] = Field(None, title="Hub Token") + push_to_hub: bool = Field(False, title="Push to hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy") + username: Optional[str] = Field(None, title="Hugging Face Username") + log: str = Field("none", title="Logging using experiment tracking") + early_stopping_patience: int = Field(5, title="Early stopping patience") + early_stopping_threshold: float = Field(0.01, title="Early stopping threshold") diff --git a/src/autotrain/trainers/text_classification/utils.py b/src/autotrain/trainers/text_classification/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..42ca7203bcd78681e353efd3b0aa57faa553d89d --- /dev/null +++ b/src/autotrain/trainers/text_classification/utils.py @@ -0,0 +1,179 @@ +import os + +import numpy as np +import requests +from sklearn import metrics + + +BINARY_CLASSIFICATION_EVAL_METRICS = ( + "eval_loss", + "eval_accuracy", + "eval_f1", + "eval_auc", + "eval_precision", + "eval_recall", +) + +MULTI_CLASS_CLASSIFICATION_EVAL_METRICS = ( + "eval_loss", + "eval_accuracy", + "eval_f1_macro", + "eval_f1_micro", + "eval_f1_weighted", + "eval_precision_macro", + "eval_precision_micro", + "eval_precision_weighted", + "eval_recall_macro", + "eval_recall_micro", + "eval_recall_weighted", +) + +MODEL_CARD = """ +--- +library_name: transformers +tags: +- autotrain +- text-classification{base_model} +widget: +- text: "I love AutoTrain"{dataset_tag} +--- + +# Model Trained Using AutoTrain + +- Problem type: Text Classification + +## Validation Metrics +{validation_metrics} +""" + + +def _binary_classification_metrics(pred): + """ + Calculate various binary classification metrics. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + - raw_predictions (numpy.ndarray): The raw prediction scores from the model. 
+ - labels (numpy.ndarray): The true labels. + + Returns: + dict: A dictionary containing the following metrics: + - "f1" (float): The F1 score. + - "precision" (float): The precision score. + - "recall" (float): The recall score. + - "auc" (float): The Area Under the ROC Curve (AUC) score. + - "accuracy" (float): The accuracy score. + """ + raw_predictions, labels = pred + predictions = np.argmax(raw_predictions, axis=1) + result = { + "f1": metrics.f1_score(labels, predictions), + "precision": metrics.precision_score(labels, predictions), + "recall": metrics.recall_score(labels, predictions), + "auc": metrics.roc_auc_score(labels, raw_predictions[:, 1]), + "accuracy": metrics.accuracy_score(labels, predictions), + } + return result + + +def _multi_class_classification_metrics(pred): + """ + Compute various classification metrics for multi-class classification. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + - raw_predictions (numpy.ndarray): The raw prediction scores for each class. + - labels (numpy.ndarray): The true labels. + + Returns: + dict: A dictionary containing the following metrics: + - "f1_macro": F1 score with macro averaging. + - "f1_micro": F1 score with micro averaging. + - "f1_weighted": F1 score with weighted averaging. + - "precision_macro": Precision score with macro averaging. + - "precision_micro": Precision score with micro averaging. + - "precision_weighted": Precision score with weighted averaging. + - "recall_macro": Recall score with macro averaging. + - "recall_micro": Recall score with micro averaging. + - "recall_weighted": Recall score with weighted averaging. + - "accuracy": Accuracy score. + """ + raw_predictions, labels = pred + predictions = np.argmax(raw_predictions, axis=1) + results = { + "f1_macro": metrics.f1_score(labels, predictions, average="macro"), + "f1_micro": metrics.f1_score(labels, predictions, average="micro"), + "f1_weighted": metrics.f1_score(labels, predictions, average="weighted"), + "precision_macro": metrics.precision_score(labels, predictions, average="macro"), + "precision_micro": metrics.precision_score(labels, predictions, average="micro"), + "precision_weighted": metrics.precision_score(labels, predictions, average="weighted"), + "recall_macro": metrics.recall_score(labels, predictions, average="macro"), + "recall_micro": metrics.recall_score(labels, predictions, average="micro"), + "recall_weighted": metrics.recall_score(labels, predictions, average="weighted"), + "accuracy": metrics.accuracy_score(labels, predictions), + } + return results + + +def create_model_card(config, trainer, num_classes): + """ + Generates a model card for a text classification model. + + Args: + config (object): Configuration object containing various settings and paths. + trainer (object): Trainer object used for evaluating the model. + num_classes (int): Number of classes in the classification task. + + Returns: + str: A formatted string representing the model card. 
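+
+    Example (illustrative):
+        >>> card = create_model_card(config, trainer, num_classes=2)
+        >>> # `card` starts with YAML front matter followed by validation metrics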
+ """ + if config.valid_split is not None: + eval_scores = trainer.evaluate() + valid_metrics = ( + BINARY_CLASSIFICATION_EVAL_METRICS if num_classes == 2 else MULTI_CLASS_CLASSIFICATION_EVAL_METRICS + ) + eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items() if k in valid_metrics] + eval_scores = "\n\n".join(eval_scores) + + else: + eval_scores = "No validation metrics available" + + if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path): + dataset_tag = "" + else: + dataset_tag = f"\ndatasets:\n- {config.data_path}" + + if os.path.isdir(config.model): + base_model = "" + else: + base_model = f"\nbase_model: {config.model}" + + model_card = MODEL_CARD.format( + dataset_tag=dataset_tag, + validation_metrics=eval_scores, + base_model=base_model, + ) + return model_card + + +def pause_endpoint(params): + """ + Pauses a Hugging Face endpoint using the provided parameters. + + This function constructs an API URL using the endpoint ID from the environment + variables, and sends a POST request to pause the specified endpoint. + + Args: + params (object): An object containing the following attribute: + - token (str): The authorization token required to authenticate the API request. + + Returns: + dict: The JSON response from the API call. + """ + endpoint_id = os.environ["ENDPOINT_ID"] + username = endpoint_id.split("/")[0] + project_name = endpoint_id.split("/")[1] + api_url = f"https://api.endpoints.huggingface.cloud/v2/endpoint/{username}/{project_name}/pause" + headers = {"Authorization": f"Bearer {params.token}"} + r = requests.post(api_url, headers=headers) + return r.json() diff --git a/src/autotrain/trainers/text_regression/__init__.py b/src/autotrain/trainers/text_regression/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/text_regression/__main__.py b/src/autotrain/trainers/text_regression/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..3425d3e56a1a4a87e7c9597c4db5ef67d5ae0936 --- /dev/null +++ b/src/autotrain/trainers/text_regression/__main__.py @@ -0,0 +1,229 @@ +import argparse +import json + +from accelerate.state import PartialState +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.text_regression import utils +from autotrain.trainers.text_regression.dataset import TextRegressionDataset +from autotrain.trainers.text_regression.params import TextRegressionParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = TextRegressionParams(**config) + + train_data = None + valid_data = None + # check if config.train_split.csv exists in config.data_path + if config.train_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + 
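+            # data prepared by AutoTrain itself was written with datasets'
+            # save_to_disk, so it is reloaded with load_from_disk rather than
+            # load_dataset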
logger.info("loading dataset from disk") + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + model_config = AutoConfig.from_pretrained( + config.model, + num_labels=1, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ) + model_config._num_labels = 1 + label2id = {"target": 0} + model_config.label2id = label2id + model_config.id2label = {v: k for k, v in label2id.items()} + + try: + model = AutoModelForSequenceClassification.from_pretrained( + config.model, + config=model_config, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + except OSError: + model = AutoModelForSequenceClassification.from_pretrained( + config.model, + config=model_config, + from_tf=True, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + + tokenizer = AutoTokenizer.from_pretrained(config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE) + train_data = TextRegressionDataset(data=train_data, tokenizer=tokenizer, config=config) + if config.valid_split is not None: + valid_data = TextRegressionDataset(data=valid_data, tokenizer=tokenizer, config=config) + + if config.logging_steps == -1: + if config.valid_split is not None: + logging_steps = int(0.2 * len(valid_data) / config.batch_size) + else: + logging_steps = int(0.2 * len(train_data) / config.batch_size) + if logging_steps == 0: + logging_steps = 1 + if logging_steps > 25: + logging_steps = 25 + config.logging_steps = logging_steps + else: + logging_steps = config.logging_steps + + logger.info(f"Logging steps: {logging_steps}") + + training_args = dict( + output_dir=config.project_name, + per_device_train_batch_size=config.batch_size, + per_device_eval_batch_size=2 * config.batch_size, + learning_rate=config.lr, + num_train_epochs=config.epochs, + eval_strategy=config.eval_strategy if config.valid_split is not None else "no", + logging_steps=logging_steps, + save_total_limit=config.save_total_limit, + save_strategy=config.eval_strategy if config.valid_split is not None else "no", + gradient_accumulation_steps=config.gradient_accumulation, + report_to=config.log, + auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, 
+ ) + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + if config.valid_split is not None: + early_stop = EarlyStoppingCallback( + early_stopping_patience=config.early_stopping_patience, + early_stopping_threshold=config.early_stopping_threshold, + ) + callbacks_to_use = [early_stop] + else: + callbacks_to_use = [] + + callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()]) + + args = TrainingArguments(**training_args) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks_to_use, + compute_metrics=utils.single_column_regression_metrics, + ) + + trainer = Trainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data, + ) + trainer.remove_callback(PrinterCallback) + trainer.train() + + logger.info("Finished training, saving model...") + trainer.save_model(config.project_name) + tokenizer.save_pretrained(config.project_name) + + model_card = utils.create_model_card(config, trainer) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w") as f: + f.write(model_card) + + if config.push_to_hub: + if PartialState().process_index == 0: + remove_autotrain_data(config) + save_training_params(config) + logger.info("Pushing model to hub...") + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, + repo_id=f"{config.username}/{config.project_name}", + repo_type="model", + ) + + if PartialState().process_index == 0: + pause_space(config) + + +if __name__ == "__main__": + args = parse_args() + training_config = json.load(open(args.training_config)) + config = TextRegressionParams(**training_config) + train(config) diff --git a/src/autotrain/trainers/text_regression/dataset.py b/src/autotrain/trainers/text_regression/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..abd54854ed1f289ed6fb183eb3d4009304af9b21 --- /dev/null +++ b/src/autotrain/trainers/text_regression/dataset.py @@ -0,0 +1,66 @@ +import torch + + +class TextRegressionDataset: + """ + A custom dataset class for text regression tasks for AutoTrain. + + Args: + data (list of dict): The dataset containing text and target values. + tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data. + config (object): Configuration object containing dataset parameters. + + Attributes: + data (list of dict): The dataset containing text and target values. + tokenizer (PreTrainedTokenizer): The tokenizer to preprocess the text data. + config (object): Configuration object containing dataset parameters. + text_column (str): The column name for text data in the dataset. + target_column (str): The column name for target values in the dataset. + max_len (int): The maximum sequence length for tokenized inputs. + + Methods: + __len__(): Returns the number of samples in the dataset. + __getitem__(item): Returns a dictionary containing tokenized inputs and target value for a given index. 
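+
+    Example (illustrative; `ds_split`, `tok` and `cfg` are placeholders):
+        >>> ds = TextRegressionDataset(data=ds_split, tokenizer=tok, config=cfg)
+        >>> ds[0]["labels"].dtype  # torch.float32, since regression targets are floats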
+ """ + + def __init__(self, data, tokenizer, config): + self.data = data + self.tokenizer = tokenizer + self.config = config + self.text_column = self.config.text_column + self.target_column = self.config.target_column + self.max_len = self.config.max_seq_length + + def __len__(self): + return len(self.data) + + def __getitem__(self, item): + text = str(self.data[item][self.text_column]) + target = float(self.data[item][self.target_column]) + inputs = self.tokenizer( + text, + max_length=self.max_len, + padding="max_length", + truncation=True, + ) + + ids = inputs["input_ids"] + mask = inputs["attention_mask"] + + if "token_type_ids" in inputs: + token_type_ids = inputs["token_type_ids"] + else: + token_type_ids = None + + if token_type_ids is not None: + return { + "input_ids": torch.tensor(ids, dtype=torch.long), + "attention_mask": torch.tensor(mask, dtype=torch.long), + "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long), + "labels": torch.tensor(target, dtype=torch.float), + } + return { + "input_ids": torch.tensor(ids, dtype=torch.long), + "attention_mask": torch.tensor(mask, dtype=torch.long), + "labels": torch.tensor(target, dtype=torch.float), + } diff --git a/src/autotrain/trainers/text_regression/params.py b/src/autotrain/trainers/text_regression/params.py new file mode 100644 index 0000000000000000000000000000000000000000..9c920a1b640000153eb463dfb4d5f01dd72332c8 --- /dev/null +++ b/src/autotrain/trainers/text_regression/params.py @@ -0,0 +1,72 @@ +from typing import Optional + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class TextRegressionParams(AutoTrainParams): + """ + TextRegressionParams is a configuration class for setting up text regression training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the pre-trained model to use. Default is "bert-base-uncased". + lr (float): Learning rate for the optimizer. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length for the inputs. Default is 128. + batch_size (int): Batch size for training. Default is 8. + warmup_ratio (float): Proportion of training to perform learning rate warmup. Default is 0.1. + gradient_accumulation (int): Number of steps to accumulate gradients before updating. Default is 1. + optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Learning rate scheduler to use. Default is "linear". + weight_decay (float): Weight decay to apply. Default is 0.0. + max_grad_norm (float): Maximum norm for the gradients. Default is 1.0. + seed (int): Random seed for reproducibility. Default is 42. + train_split (str): Name of the training data split. Default is "train". + valid_split (Optional[str]): Name of the validation data split. Default is None. + text_column (str): Name of the column containing text data. Default is "text". + target_column (str): Name of the column containing target data. Default is "target". + logging_steps (int): Number of steps between logging. Default is -1 (no logging). + project_name (str): Name of the project for output directory. Default is "project-name". + auto_find_batch_size (bool): Whether to automatically find the batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision training mode (fp16, bf16, or None). Default is None. + save_total_limit (int): Maximum number of checkpoints to save. Default is 1. + token (Optional[str]): Token for accessing Hugging Face Hub. 
Default is None. + push_to_hub (bool): Whether to push the model to Hugging Face Hub. Default is False. + eval_strategy (str): Evaluation strategy to use. Default is "epoch". + username (Optional[str]): Hugging Face username. Default is None. + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Number of epochs with no improvement after which training will be stopped. Default is 5. + early_stopping_threshold (float): Threshold for measuring the new optimum, to qualify as an improvement. Default is 0.01. + """ + + data_path: str = Field(None, title="Data path") + model: str = Field("bert-base-uncased", title="Model name") + lr: float = Field(5e-5, title="Learning rate") + epochs: int = Field(3, title="Number of training epochs") + max_seq_length: int = Field(128, title="Max sequence length") + batch_size: int = Field(8, title="Training batch size") + warmup_ratio: float = Field(0.1, title="Warmup proportion") + gradient_accumulation: int = Field(1, title="Gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer") + scheduler: str = Field("linear", title="Scheduler") + weight_decay: float = Field(0.0, title="Weight decay") + max_grad_norm: float = Field(1.0, title="Max gradient norm") + seed: int = Field(42, title="Seed") + train_split: str = Field("train", title="Train split") + valid_split: Optional[str] = Field(None, title="Validation split") + text_column: str = Field("text", title="Text column") + target_column: str = Field("target", title="Target column(s)") + logging_steps: int = Field(-1, title="Logging steps") + project_name: str = Field("project-name", title="Output directory") + auto_find_batch_size: bool = Field(False, title="Auto find batch size") + mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") + save_total_limit: int = Field(1, title="Save total limit") + token: Optional[str] = Field(None, title="Hub Token") + push_to_hub: bool = Field(False, title="Push to hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy") + username: Optional[str] = Field(None, title="Hugging Face Username") + log: str = Field("none", title="Logging using experiment tracking") + early_stopping_patience: int = Field(5, title="Early stopping patience") + early_stopping_threshold: float = Field(0.01, title="Early stopping threshold") diff --git a/src/autotrain/trainers/text_regression/utils.py b/src/autotrain/trainers/text_regression/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8d058a40c8891d0a2c154ce9dc274ffdbf5103e9 --- /dev/null +++ b/src/autotrain/trainers/text_regression/utils.py @@ -0,0 +1,118 @@ +import os + +import numpy as np +from sklearn import metrics + + +SINGLE_COLUMN_REGRESSION_EVAL_METRICS = ( + "eval_loss", + "eval_mse", + "eval_mae", + "eval_r2", + "eval_rmse", + "eval_explained_variance", +) + + +MODEL_CARD = """ +--- +tags: +- autotrain +- text-regression{base_model} +widget: +- text: "I love AutoTrain"{dataset_tag} +--- + +# Model Trained Using AutoTrain + +- Problem type: Text Regression + +## Validation Metrics +{validation_metrics} +""" + + +def single_column_regression_metrics(pred): + """ + Computes various regression metrics for a single column of predictions. + + Args: + pred (tuple): A tuple containing raw predictions and true labels. + The first element is an array-like of raw predictions, + and the second element is an array-like of true labels. 
+ + Returns: + dict: A dictionary containing the computed regression metrics: + - "mse": Mean Squared Error + - "mae": Mean Absolute Error + - "r2": R-squared Score + - "rmse": Root Mean Squared Error + - "explained_variance": Explained Variance Score + + Notes: + If any metric computation fails, the function will return a default value of -999 for that metric. + """ + raw_predictions, labels = pred + + def safe_compute(metric_func, default=-999): + try: + return metric_func(labels, raw_predictions) + except Exception: + return default + + pred_dict = { + "mse": safe_compute(lambda labels, predictions: metrics.mean_squared_error(labels, predictions)), + "mae": safe_compute(lambda labels, predictions: metrics.mean_absolute_error(labels, predictions)), + "r2": safe_compute(lambda labels, predictions: metrics.r2_score(labels, predictions)), + "rmse": safe_compute(lambda labels, predictions: np.sqrt(metrics.mean_squared_error(labels, predictions))), + "explained_variance": safe_compute( + lambda labels, predictions: metrics.explained_variance_score(labels, predictions) + ), + } + + for key, value in pred_dict.items(): + pred_dict[key] = float(value) + return pred_dict + + +def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing the following attributes: + - valid_split (optional): Validation split to evaluate the model. + - data_path (str): Path to the dataset. + - project_name (str): Name of the project. + - model (str): Path or identifier of the model. + trainer (object): Trainer object used to evaluate the model. + + Returns: + str: A formatted model card string containing dataset information, validation metrics, and base model details. 
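+
+    Example:
+        An illustrative rendering (the dataset path is hypothetical): with
+        ``config.model = "bert-base-uncased"`` and ``config.data_path = "user/reviews"``,
+        the card's front matter begins::
+
+            ---
+            tags:
+            - autotrain
+            - text-regression
+            base_model: bert-base-uncased
+            widget:
+            - text: "I love AutoTrain"
+            datasets:
+            - user/reviews
+            ---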
+ """ + if config.valid_split is not None: + eval_scores = trainer.evaluate() + eval_scores = [ + f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items() if k in SINGLE_COLUMN_REGRESSION_EVAL_METRICS + ] + eval_scores = "\n\n".join(eval_scores) + + else: + eval_scores = "No validation metrics available" + + if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path): + dataset_tag = "" + else: + dataset_tag = f"\ndatasets:\n- {config.data_path}" + + if os.path.isdir(config.model): + base_model = "" + else: + base_model = f"\nbase_model: {config.model}" + + model_card = MODEL_CARD.format( + dataset_tag=dataset_tag, + validation_metrics=eval_scores, + base_model=base_model, + ) + return model_card diff --git a/src/autotrain/trainers/token_classification/__init__.py b/src/autotrain/trainers/token_classification/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/token_classification/__main__.py b/src/autotrain/trainers/token_classification/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..ad5b461c3046e6ba174aebe987f0a44abb3140ca --- /dev/null +++ b/src/autotrain/trainers/token_classification/__main__.py @@ -0,0 +1,235 @@ +import argparse +import json +from functools import partial + +from accelerate.state import PartialState +from datasets import load_dataset, load_from_disk +from huggingface_hub import HfApi +from transformers import ( + AutoConfig, + AutoModelForTokenClassification, + AutoTokenizer, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + monitor, + pause_space, + remove_autotrain_data, + save_training_params, +) +from autotrain.trainers.token_classification import utils +from autotrain.trainers.token_classification.dataset import TokenClassificationDataset +from autotrain.trainers.token_classification.params import TokenClassificationParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = TokenClassificationParams(**config) + + train_data = None + valid_data = None + # check if config.train_split.csv exists in config.data_path + if config.train_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + logger.info("loading dataset from disk") + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + 
valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + label_list = train_data.features[config.tags_column].feature.names + num_classes = len(label_list) + + model_config = AutoConfig.from_pretrained(config.model, num_labels=num_classes) + model_config._num_labels = num_classes + model_config.label2id = {l: i for i, l in enumerate(label_list)} + model_config.id2label = dict(enumerate(label_list)) + + try: + model = AutoModelForTokenClassification.from_pretrained( + config.model, + config=model_config, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + except OSError: + model = AutoModelForTokenClassification.from_pretrained( + config.model, + config=model_config, + from_tf=True, + trust_remote_code=ALLOW_REMOTE_CODE, + token=config.token, + ignore_mismatched_sizes=True, + ) + + if model_config.model_type in {"bloom", "gpt2", "roberta"}: + tokenizer = AutoTokenizer.from_pretrained( + config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE, add_prefix_space=True + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE + ) + + train_data = TokenClassificationDataset(data=train_data, tokenizer=tokenizer, config=config) + if config.valid_split is not None: + valid_data = TokenClassificationDataset(data=valid_data, tokenizer=tokenizer, config=config) + + if config.logging_steps == -1: + if config.valid_split is not None: + logging_steps = int(0.2 * len(valid_data) / config.batch_size) + else: + logging_steps = int(0.2 * len(train_data) / config.batch_size) + if logging_steps == 0: + logging_steps = 1 + if logging_steps > 25: + logging_steps = 25 + config.logging_steps = logging_steps + else: + logging_steps = config.logging_steps + + logger.info(f"Logging steps: {logging_steps}") + + training_args = dict( + output_dir=config.project_name, + per_device_train_batch_size=config.batch_size, + per_device_eval_batch_size=2 * config.batch_size, + learning_rate=config.lr, + num_train_epochs=config.epochs, + eval_strategy=config.eval_strategy if config.valid_split is not None else "no", + logging_steps=logging_steps, + save_total_limit=config.save_total_limit, + save_strategy=config.eval_strategy if config.valid_split is not None else "no", + gradient_accumulation_steps=config.gradient_accumulation, + report_to=config.log, + auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, + ) + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + if config.valid_split is not None: + early_stop = EarlyStoppingCallback( + early_stopping_patience=config.early_stopping_patience, + early_stopping_threshold=config.early_stopping_threshold, + ) + callbacks_to_use = [early_stop] + else: + callbacks_to_use = [] + + callbacks_to_use.extend([UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()]) + + args = TrainingArguments(**training_args) 
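+    # the metric function needs the label names to map predicted ids back to tag strings for seqeval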
+ trainer_args = dict( + args=args, + model=model, + callbacks=callbacks_to_use, + compute_metrics=partial(utils.token_classification_metrics, label_list=label_list), + ) + + trainer = Trainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data, + ) + trainer.remove_callback(PrinterCallback) + trainer.train() + + logger.info("Finished training, saving model...") + trainer.save_model(config.project_name) + tokenizer.save_pretrained(config.project_name) + + model_card = utils.create_model_card(config, trainer) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w", encoding="utf-8") as f: + f.write(model_card) + + if config.push_to_hub: + if PartialState().process_index == 0: + remove_autotrain_data(config) + save_training_params(config) + logger.info("Pushing model to hub...") + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, + repo_id=f"{config.username}/{config.project_name}", + repo_type="model", + ) + + if PartialState().process_index == 0: + pause_space(config) + + +if __name__ == "__main__": + args = parse_args() + training_config = json.load(open(args.training_config)) + config = TokenClassificationParams(**training_config) + train(config) diff --git a/src/autotrain/trainers/token_classification/dataset.py b/src/autotrain/trainers/token_classification/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..907c8411234f648c48f33c86079bd7971460366d --- /dev/null +++ b/src/autotrain/trainers/token_classification/dataset.py @@ -0,0 +1,65 @@ +class TokenClassificationDataset: + """ + A dataset class for token classification tasks. + + Args: + data (Dataset): The dataset containing the text and tags. + tokenizer (PreTrainedTokenizer): The tokenizer to be used for tokenizing the text. + config (Config): Configuration object containing necessary parameters. + + Attributes: + data (Dataset): The dataset containing the text and tags. + tokenizer (PreTrainedTokenizer): The tokenizer to be used for tokenizing the text. + config (Config): Configuration object containing necessary parameters. + + Methods: + __len__(): + Returns the number of samples in the dataset. + + __getitem__(item): + Retrieves a tokenized sample and its corresponding labels. + + Args: + item (int): The index of the sample to retrieve. + + Returns: + dict: A dictionary containing tokenized text and corresponding labels. 
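+
+    Note:
+        Labels are aligned to word pieces: special tokens (word id ``None``)
+        receive the ignore index ``-100``, and every sub-token of a word is
+        assigned that word's tag, so sub-tokens also contribute to the loss.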
+ """ + + def __init__(self, data, tokenizer, config): + self.data = data + self.tokenizer = tokenizer + self.config = config + + def __len__(self): + return len(self.data) + + def __getitem__(self, item): + text = self.data[item][self.config.tokens_column] + tags = self.data[item][self.config.tags_column] + + label_list = self.data.features[self.config.tags_column].feature.names + label_to_id = {i: i for i in range(len(label_list))} + + tokenized_text = self.tokenizer( + text, + max_length=self.config.max_seq_length, + padding="max_length", + truncation=True, + is_split_into_words=True, + ) + + word_ids = tokenized_text.word_ids(batch_index=0) + previous_word_idx = None + label_ids = [] + for word_idx in word_ids: + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: + label_ids.append(label_to_id[tags[word_idx]]) + else: + label_ids.append(label_to_id[tags[word_idx]]) + previous_word_idx = word_idx + + tokenized_text["labels"] = label_ids + return tokenized_text diff --git a/src/autotrain/trainers/token_classification/params.py b/src/autotrain/trainers/token_classification/params.py new file mode 100644 index 0000000000000000000000000000000000000000..7ad22295ed9f6c8d43382c62561f555ed4f4f556 --- /dev/null +++ b/src/autotrain/trainers/token_classification/params.py @@ -0,0 +1,72 @@ +from typing import Optional + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class TokenClassificationParams(AutoTrainParams): + """ + TokenClassificationParams is a configuration class for token classification training parameters. + + Attributes: + data_path (str): Path to the dataset. + model (str): Name of the model to use. Default is "bert-base-uncased". + lr (float): Learning rate. Default is 5e-5. + epochs (int): Number of training epochs. Default is 3. + max_seq_length (int): Maximum sequence length. Default is 128. + batch_size (int): Training batch size. Default is 8. + warmup_ratio (float): Warmup proportion. Default is 0.1. + gradient_accumulation (int): Gradient accumulation steps. Default is 1. + optimizer (str): Optimizer to use. Default is "adamw_torch". + scheduler (str): Scheduler to use. Default is "linear". + weight_decay (float): Weight decay. Default is 0.0. + max_grad_norm (float): Maximum gradient norm. Default is 1.0. + seed (int): Random seed. Default is 42. + train_split (str): Name of the training split. Default is "train". + valid_split (Optional[str]): Name of the validation split. Default is None. + tokens_column (str): Name of the tokens column. Default is "tokens". + tags_column (str): Name of the tags column. Default is "tags". + logging_steps (int): Number of steps between logging. Default is -1. + project_name (str): Name of the project. Default is "project-name". + auto_find_batch_size (bool): Whether to automatically find the batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision setting (fp16, bf16, or None). Default is None. + save_total_limit (int): Total number of checkpoints to save. Default is 1. + token (Optional[str]): Hub token for authentication. Default is None. + push_to_hub (bool): Whether to push the model to the Hugging Face hub. Default is False. + eval_strategy (str): Evaluation strategy. Default is "epoch". + username (Optional[str]): Hugging Face username. Default is None. + log (str): Logging method for experiment tracking. Default is "none". + early_stopping_patience (int): Patience for early stopping. Default is 5. 
+ early_stopping_threshold (float): Threshold for early stopping. Default is 0.01. + """ + + data_path: str = Field(None, title="Data path") + model: str = Field("bert-base-uncased", title="Model name") + lr: float = Field(5e-5, title="Learning rate") + epochs: int = Field(3, title="Number of training epochs") + max_seq_length: int = Field(128, title="Max sequence length") + batch_size: int = Field(8, title="Training batch size") + warmup_ratio: float = Field(0.1, title="Warmup proportion") + gradient_accumulation: int = Field(1, title="Gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer") + scheduler: str = Field("linear", title="Scheduler") + weight_decay: float = Field(0.0, title="Weight decay") + max_grad_norm: float = Field(1.0, title="Max gradient norm") + seed: int = Field(42, title="Seed") + train_split: str = Field("train", title="Train split") + valid_split: Optional[str] = Field(None, title="Validation split") + tokens_column: str = Field("tokens", title="Tokens column") + tags_column: str = Field("tags", title="Tags column") + logging_steps: int = Field(-1, title="Logging steps") + project_name: str = Field("project-name", title="Output directory") + auto_find_batch_size: bool = Field(False, title="Auto find batch size") + mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") + save_total_limit: int = Field(1, title="Save total limit") + token: Optional[str] = Field(None, title="Hub Token") + push_to_hub: bool = Field(False, title="Push to hub") + eval_strategy: str = Field("epoch", title="Evaluation strategy") + username: Optional[str] = Field(None, title="Hugging Face Username") + log: str = Field("none", title="Logging using experiment tracking") + early_stopping_patience: int = Field(5, title="Early stopping patience") + early_stopping_threshold: float = Field(0.01, title="Early stopping threshold") diff --git a/src/autotrain/trainers/token_classification/utils.py b/src/autotrain/trainers/token_classification/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..aa2b80ede6f4a1cd3d021b08e5a8f99e9aae66be --- /dev/null +++ b/src/autotrain/trainers/token_classification/utils.py @@ -0,0 +1,98 @@ +import os + +import numpy as np +from seqeval import metrics + + +MODEL_CARD = """ +--- +library_name: transformers +tags: +- autotrain +- token-classification{base_model} +widget: +- text: "I love AutoTrain"{dataset_tag} +--- + +# Model Trained Using AutoTrain + +- Problem type: Token Classification + +## Validation Metrics +{validation_metrics} +""" + + +def token_classification_metrics(pred, label_list): + """ + Compute token classification metrics including precision, recall, F1 score, and accuracy. + + Args: + pred (tuple): A tuple containing predictions and labels. + Predictions should be a 3D array (batch_size, sequence_length, num_labels). + Labels should be a 2D array (batch_size, sequence_length). + label_list (list): A list of label names corresponding to the indices used in predictions and labels. + + Returns: + dict: A dictionary containing the following metrics: + - "precision": Precision score of the token classification. + - "recall": Recall score of the token classification. + - "f1": F1 score of the token classification. + - "accuracy": Accuracy score of the token classification. 
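+
+    Example:
+        An illustrative call: with ``label_list = ["O", "B-PER"]``, logits of
+        shape ``(1, 3, 2)`` and labels ``[[0, 1, -100]]``, the ``-100``
+        position is dropped before scoring, so seqeval evaluates only the
+        first two tokens.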
+ """ + predictions, labels = pred + predictions = np.argmax(predictions, axis=2) + + true_predictions = [ + [label_list[predi] for (predi, lbl) in zip(prediction, label) if lbl != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[lbl] for (predi, lbl) in zip(prediction, label) if lbl != -100] + for prediction, label in zip(predictions, labels) + ] + + results = { + "precision": metrics.precision_score(true_labels, true_predictions), + "recall": metrics.recall_score(true_labels, true_predictions), + "f1": metrics.f1_score(true_labels, true_predictions), + "accuracy": metrics.accuracy_score(true_labels, true_predictions), + } + return results + + +def create_model_card(config, trainer): + """ + Generates a model card string based on the provided configuration and trainer. + + Args: + config (object): Configuration object containing model and dataset information. + trainer (object): Trainer object used to evaluate the model. + + Returns: + str: A formatted model card string with dataset tags, validation metrics, and base model information. + """ + if config.valid_split is not None: + eval_scores = trainer.evaluate() + valid_metrics = ["eval_loss", "eval_precision", "eval_recall", "eval_f1", "eval_accuracy"] + eval_scores = [f"{k[len('eval_'):]}: {v}" for k, v in eval_scores.items() if k in valid_metrics] + eval_scores = "\n\n".join(eval_scores) + else: + eval_scores = "No validation metrics available" + + if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path): + dataset_tag = "" + else: + dataset_tag = f"\ndatasets:\n- {config.data_path}" + + if os.path.isdir(config.model): + base_model = "" + else: + base_model = f"\nbase_model: {config.model}" + + model_card = MODEL_CARD.format( + dataset_tag=dataset_tag, + validation_metrics=eval_scores, + base_model=base_model, + ) + return model_card diff --git a/src/autotrain/trainers/vlm/__init__.py b/src/autotrain/trainers/vlm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/vlm/__main__.py b/src/autotrain/trainers/vlm/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..d74a6215d2c455367bb08931fb35806bac386d77 --- /dev/null +++ b/src/autotrain/trainers/vlm/__main__.py @@ -0,0 +1,37 @@ +import argparse +import json + +from autotrain.trainers.common import monitor +from autotrain.trainers.vlm import utils +from autotrain.trainers.vlm.params import VLMTrainingParams + + +def parse_args(): + # get training_config.json from the end user + parser = argparse.ArgumentParser() + parser.add_argument("--training_config", type=str, required=True) + return parser.parse_args() + + +@monitor +def train(config): + if isinstance(config, dict): + config = VLMTrainingParams(**config) + + if not utils.check_model_support(config): + raise ValueError(f"model `{config.model}` not supported") + + if config.trainer in ("vqa", "captioning"): + from autotrain.trainers.vlm.train_vlm_generic import train as train_generic + + train_generic(config) + + else: + raise ValueError(f"trainer `{config.trainer}` not supported") + + +if __name__ == "__main__": + _args = parse_args() + training_config = json.load(open(_args.training_config)) + _config = VLMTrainingParams(**training_config) + train(_config) diff --git a/src/autotrain/trainers/vlm/dataset.py b/src/autotrain/trainers/vlm/dataset.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/autotrain/trainers/vlm/params.py b/src/autotrain/trainers/vlm/params.py new file mode 100644 index 0000000000000000000000000000000000000000..5e41e25ea07eb2ebe1deecc75b25d343d8e87e56 --- /dev/null +++ b/src/autotrain/trainers/vlm/params.py @@ -0,0 +1,101 @@ +from typing import Optional + +from pydantic import Field + +from autotrain.trainers.common import AutoTrainParams + + +class VLMTrainingParams(AutoTrainParams): + """ + VLMTrainingParams + + Attributes: + model (str): Model name. Default is "google/paligemma-3b-pt-224". + project_name (str): Output directory. Default is "project-name". + + data_path (str): Data path. Default is "data". + train_split (str): Train data config. Default is "train". + valid_split (Optional[str]): Validation data config. Default is None. + + trainer (str): Trainer type (captioning, vqa, segmentation, detection). Default is "vqa". + log (str): Logging using experiment tracking. Default is "none". + disable_gradient_checkpointing (bool): Gradient checkpointing. Default is False. + logging_steps (int): Logging steps. Default is -1. + eval_strategy (str): Evaluation strategy. Default is "epoch". + save_total_limit (int): Save total limit. Default is 1. + auto_find_batch_size (bool): Auto find batch size. Default is False. + mixed_precision (Optional[str]): Mixed precision (fp16, bf16, or None). Default is None. + lr (float): Learning rate. Default is 3e-5. + epochs (int): Number of training epochs. Default is 1. + batch_size (int): Training batch size. Default is 2. + warmup_ratio (float): Warmup proportion. Default is 0.1. + gradient_accumulation (int): Gradient accumulation steps. Default is 4. + optimizer (str): Optimizer. Default is "adamw_torch". + scheduler (str): Scheduler. Default is "linear". + weight_decay (float): Weight decay. Default is 0.0. + max_grad_norm (float): Max gradient norm. Default is 1.0. + seed (int): Seed. Default is 42. + + quantization (Optional[str]): Quantization (int4, int8, or None). Default is "int4". + target_modules (Optional[str]): Target modules. Default is "all-linear". + merge_adapter (bool): Merge adapter. Default is False. + peft (bool): Use PEFT. Default is False. + lora_r (int): Lora r. Default is 16. + lora_alpha (int): Lora alpha. Default is 32. + lora_dropout (float): Lora dropout. Default is 0.05. + + image_column (Optional[str]): Image column. Default is "image". + text_column (str): Text (answer) column. Default is "text". + prompt_text_column (Optional[str]): Prompt (prefix) column. Default is "prompt". + + push_to_hub (bool): Push to hub. Default is False. + username (Optional[str]): Hugging Face Username. Default is None. + token (Optional[str]): Huggingface token. Default is None. 
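+
+    Example:
+        An illustrative sketch (the dataset path is hypothetical)::
+
+            params = VLMTrainingParams(
+                model="google/paligemma-3b-pt-224",
+                data_path="user/vqa-dataset",
+                trainer="vqa",
+                peft=True,
+                quantization="int4",
+            )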
+ """ + + model: str = Field("google/paligemma-3b-pt-224", title="Model name") + project_name: str = Field("project-name", title="Output directory") + + # data params + data_path: str = Field("data", title="Data path") + train_split: str = Field("train", title="Train data config") + valid_split: Optional[str] = Field(None, title="Validation data config") + + # trainer params + trainer: str = Field("vqa", title="Trainer type") # captioning, vqa, segmentation, detection + log: str = Field("none", title="Logging using experiment tracking") + disable_gradient_checkpointing: bool = Field(False, title="Gradient checkpointing") + logging_steps: int = Field(-1, title="Logging steps") + eval_strategy: str = Field("epoch", title="Evaluation strategy") + save_total_limit: int = Field(1, title="Save total limit") + auto_find_batch_size: bool = Field(False, title="Auto find batch size") + mixed_precision: Optional[str] = Field(None, title="fp16, bf16, or None") + lr: float = Field(3e-5, title="Learning rate") + epochs: int = Field(1, title="Number of training epochs") + batch_size: int = Field(2, title="Training batch size") + warmup_ratio: float = Field(0.1, title="Warmup proportion") + gradient_accumulation: int = Field(4, title="Gradient accumulation steps") + optimizer: str = Field("adamw_torch", title="Optimizer") + scheduler: str = Field("linear", title="Scheduler") + weight_decay: float = Field(0.0, title="Weight decay") + max_grad_norm: float = Field(1.0, title="Max gradient norm") + seed: int = Field(42, title="Seed") + + # peft + quantization: Optional[str] = Field("int4", title="int4, int8, or None") + target_modules: Optional[str] = Field("all-linear", title="Target modules") + merge_adapter: bool = Field(False, title="Merge adapter") + peft: bool = Field(False, title="Use PEFT") + lora_r: int = Field(16, title="Lora r") + lora_alpha: int = Field(32, title="Lora alpha") + lora_dropout: float = Field(0.05, title="Lora dropout") + + # column mappings + image_column: Optional[str] = Field("image", title="Image column") + text_column: str = Field("text", title="Text (answer) column") + prompt_text_column: Optional[str] = Field("prompt", title="Prompt (prefix) column") + + # push to hub + push_to_hub: bool = Field(False, title="Push to hub") + username: Optional[str] = Field(None, title="Hugging Face Username") + token: Optional[str] = Field(None, title="Huggingface token") diff --git a/src/autotrain/trainers/vlm/train_vlm_generic.py b/src/autotrain/trainers/vlm/train_vlm_generic.py new file mode 100644 index 0000000000000000000000000000000000000000..fa3788cabb316ba9662bbaba44ad846d19069dd5 --- /dev/null +++ b/src/autotrain/trainers/vlm/train_vlm_generic.py @@ -0,0 +1,98 @@ +from functools import partial + +from datasets import load_dataset, load_from_disk +from transformers import AutoProcessor, Trainer, TrainingArguments +from transformers.trainer_callback import PrinterCallback + +from autotrain import logger +from autotrain.trainers.common import ALLOW_REMOTE_CODE +from autotrain.trainers.vlm import utils + + +def collate_fn(examples, config, processor): + prompts = ["answer " + example[config.prompt_text_column] for example in examples] + labels = [example[config.text_column] for example in examples] + images = [example[config.image_column].convert("RGB") for example in examples] + tokens = processor( + text=prompts, + images=images, + suffix=labels, + return_tensors="pt", + padding="longest", + tokenize_newline_separately=False, + ) + return tokens + + +def train(config): + valid_data = 
None + if config.data_path == f"{config.project_name}/autotrain-data": + train_data = load_from_disk(config.data_path)[config.train_split] + else: + if ":" in config.train_split: + dataset_config_name, split = config.train_split.split(":") + train_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + ) + else: + train_data = load_dataset( + config.data_path, + split=config.train_split, + token=config.token, + ) + + if config.valid_split is not None: + if config.data_path == f"{config.project_name}/autotrain-data": + valid_data = load_from_disk(config.data_path)[config.valid_split] + else: + if ":" in config.valid_split: + dataset_config_name, split = config.valid_split.split(":") + valid_data = load_dataset( + config.data_path, + name=dataset_config_name, + split=split, + token=config.token, + ) + else: + valid_data = load_dataset( + config.data_path, + split=config.valid_split, + token=config.token, + ) + + logger.info(f"Train data: {train_data}") + logger.info(f"Valid data: {valid_data}") + + if config.trainer == "captioning": + config.prompt_text_column = "caption" + + processor = AutoProcessor.from_pretrained(config.model, token=config.token, trust_remote_code=ALLOW_REMOTE_CODE) + + logging_steps = utils.configure_logging_steps(config, train_data, valid_data) + training_args = utils.configure_training_args(config, logging_steps) + + args = TrainingArguments(**training_args) + model = utils.get_model(config) + + logger.info("creating trainer") + callbacks = utils.get_callbacks(config) + trainer_args = dict( + args=args, + model=model, + callbacks=callbacks, + ) + + col_fn = partial(collate_fn, config=config, processor=processor) + + trainer = Trainer( + **trainer_args, + train_dataset=train_data, + eval_dataset=valid_data if valid_data is not None else None, + data_collator=col_fn, + ) + trainer.remove_callback(PrinterCallback) + trainer.train() + utils.post_training_steps(config, trainer) diff --git a/src/autotrain/trainers/vlm/utils.py b/src/autotrain/trainers/vlm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3826a7822f8843e0eed563ba53909b30dae49e --- /dev/null +++ b/src/autotrain/trainers/vlm/utils.py @@ -0,0 +1,329 @@ +import os + +import torch +from accelerate import PartialState +from huggingface_hub import HfApi +from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training +from transformers import AutoConfig, BitsAndBytesConfig, PaliGemmaForConditionalGeneration + +from autotrain import logger +from autotrain.trainers.common import ( + ALLOW_REMOTE_CODE, + LossLoggingCallback, + TrainStartCallback, + UploadLogs, + pause_space, + remove_autotrain_data, + save_training_params, +) + + +TARGET_MODULES = {} + +SUPPORTED_MODELS = [ + "PaliGemmaForConditionalGeneration", + # "Florence2ForConditionalGeneration", support later +] + +MODEL_CARD = """ +--- +tags: +- autotrain +- text-generation-inference +- image-text-to-text +- text-generation{peft} +library_name: transformers{base_model} +license: other{dataset_tag} +--- + +# Model Trained Using AutoTrain + +This model was trained using AutoTrain. For more information, please visit [AutoTrain](https://hf.co/docs/autotrain). 
+
+# Usage
+
+```python
+# you will need to adjust code if you didn't use peft
+
+from PIL import Image
+from transformers import PaliGemmaForConditionalGeneration, PaliGemmaProcessor
+import torch
+import requests
+from peft import PeftModel
+
+base_model_id = BASE_MODEL_ID
+peft_model_id = THIS_MODEL_ID
+max_new_tokens = 100
+text = "What's on the flower?"
+img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/bee.JPG?download=true"
+image = Image.open(requests.get(img_url, stream=True).raw)
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+base_model = PaliGemmaForConditionalGeneration.from_pretrained(base_model_id)
+processor = PaliGemmaProcessor.from_pretrained(base_model_id)
+
+model = PeftModel.from_pretrained(base_model, peft_model_id)
+model = model.merge_and_unload()
+
+model = model.eval().to(device)
+
+inputs = processor(text=text, images=image, return_tensors="pt").to(device)
+with torch.inference_mode():
+    generated_ids = model.generate(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=False,
+    )
+result = processor.batch_decode(generated_ids, skip_special_tokens=True)
+print(result)
+```
+"""
+
+
+def get_target_modules(config):
+    if config.target_modules is None:
+        return TARGET_MODULES.get(config.model)
+    if config.target_modules.strip() == "":
+        return TARGET_MODULES.get(config.model)
+    if config.target_modules.strip().lower() == "all-linear":
+        return "all-linear"
+    return config.target_modules.split(",")
+
+
+def create_model_card(config):
+    if config.peft:
+        peft = "\n- peft"
+    else:
+        peft = ""
+
+    if config.data_path == f"{config.project_name}/autotrain-data" or os.path.isdir(config.data_path):
+        dataset_tag = ""
+    else:
+        dataset_tag = f"\ndatasets:\n- {config.data_path}"
+
+    if os.path.isdir(config.model):
+        base_model = ""
+    else:
+        base_model = f"\nbase_model: {config.model}"
+
+    model_card = MODEL_CARD.format(
+        dataset_tag=dataset_tag,
+        peft=peft,
+        base_model=base_model,
+    )
+    return model_card.strip()
+
+
+def check_model_support(config):
+    api = HfApi(token=config.token)
+    model_info = api.model_info(config.model)
+    architectures = model_info.config.get("architectures", [])
+    for arch in architectures:
+        if arch in SUPPORTED_MODELS:
+            return True
+    return False
+
+
+def configure_logging_steps(config, train_data, valid_data):
+    logger.info("configuring logging steps")
+    if config.logging_steps == -1:
+        if config.valid_split is not None:
+            logging_steps = int(0.2 * len(valid_data) / config.batch_size)
+        else:
+            logging_steps = int(0.2 * len(train_data) / config.batch_size)
+        if logging_steps == 0:
+            logging_steps = 1
+        if logging_steps > 25:
+            logging_steps = 25
+        config.logging_steps = logging_steps
+    else:
+        logging_steps = config.logging_steps
+    logger.info(f"Logging steps: {logging_steps}")
+    return logging_steps
+
+
+def configure_training_args(config, logging_steps):
+    logger.info("configuring training args")
+    training_args = dict(
+        output_dir=config.project_name,
+        per_device_train_batch_size=config.batch_size,
+        per_device_eval_batch_size=config.batch_size,
+        learning_rate=config.lr,
+        num_train_epochs=config.epochs,
+        eval_strategy=config.eval_strategy if config.valid_split is not None else "no",
+        logging_steps=logging_steps,
+        save_total_limit=config.save_total_limit,
+        save_strategy=config.eval_strategy if config.valid_split is not None else "no",
+        gradient_accumulation_steps=config.gradient_accumulation,
+        report_to=config.log,
+
auto_find_batch_size=config.auto_find_batch_size, + lr_scheduler_type=config.scheduler, + optim=config.optimizer, + warmup_ratio=config.warmup_ratio, + weight_decay=config.weight_decay, + max_grad_norm=config.max_grad_norm, + push_to_hub=False, + load_best_model_at_end=True if config.valid_split is not None else False, + ddp_find_unused_parameters=False, + gradient_checkpointing=not config.disable_gradient_checkpointing, + remove_unused_columns=False, + ) + + if not config.disable_gradient_checkpointing: + if config.peft and config.quantization in ("int4", "int8"): + training_args["gradient_checkpointing_kwargs"] = {"use_reentrant": True} + else: + training_args["gradient_checkpointing_kwargs"] = {"use_reentrant": False} + + if config.mixed_precision == "fp16": + training_args["fp16"] = True + if config.mixed_precision == "bf16": + training_args["bf16"] = True + + return training_args + + +def get_callbacks(config): + callbacks = [UploadLogs(config=config), LossLoggingCallback(), TrainStartCallback()] + return callbacks + + +def get_model(config): + logger.info("loading model config...") + model_config = AutoConfig.from_pretrained( + config.model, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + use_cache=config.disable_gradient_checkpointing, + ) + + logger.info("loading model...") + if config.peft: + if config.quantization == "int4": + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=False, + ) + elif config.quantization == "int8": + bnb_config = BitsAndBytesConfig(load_in_8bit=True) + else: + bnb_config = None + + model = PaliGemmaForConditionalGeneration.from_pretrained( + config.model, + config=model_config, + token=config.token, + quantization_config=bnb_config, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + else: + model = PaliGemmaForConditionalGeneration.from_pretrained( + config.model, + config=model_config, + token=config.token, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + logger.info(f"model dtype: {model.dtype}") + + if config.peft: + logger.info("preparing peft model...") + if config.quantization is not None: + gradient_checkpointing_kwargs = {} + if not config.disable_gradient_checkpointing: + if config.quantization in ("int4", "int8"): + gradient_checkpointing_kwargs = {"use_reentrant": True} + else: + gradient_checkpointing_kwargs = {"use_reentrant": False} + model = prepare_model_for_kbit_training( + model, + use_gradient_checkpointing=not config.disable_gradient_checkpointing, + gradient_checkpointing_kwargs=gradient_checkpointing_kwargs, + ) + else: + model.enable_input_require_grads() + + peft_config = LoraConfig( + r=config.lora_r, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + bias="none", + task_type="CAUSAL_LM", + target_modules=get_target_modules(config), + ) + model = get_peft_model(model, peft_config) + + for param in model.vision_tower.parameters(): + param.requires_grad = False + + for param in model.multi_modal_projector.parameters(): + param.requires_grad = False + + return model + + +def merge_adapter(base_model_path, target_model_path, adapter_path): + logger.info("Loading adapter...") + model = PaliGemmaForConditionalGeneration.from_pretrained( + base_model_path, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + trust_remote_code=ALLOW_REMOTE_CODE, + ) + + model = PeftModel.from_pretrained(model, adapter_path) + model = model.merge_and_unload() + + logger.info("Saving target model...") + 
model.save_pretrained(target_model_path) + + +def post_training_steps(config, trainer): + logger.info("Finished training, saving model...") + trainer.model.config.use_cache = True + trainer.save_model(config.project_name) + + model_card = create_model_card(config) + + # save model card to output directory as README.md + with open(f"{config.project_name}/README.md", "w", encoding="utf-8") as f: + f.write(model_card) + + if config.peft and config.merge_adapter: + logger.info("Merging adapter weights...") + try: + del trainer + torch.cuda.empty_cache() + merge_adapter( + base_model_path=config.model, + target_model_path=config.project_name, + adapter_path=config.project_name, + ) + # remove adapter weights: adapter_* + for file in os.listdir(config.project_name): + if file.startswith("adapter_"): + os.remove(f"{config.project_name}/{file}") + except Exception as e: + logger.warning(f"Failed to merge adapter weights: {e}") + logger.warning("Skipping adapter merge. Only adapter weights will be saved.") + + if config.push_to_hub: + if PartialState().process_index == 0: + # remove data folder + remove_autotrain_data(config) + logger.info("Pushing model to hub...") + save_training_params(config) + api = HfApi(token=config.token) + api.create_repo( + repo_id=f"{config.username}/{config.project_name}", repo_type="model", private=True, exist_ok=True + ) + api.upload_folder( + folder_path=config.project_name, + repo_id=f"{config.username}/{config.project_name}", + repo_type="model", + ) + + if PartialState().process_index == 0: + pause_space(config) diff --git a/src/autotrain/utils.py b/src/autotrain/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e26cfb2bef408a05c67cb1dd1ab1b186bf617814 --- /dev/null +++ b/src/autotrain/utils.py @@ -0,0 +1,79 @@ +import json +import os +import subprocess + +from autotrain.commands import launch_command +from autotrain.trainers.clm.params import LLMTrainingParams +from autotrain.trainers.extractive_question_answering.params import ExtractiveQuestionAnsweringParams +from autotrain.trainers.generic.params import GenericParams +from autotrain.trainers.image_classification.params import ImageClassificationParams +from autotrain.trainers.image_regression.params import ImageRegressionParams +from autotrain.trainers.object_detection.params import ObjectDetectionParams +from autotrain.trainers.sent_transformers.params import SentenceTransformersParams +from autotrain.trainers.seq2seq.params import Seq2SeqParams +from autotrain.trainers.tabular.params import TabularParams +from autotrain.trainers.text_classification.params import TextClassificationParams +from autotrain.trainers.text_regression.params import TextRegressionParams +from autotrain.trainers.token_classification.params import TokenClassificationParams +from autotrain.trainers.vlm.params import VLMTrainingParams + + +ALLOW_REMOTE_CODE = os.environ.get("ALLOW_REMOTE_CODE", "true").lower() == "true" + + +def run_training(params, task_id, local=False, wait=False): + """ + Run the training process based on the provided parameters and task ID. + + Args: + params (str): JSON string of the parameters required for training. + task_id (int): Identifier for the type of task to be performed. + local (bool, optional): Flag to indicate if the training should be run locally. Defaults to False. + wait (bool, optional): Flag to indicate if the function should wait for the process to complete. Defaults to False. + + Returns: + int: Process ID of the launched training process. 
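+
+    Example:
+        An illustrative sketch (parameter values are hypothetical; ``task_id=10``
+        maps to ``TextRegressionParams`` in the dispatch below)::
+
+            import json
+
+            params = json.dumps({"data_path": "user/reviews", "model": "bert-base-uncased", "project_name": "my-project"})
+            pid = run_training(params, task_id=10, wait=True)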
+ + Raises: + NotImplementedError: If the task_id does not match any of the predefined tasks. + """ + params = json.loads(params) + if isinstance(params, str): + params = json.loads(params) + if task_id == 9: + params = LLMTrainingParams(**params) + elif task_id == 28: + params = Seq2SeqParams(**params) + elif task_id in (1, 2): + params = TextClassificationParams(**params) + elif task_id in (13, 14, 15, 16, 26): + params = TabularParams(**params) + elif task_id == 27: + params = GenericParams(**params) + elif task_id == 18: + params = ImageClassificationParams(**params) + elif task_id == 4: + params = TokenClassificationParams(**params) + elif task_id == 10: + params = TextRegressionParams(**params) + elif task_id == 29: + params = ObjectDetectionParams(**params) + elif task_id == 30: + params = SentenceTransformersParams(**params) + elif task_id == 24: + params = ImageRegressionParams(**params) + elif task_id == 31: + params = VLMTrainingParams(**params) + elif task_id == 5: + params = ExtractiveQuestionAnsweringParams(**params) + else: + raise NotImplementedError + + params.save(output_dir=params.project_name) + cmd = launch_command(params=params) + cmd = [str(c) for c in cmd] + env = os.environ.copy() + process = subprocess.Popen(cmd, env=env) + if wait: + process.wait() + return process.pid diff --git a/static/autotrain_homepage.png b/static/autotrain_homepage.png new file mode 100644 index 0000000000000000000000000000000000000000..eb95ae8a857e89d184866d88c0fe3e4e384be63b --- /dev/null +++ b/static/autotrain_homepage.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6feb954885fa0330b5e9920c5ec83fa938c12a249f87983f2929f66279cbe96f +size 605846 diff --git a/static/autotrain_model_choice.png b/static/autotrain_model_choice.png new file mode 100644 index 0000000000000000000000000000000000000000..723b35a974d630cfca459275a1c7b39a486cd63f --- /dev/null +++ b/static/autotrain_model_choice.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93dab6be0fa708a2f0c5ff63355b61dbc0af0daa41ab32db54a04d67452782ac +size 221281 diff --git a/static/autotrain_space.png b/static/autotrain_space.png new file mode 100644 index 0000000000000000000000000000000000000000..fba080cbf86fbf0ad322a43bba896a069ff6aabf --- /dev/null +++ b/static/autotrain_space.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31e36fee395ed6fd11776bd5bf5cad11c6d165e50721499a7f4eace48340908a +size 410983 diff --git a/static/autotrain_text_classification.png b/static/autotrain_text_classification.png new file mode 100644 index 0000000000000000000000000000000000000000..6a664f4f17618f8102c331cb12d7f0eae3d02383 --- /dev/null +++ b/static/autotrain_text_classification.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b72caa4084f41cd376b68df8fc4a52935d40b2746ff1a43a319d3cad5c9398d9 +size 419419 diff --git a/static/cost.png b/static/cost.png new file mode 100644 index 0000000000000000000000000000000000000000..7e338eb073de1b7158175ad3d1e36013b7a4e075 --- /dev/null +++ b/static/cost.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1473adcb2e75b3f25349293357343a9702a0c66c41c22e192e235a673b757ea +size 209167 diff --git a/static/dreambooth1.jpeg b/static/dreambooth1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..b14c5638d3986b56a8f6fdd335654ef04ef2a956 --- /dev/null +++ b/static/dreambooth1.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:66dbab88a6cf4aec38202b6d4f99e51bf51178b96f59c9f42b042e2d014674cf +size 617270 diff --git a/static/dreambooth2.png b/static/dreambooth2.png new file mode 100644 index 0000000000000000000000000000000000000000..1e38f2636ea751d791721624ff519ca1b1872fef --- /dev/null +++ b/static/dreambooth2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fd22803fca7b8b05e9cb5785de88bae7e297720508fffebf4d013e841a4eebd +size 327356 diff --git a/static/duplicate_space.png b/static/duplicate_space.png new file mode 100644 index 0000000000000000000000000000000000000000..4cb69b9c7a73ffbda255f6c3cc68a8fb631bc2d4 --- /dev/null +++ b/static/duplicate_space.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f6d77e4abef8a3da064d350e7e7d6aeabf7af563b632e5bc04118257950b959 +size 206177 diff --git a/static/ext_qa.png b/static/ext_qa.png new file mode 100644 index 0000000000000000000000000000000000000000..379b03165285ac5b9148a731e7cdd2d9e4ab6f62 --- /dev/null +++ b/static/ext_qa.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6d74c92f16559b5114fd5631e7df2d793f12049ab7fb86ed1f82ef3e63bbf1f +size 352513 diff --git a/static/hub_model_choice.png b/static/hub_model_choice.png new file mode 100644 index 0000000000000000000000000000000000000000..f5dca2ba4e43a52a329d9aaa7d57ed87765e8b56 --- /dev/null +++ b/static/hub_model_choice.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:990d8cec2ff4f7ced4bad127f414cd63fc1b27dd35d0764285eea9e4de715628 +size 211561 diff --git a/static/image_classification_1.png b/static/image_classification_1.png new file mode 100644 index 0000000000000000000000000000000000000000..2afbc3cfa39a5c77e09882da6aa0f04d11b468e5 --- /dev/null +++ b/static/image_classification_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ab829b683a332569aa88d754a72215838a5716d12400f8a072d87308a0973e1 +size 183257 diff --git a/static/img_reg_ui.png b/static/img_reg_ui.png new file mode 100644 index 0000000000000000000000000000000000000000..aa0116d49208c0e1458c3b24f160cedd8a2be7c5 --- /dev/null +++ b/static/img_reg_ui.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18a17ae1a0ca57ed634ce94f477f64f9f3d79804712d6eaed1937c1cfd458c1b +size 353164 diff --git a/static/llm_1.png b/static/llm_1.png new file mode 100644 index 0000000000000000000000000000000000000000..7331648ec2ec9d6264d4b10f38ef7ea6d6bf2c8e --- /dev/null +++ b/static/llm_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b561e5fac1eb7d9c9af6b235618eb812aa297f839068432b876be3a7e2c84b86 +size 216072 diff --git a/static/llm_2.png b/static/llm_2.png new file mode 100644 index 0000000000000000000000000000000000000000..ac826de7c658f7f3a500cb639504c5c0c8417d71 --- /dev/null +++ b/static/llm_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bda654a95ca873bdf38d0949ffee6862b9e398b9f34e680f9c524435f55074c0 +size 223052 diff --git a/static/llm_3.png b/static/llm_3.png new file mode 100644 index 0000000000000000000000000000000000000000..d6318b63db7596fe963bc0fff6b9706d515b077e --- /dev/null +++ b/static/llm_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:433db8f262bc984b6e16463053fe24b9baf76ca1223c1348e643338d2b5e1018 +size 226568 diff --git a/static/llm_orpo_example.png b/static/llm_orpo_example.png new file mode 100644 index 0000000000000000000000000000000000000000..d12a54866167d79f57525f3fb59a06c2fd615f75 --- /dev/null +++ 
b/static/llm_orpo_example.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:241a7954b2c7e92988e32ddc85acb156b9c5d2286db1c91ab493d931eee52d6a +size 405188 diff --git a/static/logo.png b/static/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..02cbc2b4cefdd07c8d1593a48d2b518757320443 Binary files /dev/null and b/static/logo.png differ diff --git a/static/model_choice_1.png b/static/model_choice_1.png new file mode 100644 index 0000000000000000000000000000000000000000..723b35a974d630cfca459275a1c7b39a486cd63f --- /dev/null +++ b/static/model_choice_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93dab6be0fa708a2f0c5ff63355b61dbc0af0daa41ab32db54a04d67452782ac +size 221281 diff --git a/static/param_choice_1.png b/static/param_choice_1.png new file mode 100644 index 0000000000000000000000000000000000000000..94e269518c74e4726ff25e4355325580ce7e88f9 --- /dev/null +++ b/static/param_choice_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508f351dbad97435e96521ac5cc040c3480a4bccf6ab6a5124dfdc5bb58354bb +size 198177 diff --git a/static/param_choice_2.png b/static/param_choice_2.png new file mode 100644 index 0000000000000000000000000000000000000000..bd93da3018706d0ab8a9b501997ce98e14552d3a --- /dev/null +++ b/static/param_choice_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ef392f7e9526234cf1af5e40cb50a57046e57b97f7c57a19869acfcc7b42bff +size 268673 diff --git a/static/space_template_1.png b/static/space_template_1.png new file mode 100644 index 0000000000000000000000000000000000000000..95b2754cbaf8e2e7afcf6c17eddd1b250bb3578c --- /dev/null +++ b/static/space_template_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90d428615e010adfc98b75caecdd1aaa28b3d8a302e1502c02f59d4a4c2b1163 +size 265036 diff --git a/static/space_template_2.png b/static/space_template_2.png new file mode 100644 index 0000000000000000000000000000000000000000..5435c10a596f6bebe1b332ba5947162d94c7022a --- /dev/null +++ b/static/space_template_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb4348b0d9c76cf8ac37ed9ab12fc99fb5105980013b297acdc948fab008634 +size 302465 diff --git a/static/space_template_3.png b/static/space_template_3.png new file mode 100644 index 0000000000000000000000000000000000000000..0cf97e327916aefcd9855c4eb2cb57feddd8c7c0 --- /dev/null +++ b/static/space_template_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ccd524b58a0b5efc82a03cc2a3af03dc3cc868c0aa4df4b4b10e79290127585 +size 126656 diff --git a/static/space_template_4.png b/static/space_template_4.png new file mode 100644 index 0000000000000000000000000000000000000000..64113f30bea8ec6fef2de7da407277f6334d47a5 --- /dev/null +++ b/static/space_template_4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04a13a35ea294d4b6690e1d55c1169cba17144a60f4f47337d36dd7db01376e6 +size 420629 diff --git a/static/space_template_5.png b/static/space_template_5.png new file mode 100644 index 0000000000000000000000000000000000000000..1b25ca61cc85728c475b77cc31db0a97c9ad88e4 --- /dev/null +++ b/static/space_template_5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:008b5b63769602b547e48769a75834b948f6b4ff186cd442a6d908eb541b183f +size 203566 diff --git a/static/text_classification_1.png b/static/text_classification_1.png new file mode 100644 index 
0000000000000000000000000000000000000000..f8f4e394e6e0eb86f209c620b8e9f9cccff1f08b --- /dev/null +++ b/static/text_classification_1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab0a6ffec106f94987be9190040e4b6cfdd57c037b6c09136113506f19c1b3a0 +size 208950 diff --git a/static/ui.png b/static/ui.png new file mode 100644 index 0000000000000000000000000000000000000000..e8fdc1a4d25ed56c597a35ef70fccbc99141dc88 Binary files /dev/null and b/static/ui.png differ