par-meta committed
Commit bcc039b · 0 parent(s)

Initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .github/workflows/black.yml +12 -0
  2. .github/workflows/isort.yml +10 -0
  3. .gitignore +168 -0
  4. .prettierrc +8 -0
  5. CODE_OF_CONDUCT.md +80 -0
  6. CONTRIBUTING.md +36 -0
  7. LICENSE +28 -0
  8. README.md +117 -0
  9. apps/__init__.py +0 -0
  10. apps/main/__init__.py +0 -0
  11. apps/main/configs/eval.yaml +35 -0
  12. apps/main/configs/llama_1B.yaml +87 -0
  13. apps/main/configs/llama_7B.yaml +95 -0
  14. apps/main/eval.py +354 -0
  15. apps/main/generate.py +463 -0
  16. apps/main/lingua_train.py +654 -0
  17. blt-figure.jpg +0 -0
  18. blt-figure.pdf +0 -0
  19. bytelatent/.DS_Store +0 -0
  20. bytelatent/__init__.py +3 -0
  21. bytelatent/args.py +199 -0
  22. bytelatent/base_transformer.py +585 -0
  23. bytelatent/checkpoint.py +311 -0
  24. bytelatent/configs/debug.yaml +110 -0
  25. bytelatent/constants.py +5 -0
  26. bytelatent/data/__init__.py +1 -0
  27. bytelatent/data/data_types.py +115 -0
  28. bytelatent/data/iterators/__init__.py +1 -0
  29. bytelatent/data/iterators/abstract_iterator.py +23 -0
  30. bytelatent/data/iterators/arrow_iterator.py +216 -0
  31. bytelatent/data/iterators/looping_iterator.py +36 -0
  32. bytelatent/data/iterators/multiprocess_iterator.py +243 -0
  33. bytelatent/data/iterators/packing_iterator.py +226 -0
  34. bytelatent/data/iterators/preprocess_iterator.py +111 -0
  35. bytelatent/data/iterators/sampling_iterator.py +66 -0
  36. bytelatent/data/iterators/sequence_iterator.py +122 -0
  37. bytelatent/data/iterators/test_arrow_iterator.py +89 -0
  38. bytelatent/data/iterators/test_iters.py +162 -0
  39. bytelatent/data/ngram_processor.py +146 -0
  40. bytelatent/data/patcher.py +609 -0
  41. bytelatent/distributed.py +478 -0
  42. bytelatent/entropy_model.py +36 -0
  43. bytelatent/float8.py +152 -0
  44. bytelatent/logger.py +129 -0
  45. bytelatent/metrics.py +232 -0
  46. bytelatent/model/__init__.py +1 -0
  47. bytelatent/model/blt.py +1064 -0
  48. bytelatent/model/local_models.py +356 -0
  49. bytelatent/model/transformer.py +199 -0
  50. bytelatent/model/utils.py +116 -0
.github/workflows/black.yml ADDED
@@ -0,0 +1,12 @@
name: Lint with Black

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: psf/black@stable
        with:
          version: "24.8.0"
.github/workflows/isort.yml ADDED
@@ -0,0 +1,10 @@
name: Lint with isort

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: isort/isort-action@master
.gitignore ADDED
@@ -0,0 +1,168 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
*.out

figures/
.vscode/
.DS_Store
.prettierrc ADDED
@@ -0,0 +1,8 @@
{
  "overrides": [
    {
      "files": "*.yaml",
      "options": { "tabWidth": 2 }
    }
  ]
}
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
# Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
  advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
  address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.

This Code of Conduct also applies outside the project spaces when there is a
reasonable belief that an individual's behavior may have a negative impact on
the project or its community.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at <opensource-conduct@meta.com>. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,36 @@
# Contributing to BLT

We want to make contributing to this project as easy and transparent as
possible.

## Pull Requests

We actively welcome your pull requests.

1. Fork the repo and create your branch from `main`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
5. Make sure your code lints.
6. If you haven't already, complete the Contributor License Agreement ("CLA").

## Contributor License Agreement ("CLA")

In order to accept your pull request, we need you to submit a CLA. You only need
to do this once to work on any of Meta's open source projects.

Complete your CLA here: <https://code.facebook.com/cla>

## Issues

We use GitHub issues to track public bugs. Please ensure your description is
clear and has sufficient instructions to be able to reproduce the issue.

Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
disclosure of security bugs. In those cases, please go through the process
outlined on that page and do not file a public issue.

## License

By contributing to BLT, you agree that your contributions will be licensed
under the LICENSE file in the root directory of this source tree.
LICENSE ADDED
@@ -0,0 +1,28 @@
BSD 3-Clause License

Copyright 2024 Meta

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list
of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may
be used to endorse or promote products derived from this software without specific
prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
README.md ADDED
@@ -0,0 +1,117 @@
# Byte Latent Transformer

This repository contains code for our paper: "Byte Latent Transformer: Patches Scale Better Than Tokens"

- [Paper Link](https://dl.fbaipublicfiles.com/blt/BLT__Patches_Scale_Better_Than_Tokens.pdf)

## Abstract

We introduce the Byte Latent Transformer (BLT), a new byte-level LLM architecture that,
for the first time, matches tokenization-based LLM performance at scale, with significant improvements
in inference efficiency and robustness. BLT encodes bytes into dynamically sized patches, which serve
as the primary units of computation. Patches are segmented dynamically based on the entropy of the
next byte, allocating more compute and model capacity where there is more data complexity. The BLT
architecture includes new attention mechanisms to maximize the information flow between byte and
patch hidden representations and a new type of byte-sequence memory. We present the first scaling
study of byte-level models up to 8B parameters and 8T training bytes, showing for the first time
that we can train a model end-to-end at scale from bytes with no tokenization or other preprocessing.
Scaling trends reveal training and inference efficiency benefits from dynamically selecting very long
patches on average, along with qualitative improvements in reasoning and long-tail generalization
from modeling byte sequences.

![BLT Architecture Diagram](blt-figure.jpg)

## Development Status

We are actively updating the BLT code to make it easier to reproduce our results.
Please file an issue and/or be patient while we make more of our code public!

## Quick start

The following commands launch a SLURM job that creates an environment for Meta Lingua.
Creating the environment should take around 5 minutes, not counting downloads.

```bash
git clone https://github.com/facebookresearch/blt
cd blt

bash setup/create_env.sh
# or if you have access to a SLURM cluster
sbatch setup/create_env.sh
```

Once that is done, you can activate the environment:

```bash
conda activate blt_<date>
```

Use the provided script to download and prepare data from Hugging Face (among `fineweb_edu`, `fineweb_edu_10bt`, or `dclm_baseline_1.0`).
This command downloads the `fineweb_edu` dataset and prepares it for training in the `./data` directory, specifying the amount of memory `terashuf` (the tool used to shuffle samples) will be allocated. By default, the number of chunks (`nchunks`) is 32. If you are running on fewer than 32 GPUs, it is recommended to set `nchunks` to 1 or to match `nchunks` with the number of GPUs (`nchunks` = NGPUs). See [here](https://github.com/facebookresearch/lingua/issues/55#issuecomment-2483643076) for more details.

```bash
python setup/download_prepare_hf_data.py fineweb_edu <MEMORY> --data_dir ./data --seed 42 --nchunks <NCHUNKS>
```

To download the tokenizer (here llama3), use the following script:

```bash
python setup/download_tokenizer.py llama3 <SAVE_PATH> --api_key <HUGGINGFACE_TOKEN>
```

Now launch a debug job to check that everything works. **The provided configurations are templates; you need to adapt them before they will work (change `dump_dir`, `data.root_dir`, `data.tokenizer.path`, etc.).**

```bash
# stool stands for SLURM tool !
python -m bytelatent.stool script=bytelatent.train config=apps/bytelatent/configs/debug.yaml nodes=1 partition=<partition>
# if you want to launch locally you can use torchrun
torchrun --nproc-per-node 8 -m bytelatent.train config=apps/bytelatent/configs/debug.yaml
# or you can also launch on 1 GPU
python -m bytelatent.train config=apps/bytelatent/configs/debug.yaml
```

When using `stool`, if a job crashes, it can be relaunched using sbatch:

```bash
sbatch path/to/dump_dir/submit.slurm
```

## Linting

To lint, run the following command:

```
bash dev/lint.sh
```

## Citation

BLT is partially based on Meta Lingua, so consider citing it in addition to our BLT paper if you re-use our work.

BLT paper citation (will be updated to arXiv soon):

```
@article{meta_blt,
  author = {Artidoro Pagnoni, Ram Pasunuru, Pedro Rodriguez, John Nguyen, Benjamin Muller, Margaret Li, Chunting Zhou, Lili Yu, Jason Weston, Luke Zettlemoyer, Gargi Ghosh, Mike Lewis, Ari Holtzman†, Srinivasan Iyer},
  title = {Byte Latent Transformer: Patches Scale Better Than Tokens},
  url = {https://github.com/facebookresearch/blt},
  year = {2024}
}
```

Lingua code:

```
@misc{meta_lingua,
  author = {Mathurin Videau, Badr Youbi Idrissi, Daniel Haziza, Luca Wehrstedt, Jade Copet, Olivier Teytaud, David Lopez-Paz},
  title = {{Meta Lingua}: A minimal {PyTorch LLM} training library},
  url = {https://github.com/facebookresearch/lingua},
  year = {2024}
}
```

## License

The BLT code is partially based on Meta Lingua.

Meta Lingua is licensed under the BSD-3-Clause license. Refer to the LICENSE file in the top-level directory.
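The abstract above describes segmenting bytes into patches based on next-byte entropy. As a toy illustration of that idea only (the repository's actual logic lives in `bytelatent/data/patcher.py`; the function names and threshold below are made up), a new patch can be started whenever the predicted next-byte distribution is high-entropy:

```python
# Toy sketch of entropy-threshold patching, not the repo's patcher.py implementation.
import math
from typing import List, Sequence


def entropy(probs: Sequence[float]) -> float:
    """Shannon entropy (in nats) of a next-byte distribution."""
    return -sum(p * math.log(p) for p in probs if p > 0)


def patch_starts(next_byte_probs: List[Sequence[float]], threshold: float = 2.0) -> List[int]:
    """Indices where a new patch begins: positions whose next-byte
    distribution has entropy above `threshold` (hypothetical value)."""
    starts = [0]
    for i, probs in enumerate(next_byte_probs[1:], start=1):
        if entropy(probs) > threshold:
            starts.append(i)
    return starts
```

Positions where the model is uncertain about the next byte start new, shorter patches, so more compute is spent where the data is more complex.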
apps/__init__.py ADDED
File without changes
apps/main/__init__.py ADDED
File without changes
apps/main/configs/eval.yaml ADDED
@@ -0,0 +1,35 @@
name: "debug_evals"
# ckpt_dir: !!CHANGETHIS!!
# dump_dir: !!CHANGETHIS!!
generator:
  max_tokens: 8192
  dtype: bf16
  temperature: 1.0
  top_p: 0.95
harness:
  tasks:
    - hellaswag
    - task: boolq
      dataset_kwargs:
        trust_remote_code: true
    - task: nq_open
      num_fewshot: 5
    - piqa
    - task: social_iqa
      dataset_kwargs:
        trust_remote_code: true
    - triviaqa
    - winogrande
    - openbookqa
    - arc_easy
    - arc_challenge
    - race
    - commonsense_qa
    # - coqa
    - copa
    - gsm8k
    - bbh
    - mmlu
    - mmlu_pro
validation:
  max_steps: 1000
apps/main/configs/llama_1B.yaml ADDED
@@ -0,0 +1,87 @@
# dump_dir: !!!CHANGE_THIS!!!
name: large_lm
steps: 60_000
probe_freq: null
seed: 777

optim:
  lr: 3e-3
  weight_decay: 0.033
  warmup: 5000
  lr_min_ratio: 0.000001
  clip: 1.0

distributed:
  fsdp_type: full_shard
  compile: true
  model_dtype: bf16
  matmul_allow_tf32: false
  selective_activation_checkpointing: false
  tp_size: 1

model:
  dim: 2048
  n_layers: 25
  n_heads: 16

data:
  root_dir: data/shuffled
  sources:
    dclm_baseline_1.0: 100.0
  batch_size: 4
  prefetch_size: 1024
  seq_len: 4096
  n_views: 2
  load_async: true
  add_bos: true
  add_eos: true
  tokenizer:
    name: tiktoken
    path: tokenizers/cl_toplang_128k.tiktoken

profiling:
  run: true
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4

checkpoint:
  dump:
    every: 2500
    keep: 3
  eval:
    every: 5000
    keep: -1

logging:
  freq: 1

async_eval_gpus: 8
eval:
  harness:
    tasks:
      - hellaswag
      - task: boolq
        dataset_kwargs:
          trust_remote_code: true
      - piqa
      - task: social_iqa
        dataset_kwargs:
          trust_remote_code: true
      - winogrande
      - openbookqa
      - arc_easy
      - arc_challenge
      - race
      - commonsense_qa
      - copa
      # - coqa
      # - task: nq_open
      #   num_fewshot: 5
      # - triviaqa
  validation:
    max_steps: 1000
  generator:
    max_tokens: 16384
    dtype: bf16
apps/main/configs/llama_7B.yaml ADDED
@@ -0,0 +1,95 @@
#python -m lingua.stool config=apps/main/configs/llama2_7B.yaml nodes=32 account=fair_amaia_cw_codegen qos=lowest
# dump_dir: !!!CHANGE_THIS!!!
name: "7b_baseline"
steps: 100_000
grad_acc_steps: 1
probe_freq: 100

seed: 777
optim:
  lr: 1.0e-3
  weight_decay: 0.1
  warmup: 2000
  lr_min_ratio: 0.000001
  clip: 1.0

distributed:
  fsdp_type: full_shard
  compile: true
  model_dtype: bf16
  matmul_allow_tf32: false
  selective_activation_checkpointing: false
  tp_size: 1

model:
  dim: 4096
  n_layers: 32
  n_heads: 32
  rope_theta: 100_000
  ffn_dim_multiplier: 1.0
  multiple_of: 256

data:
  root_dir: data/shuffled
  sources:
    dclm_baseline_1.0: 1.0
  batch_size: 2
  prefetch_size: 1024
  seq_len: 4096
  n_views: 2
  load_async: true
  tokenizer:
    name: tiktoken
    path: tokenizers/cl_toplang_128k.tiktoken

profiling:
  run: true
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4

checkpoint:
  dump:
    every: 10000
    keep: -1
  eval:
    every: 1000
    keep: 3

logging:
  freq: 1

async_eval_gpus: 8
eval:
  dataset_dir: datasets/eval
  harness:
    tasks:
      - hellaswag
      - task: boolq
        dataset_kwargs:
          trust_remote_code: true
      - piqa
      - task: social_iqa
        dataset_kwargs:
          trust_remote_code: true
      - winogrande
      - openbookqa
      - arc_easy
      - arc_challenge
      - race
      - commonsense_qa
      # - coqa
      - copa
      - mmlu
      - mmlu_pro
      # - task: nq_open
      #   num_fewshot: 5
      # - triviaqa
      # - gsm8k
      # - bbh
  validation:
    max_steps: 1000
  generator:
    max_tokens: 8192
    dtype: bf16
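As a quick sanity check on the `llama_7B.yaml` settings above, a back-of-the-envelope count lands near 7B parameters. The SwiGLU sizing rule and the ~128k vocabulary are assumptions (the vocabulary is only suggested by the `cl_toplang_128k.tiktoken` tokenizer path), not values read from this commit:

```python
# Rough parameter-count estimate for apps/main/configs/llama_7B.yaml.
# Assumes a Llama-style SwiGLU FFN and untied ~128k input/output embeddings (both assumptions).
dim, n_layers, multiple_of, ffn_dim_multiplier, vocab = 4096, 32, 256, 1.0, 128_256

hidden = int(2 * (4 * dim) / 3 * ffn_dim_multiplier)
hidden = multiple_of * ((hidden + multiple_of - 1) // multiple_of)  # rounds up to 11008

attn = 4 * dim * dim       # wq, wk, wv, wo
ffn = 3 * dim * hidden     # w1, w2, w3
total = n_layers * (attn + ffn) + 2 * vocab * dim  # norms omitted (negligible)

print(f"~{total / 1e9:.1f}B parameters")  # ≈ 7.5B, i.e. the "7B" scale
```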
apps/main/eval.py ADDED
@@ -0,0 +1,354 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

import json
import logging
import os
from collections import defaultdict
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, List, Optional, Tuple, Union

import torch
from lingua.args import dump_config
from lingua.data import init_choice_state, setup_sources
from lm_eval import simple_evaluate
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from omegaconf import OmegaConf

from bytelatent.checkpoint import CONSOLIDATE_FOLDER, consolidate_checkpoints
from bytelatent.distributed import (
    DistributedArgs,
    dist_mean_dict,
    get_global_rank,
    get_world_size,
    setup_torch_distributed,
)
from bytelatent.transformer import LMTransformer, LMTransformerArgs

from apps.main.generate import (
    PackedCausalTransformerGenerator,
    PackedCausalTransformerGeneratorArgs,
    load_consolidated_model_and_tokenizer,
)

EVAL_FOLDER_NAME = "{:010d}"

logger = logging.getLogger()


@dataclass
class LMHarnessArgs:
    tasks: Optional[List[Any]] = None
    num_fewshot: Optional[int] = None
    device: Optional[str] = None
    use_cache: Optional[str] = None
    cache_requests: bool = False
    rewrite_requests_cache: bool = False
    delete_requests_cache: bool = False
    limit: Optional[Union[int, float]] = None
    bootstrap_iters: int = 100000
    check_integrity: bool = False
    write_out: bool = False
    log_samples: bool = True
    system_instruction: Optional[str] = None
    apply_chat_template: Union[bool, str] = False
    fewshot_as_multiturn: bool = False
    gen_kwargs: Optional[str] = None
    verbosity: str = "INFO"
    predict_only: bool = False
    random_seed: int = 0
    numpy_random_seed: int = 1234
    torch_random_seed: int = 1234
    fewshot_random_seed: int = 1234


@dataclass
class ValidationArgs:
    max_steps: Optional[int] = (
        None  # If None the whole validation file is used -> /!\ This number of steps is gpu dependent (100 max steps on 8 gpus = 800 steps on 1 gpu)
    )
    use_val_from_train_src: bool = True  # Use the validation set from training sources
    root_dir: str = ""
    sources: List[str] = field(default_factory=list)  # Other sources to eval on


@dataclass
class EvalArgs:
    name: str = "evals"
    dump_dir: Optional[str] = None
    metric_log_dir: Optional[str] = None
    ckpt_dir: str = ""
    generator: PackedCausalTransformerGeneratorArgs = field(
        default_factory=PackedCausalTransformerGeneratorArgs
    )
    harness: Optional[LMHarnessArgs] = field(default_factory=LMHarnessArgs)
    validation: Optional[ValidationArgs] = field(default_factory=ValidationArgs)

    wandb: Optional[Any] = None

    global_step: Optional[int] = None  # for in-training evaluation


def all_dicts_same(dict_list):
    if not dict_list:  # Check if the list is empty
        return True

    # Compare each dictionary to the first one
    first_dict = dict_list[0]
    return all(d == first_dict for d in dict_list)


class MockAccelerator:
    def gather(self, tensor):
        l = [torch.zeros_like(tensor) for _ in range(get_world_size())]
        torch.distributed.all_gather(l, tensor)
        return torch.stack(l)

    def wait_for_everyone(self):
        torch.distributed.barrier()


# Light wrapper around generator for lm-eval harness
class EvalHarnessLM(LM):
    def __init__(self, generator):
        super().__init__()
        self.generator = generator
        self.accelerator = MockAccelerator()
        self._rank = get_global_rank()
        self._world_size = get_world_size()
        self.device = generator.device

    def generate_until(self, requests: List[Instance]) -> List[str]:
        prompts, gen_args = zip(*[req.args for req in requests])
        assert all_dicts_same(gen_args), "Doesn't support different gen args for now"
        gen_args = gen_args[0]
        temperature = gen_args.get("temperature", 0.0)
        top_p = gen_args.get("top_p", None)
        top_k = gen_args.get("top_k", None)
        until = gen_args.get("until", [])

        self.generator.temperature = temperature
        self.generator.top_p = top_p
        self.generator.top_k = top_k
        self.generator.until = until
        generations, _, _ = self.generator.generate(prompts)
        filtered_gen = []
        for g in generations:
            for e in until:
                g = g.replace(e, "")
            filtered_gen.append(g)
        return filtered_gen

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        prompts, continuations = zip(*[req.args for req in requests])
        inputs = [req.args[0] + req.args[1] for req in requests]
        max_gen_len = self.generator.max_gen_len
        # We temporarily lower max gen len
        self.generator.max_gen_len = 1
        _, lls, greedy = self.generator.generate(inputs)
        results = []
        for p, ll, gr in zip(prompts, lls, greedy):
            p_len = len(
                self.generator.tokenizer.encode(p, add_bos=False, add_eos=False)
            )
            results.append((ll[p_len:].sum().item(), gr[p_len:].all().item()))

        self.generator.max_gen_len = max_gen_len
        return results

    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
        prompts = [req.args[0] for req in requests]
        max_gen_len = self.generator.max_gen_len
        # We temporarily lower max gen len
        self.generator.max_gen_len = 1
        _, lls, _ = self.generator.generate(prompts)
        results = []
        for ll in lls:
            results.append((ll.sum().item(),))
        self.generator.max_gen_len = max_gen_len

        return results


def eval_on_val(generator, val_args: ValidationArgs, train_cfg):
    srcs = {}
    for src in val_args.sources:
        path = os.path.join(val_args.root_dir, src)
        srcs[path] = 1.0
    for src in train_cfg.data.sources:
        path = os.path.join(train_cfg.data.root_dir, src)
        srcs[path] = 1.0

    multi_state = init_choice_state(
        "", srcs, 0, get_global_rank(), get_world_size(), "*.val.jsonl"
    )
    path_to_iter = setup_sources(multi_state)

    max_gen_len = generator.max_gen_len
    # We temporarily lower max gen len
    generator.max_gen_len = 1

    all_val_metrics = {}
    for src in path_to_iter:
        jsonl_iterator = path_to_iter[src]
        texts = []
        logger.info(f"Running validation on {src}...")
        for step, (content, state) in enumerate(jsonl_iterator):
            if state["current_iter"] > 0 or (
                val_args.max_steps is not None and step >= val_args.max_steps
            ):
                break
            content_key = "text" if ("text" in content) else "content"
            texts.append(content[content_key])

        _, loglikelihood, _ = generator.generate(texts)

        metrics = defaultdict(list)
        for i, ll in enumerate(loglikelihood):
            tmp = ll.sum().item()
            metrics["nll"].append(tmp)
            metrics["nll_per_token"].append(tmp / len(ll))
            metrics["nll_per_char"].append(tmp / len(texts[i]))

            metrics["avg_seqlen"].append(len(ll))

        for m in metrics:
            metrics[m] = sum(metrics[m]) / len(metrics[m])
        metrics.update(dist_mean_dict(metrics))
        logger.info(f"Validation on {src} done. Metrics: {metrics}")

        name = os.path.basename(src)
        if name in all_val_metrics:
            logger.warning(
                f"Duplicate source name {name}, path {src} in validation sources, renaming to {name}_1"
            )
            name = f"{name}_1"
        all_val_metrics[name] = metrics

    generator.max_gen_len = max_gen_len

    return all_val_metrics


def launch_eval(cfg: EvalArgs):
    if not torch.distributed.is_initialized():
        setup_torch_distributed(DistributedArgs())
    if (
        Path(cfg.ckpt_dir).exists()
        and (Path(cfg.ckpt_dir) / "params.json").exists()
        and next(Path(cfg.ckpt_dir).glob("*.pth"), None) is not None
    ):
        consolidate_path = Path(cfg.ckpt_dir)
    else:
        consolidate_path = Path(cfg.ckpt_dir) / CONSOLIDATE_FOLDER
        if not consolidate_path.exists() and get_global_rank() == 0:
            consolidate_path = consolidate_checkpoints(cfg.ckpt_dir)

    Path(cfg.dump_dir).mkdir(parents=True, exist_ok=True)
    dump_config(cfg, Path(cfg.dump_dir) / "config.yaml", log_config=False)

    consolidate_path = str(consolidate_path)
    torch.distributed.barrier()
    logger.info("Loading model")
    model, tokenizer, train_cfg = load_consolidated_model_and_tokenizer(
        consolidate_path,
        model_cls=LMTransformer,
        model_args_cls=LMTransformerArgs,
    )
    logger.info("Model loaded")
    model.eval()
    generator = PackedCausalTransformerGenerator(cfg.generator, model, tokenizer)

    wrap = EvalHarnessLM(generator)
    results = simple_evaluate(wrap, **asdict(cfg.harness))
    val_results = None
    if cfg.validation:
        val_results = eval_on_val(generator, cfg.validation, train_cfg)
    if get_global_rank() == 0:
        with open(Path(cfg.dump_dir) / "results.json", "w") as f:
            f.write(json.dumps(results))
        logger.info(f"All evaluation results: {results['results']}")
        if val_results is not None:
            with open(Path(cfg.dump_dir) / "validation.json", "w") as f:
                f.write(json.dumps(val_results))
            logger.info(f"All validation results: {val_results}")
    if cfg.metric_log_dir and get_global_rank() == 0:
        metric_log_path = Path(cfg.metric_log_dir) / "metrics.eval.jsonl"

        logger.info(f"Writing metric logs to {metric_log_path}")
        timestamp = {
            "created_at": datetime.utcnow().isoformat(),
        }
        if cfg.global_step is not None:
            timestamp["global_step"] = cfg.global_step
        print(
            json.dumps(timestamp | results["results"]),
            file=open(metric_log_path, mode="a"),
            flush=True,
        )

        val_log_path = Path(cfg.metric_log_dir) / "metrics.validation.jsonl"
        if val_results is not None:
            print(
                json.dumps(timestamp | val_results),
                file=open(val_log_path, mode="a"),
                flush=True,
            )

    del generator


def main():
    """
    The command line interface here uses OmegaConf https://omegaconf.readthedocs.io/en/2.3_branch/usage.html#from-command-line-arguments
    This accepts arguments as a dot list
    So if the dataclass looks like

    @dataclass
    class DummyArgs:
        name: str
        model: LMTransformerArgsgs

    @dataclass
    class LMTransformerArgsgs:
        dim: int

    Then you can pass model.dim=32 to change values in LMTransformerArgsgs
    or just name=tictac for top level attributes.

    The behavior here is as follows:
    1. We instantiate EvalArgs with its default values
    2. We override those default values with the ones in the provided config file
    3. We override the result with the additional arguments provided through command line

    For example, if the config is the following

    model:
        dim: 128
        n_layers: 4

    and you call eval.py with eval.py model.dim=64

    Then the final TrainArgs will have

    model:
        dim: 64
        n_layers: 4

    Plus all the default values in EvalArgs dataclass.
    """
    cli_args = OmegaConf.from_cli()
    file_cfg = OmegaConf.load(cli_args.config)
    # We remove 'config' attribute from config as the underlying DataClass does not have it
    del cli_args.config

    default_cfg = OmegaConf.structured(EvalArgs())
    cfg = OmegaConf.merge(default_cfg, file_cfg, cli_args)
    cfg = OmegaConf.to_object(cfg)
    launch_eval(cfg)


if __name__ == "__main__":
    main()
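The `main()` docstring above describes the defaults → config file → command-line dot-list merge. The following self-contained sketch reproduces that pattern with a made-up `DummyArgs` dataclass; it illustrates the OmegaConf API used here and is not code from the repository:

```python
# Sketch of the defaults -> YAML -> CLI merge described in eval.py's main().
# DummyArgs/ModelArgs and the in-memory "file" config are made up for illustration.
from dataclasses import dataclass, field

from omegaconf import OmegaConf


@dataclass
class ModelArgs:
    dim: int = 128
    n_layers: int = 4


@dataclass
class DummyArgs:
    name: str = "evals"
    model: ModelArgs = field(default_factory=ModelArgs)


default_cfg = OmegaConf.structured(DummyArgs())                     # 1. dataclass defaults
file_cfg = OmegaConf.create({"model": {"dim": 256}})                # 2. stands in for OmegaConf.load(config)
cli_cfg = OmegaConf.from_dotlist(["model.dim=64", "name=tictac"])   # 3. command-line overrides win last

cfg = OmegaConf.to_object(OmegaConf.merge(default_cfg, file_cfg, cli_cfg))
assert cfg.model.dim == 64 and cfg.model.n_layers == 4 and cfg.name == "tictac"
```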
apps/main/generate.py ADDED
@@ -0,0 +1,463 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional

import torch
from lingua.args import dataclass_from_dict
from lingua.tokenizers.abstract_tokenizer import Tokenizer
from lingua.tokenizers.build_tokenizer import build_tokenizer
from omegaconf import OmegaConf
from torch import nn
from torch.nn import functional as F
from torch.nn.attention.flex_attention import create_block_mask
from tqdm import tqdm

from bytelatent.base_transformer import (
    Attention,
    causal_mask,
    generate_doc_mask_mod,
    lengths_to_local_ids,
    lengths_to_start_ids,
)
from bytelatent.checkpoint import CONSOLIDATE_NAME
from bytelatent.transformer import LMTransformer, LMTransformerArgs


def sample_top_p(probs: torch.Tensor, p: float) -> torch.Tensor:
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    next_token = torch.multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token


def sample_top_k(probs, k):
    topk_value, _ = torch.topk(probs, k)  # batch_sz x topk
    min_value_top_k = topk_value[:, [-1]]
    probs[probs < min_value_top_k] = 0.0
    probs.div_(probs.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs, num_samples=1)
    return next_token


def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None):
    shape = logits.shape
    logits = logits.flatten(end_dim=-2)
    if temperature > 0.0:
        probs = torch.softmax(logits / temperature, dim=-1)

        if top_p is not None:
            next_token = sample_top_p(probs, top_p)
        elif top_k is not None:
            next_token = sample_top_k(probs, top_k)
        else:
            next_token = torch.multinomial(probs, num_samples=1)
    else:
        next_token = torch.argmax(logits, dim=-1)
    return next_token.view(shape[:-1])


def pack_prompts(prompts: List[int]):
    res = []
    lengths = []
    for i, p in enumerate(prompts):
        p = torch.tensor(p, dtype=torch.long)
        l = p.size(0)
        res.append(p)
        lengths.append(l)
    lengths = torch.tensor(lengths, dtype=torch.long)
    res = torch.cat(res)
    return res, lengths


def batch_prompts(prompts, max_elements, lengths=None):
    batches = []
    current_batch = []
    current_count = 0

    for i in range(len(prompts)):
        prt = prompts[i]
        prompt_size = len(prt) if lengths is None else lengths[i]
        if current_count + prompt_size <= max_elements:
            current_batch.append(prt)
            current_count += prompt_size
        else:
            if current_batch:  # Add the current batch to batches
                batches.append(current_batch)
            # Start a new batch with the current prompt
            current_batch = [prt]
            current_count = prompt_size

    # Add the last batch if it contains any prompts
    if current_batch:
        batches.append(current_batch)

    return batches


class KVCache(nn.Module):
    def __init__(self, bsz, seqlen, n_heads, head_dim, dtype, device):
        super().__init__()
        shape = (bsz, seqlen, n_heads, head_dim)
        self.register_buffer("k_cache", torch.zeros(shape, dtype=dtype, device=device))
        self.register_buffer("v_cache", torch.zeros(shape, dtype=dtype, device=device))
        self.offset = 0

    def reset(self):
        self.k_cache.zero_()
        self.v_cache.zero_()
        self.offset = 0

    def update(self, k_val, v_val, tok_idx):
        # input_pos: [B], k_val: [B, S, H, D]
        self.k_cache.index_copy_(1, self.offset + tok_idx, k_val)
        self.v_cache.index_copy_(1, self.offset + tok_idx, v_val)
        return self.k_cache, self.v_cache


@dataclass
class PackedCausalTransformerGeneratorArgs:
    temperature: float = 0.0
    top_p: Optional[float] = None
    top_k: Optional[float] = None
    max_gen_len: int = 512  # Maximum number of tokens to generate
    max_tokens: int = 1024  # Maximum number of tokens that can go through the model
    max_prompt_len: Optional[int] = None
    until: List[str] = field(default_factory=list)
    compile_prefilling: bool = False
    reduce_generation_overhead: bool = False
    show_progress: bool = False
    dtype: Optional[str] = "bf16"
    device: Optional[str] = "cuda"


class PackedCausalTransformerGenerator:
    def __init__(
        self,
        cfg: PackedCausalTransformerGeneratorArgs,
        model: nn.Module,
        tokenizer: Tokenizer,
    ):
        """
        This class wraps a causal transformer model with its corresponding tokenizer
        and provides an efficient way to pack prompts together and do generation on
        the packed sequence.

        For example, if we had the prompts "Hello, I am a " and "Initiating calibration "
        Then this class will concatenate those sequence (pack them together)
        "Hello, I am a Initiating calibration"
        And make the necessary attention masks such that a sequence only attends to itself
        during prefilling and generation.

        This class creates a fixed size cache of size max_tokens or sum of prompt sizes
        + the max number of generated tokens per sequence.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.temperature = cfg.temperature
        self.top_p = cfg.top_p
        self.top_k = cfg.top_k

        self.max_gen_len = cfg.max_gen_len
        self.max_tokens = cfg.max_tokens
        self.max_prompt_len = cfg.max_prompt_len
        self.until = cfg.until
        self.max_until_size = max([len(e) for e in self.until]) if self.until else 1
        self.device = cfg.device

        # Compile if necessary
        self.prefill = torch.compile(self.prefill, disable=not cfg.compile_prefilling)
        self.generate_next_token = torch.compile(
            self.generate_next_token,
            mode="reduce-overhead",
            disable=not cfg.reduce_generation_overhead,
        )

        self.show_progress = cfg.show_progress
        self.dtype = dict(fp32=torch.float32, bf16=torch.bfloat16)[cfg.dtype]

        self.prefill_doc_id, self.prefill_tok_id = None, None
        self.padded_doc_id, self.padded_tok_id = None, None
        self.current_doc_id, self.current_tok_id = None, None
        self.padded_doc_start = None
        self.prefill_mask = None

    def clear_cache(self, offset):
        for module in self.model.modules():
            if isinstance(module, Attention):
                if not hasattr(module, "kv_cache"):
                    module.kv_cache = KVCache(
                        1,
                        self.max_tokens,
                        module.n_kv_heads,
                        module.head_dim,
                        self.dtype,
                        self.device,
                    )
                module.kv_cache.offset = offset

    @torch.compiler.disable
    def setup_prefilling(self, lengths: torch.Tensor):
        # The KV cache is a fixed size tensor of size max_tokens that we need
        # to update in order to do correct autoregressive generation.

        # Here we will generate token by token but on multiple sequences
        # at once. To do so, we need to have an attention mask that makes
        # each sequence independent.

        # Each sequence will write to its allocated space in the KV Cache.
        # We allocate len(seq) + max_gen_len to each sequence in the cache.

        # We will generate max_gen_len for each document
        padded_lengths = lengths + self.max_gen_len
        max_tokens = self.max_tokens or padded_lengths.sum().item()
        # The last document might have more padding to fill up to max_tokens
        padded_lengths[-1] += max_tokens - padded_lengths.sum()

        # This is the start index in the cache for each document
        self.padded_doc_start = lengths_to_start_ids(padded_lengths)
        # For example with ab--123--cdef--
        # this would be 0, 4, 9 if max_gen_len is 2

        # We repeat interleave to align with tokens for prefilling
        # Ex: ab--123--cdef--
        #     000044444999999
        prefill_offset = torch.repeat_interleave(self.padded_doc_start, lengths)
        # This offset will make sure the tokens are written to the
        # correct positions in the cache during prefilling

        # We either init the cache or clear it by resetting the offset to prefill_offset
        self.clear_cache(prefill_offset)

        # The prefilling mask looks like the following for
        # the two packed sequences ab and 123 : ab123
        # Where spaces are empty cache positions
        #                  keys
        #                 ab---123---
        #   queries    a  10000000000
        #              b  11000000000
        #              1  00000100000
        #              2  00000110000
        #              3  00000111000
        # We make sure to skip the empty cache positions
        # and only attend to positions within the same sequence
        doc_mask_mod = generate_doc_mask_mod(causal_mask, lengths, padded_lengths)
        self.prefill_mask = create_block_mask(
            doc_mask_mod, 1, None, lengths.sum(), max_tokens
        )

        # This creates the prefilling token ids which look like
        # the following for the packed sequence abcdefg1234
        # abcdefg1234
        # 01234560123
        # The token id gives us the position within each sequence
        # This is used to compute ROPE and to update the cache
        # At each forward pass the current tokens are written to
        # offset + tok_id
        self.prefill_doc_id, self.prefill_tok_id = lengths_to_local_ids(lengths)

        # This creates the padded token and document ids
        # which look like the following for the packed sequence ab123
        #               ab---123---                ab---123---
        # padded_doc_id 00000111111  padded_tok_id 01234012345
        # This will later be useful for the attention mask at generation
        self.padded_doc_id, self.padded_tok_id = lengths_to_local_ids(padded_lengths)

    @torch.compiler.disable
    def setup_generation(self, lengths):
        # KV Cache offset is set to the start of the padded documents
        for module in self.model.modules():
            if isinstance(module, Attention):
                module.kv_cache.offset = self.padded_doc_start
        # The token ids during generations correspond to the lengths of each doc
        # current_tok_id will be incremented during generation
        self.current_tok_id = lengths.clone()
        # Since we're generating one token per document
        # the document id is just an arange
        self.current_doc_id = torch.arange(lengths.size(0), device=lengths.device)

    # From here on some methods for generation
    def prefill(self, tokens: torch.Tensor, lengths: torch.Tensor):
        # Prefilling is done by taking multiple packed sequences and
        # doing block diagonal attention on them so they remain independent
        self.setup_prefilling(lengths=lengths)
        prefill_out = self.model.forward(
            tokens,
            tok_idx=self.prefill_tok_id,
            mask=self.prefill_mask,
            attn_impl="flex_attention",
        )
        self.setup_generation(lengths=lengths)
        return prefill_out

    def generate_next_token(self, current_token):
        # Since we're doing generation with multiple sequences at once
        # we need to ignore tokens and cache entries from other sequences
        # or in the future.
        # Example mask :
        #                  keys
        #                 abc--1234--
        #   queries    c  11100000000
        #              4  00000111100

        # mask shape : (n_seqs, cache_size)
        doc_mask = self.current_doc_id.unsqueeze(1) == self.padded_doc_id.unsqueeze(0)
        caus_mask = self.current_tok_id.unsqueeze(1) >= self.padded_tok_id.unsqueeze(0)
        mask = doc_mask & caus_mask
        out = self.model.forward(
            current_token,
            tok_idx=self.current_tok_id,  # n_seqs
            mask=mask,
            attn_impl="sdpa",
        )
        self.current_tok_id += 1
        return out

    @torch.inference_mode()
    def generate(self, prompts):
        # Tokenize
        prompts = [
            self.tokenizer.encode(p, add_bos=True, add_eos=False) for p in prompts
        ]
        # Truncate
        max_seqlen = (
            self.max_tokens
            if not hasattr(self.model, "max_seqlen")
            else self.model.max_seqlen
        )
        max_prompt_len = self.max_prompt_len or min(
            max_seqlen - self.max_gen_len, self.max_tokens - self.max_gen_len
        )
        prompts = [p[-max_prompt_len:] for p in prompts]
        # Account for the generation in lengths
        padded_lengths = [len(p) + self.max_gen_len for p in prompts]
        generation = []
        loglikelihood = []
        greedy = []
        it = batch_prompts(prompts, self.max_tokens, lengths=padded_lengths)
        if self.show_progress:
            it = tqdm(it)
        for batch in it:
            n_seqs = len(batch)
            generated_tokens = [[] for _ in range(n_seqs)]
            is_done = [False for _ in range(n_seqs)]
            packed_batch, lengths = pack_prompts(batch)
            packed_batch, lengths = packed_batch.cuda(), lengths.cuda()
            n_seqs = lengths.size(0)

            # Prefilling cache
            prompt_logits = self.prefill(packed_batch.unsqueeze(0), lengths)
            # Selecting last token in each prompt
            all_tokens = sample_tokens(
                prompt_logits, self.temperature, self.top_p, self.top_k
            )
            start_token = all_tokens[:, lengths.cumsum(0) - 1]

            for seq_id, tok in enumerate(start_token.squeeze(0).tolist()):
                generated_tokens[seq_id].append(tok)

            current_token = start_token
            for i in range(1, self.max_gen_len):

                next_logits = self.generate_next_token(current_token)
                next_token = sample_tokens(
                    next_logits.clone(), self.temperature, self.top_p, self.top_k
                )

                for seq_id, tok in enumerate(next_token.squeeze(0).tolist()):
                    if not is_done[seq_id]:
                        generated_tokens[seq_id].append(tok)
                        current_end_str = self.tokenizer.decode(
                            generated_tokens[seq_id][-self.max_until_size :]
                        )
                        contains_end_string = any(
                            [e in current_end_str for e in self.until]
                        )
                        is_done[seq_id] = (
                            contains_end_string or tok == self.tokenizer.eos_id
                        )
                if all(is_done):
                    break

                current_token = next_token

            generation.extend([self.tokenizer.decode(g) for g in generated_tokens])

            for p, logit in zip(
                batch, prompt_logits.squeeze(0).split(lengths.tolist())
            ):
                x = logit[:-1]
                y = torch.tensor(p[1:], device=x.device)
                loglikelihood.append(-F.cross_entropy(x, y, reduction="none").cpu())
                greedy.append((x.argmax(dim=-1) == y).cpu())

        return generation, loglikelihood, greedy


def load_consolidated_model_and_tokenizer(
    consolidated_path,
    model_cls=LMTransformer,
    model_args_cls=LMTransformerArgs,
):
    ckpt_path = Path(consolidated_path)
    config = ckpt_path / "params.json"
    config = OmegaConf.load(config)

    param_dtype = dict(fp32=torch.float32, fp16=torch.float16, bf16=torch.bfloat16)[
        config.distributed.model_dtype
    ]
    model_args = dataclass_from_dict(model_args_cls, config.model, strict=False)
    tokenizer = build_tokenizer(config.data.tokenizer.name, config.data.tokenizer.path)
    model = model_cls(model_args)
    st_dict = torch.load(ckpt_path / CONSOLIDATE_NAME, weights_only=True)
    model.load_state_dict(st_dict["model"])
    model = model.cuda().eval()
    for param in model.parameters():
        param.data = param.data.to(dtype=param_dtype)
    return model, tokenizer, config


def main():
    # Load CLI arguments (overrides) and combine with a YAML config
    cfg = OmegaConf.from_cli()
    gen_cfg = dataclass_from_dict(
        PackedCausalTransformerGeneratorArgs, cfg, strict=False
    )
    print(cfg)

    model, tokenizer, _ = load_consolidated_model_and_tokenizer(cfg.ckpt)

    generator = PackedCausalTransformerGenerator(gen_cfg, model, tokenizer)

    # Allow multiple prompts
    prompts = []
    while True:
        prompt = input("Enter a prompt (or press enter to finish): ")
        if not prompt:
            break
        prompts.append(prompt)

    # Start generation
    start_time = time.time()
    generation, loglikelihood, greedy = generator.generate(prompts)
    end_time = time.time()

    # Calculate tokens per second
    total_tokens = sum(len(tokenizer.encode(gen, False, False)) for gen in generation)
    tokens_per_second = total_tokens / (end_time - start_time)

    # Display the results
    for i, gen in enumerate(generation):
        print(f"\nPrompt {i+1}: {prompts[i]}")
        print(f"Generated Text: {gen}")

    print(f"\nTokens per second: {tokens_per_second:.2f}")


if __name__ == "__main__":
    main()
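A hypothetical usage example of the sampling helpers defined above (`sample_tokens`, which dispatches to `sample_top_p` / `sample_top_k`), assuming PyTorch is installed and the repository is on `PYTHONPATH`; the tensor shapes below are made up:

```python
# Hypothetical usage of apps/main/generate.py's sampling helpers (shapes are illustrative).
import torch

from apps.main.generate import sample_tokens

logits = torch.randn(2, 5, 32)  # (batch, seq, vocab) as produced by a forward pass

greedy_ids = sample_tokens(logits)                               # temperature 0.0 -> argmax
nucleus_ids = sample_tokens(logits, temperature=1.0, top_p=0.9)  # top-p (nucleus) sampling
topk_ids = sample_tokens(logits, temperature=0.8, top_k=5)       # top-k sampling

print(greedy_ids.shape, nucleus_ids.shape)  # both (2, 5): one token id per position
```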
apps/main/lingua_train.py ADDED
@@ -0,0 +1,654 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
3
+
4
+ import gc
5
+ import logging
6
+ import os
7
+ import sys
8
+ from contextlib import ExitStack
9
+ from copy import deepcopy
10
+ from dataclasses import asdict, dataclass, field
11
+ from pathlib import Path
12
+ from timeit import default_timer as timer
13
+ from typing import Any, Dict, Optional
14
+
15
+ import torch
16
+ import torch.distributed
17
+ import wandb
18
+ import xformers.profiler
19
+ from lingua.args import dataclass_from_dict, dump_config, flatten_dict
20
+ from lingua.data import (
21
+ DataArgs,
22
+ PackTokensState,
23
+ build_dataloader_from_args,
24
+ init_dataloader_state_from_args,
25
+ )
26
+ from lingua.tokenizers.build_tokenizer import TokenizerArgs
27
+ from omegaconf import OmegaConf
28
+ from pydantic import BaseModel
29
+ from torch.distributed._tensor import DTensor
30
+ from torch.distributed.checkpoint.stateful import Stateful
31
+ from torch.optim import lr_scheduler
32
+
33
+ from bytelatent.checkpoint import (
34
+ CheckpointArgs,
35
+ CheckpointManager,
36
+ load_from_checkpoint,
37
+ )
38
+ from bytelatent.distributed import (
39
+ DistributedArgs,
40
+ EnvironmentArgs,
41
+ check_model_value_range,
42
+ clean_env,
43
+ dist_mean_dict,
44
+ get_device_mesh,
45
+ get_is_master,
46
+ get_world_size,
47
+ init_signal_handler,
48
+ parallelize_model,
49
+ requeue_slurm_job,
50
+ setup_env,
51
+ setup_torch_distributed,
52
+ )
53
+ from bytelatent.logger import init_logger
54
+ from bytelatent.metrics import (
55
+ GPUMemoryMonitor,
56
+ LoggingArgs,
57
+ MetricLogger,
58
+ get_num_params,
59
+ )
60
+ from bytelatent.optim import OptimArgs, build_optimizer
61
+ from bytelatent.probe import AutoProbeD
62
+ from bytelatent.profiling import ProfilerArgs, maybe_run_profiler
63
+ from bytelatent.stool import StoolArgs, launch_job
64
+ from bytelatent.transformer import (
65
+ LMTransformer,
66
+ LMTransformerArgs,
67
+ build_fsdp_grouping_plan,
68
+ get_no_recompute_ops,
69
+ get_num_flop_per_token,
70
+ tp_parallelize,
71
+ )
72
+
73
+ logger = logging.getLogger()
74
+
75
+
76
+ class TrainArgs(BaseModel):
77
+ name: str = "lingua"
78
+ dump_dir: str = ""
79
+
80
+ seed: int = 42
81
+
82
+ # Number of gradient accumulation steps
83
+ # Total batch size is batch_size*grad_acc_steps
84
+ grad_acc_steps: int = 1
85
+
86
+ gc_collect_freq: int = 1000
87
+ probe_freq: int | None = None
88
+
89
+ # Nb optimizer steps to take
90
+ steps: int = 1000
91
+
92
+ data: DataArgs
93
+ optim: OptimArgs
94
+ model: LMTransformerArgs
95
+ distributed: DistributedArgs
96
+ env: EnvironmentArgs
97
+
98
+ checkpoint: CheckpointArgs
99
+ profiling: ProfilerArgs
100
+ logging: LoggingArgs
101
+
102
+ # If set to None, eval is run locally otherwise it launches a new job with the given number of gpus
103
+ async_eval_gpus: int | None = None
104
+ eval: Any | None = None
105
+
106
+
107
+ @dataclass
108
+ class TrainState(Stateful):
109
+ step: int # Nb of steps taken by the optimizer
110
+ acc_step: int # Nb of accumulation steps done since last optimizer step
111
+ scheduler: lr_scheduler.LambdaLR
112
+ data_loader_state: PackTokensState
113
+
114
+ def state_dict(self) -> Dict[str, Any]:
115
+ return {
116
+ "step": self.step,
117
+ "acc_step": self.acc_step,
118
+ "data_loader_state": self.data_loader_state,
119
+ "scheduler": self.scheduler.state_dict(),
120
+ }
121
+
122
+ def load_state_dict(self, state_dict):
123
+ self.step = state_dict["step"]
124
+ self.acc_step = state_dict["acc_step"]
125
+ self.data_loader_state = PackTokensState(**state_dict["data_loader_state"])
126
+ self.scheduler.load_state_dict(state_dict["scheduler"])
127
+
128
+
129
+ def validate_train_args(args: TrainArgs, output_size: int):
130
+ if args.model.vocab_size < 0:
131
+ logger.info(f"Setting model output size to {output_size}")
132
+ args.model.vocab_size = output_size
133
+ assert (
134
+ args.model.vocab_size == output_size
135
+ ), "Vocab size should be the same as output size"
136
+
137
+ assert args.dump_dir, "Dump dir not set"
138
+
139
+ if args.checkpoint.path is None:
140
+ args.checkpoint.path = str(Path(args.dump_dir) / "checkpoints")
141
+ logger.info(f"Setting checkpoint path to {args.checkpoint.path}")
142
+
143
+ for source in args.data.sources:
144
+ data_path = os.path.join(args.data.root_dir, source)
145
+ assert os.path.exists(data_path), f"{data_path} doesn't exist"
146
+
147
+ if (
148
+ args.distributed.dp_replicate
149
+ * args.distributed.dp_shard
150
+ * args.distributed.tp_size
151
+ != get_world_size()
152
+ ):
153
+ assert get_world_size() % args.distributed.dp_shard == 0
154
+ args.distributed.dp_replicate = get_world_size() // args.distributed.dp_shard
155
+
156
+ assert args.distributed.dp_replicate % args.distributed.tp_size == 0
157
+ args.distributed.dp_replicate = (
158
+ args.distributed.dp_replicate // args.distributed.tp_size
159
+ )
160
+
161
+ logger.warning(
162
+ f"Setting Data Parallel size to {args.distributed.dp_replicate * args.distributed.dp_shard}"
163
+ )
164
+ assert (
165
+ args.distributed.dp_replicate
166
+ * args.distributed.dp_shard
167
+ * args.distributed.tp_size
168
+ == get_world_size()
169
+ )
170
+
171
+ if args.distributed.fsdp_type == "no_shard":
172
+ assert (
173
+ args.distributed.dp_shard == 1
174
+ and args.distributed.dp_replicate == get_world_size()
175
+ )
176
+
177
+ args.model.max_seqlen = args.data.seq_len
178
+
179
+ if args.distributed.tp_size != 1:
180
+ logger.warning(
181
+ "Tensor parallelism has not been tested for a while, use at your own risk"
182
+ )
183
+
184
+ assert (
185
+ args.probe_freq != args.profiling.mem_steps
186
+ ), "Don't profile during probe step"
187
+ assert (
188
+ args.probe_freq != args.profiling.profile_steps
189
+ ), "Don't profile during probe step"
190
+ if args.logging.wandb is not None:
191
+ args.logging.wandb.name = args.name
192
+
193
+ if args.probe_freq is not None:
194
+ assert (
195
+ args.distributed.tp_size == 1
196
+ ), "Probing not supported with tensor parallelism"
197
+ assert (
198
+ args.distributed.selective_activation_checkpointing is False
199
+ ), "Probing not supported with selective activation checkpointing"
200
+
201
+
202
+ preemption_flag = dict(flag=False)
203
+
204
+
205
+ def set_preemption_flag(signum, frame):
206
+ logger.warning("Signal handler called with signal " + str(signum))
207
+ logger.warning("Preemption! Checkpointing ASAP and exiting.")
208
+ preemption_flag["flag"] = True
209
+
210
+
211
+ def every_n_steps(train_state, freq, acc_step=None, acc_freq=None):
212
+ test = train_state.step % freq == 0
213
+ if acc_step is not None:
214
+ test = test and (train_state.acc_step == acc_step)
215
+ elif acc_freq is not None:
216
+ test = test and ((train_state.acc_step % acc_freq) == 0)
217
+ return test
218
+
219
+
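A quick sanity check of how every_n_steps gates periodic work in the training loop below (a sketch, assuming every_n_steps as defined above is in scope; SimpleNamespace stands in for TrainState):

    from types import SimpleNamespace

    # Fires only when the optimizer step is a multiple of freq and the accumulation phase matches.
    state = SimpleNamespace(step=200, acc_step=0)
    assert every_n_steps(state, freq=100, acc_step=0)      # e.g. logging/checkpointing on full steps
    assert not every_n_steps(state, freq=100, acc_step=1)  # wrong accumulation phase

    # With acc_freq, it instead fires every acc_freq accumulation micro-steps.
    state = SimpleNamespace(step=200, acc_step=2)
    assert every_n_steps(state, freq=100, acc_freq=2)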
220
+ def train(args: TrainArgs):
221
+ with ExitStack() as context_stack:
222
+ tokenizer_args = TokenizerArgs(
223
+ name=args.data.name,
224
+ init_kwargs=args.data.tokenizer.init_kwargs,
225
+ )
226
+ tokenizer = tokenizer_args.build()
227
+ validate_train_args(
228
+ args,
229
+ tokenizer.n_words,
230
+ )
231
+ if get_is_master():
232
+ os.makedirs(args.dump_dir, exist_ok=True)
233
+ dump_config(args, Path(args.dump_dir) / "config.yaml")
234
+ init_logger(Path(args.dump_dir) / "train.log")
235
+ init_signal_handler(set_preemption_flag) # For handling preemption signals.
236
+ setup_env(args.env)
237
+ setup_torch_distributed(args.distributed)
238
+ world_mesh = get_device_mesh(args.distributed)
239
+ logger.info(f"Starting job: {args.name}")
240
+
241
+ # build dataloader
242
+ # need dp world size and rank
243
+ dp_mesh = world_mesh["dp_replicate"]
244
+ dp_degree = dp_mesh.size()
245
+ dp_rank = dp_mesh.get_local_rank()
246
+ if args.distributed.dp_shard > 1:
247
+ dp_rank = dp_rank * dp_degree + world_mesh["dp_shard"].get_local_rank()
248
+ dp_degree *= world_mesh["dp_shard"].size()
249
+
250
+ logger.info(f"Running on dp rank : {dp_rank}")
251
+ logger.info(f"Running on dp size : {dp_degree}")
252
+
253
+ torch.manual_seed(args.seed)
254
+ logger.info("Building model")
255
+
256
+ # Initializing the model on the meta device lets us build models much bigger than a single GPU's memory
257
+ with torch.device("meta"):
258
+ model = LMTransformer(args.model)
259
+ logger.info("Model is built!")
260
+
261
+ model_param_count = get_num_params(model)
262
+
263
+ model = parallelize_model(
264
+ model,
265
+ world_mesh,
266
+ args.model,
267
+ args.distributed,
268
+ fsdp_grouping_plan=build_fsdp_grouping_plan(args.model),
269
+ tp_parallelize=tp_parallelize,
270
+ no_recompute_ops=get_no_recompute_ops(),
271
+ )
272
+
273
+ # Once we shard the model on different gpus we can actually initialize the model
274
+ # First we create empty tensors of the correct shapes
275
+ model = model.to_empty(device="cuda")
276
+ # Then we init the model. Please make sure this function initializes *ALL* parameters
277
+ # and buffers, otherwise you will have random values in the uninitialized tensors
278
+ # which will silently fail (give nan gradients for example)
279
+
280
+ if args.checkpoint.init_ckpt_path:
281
+ logger.info(f"Loading initial model from {args.checkpoint.init_ckpt_path}")
282
+ load_from_checkpoint(
283
+ args.checkpoint.init_ckpt_path, model, model_key="model"
284
+ ) # Put model_key="" if it's directly the model checkpoint
285
+ model.rope_embeddings.reset_parameters() # Re-init RoPE: it's a non-persistent buffer, so it might not be loaded from the checkpoint
286
+ else:
287
+ with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
288
+ torch.manual_seed(args.model.seed)
289
+ model.init_weights()
290
+ check_model_value_range(model, range=10.0, std=1.0)
291
+
292
+ # log model size
293
+
294
+ logger.info(f"Model size: {model_param_count:,} total parameters")
295
+
296
+ gpu_memory_monitor = GPUMemoryMonitor("cuda")
297
+ logger.info(
298
+ f"GPU capacity: {gpu_memory_monitor.device_name} ({gpu_memory_monitor.device_index}) "
299
+ f"with {gpu_memory_monitor.device_capacity_gib:.2f}GiB memory"
300
+ )
301
+ logger.info(f"GPU memory usage: {gpu_memory_monitor}")
302
+
303
+ # build optimizer after apply parallelisms to the model
304
+ optimizer, scheduler = build_optimizer(model, args.optim, args.steps)
305
+ data_loader_state = init_dataloader_state_from_args(
306
+ args.data, dp_rank, dp_degree
307
+ )
308
+
309
+ train_state = TrainState(
310
+ step=0,
311
+ acc_step=0,
312
+ data_loader_state=data_loader_state,
313
+ scheduler=scheduler,
314
+ )
315
+
316
+ checkpoint = CheckpointManager.instantiate_and_make_dir(args.checkpoint)
317
+ checkpoint.load(model, optimizer, train_state, world_mesh)
318
+ # Either load from latest checkpoint or start from scratch
319
+ if args.probe_freq is not None:
320
+ if get_is_master():
321
+ os.makedirs(Path(args.dump_dir) / "probe", exist_ok=True)
322
+ torch.distributed.barrier()
323
+ probe = AutoProbeD(
324
+ model,
325
+ (
326
+ Path(args.dump_dir) / "probe" / f"probe.{dp_rank}.jsonl"
327
+ if (dp_rank % 128 == 0)
328
+ else None
329
+ ),
330
+ )
331
+ probe_mod = model._orig_mod if args.distributed.compile else model
332
+
333
+ gc.disable()
334
+
335
+ # train loop
336
+ model.train()
337
+ metric_logger = context_stack.enter_context(
338
+ MetricLogger(Path(args.dump_dir) / "metrics.jsonl", args)
339
+ )
340
+ data_loader = context_stack.enter_context(
341
+ build_dataloader_from_args(
342
+ args.data,
343
+ state=train_state.data_loader_state,
344
+ )
345
+ )
346
+ torch_profiler = context_stack.enter_context(
347
+ maybe_run_profiler(args.dump_dir, model, args.profiling)
348
+ )
349
+
350
+ nwords_since_last_log = 0
351
+ time_last_log = timer()
352
+ gc.collect()
353
+ while train_state.step < args.steps:
354
+ # We constrain train_state.acc_step to be in range 0 to args.grad_acc_steps - 1
355
+ train_state.acc_step += 1
356
+ train_state.acc_step = train_state.acc_step % args.grad_acc_steps
357
+
358
+ # get batch
359
+ curr_lr = float(optimizer.param_groups[0]["lr"])
360
+ data_load_start = timer()
361
+ batch, train_state.data_loader_state = next(data_loader)
362
+ batch = torch.tensor(
363
+ batch,
364
+ dtype=torch.long,
365
+ )
366
+
367
+ if every_n_steps(train_state, args.gc_collect_freq, acc_step=0):
368
+ logger.info("garbage collection")
369
+ # we do garbage collection manually otherwise different processes
370
+ # run the GC at different times so they slow down the whole pipeline
371
+ gc.collect()
372
+
373
+ input_ids = batch[:, :, 0].cuda()
374
+ labels = batch[:, :, 1].cuda()
375
+ data_load_time = round(timer() - data_load_start, 4)
376
+ nwords_since_last_log += input_ids.numel()
377
+
378
+ bsz, seqlen = labels.shape
379
+
380
+ # forward
381
+ start_timer = torch.cuda.Event(enable_timing=True)
382
+ end_timer = torch.cuda.Event(enable_timing=True)
383
+ start_timer.record()
384
+
385
+ # This is an automatic probe that will compute statistics
386
+ # of all linears' inputs, weights and outputs
387
+ # along with attention logits and entropy
388
+ # both in forward and backward pass
389
+ if (args.probe_freq is not None) and every_n_steps(
390
+ train_state, args.probe_freq, acc_step=1 % args.grad_acc_steps
391
+ ):
392
+ # Here we do a fake forward and backward pass on a smaller
393
+ # batch size to avoid OOM
394
+ # This assumes the model has no stateful layers (batch norm..)
395
+ assert (
396
+ next(probe_mod.parameters()).grad is None
397
+ ), "Can't probe model if grads are not reset"
398
+
399
+ with probe:
400
+ probe.metadata = {
401
+ "it": train_state.step,
402
+ "global_step": train_state.step,
403
+ "loop": "lingua",
404
+ }
405
+ # Non compiled model uses roughly 2x memory in our exps
406
+ # So we divide bsz by 2 or seqlen by 2
407
+ probe_bsz = max(1, bsz // 2)
408
+ probe_seq = seqlen if (bsz // 2 >= 1) else (seqlen // 2)
409
+ probe_loss = probe_mod(
410
+ input_ids[:probe_bsz, :probe_seq],
411
+ labels[:probe_bsz, :probe_seq],
412
+ )
413
+ probe_loss.backward()
414
+ # We zero grads to cancel this fake step
415
+ optimizer.zero_grad()
416
+
417
+ assert (
418
+ next(probe_mod.parameters()).grad is None
419
+ ), "Probe model shouldn't have grads at this point"
420
+ loss = model(input_ids, labels)
421
+
422
+ # We scale loss with grad_acc_steps so the gradient is the same
423
+ # regardless of grad_acc_steps
424
+ loss = loss / args.grad_acc_steps
425
+ # backward on scaled loss to create scaled gradients
426
+ loss.backward()
427
+ # For logging we undo that scaling
428
+ loss = loss.detach() * args.grad_acc_steps
429
+
430
+ grad_norm = torch.nn.utils.clip_grad_norm_(
431
+ model.parameters(), max_norm=args.optim.clip, foreach=True
432
+ )
433
+
434
+ grad_norm = (
435
+ grad_norm.full_tensor() if isinstance(grad_norm, DTensor) else grad_norm
436
+ ).item()
437
+
438
+ # optimizer step
439
+ if train_state.acc_step == 0:
440
+ optimizer.step()
441
+ scheduler.step()
442
+ optimizer.zero_grad()
443
+ train_state.step += 1
444
+
445
+ # updates the scale for next iteration
446
+ # training iteration complete
447
+ end_timer.record()
448
+
449
+ torch.cuda.synchronize()
450
+
451
+ curr_iter_time = round(start_timer.elapsed_time(end_timer) * 1e-3, 4)
452
+
453
+ # if profiler is active
454
+ if torch_profiler:
455
+ xformers.profiler.step()
456
+
457
+ # log metrics
458
+ if every_n_steps(
459
+ train_state,
460
+ args.logging.freq,
461
+ acc_step=None if args.logging.acc_freq else 0,
462
+ acc_freq=args.logging.acc_freq,
463
+ ):
464
+ time_delta = timer() - time_last_log
465
+ wps = nwords_since_last_log / (time_delta * args.distributed.tp_size)
466
+
467
+ gpu_mem_stats = gpu_memory_monitor.get_peak_stats()
468
+
469
+ total_acc_steps = (
470
+ args.grad_acc_steps * train_state.step + train_state.acc_step
471
+ )
472
+ tokens_per_gpu = (
473
+ total_acc_steps * args.data.batch_size * args.data.seq_len
474
+ )
475
+ total_tokens = dp_degree * tokens_per_gpu
476
+ # This is an estimate and the correct values may change
477
+ # if you change the architecture
478
+ # Use xformer's analyze profile trace to get actual measurement
479
+ FLOPS = (
480
+ get_num_flop_per_token(
481
+ model_param_count - args.model.vocab_size * args.model.dim,
482
+ args.model.n_layers,
483
+ args.model.dim,
484
+ args.data.seq_len,
485
+ )
486
+ * wps
487
+ )
488
+ metrics = flatten_dict(
489
+ {
490
+ "global_step": train_state.step,
491
+ "acc_step": train_state.acc_step,
492
+ "speed": {
493
+ "wps": wps,
494
+ "FLOPS": FLOPS,
495
+ "curr_iter_time": curr_iter_time,
496
+ "data_load_time": data_load_time,
497
+ },
498
+ "optim": {
499
+ "grad_norm": grad_norm,
500
+ "lr": curr_lr,
501
+ "total_tokens": total_tokens,
502
+ },
503
+ "memory": gpu_mem_stats._asdict(),
504
+ },
505
+ sep="/",
506
+ )
507
+
508
+ to_sync = {}
509
+ to_sync["loss/out"] = loss.item()
510
+ metrics.update(dist_mean_dict(to_sync))
511
+
512
+ if get_is_master():
513
+ metric_logger.log(metrics)
514
+
515
+ gpu_memory_monitor.reset_peak_stats()
516
+ nwords_since_last_log = 0
517
+ time_last_log = timer()
518
+ logger.info(
519
+ f"step: {train_state.step}"
520
+ f" acc: {train_state.acc_step}"
521
+ f" loss: {round(loss.item(),4):>7}"
522
+ f" grad: {grad_norm:.2e}"
523
+ f" flops: {FLOPS:.2e}"
524
+ f" wps: {wps:.2e}"
525
+ f" iter: {curr_iter_time:>7}"
526
+ f" data: {data_load_time:>5}"
527
+ f" lr: {curr_lr:.2e}"
528
+ f" mem: {gpu_mem_stats.max_active_pct:.0f}%"
529
+ f" pow: {gpu_mem_stats.power_draw/1000} W"
530
+ )
531
+
532
+ saved = False
533
+ if every_n_steps(
534
+ train_state, args.checkpoint.dump.every, acc_step=0
535
+ ) or every_n_steps(train_state, args.checkpoint.eval.every, acc_step=0):
536
+ saved = checkpoint.save(
537
+ model,
538
+ optimizer,
539
+ train_state,
540
+ args,
541
+ device_mesh=world_mesh,
542
+ )
543
+
544
+ if args.eval is not None and every_n_steps(
545
+ train_state, args.checkpoint.eval.every, acc_step=0
546
+ ):
547
+ from apps.main.eval import EVAL_FOLDER_NAME, EvalArgs, launch_eval
548
+
549
+ eval_args = dataclass_from_dict(EvalArgs, args.eval)
550
+
551
+ eval_args.global_step = train_state.step
552
+ eval_args.ckpt_dir = str(checkpoint.existing_saves[-1])
553
+ eval_args.dump_dir = str(
554
+ os.path.join(
555
+ args.dump_dir,
556
+ "evals",
557
+ EVAL_FOLDER_NAME.format(train_state.step),
558
+ )
559
+ )
560
+ eval_args.metric_log_dir = args.dump_dir
561
+ if args.async_eval_gpus is None:
562
+ launch_eval(eval_args)
563
+ elif get_is_master():
564
+ if wandb.run is not None and args.logging.wandb is not None:
565
+ eval_args.wandb = deepcopy(args.logging.wandb)
566
+ assert args.async_eval_gpus > 0
567
+ logger.info(f"Launching evals on {args.async_eval_gpus} gpus")
568
+ with clean_env():
569
+ launch_job(
570
+ StoolArgs(
571
+ asdict(eval_args),
572
+ script="apps.main.eval",
573
+ copy_code=False,
574
+ nodes=args.async_eval_gpus // 8,
575
+ qos="lowest",
576
+ )
577
+ )
578
+
579
+ if preemption_flag["flag"]:
580
+ if not saved:
581
+ checkpoint.save(
582
+ model,
583
+ optimizer,
584
+ train_state,
585
+ args,
586
+ device_mesh=world_mesh,
587
+ )
588
+ requeue_slurm_job()
589
+ sys.exit(0)
590
+
591
+ if not saved:
592
+ checkpoint.save(
593
+ model,
594
+ optimizer,
595
+ train_state,
596
+ args,
597
+ device_mesh=world_mesh,
598
+ )
599
+ gc.collect()
600
+
601
+
602
+ def main():
603
+ """
604
+ The command line interface here uses OmegaConf https://omegaconf.readthedocs.io/en/2.3_branch/usage.html#from-command-line-arguments
605
+ This accepts arguments as a dot list
606
+ So if the dataclass looks like
607
+
608
+ @dataclass
609
+ class DummyArgs:
610
+ name: str
611
+ model: LMTransformerArgs
612
+
613
+ @dataclass
614
+ class LMTransformerArgs:
615
+ dim: int
616
+
617
+ Then you can pass model.dim=32 to change values in LMTransformerArgs
618
+ or just name=tictac for top level attributes.
619
+
620
+ The behavior here is as follows:
621
+ 1. We instantiate TrainArgs with its default values
622
+ 2. We override those default values with the ones in the provided config file
623
+ 3. We override the result with the additional arguments provided through command line
624
+
625
+ For example, if the config is the following
626
+
627
+ model:
628
+ dim: 128
629
+ n_layers: 4
630
+
631
+ and you call train.py with model.dim=64 on the command line
632
+
633
+ Then the final TrainArgs will have
634
+
635
+ model:
636
+ dim: 64
637
+ n_layers: 4
638
+
639
+ Plus all the default values in TrainArgs dataclass.
640
+ """
641
+ cli_args = OmegaConf.from_cli()
642
+ file_cfg = OmegaConf.load(cli_args.config)
643
+ # We remove 'config' attribute from config as the underlying DataClass does not have it
644
+ del cli_args.config
645
+
646
+ default_cfg = OmegaConf.structured(TrainArgs())
647
+ cfg = OmegaConf.merge(default_cfg, file_cfg, cli_args)
648
+ cfg = OmegaConf.to_object(cfg)
649
+
650
+ train(cfg)
651
+
652
+
653
+ if __name__ == "__main__":
654
+ main()
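The default -> YAML -> CLI precedence described in the docstring above can be exercised in isolation. A minimal sketch (the dataclasses are hypothetical stand-ins, and the YAML and CLI inputs are inlined instead of coming from a file or sys.argv):

    from dataclasses import dataclass, field
    from omegaconf import OmegaConf

    @dataclass
    class ModelArgs:
        dim: int = 128
        n_layers: int = 4

    @dataclass
    class DummyTrainArgs:
        name: str = "lingua"
        model: ModelArgs = field(default_factory=ModelArgs)

    default_cfg = OmegaConf.structured(DummyTrainArgs())               # 1. defaults
    file_cfg = OmegaConf.create({"model": {"dim": 256}})               # 2. stands in for the YAML config file
    cli_cfg = OmegaConf.from_dotlist(["model.dim=64", "name=tictac"])  # 3. stands in for the CLI dot list
    cfg = OmegaConf.merge(default_cfg, file_cfg, cli_cfg)
    print(OmegaConf.to_yaml(cfg))  # name: tictac, model: {dim: 64, n_layers: 4}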
blt-figure.jpg ADDED
blt-figure.pdf ADDED
Binary file (62.5 kB).
 
bytelatent/.DS_Store ADDED
Binary file (6.15 kB).
 
bytelatent/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ class ByteLatentError(Exception):
3
+ pass
bytelatent/args.py ADDED
@@ -0,0 +1,199 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import logging
3
+ import os
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+ import yaml
8
+ from pydantic import BaseModel, ConfigDict
9
+
10
+ from bytelatent.checkpoint import CheckpointArgs
11
+ from bytelatent.data.data_types import Batch
12
+ from bytelatent.data.iterators.abstract_iterator import StatefulIterator
13
+ from bytelatent.data.iterators.arrow_iterator import (
14
+ ArrowFileIterator,
15
+ find_and_sanitize_chunks,
16
+ )
17
+ from bytelatent.data.iterators.looping_iterator import LoopingIterator
18
+ from bytelatent.data.iterators.multiprocess_iterator import MultiprocessIterator
19
+ from bytelatent.data.iterators.packing_iterator import PackingArgs, PackingIterator
20
+ from bytelatent.data.iterators.preprocess_iterator import PreprocessIterator
21
+ from bytelatent.data.iterators.sampling_iterator import SamplingIterator
22
+ from bytelatent.data.iterators.sequence_iterator import (
23
+ SequenceIterator,
24
+ SequencePackingArgs,
25
+ )
26
+ from bytelatent.data.patcher import PatcherArgs
27
+ from bytelatent.distributed import DistributedArgs, EnvironmentArgs
28
+ from bytelatent.metrics import LoggingArgs
29
+ from bytelatent.model.blt import ByteLatentTransformerArgs
30
+ from bytelatent.optim import OptimArgs
31
+ from bytelatent.profiling import ProfilerArgs
32
+ from bytelatent.tokenizers.build_tokenizer import TokenizerArgs
33
+
34
+ logger = logging.getLogger()
35
+
36
+
37
+ def get_rng_state(seed: int, rank: int, world_size: int) -> dict[str, Any]:
38
+ return np.random.default_rng((seed, rank, world_size)).bit_generator.state
39
+
40
+
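A sketch of how these per-rank RNG states are meant to be consumed (restoring a NumPy Generator from the stored bit_generator state; get_rng_state above is assumed to be in scope):

    import numpy as np

    state = get_rng_state(seed=42, rank=0, world_size=8)
    rng = np.random.default_rng()
    rng.bit_generator.state = state  # restore the per-rank stream, e.g. after a checkpoint reload
    print(rng.random())              # deterministic given (seed, rank, world_size)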
41
+ def distribute_data_to_rank(
42
+ *,
43
+ dataset_path: str,
44
+ preprocess_dir: str,
45
+ entropy_model_name: str | None,
46
+ arrow_batch_size: int,
47
+ rank: int,
48
+ world_size: int,
49
+ ) -> ArrowFileIterator:
50
+ dataset_chunks = find_and_sanitize_chunks(dataset_path, world_size)
51
+ n_workers_per_chunk = world_size // len(dataset_chunks)
52
+ rank_to_arrow_iterator_params = []
53
+ for chunk_path in dataset_chunks:
54
+ for worker_id in range(n_workers_per_chunk):
55
+ rank_to_arrow_iterator_params.append(
56
+ ArrowFileIterator(
57
+ file_path=chunk_path,
58
+ worker_id=worker_id,
59
+ num_workers=n_workers_per_chunk,
60
+ preprocess_dir=preprocess_dir,
61
+ dataset_files=None,
62
+ entropy_model_name=entropy_model_name,
63
+ arrow_batch_size=arrow_batch_size,
64
+ )
65
+ )
66
+ return rank_to_arrow_iterator_params[rank]
67
+
68
+
69
+ class DataloaderArgs(BaseModel):
70
+ model_config = ConfigDict(extra="forbid")
71
+ root_dir: str | None = None
72
+ sources: dict[str, float] = {}
73
+ batch_size: int = 2
74
+ seq_len: int = 2048
75
+ seed: int = 42
76
+ add_bos: bool = True
77
+ add_eos: bool = True
78
+ load_async: bool = True
79
+ prefetch_size: int = 64
80
+ preprocess_dir: str | None = None
81
+ dataset_files: list[str] | None = None
82
+ entropy_model_name: str | None = "transformer_100m"
83
+ arrow_batch_size: int = 100
84
+ buffer_size: int = 64
85
+
86
+ pad_to_max_length: bool = True
87
+ max_encoder_seq_length: int = 12288
88
+ enable_byte_ngrams: bool = False
89
+
90
+ tokenizer_args: TokenizerArgs = TokenizerArgs()
91
+ patcher_args: PatcherArgs = PatcherArgs()
92
+
93
+ def _create_sequence_iterators(
94
+ self, rank: int, world_size: int
95
+ ) -> dict[str, SequenceIterator]:
96
+ sequence_packing_args = SequencePackingArgs(
97
+ output_seq_len=self.seq_len,
98
+ buffer_size=self.buffer_size,
99
+ )
100
+ source_to_sequence_iterator: dict[str, SequenceIterator] = {}
101
+ for dataset_path in self.sources:
102
+ shuffle_rng_state = get_rng_state(self.seed + 1, rank, world_size)
103
+ arrow_iterator = distribute_data_to_rank(
104
+ dataset_path=os.path.join(self.root_dir, dataset_path),
105
+ preprocess_dir=self.preprocess_dir,
106
+ entropy_model_name=self.entropy_model_name,
107
+ arrow_batch_size=self.arrow_batch_size,
108
+ rank=rank,
109
+ world_size=world_size,
110
+ )
111
+ looping_iterator = LoopingIterator(arrow_iterator)
112
+ preprocess_iterator = PreprocessIterator(
113
+ looping_iterator,
114
+ patcher_args=self.patcher_args,
115
+ tokenizer_args=self.tokenizer_args,
116
+ )
117
+ sequence_iterator = SequenceIterator(
118
+ preprocess_iterator,
119
+ sequence_packing_args=sequence_packing_args,
120
+ rng_state=shuffle_rng_state,
121
+ )
122
+
123
+ source_to_sequence_iterator[dataset_path] = sequence_iterator
124
+ return source_to_sequence_iterator
125
+
126
+ def build_from_rank(
127
+ self, rank: int, world_size: int
128
+ ) -> StatefulIterator[Batch, Any]:
129
+ source_to_sequence_iterators = self._create_sequence_iterators(rank, world_size)
130
+ weight_rng_state = get_rng_state(self.seed + 1, rank, world_size)
131
+ sampling_iterator = SamplingIterator(
132
+ rng_state=weight_rng_state,
133
+ source_to_weight=self.sources,
134
+ source_to_iterator=source_to_sequence_iterators,
135
+ )
136
+ tokenizer = self.tokenizer_args.build()
137
+ packing_args = PackingArgs(
138
+ batch_size=self.batch_size,
139
+ seq_len=self.seq_len,
140
+ pad_id=tokenizer.boe_id,
141
+ max_length=self.max_encoder_seq_length,
142
+ pad_to_max_length=self.pad_to_max_length,
143
+ enable_byte_ngrams=self.enable_byte_ngrams,
144
+ )
145
+ packing_iterator = PackingIterator(sampling_iterator, packing_args=packing_args)
146
+ mp_iterator = MultiprocessIterator(
147
+ packing_iterator, n_batches_to_prefetch=self.prefetch_size
148
+ )
149
+
150
+ return mp_iterator
151
+
152
+
153
+ class TrainArgs(BaseModel):
154
+ model_config = ConfigDict(extra="forbid")
155
+ name: str = "lingua"
156
+ dump_dir: str = ""
157
+
158
+ seed: int = 42
159
+
160
+ # Number of gradient accumulation steps
161
+ # Total batch size is batch_size*grad_acc_steps
162
+ grad_acc_steps: int = 1
163
+
164
+ gc_collect_freq: int = 1000
165
+ probe_freq: int | None = None
166
+
167
+ # Nb optimizer steps to take
168
+ steps: int = 1000
169
+
170
+ data: DataloaderArgs = DataloaderArgs()
171
+ optim: OptimArgs = OptimArgs()
172
+ model: ByteLatentTransformerArgs = ByteLatentTransformerArgs()
173
+ distributed: DistributedArgs = DistributedArgs()
174
+ env: EnvironmentArgs = EnvironmentArgs()
175
+
176
+ checkpoint: CheckpointArgs = CheckpointArgs()
177
+ profiling: ProfilerArgs = ProfilerArgs()
178
+ logging: LoggingArgs = LoggingArgs()
179
+
180
+ # If set to None, eval is run locally otherwise it launches a new job with the given number of gpus
181
+ async_eval_gpus: int | None = None
182
+ eval: Any | None = None
183
+ eval_on_gpus: int | None = None
184
+
185
+ def dump_to_yaml_file(
186
+ self, path: str, log_config: bool = True, sort_keys: bool = True
187
+ ):
188
+ model_dict = self.model_dump(mode="json")
189
+ yaml_str = yaml.dump(
190
+ model_dict,
191
+ allow_unicode=True,
192
+ sort_keys=sort_keys,
193
+ default_flow_style=False,
194
+ )
195
+ with open(path, "w") as f:
196
+ if log_config:
197
+ logger.info("Using the following config for this run:")
198
+ logger.info(yaml_str)
199
+ f.write(yaml_str)
bytelatent/base_transformer.py ADDED
@@ -0,0 +1,585 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from enum import Enum
4
+ from typing import Optional, Tuple, Union
5
+
6
+ import torch
7
+ from pydantic import BaseModel
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+ from torch.nn.attention.flex_attention import (
11
+ BlockMask,
12
+ _mask_mod_signature,
13
+ flex_attention,
14
+ )
15
+ from xformers.ops import AttentionBias, fmha
16
+
17
+ from bytelatent import probe
18
+
19
+ flex_attention_comp = torch.compile(flex_attention)
20
+
21
+
22
+ class InitStdFactor(Enum):
23
+ DISABLED = "disabled" # Init std is divided by 1.0
24
+ GLOBAL_DEPTH = "global_depth" # Init std is divided by sqrt(2*n_layers)
25
+ CURRENT_DEPTH = "current_depth" # Init std is divided by sqrt(2*depth)
26
+ DIM_RATIO = "dim_ratio" # Init std is divided by model_dim/4096
27
+
28
+
29
+ class BaseTransformerArgs(BaseModel):
30
+ dim: int = 512
31
+ n_layers: int = 8
32
+ head_dim: Optional[int] = None
33
+ n_heads: Optional[int] = None
34
+ n_kv_heads: Optional[int] = None
35
+
36
+ ffn_dim_multiplier: Optional[float] = None
37
+
38
+ multiple_of: int = 256
39
+
40
+ norm_eps: float = 1e-5
41
+
42
+ rope_theta: float = 10000.0
43
+
44
+ init_base_std: Optional[float] = None
45
+ init_std_factor: InitStdFactor = InitStdFactor.DISABLED
46
+
47
+ max_seqlen: int = 1024
48
+
49
+
50
+ def cross_entropy(pred, target, **kwargs):
51
+ return F.nll_loss(
52
+ F.log_softmax(pred.flatten(end_dim=-2).float(), -1),
53
+ target.flatten(end_dim=-1),
54
+ **kwargs,
55
+ )
56
+
57
+
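This helper is log_softmax plus nll_loss on logits upcast to float32. A quick equivalence check against F.cross_entropy (a sketch, assuming cross_entropy above is in scope):

    import torch
    import torch.nn.functional as F

    pred = torch.randn(2, 5, 11, dtype=torch.bfloat16)  # (batch, seq, vocab) logits
    target = torch.randint(0, 11, (2, 5))
    a = cross_entropy(pred, target)
    b = F.cross_entropy(pred.flatten(end_dim=-2).float(), target.flatten())
    print(torch.allclose(a, b))  # True: both compute the mean loss in float32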
58
+ def repeat_kv(x: torch.Tensor, n_rep: int, dim: int) -> torch.Tensor:
59
+ """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
60
+ assert dim == 2, "Only dim=2 is supported. Check the implementation for other dims."
61
+ bs, slen, n_kv_heads, head_dim = x.shape
62
+ if n_rep == 1:
63
+ return x
64
+ return (
65
+ x[:, :, :, None, :]
66
+ .expand(bs, slen, n_kv_heads, n_rep, head_dim)
67
+ .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
68
+ )
69
+
70
+
71
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
72
+ """
73
+ Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
74
+
75
+ This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
76
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
77
+ The returned tensor stores the cos/sin values as real-valued 2x2 rotation blocks of shape (end, dim // 2, 2, 2).
78
+
79
+ Args:
80
+ dim (int): Dimension of the frequency tensor.
81
+ end (int): End index for precomputing frequencies.
82
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
83
+
84
+ Returns:
85
+ torch.Tensor: Precomputed frequency tensor with complex exponentials.
86
+ """
87
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
88
+ t = torch.arange(end, device=freqs.device)
89
+ freqs = torch.outer(t, freqs).float()
90
+
91
+ cos, sin = freqs.cos(), freqs.sin()
92
+
93
+ return torch.stack((cos, -sin, sin, cos), dim=-1).view(*freqs.size(), 2, 2)
94
+
95
+
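A small shape check for the precomputed RoPE table (a sketch; dim=4 and end=3 are arbitrary, and precompute_freqs_cis above is assumed to be in scope):

    import torch

    fc = precompute_freqs_cis(dim=4, end=3)
    print(fc.shape)  # torch.Size([3, 2, 2, 2]) -> (position, dim // 2, 2, 2)
    # Each trailing 2x2 block is a rotation matrix [[cos, -sin], [sin, cos]];
    # position 0 is the identity rotation for every frequency.
    print(torch.allclose(fc[0], torch.eye(2).expand(2, 2, 2)))  # True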
96
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor, seq_dim: int):
97
+ """
98
+ Reshape frequency tensor for broadcasting it with another tensor.
99
+
100
+ This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
101
+ for the purpose of broadcasting the frequency tensor during element-wise operations.
102
+
103
+ Args:
104
+ freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
105
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
106
+ seq_dim (int): Sequence dimension index.
107
+
108
+ Returns:
109
+ torch.Tensor: Reshaped frequency tensor.
110
+ """
111
+ ndim = x.ndim
112
+ assert 0 <= seq_dim < ndim
113
+ assert freqs_cis.shape == (
114
+ x.shape[seq_dim],
115
+ x.shape[-3],
116
+ 2,
117
+ 2,
118
+ ), f"freqs_cis vs x: {(freqs_cis.shape, x.shape)}"
119
+ shape = [
120
+ d if i == seq_dim or i == ndim - 3 else 1 for i, d in enumerate(x.shape[:-2])
121
+ ] + [2, 2]
122
+ return freqs_cis.view(*shape)
123
+
124
+
125
+ def apply_rotary_emb(
126
+ xq: torch.Tensor,
127
+ xk: torch.Tensor,
128
+ seq_dim: int,
129
+ freqs_cis: torch.Tensor,
130
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
131
+ xq_ = xq.reshape(*xq.shape[:-1], -1, 1, 2) # B S H D -> B S H D/2 1 2
132
+ xk_ = xk.reshape(*xk.shape[:-1], -1, 1, 2) # B S H D -> B S H D/2 1 2
133
+ freqs_cis = reshape_for_broadcast(
134
+ freqs_cis, xq_, seq_dim
135
+ ).float() # S D/2 2 2 -> 1 S 1 D/2 2 2
136
+ xq_out = (xq_ * freqs_cis).sum(5).flatten(3)
137
+ xk_out = (xk_ * freqs_cis).sum(5).flatten(3)
138
+ return xq_out.type_as(xq), xk_out.type_as(xk)
139
+
140
+
141
+ def causal_mask(b, h, q_idx, kv_idx):
142
+ return q_idx >= kv_idx
143
+
144
+
145
+ def lengths_to_start_ids(lengths):
146
+ doc_start = lengths.cumsum(0)
147
+ doc_start = doc_start.roll(1)
148
+ doc_start[0] = 0
149
+ return doc_start
150
+
151
+
152
+ def lengths_to_local_ids(lengths):
153
+ assert lengths.ndim == 1
154
+ nb_seqs = lengths.size(0)
155
+ total_seqlen = lengths.sum()
156
+ # This gives the document id of each token
157
+ doc_id = torch.repeat_interleave(lengths)
158
+ # Compute document start for each document
159
+ doc_start = lengths_to_start_ids(lengths)
160
+ # Compute document start for each token
161
+ doc_start = doc_start[doc_id]
162
+ # Compute the position of each token within each document
163
+ tok_id = torch.arange(total_seqlen, device=lengths.device) - doc_start
164
+
165
+ return doc_id, tok_id
166
+
167
+
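A tiny worked example of the two helpers above (a sketch, assuming both are in scope):

    import torch

    lengths = torch.tensor([2, 3])
    doc_id, tok_id = lengths_to_local_ids(lengths)
    print(lengths_to_start_ids(lengths))  # tensor([0, 2]) -> start offset of each document
    print(doc_id)                         # tensor([0, 0, 1, 1, 1]) -> owning document of each stacked token
    print(tok_id)                         # tensor([0, 1, 0, 1, 2]) -> position of each token inside its document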
168
+ def generate_doc_mask_mod(
169
+ mask_mod: _mask_mod_signature,
170
+ lengths: torch.Tensor,
171
+ kv_lengths: Optional[torch.Tensor] = None,
172
+ ) -> _mask_mod_signature:
173
+ """Generates mask mods that apply to inputs to flex attention in the sequence stacked
174
+ format.
175
+
176
+ Args:
177
+ mask_mod: The mask mod to apply to the documents
178
+ lengths: Lengths of each document
179
+
180
+ Note:
181
+ What is the sequence stacked format? When assembling batches of inputs, we
182
+ take multiple sequences and stack them together to form 1 large sequence. We then
183
+ use masking to ensure that the attention scores are only applied to tokens within
184
+ the same document.
185
+
186
+ Example:
187
+
188
+ - Square mask
189
+ doc_mask lengths
190
+ a a b b b c c 2 3 2
191
+ a 1 0 0 0 0 0 0
192
+ a 1 1 0 0 0 0 0
193
+ b 0 0 1 0 0 0 0
194
+ b 0 0 1 1 0 0 0
195
+ b 0 0 1 1 1 0 0
196
+ c 0 0 0 0 0 1 0
197
+ c 0 0 0 0 0 1 1
198
+
199
+ """
200
+ kv_lengths = kv_lengths if kv_lengths is not None else lengths
201
+ q_document_id, q_token_id = lengths_to_local_ids(lengths)
202
+ kv_document_id, kv_token_id = lengths_to_local_ids(kv_lengths)
203
+ q_max_idx = lengths.sum() - 1
204
+ kv_max_idx = kv_lengths.sum() - 1
205
+
206
+ def doc_mask_mod(b, h, q_idx, kv_idx):
207
+ q_idx_cap = torch.minimum(q_max_idx, q_idx)
208
+ kv_idx_cap = torch.minimum(kv_max_idx, kv_idx)
209
+ valid_idx = (q_idx <= q_max_idx) & (kv_idx <= kv_max_idx)
210
+ same_doc = q_document_id[q_idx_cap] == kv_document_id[kv_idx_cap]
211
+ q_logical = q_token_id[q_idx_cap]
212
+ kv_logical = kv_token_id[kv_idx_cap]
213
+ inner_mask = mask_mod(b, h, q_logical, kv_logical)
214
+ return same_doc & inner_mask & valid_idx
215
+
216
+ return doc_mask_mod
217
+
218
+
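A hedged sketch of turning this mask mod into a flex_attention BlockMask for the stacked-documents example in the docstring (create_block_mask is PyTorch's public torch.nn.attention.flex_attention helper; the out-of-range capping above is what keeps the mod valid when lengths are padded up to the block size):

    import torch
    from torch.nn.attention.flex_attention import create_block_mask

    lengths = torch.tensor([2, 3, 2])
    total_len = int(lengths.sum())
    doc_mask_mod = generate_doc_mask_mod(causal_mask, lengths)
    block_mask = create_block_mask(
        doc_mask_mod, B=None, H=None, Q_LEN=total_len, KV_LEN=total_len, device="cpu"
    )
    # block_mask can then be passed as `mask` to Attention.forward with attn_impl="flex_attention".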
219
+ # Rotary embedding as in xformers; check whether the torchtrain implementation is better. It might also be useful to make it work with batch*seqlen collapsed.
220
+ class RotaryEmbedding(torch.nn.Module):
221
+ """
222
+ RotaryEmbedding Module
223
+ """
224
+
225
+ def __init__(self, theta: float, head_dim: int, max_seqlen: int = 1024):
226
+ super().__init__()
227
+
228
+ self.theta = theta
229
+ self.head_dim = head_dim
230
+ self.max_seqlen = max_seqlen
231
+
232
+ self.register_buffer(
233
+ "freqs_cis",
234
+ precompute_freqs_cis(dim=head_dim, end=max_seqlen, theta=theta),
235
+ persistent=False,
236
+ )
237
+
238
+ def reset_parameters(self):
239
+ self.freqs_cis[...] = precompute_freqs_cis(
240
+ dim=self.head_dim, end=self.max_seqlen, theta=self.theta
241
+ )
242
+
243
+ def forward(
244
+ self, seqlen: Optional[int] = None, tok_idx: Optional[torch.Tensor] = None
245
+ ):
246
+ """
247
+ Return freqs_cis corresponding to consecutive seqlen positions or the corresponding tok_idx positions
248
+ Args:
249
+ seqlen (int): Contiguous sequence length
250
+ tok_idx (torch.Tensor[int]): Position indices of each token this overrides seqlen
251
+
252
+ Returns:
253
+ Tuple(torch.Tensor, torch.Tensor): Embedded input tensor and freqs_cis
254
+ """
255
+ test = (seqlen is not None) or (tok_idx is not None)
256
+ assert test, "Should provide at least seqlen or tok_idx"
257
+ if tok_idx is not None:
258
+ return self.freqs_cis[tok_idx]
259
+ elif seqlen is not None:
260
+ return self.freqs_cis[0:seqlen]
261
+
262
+
263
+ class RMSNorm(nn.Module):
264
+ """
265
+ Initialize the RMSNorm normalization layer.
266
+
267
+ Args:
268
+ dim (int): The dimension of the input tensor.
269
+ eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.
270
+
271
+ Attributes:
272
+ eps (float): A small value added to the denominator for numerical stability.
273
+ weight (nn.Parameter): Learnable scaling parameter.
274
+
275
+ """
276
+
277
+ def __init__(self, dim: int, eps: float = 1e-6):
278
+ super().__init__()
279
+ self.eps = eps
280
+ self.weight = nn.Parameter(torch.ones(dim))
281
+
282
+ def _norm(self, x: torch.Tensor):
283
+ return x * torch.rsqrt((x * x).mean(-1, keepdim=True) + self.eps)
284
+
285
+ def forward(self, x: torch.Tensor):
286
+ x = probe.log_stats(x, "resid")
287
+ output = self._norm(x.float())
288
+ return (output * self.weight.float()).type_as(x)
289
+
290
+ def reset_parameters(self):
291
+ torch.nn.init.ones_(self.weight) # type: ignore
292
+
293
+
294
+ class Attention(nn.Module):
295
+ def __init__(
296
+ self,
297
+ dim: int,
298
+ head_dim: int,
299
+ n_heads: int,
300
+ n_kv_heads: int,
301
+ rope_theta: float,
302
+ ):
303
+ super().__init__()
304
+
305
+ self.dim = dim
306
+ self.head_dim = head_dim
307
+ self.rope_theta = rope_theta
308
+
309
+ self.n_heads = n_heads
310
+ self.n_kv_heads = n_kv_heads
311
+ self.heads_per_group = self.n_heads // self.n_kv_heads
312
+
313
+ self.wq = nn.Linear(
314
+ dim,
315
+ n_heads * head_dim,
316
+ bias=False,
317
+ )
318
+ self.wk = nn.Linear(
319
+ dim,
320
+ n_kv_heads * head_dim,
321
+ bias=False,
322
+ )
323
+ self.wv = nn.Linear(
324
+ dim,
325
+ n_kv_heads * head_dim,
326
+ bias=False,
327
+ )
328
+
329
+ self.wo = nn.Linear(
330
+ n_heads * head_dim,
331
+ dim,
332
+ bias=False,
333
+ )
334
+
335
+ def forward(
336
+ self,
337
+ x: torch.Tensor,
338
+ freq_cis: torch.Tensor,
339
+ tok_idx: Optional[torch.Tensor] = None,
340
+ mask: Optional[Union[BlockMask, AttentionBias, str]] = None,
341
+ attn_impl: str = "sdpa",
342
+ ) -> torch.Tensor:
343
+ # B S D
344
+ bsz, seq_len, dim = x.shape
345
+ xq = self.wq(x.view_as(x))
346
+ xk = self.wk(x.view_as(x))
347
+ xv = self.wv(x.view_as(x))
348
+
349
+ output_shape = xq.shape
350
+ # B S D -> B S H D
351
+ xq = xq.view(bsz, seq_len, self.n_heads, self.head_dim)
352
+ xk = xk.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
353
+ xv = xv.view(bsz, seq_len, self.n_kv_heads, self.head_dim)
354
+
355
+ xq, xk = apply_rotary_emb(xq, xk, 1, freq_cis[0:seq_len])
356
+
357
+ # This condition helps us be easily compatible
358
+ # with inference by adding a pluggable KVCache
359
+ if hasattr(self, "kv_cache"):
360
+ xk, xv = self.kv_cache.update(xk, xv, tok_idx)
361
+
362
+ xk = repeat_kv(xk, self.heads_per_group, dim=2)
363
+ xv = repeat_kv(xv, self.heads_per_group, dim=2)
364
+
365
+ if attn_impl == "flex_attention":
366
+ assert mask is None or isinstance(mask, BlockMask)
367
+ xq, xk, xv = map(lambda e: e.transpose(1, 2), (xq, xk, xv))
368
+ output = flex_attention_comp(xq, xk, xv, block_mask=mask)
369
+ output = output.transpose(1, 2).contiguous() # B H S D -> B S H D
370
+
371
+ elif attn_impl == "fmha":
372
+ assert mask is None or isinstance(mask, AttentionBias)
373
+ output = fmha.memory_efficient_attention(xq, xk, xv, attn_bias=mask)
374
+ # This uses B S H D instead of B H S D of pytorch
375
+
376
+ elif attn_impl == "sdpa":
377
+ xq, xk, xv = map(lambda e: e.transpose(1, 2), (xq, xk, xv))
378
+ assert mask is None or isinstance(mask, (str, torch.Tensor))
379
+ is_causal = (mask == "causal") if isinstance(mask, str) else False
380
+ mask = mask if isinstance(mask, torch.Tensor) else None
381
+ output = F.scaled_dot_product_attention(
382
+ xq,
383
+ xk,
384
+ xv,
385
+ is_causal=is_causal,
386
+ attn_mask=mask,
387
+ )
388
+ output = output.transpose(1, 2).contiguous() # B H S D -> B S H D
389
+ else:
390
+ raise NotImplementedError(
391
+ f"Attention implementation {attn_impl} not supported"
392
+ )
393
+
394
+ output = self.wo(output.reshape(output_shape))
395
+
396
+ return output
397
+
398
+ def reset_parameters(self, init_std=None, factor=1.0):
399
+ init_std = init_std or (self.dim ** (-0.5))
400
+
401
+ for w in [self.wq, self.wk, self.wv]:
402
+ nn.init.trunc_normal_(
403
+ w.weight,
404
+ mean=0.0,
405
+ std=init_std,
406
+ a=-3 * init_std,
407
+ b=3 * init_std,
408
+ )
409
+
410
+ nn.init.trunc_normal_(
411
+ self.wo.weight,
412
+ mean=0.0,
413
+ std=init_std / factor,
414
+ a=-3 * init_std,
415
+ b=3 * init_std,
416
+ )
417
+
418
+
419
+ class FeedForward(nn.Module):
420
+ def __init__(
421
+ self,
422
+ dim: int,
423
+ hidden_dim: int,
424
+ multiple_of: int,
425
+ ffn_dim_multiplier: Optional[float],
426
+ mp_size: int = 1,
427
+ ):
428
+ super().__init__()
429
+
430
+ hidden_dim = int(2 * hidden_dim / 3)
431
+ if ffn_dim_multiplier is not None:
432
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
433
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
434
+ assert hidden_dim % mp_size == 0
435
+
436
+ self.dim = dim
437
+ self.hidden_dim = hidden_dim
438
+
439
+ self.w1 = nn.Linear(
440
+ dim,
441
+ hidden_dim,
442
+ bias=False,
443
+ )
444
+ self.w3 = nn.Linear(
445
+ dim,
446
+ hidden_dim,
447
+ bias=False,
448
+ )
449
+ self.w2 = nn.Linear(
450
+ hidden_dim,
451
+ dim,
452
+ bias=False,
453
+ )
454
+
455
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
456
+ # B S D
457
+ x1 = self.w1(x.view_as(x))
458
+ x3 = self.w3(x.view_as(x))
459
+ output = self.w2(F.silu(x1) * x3)
460
+ return output
461
+
462
+ def reset_parameters(self, init_std=None, factor=1.0):
463
+ in_init_std = init_std or (self.dim ** (-0.5))
464
+ out_init_std = init_std or (self.hidden_dim ** (-0.5))
465
+ in_init_std = in_init_std
466
+ out_init_std = out_init_std / factor
467
+ for w in [self.w1, self.w3]:
468
+ nn.init.trunc_normal_(
469
+ w.weight,
470
+ mean=0.0,
471
+ std=in_init_std,
472
+ a=-3 * in_init_std,
473
+ b=3 * in_init_std,
474
+ )
475
+ nn.init.trunc_normal_(
476
+ self.w2.weight,
477
+ mean=0.0,
478
+ std=out_init_std,
479
+ a=-3 * out_init_std,
480
+ b=3 * out_init_std,
481
+ )
482
+
483
+
484
+ class TransformerBlock(nn.Module):
485
+ def __init__(self, args: BaseTransformerArgs):
486
+ super().__init__()
487
+
488
+ assert (args.head_dim is not None) or (
489
+ args.n_heads is not None
490
+ ), "Should specify at least head_dim or n_heads"
491
+ self.head_dim = args.head_dim or args.dim // args.n_heads
492
+ self.n_heads = args.n_heads or args.dim // args.head_dim
493
+ self.n_kv_heads = args.n_kv_heads or self.n_heads
494
+
495
+ assert args.n_heads % self.n_kv_heads == 0
496
+ assert args.dim % args.n_heads == 0
497
+
498
+ self.attention = Attention(
499
+ dim=args.dim,
500
+ head_dim=self.head_dim,
501
+ n_heads=self.n_heads,
502
+ n_kv_heads=self.n_kv_heads,
503
+ rope_theta=args.rope_theta,
504
+ )
505
+ self.feed_forward = FeedForward(
506
+ dim=args.dim,
507
+ hidden_dim=4 * args.dim,
508
+ multiple_of=args.multiple_of,
509
+ ffn_dim_multiplier=args.ffn_dim_multiplier,
510
+ )
511
+ self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
512
+ self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
513
+
514
+ def forward(
515
+ self,
516
+ x: torch.Tensor,
517
+ freq_cis: torch.Tensor,
518
+ tok_idx: Optional[torch.Tensor] = None,
519
+ mask: Optional[Union[BlockMask, AttentionBias, str]] = None,
520
+ attn_impl: str = "sdpa",
521
+ ) -> torch.Tensor:
522
+ h = x + self.attention(
523
+ self.attention_norm(x),
524
+ freq_cis,
525
+ tok_idx=tok_idx,
526
+ mask=mask,
527
+ attn_impl=attn_impl,
528
+ )
529
+ out = h + self.feed_forward(self.ffn_norm(h))
530
+ return out
531
+
532
+ def init_weights(self, init_std=None, factor=1.0):
533
+ self.attention.reset_parameters(init_std, factor)
534
+ self.attention_norm.reset_parameters()
535
+
536
+ self.feed_forward.reset_parameters(init_std, factor)
537
+ self.ffn_norm.reset_parameters()
538
+
539
+
540
+ class BaseTransformer(nn.Module):
541
+ def __init__(self, args: BaseTransformerArgs):
542
+ super().__init__()
543
+ self.dim = args.dim
544
+ self.init_base_std = args.init_base_std
545
+ self.init_std_factor = InitStdFactor(args.init_std_factor)
546
+ self.max_seqlen = args.max_seqlen
547
+ self.rope_embeddings = RotaryEmbedding(
548
+ theta=args.rope_theta,
549
+ head_dim=args.head_dim or args.dim // args.n_heads,
550
+ max_seqlen=args.max_seqlen,
551
+ )
552
+
553
+ self.layers = nn.ModuleList()
554
+ for _ in range(args.n_layers):
555
+ self.layers.append(TransformerBlock(args))
556
+
557
+ def forward(
558
+ self,
559
+ h,
560
+ tok_idx: Optional[torch.Tensor] = None,
561
+ mask: Optional[Union[BlockMask, AttentionBias, str]] = None,
562
+ attn_impl: str = "sdpa",
563
+ ):
564
+
565
+ freq_cis = self.rope_embeddings(seqlen=self.max_seqlen, tok_idx=tok_idx)
566
+
567
+ for i, layer in enumerate(self.layers):
568
+ h = layer(h, freq_cis, tok_idx=tok_idx, mask=mask, attn_impl=attn_impl)
569
+ return h
570
+
571
+ def reset_parameters(self):
572
+ # Either use fixed base std or sqrt model dim
573
+ self.rope_embeddings.reset_parameters()
574
+
575
+ def init_weights(self):
576
+ self.reset_parameters()
577
+ for depth, layer in enumerate(self.layers):
578
+ factor = {
579
+ InitStdFactor.CURRENT_DEPTH: (2 * (depth + 1)) ** 0.5,
580
+ InitStdFactor.GLOBAL_DEPTH: (2 * (len(self.layers) + 1)) ** 0.5,
581
+ InitStdFactor.DIM_RATIO: self.dim / 4096,
582
+ InitStdFactor.DISABLED: 1.0,
583
+ }[self.init_std_factor]
584
+
585
+ layer.init_weights(self.init_base_std, factor)
bytelatent/checkpoint.py ADDED
@@ -0,0 +1,311 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ from pathlib import Path
8
+ from typing import List, Optional, Tuple
9
+
10
+ import torch
11
+ import torch.distributed as dist
12
+ import torch.distributed.checkpoint as dcp
13
+ import torch.nn as nn
14
+ import torch.optim.optimizer
15
+ from pydantic import BaseModel, ConfigDict
16
+ from torch.distributed._tensor import DeviceMesh
17
+ from torch.distributed.checkpoint.format_utils import dcp_to_torch_save
18
+ from torch.distributed.checkpoint.state_dict import (
19
+ get_model_state_dict,
20
+ get_state_dict,
21
+ set_state_dict,
22
+ )
23
+
24
+ from bytelatent.distributed import get_is_master
25
+
26
+ logger = logging.getLogger("CHECKPOINT")
27
+
28
+ FOLDER_NAME = "{:010d}"
29
+ RE_FOLDER = r"\d{10}"
30
+
31
+ RE_CKPT = r"__\d_\d\.distcp"
32
+
33
+ CONSOLIDATE_FOLDER = "consolidated"
34
+ CONSOLIDATE_NAME = "consolidated.pth"
35
+
36
+ CONFIG_NAME = "params.json"
37
+ TRAIN_STATE_NAME = "train_state_{:05d}.json"
38
+ RE_DIGITS = re.compile(r"\d+")
39
+
40
+
41
+ class SaveEvery(BaseModel):
42
+ model_config = ConfigDict(extra="forbid")
43
+ every: int = 1000
44
+ keep: int = 0
45
+
46
+
47
+ class CheckpointArgs(BaseModel):
48
+ model_config = ConfigDict(extra="forbid")
49
+ dump: SaveEvery = SaveEvery()
50
+ eval: SaveEvery = SaveEvery()
51
+ path: str | None = None
52
+ init_ckpt_path: str | None = None
53
+ continue_training_from_init: bool = False
54
+
55
+
56
+ def _get_key_step(name: str):
57
+ return int(re.findall(RE_DIGITS, name)[-1])
58
+
59
+
60
+ def consolidate_checkpoints(ckpt_dir: str):
61
+ """
62
+ Consolidates all FSDP checkpoints in a directory to a single file
63
+ Consolidate checkpoint is saved in a subdirectory of ckpt_dir
64
+
65
+ Parameters:
66
+ ckpt_dir: str - path to the directory containing the checkpoints
67
+
68
+ Returns the path to the consolidated checkpoint
69
+ """
70
+ consolidate_path = Path(ckpt_dir) / CONSOLIDATE_FOLDER
71
+ if not (consolidate_path / CONSOLIDATE_NAME).exists():
72
+ consolidate_path.mkdir(exist_ok=True)
73
+ logger.info(f"Consolidating to: {str(consolidate_path)}")
74
+ dcp_to_torch_save(ckpt_dir, str(consolidate_path / CONSOLIDATE_NAME))
75
+ (consolidate_path / CONFIG_NAME).write_text(
76
+ (Path(ckpt_dir) / CONFIG_NAME).read_text()
77
+ )
78
+ logger.info("Consolidated!")
79
+ return consolidate_path
80
+
81
+
82
+ def load_from_checkpoint(
83
+ ckpt_dir: str,
84
+ model: nn.Module,
85
+ optimizer: Optional[torch.optim.Optimizer] = None,
86
+ model_key: str = "model",
87
+ optim_key: str = "optim",
88
+ ):
89
+ if not (Path(ckpt_dir) / ".metadata").exists():
90
+ raise ValueError(
91
+ "Please convert the checkpoint to the distcp format using `torch.distributed.checkpoint.format_utils.torch_save_to_dcp` before loading it"
92
+ )
93
+
94
+ state_dict = {}
95
+ if optimizer is not None:
96
+ state_dict[model_key], state_dict[optim_key] = get_state_dict(model, optimizer)
97
+ else:
98
+ state_dict[model_key] = get_model_state_dict(model)
99
+ if model_key == "": # If only loading a model directly, the key should be empty
100
+ state_dict = state_dict.pop(model_key)
101
+
102
+ dcp.load(state_dict, checkpoint_id=ckpt_dir)
103
+
104
+
105
+ class CheckpointManager:
106
+ def __init__(self, args: CheckpointArgs):
107
+ self.path = args.path
108
+ self.dump_every = args.dump
109
+ self.eval_every = args.eval
110
+ self.init_ckpt_path = args.init_ckpt_path
111
+ self.continue_training_from_init = args.continue_training_from_init
112
+
113
+ assert os.path.exists(
114
+ self.path
115
+ ), f"Path {self.path} does not exist and needs to be created before using CheckpointManager (use instantiate_and_make_dir)"
116
+
117
+ self.existing_saves = self.get_existing_saves()
118
+
119
+ def get_existing_saves(self) -> List[Path]:
120
+ folders = [
121
+ p
122
+ for p in Path(self.path).iterdir()
123
+ if p.is_dir() and re.match(RE_FOLDER, p.name)
124
+ ]
125
+ folders.sort(key=lambda p: _get_key_step(p.name))
126
+ return folders
127
+
128
+ def clean_up(self):
129
+ logger.info("Cleaning up checkpoints...")
130
+ dump_folders = []
131
+ eval_folders = []
132
+ other_folders = []
133
+ for p in self.existing_saves:
134
+ is_dump = _get_key_step(p.name) % self.dump_every.every == 0
135
+ is_eval = _get_key_step(p.name) % self.eval_every.every == 0
136
+ if is_dump:
137
+ dump_folders.append(p)
138
+ if is_eval:
139
+ eval_folders.append(p)
140
+ if not (is_dump or is_eval):
141
+ other_folders.append(p)
142
+
143
+ logger.info(f"Dump folders: {dump_folders}")
144
+ logger.info(f"Eval folders: {eval_folders}")
145
+ logger.info(f"Other folders: {other_folders}")
146
+
147
+ if self.dump_every.keep > 0:
148
+ dump_folders = dump_folders[-self.dump_every.keep :]
149
+ if self.eval_every.keep > 0:
150
+ eval_folders = eval_folders[-self.eval_every.keep :]
151
+
152
+ folder_to_keep = set(other_folders + dump_folders + eval_folders)
153
+ folder_to_remove = set(self.existing_saves) - folder_to_keep
154
+
155
+ logger.info(f"Removing folders: {folder_to_remove}")
156
+
157
+ if dist.get_rank() == 0:
158
+ for folder in folder_to_remove:
159
+ for file in folder.iterdir():
160
+ if file.is_file():
161
+ file.unlink()
162
+ elif file.is_dir():
163
+ assert file.name in [CONSOLIDATE_FOLDER]
164
+ for f in file.iterdir():
165
+ f.unlink()
166
+ file.rmdir()
167
+ folder.rmdir()
168
+
169
+ dist.barrier()
170
+
171
+ self.existing_saves = list(folder_to_keep)
172
+ self.existing_saves.sort(key=lambda p: _get_key_step(p.name))
173
+
174
+ def get_last_step_path(self, dp_rank: int = 0) -> Optional[Path]:
175
+ path = None
176
+ for p in reversed(self.existing_saves):
177
+ if (p / TRAIN_STATE_NAME.format(dp_rank)).is_file():
178
+ path = p
179
+ break
180
+ return path
181
+
182
+ def _create_folder(self, base_path: Path, folder_name: str) -> Path:
183
+ folder = base_path / folder_name
184
+ if get_is_master():
185
+ folder.mkdir(parents=False, exist_ok=True)
186
+ if dist.is_initialized():
187
+ dist.barrier()
188
+ return folder
189
+
190
+ def _get_dp_tp_mesh(
191
+ self, device_mesh: Optional[DeviceMesh] = None
192
+ ) -> Tuple[int, int]:
193
+ dp_rank = 0
194
+ tp_rank = 0
195
+ if device_mesh is not None:
196
+ if "dp_replicate" in device_mesh.mesh_dim_names:
197
+ dp_rank = device_mesh.get_local_rank("dp_replicate")
198
+ if "dp_shard" in device_mesh.mesh_dim_names:
199
+ dp_rank = dp_rank * device_mesh[
200
+ "dp_replicate"
201
+ ].size() + device_mesh.get_local_rank("dp_shard")
202
+ if "tp" in device_mesh.mesh_dim_names:
203
+ tp_rank = device_mesh.get_local_rank("tp")
204
+ return dp_rank, tp_rank
205
+
206
+ @torch.no_grad()
207
+ def get_state_dict(
208
+ self,
209
+ model,
210
+ optimizer,
211
+ ):
212
+ model_sd, optim_sd = get_state_dict(model, optimizer)
213
+ return {"model": model_sd, "optim": optim_sd}
214
+
215
+ def save(
216
+ self,
217
+ model,
218
+ optimizer,
219
+ train_state,
220
+ config,
221
+ device_mesh: Optional[DeviceMesh] = None,
222
+ ) -> bool:
223
+
224
+ # When creating directory check if only rank0 or is there other solution
225
+ path = Path(self.path)
226
+ curr_save_dir = self._create_folder(path, FOLDER_NAME.format(train_state.step))
227
+ logger.info(f"Saving to: {str(curr_save_dir)}")
228
+
229
+ if dist.is_initialized():
230
+ dist.barrier()
231
+
232
+ logger.info("Saving...")
233
+ state_dict = self.get_state_dict(model, optimizer)
234
+ dcp.save(state_dict, checkpoint_id=curr_save_dir)
235
+ logger.info("State dict saved!")
236
+
237
+ if dist.is_initialized():
238
+ dist.barrier()
239
+
240
+ if get_is_master():
241
+ config.dump_to_yaml_file(curr_save_dir / CONFIG_NAME)
242
+
243
+ # Add json dump here
244
+ dp_rank, tp_rank = self._get_dp_tp_mesh(device_mesh)
245
+ if tp_rank == 0:
246
+ train_state_name = TRAIN_STATE_NAME.format(dp_rank)
247
+ logger.info(
248
+ f"Saving train state to: {str(curr_save_dir / train_state_name)}"
249
+ )
250
+ with open(curr_save_dir / train_state_name, "w") as f:
251
+ json.dump(train_state.state_dict(), f)
252
+ logger.info("Train state saved !")
253
+
254
+ self.existing_saves.append(curr_save_dir)
255
+
256
+ self.clean_up()
257
+
258
+ if dist.is_initialized():
259
+ dist.barrier()
260
+ return True
261
+
262
+ @torch.no_grad()
263
+ def load(
264
+ self,
265
+ model: nn.Module,
266
+ optimizer,
267
+ train_state,
268
+ device_mesh: DeviceMesh,
269
+ path: Optional[Path] = None,
270
+ ):
271
+ dp_rank, tp_rank = self._get_dp_tp_mesh(device_mesh)
272
+ # Loading tries the provided path first, then the last saved step, and finally the init path
273
+ path = path or self.get_last_step_path(dp_rank=dp_rank)
274
+ # If none of those are available don't do anything
275
+ if path is None:
276
+ # If no checkpoints exist do nothing
277
+ return
278
+
279
+ # Only load train state if it's provided, the files exist and we're not loading from init path
280
+ train_state_name = TRAIN_STATE_NAME.format(dp_rank)
281
+ logger.info("Reloading train state")
282
+ with open(path / train_state_name, "r") as f:
283
+ train_state_dict = json.load(f)
284
+ train_state.load_state_dict(train_state_dict)
285
+ logger.info("Train state reloaded")
286
+
287
+ logger.info(f"Loading from: {str(path)}")
288
+ state_dict = self.get_state_dict(
289
+ model=model,
290
+ optimizer=optimizer,
291
+ )
292
+ dcp.load(state_dict, checkpoint_id=path)
293
+ logger.info("State dict loaded.")
294
+
295
+ logger.info("Reloading model and optim")
296
+
297
+ set_state_dict(
298
+ model,
299
+ optimizer,
300
+ model_state_dict=state_dict["model"],
301
+ optim_state_dict=state_dict["optim"],
302
+ )
303
+ logger.info("Model and optim reloaded")
304
+
305
+ @classmethod
306
+ def instantiate_and_make_dir(cls, args: CheckpointArgs):
307
+ if get_is_master():
308
+ os.makedirs(args.path, exist_ok=True)
309
+ dist.barrier()
310
+
311
+ return cls(args)
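A minimal sketch (not part of this commit) of the retention rule that clean_up() applies above, using the dump/eval settings from the debug config later in this diff (dump.every=500, dump.keep=3, eval.every=1000, eval.keep=-1); the helper name is invented for illustration.

def surviving_steps(steps, dump_every=500, dump_keep=3, eval_every=1000, eval_keep=-1):
    # Mirror clean_up(): classify checkpoint steps, then keep the last `keep` of each kind.
    dumps = [s for s in steps if s % dump_every == 0]
    evals = [s for s in steps if s % eval_every == 0]
    others = [s for s in steps if s % dump_every != 0 and s % eval_every != 0]
    if dump_keep > 0:
        dumps = dumps[-dump_keep:]
    if eval_keep > 0:
        evals = evals[-eval_keep:]
    return sorted(set(others + dumps + evals))

# Checkpoints at steps 500..4000: only the last 3 dump-only folders survive,
# while eval-step folders are kept forever because eval.keep == -1.
print(surviving_steps(list(range(500, 4001, 500))))  # [1000, 2000, 3000, 3500, 4000]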
bytelatent/configs/debug.yaml ADDED
@@ -0,0 +1,110 @@
1
+ # Template config, need to change dump_dir, data.root_dir, data.preprocess_dir and data.tokenizer_args.init_kwargs.bpe_tokenizer_path
2
+ # Evals can be activated by uncommenting the eval config
3
+ # python -m launchers.stool config=bytelatent/configs/debug.yaml nodes=8 account=fair_amaia_cw_codegen qos=lowest
4
+
5
+ dump_dir: /tmp/
6
+ name: "debug"
7
+ steps: 100_000
8
+ probe_freq: null
9
+ seed: 777
10
+ optim:
11
+ lr: 4e-04
12
+ warmup: 500
13
+ lr_min_ratio: 0.1
14
+ clip: 10.0
15
+
16
+ distributed:
17
+ fsdp_type: full_shard
18
+ compile: true
19
+ model_dtype: bf16
20
+ matmul_allow_tf32: false
21
+ selective_activation_checkpointing: false
22
+ tp_size: 1
23
+
24
+ model:
25
+ n_heads: 8
26
+ dim: 512
27
+ vocab_size: 260
28
+ dim_token: 256
29
+ patch_size: 6
30
+ tokenization_mode: "bytes"
31
+ patching_mode: "space"
32
+ tie_local_encoder_decoder_logits: false
33
+ data_loader_patching: true
34
+ max_encoder_seq_length: 12288
35
+ pad_to_max_length: true
36
+ patching_threshold: 3.1439168453216553
37
+ encoder_hash_byte_group_size: [4]
38
+ encoder_hash_byte_group_vocab: 50002
39
+ encoder_hash_byte_group_nb_functions: 3
40
+ encoder_enable_byte_ngrams: false
41
+ cross_attn_encoder: true # assuming cross_attention is true
42
+ cross_attn_decoder: true # assuming cross_attention is true
43
+ cross_attn_window_encoder: 512
44
+ cross_attn_window_decoder: 512
45
+ dim_local_encoder: 256
46
+ dim_local_decoder: 256
47
+ cross_attn_k: 8
48
+ cross_attn_nheads: 4
49
+ cross_attn_all_layers_decoder: true
50
+ cross_attn_all_layers_encoder: true
51
+ cross_attn_use_flex_attention: true
52
+ cross_attn_init_by_pooling: true
53
+ log_patch_lengths: true
54
+ non_linearity: "swiglu"
55
+ use_rope: true
56
+ recompute_fc1_out: false
57
+ recompute_fc3_out: false
58
+ recompute_attn: false
59
+ custom_bwd: false
60
+ layer_ckpt: "none"
61
+ efficient_attn: "sdpa"
62
+ patch_only_encoder: false
63
+ patch_only_decoder: false
64
+ use_local_encoder_transformer: true
65
+ init_use_gaussian: true
66
+ init_use_depth: "current"
67
+ attn_bias_type: "block_causal"
68
+ alpha_depth: "disabled"
69
+ max_length: 256
70
+ local_attention_window_len: 512
71
+ max_seqlen: 12288
72
+ downsampling_by_pooling: "max"
73
+
74
+ data:
75
+ root_dir: ???
76
+ sources:
77
+ dclm_baseline_1.0: 1.0
78
+ batch_size: 2
79
+ prefetch_size: 64
80
+ seq_len: 4096
81
+ load_async: true
82
+ preprocess_dir: ???
83
+ tokenizer_args:
84
+ name: blt
85
+ init_kwargs:
86
+ bpe_tokenizer_path: ???
87
+
88
+ profiling:
89
+ run: false
90
+
91
+ checkpoint:
92
+ dump:
93
+ every: 500
94
+ keep: 3
95
+ eval:
96
+ every: 1000
97
+ keep: -1
98
+
99
+ logging:
100
+ freq: 10
101
+
102
+ eval_on_gpus: 8
103
+ eval:
104
+ dataset_dir: /checkpoint/amaia/codegen/datasets/eval
105
+ tasks: boolq,hellaswag,nq,piqa,siqa,tqa,winogrande,obqa,arc_easy,arc_challenge,race.middle,race.high,gsm8k,math,bbh,copa,human_eval_plus,mbpp,mmlu
106
+ generator:
107
+ max_tokens: 65536
108
+ dtype: bf16
109
+
110
+ mp_size: 1
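The ??? entries above are required overrides. As a rough sketch (not the actual training entrypoint, which consumes this file through its own arg parsing and `config=... key=value` overrides), the config can be inspected or patched from Python with PyYAML:

import yaml  # assumes PyYAML is available

with open("bytelatent/configs/debug.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["model"]["patch_size"])          # 6
print(cfg["checkpoint"]["dump"]["every"])  # 500

# The ??? placeholders (data.root_dir, data.preprocess_dir, bpe_tokenizer_path)
# must be filled in, either by editing the file or via command-line overrides.
cfg["data"]["root_dir"] = "/path/to/data"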
bytelatent/constants.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import os
3
+ from pathlib import Path
4
+
5
+ BLT_DATA = Path(os.environ.get("BLT_DATA", "data"))
bytelatent/data/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
bytelatent/data/data_types.py ADDED
@@ -0,0 +1,115 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import json
3
+ from dataclasses import dataclass
4
+ from typing import Any, Iterator
5
+
6
+ import numpy as np
7
+ from pydantic import BaseModel, ConfigDict
8
+
9
+
10
+ class BltExample(BaseModel):
11
+ model_config = ConfigDict(extra="forbid")
12
+ sample_id: str
13
+ text: str
14
+ tokens: list[int] | None
15
+ entropies: list[float] | None
16
+ patch_lengths: list[int] | None
17
+ mask: list[bool] | None
18
+
19
+
20
+ class MultiChoiceState(BaseModel):
21
+ model_config = ConfigDict(extra="forbid")
22
+ root_dir: str
23
+ sources: dict[str, float]
24
+ source_to_state: dict[str, Any]
25
+ rng_state: dict[str, Any]
26
+
27
+
28
+ class PrefetchState(BaseModel):
29
+ model_config = ConfigDict(extra="forbid")
30
+ seq_idx: int
31
+ rng_state: dict[str, Any]
32
+ prefetch_size: int
33
+ batch_size: int
34
+
35
+
36
+ class BltPackTokensState(BaseModel):
37
+ model_config = ConfigDict(extra="forbid")
38
+ start_token: int
39
+ output_seq_len: int
40
+ n_views: int = 2
41
+
42
+
43
+ class DataLoaderState(BaseModel):
44
+ model_config = ConfigDict(extra="forbid")
45
+ multi_choice_state: MultiChoiceState
46
+ pack_tokens_state: BltPackTokensState
47
+ prefetch_state: PrefetchState
48
+
49
+
50
+ BltIterator = Iterator[tuple[BltExample, DataLoaderState]]
51
+
52
+
53
+ class BltSequence(BaseModel):
54
+ tokens: list[int]
55
+ mask: list[bool]
56
+ patch_lengths: list[int]
57
+
58
+
59
+ @dataclass
60
+ class Batch:
61
+ x: np.ndarray
62
+ y: np.ndarray
63
+ mask: np.ndarray | None = None
64
+ patch_lengths: np.ndarray | None = None
65
+ ngram_ids: np.ndarray | None = None
66
+ is_final: bool = False
67
+
68
+ def to_python_dict(self) -> dict:
69
+ x = self.x.tolist()
70
+ y = self.y.tolist()
71
+ if self.mask is None:
72
+ mask = None
73
+ else:
74
+ mask = self.mask.tolist()
75
+ if self.patch_lengths is None:
76
+ patch_lengths = None
77
+ else:
78
+ patch_lengths = self.patch_lengths.tolist()
79
+ if self.ngram_ids is None:
80
+ ngram_ids = None
81
+ else:
82
+ ngram_ids = self.ngram_ids.tolist()
83
+ return {
84
+ "x": x,
85
+ "y": y,
86
+ "mask": mask,
87
+ "patch_lengths": patch_lengths,
88
+ "ngram_ids": ngram_ids,
89
+ "is_final": self.is_final,
90
+ }
91
+
92
+ @classmethod
93
+ def from_python_dict(cls, data: dict) -> "Batch":
94
+ x = np.array(data["x"])
95
+ y = np.array(data["y"])
96
+ if data["mask"] is None:
97
+ mask = None
98
+ else:
99
+ mask = np.array(data["mask"])
100
+ if data["patch_lengths"] is None:
101
+ patch_lengths = None
102
+ else:
103
+ patch_lengths = np.array(data["patch_lengths"])
104
+ if data["ngram_ids"] is None:
105
+ ngram_ids = None
106
+ else:
107
+ ngram_ids = np.array(data["ngram_ids"])
108
+ return Batch(
109
+ x=x,
110
+ y=y,
111
+ mask=mask,
112
+ patch_lengths=patch_lengths,
113
+ ngram_ids=ngram_ids,
114
+ is_final=data["is_final"],
115
+ )
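A quick round-trip sketch for the Batch helpers above; it only uses fields defined in this file, and the values are arbitrary.

import json

import numpy as np

from bytelatent.data.data_types import Batch

batch = Batch(x=np.array([[1, 2, 3]]), y=np.array([[2, 3, 4]]))
payload = json.dumps(batch.to_python_dict())           # None fields serialize as null
restored = Batch.from_python_dict(json.loads(payload))
assert np.array_equal(restored.x, batch.x) and restored.is_final is False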
bytelatent/data/iterators/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
bytelatent/data/iterators/abstract_iterator.py ADDED
@@ -0,0 +1,23 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import abc
3
+ from typing import Any, Generator, Generic, TypeVar
4
+
5
+ T = TypeVar("T")
6
+ C = TypeVar("C")
7
+
8
+
9
+ class StatefulIterator(Generic[T, C], abc.ABC):
10
+
11
+ @abc.abstractmethod
12
+ def get_state(self) -> C:
13
+ pass
14
+
15
+ @abc.abstractmethod
16
+ def create_iter(self) -> Generator[T, Any, None]:
17
+ pass
18
+
19
+
20
+ class IteratorState(Generic[C]):
21
+ @abc.abstractmethod
22
+ def build(self) -> StatefulIterator[T, C]:
23
+ pass
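A toy implementation (not part of this commit) showing the contract above: get_state() captures a resumable position, and IteratorState.build() reconstructs an equivalent iterator from it.

from pydantic import BaseModel

from bytelatent.data.iterators.abstract_iterator import IteratorState, StatefulIterator


class CountState(BaseModel, IteratorState):
    position: int

    def build(self) -> "CountIterator":
        return CountIterator(position=self.position)


class CountIterator(StatefulIterator[int, CountState]):
    def __init__(self, position: int = 0):
        self.position = position

    def get_state(self) -> CountState:
        return CountState(position=self.position)

    def create_iter(self):
        while True:
            self.position += 1
            yield self.position


it = CountIterator()
gen = it.create_iter()
assert (next(gen), next(gen)) == (1, 2)
resumed = it.get_state().build()  # a fresh iterator that continues from position=2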
bytelatent/data/iterators/arrow_iterator.py ADDED
@@ -0,0 +1,216 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import re
3
+ from logging import getLogger
4
+ from pathlib import Path
5
+ from typing import Any, Generator
6
+
7
+ import pyarrow as pa
8
+
9
+ # pyarrow needs the initialization from this import
10
+ import pyarrow.dataset # pyright: ignore
11
+ from pydantic import BaseModel, ConfigDict
12
+
13
+ from bytelatent import ByteLatentError
14
+ from bytelatent.data.data_types import BltExample
15
+ from bytelatent.data.iterators.abstract_iterator import IteratorState, StatefulIterator
16
+
17
+ logger = getLogger(__name__)
18
+
19
+
20
+ class ArrowFileIteratorState(BaseModel, IteratorState):
21
+ model_config = ConfigDict(extra="forbid")
22
+ file_path: str | None
23
+ row_num: int
24
+ num_workers: int
25
+ worker_id: int
26
+ preprocess_dir: str | None
27
+ dataset_files: list[str] | None
28
+ entropy_model_name: str | None
29
+ arrow_batch_size: int = 100
30
+
31
+ def build(self) -> "ArrowFileIterator":
32
+ arrow_file = ArrowFileIterator(
33
+ file_path=self.file_path,
34
+ worker_id=self.worker_id,
35
+ num_workers=self.num_workers,
36
+ preprocess_dir=self.preprocess_dir,
37
+ entropy_model_name=self.entropy_model_name,
38
+ arrow_batch_size=self.arrow_batch_size,
39
+ dataset_files=self.dataset_files,
40
+ )
41
+ if self.row_num != 0:
42
+ arrow_file._set_row_num(self.row_num)
43
+ return arrow_file
44
+
45
+
46
+ def shard_sort_key(file: str | Path):
47
+ match = re.search(r".+\.shard_([0-9]+)\.arrow", str(file))
48
+ shard_number = int(match.group(1))
49
+ return shard_number
50
+
51
+
52
+ class ArrowFileIterator(StatefulIterator):
53
+ def __init__(
54
+ self,
55
+ *,
56
+ file_path: str | None,
57
+ worker_id: int,
58
+ num_workers: int,
59
+ preprocess_dir: str | None,
60
+ entropy_model_name: str | None,
61
+ arrow_batch_size: int,
62
+ dataset_files: list[str] | None = None,
63
+ ):
64
+ assert 0 <= worker_id < num_workers, (worker_id, num_workers)
65
+ if file_path is None and dataset_files is None:
66
+ raise ByteLatentError("file_path and dataset_files cannot both be None")
67
+ self.row_num = 0
68
+ self.iter_id = 0
69
+ self.batch_iterator = None
70
+ self.batch_to_consume = None
71
+ self.dataset = None
72
+ self.file_path = file_path
73
+ self.worker_id = worker_id
74
+ self.num_workers = num_workers
75
+ self.preprocess_dir = preprocess_dir
76
+ self.entropy_model_name = entropy_model_name
77
+ self.arrow_batch_size = arrow_batch_size
78
+ if dataset_files is None:
79
+ # Prepare arrow shards
80
+ jsonl_file = Path(file_path)
81
+ parts = re.match(r"(.+)\.chunk\.[0-9]+\.jsonl", jsonl_file.name)
82
+ assert parts is not None
83
+ dataset = parts.group(1)
84
+ data_dir = Path(preprocess_dir) / dataset / entropy_model_name
85
+ shard_files = list(data_dir.glob(f"{jsonl_file.name}.shard_*.arrow"))
86
+ for s in shard_files:
87
+ if not (data_dir / f"{s.name}.complete").exists():
88
+ raise ValueError(f"Missing .complete for input file: {s}")
89
+
90
+ shard_files = sorted(shard_files, key=shard_sort_key)
91
+ if len(shard_files) == 0:
92
+ raise ByteLatentError(
93
+ f"Zero shard_files found corresponding to: {file_path} using preprocess_dir={preprocess_dir} and entropy_model_name={entropy_model_name}, so the search path is data_dir={data_dir} for matches to {jsonl_file.name}.shard_*.arrow"
94
+ )
95
+ self.dataset_files = [str(f) for f in shard_files]
96
+ else:
97
+ self.preprocess_dir = None
98
+ self.dataset_files = dataset_files
99
+
100
+ def get_state(self) -> ArrowFileIteratorState:
101
+ return ArrowFileIteratorState(
102
+ file_path=self.file_path,
103
+ row_num=self.row_num,
104
+ worker_id=self.worker_id,
105
+ num_workers=self.num_workers,
106
+ preprocess_dir=self.preprocess_dir,
107
+ entropy_model_name=self.entropy_model_name,
108
+ arrow_batch_size=self.arrow_batch_size,
109
+ dataset_files=self.dataset_files,
110
+ )
111
+
112
+ def create_iter(
113
+ self,
114
+ ) -> Generator[BltExample, Any, None]:
115
+ if self.dataset is None:
116
+ self.dataset = pa.dataset.dataset(self.dataset_files, format="arrow")
117
+ self.batch_iterator = self.dataset.to_batches(
118
+ batch_size=self.arrow_batch_size
119
+ )
120
+ self.iter_id += 1
121
+ if self.batch_to_consume is not None:
122
+ batch_columns: dict[str, list] = self.batch_to_consume
123
+ self.batch_to_consume = None
124
+ sample_ids = batch_columns["sample_id"]
125
+ texts = batch_columns["text"]
126
+ entropies = batch_columns["entropies"]
127
+ for i in range(len(sample_ids)):
128
+ out = BltExample(
129
+ sample_id=sample_ids[i],
130
+ entropies=entropies[i],
131
+ text=texts[i],
132
+ tokens=None,
133
+ mask=None,
134
+ patch_lengths=None,
135
+ )
136
+ self.row_num += 1
137
+ if (self.row_num - 1) % self.num_workers == self.worker_id:
138
+ yield out
139
+
140
+ for batch in self.batch_iterator:
141
+ batch_columns = batch.to_pydict()
142
+ sample_ids = batch_columns["sample_id"]
143
+ texts = batch_columns["text"]
144
+ entropies = batch_columns["entropies"]
145
+ for i in range(len(sample_ids)):
146
+ out = BltExample(
147
+ sample_id=sample_ids[i],
148
+ entropies=entropies[i],
149
+ text=texts[i],
150
+ tokens=None,
151
+ mask=None,
152
+ patch_lengths=None,
153
+ )
154
+ self.row_num += 1
155
+ if (self.row_num - 1) % self.num_workers == self.worker_id:
156
+ yield out
157
+
158
+ def _set_row_num(self, target_row_num: int):
159
+ logger.info(
160
+ f"Setting arrow position to {target_row_num} for {self.dataset_files}"
161
+ )
162
+ if target_row_num is None or target_row_num == 0:
163
+ self.row_num = 0
164
+ self.dataset = None
165
+ self.batch_iterator = None
166
+ self.batch_to_consume = None
167
+ else:
168
+ self.dataset = pa.dataset.dataset(self.dataset_files, format="arrow")
169
+ self.batch_iterator = self.dataset.to_batches(
170
+ batch_size=self.arrow_batch_size
171
+ )
172
+ curr_remaining = target_row_num
173
+ for batch in self.batch_iterator:
174
+ if len(batch) > curr_remaining:
175
+ batch_columns: dict[str, list] = batch.to_pydict()
176
+ batch_columns["sample_id"] = batch_columns["sample_id"][
177
+ curr_remaining:
178
+ ]
179
+ batch_columns["entropies"] = batch_columns["entropies"][
180
+ curr_remaining:
181
+ ]
182
+ batch_columns["text"] = batch_columns["text"][curr_remaining:]
183
+ self.batch_to_consume = batch_columns
184
+ break
185
+ elif len(batch) == curr_remaining:
186
+ # We are exactly at the end of the batch,
187
+ # so the next batch is the right spot
188
+ break
189
+ else:
190
+ curr_remaining -= len(batch)
191
+ self.row_num = target_row_num
192
+ logger.info(
193
+ f"Finished setting arrow position to {target_row_num} for {self.dataset_files}"
194
+ )
195
+
196
+
197
+ TRAIN_DATA_FILE_PATTERN = "*.chunk.*.jsonl"
198
+
199
+
200
+ def find_and_sanitize_chunks(
201
+ dataset_path: str, world_size: int, file_pattern: str = TRAIN_DATA_FILE_PATTERN
202
+ ):
203
+ dataset_chunks = [str(p) for p in Path(dataset_path).glob(file_pattern)]
204
+ n_chunks = len(dataset_chunks)
205
+
206
+ if n_chunks > world_size:
207
+ n_discard = n_chunks - world_size
208
+ dataset_chunks = dataset_chunks[:world_size]
209
+ else:
210
+ assert (
211
+ world_size % n_chunks == 0
212
+ ), "World size should be a multiple of number of chunks"
213
+
214
+ assert n_chunks > 0, f"No valid chunks in {dataset_path}"
215
+
216
+ return dataset_chunks
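Two small checks, separate from the code above, that illustrate the shard ordering and the round-robin worker split used by create_iter(); the filename below is invented.

from bytelatent.data.iterators.arrow_iterator import shard_sort_key

# shard_sort_key orders shard files by their numeric suffix.
assert shard_sort_key("dclm.chunk.00.jsonl.shard_07.arrow") == 7

# create_iter() increments row_num for every row, then yields the row only when
# (row_num - 1) % num_workers == worker_id, i.e. a round-robin split over rows.
num_workers, worker_id = 4, 1
rows_for_this_worker = [i for i in range(12) if i % num_workers == worker_id]
assert rows_for_this_worker == [1, 5, 9]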
bytelatent/data/iterators/looping_iterator.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from pydantic import BaseModel
3
+
4
+ from bytelatent.data.iterators.abstract_iterator import IteratorState, StatefulIterator
5
+ from bytelatent.data.iterators.arrow_iterator import (
6
+ ArrowFileIterator,
7
+ ArrowFileIteratorState,
8
+ )
9
+
10
+
11
+ class LoopingIteratorState(BaseModel, IteratorState):
12
+ file_iterator_state: ArrowFileIteratorState
13
+ epoch: int
14
+
15
+ def build(self) -> "LoopingIterator":
16
+ return LoopingIterator(
17
+ file_iterator=self.file_iterator_state.build(),
18
+ epoch=self.epoch,
19
+ )
20
+
21
+
22
+ class LoopingIterator(StatefulIterator):
23
+ def __init__(self, file_iterator: ArrowFileIterator, epoch: int = -1):
24
+ self.file_iterator = file_iterator
25
+ self.epoch = epoch
26
+
27
+ def get_state(self):
28
+ return LoopingIteratorState(
29
+ file_iterator_state=self.file_iterator.get_state(), epoch=self.epoch
30
+ )
31
+
32
+ def create_iter(self):
33
+ while True:
34
+ self.epoch += 1
35
+ iterator = self.file_iterator.create_iter()
36
+ yield from iterator
bytelatent/data/iterators/multiprocess_iterator.py ADDED
@@ -0,0 +1,243 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import json
3
+ import logging
4
+ import multiprocessing as mp
5
+ from multiprocessing.synchronize import Event as EventClass
6
+ from queue import Empty, Full
7
+
8
+ import numpy as np
9
+ from pydantic import BaseModel, ConfigDict
10
+
11
+ from bytelatent.data.data_types import Batch
12
+ from bytelatent.data.iterators.abstract_iterator import IteratorState, StatefulIterator
13
+ from bytelatent.data.iterators.packing_iterator import PackingIteratorState
14
+
15
+ logger = logging.getLogger()
16
+
17
+
18
+ class MultiprocessIteratorState(BaseModel, IteratorState):
19
+ model_config = ConfigDict(extra="forbid")
20
+ base_iterator_state: PackingIteratorState
21
+ n_batches_to_prefetch: int
22
+ serialized_prefetch_buffer: str
23
+
24
+ def build(self):
25
+ base_iterator = self.base_iterator_state.build()
26
+ data = json.loads(self.serialized_prefetch_buffer)
27
+ prefetch_buffer = [Batch.from_python_dict(item) for item in data]
28
+ return MultiprocessIterator(
29
+ base_iterator,
30
+ n_batches_to_prefetch=self.n_batches_to_prefetch,
31
+ prefetch_buffer=prefetch_buffer,
32
+ )
33
+
34
+
35
+ def start_work_from_state(
36
+ batch_queue: mp.Queue,
37
+ state_queue: mp.Queue,
38
+ stop_event: EventClass,
39
+ state_dumped_event: EventClass,
40
+ state: IteratorState,
41
+ ):
42
+ logging.info("Worker thread: Starting base_iterator work")
43
+ stateful_iterator = state.build()
44
+ iterator = stateful_iterator.create_iter()
45
+ for item in iterator:
46
+ while not stop_event.is_set():
47
+ try:
48
+ # Attempt to put the item on the queue, or time out and try again (the main thread may be busy)
49
+ batch_queue.put(item, timeout=0.1)
50
+ # On success, stop trying
51
+ break
52
+ except Full:
53
+ pass
54
+ if stop_event.is_set():
55
+ # Signal the end of output. This ensures that even if the queue takes a while to
56
+ # buffer, the main thread still receives everything (and tosses this fake batch)
57
+ logging.info(
58
+ "Worker thread: Stop event detected, outputting is_final=True batch"
59
+ )
60
+ batch_queue.put(
61
+ Batch(
62
+ x=np.zeros((1, 1)),
63
+ y=np.zeros((1, 1)),
64
+ is_final=True,
65
+ mask=None,
66
+ patch_lengths=None,
67
+ ngram_ids=None,
68
+ )
69
+ )
70
+ break
71
+
72
+ try:
73
+ logging.info("Worker thread: outputting state")
74
+ state_queue.put(iterator.get_state(), timeout=1)
75
+ logging.info("Worker thread: state dump complete")
76
+ state_dumped_event.set()
77
+ logging.info("Worker thread: set state_dump_event")
78
+ except Full:
79
+ raise ValueError(
80
+ "Attempted to dump state into the state queue, but it was full"
81
+ )
82
+
83
+
84
+ class MultiprocessIterator(StatefulIterator):
85
+ """
86
+ Design sketch of the multiprocess iterator:
87
+
88
+ Given the base_iterator, the only thing we do with this is call get_state()
89
+ so that we can pass that through to the background worker process.
90
+
91
+ The background process will receive this, rebuild the iterator, then start yielding from it.
92
+
93
+ However, in order to implement MultiprocessIterator.get_state(), we need to be able to accurately get
94
+ (1) the state of the iterator in the worker process
95
+ (2) the currently buffered items in the Queue
96
+
97
+ To do this, we use:
98
+ - batch_queue: This is the prefetch buffer the worker yields to and the main loop yields from
99
+ - state_queue: This size 1 queue will be how the worker sends the iterator state once it has halted iterating.
100
+ It must hold the state in addition to the last batch, if the queue was full at the time the stop event is sent.
101
+ - stop_iterating_event: Once this is issued from the main loop, the worker will stop iterating and enter cleanup.
102
+ During cleanup, the iterator will send the state of the current iterator to the main loop,
103
+ in addition to possibly the last batch if the batch_queue was full at the time
104
+ - state_dumped_event: When the main loop issues the stop_iterating_event, it will wait until the state_dumped_event to attempt
105
+ to get state from the state_queue. It must do this since the worker may take some time to create and send the state.
106
+ Once received by the main loop, the main loop can safely store the Queue (plus maybe the last batch) as the prefetch buffer,
107
+ get the worker iterator's state, and terminate the background process + delete associated objects.
108
+
109
+ At this point, calling create_iter() again will bootstrap everything from the stored state and the old iterator will throw an error
110
+ since it will not iterate anymore (so the caller must call create_iter() again to get a python iterator).
111
+
112
+ """
113
+
114
+ def __init__(
115
+ self,
116
+ base_iterator: StatefulIterator,
117
+ *,
118
+ n_batches_to_prefetch: int,
119
+ prefetch_buffer: list | None = None
120
+ ):
121
+ self.base_iterator = base_iterator
122
+ self.n_batches_to_prefetch = n_batches_to_prefetch
123
+ if prefetch_buffer is None:
124
+ prefetch_buffer = []
125
+ self.prefetch_buffer = prefetch_buffer
126
+ self.batch_queue = None
127
+ self.state_queue = None
128
+ self.producer = None
129
+ self.stop_iterating_event = None
130
+ self.state_dumped_event = None
131
+
132
+ def get_state(self) -> MultiprocessIteratorState:
133
+ """
134
+ This is slightly unusual in effectively destroying the current iterator, its necessary
135
+ to halt the background process and allow it to write the state to the main loop
136
+ in order to not lose data
137
+ """
138
+ if self.producer is None:
139
+ serialized_prefetch_buffer = json.dumps(
140
+ [b.to_python_dict() for b in self.prefetch_buffer]
141
+ )
142
+ return MultiprocessIteratorState(
143
+ base_iterator_state=self.base_iterator.get_state(),
144
+ n_batches_to_prefetch=self.n_batches_to_prefetch,
145
+ serialized_prefetch_buffer=serialized_prefetch_buffer,
146
+ )
147
+ else:
148
+ logging.info("Main thread: Sending stop iteration event")
149
+ self.stop_iterating_event.set()
150
+ logging.info("Main thread: Waiting for state_dumped event")
151
+ self.state_dumped_event.wait()
152
+ self.prefetch_buffer = []
153
+ final_batch_received = False
154
+ while True:
155
+ try:
156
+ batch = self.batch_queue.get(timeout=1)
157
+ if batch.is_final:
158
+ final_batch_received = True
159
+ break
160
+ self.prefetch_buffer.append(batch)
161
+ except Empty:
162
+ logging.warning("Main thread: batch_queue is abnormally empty")
163
+ assert final_batch_received
164
+
165
+ try:
166
+ base_iterator_state = self.state_queue.get(timeout=1)
167
+ assert isinstance(base_iterator_state, IteratorState)
168
+ except Empty:
169
+ raise ValueError(
170
+ "Attempted to get the state, but it was unexpectantly missing"
171
+ )
172
+
173
+ self.base_iterator = base_iterator_state.build()
174
+ self.producer.close()
175
+ self.producer = None
176
+ self.batch_queue = None
177
+ self.state_queue = None
178
+ self.stop_iterating_event = None
179
+ self.state_dumped_event = None
180
+
181
+ return MultiprocessIteratorState(
182
+ base_iterator_state=self.base_iterator.get_state(),
183
+ n_batches_to_prefetch=self.n_batches_to_prefetch,
184
+ serialized_prefetch_buffer=json.dumps(
185
+ [b.to_python_dict() for b in self.prefetch_buffer]
186
+ ),
187
+ )
188
+
189
+ def create_iter(self):
190
+ logging.info("Main thread: Creating MP iterator")
191
+ # First yield from the stored prefetch buffer.
192
+ if self.prefetch_buffer is not None:
193
+ while len(self.prefetch_buffer) > 0:
194
+ item = self.prefetch_buffer.pop(0)
195
+ yield item
196
+ self.prefetch_buffer = None
197
+
198
+ assert (
199
+ self.producer is None
200
+ ), "Cannot create two parallel iterators at once, call get_state() then remake to have two."
201
+
202
+ # using the "forkserver" multiprocessing context avoids excessive CPU load
203
+ ctx = mp.get_context("forkserver")
204
+ self.batch_queue = ctx.Manager().Queue(maxsize=self.n_batches_to_prefetch)
205
+
206
+ # We should only ever have one state, which is output when a stop event is detected
207
+ self.state_queue = ctx.Manager().Queue(maxsize=1)
208
+
209
+ self.stop_iterating_event = ctx.Event()
210
+ self.state_dumped_event = ctx.Event()
211
+
212
+ self.producer = mp.Process(
213
+ name="blt_data_loader",
214
+ target=start_work_from_state,
215
+ args=(
216
+ self.batch_queue,
217
+ self.state_queue,
218
+ self.stop_iterating_event,
219
+ self.state_dumped_event,
220
+ self.base_iterator.get_state(),
221
+ ),
222
+ )
223
+ logger.info("Async dataloader started")
224
+ self.producer.start()
225
+
226
+ while True:
227
+ if self.producer.exitcode is not None:
228
+ raise RuntimeError(
229
+ "Data loader quit unexpectedly, real error has been raised previously"
230
+ )
231
+ try:
232
+ batch = self.batch_queue.get(timeout=0.1)
233
+ assert isinstance(batch, Batch)
234
+ assert (
235
+ not batch.is_final
236
+ ), "is_final should only be used during get_state() being called"
237
+ yield batch
238
+ except Empty:
239
+ pass
240
+ if self.producer is None:
241
+ raise ValueError(
242
+ "Attempted to call this iterator after calling get_state(). You must call create_iter() to make a new iterator instead."
243
+ )
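A hedged usage sketch for the class above: `packing_iterator` is a stand-in for any StatefulIterator that yields Batch objects (such as the PackingIterator in the next file), and the numbers are arbitrary.

from bytelatent.data.iterators.multiprocess_iterator import MultiprocessIterator

# `packing_iterator` is assumed to already be built elsewhere.
mp_it = MultiprocessIterator(packing_iterator, n_batches_to_prefetch=8)

batches = mp_it.create_iter()      # starts the background producer process
for _ in range(100):
    batch = next(batches)          # Batch objects arrive via the prefetch queue

state = mp_it.get_state()          # halts the worker and drains the queue into the buffer
# ... persist `state` alongside the training checkpoint, then later ...
resumed_iter = state.build().create_iter()  # replays buffered batches, then continues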
bytelatent/data/iterators/packing_iterator.py ADDED
@@ -0,0 +1,226 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from typing import Any
3
+
4
+ import numpy as np
5
+ from pydantic import BaseModel, ConfigDict
6
+
7
+ from bytelatent.data.data_types import Batch, BltSequence
8
+ from bytelatent.data.iterators.abstract_iterator import IteratorState, StatefulIterator
9
+ from bytelatent.data.iterators.sampling_iterator import SamplingIteratorState
10
+
11
+
12
+ class PackingArgs(BaseModel):
13
+ model_config = ConfigDict(extra="forbid")
14
+ batch_size: int
15
+ seq_len: int
16
+ pad_id: int
17
+ max_length: int | None
18
+ pad_to_max_length: bool
19
+ enable_byte_ngrams: bool
20
+
21
+
22
+ class PackingIteratorState(BaseModel, IteratorState):
23
+ model_config = ConfigDict(extra="forbid")
24
+ sequence_iterator_state: SamplingIteratorState
25
+ packing_args: PackingArgs
26
+
27
+ def build(self) -> "PackingIterator":
28
+ return PackingIterator(
29
+ sequence_iterator=self.sequence_iterator_state.build(),
30
+ packing_args=self.packing_args,
31
+ )
32
+
33
+
34
+ def _merge_patch_seq_masks(bs, slen: int, mask_seqs: list[list[bool]]):
35
+ assert len(mask_seqs) == bs
36
+ lens = [len(m) for m in mask_seqs]
37
+ if all(all(m) for m in mask_seqs) and all(lens[0] == l for l in lens):
38
+ return None
39
+ assert slen == max(lens) - 1
40
+ mask = np.zeros((bs, slen), dtype=bool)
41
+ for i, m in enumerate(mask_seqs):
42
+ if m is None:
43
+ print(
44
+ "Did not implement None mask, the mask should be True for all toks, so we need to pass that to this function."
45
+ )
46
+ raise NotImplementedError
47
+ mask[i][: len(mask_seqs[i]) - 1] = mask_seqs[i][1:]
48
+ return mask
49
+
50
+
51
+ def truncate_batch(
52
+ batch: Batch,
53
+ max_length: int,
54
+ pad_id: int,
55
+ pad_to_max_length: bool = False,
56
+ *,
57
+ enable_byte_ngrams: bool,
58
+ ):
59
+ """
60
+ Truncate x to a given size, making sure we remove the corresponding patch sizes in patch_lengths
61
+ and fix batch.mask.
62
+
63
+ batch.patch_lengths has unchanged shape
64
+ x, y, and mask may reduce in size
65
+ """
66
+ if batch.patch_lengths is None:
67
+ return batch
68
+
69
+ seq_lengths = batch.patch_lengths.sum(axis=1)
70
+ max_length_adj = max_length + 1
71
+ if np.any(seq_lengths > max_length_adj):
72
+ for i in range(batch.x.shape[0]):
73
+ if seq_lengths[i] > max_length_adj:
74
+ # Find id of patch that tips over max_length + 1
75
+ count, j = 0, 0
76
+ while count + batch.patch_lengths[i, j] <= max_length_adj:
77
+ count += batch.patch_lengths[i, j]
78
+ j += 1
79
+ # Edit the batch
80
+ assert j < batch.patch_lengths.shape[1]
81
+ batch.x[i, max_length:] = pad_id
82
+ batch.y[i, max_length:] = pad_id
83
+ if batch.mask is not None:
84
+ batch.mask[i, max_length:] = False
85
+ batch.patch_lengths[i, j:] = 0
86
+ batch.patch_lengths[i, j] = max_length_adj - count
87
+
88
+ # Truncate if necessary.
89
+ if max_length < batch.x.shape[1]:
90
+ batch.x = batch.x[:, :max_length]
91
+ batch.y = batch.y[:, :max_length]
92
+ if batch.mask is not None:
93
+ batch.mask = batch.mask[:, :max_length]
94
+
95
+ # Right pad to max_length if necessary
96
+ elif pad_to_max_length:
97
+ if batch.x.shape[1] < max_length:
98
+ # NOTE: this has to be done on an actual patch.
99
+ non_zero_indices = (batch.patch_lengths != 0).sum(axis=1) - 1
100
+ non_zero_indices = np.maximum(0, non_zero_indices)
101
+ batch.patch_lengths[range(len(batch.patch_lengths)), non_zero_indices] += (
102
+ max_length - batch.x.shape[1]
103
+ )
104
+ # TODO: We could get rid of many of these complications by moving this function directly into the dataloader.
105
+ x = np.full((batch.x.shape[0], max_length), pad_id, dtype=batch.x.dtype)
106
+ x[:, : batch.x.shape[1]] = batch.x
107
+ batch.x = x
108
+ if batch.y.shape[1] < max_length:
109
+ y = np.full((batch.y.shape[0], max_length), pad_id, dtype=batch.y.dtype)
110
+ y[:, : batch.y.shape[1]] = batch.y
111
+ batch.y = y
112
+ if batch.mask is not None and batch.mask.shape[1] < max_length:
113
+ mask = np.full(
114
+ (batch.mask.shape[0], max_length), False, dtype=batch.mask.dtype
115
+ )
116
+ mask[:, : batch.mask.shape[1]] = batch.mask
117
+ batch.mask = mask
118
+
119
+ assert batch.x.shape[1] <= max_length
120
+ assert batch.y.shape[1] <= max_length
121
+ assert batch.mask is None or batch.mask.shape[1] <= max_length
122
+ assert np.all(max_length_adj - batch.patch_lengths.sum(axis=1) == 0)
123
+ if pad_to_max_length:
124
+ assert batch.x.shape[1] == max_length
125
+ assert batch.y.shape[1] == max_length
126
+ assert batch.mask is None or batch.mask.shape[1] == max_length
127
+ if enable_byte_ngrams:
128
+ raise NotImplementedError()
129
+ # (num_ngram, batch_size, seq_len)
130
+ ngram_ids = np.array(tokenizer.encode_token_ngrams(batch.x))
131
+ assert ngram_ids.shape[2] == batch.x.shape[1]
132
+ else:
133
+ ngram_ids = None
134
+ batch.ngram_ids = ngram_ids
135
+
136
+
137
+ class PackingIterator(StatefulIterator[Batch, PackingIteratorState]):
138
+ def __init__(
139
+ self,
140
+ sequence_iterator: StatefulIterator[BltSequence, Any],
141
+ *,
142
+ packing_args: PackingArgs,
143
+ ):
144
+ self.sequence_iterator = sequence_iterator
145
+ self.packing_args = packing_args
146
+
147
+ def get_state(self):
148
+ return PackingIteratorState(
149
+ sequence_iterator_state=self.sequence_iterator.get_state(),
150
+ packing_args=self.packing_args,
151
+ )
152
+
153
+ def create_iter(self):
154
+ sequence_iter = self.sequence_iterator.create_iter()
155
+ batch_size = self.packing_args.batch_size
156
+ pad_id = self.packing_args.pad_id
157
+ seq_len = self.packing_args.seq_len
158
+ pad_to_max_length = self.packing_args.pad_to_max_length
159
+ enable_byte_ngrams = self.packing_args.enable_byte_ngrams
160
+ max_length = self.packing_args.max_length
161
+ while True:
162
+ tokens: list[list[int]] = []
163
+ masks: list[list[bool]] = []
164
+ patch_lengths: list[list[int]] = []
165
+
166
+ for _ in range(self.packing_args.batch_size):
167
+ sequence = next(sequence_iter)
168
+ _tokens = sequence.tokens
169
+ _mask = sequence.mask
170
+ _patch_lengths = sequence.patch_lengths
171
+ assert len(sequence.patch_lengths) == self.packing_args.seq_len
172
+ last_patch_length = 0
173
+ if _patch_lengths[0] > 1:
174
+ last_patch_length = _patch_lengths[-1]
175
+ _patch_lengths[0] -= 1
176
+ _patch_lengths = [1] + _patch_lengths[:-1]
177
+ tokens.append(_tokens[: len(_tokens) - last_patch_length])
178
+ masks.append(_mask[: len(_mask) - last_patch_length])
179
+ patch_lengths.append(_patch_lengths)
180
+
181
+ x_patch_lengths = np.array(patch_lengths)
182
+ # pad batch to same length
183
+ tok_seq_len = max([len(toks) for toks in tokens]) - 1
184
+ x = np.full((batch_size, tok_seq_len), fill_value=pad_id)
185
+ y = np.full((batch_size, tok_seq_len), fill_value=pad_id)
186
+
187
+ for i, tok_seq in enumerate(tokens):
188
+ x[i, : len(tok_seq) - 1] = tok_seq[:-1]
189
+ y[i, : len(tok_seq) - 1] = tok_seq[1:]
190
+ # Adjust patch lengths to match x
191
+ x_patch_lengths[i, -1] += tok_seq_len - (len(tok_seq) - 1)
192
+
193
+ assert x_patch_lengths.shape == (batch_size, seq_len)
194
+
195
+ if enable_byte_ngrams:
196
+ raise NotImplementedError()
197
+ else:
198
+ ngram_ids = None
199
+
200
+ batch = Batch(
201
+ x=x,
202
+ y=y,
203
+ patch_lengths=x_patch_lengths,
204
+ ngram_ids=ngram_ids,
205
+ mask=_merge_patch_seq_masks(batch_size, tok_seq_len, masks),
206
+ )
207
+ assert (
208
+ x_patch_lengths.sum() == x.size + batch_size
209
+ ), f"{x_patch_lengths.sum()} != {x.size + batch_size}"
210
+ assert (
211
+ batch.mask is None or np.sum(x != pad_id) == batch.mask.sum()
212
+ ), f"{np.sum(x != pad_id)} != {batch.mask.sum()}"
213
+ assert np.all(
214
+ x_patch_lengths[:, 0] == 1
215
+ ), f"first patch should always be 1, {x_patch_lengths[:, 0]}"
216
+ # cuda_gb_allocated = (torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024)
217
+ # cuda_gb_reserved = torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024
218
+ # print(f"dataloader cuda_gb_allocated: {cuda_gb_allocated}, cuda_gb_reserved: {cuda_gb_reserved}")
219
+ truncate_batch(
220
+ batch,
221
+ max_length=max_length,
222
+ pad_id=pad_id,
223
+ pad_to_max_length=pad_to_max_length,
224
+ enable_byte_ngrams=enable_byte_ngrams,
225
+ )
226
+ yield batch
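A worked numeric example, independent of the iterator above, of the invariant asserted in create_iter(): each row's patch lengths sum to its token count, while x and y hold one token less (the shifted views), so patch_lengths.sum() exceeds x.size by exactly batch_size.

import numpy as np

tokens = [[5, 6, 7, 8, 9]]                  # one row of 5 tokens (batch_size = 1)
patch_lengths = np.array([[1, 2, 2]])       # sums to 5; the first patch is forced to 1
x = np.array([row[:-1] for row in tokens])  # [[5, 6, 7, 8]]
y = np.array([row[1:] for row in tokens])   # [[6, 7, 8, 9]]
assert patch_lengths.sum() == x.size + len(tokens)  # 5 == 4 + 1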
bytelatent/data/iterators/preprocess_iterator.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from typing import Any, Generator
3
+
4
+ import torch
5
+ from pydantic import BaseModel, ConfigDict
6
+
7
+ from bytelatent.data.data_types import BltExample
8
+ from bytelatent.data.iterators.abstract_iterator import IteratorState, StatefulIterator
9
+ from bytelatent.data.iterators.arrow_iterator import (
10
+ ArrowFileIterator,
11
+ ArrowFileIteratorState,
12
+ )
13
+ from bytelatent.data.iterators.looping_iterator import LoopingIteratorState
14
+ from bytelatent.data.patcher import Patcher, PatcherArgs, PatchingModeEnum
15
+ from bytelatent.tokenizers.blt_tokenizer import BltTokenizer
16
+ from bytelatent.tokenizers.build_tokenizer import TokenizerArgs
17
+
18
+
19
+ class PreprocessIteratorState(BaseModel, IteratorState):
20
+ model_config = ConfigDict(extra="forbid")
21
+ arrow_file_iterator_state: ArrowFileIteratorState | LoopingIteratorState
22
+ add_tokens: bool
23
+ add_patches: bool
24
+ tokenizer_args: TokenizerArgs
25
+ patcher_args: PatcherArgs
26
+
27
+ def build(self):
28
+ arrow_iterator = self.arrow_file_iterator_state.build()
29
+ return PreprocessIterator(
30
+ arrow_iterator,
31
+ patcher_args=self.patcher_args,
32
+ tokenizer_args=self.tokenizer_args,
33
+ add_tokens=self.add_tokens,
34
+ add_patches=self.add_patches,
35
+ )
36
+
37
+
38
+ class PreprocessIterator(StatefulIterator):
39
+ """
40
+ Take BltExamples with fields filled in only from ArrowFileIterator, and fill in fields that require
41
+ preprocessing like tokenization and patching
42
+ """
43
+
44
+ def __init__(
45
+ self,
46
+ arrow_iterator: ArrowFileIterator,
47
+ *,
48
+ patcher_args: PatcherArgs,
49
+ tokenizer_args: TokenizerArgs,
50
+ add_tokens: bool = True,
51
+ add_patches: bool = True,
52
+ ):
53
+ self.arrow_iterator = arrow_iterator
54
+ self.tokenizer_args = tokenizer_args
55
+ self.patcher_args = patcher_args
56
+ self.add_tokens = add_tokens
57
+ self.add_patches = add_patches
58
+ self.tokenizer: BltTokenizer | None = None
59
+ self.patcher: Patcher | None = None
60
+
61
+ def get_state(self) -> PreprocessIteratorState:
62
+ """
63
+ The only state to maintain here is from arrow, there
64
+ isn't any internal state on this iterator.
65
+ """
66
+ return PreprocessIteratorState(
67
+ arrow_file_iterator_state=self.arrow_iterator.get_state(),
68
+ tokenizer_args=self.tokenizer_args,
69
+ patcher_args=self.patcher_args,
70
+ add_tokens=self.add_tokens,
71
+ add_patches=self.add_patches,
72
+ )
73
+
74
+ def create_iter(self) -> Generator[BltExample, Any, None]:
75
+ if self.tokenizer is None and self.add_tokens:
76
+ self.tokenizer = self.tokenizer_args.build()
77
+ if self.patcher is None and self.add_patches:
78
+ self.patcher = self.patcher_args.build()
79
+
80
+ example_iter = self.arrow_iterator.create_iter()
81
+ for example in example_iter:
82
+ if self.add_tokens:
83
+ tokens = self.tokenizer.encode(example.text)
84
+ else:
85
+ tokens = example.tokens
86
+ if (
87
+ self.patcher is not None
88
+ and self.patcher.patching_mode == PatchingModeEnum.entropy
89
+ ):
90
+ assert (
91
+ example.entropies is not None
92
+ ), "For patching, entropies cannot be None"
93
+ entropies = torch.tensor(example.entropies).unsqueeze(0)
94
+ else:
95
+ entropies = None
96
+ if self.patcher is None:
97
+ patch_lengths = None
98
+ else:
99
+ patch_lengths = self.patcher.patch(
100
+ torch.tensor(tokens).unsqueeze(0),
101
+ include_next_token=False,
102
+ entropies=entropies,
103
+ )[0][0].tolist()
104
+ yield BltExample(
105
+ sample_id=example.sample_id,
106
+ text=example.text,
107
+ tokens=tokens,
108
+ mask=[True] * len(tokens),
109
+ patch_lengths=patch_lengths,
110
+ entropies=example.entropies,
111
+ )
bytelatent/data/iterators/sampling_iterator.py ADDED
@@ -0,0 +1,66 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from typing import Any
3
+
4
+ import numpy as np
5
+ from pydantic import BaseModel, ConfigDict
6
+
7
+ from bytelatent.data.iterators.abstract_iterator import StatefulIterator
8
+ from bytelatent.data.iterators.sequence_iterator import SequenceIteratorState
9
+
10
+
11
+ class SamplingIteratorState(BaseModel):
12
+ model_config = ConfigDict(extra="forbid")
13
+ rng_state: dict[str, Any]
14
+ source_to_weight: dict[str, float]
15
+ source_to_iterator_state: dict[str, SequenceIteratorState]
16
+
17
+ def build(self) -> "SamplingIterator":
18
+ return SamplingIterator(
19
+ rng_state=self.rng_state,
20
+ source_to_weight=self.source_to_weight,
21
+ source_to_iterator={
22
+ source: state.build()
23
+ for source, state in self.source_to_iterator_state.items()
24
+ },
25
+ )
26
+
27
+
28
+ class SamplingIterator(StatefulIterator):
29
+ def __init__(
30
+ self,
31
+ *,
32
+ rng_state: dict[str, Any],
33
+ source_to_weight: dict[str, float],
34
+ source_to_iterator: dict[str, StatefulIterator],
35
+ ):
36
+ self.rng = np.random.default_rng()
37
+ self.rng.bit_generator.state = rng_state
38
+ self.source_to_weight = source_to_weight
39
+ self.source_to_iterator = source_to_iterator
40
+
41
+ def get_state(self) -> SamplingIteratorState:
42
+ return SamplingIteratorState(
43
+ rng_state=self.rng.bit_generator.state,
44
+ source_to_weight=self.source_to_weight,
45
+ source_to_iterator_state={
46
+ source: iterator.get_state()
47
+ for source, iterator in self.source_to_iterator.items()
48
+ },
49
+ )
50
+
51
+ def create_iter(self):
52
+ n_sources = len(self.source_to_weight)
53
+ possible_sources = []
54
+ weights = []
55
+ for source, w in self.source_to_weight.items():
56
+ possible_sources.append(source)
57
+ weights.append(w)
58
+
59
+ source_to_python_iter = {
60
+ source: self.source_to_iterator[source].create_iter()
61
+ for source in possible_sources
62
+ }
63
+ while True:
64
+ norm_weights = np.array(weights) / np.array(weights).sum()
65
+ source_choice = possible_sources[self.rng.choice(n_sources, p=norm_weights)]
66
+ yield next(source_to_python_iter[source_choice])
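A stand-alone sketch of the weighted draw performed in create_iter() above, using the same numpy Generator API; the source names and weights are made up.

import numpy as np

rng = np.random.default_rng(0)
sources = ["web", "code"]
weights = np.array([3.0, 1.0])
norm_weights = weights / weights.sum()  # [0.75, 0.25]

picks = [sources[rng.choice(len(sources), p=norm_weights)] for _ in range(1000)]
# Over many draws, "web" is chosen roughly three times as often as "code".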
bytelatent/data/iterators/sequence_iterator.py ADDED
@@ -0,0 +1,122 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ from logging import getLogger
3
+ from typing import Any
4
+
5
+ import numpy as np
6
+ from pydantic import BaseModel, ConfigDict
7
+
8
+ from bytelatent.data.data_types import BltSequence
9
+ from bytelatent.data.iterators.abstract_iterator import IteratorState, StatefulIterator
10
+ from bytelatent.data.iterators.preprocess_iterator import (
11
+ PreprocessIterator,
12
+ PreprocessIteratorState,
13
+ )
14
+
15
+ logger = getLogger()
16
+
17
+
18
+ class SequencePackingArgs(BaseModel):
19
+ model_config = ConfigDict(extra="forbid")
20
+ output_seq_len: int
21
+ buffer_size: int
22
+
23
+
24
+ class SequenceIteratorState(BaseModel, IteratorState):
25
+ model_config = ConfigDict(extra="forbid")
26
+ sequence_packing_args: SequencePackingArgs
27
+ preprocess_iterator_state: PreprocessIteratorState
28
+ rng_state: dict[str, Any]
29
+
30
+ def build(self):
31
+ preprocess_iterator = self.preprocess_iterator_state.build()
32
+ return SequenceIterator(
33
+ preprocess_iterator,
34
+ sequence_packing_args=self.sequence_packing_args,
35
+ rng_state=self.rng_state,
36
+ )
37
+
38
+
39
+ class SequenceIterator(StatefulIterator):
40
+ def __init__(
41
+ self,
42
+ preprocess_iterator: PreprocessIterator,
43
+ *,
44
+ rng_state: dict[str, Any],
45
+ sequence_packing_args: SequencePackingArgs,
46
+ ):
47
+ self.preprocess_iterator = preprocess_iterator
48
+ self.sequence_packing_args = sequence_packing_args
49
+ self.output_seq_len = sequence_packing_args.output_seq_len
50
+ self.buffer_size = sequence_packing_args.buffer_size
51
+ self.rng = np.random.default_rng()
52
+ self.rng.bit_generator.state = rng_state
53
+
54
+ def get_state(self):
55
+ # TODO: need to also persist the current shuffle buffer
56
+ return SequenceIteratorState(
57
+ sequence_packing_args=self.sequence_packing_args,
58
+ preprocess_iterator_state=self.preprocess_iterator.get_state(),
59
+ rng_state=self.rng.bit_generator.state,
60
+ )
61
+
62
+ def create_iter(self):
63
+ example_iter = self.preprocess_iterator.create_iter()
64
+ n_buffer_patches = self.buffer_size * self.output_seq_len
65
+
66
+ patch_lengths: list[int] = []
67
+ tokens: list[int] = []
68
+ mask: list[bool] = []
69
+ first = True
70
+ for example in example_iter:
71
+ assert example.tokens is not None
72
+ assert example.mask is not None
73
+ assert example.patch_lengths is not None
74
+ assert len(example.tokens) != 0
75
+ assert len(example.mask) != 0
76
+ assert len(example.tokens) == len(example.mask)
77
+ assert len(example.tokens) == sum(example.patch_lengths)
78
+
79
+ tokens.extend(example.tokens)
80
+ mask.extend(example.mask)
81
+ patch_lengths.extend(example.patch_lengths)
82
+
83
+ while len(patch_lengths) >= n_buffer_patches:
84
+ if first:
85
+ first = False
86
+ logger.info("First buffer complete")
87
+
88
+ x_patches = np.array(patch_lengths[:n_buffer_patches]).reshape(
89
+ self.buffer_size, self.output_seq_len
90
+ )
91
+ seq_tokens = []
92
+ seq_mask = []
93
+ start_id = 0
94
+ # We fix the number of patches (and therefore global steps) per batch,
95
+ # so each sequence has a variable number of tokens that we need to account for
96
+ for num_tokens in x_patches.sum(axis=-1):
97
+ seq_tokens.append(tokens[start_id : start_id + num_tokens])
98
+ seq_mask.append(mask[start_id : start_id + num_tokens])
99
+ start_id += num_tokens
100
+
101
+ assert start_id == x_patches.sum()
102
+
103
+ # Remove what we just added from the buffer
104
+ patch_lengths = patch_lengths[n_buffer_patches:]
105
+ tokens = tokens[x_patches.sum() :]
106
+ mask = mask[x_patches.sum() :]
107
+
108
+ seq_patch_lengths: list[list[int]] = x_patches.tolist()
109
+ assert len(seq_patch_lengths) == self.buffer_size
110
+ for idx in self.rng.permutation(len(seq_patch_lengths)):
111
+ assert len(seq_patch_lengths[idx]) == self.output_seq_len
112
+ assert (
113
+ sum(seq_patch_lengths[idx])
114
+ == len(seq_tokens[idx])
115
+ == len(seq_mask[idx])
116
+ ), f"{sum(seq_patch_lengths[idx])}, {len(seq_tokens[idx])} {len(seq_mask[idx])}, idx={idx}"
117
+ assert seq_patch_lengths[idx][0] > 0, f"{seq_patch_lengths[idx]}"
118
+ yield BltSequence(
119
+ tokens=seq_tokens[idx],
120
+ mask=seq_mask[idx],
121
+ patch_lengths=seq_patch_lengths[idx],
122
+ )
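A small numeric sketch of the buffering step above: the flat patch buffer is cut into buffer_size rows of output_seq_len patches, and each row then consumes however many tokens its patches add up to.

import numpy as np

buffer_size, output_seq_len = 2, 3
patch_lengths = [1, 2, 2, 1, 4, 3, 1, 1]         # accumulated across examples
n_buffer_patches = buffer_size * output_seq_len  # 6 patches per flush

x_patches = np.array(patch_lengths[:n_buffer_patches]).reshape(
    buffer_size, output_seq_len
)
# x_patches.sum(axis=-1) == [5, 8]: the first sequence takes the next 5 tokens
# from the token buffer, the second takes the following 8.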
bytelatent/data/iterators/test_arrow_iterator.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import numpy as np
3
+ import pyarrow as pa
4
+
5
+ # pyarrow needs the initialization from this import
6
+ import pyarrow.dataset # pyright: ignore
7
+
8
+ from bytelatent.constants import BLT_DATA
9
+ from bytelatent.data.iterators.arrow_iterator import ArrowFileIteratorState
10
+
11
+ ENTROPY_MODEL = "transformer_100m"
12
+ ARROW_TEST_DATA_1 = str(BLT_DATA / "stackexchange.chunk.00.jsonl.shard_00.arrow")
13
+ ARROW_TEST_DATA_2 = str(BLT_DATA / "stackexchange.chunk.00.jsonl.shard_01.arrow")
14
+
15
+
16
+ def test_basic_arrow_file():
17
+ dataset = pa.dataset.dataset(ARROW_TEST_DATA_1, format="arrow")
18
+ n_head = 1000
19
+ head_df = dataset.head(n_head).to_pandas()
20
+
21
+ initial_state = ArrowFileIteratorState(
22
+ file_path=None,
23
+ num_workers=1,
24
+ worker_id=0,
25
+ preprocess_dir=None,
26
+ entropy_model_name=ENTROPY_MODEL,
27
+ dataset_files=[ARROW_TEST_DATA_1],
28
+ row_num=0,
29
+ arrow_batch_size=100,
30
+ )
31
+ arrow_file = initial_state.build()
32
+ start_state = arrow_file.get_state()
33
+ assert start_state.row_num == initial_state.row_num
34
+
35
+ sample_id = None
36
+ for example in arrow_file.create_iter():
37
+ sample_id = example.sample_id
38
+ assert head_df.iloc[0]["sample_id"] == sample_id
39
+ break
40
+
41
+ assert arrow_file.get_state().row_num == 1
42
+ arrow_file = initial_state.build()
43
+ for example in arrow_file.create_iter():
44
+ assert example.sample_id == sample_id
45
+ assert head_df.iloc[0]["sample_id"] == sample_id
46
+ break
47
+
48
+ # Test resume far enough in to be past the batch size of 100
49
+ resumed_state = ArrowFileIteratorState(
50
+ file_path=None,
51
+ num_workers=1,
52
+ worker_id=0,
53
+ preprocess_dir=None,
54
+ entropy_model_name=ENTROPY_MODEL,
55
+ dataset_files=[ARROW_TEST_DATA_1],
56
+ row_num=251,
57
+ arrow_batch_size=100,
58
+ )
59
+ arrow_file = resumed_state.build()
60
+ for example in arrow_file.create_iter():
61
+ assert example.sample_id == head_df.iloc[251]["sample_id"]
62
+ assert arrow_file.get_state().row_num == 252
63
+ break
64
+
65
+ world_rank = 1
66
+ world_size = 4
67
+ # Test World Size and Rank
68
+ rank_state = ArrowFileIteratorState(
69
+ file_path=None,
70
+ num_workers=world_size,
71
+ worker_id=world_rank,
72
+ preprocess_dir=None,
73
+ entropy_model_name=ENTROPY_MODEL,
74
+ dataset_files=[ARROW_TEST_DATA_1],
75
+ row_num=0,
76
+ arrow_batch_size=100,
77
+ )
78
+ arrow_file = rank_state.build()
79
+ expected_ids = []
80
+ for i in range(n_head):
81
+ if i % world_size == world_rank:
82
+ expected_ids.append(head_df.iloc[i]["sample_id"])
83
+ print(len(expected_ids))
84
+ i = 0
85
+ for example in arrow_file.create_iter():
86
+ assert example.sample_id == expected_ids[i]
87
+ i += 1
88
+ if i >= len(expected_ids):
89
+ break
bytelatent/data/iterators/test_iters.py ADDED
@@ -0,0 +1,162 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import pandas as pd
3
+ from pydantic import BaseModel
4
+
5
+ from bytelatent.constants import BLT_DATA
6
+ from bytelatent.data.data_types import BltExample
7
+ from bytelatent.data.iterators.abstract_iterator import IteratorState, StatefulIterator
8
+ from bytelatent.data.iterators.preprocess_iterator import PreprocessIterator
9
+ from bytelatent.data.patcher import PatcherArgs, PatchingModeEnum
10
+ from bytelatent.tokenizers.build_tokenizer import TokenizerArgs
11
+
12
+
13
+ class BltTestIteratorState(BaseModel, IteratorState):
14
+ position: int
15
+ total: int
16
+
17
+ def build(self):
18
+ blt_iter = BltTestIterator(total=self.total)
19
+ blt_iter.position = self.position
20
+ return blt_iter
21
+
22
+
23
+ class BltTestIterator(StatefulIterator):
24
+ def __init__(self, total: int):
25
+ self.position = 0
26
+ self.total = total
27
+
28
+ def get_state(self):
29
+ return BltTestIteratorState(position=self.position, total=self.total)
30
+
31
+ def create_iter(self):
32
+ for i in range(self.total):
33
+ self.position += 1
34
+ yield BltExample(
35
+ sample_id=f"test_{i}",
36
+ text=f"This is some test {i} text.",
37
+ tokens=None,
38
+ mask=None,
39
+ entropies=None,
40
+ patch_lengths=None,
41
+ )
42
+
43
+
44
+ class BltTestWithEntropiesIteratorState(BaseModel, IteratorState):
45
+ position: int
46
+ total: int
47
+
48
+ def build(self):
49
+ blt_iter = BltTestWithEntropiesIterator(total=self.total)
50
+ blt_iter.position = self.position
51
+ return blt_iter
52
+
53
+
54
+ class BltTestWithEntropiesIterator(StatefulIterator):
55
+ def __init__(self, total: int):
56
+ self.position = 0
57
+ self.total = total
58
+
59
+ def get_state(self):
60
+ return BltTestWithEntropiesIteratorState(position=self.position, total=self.total)
61
+
62
+ def create_iter(self):
63
+ text = "Daenerys Targaryen is in Game of Thrones, a fantasy epic by George R.R. Martin."
64
+ df = pd.read_json("fixtures/tokens_with_entropies.json")
65
+ tokens = df["token_ids"].tolist()
66
+ entropies = df["entropies"].tolist()
67
+ # BOS and EOS
68
+ assert len(tokens) == len(text) + 2
69
+ for i in range(self.total):
70
+ self.position += 1
71
+ yield BltExample(
72
+ sample_id=f"test_{i}",
73
+ text=text,
74
+ tokens=tokens,
75
+ mask=[True] * len(tokens),
76
+ entropies=entropies,
77
+ patch_lengths=None,
78
+ )
79
+
80
+
81
+ def test_preprocess_iter():
82
+ total = 3
83
+ tokenizer_args = TokenizerArgs(
84
+ name="blt",
85
+ init_kwargs={
86
+ "bpe_tokenizer_path": BLT_DATA / "tokenizer_final_32k.minus_inf_ws.model"
87
+ },
88
+ )
89
+ for mode in [
90
+ PatchingModeEnum.bpe,
91
+ PatchingModeEnum.space,
92
+ ]:
93
+ data_it = BltTestIterator(total)
94
+ patcher_args = PatcherArgs(patching_mode=mode)
95
+ example_it = PreprocessIterator(
96
+ data_it, tokenizer_args=tokenizer_args, patcher_args=patcher_args
97
+ )
98
+ count = 0
99
+ for example in example_it.create_iter():
100
+ assert isinstance(example.tokens, list)
101
+ assert isinstance(example.tokens[0], int)
102
+ # BOS and EOS
103
+ assert len(example.tokens) == len(example.text) + 2
104
+ assert example.mask is not None
105
+ assert len(example.tokens) == len(example.mask)
106
+ count += 1
107
+
108
+ assert count == total
109
+
110
+
111
+ def test_non_entropy_patch_iter():
112
+ total = 3
113
+ tokenizer_args = TokenizerArgs(
114
+ name="blt",
115
+ init_kwargs={
116
+ "bpe_tokenizer_path": BLT_DATA / "tokenizer_final_32k.minus_inf_ws.model"
117
+ },
118
+ )
119
+ for mode in [
120
+ PatchingModeEnum.bpe,
121
+ PatchingModeEnum.space,
122
+ ]:
123
+ patcher_args = PatcherArgs(patching_mode=mode)
124
+ data_it = BltTestIterator(total)
125
+ example_it = PreprocessIterator(
126
+ data_it, tokenizer_args=tokenizer_args, patcher_args=patcher_args
127
+ )
128
+
129
+ count = 0
130
+ for example in example_it.create_iter():
131
+ assert isinstance(example.patch_lengths, list)
132
+ assert isinstance(example.patch_lengths[0], int)
133
+ assert len(example.tokens) == sum(example.patch_lengths)
134
+ count += 1
135
+
136
+ assert count == total
137
+
138
+
139
+ def test_entropy_patch_iter():
140
+ total = 2
141
+ patcher_args = PatcherArgs(
142
+ patching_mode=PatchingModeEnum.entropy, threshold=1.335442066192627
143
+ )
144
+ tokenizer_args = TokenizerArgs(
145
+ name="blt",
146
+ init_kwargs={
147
+ "bpe_tokenizer_path": BLT_DATA / "tokenizer_final_32k.minus_inf_ws.model"
148
+ },
149
+ )
150
+ data_it = BltTestWithEntropiesIterator(total)
151
+ example_it = PreprocessIterator(
152
+ data_it, tokenizer_args=tokenizer_args, patcher_args=patcher_args
153
+ )
154
+
155
+ count = 0
156
+ for example in example_it.create_iter():
157
+ assert isinstance(example.patch_lengths, list)
158
+ assert isinstance(example.patch_lengths[0], int)
159
+ assert len(example.tokens) == sum(example.patch_lengths)
160
+ count += 1
161
+
162
+ assert count == total
bytelatent/data/ngram_processor.py ADDED
@@ -0,0 +1,146 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import pickle
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+
7
+ from bytelatent import ByteLatentError
8
+
9
+ LOOKUP_OFFSET = 4
10
+
11
+
12
+ def apply_lookup_table_wrapper(ngram_to_idx: dict[tuple, int], lookup_offset=1):
13
+ """
14
+ Wrapper function for applying the lookup table to each n-gram.
15
+
16
+ :param ngram_to_idx: Dictionary mapping n-gram tuples to their ids.
17
+ :param lookup_offset: Offset added to each id returned by the lookup.
18
+ :return: A function that maps an n-gram (array of numbers) to its id plus
19
+ lookup_offset, or 0 if the n-gram is not in the dictionary.
20
+ """
21
+
22
+ def apply_lookup_table(ngram):
23
+ """
24
+ Function to apply to each n-gram: converts it to a tuple and looks it up in a dictionary.
25
+
26
+ :param ngram: Array of numbers representing an n-gram.
27
+ :return: The id associated with the n-gram tuple plus lookup_offset, or 0 if the n-gram is not found.
28
+ """
29
+ # Convert the n-gram to a tuple
30
+ ngram_tuple = tuple(ngram)
31
+
32
+ if ngram_tuple not in ngram_to_idx:
33
+ return 0
34
+ else:
35
+ return ngram_to_idx[ngram_tuple] + lookup_offset
36
+
37
+ return apply_lookup_table
38
+
39
+
40
+ def get_byte_ngrams_ids(
41
+ byte_array: np.ndarray, n: int, ngram_to_idx: dict[tuple, int], pad_value=0
42
+ ):
43
+ """
44
+ Generate n-grams from a 2D numpy array.
45
+
46
+ :param n: The length of each n-gram.
47
+ :param pad_value: Value used to left-pad each row of byte values so the n-gram array keeps the input's dimensions.
48
+ :return: A 2D numpy array where each element is the ID of an n-gram offset by LOOKUP_OFFSET.
49
+ """
50
+ num_rows, num_cols = byte_array.shape
51
+
52
+ # Create an array to hold the padded version of the original array
53
+ padded_array = np.pad(
54
+ byte_array, ((0, 0), (n - 1, 0)), mode="constant", constant_values=pad_value
55
+ )
56
+
57
+ # Use stride tricks to avoid explicit looping
58
+ strided = np.lib.stride_tricks.as_strided
59
+ shape = (num_rows, num_cols, n)
60
+ strides = padded_array.strides[:2] + (padded_array.strides[1],)
61
+ ngrams = strided(padded_array, shape=shape, strides=strides)
62
+
63
+ ngram_ids = np.apply_along_axis(
64
+ apply_lookup_table_wrapper(ngram_to_idx, lookup_offset=LOOKUP_OFFSET), 2, ngrams
65
+ )
66
+ assert ngram_ids.shape == byte_array.shape
67
+ return ngram_ids
68
+
69
+
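A quick sketch of what the sliding-window lookup above computes, using a toy 2-gram table (the import path and the table contents are made up for illustration):

    import numpy as np
    from bytelatent.data.ngram_processor import get_byte_ngrams_ids

    # Toy table: two known 2-grams with ids 0 and 1 (offset by LOOKUP_OFFSET = 4 in the output).
    ngram_to_idx = {(104, 105): 0, (105, 32): 1}
    byte_array = np.array([[104, 105, 32]])  # one row of byte values
    ids = get_byte_ngrams_ids(byte_array, n=2, ngram_to_idx=ngram_to_idx, pad_value=0)
    # Row 0 sees the 2-grams (0, 104) -> unknown -> 0, (104, 105) -> 0 + 4, (105, 32) -> 1 + 4.
    print(ids)  # [[0 4 5]]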
70
+ def reload_tables(
71
+ ngram_table_dir: str, ngram_to_size: dict[int, int], offset: int = LOOKUP_OFFSET
72
+ ) -> tuple[dict[int, dict[tuple, int]], dict[int, list], dict[int, int]]:
73
+ """
74
+ Reload lookup tables from a directory. Reload only the ngrams in the dictionary and per ngram,
75
+ only load up to the max specified size. Return the actual number of ngrams taken per ngram size.
76
+ """
77
+ idx_to_ngram_tables = {}
78
+ ngram_to_idx_tables = {}
79
+ vocab_sizes = {}
80
+ for ngram, size in ngram_to_size.items():
81
+ with open(Path(ngram_table_dir) / f"ngram-{ngram}.pickle", "rb") as f:
82
+ # These are already sorted by count
83
+ # Value: tuple of: count, ngram, dataset
84
+ ngram_data: list[tuple[tuple, tuple[int, int, str]]] = pickle.load(f)[
85
+ "counts"
86
+ ]
87
+ table = [ngram for ngram, _ in ngram_data][:size]
88
+ if len(table) != size:
89
+ raise ValueError(
90
+ f"Ngram table for {ngram}-gram is not large enough to get {size} ngrams, max size is {len(ngram_data)}"
91
+ )
92
+ ngram_to_idx = {ngram: idx for idx, ngram in enumerate(table)}
93
+ actual_size = len(table)
94
+ idx_to_ngram_tables[ngram] = table
95
+ ngram_to_idx_tables[ngram] = ngram_to_idx
96
+ vocab_sizes[ngram] = actual_size + offset
97
+ return ngram_to_idx_tables, idx_to_ngram_tables, vocab_sizes
98
+
99
+
100
+ def parse_ngram_to_size(ngram_to_size_str: str | None) -> dict[int, int]:
101
+ if ngram_to_size_str is None:
102
+ return None
103
+ ngram_to_size = {}
104
+ for entry in ngram_to_size_str.split(","):
105
+ ngram, size = entry.split(":")
106
+ ngram = int(ngram)
107
+ size = int(size)
108
+ ngram_to_size[ngram] = size
109
+ return ngram_to_size
110
+
111
+
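For reference, the config string parsed here is a comma-separated list of n:size pairs; a small example (values made up, import path assumed):

    from bytelatent.data.ngram_processor import parse_ngram_to_size

    parse_ngram_to_size("2:1000,3:500,4:250")  # -> {2: 1000, 3: 500, 4: 250}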
112
+ class NgramProcessor:
113
+ def __init__(
114
+ self,
115
+ ngram_table_dir: str | None = None,
116
+ ngram_to_size: dict[int, int] | None = None,
117
+ ):
118
+ if ngram_table_dir is None or ngram_to_size is None:
119
+ raise ByteLatentError(
120
+ "ngram_table_dir and ngram_to_size cannot be none if enable_byte_ngrams is True"
121
+ )
122
+ (
123
+ self.ngram_to_idx_tables,
124
+ self.idx_to_ngram_tables,
125
+ self.ngram_vocab_sizes,
126
+ ) = reload_tables(ngram_table_dir, ngram_to_size)
127
+ # Lowest to highest ngram
128
+ self.ngram_sizes = sorted(list(self.ngram_to_idx_tables.keys()))
129
+ # Although the model might not use all the ngrams, we need the tokenizer
130
+ # to produce ngram_ids such that index zero is the 2-gram, later on in
131
+ # src.model.megabyte.Megabyte.forward
132
+ assert self.ngram_sizes[0] == 2
133
+
134
+ def encode_single_ngram_table(self, data: np.ndarray, n: int):
135
+ """
136
+ Return the n-grams of the input data for a given n
137
+ numpy array with ids of shape data.shape
138
+ """
139
+ return get_byte_ngrams_ids(data, n, self.ngram_to_idx_tables[n], pad_value=0)
140
+
141
+ def encode_token_ngrams(self, data: np.ndarray):
142
+ """
143
+ Return the n-grams of the input data.
144
+ output shape: [ids with data.shape for n in self.ngram_sizes]
145
+ """
146
+ return [self.encode_single_ngram_table(data, n) for n in self.ngram_sizes]
bytelatent/data/patcher.py ADDED
@@ -0,0 +1,609 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import math
3
+ import time
4
+ from collections import defaultdict
5
+ from enum import Enum
6
+
7
+ import torch
8
+ from pydantic import BaseModel
9
+ from torch.nn import functional as F
10
+
11
+ from bytelatent.distributed import get_local_rank
12
+ from bytelatent.entropy_model import load_entropy_model
13
+
14
+ from bytelatent.tokenizers.constants import BPE_ID, OFFSET
17
+
18
+
19
+ class PatchingModeEnum(str, Enum):
20
+ entropy = "entropy"
21
+ bpe = "bpe"
22
+ bpe_patcher = "bpe_patcher"
23
+ space = "space"
24
+
25
+
26
+ class PatcherArgs(BaseModel):
27
+ patching_mode: PatchingModeEnum = PatchingModeEnum.entropy
28
+ patching_device: str = "cuda"
29
+ entropy_model_checkpoint_dir: str | None = None
30
+ realtime_patching: bool = False
31
+ threshold: float = 1.335442066192627
32
+ threshold_add: float | None = None
33
+ max_patch_length: int | None = None
34
+ patch_size: float = 4.5
35
+ patching_batch_size: int = 1
36
+ data_loader_patching: bool = False
37
+ device: str = "cuda"
38
+ monotonicity: bool = False
39
+ log_time: bool = False
40
+
41
+ def build(self) -> "Patcher":
42
+ return Patcher(self)
43
+
44
+
45
+ def entropy(scores):
46
+ """
47
+ scores: [bs, seq_len, vocab]
48
+ returns [bs, seq_len]
49
+
50
+ Computes the entropy for each token in the batch.
51
+ Note: uses natural log.
52
+ """
53
+ log_probs = F.log_softmax(scores, dim=-1)
54
+ probs = torch.exp(log_probs)
55
+ p_log_p = log_probs * probs
56
+ entropy = -p_log_p.sum(dim=-1)
57
+ return entropy
58
+
59
+
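A quick sanity check of the entropy helper above (not part of the commit; the import path is assumed): a uniform distribution over V symbols should give ln(V).

    import math
    import torch
    from bytelatent.data.patcher import entropy

    scores = torch.zeros(1, 1, 260)   # equal logits over a 260-symbol byte vocabulary
    print(entropy(scores))            # tensor([[5.5607]])
    print(math.log(260))              # 5.5606...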
60
+ def calculate_entropies(
61
+ tokens: torch.tensor, entropy_model, patching_batch_size, device: str | None = None
62
+ ):
63
+ """
64
+ tokens: 2D tensor of shape [batch_size, seq_len]
65
+ Return 2D tensor of shape [batch_size, seq_len] with entropies for each token.
66
+
67
+ Splits the tokens into chunks of size max_length and calculates entropies for each chunk.
68
+ Entropy model can be executed on cpu or gpu, specify either 'cuda' or 'cpu' in the device argument.
69
+ """
70
+ with torch.no_grad():
71
+ entropies = []
72
+ max_length = getattr(entropy_model, "max_length", 8192)
73
+ batch_numel = max_length * patching_batch_size
74
+ splits = torch.split(tokens.flatten(), batch_numel)
75
+ for split in splits:
76
+ pad_size = (max_length - (split.numel() % max_length)) % max_length
77
+ pad = torch.zeros(
78
+ pad_size, dtype=split.dtype, device=split.device, requires_grad=False
79
+ )
80
+ split = torch.cat((split, pad), dim=0)
81
+ split = split.reshape(-1, max_length)
82
+ if device is not None:
83
+ split = split.to(device)
84
+ assert torch.all(split >= 0) and torch.all(split < 260)
85
+ pred, _ = entropy_model(split)
86
+ pred = pred.reshape(-1, pred.shape[-1])[
87
+ : split.numel() - pad_size, :
88
+ ] # [batch_size * seq_len, vocab]
89
+ pred_entropies = entropy(pred)
90
+ entropies.append(pred_entropies)
91
+
92
+ entropies = torch.cat(entropies, dim=0)
93
+ entropies = entropies.reshape(tokens.shape)
94
+ return entropies
95
+
96
+
97
+ def patch_start_mask_from_entropy_with_monotonicity(entropies, t):
98
+ """
99
+ entropies: [bs, seq_len] torch tensor of entropies
100
+ t: threshold
101
+ returns [bs, seq_len] mask where True indicates the start of a patch
102
+ """
103
+ bs, seq_len = entropies.shape
104
+ mask = torch.zeros_like(entropies, dtype=torch.bool)
105
+ mask[:, 0] = True
106
+
107
+ # Calculate differences between consecutive elements along the sequence length
108
+ differences = entropies[:, 1:] - entropies[:, :-1]
109
+
110
+ # Calculate conditions for all elements except the first one in each sequence
111
+ condition = differences > t
112
+
113
+ # Update the mask based on the condition
114
+ mask[:, 1:] = condition
115
+
116
+ return mask
117
+
118
+
119
+ def patch_start_mask_global_and_monotonicity(entropies, t, t_add=0):
120
+ """
121
+ entropies: [bs, seq_len] torch tensor of entropies
122
+ t: threshold
123
+ returns [bs, seq_len] mask where True indicates the start of a patch
124
+ """
125
+ bs, seq_len = entropies.shape
126
+ mask = torch.zeros_like(entropies, dtype=torch.bool)
127
+ mask[:, 0] = True
128
+
129
+ # Calculate differences between consecutive elements along the sequence length
130
+ differences = entropies[:, 1:] - entropies[:, :-1]
131
+
132
+ # Calculate conditions for all elements except the first one in each sequence
133
+ condition = (differences > t_add) & (entropies[:, 1:] > t) & (~mask[:, :-1])
134
+
135
+ # Update the mask based on the condition
136
+ mask[:, 1:] = condition
137
+
138
+ return mask
139
+
140
+
141
+ def patch_start_ids_from_patch_start_mask(patch_start_mask):
142
+ bs, trunc_seq_len = patch_start_mask.shape
143
+ max_patches = patch_start_mask.sum(dim=1).max()
144
+ if max_patches == 0:
145
+ patch_start_ids = torch.full(
146
+ (bs, trunc_seq_len),
147
+ trunc_seq_len,
148
+ dtype=torch.long,
149
+ device=patch_start_mask.device,
150
+ )
151
+ else:
152
+ patch_ids = (
153
+ torch.arange(trunc_seq_len, device=patch_start_mask.device)
154
+ .unsqueeze(0)
155
+ .repeat(bs, 1)
156
+ )
157
+ extra_patch_ids = torch.full(
158
+ (bs, trunc_seq_len),
159
+ trunc_seq_len,
160
+ dtype=torch.long,
161
+ device=patch_start_mask.device,
162
+ )
163
+ all_patch_ids = torch.cat((patch_ids, extra_patch_ids), dim=1)
164
+ patch_start_mask_padded = torch.cat(
165
+ (patch_start_mask, ~patch_start_mask), dim=1
166
+ )
167
+ patch_start_ids = all_patch_ids[patch_start_mask_padded].reshape(
168
+ bs, trunc_seq_len
169
+ )[:, :max_patches]
170
+ return patch_start_ids
171
+
172
+
173
+ def check_non_zero_after_zero(tensor):
174
+ zero_mask = tensor == 0
175
+ shifted_mask = torch.cat(
176
+ [
177
+ torch.zeros(tensor.shape[0], 1, dtype=torch.bool, device=tensor.device),
178
+ zero_mask[:, :-1],
179
+ ],
180
+ dim=1,
181
+ )
182
+ non_zero_after_zero = (tensor != 0) & shifted_mask
183
+ return non_zero_after_zero.any()
184
+
185
+
186
+ def patch_lengths_from_start_ids(patch_start_ids, seq_len):
187
+ """
188
+ Calculate patch lengths from start ids.
189
+ start ids: ex: [0, 1, 7, 7, 7, 7, 7], it has the start ids of the patches (here 0, 1), and then
190
+ the rest are filled to the seq len.
191
+ seq_len: ex: 7 length of the sequence
192
+
193
+ returns the patch lengths:
194
+ [1, 6] for the above example.
195
+ """
196
+ last_ids = torch.full_like(patch_start_ids[:, :1], seq_len - 1)
197
+ patch_end_ids = torch.cat((patch_start_ids[:, 1:] - 1, last_ids), dim=1)
198
+ patch_lengths = patch_end_ids - patch_start_ids + 1
199
+ assert torch.all(patch_lengths >= 0), f"{patch_lengths}"
200
+ assert not check_non_zero_after_zero(patch_lengths), f"{patch_lengths}"
201
+ return patch_lengths
202
+
203
+
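The docstring example above, spelled out as a runnable sketch (import path assumed from this commit's layout):

    import torch
    from bytelatent.data.patcher import patch_lengths_from_start_ids

    start_ids = torch.tensor([[0, 1, 7, 7, 7, 7, 7]])  # real starts at 0 and 1, the rest padded to seq_len
    print(patch_lengths_from_start_ids(start_ids, seq_len=7))
    # tensor([[1, 6, 0, 0, 0, 0, 0]]) -> the two real patches have lengths 1 and 6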
204
+ def find_space_patch_start_ids(tokens):
205
+ bs, seq_len = tokens.shape
206
+ tokens_no_offset = tokens - OFFSET
207
+ patch_end_mask = (
208
+ (tokens_no_offset < ord("0"))
209
+ | ((ord("9") < tokens_no_offset) & (tokens_no_offset < ord("A")))
210
+ | ((ord("Z") < tokens_no_offset) & (tokens_no_offset < ord("a")))
211
+ | ((ord("z") < tokens_no_offset) & (tokens_no_offset < 0b1000_0000))
212
+ | (0b1100_0000 <= tokens_no_offset)
213
+ )
214
+ patch_end_mask[:, 1:] &= patch_end_mask[:, :-1].bitwise_not()
215
+ patch_end_mask |= tokens < OFFSET
216
+
217
+ patch_start_mask = torch.cat(
218
+ [
219
+ torch.tensor([1, 1], device=tokens.device, dtype=torch.bool)
220
+ .unsqueeze(0)
221
+ .repeat(bs, 1),
222
+ patch_end_mask[:, 1:],
223
+ ],
224
+ dim=1,
225
+ )
226
+ max_patches = patch_start_mask.sum(dim=1).max()
227
+
228
+ patch_ids = (
229
+ torch.arange(seq_len + 1, device=tokens.device).unsqueeze(0).repeat(bs, 1)
230
+ )
231
+ extra_patch_ids = torch.full(
232
+ (bs, seq_len + 1), seq_len + 1, dtype=torch.long, device=tokens.device
233
+ )
234
+ all_patch_ids = torch.cat((patch_ids, extra_patch_ids), dim=1)
235
+ patch_start_mask_padded = torch.cat((patch_start_mask, ~patch_start_mask), dim=1)
236
+
237
+ patch_start_ids = all_patch_ids[patch_start_mask_padded].reshape(bs, -1)[
238
+ :, :max_patches
239
+ ]
240
+ return patch_start_ids
241
+
242
+
243
+ def to_device(entropy_model, device=None):
244
+ if device == "cuda":
245
+ rank = get_local_rank()
246
+ device = f"cuda:{rank}"
247
+ entropy_model = entropy_model.to(device)
248
+ return entropy_model, device
249
+
250
+
251
+ def model_pred_to_bpe_patching_pred(pred):
252
+ _, indices = torch.max(pred, dim=1)
253
+ return indices == BPE_ID
254
+
255
+
256
+ def apply_bpe_patcher(tokens, bpe_patcher, patching_batch_size, device=None):
257
+ assert tokens.device == torch.device(
258
+ "cpu"
259
+ ), f"{tokens.device} != cpu expects tokens to be on cpu"
260
+ with torch.no_grad():
261
+ bpe_patcher_device, device = to_device(
262
+ bpe_patcher, device
263
+ ) # Get entropy model to right rank device.
264
+ bpe_patching_mask = []
265
+ max_length = getattr(bpe_patcher, "max_length", 8192)
266
+ batch_numel = max_length * patching_batch_size
267
+ splits = torch.split(tokens.flatten(), batch_numel)
268
+ for split in splits:
269
+ pad_size = (max_length - (split.numel() % max_length)) % max_length
270
+ pad = torch.zeros(
271
+ pad_size, dtype=split.dtype, device=split.device, requires_grad=False
272
+ )
273
+ split = torch.cat((split, pad), dim=0)
274
+ split = split.reshape(-1, max_length).to(device)
275
+ assert torch.all(split >= 0) and torch.all(split < 260)
276
+ pred = bpe_patcher_device(split)
277
+ pred_cpu = pred[0].cpu()
278
+ pred_cpu = pred_cpu.reshape(-1, pred_cpu.shape[-1])[
279
+ : split.numel() - pad_size, :
280
+ ] # [batch_size * seq_len, vocab]
281
+ bpe_patching_pred = model_pred_to_bpe_patching_pred(pred_cpu)
282
+ bpe_patching_mask.append(bpe_patching_pred)
283
+ bpe_patching_mask = torch.cat(bpe_patching_mask, dim=0)
284
+ bpe_patching_mask = bpe_patching_mask.reshape(tokens.shape)
285
+ return bpe_patching_mask
286
+
287
+
288
+ def find_bpe_patcher_patch_start_ids(
289
+ tokens, bpe_patcher, patching_batch_size, device=None, include_next_token=True
290
+ ):
291
+ bs, seq_len = tokens.shape
292
+
293
+ first_ids = (
294
+ torch.tensor([0, 1], dtype=torch.long, device=tokens.device)
295
+ .unsqueeze(0)
296
+ .repeat(bs, 1)
297
+ )
298
+ preds_truncation_len = first_ids.shape[1]
299
+ token_input = tokens[:, 1:] if include_next_token else tokens[:, 1:-1]
300
+ if token_input.shape[1] >= 1:
301
+ patch_start_mask = apply_bpe_patcher(
302
+ token_input, bpe_patcher, patching_batch_size, device
303
+ )
304
+ assert (
305
+ patch_start_mask.shape[1]
306
+ == tokens.shape[1] + include_next_token - preds_truncation_len
307
+ ), f"{patch_start_mask.shape[1]} != {tokens.shape[1] + include_next_token - preds_truncation_len}"
308
+ patch_start_ids = patch_start_ids_from_patch_start_mask(patch_start_mask)
309
+ patch_start_ids = torch.cat(
310
+ (first_ids, patch_start_ids + preds_truncation_len), dim=1
311
+ )
312
+ else:
313
+ patch_start_ids = first_ids
314
+ return patch_start_ids
315
+
316
+
317
+ def find_entropy_patch_start_ids(
318
+ entropies,
319
+ patch_size=None,
320
+ threshold=None,
321
+ threshold_add=None,
322
+ monotonicity=False,
323
+ include_next_token=True,
324
+ ):
325
+ """
326
+ Use entropies to find the start ids of each patch.
327
+ Use patch_size or threshold to figure out the total number of patches to allocate.
328
+
329
+ When threshold is not None the number of patches is not constant between
330
+ different sequences, but patches can be identified incrementally rather than
331
+ decided globally using the entire sequence.
332
+ """
333
+ bs, seq_len = entropies.shape[:2]
334
+
335
+ first_ids = (
336
+ torch.tensor([0, 1], dtype=torch.long, device=entropies.device)
337
+ .unsqueeze(0)
338
+ .repeat(bs, 1)
339
+ )
340
+ preds_truncation_len = first_ids.shape[
341
+ 1
342
+ ] # remove the first preds because they will be start of patches.
343
+ entropies = entropies[:, 1:]
344
+ if threshold is None:
345
+ num_patches = seq_len // patch_size
346
+ patch_start_ids = entropies.topk(num_patches - 2, dim=1).indices
347
+ patch_start_ids = patch_start_ids.sort(dim=1).values
348
+ else:
349
+ # Assumes that there is at least one token going over the threshold
350
+ if monotonicity:
351
+ patch_start_mask = patch_start_mask_from_entropy_with_monotonicity(
352
+ entropies, threshold
353
+ )
354
+ elif threshold_add is not None and threshold is not None:
355
+ patch_start_mask = patch_start_mask_global_and_monotonicity(
356
+ entropies, threshold, threshold_add
357
+ )
358
+ else:
359
+ patch_start_mask = entropies > threshold
360
+ if not include_next_token:
361
+ patch_start_mask = patch_start_mask[:, :-1]
362
+ # patch_start_mask[1:] |= tokens[:-1] < OFFSET
363
+ patch_start_ids = patch_start_ids_from_patch_start_mask(patch_start_mask)
364
+
365
+ patch_start_ids = torch.cat(
366
+ (first_ids, patch_start_ids + preds_truncation_len), dim=1
367
+ )
368
+ return patch_start_ids
369
+
370
+
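A small sketch of the threshold mode (the entropy values and the threshold are made up; import path assumed):

    import torch
    from bytelatent.data.patcher import (
        find_entropy_patch_start_ids,
        patch_lengths_from_start_ids,
    )

    entropies = torch.tensor([[0.1, 2.0, 0.3, 0.2, 1.9, 0.1]])  # [bs=1, seq_len=6]
    start_ids = find_entropy_patch_start_ids(
        entropies, threshold=1.5, include_next_token=False
    )
    print(start_ids)  # tensor([[0, 1, 2, 5]]): positions 0 and 1 always start patches,
                      # plus a new patch right after each entropy spike above the threshold
    print(patch_lengths_from_start_ids(start_ids, seq_len=6))  # tensor([[1, 1, 3, 1]])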
371
+ def rightpad(seq, pad_id, max_len):
372
+ return seq + [pad_id] * (max_len - len(seq))
373
+
374
+
375
+ def find_bpe_delim_patch_start_ids(tokens, delim):
376
+ ids = (tokens[:, :-1] == delim).nonzero(as_tuple=False)
377
+ out = [[0, 1] for _ in range(tokens.shape[0])]
378
+ for x, y in ids:
379
+ # start is at delim + 1, delim should be the last element in the patch.
380
+ out[x.item()].append(y.item() + 1)
381
+ max_len = max([len(elt) for elt in out])
382
+ out = [rightpad(elt, tokens.shape[1], max_len) for elt in out]
383
+ patch_start_ids = torch.tensor(out, dtype=tokens.dtype, device=tokens.device)
384
+ return patch_start_ids
385
+
386
+
387
+ def find_lookup_table_start_mask(
388
+ tokens: torch.Tensor, lookup_table: torch.Tensor, include_next_token=True
389
+ ):
390
+ window_size = lookup_table.ndim
391
+ # Unfold the tensor to get sliding windows
392
+ unfolded = tokens.unfold(1, window_size, 1)
393
+ # Gather indices for each dimension
394
+ indices = [unfolded[..., i] for i in range(window_size)]
395
+ # Access the lookup table using the gathered indices
396
+ result = lookup_table[indices]
397
+ return result
398
+
399
+
400
+ def find_lookup_table_patch_start_ids(
401
+ tokens: torch.Tensor, lookup_table: torch.Tensor, include_next_token=True
402
+ ):
403
+ bs, seq_len = tokens.shape
404
+
405
+ first_ids = (
406
+ torch.tensor([0, 1], dtype=torch.long, device=tokens.device)
407
+ .unsqueeze(0)
408
+ .repeat(bs, 1)
409
+ )
410
+ preds_truncation_len = first_ids.shape[1]
411
+ window_size = lookup_table.ndim
412
+ assert window_size == 2, f"{window_size} != 2"
413
+ # output dimensions: token_input shape - window_size + 1 --> we want first ids + this = tokens shape + 1 if next token otherwise just token shape
414
+ token_input = (
415
+ tokens if include_next_token else tokens[:, : -preds_truncation_len + 1]
416
+ )
417
+ if token_input.shape[1] >= window_size:
418
+ patch_start_mask = find_lookup_table_start_mask(
419
+ token_input, lookup_table, include_next_token
420
+ )
421
+ assert (
422
+ patch_start_mask.shape[1]
423
+ == tokens.shape[1] + include_next_token - preds_truncation_len
424
+ ), f"{patch_start_mask.shape[1]} != {tokens.shape[1] + include_next_token - preds_truncation_len}"
425
+ patch_start_ids = patch_start_ids_from_patch_start_mask(patch_start_mask)
426
+ patch_start_ids = torch.cat(
427
+ (first_ids, patch_start_ids + preds_truncation_len), dim=1
428
+ )
429
+ else:
430
+ patch_start_ids = first_ids
431
+ return patch_start_ids
432
+
433
+
434
+ def split_large_numbers(lst, m):
435
+ new_lst = []
436
+ for i in lst:
437
+ if i > m:
438
+ while i > m:
439
+ new_lst.append(m)
440
+ i -= m
441
+ new_lst.append(i)
442
+ else:
443
+ new_lst.append(i)
444
+ assert sum(new_lst) == sum(lst), f"{sum(new_lst)} != {sum(lst)}"
445
+ return new_lst
446
+
447
+
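For instance, with max_patch_length = 4 an over-long patch is chopped into chunks while preserving the total (a quick illustration, import path assumed):

    from bytelatent.data.patcher import split_large_numbers

    split_large_numbers([3, 10, 2], 4)  # -> [3, 4, 4, 2, 2]; both sum to 15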
448
+ class Patcher:
449
+ def __init__(self, patcher_args: PatcherArgs):
450
+ self.patcher_args = patcher_args
451
+ self.patching_mode = patcher_args.patching_mode
452
+ self.realtime_patching = patcher_args.realtime_patching
453
+ if self.realtime_patching:
454
+ assert (
455
+ patcher_args.entropy_model_checkpoint_dir is not None
456
+ ), "Cannot require realtime patching without an entropy model checkpoint"
457
+ entropy_model = load_entropy_model(
458
+ patcher_args.entropy_model_checkpoint_dir
459
+ )
460
+ entropy_model, _ = to_device(entropy_model, patcher_args.patching_device)
461
+ self.entropy_model = entropy_model
462
+ else:
463
+ self.entropy_model = None
464
+ self.threshold = patcher_args.threshold
465
+ self.threshold_add = patcher_args.threshold_add
466
+ self.max_patch_length = patcher_args.max_patch_length
467
+ self.patch_size = patcher_args.patch_size
468
+ self.patching_batch_size = patcher_args.patching_batch_size
469
+ self.data_loader_patching = patcher_args.data_loader_patching
470
+ self.device = patcher_args.device
471
+ self.monotonicity = patcher_args.monotonicity
472
+ self.log_time = patcher_args.log_time
473
+ if self.log_time:
474
+ self.log = defaultdict(float)
475
+
476
+ def patch(
477
+ self,
478
+ tokens: torch.Tensor,
479
+ include_next_token: bool = False,
480
+ preds: torch.Tensor | None = None,
481
+ entropies: torch.Tensor | None = None,
482
+ threshold: float = None,
483
+ ) -> torch.Tensor:
484
+ """
485
+ tokens: 2D tensor of shape [batch_size, seq_len] that needs to be patched
486
+ Returns patch lengths and optionally scores associated with the tokens (i.e. entropies, logprobs etc.)
487
+ -> output tensor: [batch_size, max_num_patches]
488
+ each tensor is processed independently and gets right padded with zeros.
489
+
490
+ Patching with the following modes:
491
+ 1. patching_mode = None: static patch size
492
+ 2. patching_mode = "entropy":
493
+ calculate entropy of each token, allocate patches so that the total
494
+ number of patches is the same as static patching but choose to begin
495
+ patches on tokens where the model is most uncertain (highest entropy).
496
+
497
+ When threshold is provided, it uses the threshold to decide when to
498
+ start a new patch.
499
+ 3. patching_mode = "space":
500
+ use space like tokens to define the patches.
501
+ 4. patching_mode = "bpe":
502
+ use bpe delim tokens to define the patches.
503
+
504
+ To correctly patch the last token, it may be necessary to include the next token in the patch
505
+ lengths calculations. This is controlled by the include_next_token argument.
506
+ """
507
+ bs, seq_len = tokens.shape
508
+ seq_len_next_tok = seq_len + 1 if include_next_token else seq_len
509
+ scores = None
510
+ # STATIC
511
+ if self.patching_mode is None:
512
+ patch_lengths = torch.zeros(
513
+ (bs, math.ceil(seq_len_next_tok / self.patch_size)),
514
+ dtype=tokens.dtype,
515
+ device=tokens.device,
516
+ ).fill_(self.patch_size)
517
+ if seq_len_next_tok % self.patch_size != 0:
518
+ patch_lengths[:, -1] = seq_len_next_tok % self.patch_size
519
+ # ENTROPY
520
+ elif self.patching_mode == PatchingModeEnum.entropy:
521
+ if self.log_time:
522
+ s = time.time()
523
+ if entropies is not None:
524
+ scores = torch.tensor(entropies, dtype=torch.float32)
525
+ elif preds is not None:
526
+ scores = entropy(preds)
527
+ else:
528
+ start_entropies = time.time()
529
+ scores = calculate_entropies(
530
+ tokens,
531
+ self.entropy_model,
532
+ self.patching_batch_size,
533
+ self.device,
534
+ )
535
+ if self.log_time:
536
+ self.log["calculate_entropies"] += time.time() - s
537
+ s = time.time()
538
+ patch_start_ids = find_entropy_patch_start_ids(
539
+ scores,
540
+ self.patch_size,
541
+ include_next_token=include_next_token,
542
+ threshold=threshold if threshold is not None else self.threshold,
543
+ threshold_add=self.threshold_add,
544
+ monotonicity=self.monotonicity,
545
+ )
546
+ if self.log_time:
547
+ self.log["find_entropy_patch_start_ids"] += time.time() - s
548
+ s = time.time()
549
+ patch_lengths = patch_lengths_from_start_ids(
550
+ patch_start_ids, seq_len_next_tok
551
+ )
552
+ if self.log_time:
553
+ self.log["patch_lengths_from_start_ids"] += time.time() - s
554
+ s = time.time()
555
+ # BPE
556
+ elif self.patching_mode == PatchingModeEnum.bpe:
557
+ patch_start_ids = find_bpe_delim_patch_start_ids(tokens, delim=BPE_ID)
558
+ patch_lengths = patch_lengths_from_start_ids(
559
+ patch_start_ids, seq_len_next_tok
560
+ )
561
+ elif self.patching_mode == PatchingModeEnum.bpe_patcher:
562
+ patch_start_ids = find_bpe_patcher_patch_start_ids(
563
+ tokens,
564
+ self.entropy_model,
565
+ self.patching_batch_size,
566
+ self.device,
567
+ include_next_token,
568
+ )
569
+ patch_lengths = patch_lengths_from_start_ids(
570
+ patch_start_ids, seq_len_next_tok
571
+ )
572
+ # SPACE
573
+ elif self.patching_mode == PatchingModeEnum.space:
574
+ patch_start_ids = find_space_patch_start_ids(tokens)
575
+ patch_lengths = patch_lengths_from_start_ids(
576
+ patch_start_ids, seq_len_next_tok
577
+ )
578
+ else:
579
+ raise NotImplementedError(f"self.patching_mode {self.patching_mode}")
580
+
581
+ # Apply any processing to patch lengths
582
+ if self.max_patch_length is not None:
583
+ # TODO: avoid going back to a list here.
584
+ patch_lengths = [
585
+ split_large_numbers(pl, self.max_patch_length)
586
+ for pl in patch_lengths.tolist()
587
+ ]
588
+ max_len = max([len(pl) for pl in patch_lengths])
589
+ patch_lengths = [rightpad(pl, 0, max_len=max_len) for pl in patch_lengths]
590
+ patch_lengths = torch.tensor(
591
+ patch_lengths, dtype=tokens.dtype, device=tokens.device
592
+ )
593
+ assert not check_non_zero_after_zero(patch_lengths)
594
+ # Find the last non-zero column index using argmax on a reversed version of the tensor
595
+ last_non_zero_col_reversed = (
596
+ (patch_lengths != 0).flip(dims=[1]).int().argmax(dim=1).min()
597
+ )
598
+ # Slice the tensor up to the last non-zero column
599
+ patch_lengths = patch_lengths[
600
+ :, : patch_lengths.shape[1] - last_non_zero_col_reversed
601
+ ]
602
+ assert (
603
+ torch.sum(patch_lengths)
604
+ == tokens.numel() + include_next_token * tokens.shape[0]
605
+ ), f"{torch.sum(patch_lengths)} != {tokens.numel() + include_next_token * tokens.shape[0]}"
606
+ if self.log_time:
607
+ self.log["postprocessing_patch_lengths"] += time.time() - s
608
+ self.log["tokens"] += patch_lengths.sum().item()
609
+ return patch_lengths, scores
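A minimal end-to-end sketch of the space-patching path, which needs no entropy model (the byte OFFSET of 4 and the import path are assumptions based on this commit):

    import torch
    from bytelatent.data.patcher import PatcherArgs, PatchingModeEnum

    patcher = PatcherArgs(patching_mode=PatchingModeEnum.space).build()
    # Byte values shifted by the assumed tokenizer OFFSET (4), after a leading special token (1).
    text_bytes = [ord(c) + 4 for c in "hi yo"]
    tokens = torch.tensor([[1] + text_bytes])
    patch_lengths, _ = patcher.patch(tokens)
    print(patch_lengths)  # e.g. tensor([[1, 3, 2]]): one patch per space-delimited chunk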
bytelatent/distributed.py ADDED
@@ -0,0 +1,478 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import atexit
4
+ import contextlib
5
+ import logging
6
+ import multiprocessing as mp
7
+ import os
8
+ import random
9
+ import shutil
10
+ import signal
11
+ import socket
12
+ import subprocess
13
+ import sys
14
+ import tempfile
15
+ from dataclasses import asdict, dataclass
16
+ from functools import lru_cache, partial, reduce
17
+ from itertools import chain
18
+ from typing import List, Optional, Tuple, Union
19
+
20
+ import torch
21
+
22
+ # for no recompute ops
23
+ import xformers.ops
24
+ from pydantic import BaseModel, ConfigDict
25
+ from torch import distributed as dist
26
+ from torch.distributed import ReduceOp
27
+ from torch.distributed._composable.fsdp import MixedPrecisionPolicy, fully_shard
28
+ from torch.distributed._tensor import DTensor
29
+ from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
30
+ checkpoint_wrapper,
31
+ )
32
+ from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
33
+ from torch.nn.parallel import DistributedDataParallel as DDP
34
+ from torch.utils.checkpoint import (
35
+ CheckpointPolicy,
36
+ create_selective_checkpoint_contexts,
37
+ )
38
+
39
+ from bytelatent.float8 import convert_linears_to_fp8
40
+
41
+ logger = logging.getLogger()
42
+
43
+ # for selective AC
44
+ default_no_recompute_ops = {
45
+ torch.ops.aten.mm.default,
46
+ torch.ops.aten._scaled_mm.default,
47
+ torch.ops.aten._scaled_dot_product_efficient_attention.default,
48
+ torch.ops.aten._scaled_dot_product_flash_attention.default,
49
+ torch.ops.c10d_functional.reduce_scatter_tensor.default,
50
+ torch.ops.xformers_flash.flash_fwd.default,
51
+ torch.ops.xformers.efficient_attention_forward_cutlass.default,
52
+ }
53
+
54
+
55
+ class DistributedArgs(BaseModel):
56
+ model_config = ConfigDict(extra="forbid")
57
+ dp_shard: int = (
58
+ 1 # Into how many shards to split the model weights. Typically the number of GPUs in a node.
59
+ )
60
+ dp_replicate: int = (
61
+ 1 # How many times to replicate the model weights. Typically the number of nodes.
62
+ )
63
+ tp_size: int = 1
64
+ selective_activation_checkpointing: bool = False
65
+ compile: bool = False
66
+ fsdp_type: str = "no_shard"
67
+ model_dtype: str = "bf16"
68
+ float8_recipe: str | None = None
69
+ float8_filter: str = r"layers\.[0-9]+\."
70
+
71
+ matmul_allow_tf32: bool = False
72
+ allow_bf16_reduced_precision_reduction: bool = True
73
+ detect_anomaly: bool = False
74
+
75
+ compile_cache_size_limit: int = 8
76
+
77
+ spawn_method: str = "forkserver"
78
+
79
+
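As a concrete (hypothetical) example of how these fields compose: for a 2-node job with 8 GPUs per node one could replicate across nodes and shard within each node, since dp_replicate * dp_shard * tp_size must equal the world size (16 here).

    from bytelatent.distributed import DistributedArgs

    args = DistributedArgs(dp_replicate=2, dp_shard=8, tp_size=1, fsdp_type="full_shard")
    # get_device_mesh(args) would then build a ("dp_replicate", "dp_shard") mesh of shape (2, 8).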
80
+ class EnvironmentArgs(BaseModel):
81
+ model_config = ConfigDict(extra="forbid")
82
+ # Use GNU openMP (GOMP) instead of Intel OpenMP [Intel Math Kernel Library (MKL)]
83
+ MKL_SERVICE_FORCE_INTEL: str = "GNU"
84
+ OMP_NUM_THREADS: str = "1"
85
+ MKL_NUM_THREADS: str = "1"
86
+ # faster intra-node collectives, seems to be a cluster specific flag
87
+ ENABLE_INTRA_NODE_COMM: str = "1"
88
+ # avoids OOMs with long context
89
+ TORCH_NCCL_AVOID_RECORD_STREAMS: str = "1"
90
+ # increase the NCCL timeout before raising NCCL errors; a value of 22 should give a 16s timeout
91
+ NCCL_IB_TIMEOUT: str = "22"
92
+ NCCL_DEBUG: str = "INFO"
93
+ TORCH_NCCL_ASYNC_ERROR_HANDLING: str = "1"
94
+
95
+
96
+ def get_device_mesh(distributed_args: DistributedArgs):
97
+ tp_size = distributed_args.tp_size
98
+ dp_replicate = distributed_args.dp_replicate
99
+ dp_shard = distributed_args.dp_shard
100
+
101
+ assert (
102
+ dp_replicate * dp_shard * tp_size == get_world_size()
103
+ ), f"dp_replicate * dp_shard * tp_size ({dp_replicate} * {dp_shard} * {tp_size}) != world_size ({get_world_size()})"
104
+
105
+ dims = []
106
+ names = []
107
+ if dp_replicate >= 1:
108
+ dims.append(dp_replicate)
109
+ names.append("dp_replicate")
110
+ if dp_shard > 1 or distributed_args.fsdp_type == "no_shard":
111
+ dims.append(dp_shard)
112
+ names.append("dp_shard")
113
+ if tp_size > 1:
114
+ dims.append(tp_size)
115
+ names.append("tp")
116
+ dims = tuple(dims)
117
+ names = tuple(names)
118
+
119
+ return init_device_mesh("cuda", mesh_shape=dims, mesh_dim_names=names)
120
+
121
+
122
+ def dist_max(x: Union[int, float], mesh: DeviceMesh = None):
123
+ tensor = torch.tensor(x).cuda()
124
+ dist.all_reduce(tensor, op=ReduceOp.MAX, group=mesh.get_group() if mesh else None)
125
+ return tensor
126
+
127
+
128
+ def dist_mean(x: Union[int, float], mesh: DeviceMesh = None):
129
+ tensor = torch.tensor(x).cuda()
130
+ dist.all_reduce(tensor, op=ReduceOp.AVG, group=mesh.get_group() if mesh else None)
131
+ return tensor
132
+
133
+
134
+ def dist_mean_dict(x):
135
+ r = dict()
136
+ for k in x:
137
+ r[k] = dist_mean(x[k])
138
+ r[k] = r[k].item() if (r[k].dim() == 0) else r[k].tolist()
139
+ return r
140
+
141
+
142
+ @lru_cache()
143
+ def get_is_torch_run() -> bool:
144
+ return os.environ.get("LOCAL_RANK") is not None
145
+
146
+
147
+ @lru_cache()
148
+ def get_is_slurm_job() -> bool:
149
+ return "SLURM_JOB_ID" in os.environ and not get_is_torch_run()
150
+
151
+
152
+ @lru_cache()
153
+ def get_global_rank() -> int:
154
+ if get_is_torch_run():
155
+ return int(os.environ["RANK"])
156
+ elif get_is_slurm_job():
157
+ return int(os.environ["SLURM_PROCID"])
158
+ else:
159
+ return 0
160
+
161
+
162
+ @lru_cache()
163
+ def get_local_rank() -> int:
164
+ if get_is_torch_run():
165
+ return int(os.environ["LOCAL_RANK"])
166
+ elif get_is_slurm_job():
167
+ return int(os.environ["SLURM_LOCALID"])
168
+ else:
169
+ return 0
170
+
171
+
172
+ @lru_cache()
173
+ def get_world_size() -> int:
174
+ if get_is_torch_run():
175
+ return int(os.environ["WORLD_SIZE"])
176
+ elif get_is_slurm_job():
177
+ return int(os.environ["SLURM_NTASKS"])
178
+ else:
179
+ return 1
180
+
181
+
182
+ @lru_cache()
183
+ def get_is_master() -> bool:
184
+ return get_global_rank() == 0
185
+
186
+
187
+ @lru_cache()
188
+ def get_master_port(job_id: int) -> int:
189
+ if get_is_torch_run():
190
+ return int(os.environ["MASTER_PORT"])
191
+ else:
192
+ MIN_MASTER_PORT, MAX_MASTER_PORT = (20000, 60000)
193
+ rng = random.Random(job_id)
194
+ return rng.randint(MIN_MASTER_PORT, MAX_MASTER_PORT)
195
+
196
+
197
+ @lru_cache()
198
+ def get_master_addr() -> str:
199
+ if get_is_torch_run():
200
+ return os.environ["MASTER_ADDR"]
201
+ elif get_is_slurm_job():
202
+ hostnames = subprocess.check_output(
203
+ ["scontrol", "show", "hostnames", os.environ["SLURM_JOB_NODELIST"]]
204
+ )
205
+ return hostnames.split()[0].decode("utf-8")
206
+ else:
207
+ return "127.0.0.1"
208
+
209
+
210
+ def setup_env(env_args: EnvironmentArgs):
211
+ env_vars = env_args.model_dump()
212
+
213
+ # When using Triton, it attempts to locate prebuilt kernels in a cache
214
+ # located at ~/.triton/cache, but when that's backed by NFS this can fail
215
+ # with a "OSError: [Errno 116] Stale file handle" error. If we were to set
216
+ # it to a local directory it would belong to the first user who created it
217
+ # and it would fail for the job of any other successive user assigned to
218
+ # that machine. To avoid all this mess we use a temporary per-process cache.
219
+ triton_cache_dir = tempfile.mkdtemp()
220
+ atexit.register(shutil.rmtree, triton_cache_dir, ignore_errors=True)
221
+ env_vars["TRITON_CACHE_DIR"] = triton_cache_dir
222
+
223
+ # We change the tmp dir to /scratch in case it's a slurm job
224
+ # This avoids filling up the host's usually limited tmpfs
225
+ # A full tmpfs leads to very slow creation of processes and weird bugs
226
+ if get_is_slurm_job():
227
+ new_tmp = f"/scratch/slurm_tmpdir/{os.environ['SLURM_JOB_ID']}"
228
+ if os.path.exists(new_tmp):
229
+ env_vars["TMP_DIR"] = new_tmp
230
+
231
+ for name, value in env_vars.items():
232
+ if os.environ.get(name) != str(value):
233
+ os.environ[name] = str(value)
234
+ logger.warning(f"WARNING: Setting {name} to {value}")
235
+
236
+
237
+ def setup_torch_distributed(dist_args):
238
+ """
239
+ Handle single and multi-GPU / multi-node / SLURM jobs.
240
+ Initialize the following variables:
241
+ - global_rank
242
+ - world_size
243
+ """
244
+ mp.set_start_method(dist_args.spawn_method)
245
+ with mp.Manager():
246
+ pass
247
+
248
+ local_rank = get_local_rank()
249
+
250
+ os.environ["RANK"] = str(get_global_rank())
251
+ os.environ["WORLD_SIZE"] = str(get_world_size())
252
+ os.environ["MASTER_ADDR"] = get_master_addr()
253
+ os.environ["MASTER_PORT"] = str(
254
+ get_master_port(job_id=int(os.environ.get("SLURM_JOB_ID", -1)))
255
+ )
256
+
257
+ if get_is_torch_run():
258
+ logger.info(f"Run launched with torchrun, local rank: {local_rank}")
259
+ elif get_is_slurm_job():
260
+ logger.info(f"Run launched with slurm, local rank: {local_rank}")
261
+ else:
262
+ logger.info("Single GPU job")
263
+
264
+ logger.info(f"ENV: {os.environ}")
265
+
266
+ # set GPU device
267
+ assert 0 <= local_rank < 8
268
+ if dist_args.matmul_allow_tf32:
269
+ torch.backends.cuda.matmul.allow_tf32 = True
270
+ logger.warning(
271
+ f"WARNING: Setting torch.backends.matmul.allow_tf32 to True. This is faster but less accurate."
272
+ )
273
+ torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = (
274
+ dist_args.allow_bf16_reduced_precision_reduction
275
+ )
276
+ if torch.cuda.device_count() > 1:
277
+ torch.cuda.set_device(local_rank)
278
+ torch.distributed.init_process_group(init_method="env://", backend="nccl")
279
+ torch.autograd.set_detect_anomaly(dist_args.detect_anomaly)
280
+
281
+
282
+ def get_module(module, access_string):
283
+ names = access_string.split(sep=".")
284
+ return reduce(getattr, names, module)
285
+
286
+
287
+ def set_module(module, access_string, value):
288
+ names = access_string.split(sep=".")
289
+ parent = reduce(getattr, names[:-1], module)
290
+ setattr(parent, names[-1], value)
291
+
292
+
293
+ def default_fsdp_grouping_plan(n_layers: int) -> List[Tuple[str, bool]]:
294
+ return [(f"layers.{i}", i < n_layers - 1) for i in range(n_layers)]
295
+
296
+
297
+ def get_default_policy(no_recompute_ops=None):
298
+ no_recompute_ops = no_recompute_ops or default_no_recompute_ops
299
+
300
+ def default_policy(ctx, func, *args, **kwargs):
301
+ return (
302
+ CheckpointPolicy.MUST_SAVE
303
+ if func in no_recompute_ops
304
+ else CheckpointPolicy.PREFER_RECOMPUTE
305
+ )
306
+
307
+ return default_policy
308
+
309
+
310
+ @torch.no_grad()
311
+ def check_model_value_range(
312
+ model: torch.nn.Module, range: float = 1e3, std: float = 1e3
313
+ ):
314
+ for name, param in chain(model.named_parameters(), model.named_buffers()):
315
+ if isinstance(param, DTensor):
316
+ param = param.to_local()
317
+
318
+ if param.numel() == 0:
319
+ logger.warning(
320
+ f"Model parameter {name} is empty, probably because of FSDP sharding"
321
+ )
322
+ continue
323
+
324
+ if torch.isnan(param).any() or torch.isinf(param).any():
325
+ logger.warning(f"Model parameter {name} contains NaN or Inf")
326
+
327
+ param_range = param.max() - param.min()
328
+ param_std = param.std()
329
+ if param_range > range:
330
+ logger.warning(
331
+ f"Model parameter {name} has a suspiciously large range ({param_range}): please check initialization and init_weights is defined and called"
332
+ )
333
+ if param_std > std:
334
+ logger.warning(
335
+ f"Model parameter {name} has a suspiciously large standard deviation ({param_std}): please check initialization and init_weights is defined and called"
336
+ )
337
+ if (param == 0).all():
338
+ logger.warning(
339
+ f"Model parameter {name} is all zeros: it might be because of a missing initialization"
340
+ )
341
+
342
+
343
+ def init_signal_handler(callable):
344
+ """
345
+ Handle signals sent by SLURM for time limit / pre-emption.
346
+ """
347
+ signal.signal(signal.SIGUSR2, callable)
348
+ logger.warning("Signal handler installed.")
349
+
350
+
351
+ def requeue_slurm_job():
352
+ prod_id = int(os.environ["SLURM_PROCID"])
353
+ logger.warning("Host: %s - Global rank: %i" % (socket.gethostname(), prod_id))
354
+ if prod_id == 0 and os.environ.get("LAUNCH_WITH", "") != "DORA":
355
+ logger.warning("Requeuing job " + os.environ["SLURM_JOB_ID"])
356
+ os.system("scontrol requeue " + os.environ["SLURM_JOB_ID"])
357
+ else:
358
+ logger.warning("Not the master process, no need to requeue.")
359
+ sys.exit(0)
360
+
361
+
362
+ @contextlib.contextmanager
363
+ def clean_env():
364
+ distrib_names = (
365
+ "MASTER_ADDR",
366
+ "MASTER_PORT",
367
+ "RANK",
368
+ "WORLD_SIZE",
369
+ "LOCAL_RANK",
370
+ "LOCAL_WORLD_SIZE",
371
+ "TORCHELASTIC_RUN_ID",
372
+ "DORA_FORCE_DISTRIB",
373
+ )
374
+ cluster_env = {
375
+ x: os.environ.pop(x)
376
+ for x in os.environ
377
+ if x.startswith(
378
+ ("SLURM_", "SLURMD_", "SRUN_", "SBATCH_", "SUBMITIT_", "WANDB_")
379
+ )
380
+ or x in distrib_names
381
+ }
382
+ try:
383
+ yield
384
+ finally:
385
+ os.environ.update(cluster_env)
386
+
387
+
388
+ def parallelize_model(
389
+ model,
390
+ device_mesh,
391
+ model_args,
392
+ distributed_args: DistributedArgs,
393
+ fsdp_grouping_plan: Optional[List[Tuple[str, bool]]] = None,
394
+ tp_parallelize=None,
395
+ no_recompute_ops=None,
396
+ ):
397
+ if distributed_args.tp_size > 1:
398
+ assert (
399
+ distributed_args.fsdp_type == "full_shard"
400
+ ), "Only full shard is supported for TP parallelism"
401
+ assert tp_parallelize is not None, "TP plan is required for TP parallelism"
402
+ assert (
403
+ distributed_args.compile == False
404
+ ), "Compile is not supported for TP parallelism"
405
+
406
+ tp_parallelize(model, device_mesh["tp"], model_args, distributed_args)
407
+
408
+ if distributed_args.float8_recipe is not None:
409
+ if distributed_args.tp_size > 1:
410
+ raise RuntimeError("float8 is incompatible with tensor-parallelism for now")
411
+ model = convert_linears_to_fp8(
412
+ model, distributed_args.float8_recipe, distributed_args.float8_filter
413
+ )
414
+
415
+ param_dtype = dict(fp32=torch.float32, fp16=torch.float16, bf16=torch.bfloat16)[
416
+ distributed_args.model_dtype
417
+ ]
418
+ if (
419
+ distributed_args.fsdp_type == "full_shard"
420
+ or distributed_args.fsdp_type == "no_shard"
421
+ ):
422
+ if distributed_args.fsdp_type == "no_shard":
423
+ assert (
424
+ distributed_args.dp_shard == 1
425
+ ), "dp_shard must be 1 for no_shard fsdp_type"
426
+ assert (
427
+ device_mesh["dp_shard"].size() == 1
428
+ ), "dp_shard must be 1 for no_shard fsdp_type"
429
+
430
+ fsdp_config = dict(
431
+ mp_policy=(
432
+ MixedPrecisionPolicy(
433
+ param_dtype=param_dtype,
434
+ reduce_dtype=torch.float32,
435
+ )
436
+ ),
437
+ mesh=(
438
+ device_mesh["dp_replicate", "dp_shard"]
439
+ if distributed_args.dp_shard > 1
440
+ or distributed_args.fsdp_type == "no_shard"
441
+ else device_mesh["dp_replicate"]
442
+ ),
443
+ )
444
+
445
+ if fsdp_grouping_plan is None:
446
+ # Assume that the model has list of layers and group around it
447
+ fsdp_grouping_plan = default_fsdp_grouping_plan(len(model.layers))
448
+
449
+ for path, reshard_after_forward in fsdp_grouping_plan:
450
+ module = get_module(model, path)
451
+ set_module(
452
+ model,
453
+ path,
454
+ fully_shard(
455
+ module, **fsdp_config, reshard_after_forward=reshard_after_forward
456
+ ),
457
+ )
458
+
459
+ model = fully_shard(model, **fsdp_config, reshard_after_forward=True)
460
+ else:
461
+ raise ValueError(f"Invalid fsdp_type: {distributed_args.fsdp_type}")
462
+
463
+ if distributed_args.selective_activation_checkpointing:
464
+ model = checkpoint_wrapper(
465
+ model,
466
+ context_fn=partial(
467
+ create_selective_checkpoint_contexts,
468
+ get_default_policy(no_recompute_ops),
469
+ ),
470
+ )
471
+
472
+ if distributed_args.compile:
473
+ torch._dynamo.config.cache_size_limit = (
474
+ distributed_args.compile_cache_size_limit
475
+ )
476
+ model = torch.compile(model)
477
+
478
+ return model
bytelatent/entropy_model.py ADDED
@@ -0,0 +1,36 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ import json
3
+ import os
4
+ import re
5
+
6
+ import torch
7
+
8
+ from bytelatent.transformer import LMTransformer, LMTransformerArgs
9
+
10
+
11
+ def load_entropy_model(entropy_model_checkpoint_dir, state_dict_path, device="cpu"):
12
+ with open(os.path.join(entropy_model_checkpoint_dir, "params.json")) as fr:
13
+ reloaded = json.loads(fr.read())
14
+
15
+ torch.set_default_dtype(torch.bfloat16)
16
+ model_params = reloaded["model"]
17
+ entropy_model = LMTransformer(
18
+ LMTransformerArgs(
19
+ dim=model_params["dim"],
20
+ n_layers=model_params["n_layers"],
21
+ n_heads=model_params["n_heads"],
22
+ max_seqlen=model_params["max_length"],
23
+ ffn_dim_multiplier=model_params["ffn_dim_multiplier"],
24
+ vocab_size=model_params["vocab_size"],
25
+ )
26
+ )
27
+
28
+ entropy_model.load_state_dict(
29
+ torch.load(state_dict_path, map_location=device), strict=False
30
+ )
31
+ entropy_model.to(device)
32
+ entropy_model = entropy_model.eval()
33
+ # no grads for the model:
34
+ for param in entropy_model.parameters():
35
+ param.requires_grad = False
36
+ return entropy_model
bytelatent/float8.py ADDED
@@ -0,0 +1,152 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import re
4
+ import warnings
5
+ from typing import Callable
6
+
7
+ import torch
8
+
9
+ # avoid division by zero when calculating scale
10
+ EPS = 1e-12
11
+
12
+
13
+ def scale(t, amax_t, dtype_t):
14
+ min_v, max_v = torch.finfo(dtype_t).min, torch.finfo(dtype_t).max
15
+ scale_t = torch.clamp(amax_t.float(), min=EPS) / max_v
16
+ t_fp8 = (t / scale_t).clamp(min=min_v, max=max_v).to(dtype_t)
17
+ return t_fp8, scale_t
18
+
19
+
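A quick sketch of the per-row scaling above (import path assumed; requires a PyTorch build with float8 dtypes):

    import torch
    from bytelatent.float8 import scale

    t = torch.tensor([[0.5, -2.0, 1.0]], dtype=torch.bfloat16)
    amax = t.abs().amax(dim=-1, keepdim=True)      # per-row absolute max: 2.0
    t_fp8, s = scale(t, amax, torch.float8_e4m3fn)
    # s = 2.0 / 448 (the e4m3 max); t_fp8 stores t / s clamped into the fp8 range.
    print(t_fp8.dtype, s)  # torch.float8_e4m3fn tensor([[0.0045]])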
20
+ def matmul(
21
+ first, amax_first, dtype_first, second_t, amax_second_t, dtype_second_t, bias
22
+ ):
23
+ first_fp8, scale_first = scale(first, amax_first, dtype_first)
24
+ second_t_fp8, scale_second_t = scale(second_t, amax_second_t, dtype_second_t)
25
+ output = torch._scaled_mm(
26
+ first_fp8,
27
+ second_t_fp8.t(),
28
+ scale_a=scale_first,
29
+ scale_b=scale_second_t.t(),
30
+ bias=bias,
31
+ out_dtype=torch.bfloat16,
32
+ use_fast_accum=True,
33
+ )
34
+ return output
35
+
36
+
37
+ @torch._dynamo.allow_in_graph
38
+ class Fp8LinearFn(torch.autograd.Function):
39
+ @staticmethod
40
+ def forward(ctx, a, b_t, bias):
41
+ amax_a = a.abs().amax(dim=-1, keepdim=True)
42
+ amax_b_t = b_t.abs().amax(dim=-1, keepdim=True)
43
+ out = matmul(
44
+ a, amax_a, torch.float8_e4m3fn, b_t, amax_b_t, torch.float8_e4m3fn, bias
45
+ )
46
+
47
+ ctx.a_requires_grad = a.requires_grad
48
+ ctx.b_requires_grad = b_t.requires_grad
49
+ ctx.bias_requires_grad = bias.requires_grad if bias is not None else False
50
+
51
+ ctx.save_for_backward(a, b_t, amax_b_t.max())
52
+
53
+ return out
54
+
55
+ @staticmethod
56
+ def backward(ctx, grad_out):
57
+ a, b_t, amax_b = ctx.saved_tensors
58
+
59
+ if ctx.a_requires_grad:
60
+ b = b_t.t().contiguous()
61
+ amax_grad_out = grad_out.abs().amax(dim=-1, keepdim=True)
62
+ amax_b = amax_b.repeat(b.shape[0], 1)
63
+ grad_a = matmul(
64
+ grad_out,
65
+ amax_grad_out,
66
+ torch.float8_e4m3fn,
67
+ b,
68
+ amax_b,
69
+ torch.float8_e4m3fn,
70
+ None,
71
+ )
72
+ else:
73
+ grad_a = None
74
+ if ctx.b_requires_grad:
75
+ grad_b = grad_out.t() @ a
76
+ else:
77
+ grad_b = None
78
+ if ctx.bias_requires_grad:
79
+ grad_bias = grad_out.sum(dim=0)
80
+ else:
81
+ grad_bias = None
82
+
83
+ return grad_a, grad_b, grad_bias
84
+
85
+
86
+ class Fp8Linear(torch.nn.Linear):
87
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
88
+ out = Fp8LinearFn.apply(input.flatten(end_dim=-2), self.weight, self.bias)
89
+ out = out.unflatten(0, input.shape[:-1])
90
+ return out
91
+
92
+
93
+ def named_replace(
94
+ fn: Callable[[torch.nn.Module, str], torch.nn.Module],
95
+ module: torch.nn.Module,
96
+ name="",
97
+ ) -> torch.nn.Module:
98
+ for child_name, child_module in list(module.named_children()):
99
+ full_name = f"{name}.{child_name}" if name else child_name
100
+ new_child_module = named_replace(fn, child_module, full_name)
101
+ setattr(module, child_name, new_child_module)
102
+ module = fn(module, name)
103
+ return module
104
+
105
+
106
+ def convert_linears_to_fp8(
107
+ root_module: torch.nn.Module, recipe: str, filter: str
108
+ ) -> torch.nn.Module:
109
+ if recipe not in ["rowwise"]:
110
+ raise RuntimeError(f"Unknown float8 recipe {recipe!r}")
111
+
112
+ if recipe == "rowwise" and torch.__version__ < "2.5":
113
+ # We need https://github.com/pytorch/pytorch/pull/134781.
114
+ warnings.warn("Float8 row-wise scaling is slow in PyTorch prior to v2.5.0")
115
+
116
+ # Multi-kernel makes Inductor auto-tune between a regular "streaming"-based
117
+ # reduction kernel and a "persistent" reduction kernel. Since fp8 has some
118
+ # multi-pass steps (e.g., first get amax, then scale), persistent kernels
119
+ # should perform better.
120
+ torch._inductor.config.triton.multi_kernel = 1
121
+
122
+ filter_re = re.compile(filter)
123
+
124
+ def replace(module: torch.nn.Module, name: str) -> torch.nn.Module:
125
+ if not isinstance(module, torch.nn.Linear) or not filter_re.search(name):
126
+ return module
127
+ if type(module) == torch.nn.Linear:
128
+ if recipe == "rowwise":
129
+ new_module = Fp8Linear(
130
+ in_features=module.in_features,
131
+ out_features=module.out_features,
132
+ bias=module.bias is not None,
133
+ dtype=module.weight.dtype,
134
+ device=module.weight.device,
135
+ )
136
+ new_module.weight = module.weight
137
+ new_module.bias = module.bias
138
+ else:
139
+ assert False, recipe
140
+ else:
141
+ assert False, str(type(module))
142
+ return new_module
143
+
144
+ out = named_replace(replace, root_module)
145
+
146
+ # Force re-compile everything
147
+ torch._dynamo.reset_code_caches()
148
+ from torch._inductor.cudagraph_trees import reset_cudagraph_trees
149
+
150
+ reset_cudagraph_trees()
151
+
152
+ return out
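A small usage sketch (assumptions: the module imports as bytelatent.float8, a recent PyTorch is installed, and a permissive filter is used instead of the default layer regex):

    import torch
    from bytelatent.float8 import convert_linears_to_fp8

    model = torch.nn.Sequential(
        torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.Linear(64, 64))
    )
    # Children are visited as "0.0", "0.1", ... so match everything for this toy example.
    model = convert_linears_to_fp8(model, recipe="rowwise", filter=r".*")
    print(type(model[0][0]).__name__)  # Fp8Linear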
bytelatent/logger.py ADDED
@@ -0,0 +1,129 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import logging
4
+ import math
5
+ import sys
6
+ import time
7
+ from datetime import timedelta
8
+
9
+ from bytelatent.distributed import get_global_rank, get_is_slurm_job
10
+
11
+
12
+ class LogFormatter(logging.Formatter):
13
+ """
14
+ Custom logger for distributed jobs, displaying rank
15
+ and preserving indent from the custom prefix format.
16
+ """
17
+
18
+ def __init__(self):
19
+ self.start_time = time.time()
20
+ self.rank = get_global_rank()
21
+ self.show_rank = not get_is_slurm_job() # srun has --label
22
+
23
+ def formatTime(self, record):
24
+ subsecond, seconds = math.modf(record.created)
25
+ curr_date = (
26
+ time.strftime("%y-%m-%d %H:%M:%S", time.localtime(seconds))
27
+ + f".{int(subsecond * 1_000_000):06d}"
28
+ )
29
+ delta = timedelta(seconds=round(record.created - self.start_time))
30
+ return f"{curr_date} - {delta}"
31
+
32
+ def formatPrefix(self, record):
33
+ fmt_time = self.formatTime(record)
34
+ if self.show_rank:
35
+ return f"{self.rank}: {record.levelname:<7} {fmt_time} - "
36
+ else:
37
+ return f"{record.levelname:<7} {fmt_time} - "
38
+
39
+ def formatMessage(self, record, indent: str):
40
+ content = record.getMessage()
41
+ content = content.replace("\n", "\n" + indent)
42
+ # Exception handling as in the default formatter, albeit with indenting
43
+ # according to our custom prefix
44
+ if record.exc_info:
45
+ # Cache the traceback text to avoid converting it multiple times
46
+ # (it's constant anyway)
47
+ if not record.exc_text:
48
+ record.exc_text = self.formatException(record.exc_info)
49
+ if record.exc_text:
50
+ if content[-1:] != "\n":
51
+ content = content + "\n" + indent
52
+ content = content + indent.join(
53
+ [l + "\n" for l in record.exc_text.splitlines()]
54
+ )
55
+ if content[-1:] == "\n":
56
+ content = content[:-1]
57
+ if record.stack_info:
58
+ if content[-1:] != "\n":
59
+ content = content + "\n" + indent
60
+ stack_text = self.formatStack(record.stack_info)
61
+ content = content + indent.join([l + "\n" for l in stack_text.splitlines()])
62
+ if content[-1:] == "\n":
63
+ content = content[:-1]
64
+
65
+ return content
66
+
67
+ def format(self, record):
68
+ prefix = self.formatPrefix(record)
69
+ indent = " " * len(prefix)
70
+ content = self.formatMessage(record, indent)
71
+ return prefix + content
72
+
73
+
74
+ def set_root_log_level(log_level: str):
75
+ logger = logging.getLogger()
76
+ level: int | str = log_level.upper()
77
+ try:
78
+ level = int(log_level)
79
+ except ValueError:
80
+ pass
81
+ try:
82
+ logger.setLevel(level) # type: ignore
83
+ except Exception:
84
+ logger.warning(
85
+ f"Failed to set logging level to {log_level}, using default 'NOTSET'"
86
+ )
87
+ logger.setLevel(logging.NOTSET)
88
+
89
+
90
+ def init_logger(
91
+ log_file: str | None = None,
92
+ *,
93
+ name: str | None = None,
94
+ level: str = "NOTSET",
95
+ ):
96
+ """
97
+ Setup logging.
98
+
99
+ Args:
100
+ log_file: A file name to save file logs to.
101
+ name: The name of the logger to configure, by default the root logger.
102
+ level: The logging level to use.
103
+ """
104
+ set_root_log_level(level)
105
+ logger = logging.getLogger(name)
106
+
107
+ # stdout: everything
108
+ stdout_handler = logging.StreamHandler(sys.stdout)
109
+ stdout_handler.setLevel(logging.NOTSET)
110
+ stdout_handler.setFormatter(LogFormatter())
111
+
112
+ # stderr: warnings / errors and above
113
+ stderr_handler = logging.StreamHandler(sys.stderr)
114
+ stderr_handler.setLevel(logging.WARNING)
115
+ stderr_handler.setFormatter(LogFormatter())
116
+
117
+ # set stream handlers
118
+ logger.handlers.clear()
119
+ logger.handlers.append(stdout_handler)
120
+ logger.handlers.append(stderr_handler)
121
+
122
+ if log_file is not None and get_global_rank() == 0:
123
+ # build file handler
124
+ file_handler = logging.FileHandler(log_file, "a")
125
+ file_handler.setLevel(logging.NOTSET)
126
+ file_handler.setFormatter(LogFormatter())
127
+ # update logger
128
+ logger = logging.getLogger()
129
+ logger.addHandler(file_handler)
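A minimal usage sketch for the logger setup above, assuming the package is installed so bytelatent.logger is importable and that this is a single-process run (the file name and level are placeholders):

import logging

from bytelatent.logger import init_logger

# Route everything to stdout, warnings and above to stderr, and (on global rank 0)
# append a copy of all records to "train.log" using LogFormatter.
init_logger(log_file="train.log", level="INFO")
logging.getLogger(__name__).info("starting run")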
bytelatent/metrics.py ADDED
@@ -0,0 +1,232 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
3
+
4
+ import json
5
+ import logging
6
+ from collections import namedtuple
7
+ from dataclasses import asdict
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Any, Union
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import wandb
15
+ from pydantic import BaseModel, ConfigDict
16
+
17
+ from bytelatent.distributed import get_is_master
18
+
19
+ logger = logging.getLogger()
20
+
21
+
22
+ class WandbArgs(BaseModel):
23
+ model_config = ConfigDict(extra="forbid")
24
+ job_type: str | None = None
25
+ dir: str | None = None
26
+ project: str | None = None
27
+ entity: str | None = None
28
+ tags: list | None = None
29
+ group: str | None = None
30
+ name: str | None = None
31
+ notes: str | None = None
32
+ config_exclude_keys: list[str] | None = None
33
+ config_include_keys: list[str] | None = None
34
+ anonymous: str | None = None
35
+ mode: str | None = None
36
+ allow_val_change: bool | None = None
37
+ resume: Union[bool, str] | None = None
38
+ force: bool | None = None
39
+ tensorboard: bool | None = None
40
+ sync_tensorboard: bool | None = None
41
+ monitor_gym: bool | None = None
42
+ save_code: bool | None = None
43
+ id: str | None = None
44
+ fork_from: str | None = None
45
+ resume_from: str | None = None
46
+
47
+
48
+ class LoggingArgs(BaseModel):
49
+ model_config = ConfigDict(extra="forbid")
50
+ freq: int = 10 # Log every freq optimizer steps
51
+ acc_freq: int | None = None # Log every acc_freq gradient accumulation steps
52
+
53
+ wandb: WandbArgs | None = None
54
+
55
+
56
+ class MetricLogger:
57
+ def __init__(self, outdir: Path, args: Any | None = None):
58
+ self.outdir = outdir
59
+ self.jsonl_writer = None
60
+ self.args = args
61
+
62
+ def open(self):
63
+ if self.jsonl_writer is None:
64
+ self.jsonl_writer = open(self.outdir, "a")
65
+ if (
66
+ self.args is not None
67
+ and self.args.logging.wandb is not None
68
+ and get_is_master()
69
+ ):
70
+ run = wandb.init(
71
+ config=asdict(self.args),
72
+ **asdict(self.args.logging.wandb),
73
+ )
74
+
75
+ def log(self, metrics: dict[str, Any]):
76
+ if (
77
+ self.args is not None
78
+ and self.args.logging.wandb is not None
79
+ and (wandb.run is not None)
80
+ ):
81
+ wandb.log(metrics, step=metrics["global_step"])
82
+
83
+ metrics.update({"created_at": datetime.now(timezone.utc).isoformat()})
84
+ print(json.dumps(metrics), file=self.jsonl_writer, flush=True)
85
+
86
+ def close(self):
87
+ if self.jsonl_writer is not None:
88
+ self.jsonl_writer.close()
89
+ self.jsonl_writer = None
90
+
91
+ def __enter__(self):
92
+ self.open()
93
+ return self
94
+
95
+ def __exit__(self, exc_type, exc_value, traceback):
96
+ self.close()
97
+
98
+ def __del__(self):
99
+ self.close()
100
+
101
+
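A minimal usage sketch for MetricLogger, assuming the package is installed as bytelatent; no wandb section is configured here, so only the JSON-lines file is written (path and values are illustrative):

from pathlib import Path

from bytelatent.metrics import MetricLogger

# Used as a context manager, the logger appends one JSON object per log() call.
with MetricLogger(Path("metrics.jsonl")) as metric_logger:
    metric_logger.log({"global_step": 10, "loss": 2.31, "lr": 3e-4})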
102
+ GPUMemStats = namedtuple(
103
+ "GPUMemStats",
104
+ [
105
+ "max_active_gib",
106
+ "max_active_pct",
107
+ "max_reserved_gib",
108
+ "max_reserved_pct",
109
+ "num_alloc_retries",
110
+ "num_ooms",
111
+ "power_draw",
112
+ ],
113
+ )
114
+
115
+
116
+ class GPUMemoryMonitor:
117
+ """
118
+ Class to monitor GPU memory usage
119
+ """
120
+
121
+ def __init__(self, device: str = "cuda:0"):
122
+ self.device = torch.device(device) # device object
123
+ self.device_name = torch.cuda.get_device_name(self.device)
124
+ self.device_index = torch.cuda.current_device()
125
+ self.device_capacity = torch.cuda.get_device_properties(
126
+ self.device
127
+ ).total_memory
128
+ self.device_capacity_gib = self._to_gib(self.device_capacity)
129
+
130
+ # reset stats, clear cache
131
+ torch.cuda.reset_peak_memory_stats()
132
+ torch.cuda.empty_cache()
133
+
134
+ def _to_gib(self, memory_in_bytes):
135
+ # NOTE: GiB (gibibyte) is 1024, vs GB is 1000
136
+ _gib_in_bytes = 1024 * 1024 * 1024
137
+ memory_in_gib = memory_in_bytes / _gib_in_bytes
138
+ return memory_in_gib
139
+
140
+ def _to_pct(self, memory):
141
+ return 100 * memory / self.device_capacity
142
+
143
+ def get_peak_stats(self):
144
+ cuda_info = torch.cuda.memory_stats(self.device)
145
+
146
+ max_active = cuda_info["active_bytes.all.peak"]
147
+ max_active_gib = self._to_gib(max_active)
148
+ max_active_pct = self._to_pct(max_active)
149
+
150
+ max_reserved = cuda_info["reserved_bytes.all.peak"]
151
+ max_reserved_gib = self._to_gib(max_reserved)
152
+ max_reserved_pct = self._to_pct(max_reserved)
153
+
154
+ num_retries = cuda_info["num_alloc_retries"]
155
+ num_ooms = cuda_info["num_ooms"]
156
+ power_draw = torch.cuda.power_draw()
157
+
158
+ if num_retries > 0:
159
+ logger.warning(f"{num_retries} CUDA memory allocation retries.")
160
+ if num_ooms > 0:
161
+ logger.warning(f"{num_ooms} CUDA OOM errors thrown.")
162
+
163
+ return GPUMemStats(
164
+ max_active_gib,
165
+ max_active_pct,
166
+ max_reserved_gib,
167
+ max_reserved_pct,
168
+ num_retries,
169
+ num_ooms,
170
+ power_draw,
171
+ )
172
+
173
+ def reset_peak_stats(self):
174
+ torch.cuda.reset_peak_memory_stats()
175
+ torch.cuda.reset_accumulated_memory_stats()
176
+
177
+ def __str__(self):
178
+ mem_stats = self.get_peak_stats()
179
+ display_str = f"{self.device_name} ({self.device_index}): {self.device_capacity_gib:.2f} GiB capacity, "
180
+ display_str += (
181
+ f"{mem_stats.max_reserved_gib:.2f} GiB peak, {mem_stats.max_reserved_pct:.2f}% peak"
182
+ )
183
+ return f"{display_str}"
184
+
185
+
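A short sketch of how the monitor is meant to wrap a training step (requires a CUDA device; the print format is illustrative):

from bytelatent.metrics import GPUMemoryMonitor

monitor = GPUMemoryMonitor("cuda:0")
# ... run a forward/backward pass here ...
stats = monitor.get_peak_stats()
print(f"peak reserved: {stats.max_reserved_gib:.2f} GiB ({stats.max_reserved_pct:.1f}%)")
monitor.reset_peak_stats()  # start a fresh measurement window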
186
+ def upload_train_to_wandb(
187
+ ckpt_dir, project="lingua", entity="codegen-team", train=True, eval=True
188
+ ):
189
+ import json
190
+ from pathlib import Path
191
+
192
+ import wandb
193
+ from omegaconf import OmegaConf
194
+
195
+ cfg = OmegaConf.load(Path(ckpt_dir) / "config.yaml")
196
+ cfg = OmegaConf.to_container(cfg)
197
+
198
+ if train:
199
+ wandb.init(config=cfg, name=cfg["name"], project=project, entity=entity)
200
+
201
+ with open(Path(ckpt_dir) / "metrics.jsonl") as f:
202
+ for l in f:
203
+ m = json.loads(l)
204
+ wandb.log(m, step=m["global_step"])
205
+
206
+ wandb.finish()
207
+
208
+ if eval:
209
+ wandb.init(config=cfg, name=cfg["name"], project=project, entity=entity)
210
+
211
+ with open(Path(ckpt_dir) / "metrics.eval.jsonl") as f:
212
+ for l in f:
213
+ m = json.loads(l)
214
+ wandb.log(
215
+ {
216
+ f"evals/{name.replace('/','.')}": value
217
+ for name, value in m.items()
218
+ if "/" in name
219
+ },
220
+ step=m["global_step"],
221
+ )
222
+
223
+ wandb.finish()
224
+
225
+
226
+ def get_num_params(model: nn.Module) -> int:
227
+ """
228
+ Get the total number of model parameters.
229
+ Args: model: the module whose parameters are counted (trainable and frozen alike).
230
+ """
231
+ numel = {n: p.numel() for n, p in model.named_parameters()}
232
+ return sum(numel.values())
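A quick sanity check for get_num_params on a toy module (the expected count is just the layer's weights plus biases):

import torch.nn as nn

from bytelatent.metrics import get_num_params

layer = nn.Linear(512, 256)
print(get_num_params(layer))  # 512 * 256 + 256 = 131328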
bytelatent/model/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
bytelatent/model/blt.py ADDED
@@ -0,0 +1,1064 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from enum import Enum, auto
4
+ from typing import Any, Optional
5
+
6
+ import torch
7
+ from pydantic import ConfigDict, model_validator
8
+ from torch import nn
9
+ from torch.nn.attention.flex_attention import create_block_mask
10
+ from typing_extensions import Self
11
+
12
+ from bytelatent.base_transformer import (
13
+ BaseTransformerArgs,
14
+ InitStdFactor,
15
+ TransformerBlock,
16
+ )
17
+ from bytelatent.data.patcher import Patcher, PatcherArgs
18
+ from bytelatent.model.local_models import LocalDecoder, LocalEncoder
19
+ from bytelatent.model.transformer import GlobalTransformer
20
+ from bytelatent.model.utils import downsample
21
+ from bytelatent.tokenizers.constants import BOE_ID, BOS_ID, EOS_ID, OFFSET, PAD_ID
22
+
23
+
24
+ def attention_flops_per_token(n_layers, seq_len, dim, causal):
25
+ # Formula from https://github.com/Dao-AILab/flash-attention/blob/main/benchmarks/benchmark_flash_attention.py#L27-L30
26
+ return 3.5 * (4 * n_layers * seq_len * dim // (2 if causal else 1))
27
+
28
+
29
+ def get_num_flop_per_token(
30
+ num_non_embed_params: int, n_layers: int, dim: int, seq_len: int
31
+ ) -> int:
32
+ return 6 * num_non_embed_params + attention_flops_per_token(
33
+ n_layers, seq_len, dim, True
34
+ )
35
+
36
+
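A small sketch of the FLOP estimate above; the parameter count and dimensions are made up for illustration:

from bytelatent.model.blt import get_num_flop_per_token

# 6 * params for the dense matmuls plus the causal attention term.
flops_per_token = get_num_flop_per_token(
    num_non_embed_params=100_000_000, n_layers=12, dim=768, seq_len=2048
)
print(f"{flops_per_token:.3e} FLOPs per token")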
37
+ def causal_mask(b, h, q_idx, kv_idx):
38
+ return q_idx >= kv_idx
39
+
40
+
41
+ def setattrs(_self, **kwargs):
42
+ for k, v in kwargs.items():
43
+ setattr(_self, k, v)
44
+
45
+
46
+ def get_encoder_dim_token_emb(args):
47
+ if args.dim_token is not None:
48
+ dim_token_emb = args.dim_token
49
+ elif args.use_local_encoder_transformer:
50
+ dim_token_emb = args.dim_local_encoder
51
+ else:
52
+ dim_token_emb = args.dim_global // args.patch_size
53
+ return dim_token_emb
54
+
55
+
56
+ def get_encoder_dim_patch_emb(args):
57
+ dim_patch_emb = None
58
+ if args.cross_attn_encoder:
59
+ if args.cross_attn_init_by_pooling:
60
+ dim_patch_emb = args.dim_local_encoder
61
+ else:
62
+ dim_patch_emb = args.dim_global
63
+ return dim_patch_emb
64
+
65
+
66
+ def get_global_dim_patch_emb(args):
67
+ dim_token_emb = get_encoder_dim_token_emb(args)
68
+ if args.cross_attn_encoder:
69
+ dim_patch_emb = dim_token_emb * args.cross_attn_k
70
+ elif (
71
+ args.downsampling_by_pooling is None
72
+ or not args.downsampling_by_pooling
73
+ or len(args.downsampling_by_pooling) == 0
74
+ ):
75
+ dim_patch_emb = dim_token_emb * args.patch_size
76
+ else:
77
+ dim_patch_emb = dim_token_emb * sum(
78
+ [
79
+ pooling in args.downsampling_by_pooling
80
+ for pooling in ["avg", "min", "max"]
81
+ ]
82
+ )
83
+ return dim_patch_emb
84
+
85
+
86
+ def get_decoder_dim_token_emb(args):
87
+ if args.share_encoder_decoder_emb:
88
+ dim_token_emb = get_encoder_dim_token_emb(args)
89
+ elif args.dim_token is not None:
90
+ dim_token_emb = args.dim_token
91
+ else:
92
+ dim_token_emb = args.dim_local_decoder
93
+ return dim_token_emb
94
+
95
+
96
+ def parse_ngram_to_size(ngram_to_size_str: str | None) -> dict[int, int] | None:
97
+ if ngram_to_size_str is None:
98
+ return None
99
+ ngram_to_size = {}
100
+ for entry in ngram_to_size_str.split(","):
101
+ ngram, size = entry.split(":")
102
+ ngram = int(ngram)
103
+ size = int(size)
104
+ ngram_to_size[ngram] = size
105
+ return ngram_to_size
106
+
107
+
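For reference, the expected string format is comma-separated "ngram:size" pairs:

from bytelatent.model.blt import parse_ngram_to_size

print(parse_ngram_to_size("2:1000,3:2000"))  # {2: 1000, 3: 2000}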
108
+ def fill_tokens(tokens, patch_size, fill_id):
109
+ batch_size, seq_len = tokens.shape
110
+ if seq_len % patch_size == 0:
111
+ return tokens
112
+ else:
113
+ remaining = patch_size - seq_len % patch_size
114
+ final_padding = tokens.new(batch_size, remaining).fill_(fill_id)
115
+ return torch.cat((tokens, final_padding), dim=1)
116
+
117
+
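A tiny example of the padding behaviour (token ids and fill_id are arbitrary):

import torch

from bytelatent.model.blt import fill_tokens

tokens = torch.tensor([[3, 4, 5, 6, 7]])
# A length-5 sequence is padded up to the next multiple of patch_size=4.
print(fill_tokens(tokens, patch_size=4, fill_id=0))
# tensor([[3, 4, 5, 6, 7, 0, 0, 0]])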
118
+ def decoder_patch_ids_from_lengths(patch_lengths, nb_boe, seq_len):
119
+ first_patch_length = patch_lengths[0, 0]
120
+ assert torch.all(
121
+ first_patch_length == patch_lengths[:, 0]
122
+ ), "first patch should always be the same size (1 for dynamic, patch_size for static)."
123
+ assert (
124
+ first_patch_length - nb_boe == 1
125
+ ), f"First patch (patch length: {first_patch_length}) should have one non-boe token (boe toks: {nb_boe})"
126
+ # Remove first patch from patch_ids for local decoder inputs and shift the last patch.
127
+ # decoder_patch_lengths = patch_lengths[:, 1:].clone()
128
+ # decoder_patch_lengths = add_to_last_nonzero_patch(decoder_patch_lengths, 1)
129
+ decoder_patch_lengths = patch_lengths[:, 1:]
130
+ assert (
131
+ decoder_patch_lengths.sum() + (nb_boe + 1) * patch_lengths.shape[0]
132
+ == patch_lengths.sum()
133
+ ), f"{decoder_patch_lengths.sum() + (nb_boe + 1) * patch_lengths.shape[0]} != {patch_lengths.sum()}"
134
+ assert torch.all(decoder_patch_lengths >= 0), f"{decoder_patch_lengths}"
135
+ decoder_patch_ids = patch_ids_from_lengths(
136
+ patch_lengths=decoder_patch_lengths, seq_len=seq_len
137
+ )
138
+ return decoder_patch_ids
139
+
140
+
141
+ primes = [
142
+ 1000000007,
143
+ 5915587277,
144
+ 1500450271,
145
+ 3267000013,
146
+ 5754853343,
147
+ 4093082899,
148
+ 9576890767,
149
+ 3628273133,
150
+ 2860486313,
151
+ 5463458053,
152
+ 3367900313,
153
+ ]
154
+
155
+
156
+ def rolling_polynomial_hash(t, hash_func_nb: int = 0):
157
+ prime = torch.tensor(primes[hash_func_nb], dtype=torch.int64, device=t.device)
158
+ prime_powers = torch.stack([prime**i for i in range(t.shape[-1])])
159
+ return torch.sum(t * prime_powers, dim=-1)
160
+
161
+
162
+ def get_rolling_polynomial_hash_fn(hash_func_nb: int = 0, group_size: int = 2):
163
+ prime = torch.tensor(primes[hash_func_nb], dtype=torch.int64)
164
+ prime_powers = torch.stack([prime**i for i in range(group_size)])
165
+
166
+ def rolling_polynomial_hash_fn(t):
167
+ return torch.sum(t * prime_powers, dim=-1)
168
+
169
+ return rolling_polynomial_hash_fn
170
+
171
+
172
+ def byte_group_hash_function(
173
+ x: torch.Tensor, group_size: int = 2, hash_func_nb: int = 0, max_hash: int = 30000
174
+ ):
175
+ """
176
+ Returns a hash of the input x and maps it to a value in the range [0, max_hash).
177
+
178
+ expects: x of shape (batch_size, seq_len) with values as ids in the token vocab.
179
+ returns a tensor of shape (batch_size, seq_len) with values in the range [0, max_hash).
180
+
181
+ Note: max hash can make a big difference on the number of collisions.
182
+ """
183
+ with torch.no_grad():
184
+ bs, seq_len = x.shape
185
+ # x_numpy = x.numpy()
186
+ # hash_values = torch.zeros(bs, seq_len, dtype=torch.int64, requires_grad=False)
187
+ # for i in range(bs):
188
+ # for j in range(seq_len):
189
+ # start = max(j, j-group_size+1)
190
+ # end = j+1
191
+ # hash_values[i, j] = hash_array(x_numpy[i, start:end], max_hash)
192
+
193
+ prefix = torch.zeros(bs, group_size - 1, dtype=torch.int64, device=x.device)
194
+ x = torch.cat([prefix, x], dim=1)
195
+ windows = x.unfold(1, group_size, 1)
196
+ # hashes = get_rolling_polynomial_hash_fn(hash_func_nb, group_size)(windows)
197
+ hashes = rolling_polynomial_hash(windows, hash_func_nb)
198
+ hash_values_range = hashes % max_hash
199
+ hash_values_range.requires_grad = False
200
+ return hash_values_range
201
+
202
+
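A small sketch of the hashing helpers on a toy batch (token ids are arbitrary):

import torch

from bytelatent.model.blt import byte_group_hash_function

x = torch.tensor([[3, 4, 5, 6]])
# Each position is hashed together with the previous group_size - 1 bytes
# (left-padded with zeros), then bucketed into [0, max_hash).
bucket_ids = byte_group_hash_function(x, group_size=2, hash_func_nb=0, max_hash=30000)
print(bucket_ids.shape)  # torch.Size([1, 4])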
203
+ def create_patch_mask_from_ids(
204
+ patch_ids, num_patches, window=None, patches_as_queries=False
205
+ ):
206
+ """
207
+ Creates a tensor of shape [bs, seq_len, num_patches] where each element at position (i, j, k)
208
+ is True when the patch id at position (i, j) equals the patch index k (or falls within the optional window starting at k).
209
+ Args:
210
+ patch_ids (torch.Tensor): Tensor of shape [bs, seq_len] containing patch ids.
211
+ num_patches (int): Total number of patches.
212
+ window (int): If not None, only considers patches within a window of size window.
213
+ patches_as_queries (bool): If True, the patches are used as queries
214
+ Returns:
215
+ torch.Tensor: Tensor of shape [bs, q_len, kv_len] with the desired mask.
216
+ """
217
+ bs, seq_len = patch_ids.shape
218
+ if not patches_as_queries:
219
+ q_ids = patch_ids.unsqueeze(-1).expand(bs, seq_len, num_patches)
220
+ kv_ids = (
221
+ torch.arange(num_patches, device=patch_ids.device)
222
+ .unsqueeze(0)
223
+ .unsqueeze(0)
224
+ .expand(bs, seq_len, num_patches)
225
+ )
226
+ else:
227
+ kv_ids = patch_ids.unsqueeze(1).expand(bs, num_patches, seq_len)
228
+ q_ids = (
229
+ torch.arange(num_patches, device=patch_ids.device)
230
+ .unsqueeze(0)
231
+ .unsqueeze(-1)
232
+ .expand(bs, num_patches, seq_len)
233
+ )
234
+ if window is None:
235
+ mask = q_ids == kv_ids
236
+ else:
237
+ mask = (kv_ids <= q_ids) & (q_ids < kv_ids + window)
238
+ return mask
239
+
240
+
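A minimal example of the mask construction: tokens 0-2 belong to patch 0 and tokens 3-4 to patch 1.

import torch

from bytelatent.model.blt import create_patch_mask_from_ids

patch_ids = torch.tensor([[0, 0, 0, 1, 1]])
mask = create_patch_mask_from_ids(patch_ids, num_patches=2)
print(mask.shape)     # torch.Size([1, 5, 2])
print(mask[0, :, 0])  # tensor([True, True, True, False, False])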
241
+ def cross_attn_mask(
242
+ patch_ids,
243
+ patch_lengths,
244
+ N,
245
+ patches_as_queries=False,
246
+ cross_attn_k=1,
247
+ window=None,
248
+ block_mask=True,
249
+ ):
250
+ bs = patch_ids.shape[0]
251
+ with torch.no_grad():
252
+ # Create the patch mask
253
+ cross_mask = create_patch_mask_from_ids(
254
+ patch_ids,
255
+ patch_lengths.shape[1],
256
+ window=window,
257
+ patches_as_queries=patches_as_queries,
258
+ ).repeat_interleave(cross_attn_k, dim=1 if patches_as_queries else -1)
259
+ q_len = patch_lengths.shape[1] * cross_attn_k if patches_as_queries else N
260
+ kv_len = N if patches_as_queries else patch_lengths.shape[1] * cross_attn_k
261
+ assert cross_mask.shape == (
262
+ bs,
263
+ q_len,
264
+ kv_len,
265
+ ), f"{cross_mask.shape} != {(bs, q_len, kv_len)}"
266
+ if block_mask:
267
+
268
+ def patch_mask(b, h, q_idx, kv_idx):
269
+ return cross_mask[b, q_idx, kv_idx]
270
+
271
+ block_mask = create_block_mask(
272
+ patch_mask,
273
+ B=bs,
274
+ H=None,
275
+ Q_LEN=q_len,
276
+ KV_LEN=kv_len,
277
+ _compile=True,
278
+ )
279
+ return block_mask
280
+ else:
281
+ return torch.where(
282
+ cross_mask, torch.tensor(0.0), torch.tensor(float("-inf"))
283
+ ).unsqueeze(
284
+ 1
285
+ ) # [bs, 1, q_len, kv_len]
286
+
287
+
288
+ def get_blt_input(
289
+ tokens: torch.Tensor,
290
+ enforce_patch_size_multiple: bool,
291
+ nb_boe: torch.Tensor,
292
+ patch_size: int,
293
+ boe_id: int,
294
+ ):
295
+ """
296
+ This function returns X_et, X_gt and X_dt, the encoder, global, and decoder
297
+ tokens respectively.
298
+
299
+ Consider the input and target sequences:
300
+ X=[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12,13]
301
+ Y=[4,5,6,7,eos,bos,8,9,10,eos,bos,11,12,13,14]
302
+ with patch_size=4
303
+
304
+ Note 1: that there will be no special tokens introduced at the patch level.
305
+ Note 2: X_e needs to be trimmed to be passed to Global
306
+
307
+ Current without boe:
308
+ X_et = [[boe,boe,boe,boe] [3,4,5,6], [7,eos,bos,8], [9,10,eos,bos] [11,12,13, pad]]
309
+ X_g = [[boe,boe,boe,boe] [3,4,5,6], [7,eos,bos,8], [9,10,eos,bos] [11,12,13, pad]] # remove last glob patch
310
+ X_dt = [[3,4,5,6] [7,eos,bos,8], [9,10,eos,bos], [11,12,13]]
311
+ Y = [[4,5,6,7] [eos,bos,8,9], [10,eos,bos,11], [12,13,14]]
312
+
313
+ --> lag fix:
314
+ X_et = [[boe,boe,boe,3] [4,5,6,7], [eos,bos,8,9], [10,eos,bos,11] [12,13,pad,pad]]
315
+ X_g = [[boe,boe,boe,3] [4,5,6,7], [eos,bos,8,9], [10,eos,bos,11]]
316
+ X_dt = [[3,4,5,6] [7,eos,bos,8], [9,10,eos,bos], [11,12,13]]
317
+ Y = [[4,5,6,7] [eos,bos,8,9], [10,eos,bos,11], [12,13,14]]
318
+
319
+ Dynamic (current):
320
+ X = [3,4,5,6,7,eos,bos,8,9,10,eos,bos]
321
+ Y = [4,5,6,7,eos,bos,8,9,10,eos,bos,11]
322
+
323
+ entropy patching:
324
+ input: 7, bos, 9, 10
325
+ pred (high entropy): eos, 8, 10, eos
326
+
327
+ X_et = [[boe,3,4,5,6,7,eos,bos,8,9,10,eos,bos]
328
+ X_g = [[boe], [3,4,5,6], [7,eos],[bos,8],[9], [10,eos]]
329
+ X_dt = [[3,4,5,6], [7,eos], [bos,8],[9], [10,eos],[bos]]
330
+ Y = [4,5,6,7,eos,bos,8,9,10,eos,bos,11]
331
+
332
+ --> lag fix no boe (force single byte first patch):
333
+ X_et = [[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12]
334
+ X_g = [[3], [4,5,6,7], [eos,bos],[8,9], [10], [eos,bos], [11,12]] # remove last global patch
335
+ X_dt = [[3,4,5,6], [7,eos], [bos,8], [9], [10,eos], [bos,11,12]]
336
+ Y = [4,5,6,7, eos,bos, 8,9, 10, eos,bos, 11,12,13]
337
+
338
+ input: 4, 7, bos, 9, 10
339
+ pred (high entropy): 5, eos, 8, 10, eos
340
+
341
+ X_et = [[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12]
342
+ X_g = [[3], [4] , [5,6,7], [eos,bos],[8,9], [10], [eos,bos], [11,12]] # remove last global patch
343
+ X_dt = [[3] [4,5,6], [7,eos], [bos,8], [9], [10,eos], [bos,11,12]]
344
+ Y = [4,] [5,6,7, eos,bos, 8,9, 10, eos,bos, 11,12,13]
345
+
346
+ Handle the last byte properly.
347
+ patch_lengths = [1, 1, 3, 2, 2, 1, 2, 2, 1]
348
+ X_et = [[3,4,5,6,7,eos,bos,8,9,10,eos,bos,11,12]
349
+ X_g = [[3], [4] , [5,6,7], [eos,bos],[8,9], [10], [eos,bos], [11,12]] # do not remove last global patch
350
+ X_dt = [[3] [4,5,6], [7,eos], [bos,8], [9], [10,eos], [bos,11] [12]]
351
+ Y = [4,] [5,6,7, eos,bos, 8,9, 10, eos,bos, 11,12, 13]]
352
+
353
+
354
+ bpe delim
355
+ X_et = [[3,4,5,6,7,<d>,eos,bos,<d>,8,9,<d>,10,<d>,eos,bos,11,12]
356
+ X_g = [[3], [4,5,6,7,<d>], [eos,bos,<d>], ..
357
+ X_dt = [[3,4,5,6,7], [<d>,eos,bos], [<d>,bos,8], ..
358
+ Y = [4,5,6,7,<d>, eos,bos,<d> 8,9,<d>, ..
359
+
360
+
361
+ Note 1: that there will be no special tokens introduced at the patch level.
362
+ Note 2: X_e needs to be trimmed to be passed to Global
363
+ """
364
+ batch_size, seq_len = tokens.shape
365
+ local_encoder_tokens = tokens
366
+ local_decoder_tokens = tokens
367
+
368
+ if nb_boe > 0:
369
+ padded_patch = tokens.new(batch_size, nb_boe).fill_(boe_id)
370
+ local_encoder_tokens = torch.cat((padded_patch, local_encoder_tokens), dim=1)
371
+ # global_tokens = tokens.new(batch_size, ((seq_len-1) // patch_size)+1).fill_(boe_id)
372
+
373
+ # create global tokens, contains boe tokens and eos
374
+ # padded_local_encoder_tokens = fill_tokens(local_encoder_tokens, patch_size, boe_id)
375
+ # patches = padded_local_encoder_tokens.view(batch_size, -1, patch_size)
376
+ # global_tokens = (patches.eq(eos_id).any(dim=2).int() * eos_id)[:, 1:]
377
+ # global_tokens += global_tokens.eq(0).int() * boe_id
378
+ # TODO: fix this when we want to use block causal in the global.
379
+
380
+ if enforce_patch_size_multiple and local_encoder_tokens.shape[-1] % patch_size != 0:
381
+ local_encoder_tokens = fill_tokens(local_encoder_tokens, patch_size, boe_id)
382
+
383
+ return local_encoder_tokens, None, local_decoder_tokens
384
+
385
+
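A short sketch of the token streams produced above, using static patching with nb_boe=3 (the ids and boe_id are arbitrary):

import torch

from bytelatent.model.blt import get_blt_input

tokens = torch.tensor([[3, 4, 5, 6, 7]])
enc, _, dec = get_blt_input(
    tokens, enforce_patch_size_multiple=False, nb_boe=3, patch_size=4, boe_id=0
)
print(enc)  # tensor([[0, 0, 0, 3, 4, 5, 6, 7]]) -- encoder stream is left-padded with boe
print(dec)  # tensor([[3, 4, 5, 6, 7]])          -- decoder stream is unchanged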
386
+ def patch_ids_from_lengths(patch_lengths, seq_len):
387
+ bs, num_patches = patch_lengths.shape
388
+ # Create a tensor of cumulative sums of the patch lengths
389
+ cum_d = torch.cat(
390
+ [
391
+ torch.zeros(bs, 1, dtype=patch_lengths.dtype, device=patch_lengths.device),
392
+ patch_lengths.cumsum(dim=-1),
393
+ ],
394
+ dim=-1,
395
+ )
396
+ patch_ids = (cum_d.unsqueeze(-1) <= torch.arange(seq_len, device=cum_d.device)).sum(
397
+ dim=-2
398
+ ) - 1
399
+ assert not (
400
+ torch.max(patch_ids) > patch_lengths.shape[-1] or torch.min(patch_ids) < 0
401
+ ), f"{torch.max(patch_ids)} > {patch_lengths.shape[-1]} or {torch.min(patch_ids)} < 0"
402
+ return patch_ids
403
+
404
+
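A small example of expanding per-patch lengths into a per-token patch id map:

import torch

from bytelatent.model.blt import patch_ids_from_lengths

patch_lengths = torch.tensor([[1, 4, 2]])
print(patch_ids_from_lengths(patch_lengths, seq_len=7))
# tensor([[0, 1, 1, 1, 1, 2, 2]])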
405
+ class ByteLatentTransformerArgs(BaseTransformerArgs):
406
+ model_config = ConfigDict(extra="forbid")
407
+ # Basic model configuration
408
+ seed: int = 42
409
+ vocab_size: int = -1
410
+ dim: int = 512
411
+ n_layers: int = 8
412
+ n_heads: int = 8
413
+ # TODO: What is the purpose of this parameter?
414
+ weight_tying: bool = False
415
+ sliding_window: Optional[int] = None
416
+
417
+ # Architecture and dimensions
418
+ dim_token: int = 256
419
+ dim_global: int = 512
420
+ dim_local_decoder: int = 512
421
+ dim_local_encoder: int = 512
422
+ n_layers_global: int = 8
423
+ n_layers_local_decoder: int = 8
424
+ n_layers_local_encoder: int = 8
425
+
426
+ # Tokenization and patching
427
+ tokenization_mode: str = "bpe"
428
+ patch_size: float | None = None
429
+ patching_mode: str | None = None
430
+ patching_threshold: float | None = None
431
+ patching_threshold_add: float | None = None
432
+ monotonicity: bool = False
433
+ patching_batch_size: int = 1
434
+ patching_device: str = "cuda"
435
+ data_loader_patching: bool = False
436
+ max_patch_length: int | None = None
437
+
438
+ # Encoder/Decoder configuration
439
+ tie_local_encoder_decoder_logits: bool = False
440
+ use_local_encoder_transformer: bool = False
441
+ encoder_lm_loss: bool = False
442
+ max_encoder_seq_length: int | None = None
443
+ pad_to_max_length: bool = False
444
+ encoder_enable_byte_ngrams: bool = False
445
+ encoder_enable_byte_group_hash: bool = False
446
+ ngram_vocab_sizes: int | None = None
447
+
448
+ # Cross attention configurations
449
+ cross_attn_encoder: bool = False
450
+ cross_attn_decoder: bool = False
451
+ cross_attn_window_encoder: int | None = None
452
+ cross_attn_window_decoder: int | None = None
453
+ cross_attn_k: int | None = None
454
+ cross_attn_nheads: int | None = None
455
+ cross_attn_all_layers_decoder: bool = False
456
+ cross_attn_all_layers_encoder: bool = False
457
+ cross_attn_use_flex_attention: bool = True
458
+ cross_attn_init_by_pooling: bool = False
459
+
460
+ # Encoder hash configurations
461
+ encoder_hash_byte_group_size: Any | None = None
462
+ encoder_hash_byte_group_vocab: int = 30000
463
+ encoder_hash_byte_group_nb_functions: int = 3
464
+
465
+ # Model behavior and optimization
466
+ log_patch_lengths: bool = False
467
+ non_linearity: str = "swiglu"
468
+ use_rope: bool = True
469
+ recompute_fc1_out: bool = False
470
+ recompute_fc3_out: bool = False
471
+ recompute_attn: bool = True
472
+ custom_bwd: bool = False
473
+ layer_ckpt: str = "all"
474
+ efficient_attn: str | None = None
475
+
476
+ # Architecture options
477
+ patch_only_encoder: bool = False
478
+ patch_only_decoder: bool = False
479
+
480
+ # Initialization and attention
481
+ init_use_gaussian: bool = True
482
+ init_use_depth: str = "current"
483
+ attn_bias_type: str = "causal"
484
+ alpha_depth: str = "disabled"
485
+ max_length: int = 2048
486
+
487
+ # Norm configuration
488
+ norm_eps: float = 1e-5
489
+ norm_affine: bool = True
490
+ pre_norm: bool = True
491
+ norm_type: str = "rmsnorm"
492
+
493
+ # Additional configurations
494
+ multiple_of: int = 256
495
+ ffn_dim_multiplier: float = 1.0
496
+ dropout: float = 0
497
+ output_size: int = -1
498
+
499
+ # Additional parameters from ModelArgs
500
+ architecture: str = "vanilla"
501
+ share_encoder_decoder_emb: bool = True
502
+ global_local_decoder_residual_layer: str | None = None
503
+
504
+ tokenize_with_bpe_delimiter: bool = False
505
+ patching_thresholds_str: str | None = None
506
+ tie_local_encoder_decoder: bool = False
507
+ encoder_preds_low_entropy_toks: float | None = None
508
+ encoder_preds_random_toks: float | None = None
509
+ dim_token_emb: int | None = None
510
+ dim_patch_emb: int | None = None
511
+
512
+ encoder_ngram_table_dir: str | None = None
513
+ encoder_ngram_to_size_str: str | None = None
514
+
515
+ # Model architecture params
516
+ entropy_model_checkpoint_dir: str | None = None
517
+ entropy_model_is_ngram_model: bool = False
518
+ downsampling_by_pooling: str | None = None
519
+ n_heads_global: int = 8
520
+ n_heads_local_decoder: int = 8
521
+ n_heads_local_encoder: int = 8
522
+ n_kv_heads: int | None = None
523
+ n_kv_heads_global: int | None = None
524
+ conv_kernel_size: int | None = None
525
+ local_attention_window_len: int | None = None
526
+
527
+ # Performance optimization
528
+ sequence_parallel: bool = False
529
+ loss_parallel: bool = False
530
+ fuse_sequence_parallel: bool = False
531
+ use_fsdp: bool = True
532
+ attn_to_keep: str = "all"
533
+
534
+ # RoPE parameters
535
+ rope_theta: float = 10000.0
536
+ rope_use_fp32_in_outer_product: bool = False
537
+
538
+ # Parameter mixing
539
+ pm_size: int = 0
540
+
541
+ # Logging
542
+ full_logging_n_layers: int = 4
543
+
544
+ # Special token config
545
+ eos_id: int | None = None
546
+
547
+ @model_validator(mode="after")
548
+ def check_hash_byte_sizes(self) -> Self:
549
+ if (
550
+ self.encoder_hash_byte_group_size is not None
551
+ and type(self.encoder_hash_byte_group_size) == str
552
+ ):
553
+ self.encoder_hash_byte_group_size = [
554
+ int(x)
555
+ for x in self.encoder_hash_byte_group_size.split(",")
556
+ if len(x) > 0
557
+ ]
558
+ return self
559
+
560
+
561
+ class LocalEncoderArgs(ByteLatentTransformerArgs):
562
+ # Local encoder specific dimensions
563
+ n_heads_local_encoder: int = 8
564
+ dim_token_emb: int | None = None
565
+ dim_patch_emb: int | None = None
566
+
567
+ def __post_init__(self):
568
+ # Override base args with local encoder specific values
569
+ self.dim = self.dim_local_encoder
570
+ self.n_layers = self.n_layers_local_encoder
571
+ self.n_heads = self.n_heads_local_encoder
572
+ self.cross_attn_decoder = False
573
+ self.cross_attn_k = self.cross_attn_k if self.cross_attn_encoder else None
574
+ self.attn_bias_type = "local_block_causal"
575
+
576
+
577
+ class GlobalTransformerArgs(ByteLatentTransformerArgs):
578
+ # Global encoder specific dimensions
579
+ dim_token_emb: int | None = None
580
+ dim_patch_emb: int | None = None
581
+
582
+ def __post_init__(self):
583
+ # Override base args with global encoder specific values
584
+ self.dim = self.dim_global
585
+ self.n_layers = self.n_layers_global
586
+ self.n_heads = self.n_heads_global
587
+ self.n_kv_heads = self.n_kv_heads_global
588
+ self.local_attention_window_len = None
589
+ self.cross_attn_encoder = False
590
+ self.cross_attn_decoder = False
591
+
592
+
593
+ class LocalDecoderArgs(ByteLatentTransformerArgs):
594
+ # Local decoder specific dimensions
595
+ dim_token_emb: int | None = None
596
+ dim_patch_emb: int | None = None
597
+
598
+ def __post_init__(self):
599
+ # Override base args with local decoder specific values
600
+ self.dim = self.dim_local_decoder
601
+ self.n_layers = self.n_layers_local_decoder
602
+ self.n_heads = self.n_heads_local_decoder
603
+ self.cross_attn_encoder = False
604
+ self.cross_attn_init_by_pooling = False
605
+ self.attn_bias_type = "local_block_causal"
606
+
607
+
608
+ def create_global_transformer(args: ByteLatentTransformerArgs) -> GlobalTransformer:
609
+ global_args = args.model_copy(
610
+ deep=True,
611
+ update=dict(
612
+ dim=args.dim_global,
613
+ n_layers=args.n_layers_global,
614
+ n_heads=args.n_heads_global,
615
+ n_kv_heads=args.n_kv_heads_global,
616
+ local_attention_window_len=None,
617
+ dim_token_emb=get_global_dim_patch_emb(args),
618
+ dim_patch_emb=None,
619
+ cross_attn_encoder=False,
620
+ cross_attn_decoder=False,
621
+ ),
622
+ )
623
+
624
+ return GlobalTransformer(global_args)
625
+
626
+
627
+ def create_local_encoder(args: ByteLatentTransformerArgs) -> LocalEncoder:
628
+ # First deep copy the original args
629
+ # Replace with local encoder specific values
630
+ local_encoder_args = args.model_copy(
631
+ deep=True,
632
+ update=dict(
633
+ dim=args.dim_local_encoder,
634
+ n_layers=args.n_layers_local_encoder,
635
+ n_heads=args.n_heads_local_encoder,
636
+ dim_token_emb=get_encoder_dim_token_emb(args),
637
+ dim_patch_emb=get_encoder_dim_patch_emb(args),
638
+ cross_attn_decoder=False,
639
+ cross_attn_k=args.cross_attn_k if args.cross_attn_encoder else None,
640
+ attn_bias_type="local_block_causal",
641
+ ),
642
+ )
643
+
644
+ return LocalEncoder(local_encoder_args)
645
+
646
+
647
+ def create_local_decoder(args: ByteLatentTransformerArgs) -> LocalDecoder:
648
+ # First deep copy the original args
649
+ local_decoder_args = args.model_copy(
650
+ deep=True,
651
+ update=dict(
652
+ dim=args.dim_local_decoder,
653
+ n_layers=args.n_layers_local_decoder,
654
+ n_heads=args.n_heads_local_decoder,
655
+ cross_attn_encoder=False,
656
+ cross_attn_init_by_pooling=False, # states are already defined
657
+ dim_token_emb=get_decoder_dim_token_emb(args),
658
+ dim_patch_emb=args.dim_global,
659
+ cross_attn_k=args.cross_attn_k if args.cross_attn_decoder else None,
660
+ ),
661
+ )
662
+
663
+ return LocalDecoder(local_decoder_args)
664
+
665
+
666
+ class EmbeddingType(Enum):
667
+ HASH_TOK = auto()
668
+ NGRAM = auto()
669
+
670
+
671
+ def init_embeddings(
672
+ args,
673
+ embedding_type: EmbeddingType,
674
+ local_encoder_dim: int,
675
+ encoder_hash_byte_group_size: list = None,
676
+ ):
677
+ if (
678
+ embedding_type == EmbeddingType.HASH_TOK
679
+ and args.encoder_hash_byte_group_size is None
680
+ ):
681
+ return None
682
+ if embedding_type == EmbeddingType.NGRAM and args.encoder_ngram_to_size_str is None:
683
+ return None
684
+
685
+ embeddings = []
686
+
687
+ if embedding_type == EmbeddingType.HASH_TOK:
688
+ emb_dim = local_encoder_dim
689
+ encoder_hash_byte_group_vocab = args.encoder_hash_byte_group_vocab
690
+ for _ in range(args.encoder_hash_byte_group_nb_functions):
691
+ for _ in encoder_hash_byte_group_size:
692
+ embeddings.append(
693
+ nn.Embedding(
694
+ encoder_hash_byte_group_vocab,
695
+ emb_dim,
696
+ )
697
+ )
698
+
699
+ elif embedding_type == EmbeddingType.NGRAM:
700
+ encoder_ngram_to_size = parse_ngram_to_size(args.encoder_ngram_to_size_str)
701
+ emb_dim = local_encoder_dim
702
+ OFFSET = 4 # This should be passed as parameter if it's variable
703
+ for ngram_vocab_size in encoder_ngram_to_size.values():
704
+ embeddings.append(nn.Embedding(ngram_vocab_size + OFFSET, emb_dim))
705
+
706
+ return nn.ModuleList(embeddings)
707
+
708
+
709
+ def compute_hash_embeddings(
710
+ local_encoder_tokens: torch.Tensor,
711
+ local_encoder,
712
+ encoder_hash_tok_embedding: nn.ModuleList,
713
+ encoder_hash_byte_group_nb_functions: int,
714
+ encoder_hash_byte_group_size: list,
715
+ encoder_hash_byte_group_vocab: int,
716
+ ) -> torch.Tensor:
717
+ """
718
+ Compute embeddings using hash token embeddings.
719
+
720
+ Args:
721
+ local_encoder_tokens: Input tokens tensor
722
+ local_encoder: Encoder object with tok_embeddings method
723
+ encoder_hash_tok_embedding: ModuleList of hash token embeddings
724
+ encoder_hash_byte_group_nb_functions: Number of hash functions
725
+ encoder_hash_byte_group_size: List of byte group sizes
726
+ encoder_hash_byte_group_vocab: Vocabulary size for hash embeddings
727
+
728
+ Returns:
729
+ torch.Tensor: Combined embeddings
730
+ """
731
+ if encoder_hash_tok_embedding is None:
732
+ return None
733
+
734
+ local_encoder_embeds = local_encoder.tok_embeddings(local_encoder_tokens)
735
+
736
+ i = 0
737
+ for func_nb in range(encoder_hash_byte_group_nb_functions):
738
+ for byte_group_size in encoder_hash_byte_group_size:
739
+ hash_ids = byte_group_hash_function(
740
+ local_encoder_tokens,
741
+ byte_group_size,
742
+ hash_func_nb=func_nb,
743
+ max_hash=encoder_hash_byte_group_vocab,
744
+ )
745
+ hash_tok_embedding = encoder_hash_tok_embedding[i]
746
+ local_encoder_embeds = local_encoder_embeds + hash_tok_embedding(hash_ids)
747
+ i += 1
748
+
749
+ assert i == len(encoder_hash_tok_embedding)
750
+ return local_encoder_embeds
751
+
752
+
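A self-contained sketch of the embedding combination above, using a stand-in encoder; the dimensions, vocabulary size, and byte group sizes are illustrative only:

import torch
import torch.nn as nn

from bytelatent.model.blt import compute_hash_embeddings


class StubEncoder(nn.Module):
    """Minimal stand-in exposing the tok_embeddings attribute the helper expects."""

    def __init__(self, vocab: int = 260, dim: int = 16):
        super().__init__()
        self.tok_embeddings = nn.Embedding(vocab, dim)


encoder = StubEncoder()
# One hash table per (hash function, byte group size) pair: 1 function * 2 sizes = 2 tables.
hash_embeddings = nn.ModuleList([nn.Embedding(30000, 16) for _ in range(2)])
tokens = torch.randint(0, 260, (1, 8))

embeds = compute_hash_embeddings(
    local_encoder_tokens=tokens,
    local_encoder=encoder,
    encoder_hash_tok_embedding=hash_embeddings,
    encoder_hash_byte_group_nb_functions=1,
    encoder_hash_byte_group_size=[3, 4],
    encoder_hash_byte_group_vocab=30000,
)
print(embeds.shape)  # torch.Size([1, 8, 16])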
753
+ class ByteLatentTransformer(nn.Module):
754
+ """
755
+ The ByteLatentTransformer (BLT) is a byte-level language model architecture that processes byte sequences
756
+ by dynamically segmenting them into patches. It uses a combination of local encoders, global transformers,
757
+ and local decoders to efficiently encode and decode byte sequences, leveraging patch-based processing for
758
+ improved performance and inference efficiency.
759
+ """
760
+
761
+ def __init__(self, args: ByteLatentTransformerArgs):
762
+ super().__init__()
763
+
764
+ # General configuration
765
+ self.weight_tying = args.weight_tying
766
+ self.sliding_window = args.sliding_window
767
+ self.patch_size = args.patch_size
768
+ self.patching_mode = args.patching_mode
769
+ self.boe_id, self.bos_id, self.pad_id, self.eos_id = (
770
+ BOE_ID,
771
+ BOS_ID,
772
+ PAD_ID,
773
+ EOS_ID,
774
+ )
775
+ self.downsampling_by_pooling = args.downsampling_by_pooling
776
+ self.patching_threshold = args.patching_threshold
777
+ self.dim = args.dim
778
+ self.init_base_std = args.init_base_std
779
+ self.init_std_factor = InitStdFactor(args.init_std_factor)
780
+ self.max_seqlen = args.max_seqlen
781
+
782
+ # Cross attention configuration
783
+ self.cross_attn_encoder = args.cross_attn_encoder
784
+ self.cross_attn_decoder = args.cross_attn_decoder
785
+ self.cross_attn_k = args.cross_attn_k
786
+ self.cross_attn_window_encoder = args.cross_attn_window_encoder
787
+ self.cross_attn_window_decoder = args.cross_attn_window_decoder
788
+ self.cross_attn_use_flex_attention = args.cross_attn_use_flex_attention
789
+
790
+ # Encoder hash configuration
791
+ self.encoder_hash_byte_group_size = args.encoder_hash_byte_group_size
792
+ self.encoder_hash_byte_group_vocab = args.encoder_hash_byte_group_vocab
793
+ self.encoder_hash_byte_group_nb_functions = (
794
+ args.encoder_hash_byte_group_nb_functions
795
+ )
796
+
797
+ # ByteLatent modules
798
+ self.local_encoder = create_local_encoder(args)
799
+ self.global_transformer = create_global_transformer(args)
800
+ self.local_decoder = create_local_decoder(args)
801
+ self.encoder_hash_tok_embedding = init_embeddings(
802
+ args,
803
+ EmbeddingType.HASH_TOK,
804
+ local_encoder_dim=self.local_encoder.dim,
805
+ encoder_hash_byte_group_size=self.encoder_hash_byte_group_size,
806
+ )
807
+ self.encoder_ngram_embedding = init_embeddings(
808
+ args,
809
+ EmbeddingType.NGRAM,
810
+ local_encoder_dim=self.local_encoder.dim,
811
+ encoder_hash_byte_group_size=None,
812
+ )
813
+ self.tok_embeddings = torch.nn.Embedding(args.vocab_size, args.dim)
814
+
815
+ # Transformer layers
816
+ self.layers = nn.ModuleList(
817
+ [TransformerBlock(args) for _ in range(args.n_layers)]
818
+ )
819
+
820
+ # Encoder ngram embedding tables
821
+ self.encoder_ngram_embedding = None
822
+ if args.encoder_enable_byte_ngrams:
823
+ self.encoder_ngram_embedding = nn.ModuleList()
824
+ assert args.ngram_vocab_sizes is not None
825
+ self.encoder_ngram_to_size = parse_ngram_to_size(
826
+ args.encoder_ngram_to_size_str
827
+ )
828
+ ngram_emb_dim = self.local_encoder.dim
829
+ for ngram_vocab_size in self.encoder_ngram_to_size.values():
830
+ self.encoder_ngram_embedding.append(
831
+ nn.Embedding(ngram_vocab_size + OFFSET, ngram_emb_dim)
832
+ )
833
+
834
+ # Output layer
835
+ assert args.vocab_size > 0, "vocab_size must be greater than 0"
836
+ self.output = nn.Linear(args.dim, args.vocab_size, bias=False)
837
+ if args.weight_tying:
838
+ self.output.weight = self.tok_embeddings.weight
839
+
840
+ # Patcher module
841
+ if not args.data_loader_patching:
842
+ self.patcher = Patcher(
843
+ PatcherArgs(
844
+ patch_size=args.patch_size,
845
+ patching_mode=args.patching_mode,
846
+ patching_threshold=args.patching_threshold,
847
+ patching_threshold_add=args.patching_threshold_add,
848
+ monotonicity=args.monotonicity,
849
+ max_patch_length=args.max_patch_length,
850
+ )
851
+ )
852
+
853
+ def forward(
854
+ self,
855
+ tokens: torch.Tensor,
856
+ patch_lengths: Optional[torch.Tensor] = None,
857
+ ngram_ids: Optional[torch.Tensor] = None,
858
+ ):
859
+ # Ensure ngram_ids is either a tensor or None
860
+ assert (
861
+ isinstance(ngram_ids, torch.Tensor) or ngram_ids is None
862
+ ), f"ngram_ids must be a tensor or None, but was: {type(ngram_ids)}"
863
+
864
+ bs, N = tokens.shape # Batch size and sequence length
865
+
866
+ # Get megabyte inputs
867
+ nb_boe = int(0 if self.patching_mode != "" else self.patch_size - 1)
868
+ local_encoder_tokens, _, local_decoder_tokens = get_blt_input(
869
+ tokens=tokens,
870
+ enforce_patch_size_multiple=False,
871
+ nb_boe=nb_boe,
872
+ patch_size=self.patch_size,
873
+ boe_id=self.boe_id,
874
+ )
875
+
876
+ # Patching
877
+ if patch_lengths is None:
878
+ assert (
879
+ getattr(self, "patcher", None) is not None
880
+ ), "Patcher not defined and no patch_lengths passed."
881
+ patch_lengths, tok_scores = self.patcher.patch(
882
+ local_encoder_tokens,
883
+ include_next_token=True,
884
+ threshold=self.patcher.threshold,
885
+ )
886
+ else:
887
+ if nb_boe > 0:
888
+ patch_lengths[:, 0] += nb_boe
889
+
890
+ assert torch.min(patch_lengths) >= 0
891
+
892
+ # Generate patch IDs from patch_lengths
893
+ patch_ids = patch_ids_from_lengths(
894
+ patch_lengths, local_encoder_tokens.shape[-1]
895
+ )
896
+ assert torch.max(patch_ids) + 1 <= torch.max(
897
+ (patch_lengths != 0).sum(dim=-1)
898
+ ), f"{torch.max(patch_ids) + 1} > {torch.max((patch_lengths != 0).sum(dim=-1))}"
899
+
900
+ cross_attn_mask_enc = None
901
+ # Cross-attention encoder
902
+ if self.cross_attn_encoder:
903
+ cross_attn_mask_enc = cross_attn_mask(
904
+ patch_ids,
905
+ patch_lengths,
906
+ N,
907
+ patches_as_queries=True,
908
+ cross_attn_k=self.cross_attn_k,
909
+ window=self.cross_attn_window_encoder,
910
+ block_mask=self.cross_attn_use_flex_attention,
911
+ )
912
+
913
+ # Hashing and embedding
914
+ local_encoder_embeds = compute_hash_embeddings(
915
+ local_encoder_tokens=local_encoder_tokens,
916
+ local_encoder=self.local_encoder,
917
+ encoder_hash_tok_embedding=self.encoder_hash_tok_embedding,
918
+ encoder_hash_byte_group_nb_functions=self.encoder_hash_byte_group_nb_functions,
919
+ encoder_hash_byte_group_size=self.encoder_hash_byte_group_size,
920
+ encoder_hash_byte_group_vocab=self.encoder_hash_byte_group_vocab,
921
+ )
922
+
923
+ # N-gram table embeddings
924
+ if self.encoder_ngram_embedding is not None:
925
+ assert ngram_ids is not None, "ngram_ids must be provided"
926
+ if local_encoder_embeds is None:
927
+ local_encoder_embeds = self.local_encoder.tok_embeddings(
928
+ local_encoder_tokens
929
+ )
930
+ assert len(ngram_ids) == len(
931
+ self.encoder_ngram_embedding
932
+ ), f"ngram_ids.shape[0]={ngram_ids.shape[0]} versus len(encoder_ngram_embedding)={len(self.encoder_ngram_embedding)}, ngram_ids.shape={ngram_ids.shape}"
933
+ for i in range(ngram_ids.shape[0]):
934
+ ngram_embedding = self.encoder_ngram_embedding[i]
935
+ ngram_embeds = ngram_embedding(ngram_ids[i])
936
+ assert (
937
+ local_encoder_embeds.shape == ngram_embeds.shape
938
+ ), f"Shape mismatch: {local_encoder_embeds.shape} vs {ngram_embeds.shape}, ngram_ids.shape={ngram_ids.shape}"
939
+ local_encoder_embeds = local_encoder_embeds + ngram_embeds
940
+
941
+ # Local encoder
942
+ h_cross = None
943
+ (h_encoder, h_cross), cache_encoder = self.local_encoder(
944
+ tokens=local_encoder_tokens,
945
+ embeds=local_encoder_embeds,
946
+ patch_embeds=h_cross if self.cross_attn_encoder else None,
947
+ cross_mask=cross_attn_mask_enc,
948
+ num_patches=patch_lengths.shape[1],
949
+ patch_ids=patch_ids,
950
+ )
951
+
952
+ # Downsampling
953
+ if not self.cross_attn_encoder:
954
+ assert (
955
+ patch_ids.shape[1] == h_encoder.shape[1]
956
+ ), f"{patch_ids.shape[1]} != {h_encoder.shape[1]}"
957
+ h = downsample(
958
+ h_encoder,
959
+ patch_lengths.shape[1],
960
+ patch_lengths,
961
+ patch_ids,
962
+ downsampling_by_pooling=self.downsampling_by_pooling,
963
+ patch_size=self.patch_size,
964
+ )
965
+ else:
966
+ # Reshape h_cross
967
+ h = h_cross.view(bs, patch_lengths.shape[1], -1)
968
+
969
+ # Global transformer
970
+ global_tokens = tokens.new(h.shape[0], h.shape[1]).fill_(self.boe_id)
971
+ rows, cols = torch.where(local_encoder_tokens == self.eos_id)
972
+ eos_patch_ids = patch_ids[rows, cols]
973
+ global_tokens[rows, eos_patch_ids] = self.eos_id
974
+
975
+ h, _ = self.global_transformer(
976
+ embeds=h,
977
+ tokens=global_tokens,
978
+ )
979
+
980
+ # Unpatching
981
+ dec_embeds = h_encoder[:, nb_boe : nb_boe + N, :]
982
+
983
+ # Generate decoder patch IDs
984
+ decoder_patch_ids = decoder_patch_ids_from_lengths(
985
+ patch_lengths, nb_boe, local_decoder_tokens.shape[-1]
986
+ )
987
+ assert (
988
+ torch.max(decoder_patch_ids) + 1 <= h.shape[1]
989
+ ), f"{torch.max(decoder_patch_ids) + 1} > {h.shape[1]}"
990
+ assert (
991
+ decoder_patch_ids.shape[1] == dec_embeds.shape[1]
992
+ ), f"{decoder_patch_ids.shape[1]} != {dec_embeds.shape[1]}"
993
+
994
+ # Cross-attention decoder
995
+ if not self.cross_attn_decoder:
996
+ h = torch.gather(
997
+ h, 1, decoder_patch_ids.unsqueeze(-1).expand(-1, -1, h.shape[-1])
998
+ )
999
+ cross_attn_mask_dec = None
1000
+ assert local_decoder_tokens.shape == h.shape[:-1]
1001
+ else:
1002
+ cross_attn_mask_dec = cross_attn_mask(
1003
+ decoder_patch_ids,
1004
+ patch_lengths,
1005
+ N,
1006
+ patches_as_queries=False,
1007
+ cross_attn_k=self.cross_attn_k,
1008
+ window=self.cross_attn_window_decoder,
1009
+ block_mask=self.cross_attn_use_flex_attention,
1010
+ )
1011
+
1012
+ # Local decoder
1013
+ output, _ = self.local_decoder(
1014
+ embeds=dec_embeds,
1015
+ patch_embeds=h,
1016
+ tokens=local_decoder_tokens,
1017
+ cross_mask=cross_attn_mask_dec,
1018
+ )
1019
+ return output
1020
+
1021
+ def reset_parameters(self, init_std=None):
1022
+ # Either use fixed base std or sqrt model dim
1023
+ init_std = init_std or (self.dim ** (-0.5))
1024
+ nn.init.trunc_normal_(
1025
+ self.tok_embeddings.weight,
1026
+ mean=0.0,
1027
+ std=init_std,
1028
+ a=-3 * init_std,
1029
+ b=3 * init_std,
1030
+ )
1031
+ if not self.weight_tying:
1032
+ nn.init.trunc_normal_(
1033
+ self.output.weight,
1034
+ mean=0.0,
1035
+ std=init_std,
1036
+ a=-3 * init_std,
1037
+ b=3 * init_std,
1038
+ )
1039
+
1040
+ def init_weights(self):
1041
+ self.reset_parameters()
1042
+ self.init_base_std = self.init_base_std or (self.dim ** (-0.5))
1043
+ for depth, layer in enumerate(self.layers):
1044
+ factor = {
1045
+ InitStdFactor.CURRENT_DEPTH: (2 * (depth + 1)) ** 0.5,
1046
+ InitStdFactor.GLOBAL_DEPTH: (2 * (len(self.layers) + 1)) ** 0.5,
1047
+ InitStdFactor.DIM_RATIO: self.dim / 4096,
1048
+ InitStdFactor.DISABLED: 1.0,
1049
+ }[self.init_std_factor]
1050
+
1051
+ layer.init_weights(self.init_base_std, factor)
1052
+
1053
+ self.local_decoder.init_weights(self.init_base_std)
1054
+ self.global_transformer.init_weights(self.init_base_std)
1055
+ self.local_encoder.init_weights(self.init_base_std)
1056
+
1057
+ for emb in self.encoder_hash_tok_embedding:
1058
+ nn.init.trunc_normal_(
1059
+ emb.weight,
1060
+ mean=0.0,
1061
+ std=self.init_base_std,
1062
+ a=-3 * self.init_base_std,
1063
+ b=3 * self.init_base_std,
1064
+ )
bytelatent/model/local_models.py ADDED
@@ -0,0 +1,356 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import logging
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import torch.nn
8
+ import torch.nn as nn
9
+ from torch.nn import functional as F
10
+ from torch.nn.attention.flex_attention import BlockMask
11
+ from xformers.ops import AttentionBias
12
+
13
+ from bytelatent.base_transformer import (
14
+ InitStdFactor,
15
+ RMSNorm,
16
+ RotaryEmbedding,
17
+ TransformerBlock,
18
+ )
19
+ from bytelatent.model.transformer import CrossAttention
20
+ from bytelatent.model.utils import create_causal_mask, downsample
21
+ from bytelatent.tokenizers.blt_tokenizer import BOE_ID
22
+
23
+ logger = logging.getLogger()
24
+
25
+
26
+ class LocalModelBase(nn.Module):
27
+ def __init__(self, args):
28
+ super().__init__()
29
+
30
+ self.dim = args.dim
31
+ self.dropout = args.dropout
32
+ self.vocab_size = args.vocab_size + args.pm_size
33
+ self.patch_size = args.patch_size
34
+
35
+ self.efficient_attn = args.efficient_attn
36
+ self.sliding_window = args.sliding_window
37
+ self.use_rope = args.use_rope
38
+ self.init_std_factor = args.init_std_factor
39
+ self.cross_attn_encoder = getattr(args, "cross_attn_encoder", None)
40
+ self.cross_attn_decoder = getattr(args, "cross_attn_decoder", None)
41
+ self.cross_attn_k = getattr(args, "cross_attn_k", None)
42
+
43
+ self.boe_id = BOE_ID
44
+
45
+ self.norm = RMSNorm(args.dim, eps=args.norm_eps)
46
+ self.layers = nn.ModuleList(
47
+ [TransformerBlock(args) for _ in range(args.n_layers)]
48
+ )
49
+
50
+ self.tok_embeddings = nn.Embedding(self.vocab_size, args.dim)
51
+ if not self.use_rope:
52
+ self.pos_embeddings = nn.Embedding(args.max_length, args.dim)
53
+ else:
54
+ self.rope = RotaryEmbedding(
55
+ theta=args.rope_theta,
56
+ head_dim=args.head_dim or args.dim // args.n_heads,
57
+ max_seqlen=getattr(args, "max_encoder_seq_length", args.max_length),
58
+ )
59
+ self.pos_embeddings = None
60
+
61
+ self.token_embedding_projection = (
62
+ nn.Linear(args.dim_token_emb, args.dim, bias=False)
63
+ if hasattr(args, "dim_token_emb") and args.dim_token_emb != self.dim
64
+ else None
65
+ )
66
+
67
+ self.patch_embedding_projection = self._create_patch_projection(args)
68
+
69
+ def _should_create_patch_projection(self, args):
70
+ dimension_mismatch = (
71
+ getattr(args, "dim_patch_emb") and args.dim_patch_emb != self.dim
72
+ )
73
+
74
+ # Check cross attention conditions
75
+ cross_attn_conditions = (
76
+ hasattr(args, "cross_attn_encoder")
77
+ and args.cross_attn_encoder
78
+ and getattr(args, "cross_attn_init_by_pooling")
79
+ ) or (
80
+ hasattr(args, "cross_attn_decoder")
81
+ and args.cross_attn_decoder
82
+ and getattr(args, "cross_attn_init_by_pooling")
83
+ )
84
+
85
+ return dimension_mismatch or cross_attn_conditions
86
+
87
+ def _create_patch_projection(self, args):
88
+ if not self._should_create_patch_projection(args):
89
+ return None
90
+
91
+ output_dim = args.dim_token_emb * (self.cross_attn_k or 1)
92
+
93
+ return nn.Linear(
94
+ in_features=args.dim_patch_emb,
95
+ out_features=output_dim,
96
+ bias=False,
97
+ )
98
+
99
+ def apply_embedding(self, tokens, embeds):
100
+ if embeds is not None:
101
+ return embeds
102
+ else:
103
+ return self.tok_embeddings(tokens)
104
+
105
+ def init_weights(self, init_std=None):
106
+ self.rope.reset_parameters()
107
+
108
+ init_std = init_std or (self.dim ** (-0.5))
109
+ nn.init.trunc_normal_(
110
+ self.tok_embeddings.weight,
111
+ mean=0.0,
112
+ std=init_std,
113
+ a=-3 * init_std,
114
+ b=3 * init_std,
115
+ )
116
+ if self.pos_embeddings is not None:
117
+ nn.init.trunc_normal_(
118
+ self.pos_embeddings.weight,
119
+ mean=0.0,
120
+ std=init_std,
121
+ a=-3 * init_std,
122
+ b=3 * init_std,
123
+ )
124
+
125
+ for depth, layer in enumerate(self.layers):
126
+ factor = {
127
+ InitStdFactor.CURRENT_DEPTH: (2 * (depth + 1)) ** 0.5,
128
+ InitStdFactor.GLOBAL_DEPTH: (2 * (len(self.layers) + 1)) ** 0.5,
129
+ InitStdFactor.DIM_RATIO: self.dim / 4096,
130
+ InitStdFactor.DISABLED: 1.0,
131
+ }[self.init_std_factor]
132
+
133
+ layer.init_weights(init_std, factor)
134
+
135
+ if self.token_embedding_projection is not None:
136
+ nn.init.trunc_normal_(
137
+ self.token_embedding_projection.weight,
138
+ mean=0.0,
139
+ std=init_std,
140
+ a=-3 * init_std,
141
+ b=3 * init_std,
142
+ )
143
+
144
+ if self.patch_embedding_projection is not None:
145
+ nn.init.trunc_normal_(
146
+ self.patch_embedding_projection.weight,
147
+ mean=0.0,
148
+ std=init_std,
149
+ a=-3 * init_std,
150
+ b=3 * init_std,
151
+ )
152
+
153
+ if hasattr(self, "output"):
154
+ nn.init.trunc_normal_(
155
+ self.output.weight,
156
+ mean=0.0,
157
+ std=init_std,
158
+ a=-3 * init_std,
159
+ b=3 * init_std,
160
+ )
161
+
162
+ if self.cross_attn_layers is not None:
163
+ for depth, layer in enumerate(self.cross_attn_layers):
164
+ factor = {
165
+ InitStdFactor.CURRENT_DEPTH: (2 * (depth + 1)) ** 0.5,
166
+ InitStdFactor.GLOBAL_DEPTH: (2 * (len(self.layers) + 1)) ** 0.5,
167
+ InitStdFactor.DIM_RATIO: self.dim / 4096,
168
+ InitStdFactor.DISABLED: 1.0,
169
+ }[self.init_std_factor]
170
+
171
+ layer.init_weights(init_std, factor)
172
+
173
+
174
+ class LocalEncoder(LocalModelBase):
175
+ def __init__(self, args):
176
+ super().__init__(args)
177
+ self.output_proj = (
178
+ args.patching_mode in ["entropy", "probmax"]
179
+ ) and args.entropy_model_checkpoint_dir is None
180
+
181
+ self.apply_transformer = args.use_local_encoder_transformer
182
+ self.downsampling_by_pooling = args.downsampling_by_pooling
183
+ self.patch_only = args.patch_only_encoder
184
+ self.expects_hash_embeddings = args.encoder_hash_byte_group_size is not None
185
+ self.cross_attn_encoder = args.cross_attn_encoder
186
+ self.cross_attn_all_layers_encoder = args.cross_attn_all_layers_encoder
187
+ self.cross_attn_init_by_pooling = args.cross_attn_init_by_pooling
188
+ self.cross_attn_nheads = args.cross_attn_nheads
189
+
190
+ if self.cross_attn_encoder:
191
+ self.cross_attn_layers = torch.nn.ModuleList()
192
+ layers_to_add = args.n_layers if self.cross_attn_all_layers_encoder else 1
193
+ for _ in range(layers_to_add):
194
+ self.cross_attn_layers.append(
195
+ CrossAttention(
196
+ dim=self.dim,
197
+ head_dim=self.dim // self.cross_attn_nheads,
198
+ n_heads=self.cross_attn_nheads,
199
+ n_kv_heads=self.cross_attn_nheads,
200
+ norm_eps=args.norm_eps,
201
+ )
202
+ )
203
+
204
+ def apply_embedding(self, tokens, embeds):
205
+ if embeds is not None:
206
+ assert (
207
+ self.expects_hash_embeddings
208
+ ), "Not expecting embeddings to be passed."
209
+ return embeds
210
+ else:
211
+ return self.tok_embeddings(tokens)
212
+
213
+ def forward(
214
+ self,
215
+ tokens: torch.Tensor,
216
+ embeds: Optional[torch.Tensor] = None,
217
+ patch_embeds: Optional[torch.Tensor] = None,
218
+ mask: Optional[Union["BlockMask", "AttentionBias", torch.Tensor, str]] = None,
219
+ cross_mask: Optional[torch.Tensor] = None,
220
+ num_patches: Optional[int] = None,
221
+ patch_ids: Optional[torch.Tensor] = None,
222
+ cache: Optional[List[Tuple[torch.Tensor, torch.Tensor, int]]] = None,
223
+ ):
224
+ """ """
225
+ bs, seqlen = tokens.shape
226
+ if mask is None:
227
+ mask = create_causal_mask(seqlen, self.efficient_attn, self.sliding_window)
228
+
229
+ h = self.apply_embedding(tokens, embeds)
230
+ freqs_cis = self.rope(seqlen=seqlen) if self.use_rope else None
231
+
232
+ h = F.dropout(h, p=self.dropout, training=self.training)
233
+
234
+ for i, layer in enumerate(self.layers):
235
+ h = layer(h, mask=mask, freq_cis=freqs_cis, attn_impl=self.efficient_attn)
236
+ # check whether cross attention should be applied to all layers or only the last layer
237
+ if self.cross_attn_encoder and (
238
+ i == len(self.layers) - 1 or self.cross_attn_all_layers_encoder
239
+ ):
240
+ patch_embeds = self.apply_cross_attention(
241
+ h, patch_embeds, i, bs, num_patches, patch_ids, cross_mask
242
+ )
243
+
244
+ h_residual = patch_embeds if self.cross_attn_encoder else None
245
+ return (h, h_residual), cache
246
+
247
+ def apply_cross_attention(
248
+ self, h, patch_embeds, layer_idx, bs, num_patches, patch_ids, cross_mask
249
+ ):
250
+ # apply pooling and project
251
+ if self.cross_attn_init_by_pooling and patch_embeds is None:
252
+ patch_embeds = downsample(
253
+ h,
254
+ num_patches,
255
+ patch_ids=patch_ids,
256
+ downsampling_by_pooling=self.downsampling_by_pooling,
257
+ patch_size=self.patch_size,
258
+ )
259
+ if self.patch_embedding_projection is not None:
260
+ patch_embeds = self.patch_embedding_projection(patch_embeds)
261
+ patch_embeds = patch_embeds.reshape(
262
+ bs, patch_embeds.shape[1] * self.cross_attn_k, self.dim
263
+ )
264
+
265
+ layer_idx = layer_idx if self.cross_attn_all_layers_encoder else 0
266
+ patch_embeds_cross = self.cross_attn_layers[layer_idx](
267
+ x=patch_embeds,
268
+ kv=h,
269
+ mask=cross_mask,
270
+ )
271
+ patch_embeds += patch_embeds_cross
272
+ return patch_embeds
273
+
274
+
275
+ class LocalDecoder(LocalModelBase):
276
+ def __init__(self, args):
277
+ super().__init__(args)
278
+
279
+ # Model configuration flags
280
+ self.patch_only = args.patch_only_decoder
281
+ self.expects_embeddings = args.share_encoder_decoder_emb
282
+ self.cross_attn_decoder = args.cross_attn_decoder
283
+ self.cross_attn_all_layers_decoder = args.cross_attn_all_layers_decoder
284
+ self.cross_attn_init_by_pooling = args.cross_attn_init_by_pooling
285
+ self.cross_attn_nheads = args.cross_attn_nheads
286
+
287
+ if self.cross_attn_decoder:
288
+ self.cross_attn_layers = torch.nn.ModuleList()
289
+ layers_to_add = args.n_layers if self.cross_attn_all_layers_decoder else 1
290
+ for _ in range(layers_to_add):
291
+ self.cross_attn_layers.append(
292
+ CrossAttention(
293
+ dim=self.dim,
294
+ head_dim=self.dim // self.cross_attn_nheads,
295
+ n_heads=self.cross_attn_nheads,
296
+ n_kv_heads=self.cross_attn_nheads,
297
+ norm_eps=args.norm_eps,
298
+ )
299
+ )
300
+
301
+ self.output = nn.Linear(
302
+ self.dim,
303
+ args.vocab_size,
304
+ bias=False,
305
+ )
306
+
307
+ def forward(
308
+ self,
309
+ tokens: torch.Tensor,
310
+ embeds: Optional[torch.Tensor],
311
+ patch_embeds: Optional[torch.Tensor] = None,
312
+ mask: Optional[Union["BlockMask", "AttentionBias", torch.Tensor, str]] = None,
313
+ cross_mask: Optional[torch.Tensor] = None,
314
+ cache: Optional[List[Tuple[torch.Tensor, torch.Tensor, int]]] = None,
315
+ ):
316
+ bs, seqlen = tokens.shape
317
+ assert embeds is not None, "Embeddings must be provided"
318
+
319
+ if mask is None:
320
+ mask = create_causal_mask(seqlen, self.efficient_attn, self.sliding_window)
321
+
322
+ h = embeds
323
+
324
+ if self.patch_embedding_projection is not None:
325
+ assert patch_embeds is not None, "Patch embeddings must be passed."
326
+ patch_embeds = self.patch_embedding_projection(patch_embeds)
327
+ if self.cross_attn_k is not None:
328
+ patch_embeds = patch_embeds.reshape(
329
+ bs, patch_embeds.shape[1] * self.cross_attn_k, self.dim
330
+ )
331
+
332
+ if patch_embeds is not None and not self.cross_attn_decoder:
333
+ h = h + patch_embeds
334
+
335
+ freqs_cis = self.rope(seqlen=seqlen) if self.use_rope else None
336
+
337
+ h = F.dropout(h, p=self.dropout, training=self.training)
338
+ for i, layer in enumerate(self.layers):
339
+ if self.cross_attn_decoder and (
340
+ i == 0 or self.cross_attn_all_layers_decoder
341
+ ):
342
+ # Use cross attention to extract info from patch_embeds into h
343
+ h_cross = self.cross_attn_layers[i](
344
+ x=h,
345
+ kv=patch_embeds,
346
+ mask=cross_mask,
347
+ )
348
+ h = h + h_cross
349
+
350
+ h = layer(h, mask=mask, freq_cis=freqs_cis, attn_impl=self.efficient_attn)
351
+
352
+ h_preds = self.norm(h)
353
+ h_preds = F.dropout(h_preds, p=self.dropout, training=self.training)
354
+ h_preds = self.output(h_preds)
355
+ h_preds = h_preds.float()
356
+ return h_preds, cache
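For orientation, here is a minimal shape-level sketch (not part of the commit) of the encoder-side pooling step above: when cross_attn_init_by_pooling is set, apply_cross_attention initializes patch embeddings by pooling byte states with downsample from bytelatent.model.utils. All sizes below are made up for illustration; the decoder then either adds these patch embeddings back to byte positions or cross-attends into them.

import torch
from bytelatent.model.utils import downsample

bs, seqlen, dim, num_patches = 2, 12, 16, 3
h = torch.randn(bs, seqlen, dim)                     # byte-level hidden states
patch_ids = torch.arange(seqlen).repeat(bs, 1) // 4  # 4 bytes per patch -> ids 0..2

# Mean-pool each patch's byte states into one embedding per patch.
patch_embeds = downsample(
    h, num_patches, patch_ids=patch_ids, downsampling_by_pooling="mean"
)
print(patch_embeds.shape)  # torch.Size([2, 3, 16])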
bytelatent/model/transformer.py ADDED
@@ -0,0 +1,199 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ import logging
+ from typing import List, Optional, Tuple, Union
+
+ import torch
+ import torch.nn
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from torch.nn.attention.flex_attention import BlockMask
+ from xformers.ops import AttentionBias
+
+ from bytelatent.base_transformer import (
+     BaseTransformer,
+     RMSNorm,
+     flex_attention_comp,
+     repeat_kv,
+ )
+ from bytelatent.model.utils import create_causal_mask
+
+ logger = logging.getLogger()
+
+
+ class CrossAttention(nn.Module):
+     """
+     CrossAttention block to attend to the encoder states from the decoder.
+     Rope is not supported.
+     """
+
+     def __init__(
+         self,
+         dim: int,
+         head_dim: int,
+         n_heads: int,
+         n_kv_heads: int,
+         norm_eps: float,
+     ):
+         super().__init__()
+
+         self.dim = dim
+         self.head_dim = head_dim
+
+         self.n_heads = n_heads
+         self.n_kv_heads = n_kv_heads
+         self.heads_per_group = self.n_heads // self.n_kv_heads
+
+         self.cross_attn_norm_q = RMSNorm(dim, eps=norm_eps)
+         self.cross_attn_norm_kv = RMSNorm(dim, eps=norm_eps)
+
+         self.wq = nn.Linear(
+             dim,
+             n_heads * head_dim,
+             bias=False,
+         )
+         self.wk = nn.Linear(
+             dim,
+             n_kv_heads * head_dim,
+             bias=False,
+         )
+         self.wv = nn.Linear(
+             dim,
+             n_kv_heads * head_dim,
+             bias=False,
+         )
+
+         self.wo = nn.Linear(
+             n_heads * head_dim,
+             dim,
+             bias=False,
+         )
+
+     def forward(
+         self,
+         x: torch.Tensor,
+         kv: torch.Tensor,
+         mask: Optional[Union[BlockMask, AttentionBias, str]] = None,
+     ) -> torch.Tensor:
+         # B S D
+         bsz, seq_len, _ = x.shape
+         _, slen_kv, _ = kv.shape
+         x = self.cross_attn_norm_q(x)
+         kv = self.cross_attn_norm_kv(kv)
+
+         xq = self.wq(x)
+         xk = self.wk(kv)
+         xv = self.wv(kv)
+
+         output_shape = xq.shape
+         # B S D -> B S H D
+         xq = xq.view(bsz, seq_len, self.n_heads, self.head_dim)
+         xk = xk.view(bsz, slen_kv, self.n_kv_heads, self.head_dim)
+         xv = xv.view(bsz, slen_kv, self.n_kv_heads, self.head_dim)
+
+         xk = repeat_kv(xk, self.heads_per_group, dim=2)
+         xv = repeat_kv(xv, self.heads_per_group, dim=2)
+
+         assert mask is None or isinstance(mask, BlockMask)
+         xq, xk, xv = map(lambda e: e.transpose(1, 2), (xq, xk, xv))
+         output = flex_attention_comp(xq, xk, xv, block_mask=mask)
+         output = output.transpose(1, 2).contiguous()  # B H S D -> B S H D
+
+         output = self.wo(output.reshape(output_shape))
+
+         return x + output
+
+     def init_weights(self, base_std: float, factor: float = 1.0):
+         std = base_std * factor
+
+         nn.init.trunc_normal_(
+             self.wq.weight,
+             mean=0.0,
+             std=std,
+             a=-3 * std,
+             b=3 * std,
+         )
+
+         nn.init.trunc_normal_(
+             self.wk.weight,
+             mean=0.0,
+             std=std,
+             a=-3 * std,
+             b=3 * std,
+         )
+
+         nn.init.trunc_normal_(
+             self.wv.weight,
+             mean=0.0,
+             std=std,
+             a=-3 * std,
+             b=3 * std,
+         )
+
+         output_std = std / (2**0.5)
+         nn.init.trunc_normal_(
+             self.wo.weight,
+             mean=0.0,
+             std=output_std,
+             a=-3 * output_std,
+             b=3 * output_std,
+         )
+         self.cross_attn_norm_q.reset_parameters()
+         self.cross_attn_norm_kv.reset_parameters()
+
+
+ class GlobalTransformer(BaseTransformer):
+     def __init__(self, args):
+         super().__init__(args)
+         self.dropout = args.dropout
+         self.sliding_window = args.sliding_window
+         self.efficient_attn = args.efficient_attn
+
+         self.token_embedding_projection = None
+         if args.dim_token_emb is not None and args.dim_token_emb != self.dim:
+             self.token_embedding_projection = nn.Linear(
+                 args.dim_token_emb,
+                 args.dim,
+                 bias=False,
+             )
+
+     def forward(
+         self,
+         tokens: torch.Tensor,
+         tok_idx: Optional[torch.Tensor] = None,
+         embeds: Optional[torch.Tensor] = None,
+         mask: Optional[Union[BlockMask, AttentionBias, torch.Tensor, str]] = None,
+         cache: Optional[List[Tuple[torch.Tensor, torch.Tensor, int]]] = None,
+     ):
+         """
+         Similar to BaseTransformer.forward, but with an additional embeds argument
+         and projection to the token space.
+         """
+         bs, seqlen = tokens.shape
+         attn_impl = self.efficient_attn
+
+         h = embeds
+
+         mask = (
+             mask
+             if mask is not None
+             else create_causal_mask(seqlen, attn_impl, self.sliding_window)
+         )
+
+         if self.token_embedding_projection is not None and h.shape[-1] != self.dim:
+             h = self.token_embedding_projection(h)
+
+         h = F.dropout(h, p=self.dropout, training=self.training)
+
+         h = super().forward(h, tok_idx=tok_idx, mask=mask, attn_impl=attn_impl)
+         return h, cache
+
+     def init_weights(self, init_base_std: float):
+         super().init_weights()
+         if self.token_embedding_projection is not None:
+             nn.init.trunc_normal_(
+                 self.token_embedding_projection.weight,
+                 mean=0.0,
+                 std=init_base_std,
+                 a=-3 * init_base_std,
+                 b=3 * init_base_std,
+             )
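A small usage sketch (not part of the commit) for CrossAttention above, with made-up dimensions: queries come from one sequence (e.g. patch embeddings), keys/values from another (e.g. byte-level states), and mask=None lets every query attend to every key. It assumes a PyTorch build where flex attention is usable, since forward routes through flex_attention_comp.

import torch
from bytelatent.model.transformer import CrossAttention

dim, n_heads = 32, 4
attn = CrossAttention(
    dim=dim,
    head_dim=dim // n_heads,
    n_heads=n_heads,
    n_kv_heads=n_heads,
    norm_eps=1e-5,
)
attn.init_weights(base_std=0.02)

x = torch.randn(2, 6, dim)    # queries, e.g. 6 patches
kv = torch.randn(2, 24, dim)  # keys/values, e.g. 24 bytes
out = attn(x=x, kv=kv, mask=None)
print(out.shape)  # torch.Size([2, 6, 32]); the residual to x is added inside forward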
bytelatent/model/utils.py ADDED
@@ -0,0 +1,116 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ import torch
+ from torch.nn.attention.flex_attention import create_block_mask
+ from xformers.ops import fmha
+
+
+ def patch_reduce(h, max_num_patches, reduction, patch_ids):
+     """
+     Reduce variable-length patches to a single embedding per patch.
+     Note: this works with a variable number of patches across sequences in the batch.
+     It handles variable-length patches by assuming that patch_lengths will be 0 for
+     any extra patches on the *right*.
+     Any embeddings on the right that are not allocated to a patch
+     (i.e. if sum(patch_lengths[i]) < seq_len for some i)
+     are sent to a dummy patch, which is trimmed before returning.
+     """
+     bs, seq_len, emb_dim = h.shape
+
+     patch_ids = patch_ids.unsqueeze(-1).expand(-1, -1, h.shape[-1])
+
+     reduced_embs = torch.zeros(
+         (bs, max_num_patches, emb_dim), dtype=h.dtype, device=h.device
+     )
+     reduced_embs = reduced_embs.scatter_reduce(
+         src=h,
+         dim=1,
+         index=patch_ids,
+         reduce=reduction,
+         include_self=False,
+     )
+     reduced_embs = reduced_embs[:, :max_num_patches, :]
+
+     return reduced_embs
+
+
+ def concat_downsample(h, patch_lengths, patch_size):
+     # The assumption in this function is that seq_len = patch_size * num_patches.
+     bs, seq_len, emb_dim = h.shape
+     patch_end_ids = torch.cumsum(patch_lengths, dim=1)
+     patch_ids = patch_end_ids.unsqueeze(-1) - torch.arange(patch_size, 0, -1).to(
+         patch_end_ids.device
+     )
+     # Is clamp ok here?
+     patch_ids = patch_ids.clamp(min=0).unsqueeze(-1).expand(-1, -1, -1, h.shape[-1])
+     patch_ids = patch_ids.view(bs, -1, emb_dim)
+     # after gather h.shape = [batch_size, seq_len, dim]
+     h = torch.gather(h, 1, patch_ids)
+     h = h.reshape(bs, patch_lengths.shape[1], patch_size * h.size(-1))
+     return h
+
+
+ def pooling_downsample(h, max_num_patches, pooling_mode, patch_ids):
+     cat = []
+     if "avg" in pooling_mode or "mean" in pooling_mode:
+         cat.append(patch_reduce(h, max_num_patches, "mean", patch_ids))
+     if "min" in pooling_mode:
+         cat.append(patch_reduce(h, max_num_patches, "amin", patch_ids))
+     if "max" in pooling_mode:
+         cat.append(patch_reduce(h, max_num_patches, "amax", patch_ids))
+     assert len(cat) > 0
+     h = torch.cat(cat, dim=-1)
+     return h
+
+
+ def downsample(
+     h,
+     num_patches,
+     patch_lengths=None,
+     patch_ids=None,
+     downsampling_by_pooling=None,
+     patch_size=4,
+ ):
+     """
+     Downsampling:
+         a. concatenating embeddings in the patch
+             Note: with dynamic patching, only the last patch_size tokens of each patch are used.
+         b. pooling embeddings in the patch
+     """
+     # input: h.shape = [batch_size, seq_len, dim]
+     # input: pool h.shape = [batch_size, seq_len / patch_size, dim]
+     # if we don't use cross_attn, we pool so that the byte representation is converted to a patch representation
+     if downsampling_by_pooling is not None and len(downsampling_by_pooling) > 0:
+         # By pooling
+         max_num_patches = num_patches
+         assert patch_ids is not None
+         h = pooling_downsample(h, max_num_patches, downsampling_by_pooling, patch_ids)
+     else:
+         # TODO: remove this condition
+         # By concatenating (fixed lengths patching)
+         assert patch_lengths is not None
+         h = concat_downsample(h, patch_lengths, patch_size)
+     return h
+
+
+ def causal_mask(b, h, q_idx, kv_idx):
+     return q_idx >= kv_idx
+
+
+ def create_causal_mask(seqlen, attn_impl, sliding_window):
+     if sliding_window is not None and attn_impl == "xformers":
+         return fmha.attn_bias.LocalAttentionFromBottomRightMask(
+             window_left=sliding_window - 1, window_right=0
+         )
+     elif attn_impl == "xformers":
+         return fmha.attn_bias.LowerTriangularMask()
+     elif attn_impl == "sdpa":
+         return "causal"
+     elif attn_impl == "flex_attention":
+         return create_block_mask(causal_mask, None, None, seqlen, seqlen)
+     elif attn_impl == "fmha":
+         return None
+     else:
+         raise NotImplementedError(
+             f"Attention {attn_impl} with {sliding_window} sliding window not implemented"
+         )
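A tiny worked example (not part of the commit) of patch_reduce and pooling_downsample above, with made-up numbers, showing how patch_ids route byte embeddings into per-patch slots and how multiple pooling modes are concatenated along the feature dimension.

import torch
from bytelatent.model.utils import patch_reduce, pooling_downsample

h = torch.tensor([[[1.0], [2.0], [3.0], [4.0]]])  # bs=1, seq_len=4, dim=1
patch_ids = torch.tensor([[0, 0, 1, 1]])          # first two bytes -> patch 0, last two -> patch 1

print(patch_reduce(h, 2, "mean", patch_ids))                 # tensor([[[1.5], [3.5]]])
print(pooling_downsample(h, 2, "min max", patch_ids).shape)  # torch.Size([1, 2, 2])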