elungky committed
Commit 28451f7 · 0 parent(s)

Initial commit for new Space - pre-built Docker image
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .flake8 +10 -0
  2. .gitattributes +44 -0
  3. .gitignore +247 -0
  4. .gitmodules +27 -0
  5. .pre-commit-config.yaml +55 -0
  6. ATTRIBUTIONS.md +0 -0
  7. CONTRIBUTING.md +51 -0
  8. INSTALL.md +48 -0
  9. LICENSE +201 -0
  10. README.md +248 -0
  11. assets/demo_1.gif +3 -0
  12. assets/demo_2.gif +3 -0
  13. assets/demo_3.gif +3 -0
  14. assets/demo_dynamic.gif +3 -0
  15. assets/diffusion/000000.png +3 -0
  16. assets/diffusion/000001.png +3 -0
  17. assets/diffusion/000002.png +3 -0
  18. assets/diffusion/000003.png +3 -0
  19. assets/diffusion/000004.png +3 -0
  20. assets/diffusion/000005.png +3 -0
  21. assets/diffusion/000006.png +3 -0
  22. assets/diffusion/000007.png +3 -0
  23. assets/diffusion/000008.png +3 -0
  24. assets/diffusion/000009.png +3 -0
  25. assets/diffusion/000010.png +3 -0
  26. assets/diffusion/000011.png +3 -0
  27. assets/diffusion/000012.png +3 -0
  28. assets/diffusion/000013.png +3 -0
  29. assets/diffusion/000014.png +3 -0
  30. assets/diffusion/000015.png +3 -0
  31. checkpoints/README.md +4 -0
  32. cosmos-predict1.yaml +29 -0
  33. cosmos_predict1/__init__.py +14 -0
  34. cosmos_predict1/autoregressive/__init__.py +14 -0
  35. cosmos_predict1/autoregressive/callbacks/video_sampling_teacher_forcing.py +352 -0
  36. cosmos_predict1/autoregressive/configs/__init__.py +14 -0
  37. cosmos_predict1/autoregressive/configs/base/__init__.py +14 -0
  38. cosmos_predict1/autoregressive/configs/base/callbacks.py +33 -0
  39. cosmos_predict1/autoregressive/configs/base/dataloader.py +72 -0
  40. cosmos_predict1/autoregressive/configs/base/dataset.py +39 -0
  41. cosmos_predict1/autoregressive/configs/base/model.py +318 -0
  42. cosmos_predict1/autoregressive/configs/base/model_config.py +718 -0
  43. cosmos_predict1/autoregressive/configs/base/model_parallel.py +33 -0
  44. cosmos_predict1/autoregressive/configs/base/optim.py +86 -0
  45. cosmos_predict1/autoregressive/configs/base/tokenizer.py +139 -0
  46. cosmos_predict1/autoregressive/configs/config.py +111 -0
  47. cosmos_predict1/autoregressive/configs/experiment/video2video/__init__.py +0 -0
  48. cosmos_predict1/autoregressive/configs/experiment/video2video/basic.py +163 -0
  49. cosmos_predict1/autoregressive/configs/inference/inference_config.py +102 -0
  50. cosmos_predict1/autoregressive/configs/registry.py +89 -0
.flake8 ADDED
@@ -0,0 +1,10 @@
+ [flake8]
+ enable-extensions = G
+ select = B,C,E,F,G,P,SIM1,T4,W,B9
+ max-line-length = 120
+ # C408 ignored because we like the dict keyword argument syntax
+ # E501 is not flexible enough, we're using B950 instead
+ ignore =
+     E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,E226,E265
+ exclude =
+     third_party
.gitattributes ADDED
@@ -0,0 +1,44 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ <<<<<<< HEAD
+ assets/*.gif filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ =======
+ >>>>>>> 0453ffbfce197070bb0c254a11ef21f15d1ad986
+ transformer_engine_torch-1.12.0+cu121-cp310-cp310-linux_x86_64.whl filter=lfs diff=lfs merge=lfs -text
+ transformer_engine.whl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,247 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # Misc
+ outputs/
+ checkpoints/*
+ !checkpoints/README.md
+ datasets/*
+ !datasets/README.md
+ apex/
+
+ # Data types
+ *.jit
+ *.pt
+ *.hdr
+ *.webp
+ *.pgm
+ *.tiff
+ *.tif
+ *.tar
+ *.tar.gz
+ *.gz
+ *.pkl
+ *.pt
+ *.bin
+ *.pickle
+ *.txt
+
+ # Other uncheckable file types
+ *.zip
+ *.exe
+ *.dll
+ *.swp
+ *.vscode
+ *.DS_Store
+ *.pyc
+ *Thumbs.db
+ *.patch
+
+ # Credential information that should never be checked in
+ credentials
+ *.secret
+
+ # ------------------------ BELOW IS AUTO-GENERATED FOR PYTHON REPOS ------------------------
+
+ # Byte-compiled / optimized / DLL files
+ **/__pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ results/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.config
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Third party
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # ruff
+ .ruff_cache
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+ CLIP
+ .devcontainer/devcontainer.json
+
+ # Coverage
+ .coverage
+ coverage.xml
+
+ # JUnit Reports
+ report.xml
+
+ # CI-CD
+ temp/
+ envs.txt
+ manifest.json
+
+
+ # locks and t5 temp files
+ *.locks*
+ *.no_exist*
+ *models--t5*
+
+ # OneLogger
+ wandb/
+ onelogger.err
+ onelogger.log
.gitmodules ADDED
@@ -0,0 +1,27 @@
+ [submodule "gui/dependencies/pybind11"]
+     path = gui/dependencies/pybind11
+     url = https://github.com/Tom94/pybind11
+ [submodule "gui/dependencies/glfw"]
+     path = gui/dependencies/glfw
+     url = https://github.com/Tom94/glfw
+ [submodule "gui/dependencies/args"]
+     path = gui/dependencies/args
+     url = https://github.com/Taywee/args
+ [submodule "gui/dependencies/tinylogger"]
+     path = gui/dependencies/tinylogger
+     url = https://github.com/Tom94/tinylogger
+ [submodule "gui/dependencies/imgui"]
+     path = gui/dependencies/imgui
+     url = https://github.com/ocornut/imgui.git
+ [submodule "gui/dependencies/dlss"]
+     path = gui/dependencies/dlss
+     url = https://github.com/NVIDIA/DLSS
+ [submodule "gui/dependencies/OpenXR-SDK"]
+     path = gui/dependencies/OpenXR-SDK
+     url = https://github.com/KhronosGroup/OpenXR-SDK.git
+ [submodule "gui/dependencies/zlib"]
+     path = gui/dependencies/zlib
+     url = https://github.com/Tom94/zlib
+ [submodule "gui/dependencies/fmt"]
+     path = gui/dependencies/fmt
+     url = https://github.com/fmtlib/fmt
.pre-commit-config.yaml ADDED
@@ -0,0 +1,55 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ default_language_version:
+   python: python3.10
+ repos:
+   - repo: https://github.com/pycqa/flake8
+     rev: 6.0.0
+     hooks:
+       - id: flake8
+         args:
+           - --max-line-length=120
+           - --ignore=E501,F401,E203,E402,E265,E741,F841,F821,F811,W503,E231,E225,E702
+         exclude: ^dist/|^third_party/
+
+   - repo: https://github.com/psf/black
+     rev: 23.12.1
+     hooks:
+       - id: black
+         args: [--line-length=120]
+         exclude: ^dist/|^third_party/
+
+   - repo: https://github.com/timothycrosley/isort
+     rev: 5.12.0
+     hooks:
+       - id: isort
+         args: [--line-length=120]
+
+   - repo: https://github.com/MarcoGorelli/absolufy-imports
+     rev: v0.3.1
+     hooks:
+       - id: absolufy-imports
+
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.0.1
+     hooks:
+       - id: trailing-whitespace
+         exclude: ^tests/.*/fixtures/.*
+         args: [--markdown-linebreak-ext=md]
+       - id: end-of-file-fixer
+         exclude: ^tests/.*/fixtures/.*
+       - id: check-added-large-files
+         args: ['--maxkb=2000']
ATTRIBUTIONS.md ADDED
The diff for this file is too large to render. See raw diff
 
CONTRIBUTING.md ADDED
@@ -0,0 +1,51 @@
+ # How to Contribute
+
+ We'd love to receive your patches and contributions. Please keep your PRs as draft until such time that you would like us to review them.
+
+ ## Code Reviews
+
+ All submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose. Consult
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on using pull requests.
+
+ ## Signing Your Work
+
+ * We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
+
+ * Any contribution which contains commits that are not Signed-Off will not be accepted.
+
+ * To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
+ ```bash
+ $ git commit -s -m "Add cool feature."
+ ```
+ This will append the following to your commit message:
+ ```
+ Signed-off-by: Your Name <your@email.com>
+ ```
+
+ * Full text of the DCO:
+
+ ```
+ Developer Certificate of Origin
+ Version 1.1
+
+ Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+ 1 Letterman Drive
+ Suite D4700
+ San Francisco, CA, 94129
+
+ Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
+ ```
+
+ ```
+ Developer's Certificate of Origin 1.1
+
+ By making a contribution to this project, I certify that:
+
+ (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
+
+ (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
+
+ (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
+
+ (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
+ ```
INSTALL.md ADDED
@@ -0,0 +1,48 @@
+ ## Environment setup
+
+ Cosmos runs only on Linux systems. We have tested the installation with Ubuntu 24.04, 22.04, and 20.04.
+ Cosmos requires the Python version to be `3.10.x`. Please also make sure you have `conda` installed ([instructions](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)).
+
+ ### Inference
+
+ The commands below create the `cosmos-predict1` conda environment and install the dependencies for inference:
+ ```bash
+ # Create the cosmos-predict1 conda environment.
+ conda env create --file cosmos-predict1.yaml
+ # Activate the cosmos-predict1 conda environment.
+ conda activate cosmos-predict1
+ # Install the dependencies.
+ pip install -r requirements.txt
+ # Patch Transformer engine linking issues in conda environments.
+ ln -sf $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX/include/
+ ln -sf $CONDA_PREFIX/lib/python3.10/site-packages/nvidia/*/include/* $CONDA_PREFIX/include/python3.10
+ # Install Transformer engine.
+ pip install transformer-engine[pytorch]==1.12.0
+ # Install Apex for inference.
+ git clone https://github.com/NVIDIA/apex
+ CUDA_HOME=$CONDA_PREFIX pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex
+ # Install MoGe for inference.
+ pip install git+https://github.com/microsoft/MoGe.git
+ ```
+
+ * Alternatively, if you are more familiar with a containerized environment, you can build the Dockerfile and run it to get an environment with all the packages pre-installed.
+ This requires Docker to already be present on your system with the [Nvidia Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed.
+
+ ```bash
+ docker build -f Dockerfile . -t nvcr.io/$USER/cosmos-predict1:latest
+ ```
+
+ Note: In case you encounter permission issues while mounting local files inside the docker, you can share the folders from your current directory to all users (including docker) using this helpful alias `alias share='sudo chown -R ${USER}:users $PWD && sudo chmod g+w $PWD'` before running the docker.
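+
+ For example, the image built above can then be started with GPU access and the current checkout mounted; the run flags and mount point below are illustrative and assume the NVIDIA Container Toolkit is installed:
+ ```bash
+ # Illustrative: adjust the mount point and image tag to your setup.
+ docker run --gpus all -it --rm -v $(pwd):/workspace nvcr.io/$USER/cosmos-predict1:latest
+ ```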
+
+
+ You can test the environment setup for inference with
+ ```bash
+ CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) python scripts/test_environment.py
+ ```
+
+ ### Post-training
+
+
+ 🛠️ *Under construction* 👷
+
+ Stay tuned!
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,248 @@
+ ---
+ title: GEN3C Project (from DGX Station)
+ emoji: 🫁
+ colorFrom: green
+ colorTo: blue
+ sdk: docker
+ image: elungky/gen3c:latest
+ # app_port: 7860 # Remove or comment this line as the image handles the port
+ ---
+
+ # GEN3C: 3D-Informed World-Consistent Video Generation with Precise Camera Control
+
+ <!-- Note: this video is hosted by GitHub and gets embedded automatically when viewing in the GitHub UI -->
+
+ https://github.com/user-attachments/assets/247e1719-9f8f-4504-bfa3-f9706bd8682d
+
+
+ **GEN3C: 3D-Informed World-Consistent Video Generation with Precise Camera Control**<br>
+ [Xuanchi Ren*](https://xuanchiren.com/),
+ [Tianchang Shen*](https://www.cs.toronto.edu/~shenti11/),
+ [Jiahui Huang](https://huangjh-pub.github.io/),
+ [Huan Ling](https://www.cs.toronto.edu/~linghuan/),
+ [Yifan Lu](https://yifanlu0227.github.io/),
+ [Merlin Nimier-David](https://merlin.nimierdavid.fr/),
+ [Thomas Müller](https://research.nvidia.com/person/thomas-muller),
+ [Alexander Keller](https://research.nvidia.com/person/alex-keller),
+ [Sanja Fidler](https://www.cs.toronto.edu/~fidler/),
+ [Jun Gao](https://www.cs.toronto.edu/~jungao/) <br>
+ \* indicates equal contribution <br>
+ **[Paper](https://arxiv.org/pdf/2503.03751), [Project Page](https://research.nvidia.com/labs/toronto-ai/GEN3C/), [HuggingFace](https://huggingface.co/collections/nvidia/gen3c-683f3f9540a8f9c98cf46a8d)**
+
+ Abstract: We present GEN3C, a generative video model with precise Camera Control and
+ temporal 3D Consistency. Prior video models already generate realistic videos,
+ but they tend to leverage little 3D information, leading to inconsistencies,
+ such as objects popping in and out of existence. Camera control, if implemented
+ at all, is imprecise, because camera parameters are mere inputs to the neural
+ network which must then infer how the video depends on the camera. In contrast,
+ GEN3C is guided by a 3D cache: point clouds obtained by predicting the
+ pixel-wise depth of seed images or previously generated frames. When generating
+ the next frames, GEN3C is conditioned on the 2D renderings of the 3D cache with
+ the new camera trajectory provided by the user. Crucially, this means that
+ GEN3C neither has to remember what it previously generated nor does it have to
+ infer the image structure from the camera pose. The model, instead, can focus
+ all its generative power on previously unobserved regions, as well as advancing
+ the scene state to the next frame. Our results demonstrate more precise camera
+ control than prior work, as well as state-of-the-art results in sparse-view
+ novel view synthesis, even in challenging settings such as driving scenes and
+ monocular dynamic video. Results are best viewed in videos.
+
+ For business inquiries, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/).
+ For any other questions related to the model, please contact Xuanchi, Tianchang or Jun.
+
+ ## News
+ - 2025-06-06 Code and model released! In a future update, we plan to include the pipeline for jointly predicting depth and camera pose from video, as well as a driving-finetuned model. Stay tuned!
+
+ ## Installation
+ Please follow the "Inference" section in [INSTALL.md](INSTALL.md) to set up your environment.
+
+ ## Inference
+
+ ### Download checkpoints
+ 1. Generate a [Hugging Face](https://huggingface.co/settings/tokens) access token (if you haven't done so already). Set the access token to `Read` permission (default is `Fine-grained`).
+
+ 2. Log in to Hugging Face with the access token:
+ ```bash
+ huggingface-cli login
+ ```
+
+ 3. Download the GEN3C model weights from [Hugging Face](https://huggingface.co/nvidia/GEN3C-Cosmos-7B):
+ ```bash
+ CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) python scripts/download_gen3c_checkpoints.py --checkpoint_dir checkpoints
+ ```
+
+ ### Interactive GUI usage
+
+ <div align="center">
+ <img src="gui/assets/gui_preview.webp" alt="GEN3C interactive GUI" width="1080px"/>
+ </div>
+
+ GEN3C can be used through an interactive GUI, allowing you to visualize the inputs in 3D, author arbitrary camera trajectories, and start inference from a single window.
+ Please see the [dedicated instructions](gui/README.md).
+
+
+ ### Command-line usage
+ GEN3C supports both images and videos as input. Below are examples of running GEN3C on single images and videos with predefined camera trajectory patterns.
+
+ ### Example 1: Single Image to Video Generation
+
+ #### Single GPU
+ Generate a 121-frame video from a single image:
+ ```bash
+ CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) python cosmos_predict1/diffusion/inference/gen3c_single_image.py \
+     --checkpoint_dir checkpoints \
+     --input_image_path assets/diffusion/000000.png \
+     --video_save_name test_single_image \
+     --guidance 1 \
+     --foreground_masking
+ ```
+
+ #### Multi-GPU (8 GPUs)
+ ```bash
+ NUM_GPUS=8
+ CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) torchrun --nproc_per_node=${NUM_GPUS} cosmos_predict1/diffusion/inference/gen3c_single_image.py \
+     --checkpoint_dir checkpoints \
+     --input_image_path assets/diffusion/000000.png \
+     --video_save_name test_single_image_multigpu \
+     --num_gpus ${NUM_GPUS} \
+     --guidance 1 \
+     --foreground_masking
+ ```
+
+ #### Additional Options
+ - To generate longer videos autoregressively, specify the number of frames using `--num_video_frames`. The number of frames must follow the pattern: 121 * N - 1 (e.g., 241, 361, etc.)
+ - To save buffer images alongside the output video, add the `--save_buffer` flag
+ - You can control camera trajectories using `--trajectory`, `--camera_rotation`, and `--movement_distance` arguments. See the "Camera Movement Options" section below for details.
+
+ #### Camera Movement Options
+
+ ##### Trajectory Types
+ The `--trajectory` argument controls the path the camera takes during video generation. Available options:
+
+ | Option | Description |
+ |--------|-------------|
+ | `left` | Camera moves to the left (default) |
+ | `right` | Camera moves to the right |
+ | `up` | Camera moves upward |
+ | `down` | Camera moves downward |
+ | `zoom_in` | Camera moves closer to the scene |
+ | `zoom_out` | Camera moves away from the scene |
+ | `clockwise` | Camera moves in a clockwise circular path |
+ | `counterclockwise` | Camera moves in a counterclockwise circular path |
+
+ ##### Camera Rotation Modes
+ The `--camera_rotation` argument controls how the camera rotates during movement. Available options:
+
+ | Option | Description |
+ |--------|-------------|
+ | `center_facing` | Camera always rotates to look at the (estimated) center of the scene (default) |
+ | `no_rotation` | Camera maintains its original orientation while moving |
+ | `trajectory_aligned` | Camera rotates to align with the direction of movement |
+
+ ##### Movement Distance
+ The `--movement_distance` argument controls how far the camera moves from its initial position. The default value is 0.3. A larger value will result in more dramatic camera movement, while a smaller value will create more subtle movement.
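+
+ For instance, these options can be combined with the single-GPU command from Example 1; the flag values below are purely illustrative:
+ ```bash
+ # Illustrative values: a longer (241-frame) rightward pan with a larger movement distance.
+ CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) python cosmos_predict1/diffusion/inference/gen3c_single_image.py \
+     --checkpoint_dir checkpoints \
+     --input_image_path assets/diffusion/000000.png \
+     --video_save_name test_trajectory_right \
+     --guidance 1 \
+     --trajectory right \
+     --camera_rotation center_facing \
+     --movement_distance 0.5 \
+     --num_video_frames 241
+ ```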
+
+ ##### GPU Memory Requirements
+
+ We have tested GEN3C only on H100 and A100 GPUs. For GPUs with limited memory, you can fully offload all models by appending the following flags to your command:
+
+ ```bash
+ --offload_diffusion_transformer \
+ --offload_tokenizer \
+ --offload_text_encoder_model \
+ --offload_prompt_upsampler \
+ --offload_guardrail_models \
+ --disable_guardrail \
+ --disable_prompt_encoder
+ ```
+ Maximum observed memory during inference with full offloading: ~43GB. Note: Memory usage may vary depending on system specifications and is provided for reference only.
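+
+ As a sketch, a fully offloaded single-GPU run of Example 1 simply appends the flags listed above to the original command (the output name is illustrative):
+ ```bash
+ # Illustrative: Example 1 command with all offloading flags appended.
+ CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) python cosmos_predict1/diffusion/inference/gen3c_single_image.py \
+     --checkpoint_dir checkpoints \
+     --input_image_path assets/diffusion/000000.png \
+     --video_save_name test_single_image_offload \
+     --guidance 1 \
+     --foreground_masking \
+     --offload_diffusion_transformer \
+     --offload_tokenizer \
+     --offload_text_encoder_model \
+     --offload_prompt_upsampler \
+     --offload_guardrail_models \
+     --disable_guardrail \
+     --disable_prompt_encoder
+ ```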
+
+
+ ### Example 2: Video to Video Generation
+ For video input, GEN3C requires additional depth information, camera intrinsics, and extrinsics. These can be obtained using your choice of SLAM packages. For testing purposes, we provide example data.
+
+ First, you need to download the test samples:
+ ```bash
+ # Download test samples from Hugging Face
+ huggingface-cli download nvidia/GEN3C-Testing-Example --repo-type dataset --local-dir assets/diffusion/dynamic_video_samples
+ ```
+
+ #### Single GPU
+ ```bash
+ CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) python cosmos_predict1/diffusion/inference/gen3c_dynamic.py \
+     --checkpoint_dir checkpoints \
+     --input_image_path assets/diffusion/dynamic_video_samples/batch_0000 \
+     --video_save_name test_dynamic_video \
+     --guidance 1
+ ```
+
+ #### Multi-GPU (8 GPUs)
+ ```bash
+ NUM_GPUS=8
+ CUDA_HOME=$CONDA_PREFIX PYTHONPATH=$(pwd) torchrun --nproc_per_node=${NUM_GPUS} cosmos_predict1/diffusion/inference/gen3c_dynamic.py \
+     --checkpoint_dir checkpoints \
+     --input_image_path assets/diffusion/dynamic_video_samples/batch_0000 \
+     --video_save_name test_dynamic_video_multigpu \
+     --num_gpus ${NUM_GPUS} \
+     --guidance 1
+ ```
+
+ ## Gallery
+
+ - **GEN3C** can be easily applied to video/scene creation from a single image
+ <div align="center">
+ <img src="assets/demo_3.gif" alt="" width="1100" />
+ </div>
+
+ - ... or sparse-view images (we use 5 images here)
+ <div align="center">
+ <img src="assets/demo_2.gif" alt="" width="1100" />
+ </div>
+
+
+ - ... and dynamic videos
+ <div align="center">
+ <img src="assets/demo_dynamic.gif" alt="" width="1100" />
+ </div>
+
+ ## Acknowledgement
+ Our model is based on [NVIDIA Cosmos](https://github.com/NVIDIA/Cosmos) and [Stable Video Diffusion](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid).
+
+ We are also grateful to several other open-source repositories that we drew inspiration from or built upon during the development of our pipeline:
+ - [MoGe](https://github.com/microsoft/MoGe)
+ - [TrajectoryCrafter](https://github.com/TrajectoryCrafter/TrajectoryCrafter)
+ - [DimensionX](https://github.com/wenqsun/DimensionX)
+ - [Depth Anything V2](https://github.com/DepthAnything/Depth-Anything-V2)
+ - [Video Depth Anything](https://github.com/DepthAnything/Video-Depth-Anything)
+
+ ## Citation
+ ```
+ @inproceedings{ren2025gen3c,
+     title={GEN3C: 3D-Informed World-Consistent Video Generation with Precise Camera Control},
+     author={Ren, Xuanchi and Shen, Tianchang and Huang, Jiahui and Ling, Huan and
+     Lu, Yifan and Nimier-David, Merlin and Müller, Thomas and Keller, Alexander and
+     Fidler, Sanja and Gao, Jun},
+     booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
+     year={2025}
+ }
+ ```
+
+ ## License and Contact
+
+ This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.
+
+
+ GEN3C source code is released under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0).
+
+ GEN3C models are released under the [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). For a custom license, please visit our website and submit the form: [NVIDIA Research Licensing](https://www.nvidia.com/en-us/research/inquiries/).
+ =======
+ title: Gen3c
+ emoji: 🌍
+ colorFrom: indigo
+ colorTo: blue
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ >>>>>>> 0453ffbfce197070bb0c254a11ef21f15d1ad986
assets/demo_1.gif ADDED

Git LFS Details

  • SHA256: e6162366c56277d084b05a37c617e2994ba75285d421e203556dcff08128b32b
  • Pointer size: 133 Bytes
  • Size of remote file: 14.7 MB
assets/demo_2.gif ADDED

Git LFS Details

  • SHA256: e765e71d3016c6e314b6403f82313a1df42f68f6fb0f9416f197d82e0710f27e
  • Pointer size: 133 Bytes
  • Size of remote file: 10.6 MB
assets/demo_3.gif ADDED

Git LFS Details

  • SHA256: 8c4cf4a4bf62daf03b25ac66c2c3693adbf7cd459e55d3481a65a9ff4a9d09d9
  • Pointer size: 133 Bytes
  • Size of remote file: 35.3 MB
assets/demo_dynamic.gif ADDED

Git LFS Details

  • SHA256: 174faba45ae701eaa432dd14de1297c0479b6c0b832adbc211cbb529fbec6c61
  • Pointer size: 133 Bytes
  • Size of remote file: 24.5 MB
assets/diffusion/000000.png ADDED

Git LFS Details

  • SHA256: b7e6eab7548c2ede900f8b504a5cef981e0cd0ec38af90dbea3f0db860e002c3
  • Pointer size: 132 Bytes
  • Size of remote file: 1.33 MB
assets/diffusion/000001.png ADDED

Git LFS Details

  • SHA256: abe310078829c9e1375ac30c7c270c84c8f68a09f3857bd35c7a5754f3326151
  • Pointer size: 132 Bytes
  • Size of remote file: 1.13 MB
assets/diffusion/000002.png ADDED

Git LFS Details

  • SHA256: 7ad89b53e9fafed0d8eefd1cfc7cc4889c5d2f510ed32d5247c5adab4cb0c622
  • Pointer size: 131 Bytes
  • Size of remote file: 789 kB
assets/diffusion/000003.png ADDED

Git LFS Details

  • SHA256: 22f39915f1b277e70683befbc18ac5859c65c3d389e4dbb5127a539a411fec54
  • Pointer size: 132 Bytes
  • Size of remote file: 1.11 MB
assets/diffusion/000004.png ADDED

Git LFS Details

  • SHA256: e2f957208849c0f86b89545734bb7b243868b574554cb6aeed248b04e7234ad4
  • Pointer size: 132 Bytes
  • Size of remote file: 1.26 MB
assets/diffusion/000005.png ADDED

Git LFS Details

  • SHA256: 267f6ae47d0e2aebda89fac5416bc0915855043131d0d8d8a4fc9506cabd4681
  • Pointer size: 132 Bytes
  • Size of remote file: 1.36 MB
assets/diffusion/000006.png ADDED

Git LFS Details

  • SHA256: 4b6fd098366bcd54bd21a5707ae6d9f78d74c2eefcfbb6919569c0d1741d837f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.21 MB
assets/diffusion/000007.png ADDED

Git LFS Details

  • SHA256: 334733b7428f9521e625a8b310770fbba3e4616ccbe0af625d07e2b065e6e9ad
  • Pointer size: 132 Bytes
  • Size of remote file: 1.15 MB
assets/diffusion/000008.png ADDED

Git LFS Details

  • SHA256: 7eae1abb3343c1e11f4e42172eba85eeed0fb2a5f7701a42e5003cf84f1696cd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.68 MB
assets/diffusion/000009.png ADDED

Git LFS Details

  • SHA256: 2a5c5711d41f56bb307ef6020d0dffec9ce2297bda9ef9ae465237d8347adb34
  • Pointer size: 131 Bytes
  • Size of remote file: 603 kB
assets/diffusion/000010.png ADDED

Git LFS Details

  • SHA256: e4d32f1d1c6d427e421d6f4478d4c2c697cb0406a18ecc3b8ebeeb2a0cbba7f5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
assets/diffusion/000011.png ADDED

Git LFS Details

  • SHA256: e352d7435d3b313fcc47efd9bd0dc6e0dd5d5e8af8c50e965c57987bee1c94ec
  • Pointer size: 131 Bytes
  • Size of remote file: 944 kB
assets/diffusion/000012.png ADDED

Git LFS Details

  • SHA256: b672d43521890b2852976a0c12828ad16b9288277efff6c41189dc0c04c9c6e1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.1 MB
assets/diffusion/000013.png ADDED

Git LFS Details

  • SHA256: eab3a655213eede094889bab94313e1cef142b811429bee9e0f3420c2b013105
  • Pointer size: 132 Bytes
  • Size of remote file: 1.24 MB
assets/diffusion/000014.png ADDED

Git LFS Details

  • SHA256: eb014db53082677aca35a3fc27daa1f306452c5cb7130a4ed6468cae144a0b63
  • Pointer size: 132 Bytes
  • Size of remote file: 1.35 MB
assets/diffusion/000015.png ADDED

Git LFS Details

  • SHA256: a6ac0d4e7eb6d4dbc3ae997fafc28721b716db092aaa52ede11e4d87b3e9b20d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.49 MB
checkpoints/README.md ADDED
@@ -0,0 +1,4 @@
+
+ ### Checkpoint directory
+
+ Model checkpoints will be downloaded to this directory.
cosmos-predict1.yaml ADDED
@@ -0,0 +1,29 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # conda env create --file cosmos-predict1.yaml
+ name: cosmos-predict1
+ channels:
+   - conda-forge
+ dependencies:
+   - python=3.10
+   - pip=25.0
+   - cmake
+   - ninja
+   - gcc=12.4.0
+   - gxx=12.4.0
+   - cuda=12.4
+   - cuda-nvcc=12.4
+   - cuda-toolkit=12.4
cosmos_predict1/__init__.py ADDED
@@ -0,0 +1,14 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
cosmos_predict1/autoregressive/__init__.py ADDED
@@ -0,0 +1,14 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
cosmos_predict1/autoregressive/callbacks/video_sampling_teacher_forcing.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import glob
17
+ import math
18
+ import os
19
+ from typing import Optional
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torchvision
24
+ import torchvision.transforms.functional as torchvision_F
25
+ import wandb
26
+ from einops import rearrange
27
+ from megatron.core import parallel_state
28
+ from torch.distributed import get_process_group_ranks
29
+
30
+ from cosmos_predict1.autoregressive.utils.parallel import (
31
+ broadcast_data_batch_in_tp_cp_group,
32
+ gather_batch_from_cp_ranks,
33
+ get_batch_on_this_cp_rank,
34
+ )
35
+ from cosmos_predict1.callbacks.every_n import EveryN
36
+ from cosmos_predict1.utils import distributed, log, misc
37
+ from cosmos_predict1.utils.model import Model
38
+ from cosmos_predict1.utils.trainer import Trainer
39
+
40
+
41
+ def resize_image(image: torch.Tensor, resize_factor=0.5) -> torch.Tensor:
42
+ _, _, h, w = image.shape
43
+ new_h, new_w = int(resize_factor * h), int(resize_factor * w)
44
+ return torchvision_F.resize(image, (new_h, new_w))
45
+
46
+
47
+ class VideoSamplingTeacherForcing(EveryN):
48
+ def __init__(
49
+ self,
50
+ every_n: int,
51
+ step_size: int = 1,
52
+ video_latent_shape: list = [6, 24, 40],
53
+ num_frames_to_display: int = 4,
54
+ save_folder: Optional[str] = None,
55
+ num_file_to_log: int = 8,
56
+ ):
57
+ r"""
58
+ This callback enables us to perform teacher forcing inference on the training data.
59
+ By teacher forcing, we mean providing ground truth video tokens as inputs, and simply asking the model
60
+ to predict the next tokens. The predicted next tokens are then visualized. This does not perform
61
+ autoregressive sampling.
62
+ We also upload the downsampled video frames to wandb. Downsampling is needed for wandb to work fast.
63
+
64
+ Args:
65
+ every_n (int): Call this callback every_n steps
66
+ step_size (int): Number of steps taken for gradient accumulation. Global iteration number is
67
+ iteration // self.step_size
68
+ video_latent_shape (list): Shape of the video latent
69
+ num_frames_to_display (int): Number of frames to subsample for displaying in wandb
70
+ save_folder (str): Name of the local folder to save the video
71
+ num_file_to_log (int): Number of files to upload to wandb
72
+ """
73
+ super().__init__(every_n, step_size)
74
+ self.save_folder = save_folder if save_folder else self.__class__.__name__
75
+ self.video_latent_shape = video_latent_shape
76
+ self.num_frames_to_display = num_frames_to_display
77
+ self.num_file_to_log = num_file_to_log
78
+ self.rank = distributed.get_rank()
79
+
80
+ def on_train_start(self, model: Model, iteration: int = 0) -> None:
81
+ config_job = self.config.job
82
+ self.local_dir = f"{config_job.path_local}/{self.save_folder}"
83
+ if self.rank == 0:
84
+ os.makedirs(self.local_dir, exist_ok=True)
85
+ log.info(f"Video Teacher-Forcing Callback: local_dir: {self.local_dir}")
86
+
87
+ @torch.inference_mode()
88
+ def every_n_impl(
89
+ self,
90
+ trainer: Trainer,
91
+ model: Model,
92
+ data_batch: dict[str, torch.Tensor],
93
+ output_batch: dict[str, torch.Tensor],
94
+ loss: torch.Tensor,
95
+ iteration: int,
96
+ ) -> None:
97
+ # Tokenize the data
98
+
99
+ broadcast_data_batch_in_tp_cp_group(data_batch)
100
+
101
+ input_vid = data_batch[model.tokenizer.tokenizer_config.video_tokenizer.data_key]
102
+
103
+ dataset_name = data_batch.get("dataset_name", None)
104
+ if dataset_name is not None and dataset_name.startswith("image"):
105
+ # we disable the callback if the input video is an image batch
106
+ log.info(f"dataset_name is {dataset_name}, skip this callback")
107
+ return
108
+
109
+ # get the caption
110
+ captions = data_batch.get("caption", None)
111
+
112
+ # get the context embedding and mask
113
+ context = data_batch.get("context", None)
114
+ context_mask = data_batch.get("context_mask", None)
115
+ if context is not None:
116
+ context = misc.to(context, "cuda").detach().clone()
117
+ if context_mask is not None:
118
+ context_mask = misc.to(context_mask, "cuda").detach().clone()
119
+ # get the action
120
+ action = data_batch.get("action", None)
121
+ if action is not None:
122
+ action = misc.to(action, "cuda").detach().clone()
123
+
124
+ # Input tokens
125
+ tokens, _ = model.tokenizer.tokenize(data_batch)
126
+ tokens = misc.to(tokens, "cuda").detach().clone()
127
+ skip_save_file = False
128
+ if parallel_state.get_context_parallel_world_size() > 1:
129
+ cp_group = parallel_state.get_context_parallel_group()
130
+ if self.rank != min(get_process_group_ranks(cp_group)):
131
+ skip_save_file = True
132
+ tokens = get_batch_on_this_cp_rank(tokens)
133
+ if parallel_state.get_tensor_model_parallel_world_size() > 1:
134
+ # Tensor parallelism is enabled: only the lowest-rank member of the TP group saves files
135
+ tp_group = parallel_state.get_tensor_model_parallel_group()
136
+ if self.rank != min(get_process_group_ranks(tp_group)):
137
+ skip_save_file = True
138
+ tokens_encoded_in_train = output_batch["encode_tokens"].detach()
139
+ percent_token_diff = (tokens != tokens_encoded_in_train).float().mean()
140
+ percent_token_diff = distributed.dist_reduce_tensor(percent_token_diff)
141
+
142
+ input_tokens = tokens
143
+
144
+ num_tokens_to_generate = np.prod(self.video_latent_shape)
145
+
146
+ # Do a forward pass
147
+ logits = model.model.forward(
148
+ tokens,
149
+ input_pos=None,
150
+ context=context,
151
+ context_mask=context_mask,
152
+ action=action,
153
+ )
154
+ if parallel_state.get_context_parallel_world_size() > 1:
155
+ logits = gather_batch_from_cp_ranks(logits)
156
+ input_tokens = gather_batch_from_cp_ranks(input_tokens)
157
+
158
+ # Start position for video tokens in the vocabulary
159
+ video_token_start = self.config.model.tokenizer_config.video_tokenizer.tokenizer_offset
160
+ video_vocab_size = self.config.model.tokenizer_config.video_tokenizer.vocab_size
161
+
162
+ # Clip the logits to the video portion of the vocabulary, dropping the text-vocab predictions,
163
+ # so that the sampled tokens always index into the video codebook.
164
+ logits = logits[:, :, video_token_start : video_token_start + video_vocab_size]
165
+
166
+ # Sample tokens greedily via argmax, which is sufficient for the teacher-forcing experiment.
167
+ logits = logits.contiguous()
168
+ generations = torch.argmax(logits, dim=-1)
169
+
170
+ # For each video in the batch, subsample frames for display
171
+ batch_size = input_tokens.shape[0]
172
+ out_frames = []
173
+ out_videos_gen = []
174
+ out_videos_rec = []
175
+ out_videos_gt = []
176
+ # log the accuracy of teacher-forcing
177
+ acc = []
178
+ loss_list = []
179
+
180
+ for sample_num in range(batch_size):
181
+ # Subsample the generations to the video part.
182
+ # This corresponds to the part from begin of video to end of video.
183
+ bov_token = model.tokenizer.video_special_tokens["<|begin_of_video|>"]
184
+ bov_index = input_tokens[sample_num] == bov_token
185
+ use_special_token = sum(bov_index) != 0
186
+ if use_special_token:
187
+ bov_index = bov_index.nonzero().item()
188
+ # generations: <bov> real_token1 real_token2, ... real_token7680; total 7680
189
+ # gen_video_tokens: real_token1 real_token2, ..., real_token7680; total 7680
190
+ # for vis: real_token1 real_token2, ..., real_token7680; total 7680
191
+ # for accuracy: real_token1 real_token2, ..., real_token7680; total 7680
192
+ gen_video_tokens = generations[sample_num][bov_index : bov_index + num_tokens_to_generate]
193
+ gen_video_tokens_vis = gen_video_tokens
194
+ gen_video_tokens_acc = gen_video_tokens
195
+ logits_loss = logits[sample_num][bov_index : bov_index + num_tokens_to_generate]
196
+ else:
197
+ # generations: real_token1 real_token2, ... real_token7680
198
+ # gen_video_tokens: real_token2 real_token3, ..., real_token7680; total 7679
199
+ # We need different tokens for vis and accuracy compute
200
+ # for acc: real_token2 real_token3, ..., real_token7680; total 7679
201
+ # for vis: pad_token (real_token2, ..., real_token7680); total 1 + 7679
202
+ gen_video_tokens = generations[sample_num][
203
+ : num_tokens_to_generate - 1
204
+ ] # remove the last token since there is no gt
205
+ # Since the first token is not predicted, we need to add the gt first token to make sure the shape is correct
206
+ gen_video_tokens_vis = torch.cat([input_tokens[sample_num][0:1], gen_video_tokens])
207
+ gen_video_tokens_acc = gen_video_tokens
208
+ logits_loss = logits[sample_num][: num_tokens_to_generate - 1]
209
+
210
+ # Rearrange the video to a spatial tensor
211
+ gen_video_tokens_vis_BTHW = rearrange(
212
+ gen_video_tokens_vis.unsqueeze(0),
213
+ "B (T H W) -> B T H W",
214
+ T=self.video_latent_shape[0],
215
+ H=self.video_latent_shape[1],
216
+ W=self.video_latent_shape[2],
217
+ )
218
+
219
+ # for real videos, we need to skip the bov and eov tokens for decoding
220
+ if use_special_token:
221
+ # input_tokens: <bov> real_token1 real_token2 ... <eov> <eov> ...
222
+ # real_video_tokens: real_token1 real_token2 ... real_token7680; total 7680
223
+ # for vis: real_token1 real_token2 ... real_token7680; total 7680
224
+ # for accuracy: real_token1 real_token2 ... real_token7680; total 7680; we include real_token1 since the output prediction also includes it, see gen_video_tokens_acc above
225
+ real_video_tokens = (
226
+ input_tokens[sample_num][bov_index + 1 : bov_index + num_tokens_to_generate + 1] - video_token_start
227
+ )
228
+ real_video_tokens_vis = real_video_tokens
229
+ real_video_tokens_acc = real_video_tokens
230
+ else:
231
+ # input_tokens: real_token1 real_token2,... real_token7680; total 7680
232
+ # real_video_tokens: real_token1 real_token2,... real_token7680; total 7680
233
+ # for acc: gt start from real_token2, real_token3; total 7679, remove the first token since it is not predicted
234
+ # for vis: gt start from real_token1, real_token2; total 7680
235
+ real_video_tokens = input_tokens[sample_num][:num_tokens_to_generate] - video_token_start
236
+ real_video_tokens_vis = real_video_tokens
237
+ real_video_tokens_acc = real_video_tokens[1:].flatten()
238
+
239
+ real_video_tokens_vis_BTHW = rearrange(
240
+ real_video_tokens_vis.unsqueeze(0),
241
+ "B (T H W) -> B T H W",
242
+ T=self.video_latent_shape[0],
243
+ H=self.video_latent_shape[1],
244
+ W=self.video_latent_shape[2],
245
+ )
246
+ # Calculate accuracy
247
+ correct_predictions = (gen_video_tokens_acc == real_video_tokens_acc).float()
248
+ labels = real_video_tokens_acc.clone()
249
+
250
+ if model.config.ignore_first_num_tokens > 0:
251
+ labels[: model.config.ignore_first_num_tokens] = model.tokenizer.ignore_index
252
+ select_index = labels != model.tokenizer.ignore_index
253
+ correct_predictions = correct_predictions[select_index]
254
+
255
+ loss = torch.nn.functional.cross_entropy(
256
+ logits_loss, labels, ignore_index=model.tokenizer.ignore_index, reduction="none"
257
+ )
258
+ acc.append(correct_predictions.mean() * 100.0)
259
+ loss_list.append(loss.mean())
260
+
261
+ # Decode the predicted latents
262
+ if model.tokenizer.tokenizer_config.video_tokenizer.temporal_overlap == 0:
263
+ vid_decoded = model.tokenizer.video_tokenizer.decode(gen_video_tokens_vis_BTHW.cuda())
264
+ else:
265
+ vid_decoded = model.tokenizer.video_tokenizer.decode_with_overlap(
266
+ gen_video_tokens_vis_BTHW.cuda(),
267
+ temporal_overlap=model.tokenizer.tokenizer_config.video_tokenizer.temporal_overlap,
268
+ )
269
+ # normalize decoded images from [-1, 1] to [0, 1], and clip value
270
+ vid_decoded = (vid_decoded * 0.5 + 0.5).clamp_(0, 1)
271
+ vid_decoded = vid_decoded[0]
272
+
273
+ # Decode the GT latents
274
+ if model.tokenizer.tokenizer_config.video_tokenizer.temporal_overlap == 0:
275
+ vid_rec = model.tokenizer.video_tokenizer.decode(real_video_tokens_vis_BTHW.cuda())
276
+ else:
277
+ vid_rec = model.tokenizer.video_tokenizer.decode_with_overlap(
278
+ real_video_tokens_vis_BTHW.cuda(),
279
+ temporal_overlap=model.tokenizer.tokenizer_config.video_tokenizer.temporal_overlap,
280
+ )
281
+ # normalize decoded image from [-1, 1] to [0, 1], and clip value
282
+ vid_rec = (vid_rec * 0.5 + 0.5).clamp_(0, 1)
283
+ vid_rec = vid_rec[0]
284
+
285
+ vid_input = input_vid[sample_num] # [-1, 1], input_vid shape: [B, C, L, H, W]
286
+ vid_input = (vid_input * 0.5 + 0.5).clamp_(0, 1).cuda() # Convert to [0, 1], [C, L, H, W]
287
+
288
+ # Subsample real and generated video frames
289
+ input_video_frames = vid_input.transpose(0, 1) # [L, C, H, W]
290
+ rec_video_frames = vid_rec.transpose(0, 1)
291
+ gen_video_frames = vid_decoded.transpose(0, 1)
292
+ out_videos_gen.append(gen_video_frames)
293
+ out_videos_rec.append(rec_video_frames)
294
+ out_videos_gt.append(input_video_frames)
295
+
296
+ stride = math.ceil(rec_video_frames.shape[0] / self.num_frames_to_display)
297
+
298
+ input_video_frames_subsampled = resize_image(input_video_frames[0::stride], resize_factor=0.5)
299
+ input_video_frames_subsampled = torchvision.utils.make_grid(
300
+ input_video_frames_subsampled, nrow=input_video_frames_subsampled.shape[0]
301
+ )
302
+
303
+ gt_video_frames_subsampled = resize_image(rec_video_frames[0::stride], resize_factor=0.5)
304
+ gt_video_frames_subsampled = torchvision.utils.make_grid(
305
+ gt_video_frames_subsampled, nrow=gt_video_frames_subsampled.shape[0]
306
+ )
307
+ gen_video_frames_subsampled = resize_image(gen_video_frames[0::stride], resize_factor=0.5)
308
+ gen_video_frames_subsampled = torchvision.utils.make_grid(
309
+ gen_video_frames_subsampled, nrow=gen_video_frames_subsampled.shape[0]
310
+ )
311
+
312
+ out_frames.append(input_video_frames_subsampled)
313
+ out_frames.append(gt_video_frames_subsampled)
314
+ out_frames.append(gen_video_frames_subsampled)
315
+
316
+ scaled_num_rank_to_log = (
317
+ self.num_file_to_log
318
+ * parallel_state.get_context_parallel_world_size()
319
+ * parallel_state.get_tensor_model_parallel_world_size()
320
+ )
321
+ if self.rank < scaled_num_rank_to_log and not skip_save_file:
322
+ local_path = f"{self.local_dir}/vid_teacher_forcing_iter_{iteration:09d}_{self.rank:04d}.jpg"
323
+ out_image_grid = torchvision.utils.make_grid(out_frames, nrow=1, padding=0, normalize=False)
324
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
325
+ torchvision.utils.save_image(out_image_grid, local_path)
326
+
327
+ # Log to wandb
328
+ avg_acc = distributed.dist_reduce_tensor(torch.stack(acc).mean()).item()
329
+ avg_loss = distributed.dist_reduce_tensor(torch.stack(loss_list).mean()).item()
330
+ log_info = ""
331
+ if "acc" in output_batch:
332
+ log_info = f"train acc: {(output_batch['acc'].mean().item()):.6f}%"
333
+ if percent_token_diff is not None:
334
+ log_info += f"; percent_token_diff_train_val: {percent_token_diff.item() * 100:.6f}%"
335
+ log.info(
336
+ f"Eval iteration {iteration} teacher-forcing accuracy: {avg_acc:.6f}%, loss: {avg_loss:.4f}; {log_info}"
337
+ )
338
+ if self.rank == 0 and wandb.run:
339
+ local_files = glob.glob(f"{self.local_dir}/vid_teacher_forcing_iter_{iteration:09d}_*.jpg")
340
+ local_files = sorted(local_files)[: self.num_file_to_log]
341
+ if captions is None:
342
+ captions = ["vid_frames_teacher_forcing"] * len(local_files)
343
+ for local_path, caption in zip(local_files, captions):
344
+ wandb.log(
345
+ {"frames": [wandb.Image(local_path, caption=caption)]},
346
+ step=iteration,
347
+ )
348
+
349
+ wandb.log({"eval/teacher_forcing_acc": avg_acc}, step=iteration)
350
+ wandb.log({"eval/teacher_forcing_loss": avg_loss}, step=iteration)
351
+ if percent_token_diff is not None:
352
+ wandb.log({"eval/percent_token_diff_train_val": percent_token_diff.item() * 100}, step=iteration)
cosmos_predict1/autoregressive/configs/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
cosmos_predict1/autoregressive/configs/base/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
cosmos_predict1/autoregressive/configs/base/callbacks.py ADDED
@@ -0,0 +1,33 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from cosmos_predict1.autoregressive.callbacks.video_sampling_teacher_forcing import VideoSamplingTeacherForcing
17
+ from cosmos_predict1.callbacks.grad_clip import GradClip
18
+ from cosmos_predict1.utils.callback import ProgressBarCallback
19
+ from cosmos_predict1.utils.lazy_config import LazyCall as L
20
+
21
+ BASIC_CALLBACKS = dict(
22
+ progress_bar=L(ProgressBarCallback)(),
23
+ grad_clip=L(GradClip)(clip_norm=1.0, fsdp_enabled="${model.model_config.fsdp_enabled}", model_key="model"),
24
+ )
25
+
26
+ VIDEO_TEACHER_FORCING_CALLBACK = dict(
27
+ vid_sampling_tf=L(VideoSamplingTeacherForcing)(
28
+ every_n=500,
29
+ video_latent_shape="${model.model_config.video_latent_shape}",
30
+ num_frames_to_display=4,
31
+ save_folder="video_sampling_teacher_forcing",
32
+ )
33
+ )
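These dictionaries are presumably merged into an experiment's callback set; a hypothetical composition (the exact wiring lives in the experiment configs, not in this file):

    callbacks = {**BASIC_CALLBACKS, **VIDEO_TEACHER_FORCING_CALLBACK}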
cosmos_predict1/autoregressive/configs/base/dataloader.py ADDED
@@ -0,0 +1,72 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from megatron.core import parallel_state
17
+ from torch.utils.data import DataLoader, DistributedSampler
18
+
19
+ from cosmos_predict1.autoregressive.configs.base.dataset import VideoDatasetConfig
20
+ from cosmos_predict1.autoregressive.datasets.video_dataset import VideoDataset
21
+ from cosmos_predict1.utils import log
22
+ from cosmos_predict1.utils.lazy_config import LazyCall as L
23
+
24
+ DATALOADER_OPTIONS = {}
25
+
26
+
27
+ def get_sampler(dataset):
28
+ return DistributedSampler(
29
+ dataset,
30
+ num_replicas=parallel_state.get_data_parallel_world_size(),
31
+ rank=parallel_state.get_data_parallel_rank(),
32
+ shuffle=True,
33
+ seed=0,
34
+ )
35
+
36
+
37
+ def dataloader_register(key):
38
+ log.info(f"registering dataloader {key}...")
39
+
40
+ def decorator(func):
41
+ DATALOADER_OPTIONS[key] = func
42
+ return func
43
+
44
+ return decorator
45
+
46
+
47
+ @dataloader_register("tealrobot_video")
48
+ def get_tealrobot_video(
49
+ batch_size: int = 1,
50
+ dataset_dir: str = "datasets/cosmos_nemo_assets/videos/",
51
+ sequence_interval: int = 1,
52
+ num_frames: int = 33,
53
+ video_size: list[int, int] = [640, 848],
54
+ start_frame_interval: int = 1,
55
+ ):
56
+ dataset = L(VideoDataset)(
57
+ config=VideoDatasetConfig(
58
+ dataset_dir=dataset_dir,
59
+ sequence_interval=sequence_interval,
60
+ num_frames=num_frames,
61
+ video_size=video_size,
62
+ start_frame_interval=start_frame_interval,
63
+ )
64
+ )
65
+ return L(DataLoader)(
66
+ dataset=dataset,
67
+ sampler=L(get_sampler)(dataset=dataset),
68
+ batch_size=batch_size,
69
+ drop_last=True,
70
+ pin_memory=True,
71
+ num_workers=8,
72
+ )
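The decorator simply records each factory in DATALOADER_OPTIONS under a string key, so experiments can select a dataloader by name. A sketch of the lookup (the `instantiate` helper for materializing LazyCall configs is an assumption about the surrounding LazyConfig workflow):

    # Build the lazy dataloader config by name, overriding a few arguments.
    dataloader_cfg = DATALOADER_OPTIONS["tealrobot_video"](batch_size=2, num_frames=33)
    # dataloader = instantiate(dataloader_cfg)  # hypothetical materialization step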
cosmos_predict1/autoregressive/configs/base/dataset.py ADDED
@@ -0,0 +1,39 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Dataset config class."""
17
+
18
+ import attrs
19
+
20
+ from cosmos_predict1.utils.config import make_freezable
21
+
22
+
23
+ @make_freezable
24
+ @attrs.define(slots=False)
25
+ class VideoDatasetConfig:
26
+ """
27
+ Args:
28
+ dataset_dir (str): Base path to the dataset directory
29
+ sequence_interval (int): Interval between sampled frames in a sequence
30
+ num_frames (int): Number of frames to load per sequence
31
+ video_size (list): Target size [H,W] for video frames
32
+ start_frame_interval (int): Interval between starting frames of sequences
33
+ """
34
+
35
+ dataset_dir: str = "datasets/cosmos_nemo_assets/videos/"
36
+ sequence_interval: int = 1
37
+ num_frames: int = 33
38
+ video_size: list[int, int] = [640, 848]
39
+ start_frame_interval: int = 1
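Since this is a plain attrs config, individual fields can be overridden at construction time; a small example (paths are placeholders):

    cfg = VideoDatasetConfig(dataset_dir="datasets/my_videos/", num_frames=17, video_size=[384, 640])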
cosmos_predict1/autoregressive/configs/base/model.py ADDED
@@ -0,0 +1,318 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Optional
17
+
18
+ import attrs
19
+
20
+ from cosmos_predict1.autoregressive.configs.base.tokenizer import TokenizerConfig
21
+ from cosmos_predict1.utils import config
22
+
23
+ _ACTION_DIM = 8
24
+ from cosmos_predict1.utils.lazy_config import LazyDict
25
+
26
+
27
+ @attrs.define
28
+ class ModelConfig:
29
+ """
30
+ A class to hold model configuration arguments.
31
+
32
+ Args:
33
+ dim (int): The dimensionality of the input and output of each transformer block.
34
+ n_layers (int): Number of layers in the transformer.
35
+ n_heads (int): Number of attention heads.
36
+ n_kv_heads (Optional[int]): Number of key-value heads. If None, defaults to n_heads. Note: this is equivalent to
37
+ `num_gqa_groups` in TransformerEngine, where GQA means Grouped Query Attention.
38
+ head_dim (Optional[int]): Dimensionality of each head. If None, defaults to dim // n_heads.
39
+ vocab_size (int): Vocabulary size.
40
+ ffn_hidden_size (int): Hidden size for feedforward network.
41
+ norm_eps (float): Epsilon value for normalization.
42
+ rope_theta (float): Theta value for rotary positional embeddings.
43
+ apply_abs_pos_emb (bool): Whether to apply absolute position embeddings.
44
+ max_batch_size (int): Maximum batch size for inference.
45
+ max_seq_len (int): Maximum sequence length for input text.
46
+ fuse_qkv (bool): Whether to fuse QKV in attention. Defaults to True.
47
+ causal_mask (bool): Whether to use causal mask. Defaults to True.
48
+ norm_type (str): Type of normalization layer. Choices: "rmsnorm", "fused_rmsnorm", "layernorm", "np_layernorm".
49
+ precision (str): Data type for the model.
50
+ use_qk_normalization (bool): Whether to enable QK normalization.
51
+ tensor_model_parallel_size (int): Tensor model parallel size. Defaults to 1.
52
+ ckpt_dir (str): Checkpoint directory.
53
+ ckpt_path (str): Checkpoint path.
54
+ apply_yarn (Optional[bool]): Whether to apply YaRN (long-context extension).
55
+ yarn_scale (Optional[float]): Scale factor for YaRN.
56
+ yarn_beta_fast (Optional[int]): Beta fast variable for YaRN (i.e., low_freq_factor in Llama 3.1 RoPE scaling code)
57
+ yarn_beta_slow (Optional[int]): Beta slow variable for YaRN (i.e., high_freq_factor in Llama 3.1 RoPE scaling code)
58
+ original_seq_len (Optional[int]): Original sequence length.
59
+ vision_encoder (Optional[str]): Vision encoder name.
60
+ mm_projector (Optional[str]): Multi-modal projector name.
61
+ vision_encoder_in_channels (Optional[int]): Number of channels in the input image for the vision encoder. Defaults to 3; it may be set larger, e.g. 4 for images whose last channel is an alpha channel.
62
+ rope_dim (Optional[str]): Dimensionality of the RoPE. Choices: "1D", "3D".
63
+ pytorch_rope_version (Optional[str]): Version of the PyTorch RoPE implementation. Choices: "v1", "v2".
64
+ original_latent_shape (Optional[list]): Original shape of the latent tensor needed for rope extension.
65
+ pad_to_multiple_of (Optional[int]): Pad the position embedding to a multiple of this value.
66
+ tokenizer (Optional[TokenizerConfig]): Tokenizer configuration attached to the model.
67
+ insert_cross_attn (bool): Whether to insert the cross-attention layers after each multi-head self-attention (MSA) layer.
68
+ insert_cross_attn_every_k_layers (int): Insert cross-attention layers every k TransformerLayers.
69
+ context_dim (Optional[int]): The dimensionality of cross-attention embedding, e.g., T5 embed feature dim.
70
+ num_video_frames (Optional[int]): Number of video frames.
71
+ video_height (Optional[int]): Raw video pixel height dimension.
72
+ video_width (Optional[int]): Raw video pixel width dimension.
73
+ video_latent_shape (Optional[list]): Video tokenizer output dimension, in (T,H,W).
74
+ """
75
+
76
+ dim: int = attrs.field(default=4096)
77
+ n_layers: int = attrs.field(default=32)
78
+ n_heads: int = attrs.field(default=32)
79
+ n_kv_heads: Optional[int] = attrs.field(default=8)
80
+ head_dim: Optional[int] = attrs.field(default=None)
81
+ vocab_size: int = attrs.field(default=128256)
82
+ ffn_hidden_size: int = attrs.field(default=14336)
83
+ norm_eps: float = attrs.field(default=1e-5)
84
+ rope_theta: float = attrs.field(default=500000)
85
+ apply_abs_pos_emb: bool = attrs.field(default=False)
86
+ max_batch_size: int = attrs.field(default=1)
87
+ max_seq_len: int = attrs.field(default=8192)
88
+ fuse_qkv: bool = attrs.field(default=False)
89
+ causal_mask: bool = attrs.field(default=True)
90
+ norm_type: str = attrs.field(default="rmsnorm")
91
+ precision: str = attrs.field(default="bfloat16")
92
+ use_qk_normalization: bool = False
93
+ tokenizer: Optional[TokenizerConfig] = None
94
+ tensor_model_parallel_size: int = attrs.field(default=1)
95
+ ckpt_dir: Optional[str] = attrs.field(default=None)
96
+ ckpt_path: Optional[str] = attrs.field(
97
+ default=None
98
+ ) # If not None, load the model from this path instead of ckpt_dir
99
+ apply_yarn: Optional[bool] = attrs.field(default=False)
100
+ yarn_scale: Optional[float] = attrs.field(default=None)
101
+ yarn_beta_fast: Optional[int] = attrs.field(default=None)
102
+ yarn_beta_slow: Optional[int] = attrs.field(default=None)
103
+ original_seq_len: Optional[int] = attrs.field(default=None)
104
+ vision_encoder: Optional[str] = attrs.field(default=None)
105
+ vision_encoder_in_channels: Optional[int] = attrs.field(default=3)
106
+ mm_projector: Optional[str] = attrs.field(default=None)
107
+ rope_dim: Optional[str] = attrs.field(default="1D")
108
+ pytorch_rope_version: Optional[str] = attrs.field(default="v2")
109
+ original_latent_shape: Optional[list] = None
110
+ pad_to_multiple_of: Optional[int] = None
111
+ vision_encoder_in_channels: Optional[int] = attrs.field(default=3)
112
+ insert_cross_attn: bool = False
113
+ insert_cross_attn_every_k_layers: int = 1
114
+ context_dim: Optional[int] = attrs.field(default=1024)
115
+ # For video training
116
+ num_video_frames: Optional[int] = None
117
+ # Raw video pixel dimension
118
+ video_height: Optional[int] = None
119
+ video_width: Optional[int] = None
120
+ # Video tokenizer output dimension, in (T, H, W): computed as num_video_frames / temporal_compression_factor, video_height / spatial_compression_factor, video_width / spatial_compression_factor
121
+ video_latent_shape: Optional[list] = None
122
+
123
+ def __getitem__(self, item):
124
+ return getattr(self, item)
125
+
126
+
127
+ @attrs.define
128
+ class TrainingModelConfig:
129
+ """
130
+ A class to hold model configuration arguments.
131
+
132
+ Args:
133
+ dim (int): The dimensionality of the input and output of each transformer block.
134
+ n_layers (int): Number of layers in the transformer.
135
+ n_heads (int): Number of attention heads.
136
+ n_kv_heads (Optional[int]): Number of key-value heads. If None, defaults to n_heads. Note: this is equivalent to
137
+ `num_gqa_groups` in TransformerEngine, where GQA means Grouped Query Attention.
138
+ head_dim (Optional[int]): Dimensionality of each head. If None, defaults to dim // n_heads.
139
+ vocab_size (int): Vocabulary size.
140
+ multiple_of (int): Ensures the hidden layer size is a multiple of this value for SwiGLU activation.
141
+ ffn_dim_multiplier (Optional[float]): Multiplier for feedforward network dimension.
142
+ ffn_hidden_size (Optional[int]): Hidden size for feedforward network. If None, use ffn_dim_multiplier to compute it.
143
+ norm_eps (float): Epsilon value for normalization.
144
+ rope_theta (float): Theta value for rotary positional embeddings.
145
+ apply_abs_pos_emb (bool): Whether to apply absolute position embeddings.
146
+ max_batch_size (int): Maximum batch size for inference (determines KV cache size).
147
+ max_seq_len (int): Maximum sequence length for input text (determines KV cache size).
148
+ fuse_qkv (bool): Whether to fuse QKV in attention. Flag for the pytorch backend.
149
+ causal_mask (bool): Whether to use causal mask. Defaults to True.
150
+ flash_attn (bool): Whether to use Flash attention.
151
+ norm_type (str): Type of normalization layer. Choices: "rmsnorm", "fused_rmsnorm", "layernorm", "np_layernorm".
152
+ backend (str): Backend for the model.
153
+ precision (str): Data type for the model.
154
+ ema (config.EMAConfig): Configuration for exponential moving average.
155
+ embedding_dropout(float): Dropout rate for the embedding layer.
156
+ attention_dropout(float): Dropout rate for attention.
157
+ hidden_dropout(float): Dropout after the attention and feed-forward layers (following TransformerEngine's
158
+ implementation in its TransformerLayer class).
159
+ use_qk_normalization (bool): Whether to enable QK normalization.
160
+ inference (bool): Whether the model is used for inference.
161
+ act_ckpt_enabled (bool): Whether to enable activation checkpointing.
162
+ fsdp_enabled (bool): Whether to enable FSDP.
163
+ fsdp (LazyDict): Configuration for FSDP.
164
+ ckpt_dir (str): Checkpoint directory.
165
+ ckpt_path (str): Checkpoint path.
166
+ cache_dir (str): Cache directory.
167
+ apply_yarn (Optional[bool]): Whether to apply YaRN (long-context extension).
168
+ yarn_scale (Optional[float]): Scale factor for YaRN.
169
+ yarn_beta_fast (Optional[int]): Beta fast variable for YaRN (i.e., low_freq_factor in Llama 3.1 RoPE scaling code)
170
+ yarn_beta_slow (Optional[int]): Beta slow variable for YaRN (i.e., high_freq_factor in Llama 3.1 RoPE scaling code)
171
+ original_seq_len (Optional[int]): Original sequence length.
172
+ depth_init (bool): If `True`, then each transformer block init uses its layer ID, and if `False`, each uses the
173
+ total number of transformer blocks. Defaults to `True` (following the TorchTitan implementation of Llama3).
174
+ context_parallel_size (int): Context parallel size. Defaults to 1.
175
+ tensor_model_parallel_size (int): Tensor model parallel size. Defaults to 1.
176
+ sequence_parallel (bool): Whether to use sequence parallelism. Defaults to False.
177
+ set_parallel_mode (bool): It is a boolean flag used by TransformerEngine to handle Tensor Parallelism.
178
+ Essentially, it is equivalent to `tensor_model_parallel_size > 1`. Defaults to `False`.
179
+ attention_tp (bool): Whether to use tensor parallelism for attention layers.
180
+ mm_projector (Optional[str]): Multimodal projector used for vision-language modeling. Defaults to None.
181
+ Choices: "identity", "linear", "mlp", "mlp_downsample".
182
+ video_latent_shape (Optional[list]): Shape of the video latent tensor. [T, H, W]
183
+ image_latent_shape (Optional[list]): Shape of the image latent tensor. [H, W]
184
+ num_video_frames (Optional[int]): Number of video frames.
185
+ rope_dim (Optional[str]): Dimensionality of the RoPE. Choices: "1D", "2D", "3D".
186
+ pytorch_rope_version (Optional[str]): Version of the RoPE for the `pytorch` backend. "v1" is the Llama implementation, and "v2" is HuggingFace/TransformerEngine implementation.
187
+ original_latent_shape (Optional[list]): Original shape of the latent tensor needed for rope extension.
188
+ pad_to_multiple_of (Optional[int]): Pad the position embedding to a multiple of this value.
189
+ peft_last_n_layers (Optional[int]): Number of last few layers to fine-tune in Parameter Efficient Fine-Tuning (PEFT). When this and peft_every_n_layers are both 0, it means all layers are fine-tuned (FFT).
190
+ peft_every_n_layers (Optional[int]): In Parameter Efficient Fine-Tuning (PEFT), every n layers are unfrozen and can be trained (in flamingo style). When this and peft_last_n_layers are both 0,
191
+ it means all layers are fine-tuned (FFT). For example, for a 40 layer model, n=8 means training layers 7, 15, 23, 31, 39, which includes the final layer.
192
+ It is advised to pick n such that the final layer is included.
193
+ freeze_vision_encoder (bool): Whether to freeze the vision encoder in vision-language model training. Defaults to False.
194
+ vision_encoder_in_channels (Optional[int]): Number of channels in the input image for the vision encoder. Defaults to 3; it may be set larger, e.g. 4 for images whose last channel is an alpha channel.
195
+ insert_cross_attn (bool): Whether to insert the cross-attention layers after each multi-head self-attention (MSA) layer.
196
+ insert_cross_attn_every_k_layers (int): Insert cross-attention layers every k TransformerLayers.
197
+ context_dim (Optional[int]): The dimensionality of cross-attention embedding, e.g., T5 embed feature dim.
198
+ finetune_layers_with_cross_attn (bool): Whether to finetune Transformer layers w/ CA (cross-attn).
199
+ finetune_layers_without_cross_attn (bool): Whether to finetune Transformer layers w/o CA (cross-attn).
200
+ use_action_condition (bool): Whether to use the robot action condition.
201
+ action_embedding_mode (Optional[str]): The mode of the robot action embedding. Choices: "matrix", "mlp".
202
+ action_dim (Optional[int]): The dimensionality of the raw robot action tensor (e.g., 7 for DROID, [Δx, Δy, Δz, rx, ry, rz, gripper_open]).
203
+ action_embedding_dim (Optional[int]): The dimensionality of the robot action embedding.
204
+ group_causal_mask_mode (Optional[str]): The mode of the group causal mask. Choices: "causal", "group_diagonal".
205
+ sync_1d_parameters (bool): Whether to synchronize layernorm parameters (1D) across tensor parallel ranks (default True).
206
+ Note: this is to ensure all TP-ranks have the same layernorm parameters.
207
+ z_loss_coeff (float): The coefficient for the z-loss.
208
+ insert_medusa_head (bool): Whether to insert the Medusa head.
209
+ ft_medusa_option (str): Options on which layers to finetune, choices like:
210
+ "fft": fully fine-tune both medusa heads and all LLM backbone;
211
+ "head": fine-tune medusa heads;
212
+ "head_out": fine-tune medusa heads, and the output layer;
213
+ "head_out_last_k_layer": fine-tune medusa heads, the output layer, and the last k layer(s) of the LLM backbone.
214
+ medusa_num_heads (int): Number of heads in the Medusa head.
215
+ medusa_num_layers (int): Number of layers in the Medusa head.
216
+ medusa_concat_heads (bool): Whether to concatenate multiple medusa heads into fused matrix, only applicable when medusa_num_layers = 1.
217
+ zero_init_cross_attn_proj (bool): Whether to initialize the cross-attn proj layer with zeros (default False).
218
+ concat_action_to_context (bool): Whether to concatenate the action embedding to the context (default False).
219
+ """
220
+
221
+ dim: int = attrs.field(default=4096)
222
+ n_layers: int = attrs.field(default=32)
223
+ n_heads: int = attrs.field(default=32)
224
+ n_kv_heads: Optional[int] = attrs.field(default=8)
225
+ head_dim: Optional[int] = attrs.field(default=None)
226
+ vocab_size: int = attrs.field(default=128256)
227
+ multiple_of: int = attrs.field(default=1024) # make SwiGLU hidden layer size multiple of large power of 2
228
+ ffn_dim_multiplier: Optional[float] = attrs.field(default=1.3)
229
+ ffn_hidden_size: Optional[int] = attrs.field(default=None)
230
+ norm_eps: float = attrs.field(default=1e-5)
231
+ rope_theta: float = attrs.field(default=500000)
232
+ apply_abs_pos_emb: bool = attrs.field(default=False)
233
+ max_batch_size: int = attrs.field(default=1)
234
+ max_seq_len: int = attrs.field(default=8192)
235
+ fuse_qkv: bool = attrs.field(default=False)
236
+ causal_mask: bool = attrs.field(default=True)
237
+ flash_attn: bool = attrs.field(default=True)
238
+ norm_type: str = attrs.field(default="rmsnorm")
239
+ backend: str = attrs.field(default="pytorch")
240
+ precision: str = attrs.field(default="bfloat16")
241
+ ema: config.EMAConfig = config.EMAConfig(enabled=False)
242
+ embedding_dropout: float = 0.0
243
+ attention_dropout: float = 0.0
244
+ hidden_dropout: float = 0.0
245
+ use_qk_normalization: bool = False
246
+ tokenizer: Optional[TokenizerConfig] = None
247
+ inference: bool = False
248
+ act_ckpt_enabled: bool = False
249
+ fsdp_enabled: bool = False
250
+ context_parallel_size: int = attrs.field(default=1)
251
+ tensor_model_parallel_size: int = attrs.field(default=1)
252
+ sequence_parallel: bool = attrs.field(default=False)
253
+ set_parallel_mode: bool = attrs.field(default=False)
254
+ fsdp: LazyDict = LazyDict(
255
+ dict(
256
+ policy="auto", # choices: ["size", "auto"]
257
+ min_num_params=1024, # Used as policy == "size"
258
+ sharding_strategy="hybrid", # Choices: ["full", "hybrid"]. "full" means sharding_group_size = world_size
259
+ sharding_group_size=8, # If None, defaults to min(world_size, 8). Recommends 8 for training on 8-GPU nodes.
260
+ )
261
+ )
262
+ ckpt_dir: Optional[str] = attrs.field(default="")
263
+ ckpt_path: Optional[str] = attrs.field(
264
+ default=None
265
+ ) # If not None, load the model from this path instead of ckpt_dir
266
+ cache_dir: Optional[str] = attrs.field(default="/project/cosmos/ar/cache")
267
+ apply_yarn: Optional[bool] = attrs.field(default=False)
268
+ yarn_scale: Optional[float] = attrs.field(default=None)
269
+ yarn_beta_fast: Optional[int] = attrs.field(default=None)
270
+ yarn_beta_slow: Optional[int] = attrs.field(default=None)
271
+ original_seq_len: Optional[int] = attrs.field(default=None)
272
+ depth_init: bool = attrs.field(default=True)
273
+ ignore_first_num_tokens: int = 0
274
+ z_loss_coeff: float = 1e-4
275
+ attention_tp: bool = False
276
+ vision_encoder: Optional[str] = attrs.field(default=None)
277
+ mm_projector: Optional[str] = attrs.field(default=None)
278
+ rope_dim: Optional[str] = attrs.field(default="1D")
279
+ pytorch_rope_version: Optional[str] = attrs.field(default="v2")
280
+ original_latent_shape: Optional[list] = None
281
+ pad_to_multiple_of: Optional[int] = None
282
+ peft_last_n_layers: Optional[int] = attrs.field(default=0)
283
+ peft_every_n_layers: Optional[int] = attrs.field(default=0)
284
+ freeze_vision_encoder: bool = False
285
+ vision_encoder_in_channels: Optional[int] = attrs.field(default=3)
286
+ insert_cross_attn: bool = False
287
+ insert_cross_attn_every_k_layers: int = 1
288
+ context_dim: Optional[int] = attrs.field(default=1024)
289
+ finetune_layers_with_cross_attn: bool = False
290
+ finetune_layers_without_cross_attn: bool = False
291
+ use_action_condition: bool = False
292
+ action_embedding_mode: Optional[str] = attrs.field(default="mlp")
293
+ action_dim: Optional[int] = attrs.field(default=_ACTION_DIM)
294
+ action_embedding_dim: Optional[int] = attrs.field(default=1024)
295
+ group_causal_mask_mode: Optional[str] = attrs.field(default=None)
296
+ sync_1d_parameters: bool = True
297
+ # hyper-parameters for the medusa head configs
298
+ insert_medusa_head: bool = False
299
+ ft_medusa_option: str = "fft"
300
+ medusa_num_heads: int = 7
301
+ medusa_num_layers: int = 1
302
+ medusa_concat_heads: bool = True
303
+ # For video training
304
+ num_video_frames: Optional[int] = None
305
+ # Raw video pixel dimension
306
+ video_height: Optional[int] = None
307
+ video_width: Optional[int] = None
308
+ # Video tokenizer output dimension, in (T, H, W): computed as num_video_frames / temporal_compression_factor, video_height / spatial_compression_factor, video_width / spatial_compression_factor
309
+ video_latent_shape: Optional[list] = None
310
+ # For image training
311
+ image_latent_shape: Optional[list] = None
312
+ # For robot training (action)
313
+ zero_init_cross_attn_proj: bool = False
314
+ # For robot training (action)
315
+ concat_action_to_context: bool = False
316
+
317
+ def __getitem__(self, item):
318
+ return getattr(self, item)
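Both config classes define __getitem__, so downstream code can treat an instance either as an object or as a mapping. A short illustrative example:

    cfg = ModelConfig(dim=2048, n_layers=16, n_heads=32)
    assert cfg.dim == cfg["dim"] == 2048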
cosmos_predict1/autoregressive/configs/base/model_config.py ADDED
@@ -0,0 +1,718 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import copy
17
+ from typing import Callable, List, Optional
18
+
19
+ import torch
20
+ from megatron.core import ModelParallelConfig
21
+
22
+ from cosmos_predict1.autoregressive.configs.base.model import ModelConfig, TrainingModelConfig
23
+ from cosmos_predict1.autoregressive.configs.base.tokenizer import (
24
+ TextTokenizerConfig,
25
+ TokenizerConfig,
26
+ VideoTokenizerConfig,
27
+ create_discrete_video_fsq_tokenizer_state_dict_config,
28
+ )
29
+ from cosmos_predict1.autoregressive.tokenizer.image_text_tokenizer import ImageTextTokenizer
30
+ from cosmos_predict1.autoregressive.tokenizer.text_tokenizer import TextTokenizer
31
+ from cosmos_predict1.autoregressive.training.model import AutoRegressiveTrainingModel
32
+ from cosmos_predict1.utils import log
33
+ from cosmos_predict1.utils.config import EMAConfig
34
+ from cosmos_predict1.utils.lazy_config import LazyCall as L
35
+
36
+ # Common architecture specifications
37
+ BASE_CONFIG = {"n_kv_heads": 8, "norm_type": "rmsnorm", "norm_eps": 1e-5, "ffn_hidden_size": 14336}
38
+ COSMOS_ARCHITECTURES = {
39
+ "1b": {
40
+ "n_layers": 16,
41
+ "dim": 2048,
42
+ "n_heads": 32,
43
+ },
44
+ "4b": {
45
+ "n_layers": 16,
46
+ "dim": 4096,
47
+ "n_heads": 32,
48
+ },
49
+ "12b": {
50
+ "n_layers": 40,
51
+ "dim": 5120,
52
+ "n_heads": 32,
53
+ "head_dim": 128,
54
+ },
55
+ }
56
+
57
+ COSMOS_YARN_CONFIG = {
58
+ "original_latent_shape": [3, 40, 64],
59
+ "apply_yarn": True,
60
+ "yarn_beta_fast": 4,
61
+ "yarn_beta_slow": 1,
62
+ "yarn_scale": 2,
63
+ }
64
+
65
+ # Llama3 architecture specifications for different model sizes
66
+ LLAMA3_ARCHITECTURES = {
67
+ "8b": {
68
+ "n_layers": 32,
69
+ "dim": 4096,
70
+ "n_heads": 32,
71
+ "ffn_hidden_size": 14336,
72
+ },
73
+ }
74
+ # Llama3.1 uses YaRN for long context support (context of 128k tokens)
75
+ LLAMA_YARN_CONFIG = {
76
+ "apply_yarn": True,
77
+ "yarn_scale": 8,
78
+ "yarn_beta_fast": 4,
79
+ "yarn_beta_slow": 1,
80
+ }
81
+
82
+ # Mistral architecture specifications for different model sizes
83
+ MISTRAL_ARCHITECTURES = {
84
+ "12b": {
85
+ "n_layers": 40,
86
+ "dim": 5120,
87
+ "n_heads": 32,
88
+ "ffn_hidden_size": 14336,
89
+ "head_dim": 128,
90
+ },
91
+ }
92
+
93
+ PIXTRAL_VISION_ARCHITECTURES = {
94
+ "12b": {"vision_encoder": "pixtral-12b-vit", "mm_projector": "mlp"},
95
+ }
96
+
97
+
98
+ def get_model_arch_specs(model_size: str, model_family: str = "mistral", pretrained: bool = False) -> dict:
99
+ """
100
+ Get the model architecture specifications for the given model size, model family and pretrained status.
101
+
102
+ Args:
103
+ model_size (str): Model size. Choices: "1b", "3b", "4b", "7b", etc.
104
+ model_family (str): Model family. Choices: "llama", "llama3", "llama3.1", "mistral"
105
+ pretrained (bool): Whether to load pretrained weights.
106
+
107
+ Returns:
108
+ dict: A dictionary containing the model architecture specifications.
109
+ """
110
+ arch_specs = copy.deepcopy(BASE_CONFIG)
111
+ model_size = model_size.lower()
112
+ if model_family.startswith("cosmos"):
113
+ arch_specs.update(COSMOS_ARCHITECTURES[model_size])
114
+ elif model_family.startswith("llama"):
115
+ arch_specs.update(LLAMA3_ARCHITECTURES[model_size])
116
+ elif model_family in ["mistral", "pixtral"]:
117
+ arch_specs.update(MISTRAL_ARCHITECTURES[model_size])
118
+ if model_family == "pixtral":
119
+ arch_specs.update(PIXTRAL_VISION_ARCHITECTURES[model_size])
120
+ else:
121
+ raise ValueError(f"Model family {model_family} is not supported.")
122
+
123
+ if pretrained:
124
+ if model_family == "cosmos":
125
+ if model_size == "12b":
126
+ arch_specs.update(COSMOS_YARN_CONFIG)
127
+ log.debug(f"Using YaRN for RoPE extension with config: {COSMOS_YARN_CONFIG}")
128
+ else:
129
+ pass
130
+ elif model_family in ["llama", "llama3"]:
131
+ pretrained_specs = {
132
+ "rope_theta": 500000,
133
+ "max_seq_len": 8192,
134
+ "vocab_size": 128256,
135
+ }
136
+ arch_specs.update(pretrained_specs)
137
+ elif model_family == "llama3.1":
138
+ pretrained_specs = {
139
+ "rope_theta": 500000,
140
+ "max_seq_len": 131072,
141
+ "original_seq_len": 8192,
142
+ "vocab_size": 128256,
143
+ **LLAMA_YARN_CONFIG,
144
+ }
145
+ arch_specs.update(pretrained_specs)
146
+ elif model_family == "mistral":
147
+ assert model_size == "12b", "We only support Mistral-Nemo-12B model."
148
+ pretrained_specs = {
149
+ "rope_theta": 1000000,
150
+ "max_seq_len": 128000,
151
+ "vocab_size": 131072,
152
+ }
153
+ arch_specs.update(pretrained_specs)
154
+ elif model_family == "pixtral":
155
+ assert model_size == "12b", "We only support Pixtral 12B model."
156
+ pretrained_specs = {"rope_theta": 1000000000, "max_seq_len": 128000, "vocab_size": 131072}
157
+ arch_specs.update(pretrained_specs)
158
+ else:
159
+ raise ValueError(f"Model family {model_family} doesn't have a pretrained config.")
160
+
161
+ return arch_specs
162
+
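+ # For example, get_model_arch_specs(model_size="4b", model_family="cosmos", pretrained=True)
+ # returns BASE_CONFIG merged with the "4b" Cosmos entry:
+ # {"n_kv_heads": 8, "norm_type": "rmsnorm", "norm_eps": 1e-5, "ffn_hidden_size": 14336,
+ #  "n_layers": 16, "dim": 4096, "n_heads": 32}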
163
+
164
+ def create_text_model_config(
165
+ model_ckpt_path: str,
166
+ tokenizer_path: str,
167
+ tensor_model_parallel_size: int = 1,
168
+ model_family: str = "mistral",
169
+ model_size: str = "12b",
170
+ is_instruct_model: bool = True,
171
+ max_seq_len: int = None,
172
+ max_batch_size: int = 1,
173
+ rope_dim: str = "1D",
174
+ add_special_tokens: bool = True,
175
+ pytorch_rope_version: str = None,
176
+ ) -> dict:
177
+ """Create a text model for training or inference.
178
+ Args:
179
+ model_ckpt_path (str): Path to the model checkpoint.
180
+ tokenizer_path (str): Path to the tokenizer folder.
181
+ tensor_model_parallel_size (int): Number of tensor model parallel groups.
182
+ model_family (str): Model family. Choices: "llama", "llama3", "llama3.1", "mistral".
183
+ model_size (str): Model size. Choices: "1b", "3b", "4b", "7b", "8b", "72b", etc.
184
+ is_instruct_model (bool): Whether the model is an instruct model.
185
+ pytorch_rope_version (str): Version of the PyTorch RoPE implementation. Choices: "v1", "v2".
186
+ max_seq_len (int): Maximum sequence length.
187
+ max_batch_size (int): Maximum batch size.
188
+ rope_dim (str): RoPE dimension. Choices: "1D", "3D".
189
+ add_special_tokens (bool): Whether to add special tokens.
190
+ Returns:
191
+ tuple: A (model_config, tokenizer_config) pair that can be used to instantiate the model and tokenizer.
192
+ """
193
+ # Model size specific parameters
194
+ model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=True)
195
+ if max_seq_len is not None:
196
+ # Override the max_seq_len if provided
197
+ model_arch_specs["max_seq_len"] = max_seq_len
198
+ if pytorch_rope_version is not None:
199
+ model_arch_specs["pytorch_rope_version"] = pytorch_rope_version
200
+ model_config = ModelConfig(
201
+ max_batch_size=max_batch_size,
202
+ precision="bfloat16",
203
+ ckpt_path=model_ckpt_path,
204
+ use_qk_normalization=False,
205
+ tensor_model_parallel_size=tensor_model_parallel_size,
206
+ rope_dim=rope_dim,
207
+ **model_arch_specs,
208
+ )
209
+
210
+ tokenizer_config = TokenizerConfig(
211
+ text_tokenizer=TextTokenizerConfig(
212
+ config=L(TextTokenizer)(
213
+ model_family=model_family,
214
+ is_instruct_model=is_instruct_model,
215
+ local_path=tokenizer_path,
216
+ ),
217
+ data_key="text",
218
+ tokenizer_offset=model_config.vocab_size,
219
+ tokenize_here=False,
220
+ vocab_size=model_config.vocab_size,
221
+ ),
222
+ seq_len=model_config.max_seq_len,
223
+ training_type="text_only",
224
+ add_special_tokens=add_special_tokens,
225
+ )
226
+ return model_config, tokenizer_config
227
+
228
+
229
+ def create_vision_language_model_config(
230
+ model_ckpt_path: str,
231
+ tokenizer_ckpt_path: str,
232
+ tensor_model_parallel_size: int = 1,
233
+ model_family: str = "pixtral",
234
+ model_size: str = "12b",
235
+ is_instruct_model: bool = True,
236
+ max_batch_size: int = 1,
237
+ rope_dim: str = "1D",
238
+ add_special_tokens: bool = True,
239
+ max_seq_len: int = None,
240
+ vision_encoder_in_channels: int = 3,
241
+ fuse_qkv: bool = False,
242
+ pytorch_rope_version: str = None,
243
+ ) -> dict:
244
+ """Create a vision-language model for training or inference.
245
+ Args:
246
+ model_ckpt_path (str): Path to the model checkpoint.
247
+ tokenizer_ckpt_path (str): Path to the tokenizer checkpoint.
248
+ tensor_model_parallel_size (int): Number of tensor model parallel groups.
249
+ model_family (str): Model family. Choices: "pixtral".
250
+ model_size (str): Model size. Choices: "12b".
251
+ is_instruct_model (bool): Whether the model is an instruct model.
252
+ rope_dim (str): RoPE dimension. Choices: "1D".
253
+ add_special_tokens (bool): Whether to add special tokens.
254
+ max_seq_len (int): Maximum sequence length.
255
+ vision_encoder_in_channels (int): Number of channels in the input image for the vision encoder. Defaults to 3; it may be set larger, e.g. 4 when the last channel is a binary mask.
256
+ fuse_qkv (bool): Whether to fuse the QKV linear layers.
257
+ Returns:
258
+ tuple: A (model_config, tokenizer_config) pair that can be used to instantiate the model and tokenizer.
259
+ """
260
+ # Model size specific parameters
261
+ model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=True)
262
+ if max_seq_len is not None:
263
+ # Override the max_seq_len if provided
264
+ model_arch_specs["max_seq_len"] = max_seq_len
265
+ if pytorch_rope_version is not None:
266
+ model_arch_specs["pytorch_rope_version"] = pytorch_rope_version
267
+
268
+ model_config = ModelConfig(
269
+ max_batch_size=max_batch_size,
270
+ precision="bfloat16",
271
+ ckpt_path=model_ckpt_path,
272
+ use_qk_normalization=False,
273
+ tensor_model_parallel_size=tensor_model_parallel_size,
274
+ rope_dim=rope_dim,
275
+ vision_encoder_in_channels=vision_encoder_in_channels,
276
+ fuse_qkv=fuse_qkv,
277
+ **model_arch_specs,
278
+ )
279
+ # Vision-language tokenizer
280
+ tokenizer_config = TokenizerConfig(
281
+ text_tokenizer=TextTokenizerConfig(
282
+ config=L(ImageTextTokenizer)(
283
+ model_family=model_family,
284
+ is_instruct_model=is_instruct_model,
285
+ image_processor_path=tokenizer_ckpt_path,
286
+ tokenizer_path=tokenizer_ckpt_path,
287
+ ),
288
+ data_key="image_text_interleaved",
289
+ tokenizer_offset=model_config.vocab_size,
290
+ tokenize_here=False,
291
+ vocab_size=model_config.vocab_size,
292
+ ),
293
+ seq_len=model_config.max_seq_len,
294
+ training_type="image_text_interleaved",
295
+ add_special_tokens=add_special_tokens,
296
+ )
297
+ return model_config, tokenizer_config
298
+
299
+
300
+ def create_video2world_model_config(
301
+ model_ckpt_path: str,
302
+ tokenizer_ckpt_path: str,
303
+ tensor_model_parallel_size: int = 1,
304
+ model_family: str = "cosmos",
305
+ model_size: str = "4b",
306
+ pixel_chunk_duration: int = 9,
307
+ num_video_frames: int = 36,
308
+ compression_ratio: List[int] = [8, 16, 16],
309
+ original_seq_len: int = 8192,
310
+ num_condition_latents_t: int = 1,
311
+ num_tokens_to_ignore: int = -1,
312
+ batch_size: int = 2,
313
+ video_tokenizer_config_creator: Callable = create_discrete_video_fsq_tokenizer_state_dict_config,
314
+ rope_dim: str = "3D",
315
+ add_special_tokens: bool = True,
316
+ video_height: int = 384,
317
+ video_width: int = 640,
318
+ use_qk_normalization: bool = True,
319
+ insert_cross_attn: bool = False,
320
+ insert_cross_attn_every_k_layers: int = 1,
321
+ context_dim: int = 1024,
322
+ training_type: str = "video_to_video",
323
+ pad_to_multiple_of: Optional[int] = 64,
324
+ vocab_size: int = 64000,
325
+ apply_abs_pos_emb: bool = False,
326
+ ) -> dict:
327
+ """Create a video-to-world model config.
328
+ Args:
329
+ tensor_model_parallel_size (int): Number of tensor model parallel groups.
330
+ model_family (str): Model family. Choices: "llama", "llama3", "llama3.1", "mistral".
331
+ model_size (str): Model size. Choices: "1b", "8b", "3b".
332
+ pixel_chunk_duration (int): Number of frames in each chunk.
333
+ num_video_frames (int): Number of video frames.
334
+ compression_ratio (List[int]): Compression ratio for the video frames. Choices: [8, 16, 16] or [4, 8, 8].
335
+ original_seq_len (int): Original sequence length.
336
+ model_ckpt_path (str): Path to the model checkpoint.
338
+ tokenizer_ckpt_path (str): Path to the video tokenizer checkpoint.
339
+ insert_cross_attn (bool): Whether to insert cross-attention layers after each self-attention layer.
340
+ insert_cross_attn_every_k_layers (int): Insert cross-attention layers every k transformer layers.
340
+ use_qk_normalization (bool): Whether to use Query-Key normalization.
341
+ training_type (str): Type of training task.
342
+ batch_size (int): Batch size.
343
+ video_tokenizer_config_creator (Callable): Method that takes "pixel_chunk_duration: int" and "version: str" as arguments and returns video tokenizer config
344
+ context_dim (int): Dimensionality of the cross-attention context embedding.
346
+ num_condition_latents_t (int): Number of latent frames (temporal) used as conditioning.
347
+ num_tokens_to_ignore (int): Number of tokens to ignore when computing the loss; if non-negative, it takes precedence over num_condition_latents_t.
347
+ video_height (int): Height of the video frame. Defaults to 384.
348
+ video_width (int): Width of the video frame. Defaults to 640.
349
+ rope_dim (str): RoPE dimension. Choices: "1D", "3D".
350
+ add_special_tokens (bool): Whether to add special tokens, use False for 2D/3D RoPE.
351
+ pad_to_multiple_of (int): Pad the token sequence length to the nearest multiple of this number. Defaults to 64.
352
+ vocab_size (int): Vocabulary size.
353
+ apply_abs_pos_emb (bool): Whether to apply absolute positional embeddings.
354
+ Returns:
355
+ tuple: A (model_config, tokenizer_config) pair that can be used to instantiate the model and tokenizer.
356
+ """
357
+ assert (
358
+ pixel_chunk_duration % compression_ratio[0] == 1
359
+ ), f"pixel_chunk_duration({pixel_chunk_duration}) should be k*n + 1 (k={compression_ratio[0]})"
360
+ latent_chunk_duration = (pixel_chunk_duration - 1) // compression_ratio[0] + 1
361
+ latent_height = video_height // compression_ratio[1]
362
+ latent_width = video_width // compression_ratio[2]
363
+ # Do some math to compute the video latent shape and sequence length
364
+ assert (
365
+ num_video_frames % pixel_chunk_duration == 0
366
+ ), f"num_video_frames {num_video_frames} should be divisible by pixel_chunk_duration {pixel_chunk_duration}"
367
+ video_latent_shape = [
368
+ num_video_frames // pixel_chunk_duration * latent_chunk_duration,
369
+ latent_height,
370
+ latent_width,
371
+ ]
372
+ # product of video_latent_shape
373
+ num_token_video_latent = video_latent_shape[0] * video_latent_shape[1] * video_latent_shape[2]
374
+ if add_special_tokens:
375
+ seq_len = num_token_video_latent + 3 # Sequence length per batch, max_seq_len + 3
376
+ seq_len = (seq_len + 63) // 64 * 64 # Round up to multiple of 64
377
+ # for text to video, we need to add <bov> token to indicate the start of the video
378
+ elif training_type == "text_to_video":
379
+ seq_len = num_token_video_latent + 1
380
+ else:
381
+ seq_len = num_token_video_latent
382
+
383
+ if seq_len % pad_to_multiple_of != 0:
384
+ # Round up to the nearest multiple of pad_to_multiple_of
385
+ seq_len = ((seq_len + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
386
+
387
+ # Model size specific parameters
388
+ model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=True)
389
+
390
+ # Whether to skip the loss for the first chunk; note the first token is already skipped when computing the loss.
391
+ # If num_tokens_to_ignore is specified, use it.
392
+ # Else compute it from num_condition_latents_t
393
+ if num_tokens_to_ignore < 0:
394
+ num_tokens_to_ignore = latent_height * latent_width * num_condition_latents_t
395
+ if not add_special_tokens and num_condition_latents_t > 0:
396
+ # If there are no special tokens (bov), do a -1 so that you can compute the loss
397
+ # from the first token of the next chunk
398
+ num_tokens_to_ignore -= 1
399
+
400
+ model_config = ModelConfig(
401
+ video_height=video_height,
402
+ video_width=video_width,
403
+ max_seq_len=seq_len,
404
+ max_batch_size=batch_size,
405
+ precision="bfloat16",
406
+ ckpt_path=model_ckpt_path,
407
+ use_qk_normalization=use_qk_normalization,
408
+ vocab_size=64000,
409
+ original_seq_len=original_seq_len,
410
+ tensor_model_parallel_size=tensor_model_parallel_size,
411
+ video_latent_shape=video_latent_shape,
412
+ num_video_frames=num_video_frames,
413
+ rope_dim=rope_dim,
414
+ pad_to_multiple_of=pad_to_multiple_of,
415
+ insert_cross_attn=insert_cross_attn,
416
+ insert_cross_attn_every_k_layers=insert_cross_attn_every_k_layers,
417
+ context_dim=context_dim,
418
+ apply_abs_pos_emb=apply_abs_pos_emb,
419
+ **model_arch_specs,
420
+ )
421
+
422
+ video_tokenizer_config = video_tokenizer_config_creator(
423
+ tokenizer_ckpt_path, pixel_chunk_duration, compression_ratio
424
+ )
425
+ tokenizer_config = TokenizerConfig(
426
+ text_tokenizer=None,
427
+ video_tokenizer=VideoTokenizerConfig(
428
+ config=video_tokenizer_config,
429
+ data_key="video",
430
+ tokenizer_offset=0, # There are no text embeddings in this model, so no offset is needed. Note this only applies when the model is trained from scratch; if a text-pretrained model is used, the offset will be the vocab_size of the text tokenizer.
431
+ tokenize_here=True,
432
+ max_seq_len=num_token_video_latent,
433
+ vocab_size=vocab_size,
434
+ ),
435
+ seq_len=seq_len,
436
+ training_type=training_type,
437
+ add_special_tokens=add_special_tokens,
438
+ pad_to_multiple_of=pad_to_multiple_of,
439
+ )
440
+ return model_config, tokenizer_config
441
+
442
+
443
+ def create_video2world_model(
444
+ tensor_model_parallel_size: int = 1,
445
+ context_parallel_size: int = 1,
446
+ shard_checkpoint: bool = False,
447
+ model_family: str = "cosmos",
448
+ model_size: str = "1b",
449
+ backend: str = "pytorch",
450
+ pixel_chunk_duration: int = 9,
451
+ num_video_frames: int = 36,
452
+ compression_ratio: List[int] = [8, 16, 16],
453
+ original_seq_len: int = 8192,
454
+ apply_yarn: bool = False,
455
+ yarn_beta_fast: Optional[int] = None,
456
+ yarn_beta_slow: Optional[int] = None,
457
+ yarn_scale: Optional[int] = None,
458
+ num_condition_latents_t: int = 1,
459
+ num_tokens_to_ignore: int = -1,
460
+ batch_size: int = 1,
461
+ fsdp_enabled: bool = False,
462
+ act_ckpt_enabled: bool = False,
463
+ video_tokenizer_config_creator: Callable = create_discrete_video_fsq_tokenizer_state_dict_config,
464
+ rope_dim: str = "3D",
465
+ add_special_tokens: bool = False,
466
+ video_height: int = 384,
467
+ video_width: int = 640,
468
+ original_latent_shape: Optional[List[int]] = None,
469
+ use_qk_normalization: bool = True,
470
+ sequence_parallel: bool = False,
471
+ insert_cross_attn: bool = False,
472
+ insert_cross_attn_every_k_layers: int = 1,
473
+ context_dim: int = 1024,
474
+ finetune_layers_with_cross_attn: bool = False,
475
+ finetune_layers_without_cross_attn: bool = False,
476
+ use_action_condition: bool = False,
477
+ action_embedding_mode: Optional[str] = "mlp",
478
+ action_dim: int = 8, # ACTION_DIM,
479
+ action_embedding_dim: int = 1024,
480
+ group_causal_mask_mode: Optional[str] = None,
481
+ training_type: str = "video_to_video",
482
+ pad_to_multiple_of: Optional[int] = 1,
483
+ z_loss_coeff: float = 1e-4,
484
+ temporal_overlap: int = 0,
485
+ embedding_dropout: float = 0.0,
486
+ insert_medusa_head: bool = False,
487
+ ft_medusa_option: str = "fft",
488
+ medusa_num_heads: int = 7,
489
+ medusa_num_layers: int = 1,
490
+ medusa_concat_heads: bool = True,
491
+ fuse_qkv: bool = False,
492
+ zero_init_cross_attn_proj: bool = False,
493
+ concat_action_to_context: bool = False,
494
+ tokenizer_ckpt_path: str = "checkpoints/Cosmos-1.0-Tokenizer-DV8x16x16/ema.jit",
495
+ ) -> dict:
496
+ """Create a video-to-video model for training.
497
+ Args:
498
+ tensor_model_parallel_size (int): Number of tensor model parallel groups.
499
+ context_parallel_size (int): Number of context parallel groups.
500
+ model_family (str): Model family. Choices: "llama", "llama3", "llama3.1", "mistral".
501
+ model_size (str): Model size. Choices: "1b", "8b", "3b".
502
+ backend (str): Backend for the model. Choices: "pytorch", "transformer_engine".
503
+ pixel_chunk_duration (int): Number of frames in each chunk.
504
+ num_video_frames (int): Number of video frames.
505
+ compression_ratio (List[int]): Compression ratio for the video frames. Choices: [8, 16, 16] or [4, 8, 8].
506
+ original_seq_len (int): Original sequence length.
507
+ apply_yarn (bool): Whether to apply YaRN for long context scaling.
508
+ yarn_beta_fast (Optional[int]): Fast beta for YaRN.
509
+ yarn_beta_slow (Optional[int]): Slow beta for YaRN.
510
+ yarn_scale (Optional[int]): Scale factor for ctx extension.
511
+ fsdp_enabled (bool): Whether Fully Sharded Data Parallel (FSDP) is enabled.
512
+ act_ckpt_enabled (bool): Whether activation checkpointing is enabled.
513
+ use_qk_normalization (bool): Whether to use Query-Key normalization.
514
+ training_type (str): Type of training task.
515
+ batch_size (int): Batch size.
516
+ video_tokenizer_config_creator (Callable): Method that takes "pixel_chunk_duration: int" and "version: str" as arguments and returns video tokenizer config
517
+ video_tokenizer_version (str): Version of the video tokenizer.
518
+ num_condition_latents_t (int): Number of conditioning latent channels
519
+ num_tokens_to_ignore (int) = Number of tokens to ignore. This takes the precedence
520
+ video_height (int): Height of the video frame. Defaults to 384.
521
+ video_width (int): Width of the video frame. Defaults to 640.
522
+ rope_dim (str): RoPE dimension. Choices: "1D", "2D", "3D".
523
+ add_special_tokens (bool): Whether to add special tokens, use False for 2D/3D RoPE.
524
+ original_latent_shape (list): Original latent shape before RoPE scaling.
525
+ sequence_parallel (bool): Whether to enable sequence parallelism.
526
+ insert_cross_attn (bool): Whether to insert the cross-attention layers after each multi-head self-attention (MSA) layer.
527
+ insert_cross_attn_every_k_layers (int): Insert cross-attention layers every k TransformerLayers.
528
+ context_dim (Optional[int]): The dimensionality of cross-attention embedding, e.g., T5 embed feature dim.
529
+ finetune_layers_with_cross_attn (bool): Whether to finetune Transformer layers w/ CA (cross-attn).
530
+ finetune_layers_without_cross_attn (bool): Whether to finetune Transformer layers w/o CA (cross-attn).
531
+ use_action_condition (bool): Whether to use action condition.
532
+ action_embedding_mode (Optional[str]): The mode of the robot action embedding. Choices: "matrix", "mlp".
533
+ action_dim (int): Dimension of the raw robot action tensor (e.g., 7 for DROID, [Δx, Δy, Δz, rx, ry, rz, gripper_open]).
534
+ action_embedding_dim (int): Dimension of the action embedding.
535
+ group_causal_mask_mode (Optional[str]): The mode of the group causal mask. Choices: "causal", "group_diagonal".
536
+ pad_to_multiple_of (int): Pad the token sequence length to the nearest multiple of this number. Defaults to 64.
537
+ z_loss_coeff (float): Coefficient for the z loss.
538
+ temporal_overlap (int): Temporal overlap in the latent space.
539
+ embedding_dropout (float): Dropout rate for the embeddings.
540
+ insert_medusa_head (bool): Whether to insert the Medusa head.
541
+ ft_medusa_option (str): Options on which layers to finetune, choices like:
542
+ "fft": fully fine-tune both medusa heads and all LLM backbone;
543
+ "head": fine-tune medusa heads;
544
+ "head_out": fine-tune medusa heads, and the output layer;
545
+ "head_out_last_k_layer": fine-tune medusa heads, the output layer, and the last k layer(s) of the LLM backbone.
546
+ medusa_num_heads (int): Number of heads in the Medusa head.
547
+ medusa_num_layers (int): Number of layers in the Medusa head.
548
+ medusa_concat_heads (bool): Whether to concatenate multiple medusa heads into fused matrix, only applicable when medusa_num_layers = 1.
549
+ fuse_qkv (bool): Whether to fuse the QKV linear layers.
550
+ zero_init_cross_attn_proj (bool): Whether to zero-initialize the cross-attention projection weights (default False).
551
+ concat_action_to_context (bool): Whether to concatenate the action embedding to the context (default False).
552
+ Returns:
553
+ dict: A lazily instantiated model configuration from which the model object can be built.
554
+ """
555
+ assert (
556
+ pixel_chunk_duration % compression_ratio[0] == 1
557
+ ), f"pixel_chunk_duration({pixel_chunk_duration}) should be k*n + 1 (k={compression_ratio[0]})"
558
+ latent_chunk_duration = (pixel_chunk_duration - 1) // compression_ratio[0] + 1
559
+ latent_height = video_height // compression_ratio[1]
560
+ latent_width = video_width // compression_ratio[2]
561
+ # Compute the video latent shape and sequence length
562
+ if temporal_overlap == 0:
563
+ assert (
564
+ num_video_frames % pixel_chunk_duration == 0
565
+ ), f"num_video_frames {num_video_frames} should be divisible by pixel_chunk_duration {pixel_chunk_duration}"
566
+ video_latent_shape = [
567
+ num_video_frames // pixel_chunk_duration * latent_chunk_duration,
568
+ latent_height,
569
+ latent_width,
570
+ ]
571
+
572
+ else:
573
+ # Calculate temporal overlap in the latent space
574
+ temporal_overlap_latent = temporal_overlap // compression_ratio[0]
575
+
576
+ # Calculate the effective number of latent chunks for the video
577
+ latent_chunks = (num_video_frames - temporal_overlap) // (pixel_chunk_duration - temporal_overlap)
578
+
579
+ # Compute the total duration of the latent chunks, accounting for overlap
580
+ effective_latent_duration = (
581
+ latent_chunk_duration - temporal_overlap_latent
582
+ ) * latent_chunks + temporal_overlap_latent
583
+
584
+ # Define the shape of the video in the latent space
585
+ video_latent_shape = [
586
+ effective_latent_duration, # Temporal dimension
587
+ latent_height, # Height in the latent space
588
+ latent_width, # Width in the latent space
589
+ ]
590
+
591
+ # product of video_latent_shape
592
+ num_token_video_latent = video_latent_shape[0] * video_latent_shape[1] * video_latent_shape[2]
593
+ if add_special_tokens:
594
+ seq_len = num_token_video_latent + 3 # Sequence length per batch, max_seq_len + 3
595
+ seq_len = (seq_len + 63) // 64 * 64 # Round up to multiple of 64
596
+ # for text to video, we need to add <bov> token to indicate the start of the video
597
+ elif training_type == "text_to_video":
598
+ seq_len = num_token_video_latent + 1
599
+ else:
600
+ seq_len = num_token_video_latent
601
+
602
+ if seq_len % pad_to_multiple_of != 0:
603
+ # Round up to the nearest multiple of pad_to_multiple_of
604
+ seq_len = ((seq_len + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
605
+
606
+ # Model size specific parameters
607
+ model_arch_specs = get_model_arch_specs(model_family=model_family, model_size=model_size, pretrained=False)
608
+
609
+ inference = False # False for training, True for inference
610
+ # set_parallel_mode = True
611
+ set_parallel_mode = tensor_model_parallel_size > 1
612
+ attention_tp = True
613
+
614
+ if context_parallel_size > 1:
615
+ assert backend == "transformer_engine", "Context parallelism is only supported in transformer engine."
616
+
617
+ if tensor_model_parallel_size > 1:
618
+ assert set_parallel_mode, "Tensor model parallelism is only supported in parallel mode."
619
+
620
+ # Whether to skip the loss for the first chunk; note the first token is already skipped when computing the loss.
621
+ # If num_tokens_to_ignore is specified, use it.
622
+ # Else compute it from num_condition_latents_t
623
+ if num_tokens_to_ignore < 0:
624
+ num_tokens_to_ignore = latent_height * latent_width * num_condition_latents_t
625
+ if not add_special_tokens and num_condition_latents_t > 0:
626
+ # If there are no special tokens (bov), do a -1 so that you can compute the loss
627
+ # from the first token of the next chunk
628
+ num_tokens_to_ignore -= 1
629
+
630
+ model_config = TrainingModelConfig(
631
+ video_height=video_height,
632
+ video_width=video_width,
633
+ max_seq_len=seq_len,
634
+ max_batch_size=batch_size,
635
+ inference=inference,
636
+ backend=backend,
637
+ precision="bfloat16",
638
+ ema=EMAConfig(enabled=False),
639
+ act_ckpt_enabled=act_ckpt_enabled,
640
+ fsdp_enabled=fsdp_enabled,
641
+ cache_dir=None,
642
+ ckpt_path="checkpoints/Cosmos-Predict1-4B/model.pt",
643
+ use_qk_normalization=use_qk_normalization,
644
+ vocab_size=64000,
645
+ ignore_first_num_tokens=num_tokens_to_ignore,
646
+ apply_yarn=apply_yarn,
647
+ yarn_beta_fast=yarn_beta_fast,
648
+ yarn_beta_slow=yarn_beta_slow,
649
+ original_seq_len=original_seq_len,
650
+ yarn_scale=yarn_scale,
651
+ context_parallel_size=context_parallel_size,
652
+ tensor_model_parallel_size=tensor_model_parallel_size,
653
+ set_parallel_mode=set_parallel_mode,
654
+ attention_tp=attention_tp,
655
+ video_latent_shape=video_latent_shape,
656
+ num_video_frames=num_video_frames,
657
+ rope_dim=rope_dim,
658
+ original_latent_shape=original_latent_shape,
659
+ pad_to_multiple_of=pad_to_multiple_of,
660
+ sequence_parallel=sequence_parallel,
661
+ insert_cross_attn=insert_cross_attn,
662
+ insert_cross_attn_every_k_layers=insert_cross_attn_every_k_layers,
663
+ context_dim=context_dim,
664
+ finetune_layers_with_cross_attn=finetune_layers_with_cross_attn,
665
+ finetune_layers_without_cross_attn=finetune_layers_without_cross_attn,
666
+ use_action_condition=use_action_condition,
667
+ action_embedding_mode=action_embedding_mode,
668
+ action_dim=action_dim,
669
+ action_embedding_dim=action_embedding_dim,
670
+ group_causal_mask_mode=group_causal_mask_mode,
671
+ z_loss_coeff=z_loss_coeff,
672
+ embedding_dropout=embedding_dropout,
673
+ insert_medusa_head=insert_medusa_head,
674
+ ft_medusa_option=ft_medusa_option,
675
+ medusa_num_heads=medusa_num_heads,
676
+ medusa_num_layers=medusa_num_layers,
677
+ medusa_concat_heads=medusa_concat_heads,
678
+ fuse_qkv=fuse_qkv,
679
+ zero_init_cross_attn_proj=zero_init_cross_attn_proj,
680
+ concat_action_to_context=concat_action_to_context,
681
+ **model_arch_specs,
682
+ )
683
+
684
+ tokenizer_config = TokenizerConfig(
685
+ text_tokenizer=None,
686
+ video_tokenizer=VideoTokenizerConfig(
687
+ config=video_tokenizer_config_creator(
688
+ ckpt_path=tokenizer_ckpt_path, pixel_chunk_duration=pixel_chunk_duration
689
+ ),
690
+ data_key="video",
691
+ tokenizer_offset=0,
692
+ vocab_size=64000,
693
+ tokenize_here=True,
694
+ max_seq_len=num_token_video_latent,
695
+ temporal_overlap=temporal_overlap,
696
+ ),
697
+ seq_len="${model.model_config.max_seq_len}",
698
+ training_type=training_type,
699
+ add_special_tokens=add_special_tokens,
700
+ pad_to_multiple_of=pad_to_multiple_of,
701
+ )
702
+
703
+ model_parallel = ModelParallelConfig(
704
+ bf16=True,
705
+ params_dtype=getattr(torch, "bfloat16"),
706
+ )
707
+ model_parallel.tensor_model_parallel_size = "${model.model_config.tensor_model_parallel_size}"
708
+ model_parallel.context_parallel_size = "${model.model_config.context_parallel_size}"
709
+ model_parallel.sequence_parallel = "${model.model_config.sequence_parallel}"
710
+ return L(AutoRegressiveTrainingModel.build)(
711
+ seed=0,
712
+ train_from_scratch=True,
713
+ model_config=model_config,
714
+ fsdp_checkpointer=None,
715
+ tokenizer_config=tokenizer_config,
716
+ model_parallel=model_parallel,
717
+ shard_checkpoint=shard_checkpoint,
718
+ )
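A quick worked example of the latent-shape and sequence-length arithmetic above, using the 33-frame, 384x640 settings of the example experiments later in this commit. The numbers follow directly from the formulas; the snippet is illustrative and not part of the committed file.

    pixel_chunk_duration, num_video_frames = 33, 33
    compression_ratio = [8, 16, 16]  # temporal, height, width
    video_height, video_width = 384, 640

    latent_chunk_duration = (pixel_chunk_duration - 1) // compression_ratio[0] + 1  # 5
    latent_height = video_height // compression_ratio[1]                            # 24
    latent_width = video_width // compression_ratio[2]                              # 40
    video_latent_shape = [
        num_video_frames // pixel_chunk_duration * latent_chunk_duration,           # 5
        latent_height,
        latent_width,
    ]
    num_token_video_latent = 5 * 24 * 40  # 4800 video tokens
    # With add_special_tokens=False and training_type="video_to_video",
    # seq_len == num_token_video_latent == 4800, which is already a multiple of 64.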
cosmos_predict1/autoregressive/configs/base/model_parallel.py ADDED
@@ -0,0 +1,33 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ from megatron.core import ModelParallelConfig
18
+
19
+ from cosmos_predict1.utils.lazy_config import LazyDict
20
+
21
+
22
+ def create_model_parallel_config():
23
+ model_parallel = ModelParallelConfig(bf16=True, params_dtype=getattr(torch, "bfloat16"))
24
+ model_parallel.tensor_model_parallel_size = "${model.model_parallel.tensor_model_parallel_size}"
25
+ model_parallel.context_parallel_size = "${model.model_parallel.context_parallel_size}"
26
+ model_parallel.sequence_parallel = "${model.model_parallel.sequence_parallel}"
27
+ MODEL_PARALLELS = LazyDict(
28
+ dict(
29
+ model_parallel_bf16=model_parallel,
30
+ ),
31
+ flags={"allow_objects": True},
32
+ )
33
+ return MODEL_PARALLELS["model_parallel_bf16"]
cosmos_predict1/autoregressive/configs/base/optim.py ADDED
@@ -0,0 +1,86 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+
18
+ from cosmos_predict1.utils.lazy_config import LazyCall as L
19
+
20
+
21
+ class LambdaLinearWarmupScheduler:
22
+ """
23
+ A learning rate scheduler that implements linear warm-up and cool-down.
24
+
25
+ This scheduler provides three phases:
26
+ 1. Warm-up: Learning rate linearly increases from 0 to 1.
27
+ 2. Constant: Learning rate remains at 1.
28
+ 3. Cool-down: Learning rate linearly decreases from 1 to 0.
29
+
30
+ Args:
31
+ warmup_steps (int): Number of steps for the warm-up phase.
32
+ warmup_offset (int): Starts warmup from this offset.
33
+ max_iter (int, optional): Total number of iterations. Required if cooldown_steps is provided.
34
+ cooldown_steps (int, optional): Number of steps for the cool-down phase.
35
+
36
+ Raises:
37
+ ValueError: If cooldown_steps is provided without max_iter, or if an invalid step is given.
38
+ """
39
+
40
+ def __init__(self, warmup_steps: int, warmup_offset: int = 0, max_iter: int = None, cooldown_steps: int = None):
41
+ self.warmup_steps = warmup_steps
42
+ self.warmup_offset = warmup_offset
43
+ self.max_iter = max_iter
44
+ self.cooldown_steps = cooldown_steps
45
+
46
+ if cooldown_steps is not None:
47
+ if max_iter is None:
48
+ raise ValueError("max_iter must be specified when cooldown_steps is provided")
49
+ self.cooldown_start = max_iter - cooldown_steps
50
+ else:
51
+ self.cooldown_start = None
52
+
53
+ def __call__(self, step):
54
+ # Warm-up phase
55
+ if step < self.warmup_offset:
56
+ return 0
57
+
58
+ if step < self.warmup_steps + self.warmup_offset:
59
+ return float(step - self.warmup_offset) / float(max(1, self.warmup_steps))
60
+
61
+ # Constant phase (no cool-down)
62
+ elif self.cooldown_steps is None:
63
+ return 1.0
64
+
65
+ # Constant phase (before cool-down starts)
66
+ elif step < self.cooldown_start:
67
+ return 1.0
68
+
69
+ # Cool-down phase
70
+ elif self.cooldown_start <= step < self.max_iter:
71
+ cooldown_progress = (step - self.cooldown_start) / self.cooldown_steps
72
+ return 1.0 - cooldown_progress
73
+
74
+ # After max_iter
75
+ elif step >= self.max_iter:
76
+ return 0.0
77
+
78
+ # Unexpected case
79
+ else:
80
+ raise ValueError(f"Invalid step {step}")
81
+
82
+
83
+ LambdaLinearLR = L(torch.optim.lr_scheduler.LambdaLR)(
84
+ optimizer=None,
85
+ lr_lambda=L(LambdaLinearWarmupScheduler)(warmup_steps=5000),
86
+ )
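A minimal sketch of the learning-rate multiplier produced by LambdaLinearWarmupScheduler, assuming the module path of this file; the 5k warm-up / 100k max-iteration / 10k cool-down step counts are illustrative.

    from cosmos_predict1.autoregressive.configs.base.optim import LambdaLinearWarmupScheduler

    sched = LambdaLinearWarmupScheduler(warmup_steps=5000, max_iter=100_000, cooldown_steps=10_000)
    # Linear warm-up over the first 5k steps.
    print(sched(0), sched(2500), sched(5000))            # 0.0 0.5 1.0
    # Constant until cool-down starts at step 90k, then linear decay to 0 at step 100k.
    print(sched(90_000), sched(95_000), sched(100_000))  # 1.0 0.5 0.0

When plugged into LambdaLinearLR, torch.optim.lr_scheduler.LambdaLR multiplies the optimizer's base learning rate by this factor at every step.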
cosmos_predict1/autoregressive/configs/base/tokenizer.py ADDED
@@ -0,0 +1,139 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Optional
17
+
18
+ import attrs
19
+
20
+ from cosmos_predict1.autoregressive.tokenizer.discrete_video import DiscreteVideoFSQStateDictTokenizer
21
+ from cosmos_predict1.autoregressive.tokenizer.networks import CausalDiscreteVideoTokenizer
22
+ from cosmos_predict1.utils.lazy_config import LazyCall as L
23
+ from cosmos_predict1.utils.lazy_config import LazyDict
24
+
25
+
26
+ def create_discrete_video_fsq_tokenizer_state_dict_config(
27
+ ckpt_path, pixel_chunk_duration=33, compression_ratio=[8, 16, 16]
28
+ ) -> LazyDict:
29
+ CausalDiscreteFactorizedVideoTokenizerConfig: LazyDict = L(CausalDiscreteVideoTokenizer)(
30
+ # The new causal discrete tokenizer, that is at least 2x more efficient in memory and runtime.
31
+ # - It relies on fully 3D discrete wavelet transform
32
+ # - Uses a layer norm instead of a group norm
33
+ # - Factorizes full convolutions into spatial and temporal convolutions
34
+ # - Factorizes full attention into spatial and temporal attention
35
+ # - Strictly causal, with flexible temporal length at inference.
36
+ attn_resolutions=[32],
37
+ channels=128,
38
+ channels_mult=[2, 4, 4],
39
+ dropout=0.0,
40
+ in_channels=3,
41
+ num_res_blocks=2,
42
+ out_channels=3,
43
+ resolution=1024,
44
+ patch_size=4,
45
+ patch_method="haar",
46
+ z_channels=16,
47
+ z_factor=1,
48
+ num_groups=1,
49
+ legacy_mode=False,
50
+ spatial_compression=16,
51
+ temporal_compression=8,
52
+ embedding_dim=6,
53
+ levels=[8, 8, 8, 5, 5, 5],
54
+ name="CausalDiscreteFactorizedVideoTokenizer",
55
+ )
56
+
57
+ return L(DiscreteVideoFSQStateDictTokenizer)(
58
+ enc_fp=ckpt_path.replace("ema.jit", "encoder.jit"),
59
+ dec_fp=ckpt_path.replace("ema.jit", "decoder.jit"),
60
+ tokenizer_module=CausalDiscreteFactorizedVideoTokenizerConfig,
61
+ name="discrete_video_fsq",
62
+ latent_ch=6,
63
+ is_bf16=True,
64
+ pixel_chunk_duration=pixel_chunk_duration,
65
+ latent_chunk_duration=1 + (pixel_chunk_duration - 1) // compression_ratio[0],
66
+ max_enc_batch_size=8,
67
+ max_dec_batch_size=4,
68
+ levels=[8, 8, 8, 5, 5, 5],
69
+ compression_ratio=compression_ratio,
70
+ )
71
+
72
+
73
+ @attrs.define(slots=False)
74
+ class TextTokenizerConfig:
75
+ """
76
+ Text tokenizer config
77
+
78
+ Args:
79
+ config: Config file to define the text tokenizer class.
80
+ data_key (str): The input key from data_dict that will be passed to the text tokenizer.
81
+ tokenize_here (bool): Whether to use the tokenizer to perform online tokenization.
82
+ tokenizer_offset (int): Offset that is added to the tokens.
83
+ vocab_size (int): Vocabulary size of the tokenizer.
84
+ """
85
+
86
+ config: LazyDict
87
+ data_key: str = ""
88
+ tokenize_here: bool = False
89
+ tokenizer_offset: int = 0
90
+ vocab_size: int = 0
91
+
92
+
93
+ @attrs.define(slots=False)
94
+ class VideoTokenizerConfig:
95
+ """
96
+ Video tokenizer config
97
+
98
+ Args:
99
+ config: Config file to define the video tokenizer class.
100
+ data_key (str): The input key from data_dict that will be passed to the video tokenizer.
101
+ tokenize_here (bool): Whether to use the tokenizer to perform online tokenization.
102
+ tokenizer_offset (int): Offset that is added to the tokens. In case of joint text-video tokenizers, we
103
+ add an offset to make sure that video tokens and text tokens don't overlap.
104
+ vocab_size (int): Vocabulary size of the tokenizer.
105
+ max_seq_len (int): Maximum token length for an input video.
106
+ temporal_overlap (int): Overlap between consecutive video chunks.
107
+ """
108
+
109
+ config: LazyDict
110
+ data_key: str = ""
111
+ tokenize_here: bool = True
112
+ tokenizer_offset: int = 0
113
+ vocab_size: int = 0
114
+ max_seq_len: int = -1
115
+ temporal_overlap: int = 0
116
+
117
+
118
+ @attrs.define(slots=False)
119
+ class TokenizerConfig:
120
+ """
121
+ Joint tokenizer config
122
+
123
+ Args:
124
+ text_tokenizer (TextTokenizerConfig): Text tokenizer config file
125
+ class_tokenizer (ClassTokenizerConfig): Class tokenizer config file
126
+ video_tokenizer (VideoTokenizerConfig): Video tokenizer config file
127
+ image_tokenizer (ImageTokenizerConfig): Image tokenizer config file
128
+ seq_len (int): Final token sequence length
129
+ training_type (str): Type of training we use. Supports ["text_only", "text_to_video", "class_to_image", "image_text_interleaved"]
130
+ add_special_tokens (bool): Whether to add special tokens to the output tokens
131
+ pad_to_multiple_of (int): Pad the token sequence length to the nearest multiple of this number. Defaults to 64.
132
+ """
133
+
134
+ text_tokenizer: Optional[TextTokenizerConfig] = None
135
+ video_tokenizer: Optional[VideoTokenizerConfig] = None
136
+ seq_len: int = 4096
137
+ training_type: str = None
138
+ add_special_tokens: bool = True
139
+ pad_to_multiple_of: Optional[int] = 64
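A minimal sketch of composing these configs by hand, assuming the tokenizer checkpoint path used by the example experiments below; the 4800-token max_seq_len corresponds to 33 frames at 384x640 with the [8, 16, 16] compression ratio.

    from cosmos_predict1.autoregressive.configs.base.tokenizer import (
        TokenizerConfig,
        VideoTokenizerConfig,
        create_discrete_video_fsq_tokenizer_state_dict_config,
    )

    video_tok = create_discrete_video_fsq_tokenizer_state_dict_config(
        ckpt_path="checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/ema.jit",
        pixel_chunk_duration=33,
        compression_ratio=[8, 16, 16],
    )
    tokenizer_config = TokenizerConfig(
        video_tokenizer=VideoTokenizerConfig(
            config=video_tok,
            data_key="video",
            tokenize_here=True,
            vocab_size=64000,
            max_seq_len=4800,  # 5 x 24 x 40 latent tokens
        ),
        seq_len=4800,
        training_type="video_to_video",
        add_special_tokens=False,
    )

In practice create_video2world_model_config / create_video2world_model assemble the same objects from higher-level arguments.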
cosmos_predict1/autoregressive/configs/config.py ADDED
@@ -0,0 +1,111 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Default config for cosmos_ar project."""
17
+
18
+ import os
19
+ from typing import Any, List
20
+
21
+ import attrs
22
+
23
+ from cosmos_predict1.autoregressive.configs.registry import register_configs
24
+ from cosmos_predict1.autoregressive.trainer import Trainer
25
+ from cosmos_predict1.utils import config, log
26
+ from cosmos_predict1.utils.config_helper import import_all_modules_from_package
27
+
28
+
29
+ @attrs.define(slots=False)
30
+ class Config(config.Config):
31
+ defaults: List[Any] = attrs.field(
32
+ factory=lambda: [
33
+ "_self_",
34
+ {"model": None},
35
+ {"data_train": "mock_video"},
36
+ {"data_val": None},
37
+ {"optimizer": "fused_adamw"},
38
+ {"scheduler": "warmup_cosine_lr"},
39
+ {"checkpoint": "local"},
40
+ {"callbacks": "basic"},
41
+ {"global_config": None},
42
+ {"experiment": None},
43
+ ]
44
+ )
45
+
46
+ def validate(self) -> None:
47
+ """Validate that the config has all required fields."""
48
+ assert self.job.project != "", "job.project is not set"
49
+ assert self.job.group != "", "job.group is not set"
50
+ assert self.job.name != "", "job.name is not set"
51
+ log.info("Validating config for cosmos_autoregressive job")
52
+ # FSDP config check
53
+ if self.model.model_config.fsdp_enabled:
54
+ assert self.trainer.distributed_parallelism == "fsdp"
55
+ else:
56
+ assert self.trainer.distributed_parallelism == "ddp"
57
+
58
+ # Transformer Engine config check
59
+ if self.model.model_config.backend == "transformer_engine":
60
+ assert (
61
+ "NVTE_FLASH_ATTN" in os.environ and os.environ["NVTE_FLASH_ATTN"] == "1"
62
+ ) # Enable Flash attention for transformer engine
63
+
64
+ # TP, CP config check
65
+ if self.model_parallel is not None:
66
+ if self.model_parallel.context_parallel_size > 1:
67
+ assert (
68
+ self.model.model_config.backend == "transformer_engine"
69
+ ), "Context parallelism is only supported in transformer engine."
70
+
71
+ if self.model_parallel.tensor_model_parallel_size > 1:
72
+ assert (
73
+ self.model.model_config.set_parallel_mode
74
+ ), "Tensor model parallelism is only supported in parallel mode."
75
+
76
+ if self.model_parallel.sequence_parallel:
77
+ assert (
78
+ self.model_parallel.tensor_model_parallel_size > 1
79
+ ), "Sequence parallelism is only supported in tensor model parallelism."
80
+ assert (
81
+ self.model.model_config.backend == "transformer_engine"
82
+ ), "Sequence parallelism is only supported in transformer engine."
83
+
84
+
85
+ def make_config():
86
+ c = Config(
87
+ model=None,
88
+ optimizer=None,
89
+ scheduler=None,
90
+ dataloader_train=None,
91
+ dataloader_val=None,
92
+ checkpoint=None,
93
+ )
94
+
95
+ c.job.project = "cosmos_autoregressive"
96
+ c.job.group = "debug"
97
+ c.job.name = "default_${now:%Y-%m-%d}_${now:%H-%M-%S}"
98
+
99
+ c.trainer.type = Trainer
100
+ c.trainer.run_validation = True
101
+
102
+ c.trainer.seed = 0
103
+ c.trainer.max_iter = 10
104
+ c.trainer.logging_iter = 1
105
+
106
+ c.trainer.callbacks = None
107
+ register_configs()
108
+ # experiment config are defined in the experiment folder
109
+ # call import_all_modules_from_package to register them
110
+ import_all_modules_from_package("cosmos_predict1.autoregressive.configs.experiment")
111
+ return c
cosmos_predict1/autoregressive/configs/experiment/video2video/__init__.py ADDED
File without changes
cosmos_predict1/autoregressive/configs/experiment/video2video/basic.py ADDED
@@ -0,0 +1,163 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ This file contains a basic configuration for video2video experiments.
18
+ """
19
+
20
+ from hydra.core.config_store import ConfigStore
21
+
22
+ from cosmos_predict1.autoregressive.configs.base.model_config import create_video2world_model
23
+ from cosmos_predict1.autoregressive.configs.base.model_parallel import create_model_parallel_config
24
+ from cosmos_predict1.utils import log
25
+ from cosmos_predict1.utils.lazy_config import LazyDict
26
+
27
+ cs = ConfigStore.instance()
28
+
29
+
30
+ """
31
+ Finetune 4B model with TP=1, pytorch backend, low resolution tealrobot data, frames 33, chunk 33.
32
+ Usage:
33
+ torchrun --nproc_per_node=1 -m cosmos_predict1.autoregressive.train --config=cosmos_predict1/autoregressive/configs/config.py -- experiment=base_4b_example_tealrobotsmall_tp1
34
+ """
35
+ base_4b_example_tealrobotsmall_tp1: LazyDict = LazyDict(
36
+ dict(
37
+ defaults=[
38
+ {"override /data_train": "tealrobot_video_small"},
39
+ {
40
+ "override /callbacks": [
41
+ "basic",
42
+ "video_teacher_forcing",
43
+ ]
44
+ },
45
+ {"override /checkpoint": "local"},
46
+ {"override /optimizer": "fused_adamw"},
47
+ {"override /scheduler": "warmup_cosine_lr"},
48
+ "_self_",
49
+ ],
50
+ job=dict(
51
+ project="posttraining",
52
+ group="autoregressive_base",
53
+ name="base_4b_example_tealrobotsmall_tp1",
54
+ ),
55
+ model=create_video2world_model(
56
+ model_size="4b",
57
+ model_family="cosmos",
58
+ backend="pytorch",
59
+ tensor_model_parallel_size=1,
60
+ batch_size=1,
61
+ pixel_chunk_duration=33,
62
+ num_video_frames=33,
63
+ video_height=384,
64
+ video_width=640,
65
+ tokenizer_ckpt_path="checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/ema.jit",
66
+ add_special_tokens=False,
67
+ ),
68
+ trainer=dict(
69
+ max_iter=50000,
70
+ grad_accum_iter=1,
71
+ grad_scaler_args=dict(enabled=False),
72
+ run_validation=False, # No need for validation as epoch <= 1
73
+ distributed_parallelism="ddp",
74
+ callbacks=dict(
75
+ vid_sampling_tf=dict(
76
+ every_n=500,
77
+ ),
78
+ ),
79
+ ),
80
+ checkpoint=dict(
81
+ load_path="checkpoints/Cosmos-Predict1-4B/model.pt",
82
+ load_training_state=False,
83
+ strict_resume=True,
84
+ save_iter=1000,
85
+ ),
86
+ model_parallel=create_model_parallel_config(),
87
+ ),
88
+ )
89
+
90
+
91
+ """
92
+ Finetune 4B model with TP=4, pytorch backend, high resolution tealrobot data, frame 33, chunk 33.
93
+ Usage:
94
+ torchrun --nproc_per_node=4 -m cosmos_predict1.autoregressive.train --config=cosmos_predict1/autoregressive/configs/config.py -- experiment=base_4b_example_tealrobot_tp4
95
+ """
96
+ base_4b_example_tealrobot_tp4: LazyDict = LazyDict(
97
+ dict(
98
+ defaults=[
99
+ {"override /data_train": "tealrobot_video"},
100
+ {
101
+ "override /callbacks": [
102
+ "basic",
103
+ "video_teacher_forcing",
104
+ ]
105
+ },
106
+ {"override /checkpoint": "local"},
107
+ {"override /optimizer": "fused_adamw"},
108
+ {"override /scheduler": "warmup_cosine_lr"},
109
+ "_self_",
110
+ ],
111
+ job=dict(
112
+ project="posttraining",
113
+ group="autoregressive_base",
114
+ name="base_4b_example_tealrobot_tp4",
115
+ ),
116
+ model=create_video2world_model(
117
+ model_size="4b",
118
+ model_family="cosmos",
119
+ backend="pytorch",
120
+ tensor_model_parallel_size=4,
121
+ batch_size=1,
122
+ pixel_chunk_duration=33,
123
+ num_video_frames=33,
124
+ video_height=640,
125
+ video_width=848,
126
+ tokenizer_ckpt_path="checkpoints/Cosmos-Tokenize1-DV8x16x16-720p/ema.jit",
127
+ add_special_tokens=False,
128
+ ),
129
+ trainer=dict(
130
+ max_iter=50000,
131
+ grad_accum_iter=1,
132
+ grad_scaler_args=dict(enabled=False),
133
+ run_validation=False, # No need for validation as epoch <= 1
134
+ distributed_parallelism="ddp",
135
+ callbacks=dict(
136
+ vid_sampling_tf=dict(
137
+ every_n=500,
138
+ ),
139
+ ),
140
+ ),
141
+ checkpoint=dict(
142
+ load_path="checkpoints/Cosmos-Predict1-4B/model.pt",
143
+ load_training_state=False,
144
+ strict_resume=False,
145
+ save_iter=1000,
146
+ ),
147
+ model_parallel=create_model_parallel_config(),
148
+ ),
149
+ )
150
+
151
+
152
+ def register_experiments(cs):
153
+ # Register the experiments
154
+ for _item in [
155
+ base_4b_example_tealrobotsmall_tp1,
156
+ base_4b_example_tealrobot_tp4,
157
+ ]:
158
+ cs.store(
159
+ group="experiment",
160
+ package="_global_",
161
+ name=_item["job"]["name"],
162
+ node=_item,
163
+ )
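A minimal sketch of registering an additional experiment variant derived from one of the configs above; the variant name and overrides are illustrative, and the sketch assumes the LazyDict can be deep-copied like a plain dict.

    import copy

    # Hypothetical variant: same setup as base_4b_example_tealrobotsmall_tp1, shorter run.
    my_variant = copy.deepcopy(base_4b_example_tealrobotsmall_tp1)
    my_variant["job"]["name"] = "base_4b_example_tealrobotsmall_tp1_short"
    my_variant["trainer"]["max_iter"] = 10_000

    cs.store(group="experiment", package="_global_", name=my_variant["job"]["name"], node=my_variant)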
cosmos_predict1/autoregressive/configs/inference/inference_config.py ADDED
@@ -0,0 +1,102 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Any, List, Optional, Union
17
+
18
+ import attrs
19
+
20
+ from cosmos_predict1.autoregressive.configs.base.model import ModelConfig, TokenizerConfig
21
+
22
+
23
+ @attrs.define(slots=False)
24
+ class DataShapeConfig:
25
+ latent_shape: list = []
26
+ num_video_frames: Union[None, int] = None
27
+ height: Union[None, int] = None
28
+ width: Union[None, int] = None
29
+
30
+
31
+ @attrs.define(slots=False)
32
+ class SamplingConfig:
33
+ """
34
+ Sampling config
35
+ Args:
36
+ temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
37
+ top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
38
+ logprobs (bool): Flag indicating whether to compute token log probabilities. Defaults to False.
39
+ echo (bool): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.
40
+
41
+ """
42
+
43
+ temperature: float = 0.6
44
+ top_k: Optional[int] = None
45
+ top_p: float = 0.9
46
+ compile_prefill: bool = False
47
+ compile_sampling: bool = True
48
+ logprobs: bool = False
49
+ echo: bool = False
50
+
51
+
52
+ @attrs.define(slots=False)
53
+ class DiffusionDecoderSamplingConfig:
54
+ """
55
+ Diffusion decoder sampling config
56
+ Args:
57
+ guidance (float): Guidance scale for the diffusion process. Controls how much the model follows the conditioning. Defaults to 1.8.
58
+ sigma_min (float): Minimum noise level for the diffusion process. Defaults to 0.02.
59
+ sigma (float): Initial noise level for the diffusion process. Defaults to 8.
60
+ num_steps (int): Number of denoising steps to perform. Defaults to 15.
61
+ overlap (int): Number of overlapping frames between video chunks during processing. Defaults to 2.
62
+ continuous_tokenizer_channel (int): Number of channels in the continuous tokenizer of diffusion decoder. Defaults to 16.
63
+ continuous_tokenizer_spatial_compression_ratio (int): Spatial compression ratio for the continuous tokenizer of diffusion decoder. Defaults to 8.
64
+ dd_train_num_video_frames (int): Number of video frames used during training for diffusion decoder. Defaults to 57.
65
+ """
66
+
67
+ guidance: float = 1.8
68
+ sigma_min: float = 0.02
69
+ sigma: float = 8
70
+ num_steps: int = 15
71
+ overlap: int = 2
72
+ continuous_tokenizer_channel: int = 16
73
+ continuous_tokenizer_spatial_compression_ratio: int = 8
74
+ dd_train_num_video_frames: int = 57
75
+ max_iter: int = 99
76
+ fps: int = 24
77
+
78
+
79
+ @attrs.define(slots=False)
80
+ class InferenceConfig:
81
+ """
82
+ Inference config
83
+ Args:
84
+ model_config (ModelConfig): Model config
85
+ tokenizer_config (TokenizerConfig): Tokenizer config
86
+ ckpt_path (str): Path to the checkpoint
87
+ latent_shape (list): Shape of the latent
88
+ """
89
+
90
+ model_config: ModelConfig = None
91
+ tokenizer_config: TokenizerConfig = None
92
+ ckpt_path: str = ""
93
+ data_shape_config: DataShapeConfig = None
94
+
95
+ defaults: List[Any] = attrs.field(
96
+ factory=lambda: [
97
+ "_self_",
98
+ {"data_val": None},
99
+ {"data_shape_config": "video_shape_as_model_config"},
100
+ {"eval_job": None},
101
+ ]
102
+ )
cosmos_predict1/autoregressive/configs/registry.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ from hydra.core.config_store import ConfigStore
18
+
19
+ from cosmos_predict1.autoregressive.configs.base.callbacks import BASIC_CALLBACKS, VIDEO_TEACHER_FORCING_CALLBACK
20
+ from cosmos_predict1.autoregressive.configs.base.dataloader import get_tealrobot_video
21
+ from cosmos_predict1.autoregressive.configs.base.optim import LambdaLinearLR
22
+ from cosmos_predict1.autoregressive.configs.experiment.video2video.basic import register_experiments
23
+ from cosmos_predict1.utils import config, log
24
+ from cosmos_predict1.utils.lazy_config import LazyCall as L
25
+ from cosmos_predict1.utils.scheduler import WarmupCosineLR
26
+
27
+
28
+ def register_checkpoint(cs):
29
+ checkpoint_local = config.CheckpointConfig(
30
+ save_iter=5000,
31
+ broadcast_via_filesystem=True,
32
+ )
33
+ cs.store(group="checkpoint", package="checkpoint", name="local", node=checkpoint_local)
34
+
35
+
36
+ def register_callbacks(cs):
37
+ cs.store(group="callbacks", package="trainer.callbacks", name="basic", node=BASIC_CALLBACKS)
38
+ cs.store(
39
+ group="callbacks",
40
+ package="trainer.callbacks",
41
+ name="video_teacher_forcing",
42
+ node=VIDEO_TEACHER_FORCING_CALLBACK,
43
+ )
44
+
45
+
46
+ def register_scheduler(cs):
47
+ cs.store(
48
+ group="scheduler",
49
+ package="scheduler",
50
+ name="warmup_cosine_lr",
51
+ node=L(WarmupCosineLR)(optimizer=None, warmup_iters=5000, lr_decay_iters="${trainer.max_iter}", min_lr=1e-8),
52
+ )
53
+ cs.store(group="scheduler", package="scheduler", name="lambdalinear", node=LambdaLinearLR)
54
+
55
+
56
+ def register_optimizer(cs):
57
+ cs.store(
58
+ group="optimizer",
59
+ package="optimizer",
60
+ name="fused_adamw",
61
+ node=L(torch.optim.AdamW)(params=None, lr=1e-3, weight_decay=0.05, fused=True),
62
+ )
63
+ cs.store(
64
+ group="optimizer",
65
+ package="optimizer",
66
+ name="sgd",
67
+ node=L(torch.optim.SGD)(params=None, lr=5e-6, momentum=0.9),
68
+ )
69
+
70
+
71
+ def register_training_data(cs):
72
+ cs.store(
73
+ group="data_train",
74
+ package="dataloader_train",
75
+ name="tealrobot_video_small",
76
+ node=get_tealrobot_video(num_frames=33, video_size=[384, 640]),
77
+ )
78
+ cs.store(group="data_train", package="dataloader_train", name="tealrobot_video", node=get_tealrobot_video())
79
+
80
+
81
+ def register_configs():
82
+ log.info("Registering configs for autoregressive_base")
83
+ cs = ConfigStore.instance()
84
+ register_callbacks(cs)
85
+ register_checkpoint(cs)
86
+ register_optimizer(cs)
87
+ register_scheduler(cs)
88
+ register_training_data(cs)
89
+ register_experiments(cs)
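A minimal sketch of extending this registry with another training dataloader, mirroring register_training_data above; the entry name and video size are illustrative and not part of this commit.

    def register_extra_training_data(cs):
        # Hypothetical additional entry, selectable as data_train=tealrobot_video_tiny.
        cs.store(
            group="data_train",
            package="dataloader_train",
            name="tealrobot_video_tiny",
            node=get_tealrobot_video(num_frames=17, video_size=[256, 320]),
        )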