letitiaaa committed
Commit
e66e8cc
1 Parent(s): d136541

Initial commit

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +2 -0
  2. .github/workflows/ci.yml +121 -0
  3. .github/workflows/clear-cache.yml +29 -0
  4. .github/workflows/python-publish.yml +37 -0
  5. .gitignore +153 -0
  6. CITATION.cff +33 -0
  7. HISTORY.md +223 -0
  8. LICENSE +23 -0
  9. MANIFEST.in +3 -0
  10. README.md +618 -0
  11. models.txt +2 -0
  12. pytest.ini +3 -0
  13. requirements.txt +8 -0
  14. src/open_clip/__init__.py +18 -0
  15. src/open_clip/coca_model.py +582 -0
  16. src/open_clip/constants.py +11 -0
  17. src/open_clip/convert.py +206 -0
  18. src/open_clip/factory.py +586 -0
  19. src/open_clip/hf_configs.py +67 -0
  20. src/open_clip/hf_model.py +193 -0
  21. src/open_clip/loss.py +447 -0
  22. src/open_clip/model.py +919 -0
  23. src/open_clip/model_configs/EVA01-g-14-plus.json +18 -0
  24. src/open_clip/model_configs/EVA01-g-14.json +18 -0
  25. src/open_clip/model_configs/EVA02-B-16.json +18 -0
  26. src/open_clip/model_configs/EVA02-E-14-plus.json +18 -0
  27. src/open_clip/model_configs/EVA02-E-14.json +18 -0
  28. src/open_clip/model_configs/EVA02-L-14-336.json +18 -0
  29. src/open_clip/model_configs/EVA02-L-14.json +18 -0
  30. src/open_clip/model_configs/MobileCLIP-B.json +21 -0
  31. src/open_clip/model_configs/MobileCLIP-S1.json +21 -0
  32. src/open_clip/model_configs/MobileCLIP-S2.json +21 -0
  33. src/open_clip/model_configs/RN101-quickgelu.json +22 -0
  34. src/open_clip/model_configs/RN101.json +21 -0
  35. src/open_clip/model_configs/RN50-quickgelu.json +22 -0
  36. src/open_clip/model_configs/RN50.json +21 -0
  37. src/open_clip/model_configs/RN50x16-quickgelu.json +22 -0
  38. src/open_clip/model_configs/RN50x16.json +21 -0
  39. src/open_clip/model_configs/RN50x4-quickgelu.json +22 -0
  40. src/open_clip/model_configs/RN50x4.json +21 -0
  41. src/open_clip/model_configs/RN50x64-quickgelu.json +22 -0
  42. src/open_clip/model_configs/RN50x64.json +21 -0
  43. src/open_clip/model_configs/ViT-B-16-SigLIP-256.json +29 -0
  44. src/open_clip/model_configs/ViT-B-16-SigLIP-384.json +29 -0
  45. src/open_clip/model_configs/ViT-B-16-SigLIP-512.json +29 -0
  46. src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json +29 -0
  47. src/open_clip/model_configs/ViT-B-16-SigLIP.json +29 -0
  48. src/open_clip/model_configs/ViT-B-16-SigLIP2-256.json +32 -0
  49. src/open_clip/model_configs/ViT-B-16-SigLIP2-384.json +32 -0
  50. src/open_clip/model_configs/ViT-B-16-SigLIP2-512.json +32 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
1
+ *.py linguist-language=python
2
+ *.ipynb linguist-documentation
.github/workflows/ci.yml ADDED
@@ -0,0 +1,121 @@
1
+ name: Continuous integration
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths-ignore:
8
+ - '**.md'
9
+ - 'CITATION.cff'
10
+ - 'LICENSE'
11
+ - '.gitignore'
12
+ - 'docs/**'
13
+ pull_request:
14
+ branches:
15
+ - main
16
+ paths-ignore:
17
+ - '**.md'
18
+ - 'CITATION.cff'
19
+ - 'LICENSE'
20
+ - '.gitignore'
21
+ - 'docs/**'
22
+ workflow_dispatch:
23
+ inputs:
24
+ manual_revision_reference:
25
+ required: false
26
+ type: string
27
+ manual_revision_test:
28
+ required: false
29
+ type: string
30
+
31
+ env:
32
+ REVISION_REFERENCE: v2.8.2
33
+ #9d31b2ec4df6d8228f370ff20c8267ec6ba39383 earliest compatible v2.7.0 + pretrained_hf param
34
+
35
+ jobs:
36
+ Tests:
37
+ strategy:
38
+ matrix:
39
+ os: [ ubuntu-latest ] #, macos-latest ]
40
+ python: [ 3.8 ]
41
+ job_num: [ 4 ]
42
+ job: [ 1, 2, 3, 4 ]
43
+ runs-on: ${{ matrix.os }}
44
+ steps:
45
+ - uses: actions/checkout@v3
46
+ with:
47
+ fetch-depth: 0
48
+ ref: ${{ inputs.manual_revision_test }}
49
+ - name: Set up Python ${{ matrix.python }}
50
+ id: pythonsetup
51
+ uses: actions/setup-python@v4
52
+ with:
53
+ python-version: ${{ matrix.python }}
54
+ - name: Venv cache
55
+ id: venv-cache
56
+ uses: actions/cache@v3
57
+ with:
58
+ path: .env
59
+ key: venv-${{ matrix.os }}-${{ steps.pythonsetup.outputs.python-version }}-${{ hashFiles('requirements*') }}
60
+ - name: Pytest durations cache
61
+ uses: actions/cache@v3
62
+ with:
63
+ path: .test_durations
64
+ key: test_durations-${{ matrix.os }}-${{ steps.pythonsetup.outputs.python-version }}-${{ matrix.job }}-${{ github.run_id }}
65
+ restore-keys: test_durations-0-
66
+ - name: Setup
67
+ if: steps.venv-cache.outputs.cache-hit != 'true'
68
+ run: |
69
+ python3 -m venv .env
70
+ source .env/bin/activate
71
+ pip install -e .[test]
72
+ - name: Prepare test data
73
+ run: |
74
+ source .env/bin/activate
75
+ python -m pytest \
76
+ --quiet --co \
77
+ --splitting-algorithm least_duration \
78
+ --splits ${{ matrix.job_num }} \
79
+ --group ${{ matrix.job }} \
80
+ -m regression_test \
81
+ tests \
82
+ | head -n -2 | grep -Po 'test_inference_with_data\[\K[^]]*(?=-False]|-True])' \
83
+ > models_gh_runner.txt
84
+ if [ -n "${{ inputs.manual_revision_reference }}" ]; then
85
+ REVISION_REFERENCE=${{ inputs.manual_revision_reference }}
86
+ fi
87
+ python tests/util_test.py \
88
+ --save_model_list models_gh_runner.txt \
89
+ --model_list models_gh_runner.txt \
90
+ --git_revision $REVISION_REFERENCE
91
+ - name: Unit tests
92
+ run: |
93
+ source .env/bin/activate
94
+ if [[ -f .test_durations ]]
95
+ then
96
+ cp .test_durations durations_1
97
+ mv .test_durations durations_2
98
+ fi
99
+ python -m pytest \
100
+ -x -s -v \
101
+ --splitting-algorithm least_duration \
102
+ --splits ${{ matrix.job_num }} \
103
+ --group ${{ matrix.job }} \
104
+ --store-durations \
105
+ --durations-path durations_1 \
106
+ --clean-durations \
107
+ -m "not regression_test" \
108
+ tests
109
+ OPEN_CLIP_TEST_REG_MODELS=models_gh_runner.txt python -m pytest \
110
+ -x -s -v \
111
+ --store-durations \
112
+ --durations-path durations_2 \
113
+ --clean-durations \
114
+ -m "regression_test" \
115
+ tests
116
+ jq -s -S 'add' durations_* > .test_durations
117
+ - name: Collect pytest durations
118
+ uses: actions/upload-artifact@v4
119
+ with:
120
+ name: pytest_durations_${{ matrix.os }}-${{ matrix.python }}-${{ matrix.job }}
121
+ path: .test_durations
.github/workflows/clear-cache.yml ADDED
@@ -0,0 +1,29 @@
1
+ name: Clear cache
2
+
3
+ on:
4
+ workflow_dispatch:
5
+
6
+ permissions:
7
+ actions: write
8
+
9
+ jobs:
10
+ clear-cache:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Clear cache
14
+ uses: actions/github-script@v6
15
+ with:
16
+ script: |
17
+ const caches = await github.rest.actions.getActionsCacheList({
18
+ owner: context.repo.owner,
19
+ repo: context.repo.repo,
20
+ })
21
+ for (const cache of caches.data.actions_caches) {
22
+ console.log(cache)
23
+ await github.rest.actions.deleteActionsCacheById({
24
+ owner: context.repo.owner,
25
+ repo: context.repo.repo,
26
+ cache_id: cache.id,
27
+ })
28
+ }
29
+
.github/workflows/python-publish.yml ADDED
@@ -0,0 +1,37 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ jobs:
8
+ deploy:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v2
12
+ - uses: actions-ecosystem/action-regex-match@v2
13
+ id: regex-match
14
+ with:
15
+ text: ${{ github.event.head_commit.message }}
16
+ regex: '^Release ([^ ]+)'
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v2
19
+ with:
20
+ python-version: '3.8'
21
+ - name: Install dependencies
22
+ run: |
23
+ python -m pip install --upgrade pip
24
+ pip install setuptools wheel twine build
25
+ - name: Release
26
+ if: ${{ steps.regex-match.outputs.match != '' }}
27
+ uses: softprops/action-gh-release@v1
28
+ with:
29
+ tag_name: v${{ steps.regex-match.outputs.group1 }}
30
+ - name: Build and publish
31
+ if: ${{ steps.regex-match.outputs.match != '' }}
32
+ env:
33
+ TWINE_USERNAME: __token__
34
+ TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
35
+ run: |
36
+ python -m build
37
+ twine upload dist/*
.gitignore ADDED
@@ -0,0 +1,153 @@
1
+ **/logs/
2
+ **/wandb/
3
+ models/
4
+ features/
5
+ results/
6
+
7
+ tests/data/
8
+ *.pt
9
+
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+
15
+ # C extensions
16
+ *.so
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ dist/
23
+ downloads/
24
+ eggs/
25
+ .eggs/
26
+ lib/
27
+ lib64/
28
+ parts/
29
+ sdist/
30
+ var/
31
+ wheels/
32
+ pip-wheel-metadata/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ *.py,cover
60
+ .hypothesis/
61
+ .pytest_cache/
62
+
63
+ # Translations
64
+ *.mo
65
+ *.pot
66
+
67
+ # Django stuff:
68
+ *.log
69
+ local_settings.py
70
+ db.sqlite3
71
+ db.sqlite3-journal
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
104
+ __pypackages__/
105
+
106
+ # Celery stuff
107
+ celerybeat-schedule
108
+ celerybeat.pid
109
+
110
+ # SageMath parsed files
111
+ *.sage.py
112
+
113
+ # Environments
114
+ .env
115
+ .venv
116
+ env/
117
+ venv/
118
+ ENV/
119
+ env.bak/
120
+ venv.bak/
121
+
122
+ # Spyder project settings
123
+ .spyderproject
124
+ .spyproject
125
+
126
+ # Rope project settings
127
+ .ropeproject
128
+
129
+ # mkdocs documentation
130
+ /site
131
+
132
+ # mypy
133
+ .mypy_cache/
134
+ .dmypy.json
135
+ dmypy.json
136
+
137
+ # Pyre type checker
138
+ .pyre/
139
+ sync.sh
140
+ gpu1sync.sh
141
+ .idea
142
+ *.pdf
143
+ **/._*
144
+ **/*DS_*
145
+ **.jsonl
146
+ src/sbatch
147
+ src/misc
148
+ .vscode
149
+ src/debug
150
+ core.*
151
+
152
+ # Allow
153
+ !src/evaluation/misc/results_dbs/*
CITATION.cff ADDED
@@ -0,0 +1,33 @@
1
+ cff-version: 1.1.0
2
+ message: If you use this software, please cite it as below.
3
+ authors:
4
+ - family-names: Ilharco
5
+ given-names: Gabriel
6
+ - family-names: Wortsman
7
+ given-names: Mitchell
8
+ - family-names: Wightman
9
+ given-names: Ross
10
+ - family-names: Gordon
11
+ given-names: Cade
12
+ - family-names: Carlini
13
+ given-names: Nicholas
14
+ - family-names: Taori
15
+ given-names: Rohan
16
+ - family-names: Dave
17
+ given-names: Achal
18
+ - family-names: Shankar
19
+ given-names: Vaishaal
20
+ - family-names: Namkoong
21
+ given-names: Hongseok
22
+ - family-names: Miller
23
+ given-names: John
24
+ - family-names: Hajishirzi
25
+ given-names: Hannaneh
26
+ - family-names: Farhadi
27
+ given-names: Ali
28
+ - family-names: Schmidt
29
+ given-names: Ludwig
30
+ title: OpenCLIP
31
+ version: v0.1
32
+ doi: 10.5281/zenodo.5143773
33
+ date-released: 2021-07-28
HISTORY.md ADDED
@@ -0,0 +1,223 @@
1
+ ## 2.24.0
2
+
3
+ * Fix missing space in error message
4
+ * use model flag for normalizing embeddings
5
+ * init logit_bias for non siglip pretrained models
6
+ * Fix logit_bias load_checkpoint addition
7
+ * Make CoCa model match CLIP models for logit scale/bias init
8
+ * Fix missing return of "logit_bias" in CoCa.forward
9
+ * Add NLLB-CLIP with SigLIP models
10
+ * Add get_logits method and NLLB tokenizer
11
+ * Remove the empty file src/open_clip/generation_utils.py
12
+ * Update params.py: "BatchNorm" -> "LayerNorm" in the description string for "--lock-text-freeze-layer-norm"
13
+
14
+ ## 2.23.0
15
+
16
+ * Add CLIPA-v2 models
17
+ * Add SigLIP models
18
+ * Add MetaCLIP models
19
+ * Add NLLB-CLIP models
20
+ * CLIPA train code
21
+ * Minor changes/fixes
22
+ * Remove protobuf version limit
23
+ * Stop checking model name when loading CoCa models
24
+ * Log native wandb step
25
+ * Use bool instead of long masks
26
+
27
+ ## 2.21.0
28
+
29
+ * Add SigLIP loss + training support
30
+ * Add more DataComp models (B/16, B/32 and B/32@256)
31
+ * Update default num workers
32
+ * Update CoCa generation for `transformers>=4.31`
33
+ * PyTorch 2.0 `state_dict()` compatibility fix for compiled models
34
+ * Fix padding in `ResizeMaxSize`
35
+ * Convert JIT model on state dict load for `pretrained='filename…'`
36
+ * Other minor changes and fixes (typos, README, dependencies, CI)
37
+
38
+ ## 2.20.0
39
+
40
+ * Add EVA models
41
+ * Support serial worker training
42
+ * Fix Python 3.7 compatibility
43
+
44
+ ## 2.19.0
45
+
46
+ * Add DataComp models
47
+
48
+ ## 2.18.0
49
+
50
+ * Enable int8 inference without `.weight` attribute
51
+
52
+ ## 2.17.2
53
+
54
+ * Update push_to_hf_hub
55
+
56
+ ## 2.17.0
57
+
58
+ * Add int8 support
59
+ * Update notebook demo
60
+ * Refactor zero-shot classification code
61
+
62
+ ## 2.16.2
63
+
64
+ * Fixes for context_length and vocab_size attributes
65
+
66
+ ## 2.16.1
67
+
68
+ * Fixes for context_length and vocab_size attributes
69
+ * Fix --train-num-samples logic
70
+ * Add HF BERT configs for PubMed CLIP model
71
+
72
+ ## 2.16.0
73
+
74
+ * Add improved g-14 weights
75
+ * Update protobuf version
76
+
77
+ ## 2.15.0
78
+
79
+ * Add convnext_xxlarge weights
80
+ * Fixed import in readme
81
+ * Add samples per second per gpu logging
82
+ * Fix slurm example
83
+
84
+ ## 2.14.0
85
+
86
+ * Move dataset mixtures logic to shard level
87
+ * Fix CoCa accum-grad training
88
+ * Safer transformers import guard
89
+ * get_labels refactoring
90
+
91
+ ## 2.13.0
92
+
93
+ * Add support for dataset mixtures with different sampling weights
94
+ * Make transformers optional again
95
+
96
+ ## 2.12.0
97
+
98
+ * Updated convnext configs for consistency
99
+ * Added input_patchnorm option
100
+ * Clean and improve CoCa generation
101
+ * Support model distillation
102
+ * Add ConvNeXt-Large 320x320 fine-tune weights
103
+
104
+ ## 2.11.1
105
+
106
+ * Make transformers optional
107
+ * Add MSCOCO CoCa finetunes to pretrained models
108
+
109
+ ## 2.11.0
110
+
111
+ * coca support and weights
112
+ * ConvNeXt-Large weights
113
+
114
+ ## 2.10.1
115
+
116
+ * `hf-hub:org/model_id` support for loading models w/ config and weights in Hugging Face Hub
117
+
118
+ ## 2.10.0
119
+
120
+ * Added a ViT-bigG-14 model.
121
+ * Added an up-to-date example slurm script for large training jobs.
122
+ * Added a option to sync logs and checkpoints to S3 during training.
123
+ * New options for LR schedulers, constant and constant with cooldown
124
+ * Fix wandb autoresuming when resume is not set
125
+ * ConvNeXt `base` & `base_w` pretrained models added
126
+ * `timm-` model prefix removed from configs
127
+ * `timm` augmentation + regularization (dropout / drop-path) supported
128
+
129
+ ## 2.9.3
130
+
131
+ * Fix wandb collapsing multiple parallel runs into a single one
132
+
133
+ ## 2.9.2
134
+
135
+ * Fix braceexpand memory explosion for complex webdataset urls
136
+
137
+ ## 2.9.1
138
+
139
+ * Fix release
140
+
141
+ ## 2.9.0
142
+
143
+ * Add training feature to auto-resume from the latest checkpoint on restart via `--resume latest`
144
+ * Allow webp in webdataset
145
+ * Fix logging for number of samples when using gradient accumulation
146
+ * Add model configs for convnext xxlarge
147
+
148
+ ## 2.8.2
149
+
150
+ * wrapped patchdropout in a torch.nn.Module
151
+
152
+ ## 2.8.1
153
+
154
+ * relax protobuf dependency
155
+ * override the default patch dropout value in 'vision_cfg'
156
+
157
+ ## 2.8.0
158
+
159
+ * better support for HF models
160
+ * add support for gradient accumulation
161
+ * CI fixes
162
+ * add support for patch dropout
163
+ * add convnext configs
164
+
165
+
166
+ ## 2.7.0
167
+
168
+ * add multilingual H/14 xlm roberta large
169
+
170
+ ## 2.6.1
171
+
172
+ * fix setup.py _read_reqs
173
+
174
+ ## 2.6.0
175
+
176
+ * Make openclip training usable from pypi.
177
+ * Add xlm roberta large vit h 14 config.
178
+
179
+ ## 2.5.0
180
+
181
+ * pretrained B/32 xlm roberta base: first multilingual clip trained on laion5B
182
+ * pretrained B/32 roberta base: first clip trained using an HF text encoder
183
+
184
+ ## 2.4.1
185
+
186
+ * Add missing hf_tokenizer_name in CLIPTextCfg.
187
+
188
+ ## 2.4.0
189
+
190
+ * Fix #211, missing RN50x64 config. Fix type of dropout param for ResNet models
191
+ * Bring back LayerNorm impl that casts to input for non bf16/fp16
192
+ * zero_shot.py: set correct tokenizer based on args
193
+ * training/params.py: remove hf params and get them from model config
194
+
195
+ ## 2.3.1
196
+
197
+ * Implement grad checkpointing for hf model.
198
+ * custom_text: True if hf_model_name is set
199
+ * Disable hf tokenizer parallelism
200
+
201
+ ## 2.3.0
202
+
203
+ * Generalizable Text Transformer with HuggingFace Models (@iejMac)
204
+
205
+ ## 2.2.0
206
+
207
+ * Support for custom text tower
208
+ * Add checksum verification for pretrained model weights
209
+
210
+ ## 2.1.0
211
+
212
+ * lot including sota models, bfloat16 option, better loading, better metrics
213
+
214
+ ## 1.2.0
215
+
216
+ * ViT-B/32 trained on Laion2B-en
217
+ * add missing openai RN50x64 model
218
+
219
+ ## 1.1.1
220
+
221
+ * ViT-B/16+
222
+ * Add grad checkpointing support
223
+ * more robust data loader
LICENSE ADDED
@@ -0,0 +1,23 @@
1
+ Copyright (c) 2012-2021 Gabriel Ilharco, Mitchell Wortsman,
2
+ Nicholas Carlini, Rohan Taori, Achal Dave, Vaishaal Shankar,
3
+ John Miller, Hongseok Namkoong, Hannaneh Hajishirzi, Ali Farhadi,
4
+ Ludwig Schmidt
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining
7
+ a copy of this software and associated documentation files (the
8
+ "Software"), to deal in the Software without restriction, including
9
+ without limitation the rights to use, copy, modify, merge, publish,
10
+ distribute, sublicense, and/or sell copies of the Software, and to
11
+ permit persons to whom the Software is furnished to do so, subject to
12
+ the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be
15
+ included in all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
MANIFEST.in ADDED
@@ -0,0 +1,3 @@
1
+ include src/open_clip/bpe_simple_vocab_16e6.txt.gz
2
+ include src/open_clip/model_configs/*.json
3
+
README.md ADDED
@@ -0,0 +1,618 @@
1
+ # OpenCLIP
2
+
3
+ [[Paper]](https://arxiv.org/abs/2212.07143) [[Citations]](#citing) [[Clip Colab]](https://colab.research.google.com/github/mlfoundations/open_clip/blob/master/docs/Interacting_with_open_clip.ipynb) [[Coca Colab]](https://colab.research.google.com/github/mlfoundations/open_clip/blob/master/docs/Interacting_with_open_coca.ipynb)
4
+ [![pypi](https://img.shields.io/pypi/v/open_clip_torch.svg)](https://pypi.python.org/pypi/open_clip_torch)
5
+
6
+ Welcome to an open source implementation of OpenAI's [CLIP](https://arxiv.org/abs/2103.00020) (Contrastive Language-Image Pre-training).
7
+
8
+ Using this codebase, we have trained several models on a variety of data sources and compute budgets, ranging from [small-scale experiments](docs/LOW_ACC.md) to larger runs including models trained on datasets such as [LAION-400M](https://arxiv.org/abs/2111.02114), [LAION-2B](https://arxiv.org/abs/2210.08402) and [DataComp-1B](https://arxiv.org/abs/2304.14108).
9
+ Many of our models and their scaling properties are studied in detail in the paper [Reproducible Scaling Laws for Contrastive Language-Image Learning](https://arxiv.org/abs/2212.07143).
10
+ Some of the best models we've trained and their zero-shot ImageNet-1k accuracy are shown below, along with the ViT-L model trained by OpenAI and other state-of-the-art open source alternatives (all can be loaded via OpenCLIP).
11
+ We provide more details about our full collection of pretrained models [here](docs/PRETRAINED.md), and zero-shot results for 38 datasets [here](docs/openclip_results.csv).
12
+
13
+
14
+
15
+ | Model | Training data | Resolution | # of samples seen | ImageNet zero-shot acc. |
16
+ | -------- | ------- | ------- | ------- | ------- |
17
+ | ConvNext-Base | LAION-2B | 256px | 13B | 71.5% |
18
+ | ConvNext-Large | LAION-2B | 320px | 29B | 76.9% |
19
+ | ConvNext-XXLarge | LAION-2B | 256px | 34B | 79.5% |
20
+ | ViT-B/32 | DataComp-1B | 256px | 34B | 72.8% |
21
+ | ViT-B/16 | DataComp-1B | 224px | 13B | 73.5% |
22
+ | ViT-L/14 | LAION-2B | 224px | 32B | 75.3% |
23
+ | ViT-H/14 | LAION-2B | 224px | 32B | 78.0% |
24
+ | ViT-L/14 | DataComp-1B | 224px | 13B | 79.2% |
25
+ | ViT-G/14 | LAION-2B | 224px | 34B | 80.1% |
26
+ | | | | | |
27
+ | ViT-L/14-quickgelu [(Original CLIP)](https://arxiv.org/abs/2103.00020) | WIT | 224px | 13B | 75.5% |
28
+ | ViT-SO400M/14 [(SigLIP)](https://arxiv.org/abs/2303.15343) | WebLI | 224px | 45B | 82.0% |
29
+ | ViT-L/14 [(DFN)](https://arxiv.org/abs/2309.17425) | DFN-2B | 224px | 39B | 82.2% |
30
+ | ViT-SO400M-14-SigLIP-384 [(SigLIP)](https://arxiv.org/abs/2303.15343) | WebLI | 384px | 45B | 83.1% |
31
+ | ViT-H/14-quickgelu [(DFN)](https://arxiv.org/abs/2309.17425) | DFN-5B | 224px | 39B | 83.4% |
32
+ | ViT-H-14-378-quickgelu [(DFN)](https://arxiv.org/abs/2309.17425) | DFN-5B | 378px | 44B | 84.4% |
33
+
34
+ Model cards with additional model-specific details can be found on the Hugging Face Hub under the OpenCLIP library tag: https://huggingface.co/models?library=open_clip.
35
+
36
+ If you found this repository useful, please consider [citing](#citing).
37
+ We welcome anyone to submit an issue or send an email if you have any other requests or suggestions.
38
+
39
+ Note that portions of `src/open_clip/` modelling and tokenizer code are adaptations of OpenAI's official [repository](https://github.com/openai/CLIP).
40
+
41
+ ## Approach
42
+
43
+ | ![CLIP](https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/CLIP.png) |
44
+ |:--:|
45
+ | Image Credit: https://github.com/openai/CLIP |
46
+
47
+ ## Usage
48
+
49
+ ```
50
+ pip install open_clip_torch
51
+ ```
52
+
53
+ ```python
54
+ import torch
55
+ from PIL import Image
56
+ import open_clip
57
+
58
+ model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
59
+ model.eval() # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
60
+ tokenizer = open_clip.get_tokenizer('ViT-B-32')
61
+
62
+ image = preprocess(Image.open("docs/CLIP.png")).unsqueeze(0)
63
+ text = tokenizer(["a diagram", "a dog", "a cat"])
64
+
65
+ with torch.no_grad(), torch.autocast("cuda"):
66
+     image_features = model.encode_image(image)
67
+     text_features = model.encode_text(text)
68
+     image_features /= image_features.norm(dim=-1, keepdim=True)
69
+     text_features /= text_features.norm(dim=-1, keepdim=True)
70
+
71
+ text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
72
+
73
+ print("Label probs:", text_probs) # prints: [[1., 0., 0.]]
74
+ ```
75
+
76
+ If the model uses `timm` image encoders (ConvNeXt, SigLIP, EVA, etc.), ensure the latest `timm` is installed. Upgrade `timm` if you see 'Unknown model' errors for the image encoder.
77
+
78
+ If the model uses `transformers` tokenizers, ensure `transformers` is installed.
79
+
80
+ See also this [[Clip Colab]](https://colab.research.google.com/github/mlfoundations/open_clip/blob/master/docs/Interacting_with_open_clip.ipynb).
81
+
82
+ To compute billions of embeddings efficiently, you can use [clip-retrieval](https://github.com/rom1504/clip-retrieval) which has openclip support.
83
+
84
+ ### Pretrained models
85
+
86
+ We offer a simple model interface to instantiate both pre-trained and untrained models.
87
+ To see which pretrained models are available, use the following code snippet.
88
+ More details about our pretrained models are available [here](docs/PRETRAINED.md).
89
+
90
+ ```python
91
+ >>> import open_clip
92
+ >>> open_clip.list_pretrained()
93
+ ```
94
+
95
+ You can find more about the models we support (e.g. number of parameters, FLOPs) in [this table](docs/model_profile.csv).
96
+
97
+ NOTE: Many existing checkpoints use the QuickGELU activation from the original OpenAI models. This activation is actually less efficient than native torch.nn.GELU in recent versions of PyTorch. The model defaults are now nn.GELU, so one should use model definitions with the `-quickgelu` postfix for the OpenCLIP pretrained weights that use QuickGELU. All OpenAI pretrained weights will always default to QuickGELU. One can also use the non `-quickgelu` model definitions with pretrained weights that use QuickGELU, but there will be an accuracy drop; for fine-tuning, that drop will likely vanish over longer runs.
98
+ Future trained models will use nn.GELU.
99
+
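For example, a brief sketch (the model and pretrained tags are ones listed by `open_clip.list_pretrained()`):

```python
import open_clip

# OpenAI weights were trained with QuickGELU, so pair them with a `-quickgelu` model definition.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32-quickgelu', pretrained='openai')

# Checkpoints trained with nn.GELU (e.g. the LAION-2B ViT-B-32 weights above) use the plain model name.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
```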
100
+ ### Loading models
101
+
102
+ Models can be loaded with `open_clip.create_model_and_transforms`, as shown in the example below. The model name and corresponding `pretrained` keys are compatible with the outputs of `open_clip.list_pretrained()`.
103
+
104
+ The `pretrained` argument also accepts local paths, for example `/path/to/my/b32.pt`.
105
+ You can also load checkpoints from the Hugging Face Hub this way. To do so, download the `open_clip_pytorch_model.bin` file (for example, [https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K/tree/main](https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K/blob/main/open_clip_pytorch_model.bin)), and use `pretrained=/path/to/open_clip_pytorch_model.bin`.
106
+
107
+ ```python
108
+ # pretrained also accepts local paths
109
+ model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
110
+ ```
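Models hosted on the Hugging Face Hub can also be referenced directly with the `hf-hub:` prefix (a sketch, using the repository linked above; config and weights are downloaded on first use):

```python
import open_clip

model, _, preprocess = open_clip.create_model_and_transforms('hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K')
tokenizer = open_clip.get_tokenizer('hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K')
```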
111
+
112
+ ## Fine-tuning on classification tasks
113
+
114
+ This repository is focused on training CLIP models. To fine-tune a *trained* zero-shot model on a downstream classification task such as ImageNet, please see [our other repository: WiSE-FT](https://github.com/mlfoundations/wise-ft). The [WiSE-FT repository](https://github.com/mlfoundations/wise-ft) contains code for our paper on [Robust Fine-tuning of Zero-shot Models](https://arxiv.org/abs/2109.01903), in which we introduce a technique for fine-tuning zero-shot models while preserving robustness under distribution shift.
115
+
116
+ ## Data
117
+
118
+ To download datasets as webdataset, we recommend [img2dataset](https://github.com/rom1504/img2dataset).
119
+
120
+ ### Conceptual Captions
121
+
122
+ See [cc3m img2dataset example](https://github.com/rom1504/img2dataset/blob/main/dataset_examples/cc3m.md).
123
+
124
+ ### YFCC and other datasets
125
+
126
+ In addition to specifying the training data via CSV files as mentioned above, our codebase also supports [webdataset](https://github.com/webdataset/webdataset), which is recommended for larger scale datasets. The expected format is a series of `.tar` files. Each of these `.tar` files should contain two files for each training example, one for the image and one for the corresponding text. Both files should have the same name but different extensions. For instance, `shard_001.tar` could contain files such as `abc.jpg` and `abc.txt`. You can learn more about `webdataset` at [https://github.com/webdataset/webdataset](https://github.com/webdataset/webdataset). We use `.tar` files with 1,000 data points each, which we create using [tarp](https://github.com/webdataset/tarp).
127
+
128
+ You can download the YFCC dataset from [Multimedia Commons](http://mmcommons.org/).
129
+ Similar to OpenAI, we used a subset of YFCC to reach the aforementioned accuracy numbers.
130
+ The indices of images in this subset are in [OpenAI's CLIP repository](https://github.com/openai/CLIP/blob/main/data/yfcc100m.md).
131
+
132
+
133
+ ## Training CLIP
134
+
135
+ ### Install
136
+
137
+ We advise you to first create a virtual environment with:
138
+
139
+ ```
140
+ python3 -m venv .env
141
+ source .env/bin/activate
142
+ pip install -U pip
143
+ ```
144
+
145
+ You can then install openclip for training with `pip install 'open_clip_torch[training]'`.
146
+
147
+ #### Development
148
+
149
+ If you want to make changes to contribute code, you can clone openclip and then run `make install` in the openclip folder (after creating a virtualenv).
150
+
151
+ Install PyTorch via pip as per https://pytorch.org/get-started/locally/.
152
+
153
+ You may run `make install-training` to install the training dependencies.
154
+
155
+ #### Testing
156
+
157
+ Tests can be run with `make install-test` followed by `make test`.
158
+
159
+ Run `python -m pytest -x -s -v tests -k "training"` to run a specific subset of tests.
160
+
161
+ Running regression tests against a specific git revision or tag:
162
+ 1. Generate testing data
163
+ ```sh
164
+ python tests/util_test.py --model RN50 RN101 --save_model_list models.txt --git_revision 9d31b2ec4df6d8228f370ff20c8267ec6ba39383
165
+ ```
166
+ **_WARNING_: This will invoke git and modify your working tree, but will reset it to the current state after data has been generated! \
167
+ Don't modify your working tree while test data is being generated this way.**
168
+
169
+ 2. Run regression tests
170
+ ```sh
171
+ OPEN_CLIP_TEST_REG_MODELS=models.txt python -m pytest -x -s -v -m regression_test
172
+ ```
173
+
174
+ ### Sample single-process running code:
175
+
176
+ ```bash
177
+ python -m open_clip_train.main \
178
+ --save-frequency 1 \
179
+ --zeroshot-frequency 1 \
180
+ --report-to tensorboard \
181
+ --train-data="/path/to/train_data.csv" \
182
+ --val-data="/path/to/validation_data.csv" \
183
+ --csv-img-key filepath \
184
+ --csv-caption-key title \
185
+ --imagenet-val=/path/to/imagenet/root/val/ \
186
+ --warmup 10000 \
187
+ --batch-size=128 \
188
+ --lr=1e-3 \
189
+ --wd=0.1 \
190
+ --epochs=30 \
191
+ --workers=8 \
192
+ --model RN50
193
+ ```
194
+
195
+ Note: `imagenet-val` is the path to the *validation* set of ImageNet for zero-shot evaluation, not the training set!
196
+ You can remove this argument if you do not want to perform zero-shot evaluation on ImageNet throughout training. Note that the `val` folder should contain subfolders. If it does not, please use [this script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh).
197
+
198
+ ### Multi-GPU and Beyond
199
+
200
+ This code has been battle tested up to 1024 A100s and offers a variety of solutions
201
+ for distributed training. We include native support for SLURM clusters.
202
+
203
+ As the number of devices used to train increases, so does the space complexity of
204
+ the logit matrix. Using a naïve all-gather scheme, space complexity will be
205
+ `O(n^2)`. Instead, complexity may become effectively linear if the flags
206
+ `--gather-with-grad` and `--local-loss` are used. This alteration yields numerical
207
+ results identical to the naïve method.
208
+
209
+ #### Epochs
210
+
211
+ For larger datasets (e.g. LAION-2B), we recommend setting `--train-num-samples` to a lower value than the full epoch, for example `--train-num-samples 135646078` for 1/16 of an epoch, in conjunction with `--dataset-resampled` to do sampling with replacement. This allows more frequent checkpointing, and hence more frequent evaluation.
212
+
213
+ #### Patch Dropout
214
+
215
+ <a href="https://arxiv.org/abs/2212.00794">Recent research</a> has shown that one can drop out half to three-quarters of the visual tokens, leading to up to 2-3x faster training without loss of accuracy.
216
+
217
+ You can set this on your visual transformer config with the key `patch_dropout`.
218
+
219
+ In the paper, they also fine-tuned without patch dropout at the end. You can do this with the command-line argument `--force-patch-dropout 0.`
220
+
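As a minimal sketch, a vision config with patch dropout enabled might look like the following (field names follow the `vision_cfg` blocks used by the model configs in this repository; the values are illustrative):

```json
"vision_cfg": {
    "image_size": 224,
    "patch_size": 32,
    "width": 768,
    "layers": 12,
    "patch_dropout": 0.5
}
```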
221
+ #### Multiple data sources
222
+
223
+ OpenCLIP supports using multiple data sources, by separating different data paths with `::`.
224
+ For instance, to train on CC12M and on LAION, one might use `--train-data "/data/cc12m/cc12m-train-{0000..2175}.tar::/data/LAION-400M/{00000..41455}.tar"`.
225
+ Using `--dataset-resampled` is recommended for these cases.
226
+
227
+ By default, the expected number of times the model sees a sample from each source is proportional to the size of the source.
228
+ For instance, when training on one data source with size 400M and one with size 10M, samples from the first source are 40x more likely to be seen in expectation.
229
+
230
+ We also support different weighting of the data sources, by using the `--train-data-upsampling-factors` flag.
231
+ For instance, using `--train-data-upsampling-factors=1::1` in the above scenario is equivalent to not using the flag, and `--train-data-upsampling-factors=1::2` is equivalent to upsampling the second data source twice.
232
+ If you want to sample from data sources with the same frequency, the upsampling factors should be inversely proportional to the sizes of the data sources.
233
+ For instance, if dataset `A` has 1000 samples and dataset `B` has 100 samples, you can use `--train-data-upsampling-factors=0.001::0.01` (or analogously, `--train-data-upsampling-factors=1::10`).
234
+
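Putting these flags together, a sketch of a two-source run (paths reuse the examples above; sample counts and upsampling factors are illustrative):

```bash
python -m open_clip_train.main \
    --train-data "/data/cc12m/cc12m-train-{0000..2175}.tar::/data/LAION-400M/{00000..41455}.tar" \
    --train-data-upsampling-factors=1::2 \
    --dataset-resampled \
    --dataset-type webdataset \
    --train-num-samples 10968539 \
    --batch-size 320 \
    --model ViT-B-32
```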
235
+ #### Single-Node
236
+
237
+ We make use of `torchrun` to launch distributed jobs. The following launches a
238
+ job on a node of 4 GPUs:
239
+
240
+ ```bash
241
+ cd open_clip/src
242
+ torchrun --nproc_per_node 4 -m open_clip_train.main \
243
+ --train-data '/data/cc12m/cc12m-train-{0000..2175}.tar' \
244
+ --train-num-samples 10968539 \
245
+ --dataset-type webdataset \
246
+ --batch-size 320 \
247
+ --precision amp \
248
+ --workers 4 \
249
+ --imagenet-val /data/imagenet/validation/
250
+ ```
251
+
252
+ #### Multi-Node
253
+
254
+ The same script above works, so long as users include information about the number
255
+ of nodes and host node.
256
+
257
+ ```bash
258
+ cd open_clip/src
259
+ torchrun --nproc_per_node=4 \
260
+ --rdzv_endpoint=$HOST_NODE_ADDR \
261
+ -m open_clip_train.main \
262
+ --train-data '/data/cc12m/cc12m-train-{0000..2175}.tar' \
263
+ --train-num-samples 10968539 \
264
+ --dataset-type webdataset \
265
+ --batch-size 320 \
266
+ --precision amp \
267
+ --workers 4 \
268
+ --imagenet-val /data/imagenet/validation/
269
+ ```
270
+
271
+ #### SLURM
272
+
273
+ This is likely the easiest solution to utilize. The following script was used to
274
+ train our largest models:
275
+
276
+ ```bash
277
+ #!/bin/bash -x
278
+ #SBATCH --nodes=32
279
+ #SBATCH --gres=gpu:4
280
+ #SBATCH --ntasks-per-node=4
281
+ #SBATCH --cpus-per-task=6
282
+ #SBATCH --wait-all-nodes=1
283
+ #SBATCH --job-name=open_clip
284
+ #SBATCH --account=ACCOUNT_NAME
285
+ #SBATCH --partition PARTITION_NAME
286
+
287
+ eval "$(/path/to/conda/bin/conda shell.bash hook)" # init conda
288
+ conda activate open_clip
289
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
290
+ export MASTER_PORT=12802
291
+
292
+ master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
293
+ export MASTER_ADDR=$master_addr
294
+
295
+ cd /shared/open_clip
296
+ export PYTHONPATH="$PYTHONPATH:$PWD/src"
297
+ srun --cpu_bind=v --accel-bind=gn python -u src/open_clip_train/main.py \
298
+ --save-frequency 1 \
299
+ --report-to tensorboard \
300
+ --train-data="/data/LAION-400M/{00000..41455}.tar" \
301
+ --warmup 2000 \
302
+ --batch-size=256 \
303
+ --epochs=32 \
304
+ --workers=8 \
305
+ --model ViT-B-32 \
306
+ --name "ViT-B-32-Vanilla" \
307
+ --seed 0 \
308
+ --local-loss \
309
+ --gather-with-grad
310
+ ```
311
+
312
+ ### Resuming from a checkpoint:
313
+
314
+ ```bash
315
+ python -m open_clip_train.main \
316
+ --train-data="/path/to/train_data.csv" \
317
+ --val-data="/path/to/validation_data.csv" \
318
+ --resume /path/to/checkpoints/epoch_K.pt
319
+ ```
320
+
321
+ ### Training CoCa:
322
+ Training [CoCa](https://arxiv.org/abs/2205.01917) models is enabled through specifying a CoCa config using the ```--model``` parameter of the training script. Currently available configs are "coca_base", "coca_ViT-B-32", and "coca_roberta-ViT-B-32" (which uses RoBERTa as the text encoder). CoCa configs are different from CLIP configs because they have an additional "multimodal_cfg" component which specifies parameters for the multimodal text decoder. Here's an example from the coca_ViT-B-32 config:
323
+ ```json
324
+ "multimodal_cfg": {
325
+     "context_length": 76,
326
+     "vocab_size": 49408,
327
+     "width": 512,
328
+     "heads": 8,
329
+     "layers": 12,
330
+     "latent_dim": 512,
331
+     "attn_pooler_heads": 8
332
+ }
333
+ ```
334
+ Credit to [lucidrains](https://github.com/lucidrains) for [initial code](https://github.com/lucidrains/CoCa-pytorch), [gpucce](https://github.com/gpucce) for adapting the code to open_clip, and [iejMac](https://github.com/iejMac) for training the models.
335
+
336
+ ### Generating text with CoCa
337
+
338
+ ```python
339
+ import open_clip
340
+ import torch
341
+ from PIL import Image
342
+
343
+ model, _, transform = open_clip.create_model_and_transforms(
344
+     model_name="coca_ViT-L-14",
345
+     pretrained="mscoco_finetuned_laion2B-s13B-b90k"
346
+ )
347
+
348
+ im = Image.open("cat.jpg").convert("RGB")
349
+ im = transform(im).unsqueeze(0)
350
+
351
+ with torch.no_grad(), torch.cuda.amp.autocast():
352
+     generated = model.generate(im)
353
+
354
+ print(open_clip.decode(generated[0]).split("<end_of_text>")[0].replace("<start_of_text>", ""))
355
+ ```
356
+
357
+ See also this [[Coca Colab]](https://colab.research.google.com/github/mlfoundations/open_clip/blob/master/docs/Interacting_with_open_coca.ipynb)
358
+
359
+ ### Fine Tuning CoCa
360
+
361
+ To fine-tune CoCa on MS COCO, first create the dataset. One way is to use a CSV dataset, and perhaps the simplest way to build it is with [CLIP_benchmark](https://github.com/LAION-AI/CLIP_benchmark), which in turn uses [pycocotools](https://github.com/cocodataset/cocoapi) (which can also be used on its own).
362
+
363
+ ```python
364
+ from clip_benchmark.datasets.builder import build_dataset
365
+ import pandas as pd
366
+ import os
367
+
368
+ root_path = "path/to/data/dir" # set this to something meaningful
369
+ ds = build_dataset("mscoco_captions", root=root_path, split="train", task="captioning") # this downloads the dataset if it is not there already
370
+ coco = ds.coco
371
+ imgs = coco.loadImgs(coco.getImgIds())
372
+ future_df = {"filepath":[], "title":[]}
373
+ for img in imgs:
374
+     caps = coco.imgToAnns[img["id"]]
375
+     for cap in caps:
376
+         future_df["filepath"].append(img["file_name"])
377
+         future_df["title"].append(cap["caption"])
378
+ pd.DataFrame.from_dict(future_df).to_csv(
379
+     os.path.join(root_path, "train2014.csv"), index=False, sep="\t"
380
+ )
381
+ ```
382
+ This should create a CSV dataset that one can use to fine-tune CoCa with open_clip:
383
+ ```bash
384
+ python -m open_clip_train.main \
385
+ --dataset-type "csv" \
386
+ --train-data "path/to/data/dir/train2014.csv" \
387
+ --warmup 1000 \
388
+ --batch-size 128 \
389
+ --lr 1e-5 \
390
+ --wd 0.1 \
391
+ --epochs 1 \
392
+ --workers 3 \
393
+ --model "coca_ViT-L-14" \
394
+ --report-to "wandb" \
395
+ --coca-contrastive-loss-weight 0 \
396
+ --coca-caption-loss-weight 1 \
397
+ --log-every-n-steps 100
398
+ ```
399
+
400
+ This is a general setting; open_clip has many parameters that can be set (```python -m open_clip_train.main --help``` shows them all). The only relevant change compared to pre-training is the two arguments
401
+
402
+ ```bash
403
+ --coca-contrastive-loss-weight 0
404
+ --coca-caption-loss-weight 1
405
+ ```
406
+ which make the model only train the generative side.
407
+
408
+ ### Training with pre-trained language models as text encoder:
409
+
410
+ If you wish to use different language models as the text encoder for CLIP, you can do so by using one of the Hugging Face model configs in ```src/open_clip/model_configs``` and passing its name and tokenizer as the ```--model``` and ```--hf-tokenizer-name``` parameters, respectively. Currently we only support RoBERTa ("test-roberta" config), but adding new models should be trivial. You can also determine how many layers, from the end, to leave unfrozen with the ```--lock-text-unlocked-layers``` parameter. Here's an example command to train CLIP with the RoBERTa LM that has its last 10 layers unfrozen:
411
+ ```bash
412
+ python -m open_clip_train.main \
413
+ --train-data="pipe:aws s3 cp s3://s-mas/cc3m/{00000..00329}.tar -" \
414
+ --train-num-samples 3000000 \
415
+ --val-data="pipe:aws s3 cp s3://s-mas/cc3m/{00330..00331}.tar -" \
416
+ --val-num-samples 10000 \
417
+ --dataset-type webdataset \
418
+ --batch-size 256 \
419
+ --warmup 2000 \
420
+ --epochs 10 \
421
+ --lr 5e-4 \
422
+ --precision amp \
423
+ --workers 6 \
424
+ --model "roberta-ViT-B-32" \
425
+ --lock-text \
426
+ --lock-text-unlocked-layers 10 \
427
+ --name "10_unfrozen" \
428
+ --report-to "tensorboard" \
429
+ ```
430
+
431
+ ### Loss Curves
432
+
433
+ When run on a machine with 8 GPUs the command should produce the following training curve for Conceptual Captions:
434
+
435
+ ![CLIP zero shot training curve](https://raw.githubusercontent.com/mlfoundations/open_clip/main/docs/clip_zeroshot.png)
436
+
437
+ More detailed curves for Conceptual Captions are given at [/docs/clip_conceptual_captions.md](/docs/clip_conceptual_captions.md).
438
+
439
+ When training an RN50 on YFCC, the same hyperparameters as above are used, with the exception of `lr=5e-4` and `epochs=32`.
440
+
441
+ Note that to use another model, like `ViT-B/32`, `RN50x4`, `RN50x16`, or `ViT-B/16`, specify it with `--model`, e.g. `--model RN50x4`.
442
+
443
+ ### Logging
444
+
445
+ For tensorboard logging, run:
446
+ ```bash
447
+ tensorboard --logdir=logs/tensorboard/ --port=7777
448
+ ```
449
+
450
+ For wandb logging, we recommend looking at the `step` variable instead of `Step`, since the latter was not properly set in earlier versions of this codebase.
451
+ For older runs with models trained before https://github.com/mlfoundations/open_clip/pull/613, the `Step` variable should be ignored.
452
+ For newer runs, after that PR, the two variables are the same.
453
+
454
+ ## Evaluation / Zero-Shot
455
+
456
+ We recommend https://github.com/LAION-AI/CLIP_benchmark#how-to-use for systematic evaluation on 40 datasets.
457
+
458
+ ### Evaluating local checkpoint:
459
+
460
+ ```bash
461
+ python -m open_clip_train.main \
462
+ --val-data="/path/to/validation_data.csv" \
463
+ --model RN101 \
464
+ --pretrained /path/to/checkpoints/epoch_K.pt
465
+ ```
466
+
467
+ ### Evaluating hosted pretrained checkpoint on ImageNet zero-shot prediction:
468
+
469
+ ```bash
470
+ python -m open_clip_train.main \
471
+ --imagenet-val /path/to/imagenet/validation \
472
+ --model ViT-B-32-quickgelu \
473
+ --pretrained laion400m_e32
474
+ ```
475
+
476
+ ### Model distillation
477
+
478
+ You can distill from a pre-trained model by using `--distill-model` and `--distill-pretrained` to specify the model you'd like to distill from.
479
+ For instance, to distill from OpenAI ViT-L/14 use `--distill-model ViT-L-14 --distill-pretrained openai`.
480
+
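A fuller command sketch (data path and hyperparameters are illustrative, borrowed from the examples above):

```bash
python -m open_clip_train.main \
    --train-data '/data/cc12m/cc12m-train-{0000..2175}.tar' \
    --train-num-samples 10968539 \
    --dataset-type webdataset \
    --batch-size 128 \
    --precision amp \
    --model ViT-B-32 \
    --distill-model ViT-L-14 \
    --distill-pretrained openai
```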
481
+ ### Gradient accumulation
482
+
483
+ To simulate larger batches use `--accum-freq k`. If the per-GPU batch size, `--batch-size`, is `m`, then the effective batch size will be `k * m * num_gpus`.
484
+
485
+ When increasing `--accum-freq` from its default of 1, samples/s will remain approximately constant (batch size will double, as will time-per-batch). It is recommended to use other features to reduce batch size such as `--grad-checkpointing --local-loss --gather-with-grad` before increasing `--accum-freq`. `--accum-freq` can be used in addition to these features.
486
+
487
+ Instead of 1 forward pass per example, there are now 2 forward passes per example. However, the first is done with `torch.no_grad`.
488
+
489
+ There is some additional GPU memory required: the features and data from all `k` micro-batches are stored in memory.
490
+
491
+ There are also `k` loss computations instead of the usual 1.
492
+
493
+ For more information see Cui et al. (https://arxiv.org/abs/2112.09331) or Pham et al. (https://arxiv.org/abs/2111.10050).
494
+
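As a sketch, with 8 GPUs, `--batch-size 128`, and `--accum-freq 4`, the effective batch size is 4 * 128 * 8 = 4096 (the path and remaining flags are borrowed from the single-node example above):

```bash
cd open_clip/src
torchrun --nproc_per_node 8 -m open_clip_train.main \
    --train-data '/data/cc12m/cc12m-train-{0000..2175}.tar' \
    --train-num-samples 10968539 \
    --dataset-type webdataset \
    --batch-size 128 \
    --accum-freq 4 \
    --grad-checkpointing --local-loss --gather-with-grad \
    --precision amp \
    --workers 4
```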
495
+ ### Int8 Support
496
+
497
+ We have beta support for int8 training and inference.
498
+ You can enable int8 training with `--use-bnb-linear SwitchBackLinearGlobal` or `--use-bnb-linear SwitchBackLinearGlobalMemEfficient`.
499
+ Please see the bitsandbytes library for definitions for these layers.
500
+ For CLIP ViT-Huge this should currently correspond to a 10% training speedup with no accuracy loss.
501
+ More speedups are coming when the attention layer is refactored so that linear layers can be replaced there, too.
502
+
503
+ See the tutorial https://github.com/mlfoundations/open_clip/blob/main/tutorials/int8_tutorial.ipynb or [paper](https://arxiv.org/abs/2304.13013).
504
+
505
+ ### Support for remote loading/training
506
+
507
+ It is always possible to resume directly from a remote file, e.g., a file in an s3 bucket. Just set `--resume s3://<path-to-checkpoint>`.
508
+ This will work with any filesystem supported by `fsspec`.
509
+
510
+ It is also possible to train `open_clip` models while continuously backing up to s3. This can help to avoid slow local file systems.
511
+
512
+ Say that your node has a local SSD `/scratch` and an s3 bucket `s3://<path-to-bucket>`.
513
+
514
+ In that case, set `--logs /scratch` and `--remote-sync s3://<path-to-bucket>`. Then, a background process will sync `/scratch/<run-name>` to `s3://<path-to-bucket>/<run-name>`. After syncing, the background process will sleep for `--remote-sync-frequency` seconds, which defaults to 5 minutes.
515
+
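For example (a sketch; the training flags are borrowed from the examples above, and `--remote-sync-frequency` is given in seconds):

```bash
python -m open_clip_train.main \
    --train-data '/data/cc12m/cc12m-train-{0000..2175}.tar' \
    --train-num-samples 10968539 \
    --dataset-type webdataset \
    --batch-size 256 \
    --model ViT-B-32 \
    --logs /scratch \
    --remote-sync s3://<path-to-bucket> \
    --remote-sync-frequency 600
```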
516
+ There is also experimental support for syncing to other remote file systems, not just s3. To do so, specify `--remote-sync-protocol fsspec`. However, this is currently very slow and not recommended.
517
+
518
+ Also, to optionally avoid saving too many checkpoints locally when using these features, you can use `--delete-previous-checkpoint` which deletes the previous checkpoint after saving a new one.
519
+
520
+ Note: if you are using this feature with `--resume latest`, there are a few caveats. First, using it with `--save-most-recent` is not supported. Second, only `s3` is supported. Finally, since the sync happens in the background, it is possible that the most recent checkpoint may not have finished syncing to the remote.
521
+
522
+ ### Pushing Models to Hugging Face Hub
523
+
524
+ The module `open_clip.push_to_hf_hub` includes helpers for pushing models (weights and config) to the HF Hub.
525
+
526
+ The tool can be run from the command line, for example:
527
+ `python -m open_clip.push_to_hf_hub --model convnext_large_d_320 --pretrained /train/checkpoints/epoch_12.pt --repo-id laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft`
528
+
529
+
530
+
531
+ ## Acknowledgments
532
+
533
+ We gratefully acknowledge the Gauss Centre for Supercomputing e.V. (www.gauss-centre.eu) for funding this part of the work by providing computing time through the John von Neumann Institute for Computing (NIC) on the GCS Supercomputer JUWELS Booster at Jülich Supercomputing Centre (JSC).
534
+
535
+ ## The Team
536
+
537
+ Current development of this repository is led by [Ross Wightman](https://rwightman.com/), [Romain Beaumont](https://github.com/rom1504), [Cade Gordon](http://cadegordon.io/), and [Vaishaal Shankar](http://vaishaal.com/).
538
+
539
+ The original version of this repository is from a group of researchers at UW, Google, Stanford, Amazon, Columbia, and Berkeley.
540
+
541
+ [Gabriel Ilharco*](http://gabrielilharco.com/), [Mitchell Wortsman*](https://mitchellnw.github.io/), [Nicholas Carlini](https://nicholas.carlini.com/), [Rohan Taori](https://www.rohantaori.com/), [Achal Dave](http://www.achaldave.com/), [Vaishaal Shankar](http://vaishaal.com/), [John Miller](https://people.eecs.berkeley.edu/~miller_john/), [Hongseok Namkoong](https://hsnamkoong.github.io/), [Hannaneh Hajishirzi](https://homes.cs.washington.edu/~hannaneh/), [Ali Farhadi](https://homes.cs.washington.edu/~ali/), [Ludwig Schmidt](https://people.csail.mit.edu/ludwigs/)
542
+
543
+ Special thanks to [Jong Wook Kim](https://jongwook.kim/) and [Alec Radford](https://github.com/Newmu) for help with reproducing CLIP!
544
+
545
+ ## Citing
546
+
547
+ If you found this repository useful, please consider citing:
548
+ ```bibtex
549
+ @software{ilharco_gabriel_2021_5143773,
550
+ author = {Ilharco, Gabriel and
551
+ Wortsman, Mitchell and
552
+ Wightman, Ross and
553
+ Gordon, Cade and
554
+ Carlini, Nicholas and
555
+ Taori, Rohan and
556
+ Dave, Achal and
557
+ Shankar, Vaishaal and
558
+ Namkoong, Hongseok and
559
+ Miller, John and
560
+ Hajishirzi, Hannaneh and
561
+ Farhadi, Ali and
562
+ Schmidt, Ludwig},
563
+ title = {OpenCLIP},
564
+ month = jul,
565
+ year = 2021,
566
+ note = {If you use this software, please cite it as below.},
567
+ publisher = {Zenodo},
568
+ version = {0.1},
569
+ doi = {10.5281/zenodo.5143773},
570
+ url = {https://doi.org/10.5281/zenodo.5143773}
571
+ }
572
+ ```
573
+
574
+ ```bibtex
575
+ @inproceedings{cherti2023reproducible,
576
+ title={Reproducible scaling laws for contrastive language-image learning},
577
+ author={Cherti, Mehdi and Beaumont, Romain and Wightman, Ross and Wortsman, Mitchell and Ilharco, Gabriel and Gordon, Cade and Schuhmann, Christoph and Schmidt, Ludwig and Jitsev, Jenia},
578
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
579
+ pages={2818--2829},
580
+ year={2023}
581
+ }
582
+ ```
583
+
584
+ ```bibtex
585
+ @inproceedings{Radford2021LearningTV,
586
+ title={Learning Transferable Visual Models From Natural Language Supervision},
587
+ author={Alec Radford and Jong Wook Kim and Chris Hallacy and A. Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},
588
+ booktitle={ICML},
589
+ year={2021}
590
+ }
591
+ ```
592
+
593
+ ```bibtex
594
+ @inproceedings{schuhmann2022laionb,
595
+ title={{LAION}-5B: An open large-scale dataset for training next generation image-text models},
596
+ author={Christoph Schuhmann and
597
+ Romain Beaumont and
598
+ Richard Vencu and
599
+ Cade W Gordon and
600
+ Ross Wightman and
601
+ Mehdi Cherti and
602
+ Theo Coombes and
603
+ Aarush Katta and
604
+ Clayton Mullis and
605
+ Mitchell Wortsman and
606
+ Patrick Schramowski and
607
+ Srivatsa R Kundurthy and
608
+ Katherine Crowson and
609
+ Ludwig Schmidt and
610
+ Robert Kaczmarczyk and
611
+ Jenia Jitsev},
612
+ booktitle={Thirty-sixth Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
613
+ year={2022},
614
+ url={https://openreview.net/forum?id=M3Y74vmsMcY}
615
+ }
616
+ ```
617
+
618
+ [![DOI](https://zenodo.org/badge/390536799.svg)](https://zenodo.org/badge/latestdoi/390536799)
models.txt ADDED
@@ -0,0 +1,2 @@
1
+ RN101
2
+ RN50
pytest.ini ADDED
@@ -0,0 +1,3 @@
1
+ [pytest]
2
+ markers =
3
+ regression_test
requirements.txt ADDED
@@ -0,0 +1,8 @@
1
+ torch>=1.9.0
2
+ torchvision
3
+ regex
4
+ ftfy
5
+ tqdm
6
+ huggingface_hub
7
+ safetensors
8
+ timm
src/open_clip/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ from .version import __version__
2
+
3
+ from .coca_model import CoCa
4
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
5
+ from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
6
+ from .factory import list_models, add_model_config, get_model_config, load_checkpoint
7
+ from .loss import ClipLoss, DistillClipLoss, CoCaLoss
8
+ from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
9
+ convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \
10
+ get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg
11
+ from .openai import load_openai_model, list_openai_models
12
+ from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
13
+ get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
14
+ from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
15
+ from .tokenizer import SimpleTokenizer, tokenize, decode
16
+ from .transform import image_transform, AugmentationCfg
17
+ from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy
18
+ from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES
src/open_clip/coca_model.py ADDED
@@ -0,0 +1,582 @@
1
+ from typing import Dict, List, Optional, Union
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ import numpy as np
7
+ from dataclasses import dataclass
8
+
9
+ from .transformer import (
10
+ LayerNormFp32,
11
+ LayerNorm,
12
+ QuickGELU,
13
+ MultimodalTransformer,
14
+ )
15
+ from .model import CLIPTextCfg, CLIPVisionCfg, _build_vision_tower, _build_text_tower
16
+
17
+ try:
18
+ from transformers import (
19
+ BeamSearchScorer,
20
+ LogitsProcessorList,
21
+ TopPLogitsWarper,
22
+ TopKLogitsWarper,
23
+ RepetitionPenaltyLogitsProcessor,
24
+ MinLengthLogitsProcessor,
25
+ MaxLengthCriteria,
26
+ StopStringCriteria,
27
+ EosTokenCriteria,
28
+ StoppingCriteriaList
29
+ )
30
+
31
+ GENERATION_TYPES = {
32
+ "top_k": TopKLogitsWarper,
33
+ "top_p": TopPLogitsWarper,
34
+ "beam_search": "beam_search"
35
+ }
36
+ _has_transformers = True
37
+ except ImportError as e:
38
+ GENERATION_TYPES = {
39
+ "top_k": None,
40
+ "top_p": None,
41
+ "beam_search": "beam_search"
42
+ }
43
+ _has_transformers = False
44
+
45
+
46
+ @dataclass
47
+ class MultimodalCfg(CLIPTextCfg):
48
+ mlp_ratio: int = 4
49
+ dim_head: int = 64
50
+ heads: int = 8
51
+ n_queries: int = 256
52
+ attn_pooler_heads: int = 8
53
+
54
+
55
+ def _build_text_decoder_tower(
56
+ embed_dim,
57
+ multimodal_cfg,
58
+ quick_gelu: bool = False,
59
+ cast_dtype: Optional[torch.dtype] = None,
60
+ ):
61
+ multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
62
+ act_layer = QuickGELU if quick_gelu else nn.GELU
63
+ norm_layer = (
64
+ LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
65
+ )
66
+
67
+ decoder = MultimodalTransformer(
68
+ context_length=multimodal_cfg.context_length,
69
+ width=multimodal_cfg.width,
70
+ heads=multimodal_cfg.heads,
71
+ layers=multimodal_cfg.layers,
72
+ ls_init_value=multimodal_cfg.ls_init_value,
73
+ output_dim=embed_dim,
74
+ act_layer=act_layer,
75
+ norm_layer=norm_layer,
76
+ )
77
+
78
+ return decoder
79
+
80
+
81
+ def _token_to_tensor(token_id, device: str = "cpu") -> torch.Tensor:
82
+ if not isinstance(token_id, torch.Tensor):
83
+ if isinstance(token_id, int):
84
+ token_id = [token_id]
85
+ token_id = torch.tensor(token_id, device=device)
86
+ return token_id
87
+
88
+
89
+ class CoCa(nn.Module):
90
+ def __init__(
91
+ self,
92
+ embed_dim,
93
+ multimodal_cfg: MultimodalCfg,
94
+ text_cfg: CLIPTextCfg,
95
+ vision_cfg: CLIPVisionCfg,
96
+ quick_gelu: bool = False,
97
+ init_logit_scale: float = np.log(1 / 0.07),
98
+ init_logit_bias: Optional[float] = None,
99
+ nonscalar_logit_scale: bool = False,
100
+ cast_dtype: Optional[torch.dtype] = None,
101
+ pad_id: int = 0,
102
+ ):
103
+ super().__init__()
104
+ multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
105
+ text_cfg = CLIPTextCfg(**text_cfg) if isinstance(text_cfg, dict) else text_cfg
106
+ vision_cfg = CLIPVisionCfg(**vision_cfg) if isinstance(vision_cfg, dict) else vision_cfg
107
+
108
+ self.text = _build_text_tower(
109
+ embed_dim=embed_dim,
110
+ text_cfg=text_cfg,
111
+ quick_gelu=quick_gelu,
112
+ cast_dtype=cast_dtype,
113
+ )
114
+
115
+ vocab_size = (
116
+ text_cfg.vocab_size # for hf models
117
+ if hasattr(text_cfg, "hf_model_name") and text_cfg.hf_model_name is not None
118
+ else text_cfg.vocab_size
119
+ )
120
+
121
+ self.visual = _build_vision_tower(
122
+ embed_dim=embed_dim,
123
+ vision_cfg=vision_cfg,
124
+ quick_gelu=quick_gelu,
125
+ cast_dtype=cast_dtype,
126
+ )
127
+
128
+ self.text_decoder = _build_text_decoder_tower(
129
+ vocab_size,
130
+ multimodal_cfg=multimodal_cfg,
131
+ quick_gelu=quick_gelu,
132
+ cast_dtype=cast_dtype,
133
+ )
134
+
135
+ lshape = [1] if nonscalar_logit_scale else []
136
+ self.logit_scale = nn.Parameter(torch.ones(lshape) * init_logit_scale)
137
+ if init_logit_bias is not None:
138
+ self.logit_bias = nn.Parameter(torch.ones(lshape) * init_logit_bias)
139
+ else:
140
+ self.logit_bias = None
141
+ self.pad_id = pad_id
142
+
143
+ self.context_length = multimodal_cfg.context_length
144
+
145
+ @torch.jit.ignore
146
+ def set_grad_checkpointing(self, enable: bool = True):
147
+ self.visual.set_grad_checkpointing(enable)
148
+ self.text.set_grad_checkpointing(enable)
149
+ self.text_decoder.set_grad_checkpointing(enable)
150
+
151
+ def _encode_image(self, images, normalize: bool = True):
152
+ image_latent, tokens_embs = self.visual(images)
153
+ image_latent = F.normalize(image_latent, dim=-1) if normalize else image_latent
154
+ return image_latent, tokens_embs
155
+
156
+ def _encode_text(self, text, normalize: bool = True):
157
+ text_latent, token_emb = self.text(text)
158
+ text_latent = F.normalize(text_latent, dim=-1) if normalize else text_latent
159
+ return text_latent, token_emb
160
+
161
+ def encode_image(self, images, normalize: bool = True):
162
+ image_latent, _ = self._encode_image(images, normalize=normalize)
163
+ return image_latent
164
+
165
+ def encode_text(self, text, normalize: bool = True):
166
+ text_latent, _ = self._encode_text(text, normalize=normalize)
167
+ return text_latent
168
+
169
+ def forward_intermediates(
170
+ self,
171
+ image: Optional[torch.Tensor] = None,
172
+ text: Optional[torch.Tensor] = None,
173
+ image_indices: Optional[Union[int, List[int]]] = None,
174
+ text_indices: Optional[Union[int, List[int]]] = None,
175
+ stop_early: bool = False,
176
+ normalize: bool = True,
177
+ normalize_intermediates: bool = False,
178
+ intermediates_only: bool = False,
179
+ image_output_fmt: str = 'NCHW',
180
+ image_output_extra_tokens: bool = False,
181
+ text_output_fmt: str = 'NLC',
182
+ text_output_extra_tokens: bool = False,
183
+ output_logits: bool = False,
184
+ output_logit_scale_bias: bool = False,
185
+ ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
186
+ """ Forward features that returns intermediates.
187
+
188
+ Args:
189
+ image: Input image tensor
190
+ text: Input text tensor
191
+ image_indices: For image tower, take last n blocks if int, all if None, select matching indices if sequence
192
+ text_indices: Take last n blocks if int, all if None, select matching indices if sequence
193
+ stop_early: Stop iterating over blocks when last desired intermediate hit
194
+ normalize: L2 Normalize final image and text features (if present)
195
+ normalize_intermediates: Apply final encoder norm layer to all intermediates (if possible)
196
+ intermediates_only: Only return intermediate features, do not return final features
197
+ image_output_fmt: Shape of intermediate image feature outputs
198
+ image_output_extra_tokens: Return both prefix and spatial intermediate tokens
199
+ text_output_fmt: Shape of intermediate text feature outputs
200
+ text_output_extra_tokens: Return both prefix and spatial intermediate tokens
201
+ output_logits: Include logits in output
202
+ output_logit_scale_bias: Include the logit scale bias in the output
203
+ Returns:
204
+ Dictionary of requested intermediate features, plus final features and logit scale/bias when enabled.
205
+ """
206
+ output = {}
207
+ if intermediates_only:
208
+ # intermediates_only disables final feature normalization and logits output
209
+ normalize = False
210
+ output_logits = False
211
+ if output_logits:
212
+ assert False, 'FIXME, needs implementing'
213
+
214
+ if image is not None:
215
+ image_output = self.visual.forward_intermediates(
216
+ image,
217
+ indices=image_indices,
218
+ stop_early=stop_early,
219
+ normalize_intermediates=normalize_intermediates,
220
+ intermediates_only=intermediates_only,
221
+ output_fmt=image_output_fmt,
222
+ output_extra_tokens=image_output_extra_tokens,
223
+ )
224
+ if normalize and "image_features" in image_output:
225
+ image_output["image_features"] = F.normalize(image_output["image_features"], dim=-1)
226
+ output.update(image_output)
227
+
228
+ if text is not None:
229
+ text_output = self.text.forward_intermediates(
230
+ text,
231
+ indices=text_indices,
232
+ stop_early=stop_early,
233
+ normalize_intermediates=normalize_intermediates,
234
+ intermediates_only=intermediates_only,
235
+ output_fmt=text_output_fmt,
236
+ output_extra_tokens=text_output_extra_tokens,
237
+ )
238
+ if normalize and "text_features" in text_output:
239
+ text_output["text_features"] = F.normalize(text_output["text_features"], dim=-1)
240
+ output.update(text_output)
241
+
242
+ # FIXME text decoder
243
+ logit_scale_exp = self.logit_scale.exp() if output_logits or output_logit_scale_bias else None
244
+ if output_logit_scale_bias:
245
+ output["logit_scale"] = logit_scale_exp
246
+ if self.logit_bias is not None:
247
+ output['logit_bias'] = self.logit_bias
248
+
249
+ return output
250
+
251
+ def forward(
252
+ self,
253
+ image,
254
+ text: Optional[torch.Tensor] = None,
255
+ image_latent: Optional[torch.Tensor] = None,
256
+ image_embs: Optional[torch.Tensor] = None,
257
+ output_labels: bool = True,
258
+ ):
259
+ if image_latent is None or image_embs is None:
260
+ image_latent, image_embs = self._encode_image(image)
261
+
262
+ if text is None:
263
+ return {"image_features": image_latent, "image_embs": image_embs}
264
+
265
+ text_latent, token_embs = self._encode_text(text)
266
+
267
+ # FIXME this isn't an ideal solution, would like to improve -RW
268
+ labels: Optional[torch.Tensor] = text[:, 1:] if output_labels else None
269
+ if output_labels:
270
+ # align text_embs and thus logits with labels for teacher-forcing caption loss
271
+ token_embs = token_embs[:, :-1]
272
+
273
+ logits = self.text_decoder(image_embs, token_embs)
274
+ out_dict = {
275
+ "image_features": image_latent,
276
+ "text_features": text_latent,
277
+ "logits": logits,
278
+ "logit_scale": self.logit_scale.exp()
279
+ }
280
+ if labels is not None:
281
+ out_dict["labels"] = labels
282
+ if self.logit_bias is not None:
283
+ out_dict["logit_bias"] = self.logit_bias
284
+ return out_dict
285
+
286
+ def generate(
287
+ self,
288
+ image,
289
+ text=None,
290
+ seq_len=30,
291
+ max_seq_len=77,
292
+ temperature=1.,
293
+ generation_type="beam_search",
294
+ top_p=0.1, # keep tokens in the 1 - top_p quantile
295
+ top_k=1, # keeps the top_k most probable tokens
296
+ pad_token_id=None,
297
+ eos_token_id=None,
298
+ sot_token_id=None,
299
+ num_beams=6,
300
+ num_beam_groups=3,
301
+ min_seq_len=5,
302
+ stopping_criteria=None,
303
+ repetition_penalty=1.0,
304
+ fixed_output_length=False # if True output.shape == (batch_size, seq_len)
305
+ ):
306
+ # taking many ideas and components from HuggingFace GenerationMixin
307
+ # https://huggingface.co/docs/transformers/main/en/main_classes/text_generation
308
+ assert _has_transformers, "Please install transformers for generate functionality. `pip install transformers`."
309
+ assert seq_len > min_seq_len, "seq_len must be larger than min_seq_len"
310
+ device = image.device
311
+
312
+ with torch.no_grad():
313
+ sot_token_id = _token_to_tensor(49406 if sot_token_id is None else sot_token_id, device=device)
314
+ eos_token_id = _token_to_tensor(49407 if eos_token_id is None else eos_token_id, device=device)
315
+ pad_token_id = self.pad_id if pad_token_id is None else pad_token_id
316
+ logit_processor = LogitsProcessorList(
317
+ [
318
+ MinLengthLogitsProcessor(min_seq_len, eos_token_id),
319
+ RepetitionPenaltyLogitsProcessor(repetition_penalty),
320
+ ]
321
+ )
322
+
323
+ if stopping_criteria is None:
324
+ stopping_criteria = [MaxLengthCriteria(max_length=seq_len)]
325
+ stopping_criteria = StoppingCriteriaList(stopping_criteria)
326
+
327
+ if generation_type == "beam_search":
328
+ output = self._generate_beamsearch(
329
+ image_inputs=image,
330
+ pad_token_id=pad_token_id,
331
+ eos_token_id=eos_token_id,
332
+ sot_token_id=sot_token_id,
333
+ num_beams=num_beams,
334
+ num_beam_groups=num_beam_groups,
335
+ min_seq_len=min_seq_len,
336
+ stopping_criteria=stopping_criteria,
337
+ logit_processor=logit_processor,
338
+ )
339
+ if fixed_output_length and output.shape[1] < seq_len:
340
+ pad_len = seq_len - output.shape[1]
341
+ return torch.cat((
342
+ output,
343
+ torch.ones(output.shape[0], pad_len, device=device, dtype=output.dtype) * pad_token_id
344
+ ),
345
+ dim=1
346
+ )
347
+ return output
348
+
349
+ elif generation_type == "top_p":
350
+ logit_warper = GENERATION_TYPES[generation_type](top_p)
351
+ elif generation_type == "top_k":
352
+ logit_warper = GENERATION_TYPES[generation_type](top_k)
353
+ else:
354
+ raise ValueError(
355
+ f"generation_type has to be one of "
356
+ f"{'| ' + ' | '.join(list(GENERATION_TYPES.keys())) + ' |'}."
357
+ )
358
+
359
+ image_latent, image_embs = self._encode_image(image)
360
+
361
+ if text is None:
362
+ text = torch.ones((image.shape[0], 1), device=device, dtype=torch.long) * sot_token_id
363
+
364
+ was_training = self.training
365
+ num_dims = len(text.shape)
366
+
367
+ if num_dims == 1:
368
+ text = text[None, :]
369
+
370
+ self.eval()
371
+ out = text
372
+
373
+ while True:
374
+ x = out[:, -max_seq_len:]
375
+ cur_len = x.shape[1]
376
+ logits = self(
377
+ image,
378
+ x,
379
+ image_latent=image_latent,
380
+ image_embs=image_embs,
381
+ output_labels=False,
382
+ )["logits"][:, -1]
383
+ mask = (out[:, -1] == eos_token_id) | (out[:, -1] == pad_token_id)
384
+ sample = torch.ones((out.shape[0], 1), device=device, dtype=torch.long) * pad_token_id
385
+
386
+ if mask.all():
387
+ if not fixed_output_length:
388
+ break
389
+ else:
390
+ logits = logits[~mask, :]
391
+ filtered_logits = logit_processor(x[~mask, :], logits)
392
+ filtered_logits = logit_warper(x[~mask, :], filtered_logits)
393
+ probs = F.softmax(filtered_logits / temperature, dim=-1)
394
+
395
+ if (cur_len + 1 == seq_len):
396
+ sample[~mask, :] = torch.ones((sum(~mask), 1), device=device, dtype=torch.long) * eos_token_id
397
+ else:
398
+ sample[~mask, :] = torch.multinomial(probs, 1)
399
+
400
+ out = torch.cat((out, sample), dim=-1)
401
+
402
+ cur_len += 1
403
+
404
+ if all(stopping_criteria(out, None)):
405
+ break
406
+
407
+ if num_dims == 1:
408
+ out = out.squeeze(0)
409
+
410
+ self.train(was_training)
411
+ return out
412
+
413
+ def _generate_beamsearch(
414
+ self,
415
+ image_inputs,
416
+ pad_token_id=None,
417
+ eos_token_id=None,
418
+ sot_token_id=None,
419
+ num_beams=6,
420
+ num_beam_groups=3,
421
+ min_seq_len=5,
422
+ stopping_criteria=None,
423
+ logit_processor=None,
424
+ logit_warper=None,
425
+ ):
426
+ device = image_inputs.device
427
+ batch_size = image_inputs.shape[0]
428
+ image_inputs = torch.repeat_interleave(image_inputs, num_beams, dim=0)
429
+ image_latent, image_embs = self._encode_image(image_inputs)
430
+
431
+ input_ids = torch.ones((batch_size * num_beams, 1), device=device, dtype=torch.long)
432
+ input_ids = input_ids * sot_token_id
433
+ beam_scorer = BeamSearchScorer(
434
+ batch_size=batch_size,
435
+ num_beams=num_beams,
436
+ device=device,
437
+ num_beam_groups=num_beam_groups,
438
+ )
439
+ # instantiate logits processors
440
+ logits_processor = (
441
+ LogitsProcessorList([MinLengthLogitsProcessor(min_seq_len, eos_token_id=eos_token_id)])
442
+ if logit_processor is None
443
+ else logit_processor
444
+ )
445
+
446
+ num_beams = beam_scorer.num_beams
447
+ num_beam_groups = beam_scorer.num_beam_groups
448
+ num_sub_beams = num_beams // num_beam_groups
449
+ batch_size = len(beam_scorer._beam_hyps) // num_beam_groups
450
+ batch_beam_size, cur_len = input_ids.shape
451
+ beam_indices = None
452
+
453
+ if num_beams * batch_size != batch_beam_size:
454
+ raise ValueError(
455
+ f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
456
+ )
457
+
458
+ beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device)
459
+ # initialise score of first beam of each group with 0 and the rest with -1e9. This ensures that the beams in
460
+ # the same group don't produce the same tokens every time.
461
+ beam_scores[:, ::num_sub_beams] = 0
462
+ beam_scores = beam_scores.view((batch_size * num_beams,))
463
+
464
+ while True:
465
+
466
+ # predicted tokens in cur_len step
467
+ current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)
468
+
469
+ # indices which will form the beams in the next time step
470
+ reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device)
471
+
472
+ # do one decoder step on all beams of all sentences in batch
473
+ model_inputs = prepare_inputs_for_generation(input_ids=input_ids, image_inputs=image_inputs)
474
+ outputs = self(
475
+ model_inputs['images'],
476
+ model_inputs['text'],
477
+ image_latent=image_latent,
478
+ image_embs=image_embs,
479
+ output_labels=False,
480
+ )
481
+
482
+ for beam_group_idx in range(num_beam_groups):
483
+ group_start_idx = beam_group_idx * num_sub_beams
484
+ group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
485
+ group_size = group_end_idx - group_start_idx
486
+
487
+ # indices of beams of current group among all sentences in batch
488
+ batch_group_indices = []
489
+
490
+ for batch_idx in range(batch_size):
491
+ batch_group_indices.extend(
492
+ [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)]
493
+ )
494
+ group_input_ids = input_ids[batch_group_indices]
495
+
496
+ # select outputs of beams of current group only
497
+ next_token_logits = outputs['logits'][batch_group_indices, -1, :]
498
+ vocab_size = next_token_logits.shape[-1]
499
+
500
+ next_token_scores_processed = logits_processor(
501
+ group_input_ids, next_token_logits, current_tokens=current_tokens, beam_group_idx=beam_group_idx
502
+ )
503
+ next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1)
504
+ next_token_scores = next_token_scores.expand_as(next_token_scores_processed)
505
+
506
+ # reshape for beam search
507
+ next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size)
508
+
509
+ next_token_scores, next_tokens = torch.topk(
510
+ next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True
511
+ )
512
+
513
+ next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
514
+ next_tokens = next_tokens % vocab_size
515
+
516
+ # stateless
517
+ process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
518
+ beam_outputs = beam_scorer.process(
519
+ group_input_ids,
520
+ next_token_scores,
521
+ next_tokens,
522
+ next_indices,
523
+ pad_token_id=pad_token_id,
524
+ eos_token_id=eos_token_id,
525
+ beam_indices=process_beam_indices,
526
+ group_index=beam_group_idx,
527
+ )
528
+ beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"]
529
+ beam_next_tokens = beam_outputs["next_beam_tokens"]
530
+ beam_idx = beam_outputs["next_beam_indices"]
531
+
532
+ input_ids[batch_group_indices] = group_input_ids[beam_idx]
533
+ group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
534
+ current_tokens[batch_group_indices] = group_input_ids[:, -1]
535
+
536
+ # (beam_idx // group_size) -> batch_idx
537
+ # (beam_idx % group_size) -> offset of idx inside the group
538
+ reordering_indices[batch_group_indices] = (
539
+ num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") + group_start_idx + (beam_idx % group_size)
540
+ )
541
+
542
+ input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)
543
+
544
+ # increase cur_len
545
+ cur_len = cur_len + 1
546
+ if beam_scorer.is_done or all(stopping_criteria(input_ids, None)):
547
+ break
548
+
549
+ final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
550
+ sequence_outputs = beam_scorer.finalize(
551
+ input_ids,
552
+ beam_scores,
553
+ next_tokens,
554
+ next_indices,
555
+ pad_token_id=pad_token_id,
556
+ eos_token_id=eos_token_id,
557
+ max_length=stopping_criteria.max_length,
558
+ beam_indices=final_beam_indices,
559
+ )
560
+ return sequence_outputs['sequences']
561
+
562
+
563
+ def prepare_inputs_for_generation(input_ids, image_inputs, past=None, **kwargs):
564
+ if past:
565
+ input_ids = input_ids[:, -1].unsqueeze(-1)
566
+
567
+ attention_mask = kwargs.get("attention_mask", None)
568
+ position_ids = kwargs.get("position_ids", None)
569
+
570
+ if attention_mask is not None and position_ids is None:
571
+ # create position_ids on the fly for batch generation
572
+ position_ids = attention_mask.long().cumsum(-1) - 1
573
+ position_ids.masked_fill_(attention_mask == 0, 1)
574
+ else:
575
+ position_ids = None
576
+ return {
577
+ "text": input_ids,
578
+ "images": image_inputs,
579
+ "past_key_values": past,
580
+ "position_ids": position_ids,
581
+ "attention_mask": attention_mask,
582
+ }
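A hedged generation sketch against the CoCa class above: the 'coca_ViT-B-32' config name and pretrained tag are assumptions for illustration, and `generate()` needs the optional `transformers` dependency installed.

```python
import torch
from PIL import Image
import open_clip

# Assumed example names; any CoCa config/weights pairing works the same way.
model, _, preprocess = open_clip.create_model_and_transforms(
    'coca_ViT-B-32', pretrained='mscoco_finetuned_laion2b_s13b_b90k')
model.eval()

im = preprocess(Image.open('cat.png')).unsqueeze(0)
with torch.no_grad():
    generated = model.generate(im, generation_type='beam_search', seq_len=20)

# Strip the special tokens that bracket the generated caption.
caption = open_clip.decode(generated[0]).split('<end_of_text>')[0].replace('<start_of_text>', '')
print(caption)
```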
src/open_clip/constants.py ADDED
@@ -0,0 +1,11 @@
1
+ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
2
+ OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
3
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
4
+ IMAGENET_STD = (0.229, 0.224, 0.225)
5
+ INCEPTION_MEAN = (0.5, 0.5, 0.5)
6
+ INCEPTION_STD = (0.5, 0.5, 0.5)
7
+
8
+ # Default name for a weights file hosted on the Huggingface Hub.
9
+ HF_WEIGHTS_NAME = "open_clip_pytorch_model.bin" # default pytorch pkl
10
+ HF_SAFE_WEIGHTS_NAME = "open_clip_model.safetensors" # safetensors version
11
+ HF_CONFIG_NAME = 'open_clip_config.json'
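These constants are the normalization statistics consumed by the bundled transforms; below is a short sketch of how they would plug into a hand-rolled torchvision pipeline (the 224px size is an assumption for ViT-B style models, not dictated by this file).

```python
from torchvision import transforms
from open_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD

# Roughly the default inference preprocessing for an OpenAI-normalized 224px model.
preprocess = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=OPENAI_DATASET_MEAN, std=OPENAI_DATASET_STD),
])
```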
src/open_clip/convert.py ADDED
@@ -0,0 +1,206 @@
1
+ """ Conversion functions for 3rd part state-dicts and non-torch native checkpoint formats.
2
+ """
3
+ from typing import Union
4
+
5
+ import torch
6
+ import numpy as np
7
+
8
+ from .model import CLIP, CustomTextCLIP
9
+ from .transformer import TextTransformer, Transformer
10
+
11
+
12
+ @torch.no_grad()
13
+ def load_big_vision_weights(model: CustomTextCLIP, checkpoint_path: str):
14
+ """ Load weights from .npz checkpoints for official Google big_vision image-text models
15
+
16
+ Currently, the SigLIP source models are supported and a CustomTextCLIP destination model
17
+ w/ timm image encoder.
18
+ """
19
+ from timm.layers import resample_patch_embed, resample_abs_pos_embed
20
+
21
+ def _n2p(w, t=True, idx=None):
22
+ if idx is not None:
23
+ w = w[idx]
24
+ if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
25
+ w = w.flatten()
26
+ if t:
27
+ if w.ndim == 4:
28
+ w = w.transpose([3, 2, 0, 1])
29
+ elif w.ndim == 3:
30
+ w = w.transpose([2, 0, 1])
31
+ elif w.ndim == 2:
32
+ w = w.transpose([1, 0])
33
+ return torch.from_numpy(w)
34
+
35
+ w = np.load(checkpoint_path)
36
+ interpolation = 'bilinear'
37
+ antialias = False
38
+
39
+ def _convert_timm_img(module, prefix):
40
+ embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
41
+ if embed_conv_w.shape[-2:] != module.patch_embed.proj.weight.shape[-2:]:
42
+ embed_conv_w = resample_patch_embed(
43
+ embed_conv_w,
44
+ module.patch_embed.proj.weight.shape[-2:],
45
+ interpolation=interpolation,
46
+ antialias=antialias,
47
+ verbose=True,
48
+ )
49
+ module.patch_embed.proj.weight.copy_(embed_conv_w)
50
+ module.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
51
+
52
+ if module.cls_token is not None:
53
+ module.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
54
+
55
+ pos_embed_w = _n2p(w[f'{prefix}pos_embedding'], t=False)
56
+ if pos_embed_w.shape != module.pos_embed.shape:
57
+ assert False, f'{pos_embed_w.shape}, {module.pos_embed.shape}'
58
+ num_prefix_tokens = 0 if getattr(module, 'no_embed_class', False) else getattr(module, 'num_prefix_tokens', 1)
59
+ pos_embed_w = resample_abs_pos_embed( # resize pos embedding when different size from pretrained weights
60
+ pos_embed_w,
61
+ new_size=module.patch_embed.grid_size,
62
+ num_prefix_tokens=num_prefix_tokens,
63
+ interpolation=interpolation,
64
+ antialias=antialias,
65
+ verbose=True,
66
+ )
67
+ module.pos_embed.copy_(pos_embed_w)
68
+
69
+ mha_sub, b_sub, ln1_sub = (0, 0, 1)
70
+ for i, block in enumerate(module.blocks.children()):
71
+ if f'{prefix}Transformer/encoderblock/LayerNorm_0/scale' in w:
72
+ block_prefix = f'{prefix}Transformer/encoderblock/'
73
+ idx = i
74
+ else:
75
+ block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
76
+ idx = None
77
+ mha_prefix = block_prefix + f'MultiHeadDotProductAttention_{mha_sub}/'
78
+ block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'], idx=idx))
79
+ block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'], idx=idx))
80
+ block.attn.qkv.weight.copy_(torch.cat([
81
+ _n2p(w[f'{mha_prefix}{n}/kernel'], t=False, idx=idx).flatten(1).T for n in ('query', 'key', 'value')]))
82
+ block.attn.qkv.bias.copy_(torch.cat([
83
+ _n2p(w[f'{mha_prefix}{n}/bias'], t=False, idx=idx).reshape(-1) for n in ('query', 'key', 'value')]))
84
+ block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel'], idx=idx).flatten(1))
85
+ block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'], idx=idx))
86
+ block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/scale'], idx=idx))
87
+ block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/bias'], idx=idx))
88
+ for r in range(2):
89
+ getattr(block.mlp, f'fc{r + 1}').weight.copy_(
90
+ _n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/kernel'], idx=idx))
91
+ getattr(block.mlp, f'fc{r + 1}').bias.copy_(
92
+ _n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/bias'], idx=idx))
93
+
94
+ module.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
95
+ module.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
96
+
97
+ if module.attn_pool is not None:
98
+ block_prefix = f'{prefix}MAPHead_0/'
99
+ mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/'
100
+ module.attn_pool.latent.copy_(_n2p(w[f'{block_prefix}probe'], t=False))
101
+ module.attn_pool.q.weight.copy_(_n2p(w[f'{mha_prefix}query/kernel'], t=False).flatten(1).T)
102
+ module.attn_pool.q.bias.copy_(_n2p(w[f'{mha_prefix}query/bias'], t=False).reshape(-1))
103
+ module.attn_pool.kv.weight.copy_(torch.cat([
104
+ _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('key', 'value')]))
105
+ module.attn_pool.kv.bias.copy_(torch.cat([
106
+ _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('key', 'value')]))
107
+ module.attn_pool.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
108
+ module.attn_pool.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
109
+ module.attn_pool.norm.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
110
+ module.attn_pool.norm.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
111
+ for r in range(2):
112
+ getattr(module.attn_pool.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_{r}/kernel']))
113
+ getattr(module.attn_pool.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_{r}/bias']))
114
+
115
+ def _convert_openclip_transformer(module: Transformer, prefix):
116
+ for i, block in enumerate(module.resblocks.children()):
117
+ if f'{prefix}encoderblock/LayerNorm_0/scale' in w:
118
+ block_prefix = f'{prefix}encoderblock/'
119
+ idx = i
120
+ else:
121
+ block_prefix = f'{prefix}encoderblock_{i}/'
122
+ idx = None
123
+ mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/'
124
+ block.ln_1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'], idx=idx))
125
+ block.ln_1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'], idx=idx))
126
+ block.attn.in_proj_weight.copy_(torch.cat([
127
+ _n2p(w[f'{mha_prefix}{n}/kernel'], t=False, idx=idx).flatten(1).T for n in ('query', 'key', 'value')]))
128
+ block.attn.in_proj_bias.copy_(torch.cat([
129
+ _n2p(w[f'{mha_prefix}{n}/bias'], t=False, idx=idx).reshape(-1) for n in ('query', 'key', 'value')]))
130
+ block.attn.out_proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel'], idx=idx).flatten(1))
131
+ block.attn.out_proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'], idx=idx))
132
+ block.ln_2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/scale'], idx=idx))
133
+ block.ln_2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/bias'], idx=idx))
134
+ block.mlp.c_fc.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/kernel'], idx=idx))
135
+ block.mlp.c_fc.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/bias'], idx=idx))
136
+ block.mlp.c_proj.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/kernel'], idx=idx))
137
+ block.mlp.c_proj.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/bias'], idx=idx))
138
+
139
+ def _convert_openclip_txt(module: TextTransformer, prefix):
140
+ module.token_embedding.weight.copy_(_n2p(w[f'{prefix}Embed_0/embedding'], t=False))
141
+ pos_embed_w = _n2p(w[f'{prefix}pos_embedding'], t=False).squeeze(0)
142
+ module.positional_embedding.copy_(pos_embed_w)
143
+ _convert_openclip_transformer(module.transformer, prefix=prefix + 'Encoder_0/')
144
+ module.ln_final.weight.copy_(_n2p(w[f'{prefix}Encoder_0/encoder_norm/scale']))
145
+ module.ln_final.bias.copy_(_n2p(w[f'{prefix}Encoder_0/encoder_norm/bias']))
146
+ if module.text_projection is not None:
147
+ module.text_projection.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
148
+ module.text_projection.bias.copy_(_n2p(w[f'{prefix}head/bias']))
149
+
150
+ root_prefix = 'params/' if 'params/b' in w else ''
151
+ _convert_timm_img(model.visual.trunk, f'{root_prefix}img/')
152
+ _convert_openclip_txt(model.text, f'{root_prefix}txt/')
153
+ model.logit_bias.copy_(_n2p(w[f'{root_prefix}b'])[0])
154
+ model.logit_scale.copy_(_n2p(w[f'{root_prefix}t'])[0])
155
+
156
+
157
+ @torch.no_grad()
158
+ def convert_mobile_clip_state_dict(model: CustomTextCLIP, state_dict, fastvit = True):
159
+
160
+ def _convert_timm_img(state_dict):
161
+ if fastvit:
162
+ from timm.models.fastvit import checkpoint_filter_fn
163
+ else:
164
+ from timm.models.vision_transformer_hybrid import checkpoint_filter_fn
165
+ timm_state_dict = checkpoint_filter_fn(state_dict, model.visual.trunk)
166
+ timm_state_dict = {'visual.trunk.' + k: v for k, v in timm_state_dict.items()}
167
+ return timm_state_dict
168
+
169
+ def _convert_openclip_txt(state_dict, prefix='text_encoder.'):
170
+ text_dict = {}
171
+ for k, v in state_dict.items():
172
+ if not k.startswith(prefix):
173
+ continue
174
+ k = k.replace(prefix, '')
175
+ k = k.replace('projection_layer', 'text_projection')
176
+ k = k.replace('embedding_layer', 'token_embedding')
177
+ if k.startswith('positional_embedding.pos_embed.pos_embed'):
178
+ k = k.replace('positional_embedding.pos_embed.pos_embed', 'positional_embedding')
179
+ v = v.squeeze()
180
+ k = k.replace('final_layer_norm', 'ln_final')
181
+ k = k.replace('pre_norm_mha.0', 'ln_1')
182
+ k = k.replace('pre_norm_mha.1', 'attn')
183
+ k = k.replace('pre_norm_ffn.0', 'ln_2')
184
+ k = k.replace('pre_norm_ffn.1', 'mlp.c_fc')
185
+ k = k.replace('pre_norm_ffn.4', 'mlp.c_proj')
186
+ k = k.replace('qkv_proj.weight', 'in_proj_weight')
187
+ k = k.replace('qkv_proj.bias', 'in_proj_bias')
188
+ k = k.replace('transformer.', 'transformer.resblocks.')
189
+ text_dict['text.' + k] = v
190
+ return text_dict
191
+
192
+ image_dict = _convert_timm_img(state_dict)
193
+ text_dict = _convert_openclip_txt(state_dict)
194
+ out_dict = {**image_dict, **text_dict}
195
+ out_dict['logit_scale'] = state_dict['logit_scale']
196
+ return out_dict
197
+
198
+
199
+ def convert_state_dict(model: Union[CustomTextCLIP, CLIP], state_dict):
200
+ if 'image_encoder.model.patch_embed.0.rbr_conv.0.conv.weight' in state_dict:
201
+ # Apple MobileCLIP s1 & s2 state_dicts (s0 and b not currently supported)
202
+ state_dict = convert_mobile_clip_state_dict(model, state_dict)
203
+ if 'image_encoder.model.patch_emb.0.block.conv.weight' in state_dict:
204
+ # convert b model
205
+ state_dict = convert_mobile_clip_state_dict(model, state_dict, fastvit=False)
206
+ return state_dict
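`convert_state_dict()` is normally invoked for you inside `load_checkpoint()`; a hedged sketch of that end-to-end path ('mobileclip_s1.pt' is an assumed local Apple checkpoint, not shipped with this commit).

```python
import open_clip

# 'MobileCLIP-S1' is a bundled model config; the checkpoint filename is an assumed example.
model = open_clip.create_model('MobileCLIP-S1')
open_clip.load_checkpoint(model, 'mobileclip_s1.pt')  # Apple keys detected and remapped via convert_state_dict()
```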
src/open_clip/factory.py ADDED
@@ -0,0 +1,586 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import re
5
+ import warnings
6
+ from copy import deepcopy
7
+ from dataclasses import asdict
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional, Tuple, Union
10
+
11
+ import torch
12
+
13
+ from .convert import convert_state_dict
14
+ from .model import CLIP, CustomTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
15
+ resize_pos_embed, get_cast_dtype, resize_text_pos_embed, set_model_preprocess_cfg
16
+ from .coca_model import CoCa
17
+ from .loss import ClipLoss, DistillClipLoss, CoCaLoss, SigLipLoss
18
+ from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained,\
19
+ list_pretrained_tags_by_model, download_pretrained_from_hf
20
+ from .transform import image_transform_v2, AugmentationCfg, PreprocessCfg, merge_preprocess_dict, merge_preprocess_kwargs
21
+ from .tokenizer import HFTokenizer, SimpleTokenizer, SigLipTokenizer, DEFAULT_CONTEXT_LENGTH
22
+
23
+ HF_HUB_PREFIX = 'hf-hub:'
24
+ _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
25
+ _MODEL_CONFIGS = {} # dictionary (model_name: config) of model architecture configs
26
+
27
+
28
+ def _natural_key(string_):
29
+ return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())]
30
+
31
+
32
+ def _rescan_model_configs():
33
+ global _MODEL_CONFIGS
34
+
35
+ config_ext = ('.json',)
36
+ config_files = []
37
+ for config_path in _MODEL_CONFIG_PATHS:
38
+ if config_path.is_file() and config_path.suffix in config_ext:
39
+ config_files.append(config_path)
40
+ elif config_path.is_dir():
41
+ for ext in config_ext:
42
+ config_files.extend(config_path.glob(f'*{ext}'))
43
+
44
+ for cf in config_files:
45
+ with open(cf, 'r') as f:
46
+ model_cfg = json.load(f)
47
+ if all(a in model_cfg for a in ('embed_dim', 'vision_cfg', 'text_cfg')):
48
+ _MODEL_CONFIGS[cf.stem] = model_cfg
49
+
50
+ _MODEL_CONFIGS = {k: v for k, v in sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))}
51
+
52
+
53
+ _rescan_model_configs() # initial populate of model config registry
54
+
55
+
56
+ def list_models():
57
+ """ enumerate available model architectures based on config files """
58
+ return list(_MODEL_CONFIGS.keys())
59
+
60
+
61
+ def add_model_config(path):
62
+ """ add model config path or file and update registry """
63
+ if not isinstance(path, Path):
64
+ path = Path(path)
65
+ _MODEL_CONFIG_PATHS.append(path)
66
+ _rescan_model_configs()
67
+
68
+
69
+ def get_model_config(model_name):
70
+ """ Fetch model config from builtin (local library) configs.
71
+ """
72
+ if model_name in _MODEL_CONFIGS:
73
+ return deepcopy(_MODEL_CONFIGS[model_name])
74
+ else:
75
+ return None
76
+
77
+
78
+ def _get_hf_config(
79
+ model_id: str,
80
+ cache_dir: Optional[str] = None,
81
+ ):
82
+ """ Fetch model config from HuggingFace Hub.
83
+ """
84
+ config_path = download_pretrained_from_hf(
85
+ model_id,
86
+ filename='open_clip_config.json',
87
+ cache_dir=cache_dir,
88
+ )
89
+ with open(config_path, 'r', encoding='utf-8') as f:
90
+ config = json.load(f)
91
+ return config
92
+
93
+
94
+ def get_tokenizer(
95
+ model_name: str = '',
96
+ context_length: Optional[int] = None,
97
+ cache_dir: Optional[str] = None,
98
+ **kwargs,
99
+ ):
100
+ if model_name.startswith(HF_HUB_PREFIX):
101
+ model_name = model_name[len(HF_HUB_PREFIX):]
102
+ try:
103
+ config = _get_hf_config(model_name, cache_dir=cache_dir)['model_cfg']
104
+ except Exception:
105
+ tokenizer = HFTokenizer(
106
+ model_name,
107
+ context_length=context_length or DEFAULT_CONTEXT_LENGTH,
108
+ cache_dir=cache_dir,
109
+ **kwargs,
110
+ )
111
+ return tokenizer
112
+ else:
113
+ config = get_model_config(model_name)
114
+ assert config is not None, f"No valid model config found for {model_name}."
115
+
116
+ text_config = config.get('text_cfg', {})
117
+ if 'tokenizer_kwargs' in text_config:
118
+ tokenizer_kwargs = dict(text_config['tokenizer_kwargs'], **kwargs)
119
+ else:
120
+ tokenizer_kwargs = kwargs
121
+
122
+ if context_length is None:
123
+ context_length = text_config.get('context_length', DEFAULT_CONTEXT_LENGTH)
124
+
125
+ model_name = model_name.lower()
126
+ if text_config.get('hf_tokenizer_name', ''):
127
+ tokenizer = HFTokenizer(
128
+ text_config['hf_tokenizer_name'],
129
+ context_length=context_length,
130
+ cache_dir=cache_dir,
131
+ **tokenizer_kwargs,
132
+ )
133
+ elif 'siglip' in model_name:
134
+ tn = 'gemma' if 'siglip2' in model_name else 'mc4' if 'i18n' in model_name else 'c4-en'
135
+ tokenizer = SigLipTokenizer(
136
+ tn,
137
+ context_length=context_length,
138
+ # **tokenizer_kwargs,
139
+ )
140
+ else:
141
+ tokenizer = SimpleTokenizer(
142
+ context_length=context_length,
143
+ **tokenizer_kwargs,
144
+ )
145
+
146
+ return tokenizer
147
+
148
+
149
+ def load_state_dict(
150
+ checkpoint_path: str,
151
+ device='cpu',
152
+ weights_only=True,
153
+ ):
154
+ # Check if safetensors or not and load weights accordingly
155
+ if str(checkpoint_path).endswith(".safetensors"):
156
+ from safetensors.torch import load_file
157
+ checkpoint = load_file(checkpoint_path, device=device)
158
+ else:
159
+ try:
160
+ checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=weights_only)
161
+ except TypeError:
162
+ checkpoint = torch.load(checkpoint_path, map_location=device)
163
+
164
+ if isinstance(checkpoint, dict) and 'state_dict' in checkpoint:
165
+ state_dict = checkpoint['state_dict']
166
+ elif isinstance(checkpoint, torch.jit.ScriptModule):
167
+ state_dict = checkpoint.state_dict()
168
+ for key in ["input_resolution", "context_length", "vocab_size"]:
169
+ state_dict.pop(key, None)
170
+ else:
171
+ state_dict = checkpoint
172
+ if next(iter(state_dict.items()))[0].startswith('module'):
173
+ state_dict = {k[7:]: v for k, v in state_dict.items()}
174
+ return state_dict
175
+
176
+
177
+ def load_checkpoint(
178
+ model: Union[CLIP, CustomTextCLIP],
179
+ checkpoint_path: str,
180
+ strict: bool = True,
181
+ weights_only: bool = True,
182
+ device='cpu',
183
+ ):
184
+ if Path(checkpoint_path).suffix in ('.npz', '.npy'):
185
+ # Separate path loading numpy big_vision (SigLIP) weights
186
+ from open_clip.convert import load_big_vision_weights
187
+ load_big_vision_weights(model, checkpoint_path)
188
+ return {}
189
+
190
+ state_dict = load_state_dict(checkpoint_path, device=device, weights_only=weights_only)
191
+
192
+ # Detect & convert 3rd party state_dicts -> open_clip
193
+ state_dict = convert_state_dict(model, state_dict)
194
+
195
+ # Detect old format and make compatible with new format
196
+ if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'):
197
+ state_dict = convert_to_custom_text_state_dict(state_dict)
198
+
199
+ # correct if logit_scale differs in being scalar vs 1d param
200
+ if 'logit_scale' in state_dict and model.logit_scale.ndim != state_dict['logit_scale'].ndim:
201
+ state_dict['logit_scale'] = state_dict['logit_scale'].reshape(model.logit_scale.shape)
202
+
203
+ # correct if logit_bias differs in being scalar vs 1d param
204
+ if 'logit_bias' in state_dict and model.logit_bias.ndim != state_dict['logit_bias'].ndim:
205
+ state_dict['logit_bias'] = state_dict['logit_bias'].reshape(model.logit_bias.shape)
206
+
207
+ # If loading a non-SigLIP model for SigLIP training. See https://github.com/mlfoundations/open_clip/issues/712
208
+ if 'logit_bias' not in state_dict and model.logit_bias is not None:
209
+ state_dict["logit_bias"] = torch.zeros_like(state_dict["logit_scale"])
210
+
211
+ # Certain text transformers no longer expect position_ids after transformers==4.31
212
+ position_id_key = 'text.transformer.embeddings.position_ids'
213
+ if position_id_key in state_dict and not hasattr(model, position_id_key):
214
+ del state_dict[position_id_key]
215
+
216
+ resize_pos_embed(state_dict, model)
217
+ resize_text_pos_embed(state_dict, model)
218
+
219
+ # Finally, load the massaged state_dict into model
220
+ incompatible_keys = model.load_state_dict(state_dict, strict=strict)
221
+ return incompatible_keys
222
+
223
+
224
+ def create_model(
225
+ model_name: str,
226
+ pretrained: Optional[str] = None,
227
+ precision: str = 'fp32',
228
+ device: Union[str, torch.device] = 'cpu',
229
+ jit: bool = False,
230
+ force_quick_gelu: bool = False,
231
+ force_custom_text: bool = False,
232
+ force_patch_dropout: Optional[float] = None,
233
+ force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
234
+ force_preprocess_cfg: Optional[Dict[str, Any]] = None,
235
+ pretrained_image: bool = False,
236
+ pretrained_hf: bool = True,
237
+ cache_dir: Optional[str] = None,
238
+ output_dict: Optional[bool] = None,
239
+ require_pretrained: bool = False,
240
+ load_weights_only: bool = True,
241
+ **model_kwargs,
242
+ ):
243
+ """Creates and configures a contrastive vision-language model.
244
+
245
+ Args:
246
+ model_name: Name of the model architecture to create. Can be a local model name
247
+ or a Hugging Face model ID prefixed with 'hf-hub:'.
248
+ pretrained: Tag/path for pretrained model weights. Can be:
249
+ - A pretrained tag name (e.g., 'openai')
250
+ - A path to local weights
251
+ - None to initialize with random weights
252
+ precision: Model precision/AMP configuration. Options:
253
+ - 'fp32': 32-bit floating point
254
+ - 'fp16'/'bf16': Mixed precision with FP32 for certain layers
255
+ - 'pure_fp16'/'pure_bf16': Pure 16-bit precision
256
+ device: Device to load the model on ('cpu', 'cuda', or torch.device object)
257
+ jit: If True, JIT compile the model
258
+ force_quick_gelu: Force use of QuickGELU activation
259
+ force_custom_text: Force use of custom text encoder
260
+ force_patch_dropout: Override default patch dropout value
261
+ force_image_size: Override default image size for vision encoder
262
+ force_preprocess_cfg: Override default preprocessing configuration
263
+ pretrained_image: Load pretrained weights for timm vision models
264
+ pretrained_hf: Load pretrained weights for HF text models when not loading CLIP weights
265
+ cache_dir: Override default cache directory for downloaded model files
266
+ output_dict: If True and model supports it, return dictionary of features
267
+ require_pretrained: Raise error if pretrained weights cannot be loaded
268
+ load_weights_only: Only deserialize model weights when unpickling torch checkpoints (for safety)
269
+ **model_kwargs: Additional keyword arguments passed to model constructor
270
+
271
+ Returns:
272
+ Created and configured model instance
273
+
274
+ Raises:
275
+ RuntimeError: If model config is not found or required pretrained weights
276
+ cannot be loaded
277
+
278
+ Examples:
279
+ # Create basic CLIP model
280
+ model = create_model('ViT-B/32')
281
+
282
+ # Create CLIP model with mixed precision on GPU
283
+ model = create_model('ViT-B/32', precision='fp16', device='cuda')
284
+
285
+ # Load pretrained OpenAI weights
286
+ model = create_model('ViT-B/32', pretrained='openai')
287
+
288
+ # Load Hugging Face model
289
+ model = create_model('hf-hub:organization/model-name')
290
+ """
291
+
292
+ force_preprocess_cfg = force_preprocess_cfg or {}
293
+ preprocess_cfg = asdict(PreprocessCfg())
294
+ has_hf_hub_prefix = model_name.startswith(HF_HUB_PREFIX)
295
+ if has_hf_hub_prefix:
296
+ model_id = model_name[len(HF_HUB_PREFIX):]
297
+ checkpoint_path = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
298
+ config = _get_hf_config(model_id, cache_dir=cache_dir)
299
+ preprocess_cfg = merge_preprocess_dict(preprocess_cfg, config['preprocess_cfg'])
300
+ model_cfg = config['model_cfg']
301
+ pretrained_hf = False # override, no need to load original HF text weights
302
+ else:
303
+ model_name = model_name.replace('/', '-') # for callers using old naming with / in ViT names
304
+ checkpoint_path = None
305
+ model_cfg = None
306
+
307
+ if isinstance(device, str):
308
+ device = torch.device(device)
309
+
310
+ model_cfg = model_cfg or get_model_config(model_name)
311
+ if model_cfg is not None:
312
+ logging.info(f'Loaded {model_name} model config.')
313
+ else:
314
+ logging.error(f'Model config for {model_name} not found; available models {list_models()}.')
315
+ raise RuntimeError(f'Model config for {model_name} not found.')
316
+
317
+ if force_quick_gelu:
318
+ # override for use of QuickGELU on non-OpenAI transformer models
319
+ model_cfg["quick_gelu"] = True
320
+
321
+ if force_patch_dropout is not None:
322
+ # override the default patch dropout value
323
+ model_cfg["vision_cfg"]["patch_dropout"] = force_patch_dropout
324
+
325
+ if force_image_size is not None:
326
+ # override model config's image size
327
+ model_cfg["vision_cfg"]["image_size"] = force_image_size
328
+
329
+ is_timm_model = 'timm_model_name' in model_cfg.get('vision_cfg', {})
330
+ if pretrained_image:
331
+ if is_timm_model:
332
+ # pretrained weight loading for timm models set via vision_cfg
333
+ model_cfg['vision_cfg']['timm_model_pretrained'] = True
334
+ else:
335
+ assert False, 'pretrained image towers currently only supported for timm models'
336
+
337
+ # cast_dtype set for fp16 and bf16 (manual mixed-precision), not set for 'amp' or 'pure' modes
338
+ cast_dtype = get_cast_dtype(precision)
339
+ is_hf_model = 'hf_model_name' in model_cfg.get('text_cfg', {})
340
+ if is_hf_model:
341
+ # load pretrained weights for HF text model IFF no CLIP weights being loaded
342
+ model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf and not pretrained
343
+ custom_text = model_cfg.pop('custom_text', False) or force_custom_text or is_hf_model
344
+
345
+ model_cfg = dict(model_cfg, **model_kwargs) # merge cfg dict w/ kwargs (kwargs overrides cfg)
346
+ if custom_text:
347
+ if "multimodal_cfg" in model_cfg:
348
+ model = CoCa(**model_cfg, cast_dtype=cast_dtype)
349
+ else:
350
+ model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
351
+ else:
352
+ model = CLIP(**model_cfg, cast_dtype=cast_dtype)
353
+
354
+ if precision in ("fp16", "bf16"):
355
+ dtype = torch.float16 if 'fp16' in precision else torch.bfloat16
356
+ # manual mixed precision that matches original OpenAI behaviour
357
+ if is_timm_model:
358
+ # FIXME this is a bit janky, create timm based model in low-precision and
359
+ # then cast only LayerNormFp32 instances back to float32 so they don't break.
360
+ # Why? The convert_weights_to_lp fn only works with native models.
361
+ model.to(device=device, dtype=dtype)
362
+ from .transformer import LayerNormFp32
363
+
364
+ def _convert_ln(m):
365
+ if isinstance(m, LayerNormFp32):
366
+ m.weight.data = m.weight.data.to(torch.float32)
367
+ m.bias.data = m.bias.data.to(torch.float32)
368
+ model.apply(_convert_ln)
369
+ else:
370
+ model.to(device=device)
371
+ convert_weights_to_lp(model, dtype=dtype)
372
+ elif precision in ("pure_fp16", "pure_bf16"):
373
+ dtype = torch.float16 if 'fp16' in precision else torch.bfloat16
374
+ model.to(device=device, dtype=dtype)
375
+ else:
376
+ model.to(device=device)
377
+
378
+ pretrained_loaded = False
379
+ if pretrained:
380
+ checkpoint_path = ''
381
+ pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
382
+ if pretrained_cfg:
383
+ checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
384
+ preprocess_cfg = merge_preprocess_dict(preprocess_cfg, pretrained_cfg)
385
+ pretrained_quick_gelu = pretrained_cfg.get('quick_gelu', False)
386
+ model_quick_gelu = model_cfg.get('quick_gelu', False)
387
+ if pretrained_quick_gelu and not model_quick_gelu:
388
+ warnings.warn(
389
+ f'These pretrained weights were trained with QuickGELU activation but the model config does '
390
+ f'not have that enabled. Consider using a model config with a "-quickgelu" suffix or enable with a flag.')
391
+ elif not pretrained_quick_gelu and model_quick_gelu:
392
+ warnings.warn(
393
+ f'The pretrained weights were not trained with QuickGELU but this activation is enabled in the '
394
+ f'model config, consider using a model config without QuickGELU or disable override flags.')
395
+ elif os.path.exists(pretrained):
396
+ checkpoint_path = pretrained
397
+
398
+ if checkpoint_path:
399
+ logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
400
+ load_checkpoint(model, checkpoint_path, weights_only=load_weights_only)
401
+ else:
402
+ error_str = (
403
+ f'Pretrained weights ({pretrained}) not found for model {model_name}.'
404
+ f' Available pretrained tags: {list_pretrained_tags_by_model(model_name)}.')
405
+ logging.warning(error_str)
406
+ raise RuntimeError(error_str)
407
+ pretrained_loaded = True
408
+ elif has_hf_hub_prefix:
409
+ logging.info(f'Loading pretrained {model_name} weights ({checkpoint_path}).')
410
+ load_checkpoint(model, checkpoint_path, weights_only=load_weights_only)
411
+ pretrained_loaded = True
412
+
413
+ if require_pretrained and not pretrained_loaded:
414
+ # callers of create_model_from_pretrained always expect pretrained weights
415
+ raise RuntimeError(
416
+ f'Pretrained weights were required for (model: {model_name}, pretrained: {pretrained}) but not loaded.')
417
+
418
+ if output_dict and hasattr(model, "output_dict"):
419
+ model.output_dict = True
420
+
421
+ if jit:
422
+ model = torch.jit.script(model)
423
+
424
+ # set image preprocessing configuration in model attributes for convenience
425
+ if getattr(model.visual, 'image_size', None) is not None:
426
+ # use image_size set on model creation (via config or force_image_size arg)
427
+ force_preprocess_cfg['size'] = model.visual.image_size
428
+ set_model_preprocess_cfg(model, merge_preprocess_dict(preprocess_cfg, force_preprocess_cfg))
429
+
430
+ return model
431
+
432
+
433
+ def create_loss(args):
434
+ if args.distill:
435
+ return DistillClipLoss(
436
+ local_loss=args.local_loss,
437
+ gather_with_grad=args.gather_with_grad,
438
+ cache_labels=True,
439
+ rank=args.rank,
440
+ world_size=args.world_size,
441
+ use_horovod=args.horovod,
442
+ )
443
+ elif "coca" in args.model.lower():
444
+ return CoCaLoss(
445
+ caption_loss_weight=args.coca_caption_loss_weight,
446
+ clip_loss_weight=args.coca_contrastive_loss_weight,
447
+ local_loss=args.local_loss,
448
+ gather_with_grad=args.gather_with_grad,
449
+ cache_labels=True,
450
+ rank=args.rank,
451
+ world_size=args.world_size,
452
+ use_horovod=args.horovod,
453
+ )
454
+ elif args.siglip:
455
+ assert not args.horovod, "Horovod not currently supported for SigLip"
456
+ return SigLipLoss(
457
+ rank=args.rank,
458
+ world_size=args.world_size,
459
+ dist_impl=args.loss_dist_impl, # siglip has multiple distributed implementations to choose from
460
+ )
461
+
462
+ return ClipLoss(
463
+ local_loss=args.local_loss,
464
+ gather_with_grad=args.gather_with_grad,
465
+ cache_labels=True,
466
+ rank=args.rank,
467
+ world_size=args.world_size,
468
+ use_horovod=args.horovod,
469
+ )
470
+
471
+
472
+ def create_model_and_transforms(
473
+ model_name: str,
474
+ pretrained: Optional[str] = None,
475
+ precision: str = 'fp32',
476
+ device: Union[str, torch.device] = 'cpu',
477
+ jit: bool = False,
478
+ force_quick_gelu: bool = False,
479
+ force_custom_text: bool = False,
480
+ force_patch_dropout: Optional[float] = None,
481
+ force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
482
+ image_mean: Optional[Tuple[float, ...]] = None,
483
+ image_std: Optional[Tuple[float, ...]] = None,
484
+ image_interpolation: Optional[str] = None,
485
+ image_resize_mode: Optional[str] = None, # only effective for inference
486
+ aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
487
+ pretrained_image: bool = False,
488
+ pretrained_hf: bool = True,
489
+ cache_dir: Optional[str] = None,
490
+ output_dict: Optional[bool] = None,
491
+ load_weights_only: bool = True,
492
+ **model_kwargs,
493
+ ):
494
+ force_preprocess_cfg = merge_preprocess_kwargs(
495
+ {},
496
+ mean=image_mean,
497
+ std=image_std,
498
+ interpolation=image_interpolation,
499
+ resize_mode=image_resize_mode,
500
+ )
501
+
502
+ model = create_model(
503
+ model_name,
504
+ pretrained,
505
+ precision=precision,
506
+ device=device,
507
+ jit=jit,
508
+ force_quick_gelu=force_quick_gelu,
509
+ force_custom_text=force_custom_text,
510
+ force_patch_dropout=force_patch_dropout,
511
+ force_image_size=force_image_size,
512
+ force_preprocess_cfg=force_preprocess_cfg,
513
+ pretrained_image=pretrained_image,
514
+ pretrained_hf=pretrained_hf,
515
+ cache_dir=cache_dir,
516
+ output_dict=output_dict,
517
+ load_weights_only=load_weights_only,
518
+ **model_kwargs,
519
+ )
520
+
521
+ pp_cfg = PreprocessCfg(**model.visual.preprocess_cfg)
522
+
523
+ preprocess_train = image_transform_v2(
524
+ pp_cfg,
525
+ is_train=True,
526
+ aug_cfg=aug_cfg,
527
+ )
528
+ preprocess_val = image_transform_v2(
529
+ pp_cfg,
530
+ is_train=False,
531
+ )
532
+
533
+ return model, preprocess_train, preprocess_val
534
+
535
+
536
+ def create_model_from_pretrained(
537
+ model_name: str,
538
+ pretrained: Optional[str] = None,
539
+ precision: str = 'fp32',
540
+ device: Union[str, torch.device] = 'cpu',
541
+ jit: bool = False,
542
+ force_quick_gelu: bool = False,
543
+ force_custom_text: bool = False,
544
+ force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
545
+ image_mean: Optional[Tuple[float, ...]] = None,
546
+ image_std: Optional[Tuple[float, ...]] = None,
547
+ image_interpolation: Optional[str] = None,
548
+ image_resize_mode: Optional[str] = None, # only effective for inference
549
+ return_transform: bool = True,
550
+ cache_dir: Optional[str] = None,
551
+ load_weights_only: bool = True,
552
+ **model_kwargs,
553
+ ):
554
+ force_preprocess_cfg = merge_preprocess_kwargs(
555
+ {},
556
+ mean=image_mean,
557
+ std=image_std,
558
+ interpolation=image_interpolation,
559
+ resize_mode=image_resize_mode,
560
+ )
561
+
562
+ model = create_model(
563
+ model_name,
564
+ pretrained,
565
+ precision=precision,
566
+ device=device,
567
+ jit=jit,
568
+ force_quick_gelu=force_quick_gelu,
569
+ force_custom_text=force_custom_text,
570
+ force_image_size=force_image_size,
571
+ force_preprocess_cfg=force_preprocess_cfg,
572
+ cache_dir=cache_dir,
573
+ require_pretrained=True,
574
+ load_weights_only=load_weights_only,
575
+ **model_kwargs,
576
+ )
577
+
578
+ if not return_transform:
579
+ return model
580
+
581
+ preprocess = image_transform_v2(
582
+ PreprocessCfg(**model.visual.preprocess_cfg),
583
+ is_train=False,
584
+ )
585
+
586
+ return model, preprocess
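Beyond the docstring examples above, `add_model_config()` lets external configs join the registry; a hedged sketch follows (the './my_configs' directory and the 'My-ViT' config inside it are assumptions).

```python
import open_clip

# The JSON must carry at least embed_dim, vision_cfg and text_cfg keys to be registered.
open_clip.add_model_config('./my_configs')
assert 'My-ViT' in open_clip.list_models()
model = open_clip.create_model('My-ViT', precision='fp32')
```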
src/open_clip/hf_configs.py ADDED
@@ -0,0 +1,67 @@
1
+ # HF architecture dict:
2
+ arch_dict = {
3
+ # https://huggingface.co/docs/transformers/model_doc/roberta#roberta
4
+ "roberta": {
5
+ "config_names": {
6
+ "context_length": "max_position_embeddings",
7
+ "vocab_size": "vocab_size",
8
+ "width": "hidden_size",
9
+ "heads": "num_attention_heads",
10
+ "layers": "num_hidden_layers",
11
+ "layer_attr": "layer",
12
+ "token_embeddings_attr": "embeddings"
13
+ },
14
+ "pooler": "mean_pooler",
15
+ },
16
+ # https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaConfig
17
+ "xlm-roberta": {
18
+ "config_names": {
19
+ "context_length": "max_position_embeddings",
20
+ "vocab_size": "vocab_size",
21
+ "width": "hidden_size",
22
+ "heads": "num_attention_heads",
23
+ "layers": "num_hidden_layers",
24
+ "layer_attr": "layer",
25
+ "token_embeddings_attr": "embeddings"
26
+ },
27
+ "pooler": "mean_pooler",
28
+ },
29
+ # https://huggingface.co/docs/transformers/model_doc/mt5#mt5
30
+ "mt5": {
31
+ "config_names": {
32
+ # unlimited seqlen
33
+ # https://github.com/google-research/text-to-text-transfer-transformer/issues/273
34
+ # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/t5/modeling_t5.py#L374
35
+ "context_length": "",
36
+ "vocab_size": "vocab_size",
37
+ "width": "d_model",
38
+ "heads": "num_heads",
39
+ "layers": "num_layers",
40
+ "layer_attr": "block",
41
+ "token_embeddings_attr": "embed_tokens"
42
+ },
43
+ "pooler": "mean_pooler",
44
+ },
45
+ # https://huggingface.co/docs/transformers/model_doc/bert
46
+ "bert": {
47
+ "config_names": {
48
+ "context_length": "max_position_embeddings",
49
+ "vocab_size": "vocab_size",
50
+ "width": "hidden_size",
51
+ "heads": "num_attention_heads",
52
+ "layers": "num_hidden_layers",
53
+ },
54
+ "pooler": "cls_pooler",
55
+ },
56
+ # https://huggingface.co/docs/transformers/model_doc/m2m_100
57
+ "m2m_100": {
58
+ "config_names": {
59
+ "context_length": "max_position_embeddings",
60
+ "vocab_size": "vocab_size",
61
+ "width": "d_model",
62
+ "heads": "encoder_attention_heads",
63
+ "layers": "encoder_layers",
64
+ },
65
+ "pooler": "cls_pooler",
66
+ },
67
+ }
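A rough sketch of how arch_dict is consumed: the generic config keys are resolved to architecture-specific attribute names on a HuggingFace config object. This assumes transformers is installed; 'xlm-roberta-base' is just an illustrative checkpoint name.

from transformers import AutoConfig
from open_clip.hf_configs import arch_dict

config = AutoConfig.from_pretrained('xlm-roberta-base')   # config.model_type == 'xlm-roberta'
names = arch_dict[config.model_type]['config_names']

width = getattr(config, names['width'])     # hidden_size for (XLM-)RoBERTa
layers = getattr(config, names['layers'])   # num_hidden_layers
default_pooler = arch_dict[config.model_type]['pooler']   # 'mean_pooler'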
src/open_clip/hf_model.py ADDED
@@ -0,0 +1,193 @@
1
+ """ huggingface model adapter
2
+
3
+ Wraps HuggingFace transformers (https://github.com/huggingface/transformers) models for use as a text tower in a CLIP model.
4
+ """
5
+ import re
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ from torch import TensorType
10
+
11
+ try:
12
+ import transformers
13
+ from transformers import AutoModel, AutoTokenizer, AutoConfig, PretrainedConfig
14
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, \
15
+ BaseModelOutputWithPoolingAndCrossAttentions
16
+ except ImportError as e:
17
+ transformers = None
18
+
19
+
20
+ class BaseModelOutput:
21
+ pass
22
+
23
+
24
+ class PretrainedConfig:
25
+ pass
26
+
27
+ from .hf_configs import arch_dict
28
+
29
+
30
+ # utils
31
+ def _camel2snake(s):
32
+ return re.sub(r'(?<!^)(?=[A-Z])', '_', s).lower()
33
+
34
+
35
+ # TODO: ?last - for gpt-like models
36
+ _POOLERS = {}
37
+
38
+
39
+ def register_pooler(cls):
40
+ """Decorator registering pooler class"""
41
+ _POOLERS[_camel2snake(cls.__name__)] = cls
42
+ return cls
43
+
44
+
45
+ @register_pooler
46
+ class MeanPooler(nn.Module):
47
+ """Mean pooling"""
48
+
49
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
50
+ masked_output = x.last_hidden_state * attention_mask.unsqueeze(-1)
51
+ return masked_output.sum(dim=1) / attention_mask.sum(-1, keepdim=True)
52
+
53
+
54
+ @register_pooler
55
+ class MaxPooler(nn.Module):
56
+ """Max pooling"""
57
+
58
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
59
+ # mask out padded positions (attention_mask == 0) with -inf before taking the max
+ masked_output = x.last_hidden_state.masked_fill((attention_mask == 0).unsqueeze(-1), -torch.inf)
60
+ return masked_output.max(1).values
61
+
62
+
63
+ @register_pooler
64
+ class ClsPooler(nn.Module):
65
+ """CLS token pooling"""
66
+
67
+ def __init__(self, use_pooler_output=True):
68
+ super().__init__()
69
+ self.cls_token_position = 0
70
+ self.use_pooler_output = use_pooler_output
71
+
72
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
73
+ if (self.use_pooler_output and
74
+ isinstance(x, (BaseModelOutputWithPooling, BaseModelOutputWithPoolingAndCrossAttentions)) and
75
+ (x.pooler_output is not None)
76
+ ):
77
+ return x.pooler_output
78
+
79
+ return x.last_hidden_state[:, self.cls_token_position, :]
80
+
81
+
82
+ @register_pooler
83
+ class ClsLastHiddenStatePooler(nn.Module):
84
+ """CLS token pooling
85
+ NOTE: this is equivalent to ClsPooler above with use_pooler_output=False
86
+ """
87
+
88
+ def __init__(self):
89
+ super().__init__()
90
+ self.cls_token_position = 0
91
+
92
+ def forward(self, x: BaseModelOutput, attention_mask: TensorType):
93
+ return x.last_hidden_state[:, self.cls_token_position, :]
94
+
95
+
96
+ class HFTextEncoder(nn.Module):
97
+ """HuggingFace model adapter"""
98
+ output_tokens: torch.jit.Final[bool]
99
+
100
+ def __init__(
101
+ self,
102
+ model_name_or_path: str,
103
+ output_dim: int,
104
+ config: PretrainedConfig = None,
105
+ pooler_type: str = None,
106
+ proj_type: str = None,
107
+ pretrained: bool = True,
108
+ output_tokens: bool = False,
109
+ ):
110
+ super().__init__()
111
+ self.output_tokens = output_tokens
112
+ self.output_dim = output_dim
113
+
114
+ # TODO: find better way to get this information
115
+ uses_transformer_pooler = (pooler_type == "cls_pooler")
116
+
117
+ if transformers is None:
118
+ raise RuntimeError("Please `pip install transformers` to use pre-trained HuggingFace models")
119
+ if config is None:
120
+ self.config = AutoConfig.from_pretrained(model_name_or_path)
121
+ create_func, model_args = (AutoModel.from_pretrained, model_name_or_path) if pretrained else (
122
+ AutoModel.from_config, self.config)
123
+ # TODO: do all model configs have this attribute? PretrainedConfig does, so presumably yes.
124
+ if hasattr(self.config, "is_encoder_decoder") and self.config.is_encoder_decoder:
125
+ self.transformer = create_func(model_args)
126
+ self.transformer = self.transformer.encoder
127
+ else:
128
+ self.transformer = create_func(model_args, add_pooling_layer=uses_transformer_pooler)
129
+ else:
130
+ self.config = config
131
+ self.transformer = AutoModel.from_config(config)
132
+ if pooler_type is None: # get default arch pooler
133
+ pooler_type = (arch_dict[self.config.model_type]["pooler"])
134
+
135
+ # FIXME downstream users of OpenCLIP models use these attr, need to verify valid across all models
136
+ self.vocab_size = getattr(self.config, 'vocab_size', 0)
137
+ self.context_length = getattr(self.config, 'max_position_embeddings', 0)
138
+
139
+ self.pooler = _POOLERS[pooler_type]()
140
+
141
+ d_model = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["width"])
142
+ if (d_model == output_dim) and (proj_type is None): # do we always need a proj?
143
+ self.proj = nn.Identity()
144
+ elif proj_type == 'linear':
145
+ self.proj = nn.Linear(d_model, output_dim, bias=False)
146
+ elif proj_type == 'mlp':
147
+ hidden_size = (d_model + output_dim) // 2
148
+ self.proj = nn.Sequential(
149
+ nn.Linear(d_model, hidden_size, bias=False),
150
+ nn.GELU(),
151
+ nn.Linear(hidden_size, output_dim, bias=False),
152
+ )
153
+
154
+ def forward(self, x: TensorType):
155
+ attn_mask = (x != self.config.pad_token_id).long()
156
+ out = self.transformer(input_ids=x, attention_mask=attn_mask)
157
+ pooled_out = self.pooler(out, attn_mask)
158
+ projected = self.proj(pooled_out)
159
+
160
+ seq_len = out.last_hidden_state.shape[1]
161
+ tokens = (
162
+ out.last_hidden_state[:, torch.arange(seq_len) != self.pooler.cls_token_position, :]
163
+ if type(self.pooler) == ClsPooler
164
+ else out.last_hidden_state
165
+ )
166
+
167
+ if self.output_tokens:
168
+ return projected, tokens
169
+ return projected
170
+
171
+ def lock(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
172
+ if not unlocked_layers: # full freezing
173
+ for n, p in self.transformer.named_parameters():
174
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
175
+ return
176
+
177
+ encoder = self.transformer.encoder if hasattr(self.transformer, 'encoder') else self.transformer
178
+ layer_list = getattr(encoder, arch_dict[self.config.model_type]["config_names"]["layer_attr"])
179
+ print(f"Unlocking {unlocked_layers}/{len(layer_list) + 1} layers of hf model")
180
+ embeddings = getattr(
181
+ self.transformer, arch_dict[self.config.model_type]["config_names"]["token_embeddings_attr"])
182
+ modules = [embeddings, *layer_list][:-unlocked_layers]
183
+ # freeze layers
184
+ for module in modules:
185
+ for n, p in module.named_parameters():
186
+ p.requires_grad = (not freeze_layer_norm) if "LayerNorm" in n.split(".") else False
187
+
188
+ @torch.jit.ignore
189
+ def set_grad_checkpointing(self, enable=True):
190
+ self.transformer.gradient_checkpointing_enable()
191
+
192
+ def init_parameters(self):
193
+ pass
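A hedged usage sketch of HFTextEncoder as a standalone text tower (the checkpoint name, output_dim, and sequence length are illustrative only; weights are fetched from the HuggingFace hub):

import torch
from transformers import AutoTokenizer
from open_clip.hf_model import HFTextEncoder

# Wrap a placeholder HF checkpoint and project its pooled output to a 512-d embedding space.
text_tower = HFTextEncoder('xlm-roberta-base', output_dim=512, proj_type='mlp', pooler_type='mean_pooler')

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
ids = tokenizer(['a photo of a cat'], return_tensors='pt', padding='max_length', max_length=64).input_ids

with torch.no_grad():
    features = text_tower(ids)   # shape (1, 512): mean-pooled, MLP-projected features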
src/open_clip/loss.py ADDED
@@ -0,0 +1,447 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.nn import functional as F
6
+
7
+ try:
8
+ import torch.distributed.nn
9
+ from torch import distributed as dist
10
+
11
+ has_distributed = True
12
+ except ImportError:
13
+ has_distributed = False
14
+
15
+ try:
16
+ import horovod.torch as hvd
17
+ except ImportError:
18
+ hvd = None
19
+
20
+
21
+ def gather_features(
22
+ image_features,
23
+ text_features,
24
+ local_loss=False,
25
+ gather_with_grad=False,
26
+ rank=0,
27
+ world_size=1,
28
+ use_horovod=False
29
+ ):
30
+ assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
31
+ if use_horovod:
32
+ assert hvd is not None, 'Please install horovod'
33
+ if gather_with_grad:
34
+ all_image_features = hvd.allgather(image_features)
35
+ all_text_features = hvd.allgather(text_features)
36
+ else:
37
+ with torch.no_grad():
38
+ all_image_features = hvd.allgather(image_features)
39
+ all_text_features = hvd.allgather(text_features)
40
+ if not local_loss:
41
+ # ensure grads for local rank when all_* features don't have a gradient
42
+ gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
43
+ gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
44
+ gathered_image_features[rank] = image_features
45
+ gathered_text_features[rank] = text_features
46
+ all_image_features = torch.cat(gathered_image_features, dim=0)
47
+ all_text_features = torch.cat(gathered_text_features, dim=0)
48
+ else:
49
+ # We gather tensors from all gpus
50
+ if gather_with_grad:
51
+ all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
52
+ all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
53
+ else:
54
+ gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
55
+ gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
56
+ dist.all_gather(gathered_image_features, image_features)
57
+ dist.all_gather(gathered_text_features, text_features)
58
+ if not local_loss:
59
+ # ensure grads for local rank when all_* features don't have a gradient
60
+ gathered_image_features[rank] = image_features
61
+ gathered_text_features[rank] = text_features
62
+ all_image_features = torch.cat(gathered_image_features, dim=0)
63
+ all_text_features = torch.cat(gathered_text_features, dim=0)
64
+
65
+ return all_image_features, all_text_features
66
+
67
+
68
+ class ClipLoss(nn.Module):
69
+
70
+ def __init__(
71
+ self,
72
+ local_loss=False,
73
+ gather_with_grad=False,
74
+ cache_labels=False,
75
+ rank=0,
76
+ world_size=1,
77
+ use_horovod=False,
78
+ ):
79
+ super().__init__()
80
+ self.local_loss = local_loss
81
+ self.gather_with_grad = gather_with_grad
82
+ self.cache_labels = cache_labels
83
+ self.rank = rank
84
+ self.world_size = world_size
85
+ self.use_horovod = use_horovod
86
+
87
+ # cache state
88
+ self.prev_num_logits = 0
89
+ self.labels = {}
90
+
91
+ def get_ground_truth(self, device, num_logits) -> torch.Tensor:
92
+ # calculate ground-truth and cache if enabled
93
+ if self.prev_num_logits != num_logits or device not in self.labels:
94
+ labels = torch.arange(num_logits, device=device, dtype=torch.long)
95
+ if self.world_size > 1 and self.local_loss:
96
+ labels = labels + num_logits * self.rank
97
+ if self.cache_labels:
98
+ self.labels[device] = labels
99
+ self.prev_num_logits = num_logits
100
+ else:
101
+ labels = self.labels[device]
102
+ return labels
103
+
104
+ def get_logits(self, image_features, text_features, logit_scale):
105
+ if self.world_size > 1:
106
+ all_image_features, all_text_features = gather_features(
107
+ image_features,
108
+ text_features,
109
+ local_loss=self.local_loss,
110
+ gather_with_grad=self.gather_with_grad,
111
+ rank=self.rank,
112
+ world_size=self.world_size,
113
+ use_horovod=self.use_horovod,
114
+ )
115
+
116
+ if self.local_loss:
117
+ logits_per_image = logit_scale * image_features @ all_text_features.T
118
+ logits_per_text = logit_scale * text_features @ all_image_features.T
119
+ else:
120
+ logits_per_image = logit_scale * all_image_features @ all_text_features.T
121
+ logits_per_text = logits_per_image.T
122
+ else:
123
+ logits_per_image = logit_scale * image_features @ text_features.T
124
+ logits_per_text = logit_scale * text_features @ image_features.T
125
+
126
+ return logits_per_image, logits_per_text
127
+
128
+ def forward(self, image_features, text_features, logit_scale, output_dict=False):
129
+ device = image_features.device
130
+ logits_per_image, logits_per_text = self.get_logits(image_features, text_features, logit_scale)
131
+ labels = self.get_ground_truth(device, logits_per_image.shape[0])
132
+
133
+ total_loss = (
134
+ F.cross_entropy(logits_per_image, labels) +
135
+ F.cross_entropy(logits_per_text, labels)
136
+ ) / 2
137
+
138
+ return {"contrastive_loss": total_loss} if output_dict else total_loss
139
+
140
+
141
+ class CoCaLoss(ClipLoss):
142
+ def __init__(
143
+ self,
144
+ caption_loss_weight,
145
+ clip_loss_weight,
146
+ pad_id=0, # pad_token for open_clip custom tokenizer
147
+ local_loss=False,
148
+ gather_with_grad=False,
149
+ cache_labels=False,
150
+ rank=0,
151
+ world_size=1,
152
+ use_horovod=False,
153
+ ):
154
+ super().__init__(
155
+ local_loss=local_loss,
156
+ gather_with_grad=gather_with_grad,
157
+ cache_labels=cache_labels,
158
+ rank=rank,
159
+ world_size=world_size,
160
+ use_horovod=use_horovod
161
+ )
162
+
163
+ self.clip_loss_weight = clip_loss_weight
164
+ self.caption_loss_weight = caption_loss_weight
165
+ self.caption_loss = nn.CrossEntropyLoss(ignore_index=pad_id)
166
+
167
+ def forward(self, image_features, text_features, logits, labels, logit_scale, output_dict=False):
168
+ if self.clip_loss_weight:
169
+ clip_loss = super().forward(image_features, text_features, logit_scale)
170
+ clip_loss = self.clip_loss_weight * clip_loss
171
+ else:
172
+ clip_loss = torch.tensor(0, device=logits.device)
173
+
174
+ caption_loss = self.caption_loss(
175
+ logits.permute(0, 2, 1),
176
+ labels,
177
+ )
178
+ caption_loss = caption_loss * self.caption_loss_weight
179
+
180
+ if output_dict:
181
+ return {"contrastive_loss": clip_loss, "caption_loss": caption_loss}
182
+
183
+ return clip_loss, caption_loss
184
+
185
+
186
+ class DistillClipLoss(ClipLoss):
187
+
188
+ def dist_loss(self, teacher_logits, student_logits):
189
+ return -(teacher_logits.softmax(dim=1) * student_logits.log_softmax(dim=1)).sum(dim=1).mean(dim=0)
190
+
191
+ def forward(
192
+ self,
193
+ image_features,
194
+ text_features,
195
+ logit_scale,
196
+ dist_image_features,
197
+ dist_text_features,
198
+ dist_logit_scale,
199
+ output_dict=False,
200
+ ):
201
+ logits_per_image, logits_per_text = \
202
+ self.get_logits(image_features, text_features, logit_scale)
203
+
204
+ dist_logits_per_image, dist_logits_per_text = \
205
+ self.get_logits(dist_image_features, dist_text_features, dist_logit_scale)
206
+
207
+ labels = self.get_ground_truth(image_features.device, logits_per_image.shape[0])
208
+
209
+ contrastive_loss = (
210
+ F.cross_entropy(logits_per_image, labels) +
211
+ F.cross_entropy(logits_per_text, labels)
212
+ ) / 2
213
+
214
+ distill_loss = (
215
+ self.dist_loss(dist_logits_per_image, logits_per_image) +
216
+ self.dist_loss(dist_logits_per_text, logits_per_text)
217
+ ) / 2
218
+
219
+ if output_dict:
220
+ return {"contrastive_loss": contrastive_loss, "distill_loss": distill_loss}
221
+
222
+ return contrastive_loss, distill_loss
223
+
224
+
225
+ def neighbour_exchange(from_rank, to_rank, tensor, group=None):
226
+ tensor_recv = torch.zeros_like(tensor)
227
+ send_op = torch.distributed.P2POp(
228
+ torch.distributed.isend,
229
+ tensor,
230
+ to_rank,
231
+ group=group,
232
+ )
233
+ recv_op = torch.distributed.P2POp(
234
+ torch.distributed.irecv,
235
+ tensor_recv,
236
+ from_rank,
237
+ group=group,
238
+ )
239
+ reqs = torch.distributed.batch_isend_irecv([send_op, recv_op])
240
+ for req in reqs:
241
+ req.wait()
242
+ return tensor_recv
243
+
244
+
245
+ def neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None):
246
+ tensor_from_left = torch.zeros_like(tensor_to_right)
247
+ tensor_from_right = torch.zeros_like(tensor_to_left)
248
+ send_op_left = torch.distributed.P2POp(
249
+ torch.distributed.isend,
250
+ tensor_to_left,
251
+ left_rank,
252
+ group=group,
253
+ )
254
+ send_op_right = torch.distributed.P2POp(
255
+ torch.distributed.isend,
256
+ tensor_to_right,
257
+ right_rank,
258
+ group=group,
259
+ )
260
+ recv_op_left = torch.distributed.P2POp(
261
+ torch.distributed.irecv,
262
+ tensor_from_left,
263
+ left_rank,
264
+ group=group,
265
+ )
266
+ recv_op_right = torch.distributed.P2POp(
267
+ torch.distributed.irecv,
268
+ tensor_from_right,
269
+ right_rank,
270
+ group=group,
271
+ )
272
+ reqs = torch.distributed.batch_isend_irecv([send_op_right, send_op_left, recv_op_right, recv_op_left])
273
+ for req in reqs:
274
+ req.wait()
275
+ return tensor_from_right, tensor_from_left
276
+
277
+
278
+ class NeighbourExchange(torch.autograd.Function):
279
+ @staticmethod
280
+ def forward(ctx, from_rank, to_rank, group, tensor):
281
+ ctx.group = group
282
+ ctx.from_rank = from_rank
283
+ ctx.to_rank = to_rank
284
+ return neighbour_exchange(from_rank, to_rank, tensor, group=group)
285
+
286
+ @staticmethod
287
+ def backward(ctx, grad_output):
288
+ return (None, None, None) + (NeighbourExchange.apply(ctx.to_rank, ctx.from_rank, ctx.group, grad_output),)
289
+
290
+
291
+ def neighbour_exchange_with_grad(from_rank, to_rank, tensor, group=None):
292
+ return NeighbourExchange.apply(from_rank, to_rank, group, tensor)
293
+
294
+
295
+ class NeighbourExchangeBidir(torch.autograd.Function):
296
+ @staticmethod
297
+ def forward(ctx, left_rank, right_rank, group, tensor_to_left, tensor_to_right):
298
+ ctx.group = group
299
+ ctx.left_rank = left_rank
300
+ ctx.right_rank = right_rank
301
+ return neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=group)
302
+
303
+ @staticmethod
304
+ def backward(ctx, *grad_outputs):
305
+ return (None, None, None) + \
306
+ NeighbourExchangeBidir.apply(ctx.right_rank, ctx.left_rank, ctx.group, *grad_outputs)
307
+
308
+
309
+ def neighbour_exchange_bidir_with_grad(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None):
310
+ return NeighbourExchangeBidir.apply(left_rank, right_rank, group, tensor_to_left, tensor_to_right)
311
+
312
+
313
+ class SigLipLoss(nn.Module):
314
+ """ Sigmoid Loss for Language Image Pre-Training (SigLIP) - https://arxiv.org/abs/2303.15343
315
+
316
+ @article{zhai2023sigmoid,
317
+ title={Sigmoid loss for language image pre-training},
318
+ author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
319
+ journal={arXiv preprint arXiv:2303.15343},
320
+ year={2023}
321
+ }
322
+ """
323
+ def __init__(
324
+ self,
325
+ cache_labels: bool = False,
326
+ rank: int = 0,
327
+ world_size: int = 1,
328
+ dist_impl: Optional[str] = None,
329
+ ):
330
+ super().__init__()
331
+ self.cache_labels = cache_labels
332
+ self.rank = rank
333
+ self.world_size = world_size
334
+ self.dist_impl = dist_impl or 'bidir' # default to bidir exchange for now, this will likely change
335
+ assert self.dist_impl in ('bidir', 'shift', 'reduce', 'gather')
336
+
337
+ # cache state. FIXME: cache is not currently used; is it worthwhile?
338
+ self.prev_num_logits = 0
339
+ self.labels = {}
340
+
341
+ def get_ground_truth(self, device, dtype, num_logits, negative_only=False) -> torch.Tensor:
342
+ labels = -torch.ones((num_logits, num_logits), device=device, dtype=dtype)
343
+ if not negative_only:
344
+ labels = 2 * torch.eye(num_logits, device=device, dtype=dtype) + labels
345
+ return labels
346
+
347
+ def get_logits(self, image_features, text_features, logit_scale, logit_bias=None):
348
+ logits = logit_scale * image_features @ text_features.T
349
+ if logit_bias is not None:
350
+ logits += logit_bias
351
+ return logits
352
+
353
+ def _loss(self, image_features, text_features, logit_scale, logit_bias=None, negative_only=False):
354
+ logits = self.get_logits(image_features, text_features, logit_scale, logit_bias)
355
+ labels = self.get_ground_truth(
356
+ image_features.device,
357
+ image_features.dtype,
358
+ image_features.shape[0],
359
+ negative_only=negative_only,
360
+ )
361
+ loss = -F.logsigmoid(labels * logits).sum() / image_features.shape[0]
362
+ return loss
363
+
364
+ def forward(self, image_features, text_features, logit_scale, logit_bias, output_dict=False):
365
+ loss = self._loss(image_features, text_features, logit_scale, logit_bias)
366
+
367
+ if self.world_size > 1:
368
+ if self.dist_impl == 'bidir':
369
+ right_rank = (self.rank + 1) % self.world_size
370
+ left_rank = (self.rank - 1 + self.world_size) % self.world_size
371
+ text_features_to_right = text_features_to_left = text_features
372
+ num_bidir, remainder = divmod(self.world_size - 1, 2)
373
+ for i in range(num_bidir):
374
+ text_features_recv = neighbour_exchange_bidir_with_grad(
375
+ left_rank,
376
+ right_rank,
377
+ text_features_to_left,
378
+ text_features_to_right,
379
+ )
380
+ for f in text_features_recv:
381
+ loss += self._loss(
382
+ image_features,
383
+ f,
384
+ logit_scale,
385
+ logit_bias,
386
+ negative_only=True,
387
+ )
388
+ text_features_to_left, text_features_to_right = text_features_recv
389
+
390
+ if remainder:
391
+ text_features_recv = neighbour_exchange_with_grad(
392
+ left_rank,
393
+ right_rank,
394
+ text_features_to_right
395
+ )
396
+ loss += self._loss(
397
+ image_features,
398
+ text_features_recv,
399
+ logit_scale,
400
+ logit_bias,
401
+ negative_only=True,
402
+ )
403
+ elif self.dist_impl == "shift":
404
+ right_rank = (self.rank + 1) % self.world_size
405
+ left_rank = (self.rank - 1 + self.world_size) % self.world_size
406
+ text_features_to_right = text_features
407
+ for i in range(self.world_size - 1):
408
+ text_features_from_left = neighbour_exchange_with_grad(
409
+ left_rank,
410
+ right_rank,
411
+ text_features_to_right,
412
+ )
413
+ loss += self._loss(
414
+ image_features,
415
+ text_features_from_left,
416
+ logit_scale,
417
+ logit_bias,
418
+ negative_only=True,
419
+ )
420
+ text_features_to_right = text_features_from_left
421
+ elif self.dist_impl == "reduce":
422
+ for i in range(self.world_size):
423
+ text_from_other = torch.distributed.nn.all_reduce(
424
+ text_features * (self.rank == i),
425
+ torch.distributed.ReduceOp.SUM,
426
+ )
427
+ loss += float(i != self.rank) * self._loss(
428
+ image_features,
429
+ text_from_other,
430
+ logit_scale,
431
+ logit_bias,
432
+ negative_only=True,
433
+ )
434
+ elif self.dist_impl == "gather":
435
+ all_text = torch.distributed.nn.all_gather(text_features)
436
+ for i in range(self.world_size):
437
+ loss += float(i != self.rank) * self._loss(
438
+ image_features,
439
+ all_text[i],
440
+ logit_scale,
441
+ logit_bias,
442
+ negative_only=True,
443
+ )
444
+ else:
445
+ assert False
446
+
447
+ return {"contrastive_loss": loss} if output_dict else loss
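As a single-process sanity check of the two main objectives in this file (ClipLoss and SigLipLoss), a minimal sketch with random features; the feature dimension, logit scale, and bias are arbitrary illustrative values, not trained ones.

import torch
import torch.nn.functional as F
from open_clip.loss import ClipLoss, SigLipLoss

# Toy batch of 8 paired image/text embeddings, L2-normalized as the towers would produce.
img = F.normalize(torch.randn(8, 512), dim=-1)
txt = F.normalize(torch.randn(8, 512), dim=-1)

clip_loss = ClipLoss()   # world_size=1, so no cross-GPU gathering is triggered
print(clip_loss(img, txt, logit_scale=torch.tensor(100.0)))

siglip_loss = SigLipLoss()   # pairwise sigmoid loss; takes a logit scale and a logit bias
print(siglip_loss(img, txt, logit_scale=torch.tensor(10.0), logit_bias=torch.tensor(-10.0)))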
src/open_clip/model.py ADDED
@@ -0,0 +1,919 @@
1
+ """ CLIP Model
2
+
3
+ Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
4
+ """
5
+ import copy
6
+ import logging
7
+ import math
8
+ from dataclasses import dataclass
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+
11
+ import numpy as np
12
+ import torch
13
+ import torch.nn.functional as F
14
+ from torch import nn
15
+ from torch.utils.checkpoint import checkpoint
16
+ from functools import partial
17
+
18
+ from .hf_model import HFTextEncoder
19
+ from .modified_resnet import ModifiedResNet
20
+ from .timm_model import TimmModel
21
+ from .transformer import LayerNormFp32, LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer,\
22
+ text_global_pool
23
+ from .utils import to_2tuple
24
+
25
+
26
+ @dataclass
27
+ class CLIPVisionCfg:
28
+ layers: Union[Tuple[int, int, int, int], int] = 12
29
+ width: int = 768
30
+ head_width: int = 64
31
+ mlp_ratio: float = 4.0
32
+ patch_size: int = 16
33
+ image_size: Union[Tuple[int, int], int] = 224
34
+ in_chans: int = 3
35
+
36
+ ls_init_value: Optional[float] = None # layer scale initial value
37
+ patch_dropout: float = 0. # fraction of patches to drop out during training (0 disables patch dropout); 0.5-0.75 recommended in the paper for optimal results
38
+ attentional_pool: bool = False # whether to use attentional pooler in the last embedding layer (overrides pool_type)
39
+ attn_pooler_queries: int = 256 # n_queries for attentional pooler
40
+ attn_pooler_heads: int = 8 # n heads for attentional_pooling
41
+ no_ln_pre: bool = False # disable pre transformer LayerNorm
42
+ pos_embed_type: str = 'learnable'
43
+ final_ln_after_pool: bool = False # apply final LayerNorm after pooling
44
+ pool_type: str = 'tok'
45
+ output_tokens: bool = False
46
+ act_kwargs: Optional[dict] = None
47
+ norm_kwargs: Optional[dict] = None
48
+
49
+ timm_model_name: Optional[str] = None # a valid model name overrides layers, width, patch_size
50
+ timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model
51
+ timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '')
52
+ timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '')
53
+ timm_proj_bias: bool = False # enable bias final projection
54
+ timm_drop: float = 0. # head dropout
55
+ timm_drop_path: Optional[float] = None # backbone stochastic depth
56
+
57
+
58
+ @dataclass
59
+ class CLIPTextCfg:
60
+ context_length: int = 77
61
+ vocab_size: int = 49408
62
+ hf_tokenizer_name: Optional[str] = None
63
+ tokenizer_kwargs: Optional[dict] = None
64
+
65
+ width: int = 512
66
+ heads: int = 8
67
+ layers: int = 12
68
+ mlp_ratio: float = 4.0
69
+ ls_init_value: Optional[float] = None # layer scale initial value
70
+ embed_cls: bool = False
71
+ pad_id: int = 0
72
+ no_causal_mask: bool = False # disable causal masking
73
+ final_ln_after_pool: bool = False # apply final LayerNorm after pooling
74
+ pool_type: str = 'argmax'
75
+ proj_bias: bool = False
76
+ proj_type: str = 'linear' # control final text projection, 'none' forces no projection
77
+ output_tokens: bool = False
78
+ act_kwargs: Optional[dict] = None
79
+ norm_kwargs: Optional[dict] = None
80
+
81
+ # HuggingFace specific text tower config
82
+ hf_model_name: Optional[str] = None
83
+ hf_model_pretrained: bool = True
84
+ hf_proj_type: str = 'mlp'
85
+ hf_pooler_type: str = 'mean_pooler' # pooler type for HF text models ('mean_pooler', 'cls_pooler', ...)
86
+
87
+
88
+ def get_cast_dtype(precision: str):
89
+ cast_dtype = None
90
+ if precision == 'bf16':
91
+ cast_dtype = torch.bfloat16
92
+ elif precision == 'fp16':
93
+ cast_dtype = torch.float16
94
+ return cast_dtype
95
+
96
+
97
+ def get_input_dtype(precision: str):
98
+ input_dtype = None
99
+ if precision in ('bf16', 'pure_bf16'):
100
+ input_dtype = torch.bfloat16
101
+ elif precision in ('fp16', 'pure_fp16'):
102
+ input_dtype = torch.float16
103
+ return input_dtype
104
+
105
+
106
+ def _build_vision_tower(
107
+ embed_dim: int,
108
+ vision_cfg: CLIPVisionCfg,
109
+ quick_gelu: bool = False,
110
+ cast_dtype: Optional[torch.dtype] = None
111
+ ):
112
+ if isinstance(vision_cfg, dict):
113
+ vision_cfg = CLIPVisionCfg(**vision_cfg)
114
+
115
+ # OpenAI models are pretrained w/ QuickGELU but native nn.GELU is both faster and more
116
+ # memory efficient in recent PyTorch releases (>= 1.10).
117
+ # NOTE: timm models always use native GELU regardless of quick_gelu flag.
118
+ act_layer = QuickGELU if quick_gelu else nn.GELU
119
+
120
+ if vision_cfg.timm_model_name:
121
+ visual = TimmModel(
122
+ vision_cfg.timm_model_name,
123
+ pretrained=vision_cfg.timm_model_pretrained,
124
+ pool=vision_cfg.timm_pool,
125
+ proj=vision_cfg.timm_proj,
126
+ proj_bias=vision_cfg.timm_proj_bias,
127
+ drop=vision_cfg.timm_drop,
128
+ drop_path=vision_cfg.timm_drop_path,
129
+ patch_drop=vision_cfg.patch_dropout if vision_cfg.patch_dropout > 0 else None,
130
+ embed_dim=embed_dim,
131
+ image_size=vision_cfg.image_size,
132
+ )
133
+ elif isinstance(vision_cfg.layers, (tuple, list)):
134
+ vision_heads = vision_cfg.width * 32 // vision_cfg.head_width
135
+ visual = ModifiedResNet(
136
+ layers=vision_cfg.layers,
137
+ output_dim=embed_dim,
138
+ heads=vision_heads,
139
+ image_size=vision_cfg.image_size,
140
+ width=vision_cfg.width,
141
+ )
142
+ else:
143
+ vision_heads = vision_cfg.width // vision_cfg.head_width
144
+ norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
145
+ if vision_cfg.norm_kwargs:
146
+ norm_layer = partial(norm_layer, **vision_cfg.norm_kwargs)
147
+ if vision_cfg.act_kwargs is not None:
148
+ act_layer = partial(act_layer, **vision_cfg.act_kwargs)
149
+
150
+ visual = VisionTransformer(
151
+ image_size=vision_cfg.image_size,
152
+ patch_size=vision_cfg.patch_size,
153
+ width=vision_cfg.width,
154
+ layers=vision_cfg.layers,
155
+ heads=vision_heads,
156
+ mlp_ratio=vision_cfg.mlp_ratio,
157
+ ls_init_value=vision_cfg.ls_init_value,
158
+ patch_dropout=vision_cfg.patch_dropout,
159
+ attentional_pool=vision_cfg.attentional_pool,
160
+ attn_pooler_queries=vision_cfg.attn_pooler_queries,
161
+ attn_pooler_heads=vision_cfg.attn_pooler_heads,
162
+ pos_embed_type=vision_cfg.pos_embed_type,
163
+ no_ln_pre=vision_cfg.no_ln_pre,
164
+ final_ln_after_pool=vision_cfg.final_ln_after_pool,
165
+ pool_type=vision_cfg.pool_type,
166
+ output_tokens=vision_cfg.output_tokens,
167
+ output_dim=embed_dim,
168
+ act_layer=act_layer,
169
+ norm_layer=norm_layer,
170
+ in_chans=vision_cfg.in_chans,
171
+ )
172
+
173
+ return visual
174
+
175
+
176
+ def _build_text_tower(
177
+ embed_dim: int,
178
+ text_cfg: CLIPTextCfg,
179
+ quick_gelu: bool = False,
180
+ cast_dtype: Optional[torch.dtype] = None,
181
+ ):
182
+ if isinstance(text_cfg, dict):
183
+ text_cfg = CLIPTextCfg(**text_cfg)
184
+
185
+ if text_cfg.hf_model_name:
186
+ text = HFTextEncoder(
187
+ text_cfg.hf_model_name,
188
+ output_dim=embed_dim,
189
+ proj_type=text_cfg.hf_proj_type,
190
+ pooler_type=text_cfg.hf_pooler_type,
191
+ pretrained=text_cfg.hf_model_pretrained,
192
+ output_tokens=text_cfg.output_tokens,
193
+ )
194
+ else:
195
+ act_layer = QuickGELU if quick_gelu else nn.GELU
196
+ norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm
197
+ if text_cfg.norm_kwargs:
198
+ norm_layer = partial(norm_layer, **text_cfg.norm_kwargs)
199
+ if text_cfg.act_kwargs is not None:
200
+ act_layer = partial(act_layer, **text_cfg.act_kwargs)
201
+
202
+ text = TextTransformer(
203
+ context_length=text_cfg.context_length,
204
+ vocab_size=text_cfg.vocab_size,
205
+ width=text_cfg.width,
206
+ heads=text_cfg.heads,
207
+ layers=text_cfg.layers,
208
+ mlp_ratio=text_cfg.mlp_ratio,
209
+ ls_init_value=text_cfg.ls_init_value,
210
+ output_dim=embed_dim,
211
+ embed_cls=text_cfg.embed_cls,
212
+ no_causal_mask=text_cfg.no_causal_mask,
213
+ pad_id=text_cfg.pad_id,
214
+ pool_type=text_cfg.pool_type,
215
+ proj_type=text_cfg.proj_type,
216
+ proj_bias=text_cfg.proj_bias,
217
+ output_tokens=text_cfg.output_tokens,
218
+ act_layer=act_layer,
219
+ norm_layer=norm_layer,
220
+ )
221
+ return text
222
+
223
+
224
+
225
+ class TrunkNet(nn.Module):
226
+ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
227
+ super().__init__()
228
+ self.net = nn.Sequential(
229
+ nn.Linear(input_dim, hidden_dim),
230
+ LayerNorm(hidden_dim),
231
+ nn.GELU(),
232
+ nn.Linear(hidden_dim, hidden_dim),
233
+ LayerNorm(hidden_dim),
234
+ nn.GELU(),
235
+ nn.Linear(hidden_dim, output_dim)
236
+ )
237
+
238
+ def forward(self, x):
239
+ return self.net(x)
244
+
245
+
246
+ class MultiTrunkNet(nn.Module):
247
+ def __init__(self, embed_dim: int):
248
+ super().__init__()
249
+ self.embed_dim = embed_dim
250
+
251
+ self.compound_trunk = TrunkNet(input_dim=159, hidden_dim=embed_dim, output_dim=embed_dim)
252
+ self.concentration_trunk = TrunkNet(input_dim=2, hidden_dim=embed_dim, output_dim=embed_dim)
253
+ self.time_trunk = TrunkNet(input_dim=1, hidden_dim=embed_dim, output_dim=embed_dim)
254
+
255
+ total_dim = embed_dim * 3
256
+ self.projection = nn.Linear(total_dim, embed_dim)
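+ # NOTE: this projection over the concatenated trunk outputs is not used in forward();
+ # the three feature sets are returned separately and concatenated in CLIP.encode_text.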
257
+
258
+ def forward(self, compound_embedding: torch.Tensor, concentration: torch.Tensor, time: torch.Tensor):
259
+
260
+ # Process each input through its own trunk
261
+ compound_features = self.compound_trunk(compound_embedding)
262
+
263
+ concentration_features = self.concentration_trunk(concentration)
264
+
265
+ time = time.unsqueeze(-1) if time.dim() == 1 else time
266
+ time_features = self.time_trunk(time)
267
+
268
+ # Return per-modality features; CLIP.encode_text concatenates them with the text features
269
+ return compound_features, concentration_features, time_features
270
+
271
+
272
+ class CLIP(nn.Module):
273
+ output_dict: torch.jit.Final[bool]
274
+
275
+ def __init__(
276
+ self,
277
+ embed_dim: int,
278
+ vision_cfg: CLIPVisionCfg,
279
+ text_cfg: CLIPTextCfg,
280
+ quick_gelu: bool = False,
281
+ init_logit_scale: float = np.log(1 / 0.07),
282
+ init_logit_bias: Optional[float] = None,
283
+ nonscalar_logit_scale: bool = False,
284
+ cast_dtype: Optional[torch.dtype] = None,
285
+ output_dict: bool = False,
286
+ ):
287
+ super().__init__()
288
+ self.output_dict = output_dict
289
+
290
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
291
+
292
+ text = _build_text_tower(int(embed_dim/4), text_cfg, quick_gelu, cast_dtype)
293
+ self.transformer = text.transformer
294
+ self.context_length = text.context_length
295
+ self.vocab_size = text.vocab_size
296
+ self.token_embedding = text.token_embedding
297
+ self.positional_embedding = text.positional_embedding
298
+ self.ln_final = text.ln_final
299
+ self.text_projection = text.text_projection
300
+ self.text_pool_type = text.pool_type
301
+ self.register_buffer('attn_mask', text.attn_mask, persistent=False)
302
+
303
+ # Multi-trunk net for compound/concentration/time inputs; each trunk outputs embed_dim/4 features
+ # that encode_text concatenates with the embed_dim/4 text features to match the image embed_dim
304
+ self.multi_trunk = MultiTrunkNet(int(embed_dim/4))
305
+
306
+ # # Add projection layer for concatenated features
307
+ # self.fusion_projection = nn.Linear(embed_dim * 4, embed_dim)
308
+
309
+ lshape = [1] if nonscalar_logit_scale else []
310
+ self.logit_scale = nn.Parameter(torch.ones(lshape) * init_logit_scale)
311
+ if init_logit_bias is not None:
312
+ self.logit_bias = nn.Parameter(torch.ones(lshape) * init_logit_bias)
313
+ else:
314
+ self.logit_bias = None
315
+
316
+ def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
317
+ # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
318
+ self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
319
+
320
+ @torch.jit.ignore
321
+ def set_grad_checkpointing(self, enable=True):
322
+ self.visual.set_grad_checkpointing(enable)
323
+ self.transformer.grad_checkpointing = enable
324
+
325
+ @torch.jit.ignore
326
+ def no_weight_decay(self):
327
+ # for timm optimizers, 1d params like logit_scale, logit_bias, ln/bn scale, biases are excluded by default
328
+ no_wd = {'positional_embedding'}
329
+ if hasattr(self.visual, 'no_weight_decay'):
330
+ for n in self.visual.no_weight_decay():
331
+ no_wd.add('visual.' + n)
332
+ return no_wd
333
+
334
+ def encode_image(self, image, normalize: bool = False):
335
+ features = self.visual(image)
336
+ return F.normalize(features, dim=-1) if normalize else features
337
+
338
+ def encode_text(self, text, normalize: bool = False, concentration: Optional[torch.Tensor] = None,
339
+ time: Optional[torch.Tensor] = None, compound_embedding: Optional[torch.Tensor] = None):
340
+ cast_dtype = self.transformer.get_cast_dtype()
341
+
342
+ x = self.token_embedding(text).to(cast_dtype)
343
+ x = x + self.positional_embedding.to(cast_dtype)
344
+ x = self.transformer(x, attn_mask=self.attn_mask)
345
+ x = self.ln_final(x)
346
+ x = text_global_pool(x, text, self.text_pool_type)
347
+
348
+ if self.text_projection is not None:
349
+ if isinstance(self.text_projection, nn.Linear):
350
+ x = self.text_projection(x)
351
+ else:
352
+ x = x @ self.text_projection
353
+
354
+ if compound_embedding is not None and concentration is not None and time is not None:
355
+ compound_features, concentration_features, time_features = self.multi_trunk(compound_embedding, concentration, time)
356
+ x = torch.cat([x, compound_features, concentration_features, time_features], dim=-1)
357
+
358
+ if normalize:
359
+ x = F.normalize(x, dim=-1)
360
+
361
+ return x
362
+
363
+ def get_logits(self, image, text, concentration: Optional[torch.Tensor] = None,
364
+ time: Optional[torch.Tensor] = None,
365
+ compound_embedding: Optional[torch.Tensor] = None):
366
+ image_features = self.encode_image(image, normalize=True)
367
+ text_features = self.encode_text(text, normalize=True,
368
+ concentration=concentration,
369
+ time=time,
370
+ compound_embedding=compound_embedding)
371
+ image_logits = self.logit_scale.exp() * image_features @ text_features.T
372
+ if self.logit_bias is not None:
373
+ image_logits += self.logit_bias
374
+ text_logits = image_logits.T
375
+ return image_logits, text_logits
376
+
377
+ def forward_intermediates(
378
+ self,
379
+ image: Optional[torch.Tensor] = None,
380
+ text: Optional[torch.Tensor] = None,
381
+ image_indices: Optional[Union[int, List[int]]] = None,
382
+ text_indices: Optional[Union[int, List[int]]] = None,
383
+ stop_early: bool = False,
384
+ normalize: bool = True,
385
+ normalize_intermediates: bool = False,
386
+ intermediates_only: bool = False,
387
+ image_output_fmt: str = 'NCHW',
388
+ image_output_extra_tokens: bool = False,
389
+ text_output_fmt: str = 'NLC',
390
+ text_output_extra_tokens: bool = False,
391
+ output_logits: bool = False,
392
+ output_logit_scale_bias: bool = False,
393
+ ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
394
+ """ Forward features that returns intermediates.
395
+
396
+ Args:
397
+ image: Input image tensor
398
+ text: Input text tensor
399
+ image_indices: For image tower, Take last n blocks if int, all if None, select matching indices if sequence
400
+ text_indices: Take last n blocks if int, all if None, select matching indices if sequence
401
+ stop_early: Stop iterating over blocks when last desired intermediate hit
402
+ normalize_intermediates: Apply final norm layer to all intermediates
403
+ normalize: L2 Normalize final features
404
+ intermediates_only: Only return intermediate features, do not return final features
405
+ image_output_fmt: Shape of intermediate image feature outputs
406
+ image_output_extra_tokens: Return both prefix and spatial intermediate tokens
407
+ text_output_fmt: Shape of intermediate text feature outputs (ignored for this model)
408
+ text_output_extra_tokens: Return both prefix and spatial intermediate tokens (ignored for this model)
409
+ output_logits: Include logits in output
410
+ output_logit_scale_bias: Include the logit scale bias in the output
411
+ Returns:
412
+ Dict of requested intermediate features, plus final features and logits when enabled.
413
+ """
414
+ output = {}
415
+ if intermediates_only:
416
+ # intermediates-only mode disables final feature normalization and logit output
417
+ normalize = False
418
+ output_logits = False
419
+ if output_logits:
420
+ assert image is not None and text is not None, 'Both image and text inputs are required to compute logits'
421
+
422
+ if image is not None:
423
+ image_output = self.visual.forward_intermediates(
424
+ image,
425
+ indices=image_indices,
426
+ stop_early=stop_early,
427
+ normalize_intermediates=normalize_intermediates,
428
+ intermediates_only=intermediates_only,
429
+ output_fmt=image_output_fmt,
430
+ output_extra_tokens=image_output_extra_tokens,
431
+ )
432
+ if normalize and "image_features" in image_output:
433
+ image_output["image_features"] = F.normalize(image_output["image_features"], dim=-1)
434
+ output.update(image_output)
435
+
436
+ if text is not None:
437
+ cast_dtype = self.transformer.get_cast_dtype()
438
+ x = self.token_embedding(text).to(cast_dtype) # [batch_size, n_ctx, d_model]
439
+ x = x + self.positional_embedding.to(cast_dtype)
440
+ x, intermediates = self.transformer.forward_intermediates(
441
+ x,
442
+ attn_mask=self.attn_mask,
443
+ indices=text_indices
444
+ )
445
+ if normalize_intermediates:
446
+ intermediates = [self.ln_final(xi) for xi in intermediates]
447
+
448
+ # NOTE this model doesn't support cls embed in text transformer, no need for extra intermediate tokens
449
+ output["text_intermediates"] = intermediates
450
+
451
+ if not intermediates_only:
452
+ x = self.ln_final(x) # [batch_size, n_ctx, transformer.width]
453
+ x = text_global_pool(x, text, self.text_pool_type)
454
+ if self.text_projection is not None:
455
+ if isinstance(self.text_projection, nn.Linear):
456
+ x = self.text_projection(x)
457
+ else:
458
+ x = x @ self.text_projection
459
+ if normalize:
460
+ x = F.normalize(x, dim=-1)
461
+ output["text_features"] = x
462
+
463
+ logit_scale_exp = self.logit_scale.exp() if output_logits or output_logit_scale_bias else None
464
+
465
+ if output_logits:
466
+ image_logits = logit_scale_exp * output["image_features"] @ output["text_features"].T
467
+ if self.logit_bias is not None:
468
+ image_logits += self.logit_bias
469
+ text_logits = image_logits.T
470
+ output["image_logits"] = image_logits
471
+ output["text_logits"] = text_logits
472
+
473
+ if output_logit_scale_bias:
474
+ output["logit_scale"] = logit_scale_exp
475
+ if self.logit_bias is not None:
476
+ output['logit_bias'] = self.logit_bias
477
+
478
+ return output
479
+
480
+
481
+ def forward(
482
+ self,
483
+ image: Optional[torch.Tensor] = None,
484
+ text: Optional[torch.Tensor] = None,
485
+ concentration: Optional[torch.Tensor] = None,
486
+ time: Optional[torch.Tensor] = None,
487
+ compound_embedding: Optional[torch.Tensor] = None,
488
+ ):
489
+
490
+ image_features = self.encode_image(image, normalize=True) if image is not None else None
491
+ text_features = self.encode_text(text, normalize=True, concentration=concentration, time=time, compound_embedding=compound_embedding) if text is not None else None
492
+ if self.output_dict:
493
+ out_dict = {
494
+ "image_features": image_features,
495
+ "text_features": text_features,
496
+ "logit_scale": self.logit_scale.exp()
497
+ }
498
+ if self.logit_bias is not None:
499
+ out_dict['logit_bias'] = self.logit_bias
500
+ return out_dict
501
+
502
+ if self.logit_bias is not None:
503
+ return image_features, text_features, self.logit_scale.exp(), self.logit_bias
504
+ return image_features, text_features, self.logit_scale.exp()
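+ # Usage sketch for the extended forward interface (shapes mirror trace_model further below;
+ # batch size B is arbitrary): image (B, C, H, W), text (B, context_length) token ids,
+ # concentration (B, 2), time (B, 1), compound_embedding (B, 159):
+ # out = model(image, text, concentration, time, compound_embedding)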
505
+
506
+
507
+ class CustomTextCLIP(nn.Module):
508
+ output_dict: torch.jit.Final[bool]
509
+
510
+ def __init__(
511
+ self,
512
+ embed_dim: int,
513
+ vision_cfg: CLIPVisionCfg,
514
+ text_cfg: CLIPTextCfg,
515
+ quick_gelu: bool = False,
516
+ init_logit_scale: float = np.log(1 / 0.07),
517
+ init_logit_bias: Optional[float] = None,
518
+ nonscalar_logit_scale: bool = False,
519
+ cast_dtype: Optional[torch.dtype] = None,
520
+ output_dict: bool = False,
521
+ ):
522
+ super().__init__()
523
+ self.output_dict = output_dict
524
+ self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype)
525
+ self.text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype)
526
+ self.context_length = self.text.context_length
527
+ self.vocab_size = self.text.vocab_size
528
+
529
+ lshape = [1] if nonscalar_logit_scale else []
530
+ self.logit_scale = nn.Parameter(torch.ones(lshape) * init_logit_scale)
531
+ if init_logit_bias is not None:
532
+ self.logit_bias = nn.Parameter(torch.ones(lshape) * init_logit_bias)
533
+ else:
534
+ self.logit_bias = None
535
+
536
+ def lock_image_tower(self, unlocked_groups=0, freeze_bn_stats=False):
537
+ # lock image tower as per LiT - https://arxiv.org/abs/2111.07991
538
+ self.visual.lock(unlocked_groups=unlocked_groups, freeze_bn_stats=freeze_bn_stats)
539
+
540
+ def lock_text_tower(self, unlocked_layers: int = 0, freeze_layer_norm: bool = True):
541
+ self.text.lock(unlocked_layers, freeze_layer_norm)
542
+
543
+ @torch.jit.ignore
544
+ def set_grad_checkpointing(self, enable=True):
545
+ self.visual.set_grad_checkpointing(enable)
546
+ self.text.set_grad_checkpointing(enable)
547
+
548
+ @torch.jit.ignore
549
+ def no_weight_decay(self):
550
+ # for timm optimizers, 1d params like logit_scale, logit_bias, ln/bn scale, biases are excluded by default
551
+ no_wd = set()
552
+ if hasattr(self.visual, 'no_weight_decay'):
553
+ for n in self.visual.no_weight_decay():
554
+ no_wd.add('visual.' + n)
555
+ if hasattr(self.text, 'no_weight_decay'):
556
+ for n in self.text.no_weight_decay():
557
+ no_wd.add('text.' + n)
558
+ return no_wd
559
+
560
+ def encode_image(self, image, normalize: bool = False):
561
+ features = self.visual(image)
562
+ return F.normalize(features, dim=-1) if normalize else features
563
+
564
+ def encode_text(self, text, normalize: bool = False):
565
+ features = self.text(text)
566
+ return F.normalize(features, dim=-1) if normalize else features
567
+
568
+ def get_logits(self, image, text):
569
+ image_features = self.encode_image(image, normalize=True)
570
+ text_features = self.encode_text(text, normalize=True)
571
+ image_logits = self.logit_scale.exp() * image_features @ text_features.T
572
+ if self.logit_bias is not None:
573
+ image_logits += self.logit_bias
574
+ text_logits = image_logits.T
575
+ return image_logits, text_logits
576
+
577
+ def forward_intermediates(
578
+ self,
579
+ image: Optional[torch.Tensor] = None,
580
+ text: Optional[torch.Tensor] = None,
581
+ image_indices: Optional[Union[int, List[int]]] = None,
582
+ text_indices: Optional[Union[int, List[int]]] = None,
583
+ stop_early: bool = False,
584
+ normalize: bool = True,
585
+ normalize_intermediates: bool = False,
586
+ intermediates_only: bool = False,
587
+ image_output_fmt: str = 'NCHW',
588
+ image_output_extra_tokens: bool = False,
589
+ text_output_fmt: str = 'NLC',
590
+ text_output_extra_tokens: bool = False,
591
+ output_logits: bool = False,
592
+ output_logit_scale_bias: bool = False,
593
+ ) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
594
+ """ Forward features that returns intermediates.
595
+
596
+ Args:
597
+ image: Input image tensor
598
+ text: Input text tensor
599
+ image_indices: For image tower, Take last n blocks if int, all if None, select matching indices if sequence
600
+ text_indices: Take last n blocks if int, all if None, select matching indices if sequence
601
+ stop_early: Stop iterating over blocks when last desired intermediate hit
602
+ normalize: L2 Normalize final image and text features (if present)
603
+ normalize_intermediates: Apply final encoder norm layer to all intermediates (if possible)
604
+ intermediates_only: Only return intermediate features, do not return final features
605
+ image_output_fmt: Shape of intermediate image feature outputs
606
+ image_output_extra_tokens: Return both prefix and spatial intermediate tokens
607
+ text_output_fmt: Shape of intermediate text feature outputs
608
+ text_output_extra_tokens: Return both prefix and spatial intermediate tokens
609
+ output_logits: Include logits in output
610
+ output_logit_scale_bias: Include the logit scale bias in the output
611
+ Returns:
612
+ Dict of requested intermediate features, plus final features and logits when enabled.
613
+ """
614
+ output = {}
615
+ if intermediates_only:
616
+ # intermediates-only mode disables final feature normalization and logit output
617
+ normalize = False
618
+ output_logits = False
619
+ if output_logits:
620
+ assert image is not None and text is not None, 'Both image and text inputs are required to compute logits'
621
+
622
+ if image is not None:
623
+ image_output = self.visual.forward_intermediates(
624
+ image,
625
+ indices=image_indices,
626
+ stop_early=stop_early,
627
+ normalize_intermediates=normalize_intermediates,
628
+ intermediates_only=intermediates_only,
629
+ output_fmt=image_output_fmt,
630
+ output_extra_tokens=image_output_extra_tokens,
631
+ )
632
+ if normalize and "image_features" in image_output:
633
+ image_output["image_features"] = F.normalize(image_output["image_features"], dim=-1)
634
+ output.update(image_output)
635
+
636
+ if text is not None:
637
+ text_output = self.text.forward_intermediates(
638
+ text,
639
+ indices=text_indices,
640
+ stop_early=stop_early,
641
+ normalize_intermediates=normalize_intermediates,
642
+ intermediates_only=intermediates_only,
643
+ output_fmt=text_output_fmt,
644
+ output_extra_tokens=text_output_extra_tokens,
645
+ )
646
+ if normalize and "text_features" in text_output:
647
+ text_output["text_features"] = F.normalize(text_output["text_features"], dim=-1)
648
+ output.update(text_output)
649
+
650
+ logit_scale_exp = self.logit_scale.exp() if output_logits or output_logit_scale_bias else None
651
+
652
+ if output_logits:
653
+ image_logits = logit_scale_exp * output["image_features"] @ output["text_features"].T
654
+ if self.logit_bias is not None:
655
+ image_logits += self.logit_bias
656
+ text_logits = image_logits.T
657
+ output["image_logits"] = image_logits
658
+ output["text_logits"] = text_logits
659
+
660
+ if output_logit_scale_bias:
661
+ output["logit_scale"] = logit_scale_exp
662
+ if self.logit_bias is not None:
663
+ output['logit_bias'] = self.logit_bias
664
+
665
+ return output
666
+
667
+ def forward(
668
+ self,
669
+ image: Optional[torch.Tensor] = None,
670
+ text: Optional[torch.Tensor] = None,
671
+ ):
672
+ image_features = self.encode_image(image, normalize=True) if image is not None else None
673
+ text_features = self.encode_text(text, normalize=True) if text is not None else None
674
+
675
+ if self.output_dict:
676
+ out_dict = {
677
+ "image_features": image_features,
678
+ "text_features": text_features,
679
+ "logit_scale": self.logit_scale.exp()
680
+ }
681
+ if self.logit_bias is not None:
682
+ out_dict['logit_bias'] = self.logit_bias
683
+ return out_dict
684
+
685
+ if self.logit_bias is not None:
686
+ return image_features, text_features, self.logit_scale.exp(), self.logit_bias
687
+ return image_features, text_features, self.logit_scale.exp()
688
+
689
+
690
+ def convert_weights_to_lp(model: nn.Module, dtype=torch.float16):
691
+ """Convert applicable model parameters to low-precision (bf16 or fp16)"""
692
+
693
+ def _convert_weights(l):
694
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
695
+ l.weight.data = l.weight.data.to(dtype)
696
+ if l.bias is not None:
697
+ l.bias.data = l.bias.data.to(dtype)
698
+
699
+ if isinstance(l, (nn.MultiheadAttention, Attention)):
700
+ for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
701
+ tensor = getattr(l, attr)
702
+ if tensor is not None:
703
+ tensor.data = tensor.data.to(dtype)
704
+
705
+ if isinstance(l, (CLIP, TextTransformer)):
706
+ # convert text nn.Parameter projections
707
+ attr = getattr(l, "text_projection", None)
708
+ if attr is not None:
709
+ attr.data = attr.data.to(dtype)
710
+
711
+ if isinstance(l, VisionTransformer):
712
+ # convert vision nn.Parameter projections
713
+ attr = getattr(l, "proj", None)
714
+ if attr is not None:
715
+ attr.data = attr.data.to(dtype)
716
+
717
+ model.apply(_convert_weights)
718
+
719
+
720
+ convert_weights_to_fp16 = convert_weights_to_lp # backwards compat
721
+
722
+
723
+ # used to maintain checkpoint compatibility
724
+ def convert_to_custom_text_state_dict(state_dict: dict):
725
+ if 'text_projection' in state_dict:
726
+ # old format state_dict, move text tower -> .text
727
+ new_state_dict = {}
728
+ for k, v in state_dict.items():
729
+ if any(k.startswith(p) for p in (
730
+ 'text_projection',
731
+ 'positional_embedding',
732
+ 'token_embedding',
733
+ 'transformer',
734
+ 'ln_final',
735
+ )):
736
+ k = 'text.' + k
737
+ new_state_dict[k] = v
738
+ return new_state_dict
739
+ return state_dict
740
+
741
+
742
+ def build_model_from_openai_state_dict(
743
+ state_dict: dict,
744
+ quick_gelu=True,
745
+ cast_dtype=torch.float16,
746
+ ):
747
+ vit = "visual.proj" in state_dict
748
+
749
+ if vit:
750
+ vision_width = state_dict["visual.conv1.weight"].shape[0]
751
+ vision_layers = len(
752
+ [k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
753
+ vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
754
+ grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
755
+ image_size = vision_patch_size * grid_size
756
+ else:
757
+ counts: list = [
758
+ len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
759
+ vision_layers = tuple(counts)
760
+ vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
761
+ output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
762
+ vision_patch_size = None
763
+ assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
764
+ image_size = output_width * 32
765
+
766
+ embed_dim = state_dict["text_projection"].shape[1]
767
+ context_length = state_dict["positional_embedding"].shape[0]
768
+ vocab_size = state_dict["token_embedding.weight"].shape[0]
769
+ transformer_width = state_dict["ln_final.weight"].shape[0]
770
+ transformer_heads = transformer_width // 64
771
+ transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))
772
+
773
+ vision_cfg = CLIPVisionCfg(
774
+ layers=vision_layers,
775
+ width=vision_width,
776
+ patch_size=vision_patch_size,
777
+ image_size=image_size,
778
+ )
779
+ text_cfg = CLIPTextCfg(
780
+ context_length=context_length,
781
+ vocab_size=vocab_size,
782
+ width=transformer_width,
783
+ heads=transformer_heads,
784
+ layers=transformer_layers,
785
+ )
786
+ model = CLIP(
787
+ embed_dim,
788
+ vision_cfg=vision_cfg,
789
+ text_cfg=text_cfg,
790
+ quick_gelu=quick_gelu, # OpenAI models were trained with QuickGELU
791
+ cast_dtype=cast_dtype,
792
+ )
793
+
794
+ for key in ["input_resolution", "context_length", "vocab_size"]:
795
+ state_dict.pop(key, None)
796
+ convert_weights_to_fp16(model) # OpenAI state dicts are partially converted to float16
797
+ model.load_state_dict(state_dict)
798
+ return model.eval()
799
+
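A sketch of how this builder might be driven. The path "ViT-B-16.pt" is hypothetical, and the JIT-then-plain fallback mirrors the way OpenAI distributes its CLIP weights (TorchScript archives that also load as plain state dicts):

import torch

try:
    jit_model = torch.jit.load("ViT-B-16.pt", map_location="cpu")
    state_dict = jit_model.state_dict()
except RuntimeError:
    state_dict = torch.load("ViT-B-16.pt", map_location="cpu")

model = build_model_from_openai_state_dict(state_dict, quick_gelu=True, cast_dtype=torch.float16)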
800
+
801
+ def trace_model(model, batch_size=256, device=torch.device('cpu')):
802
+ model.eval()
803
+ image_size = model.visual.image_size
804
+ example_images = torch.ones((batch_size, 2, image_size, image_size), device=device)
805
+ example_text = torch.zeros((batch_size, model.context_length), dtype=torch.int, device=device)
806
+ example_concentration = torch.rand((batch_size, 2), device=device)
807
+ example_time = torch.rand((batch_size, 1), device=device)
808
+ example_compound_embedding = torch.rand((batch_size, 159), device=device)
809
+ model = torch.jit.trace_module(
810
+ model,
811
+ inputs=dict(
812
+ forward=(example_images, example_text, example_concentration, example_time, example_compound_embedding),
813
+ encode_text=(example_text, True, example_concentration, example_time, example_compound_embedding),
814
+ encode_image=(example_images,)
815
+ ))
816
+ model.visual.image_size = image_size
817
+ return model
818
+
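Sketch only: exporting a TorchScript module for inference. This assumes the traced model accepts the extra concentration, time, and compound-embedding inputs that the example tensors above are shaped for ((B, 2), (B, 1), and (B, 159)); the output path is hypothetical.

import torch

traced = trace_model(model, batch_size=8, device=torch.device("cpu"))
torch.jit.save(traced, "clip_traced.pt")  # hypothetical output path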
819
+
820
+ def resize_pos_embed(state_dict, model, interpolation: str = 'bicubic', antialias: bool = True):
821
+ # Rescale the grid of position embeddings when loading from state_dict
822
+ old_pos_embed = state_dict.get('visual.positional_embedding', None)
823
+ if old_pos_embed is None or not hasattr(model.visual, 'grid_size'):
824
+ return
825
+ grid_size = to_2tuple(model.visual.grid_size)
826
+ extra_tokens = 1 # FIXME detect different token configs (i.e. no class token, or more)
827
+ new_seq_len = grid_size[0] * grid_size[1] + extra_tokens
828
+ if new_seq_len == old_pos_embed.shape[0]:
829
+ return
830
+
831
+ if extra_tokens:
832
+ pos_emb_tok, pos_emb_img = old_pos_embed[:extra_tokens], old_pos_embed[extra_tokens:]
833
+ else:
834
+ pos_emb_tok, pos_emb_img = None, old_pos_embed
835
+ old_grid_size = to_2tuple(int(math.sqrt(len(pos_emb_img))))
836
+
837
+ logging.info('Resizing position embedding grid-size from %s to %s', old_grid_size, grid_size)
838
+ pos_emb_img = pos_emb_img.reshape(1, old_grid_size[0], old_grid_size[1], -1).permute(0, 3, 1, 2)
839
+ pos_emb_img = F.interpolate(
840
+ pos_emb_img,
841
+ size=grid_size,
842
+ mode=interpolation,
843
+ antialias=antialias,
844
+ align_corners=False,
845
+ )
846
+ pos_emb_img = pos_emb_img.permute(0, 2, 3, 1).reshape(1, grid_size[0] * grid_size[1], -1)[0]
847
+ if pos_emb_tok is not None:
848
+ new_pos_embed = torch.cat([pos_emb_tok, pos_emb_img], dim=0)
849
+ else:
850
+ new_pos_embed = pos_emb_img
851
+ state_dict['visual.positional_embedding'] = new_pos_embed
852
+
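To make the resizing concrete, a worked example under the assumption of a ViT-B/16 checkpoint trained at 224px and loaded into a model built for 336px:

patch = 16
old_seq = (224 // patch) ** 2 + 1  # 14*14 grid + 1 class token = 197 positions in the checkpoint
new_seq = (336 // patch) ** 2 + 1  # 21*21 grid + 1 class token = 442 positions expected by the model
# resize_pos_embed() bicubically interpolates the 14x14 grid up to 21x21 and keeps the
# class-token embedding as-is, so the checkpoint stays loadable at the new resolution.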
853
+
854
+ def resize_text_pos_embed(state_dict, model, interpolation: str = 'linear', antialias: bool = False):
855
+ old_pos_embed = state_dict.get('positional_embedding', None)
856
+ if old_pos_embed is None:
857
+ return
858
+ # FIXME add support for text cls_token
859
+ model_pos_embed = getattr(model, 'positional_embedding', None)
860
+ if model_pos_embed is None:
861
+ model_pos_embed = getattr(model.text, 'positional_embedding', None)
862
+
863
+ old_num_pos = old_pos_embed.shape[0]
864
+ old_width = old_pos_embed.shape[1]
865
+ num_pos = model_pos_embed.shape[0]
866
+ width = model_pos_embed.shape[1]
867
+ assert old_width == width, 'text pos_embed width changed!'
868
+ if old_num_pos == num_pos:
869
+ return
870
+
871
+ logging.info('Resizing text position embedding num_pos from %s to %s', old_num_pos, num_pos)
872
+ old_pos_embed = old_pos_embed.reshape(1, old_num_pos, old_width).permute(0, 2, 1)
873
+ old_pos_embed = F.interpolate(
874
+ old_pos_embed,
875
+ size=num_pos,
876
+ mode=interpolation,
877
+ antialias=antialias,
878
+ align_corners=False,
879
+ )
880
+ old_pos_embed = old_pos_embed.permute(0, 2, 1)[0]
881
+ new_pos_embed = old_pos_embed
882
+
883
+ state_dict['positional_embedding'] = new_pos_embed
884
+
885
+
886
+ def get_model_preprocess_cfg(model):
887
+ module = getattr(model, 'visual', model)
888
+ preprocess_cfg = getattr(module, 'preprocess_cfg', {})
889
+ if not preprocess_cfg:
890
+ # use separate legacy attributes if preprocess_cfg dict not found
891
+ size = getattr(module, 'image_size')
892
+ if size is not None:
893
+ preprocess_cfg['size'] = size
894
+ mean = getattr(module, 'image_mean', None)
895
+ if mean is not None:
896
+ preprocess_cfg['mean'] = mean
897
+ std = getattr(module, 'image_std', None)
898
+ if std is not None:
899
+ preprocess_cfg['std'] = std
900
+ return preprocess_cfg
901
+
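The returned dict is what the package's own transform helpers normally consume. Purely to illustrate the keys ('size', 'mean', 'std'), a hand-rolled torchvision pipeline could look like this, assuming 'mean' and 'std' are present:

from torchvision import transforms

cfg = get_model_preprocess_cfg(model)  # e.g. {'size': 224, 'mean': (...), 'std': (...)}
size = cfg['size'] if isinstance(cfg['size'], int) else cfg['size'][0]
preprocess = transforms.Compose([
    transforms.Resize(size, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(size),
    transforms.ToTensor(),
    transforms.Normalize(mean=cfg['mean'], std=cfg['std']),
])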
902
+
903
+ def set_model_preprocess_cfg(model, preprocess_cfg: Dict[str, Any]):
904
+ module = getattr(model, 'visual', model)
905
+ module.image_mean = preprocess_cfg['mean'] # legacy attribute, keeping for bwd compat
906
+ module.image_std = preprocess_cfg['std'] # legacy attribute, keeping for bwd compat
907
+ module.preprocess_cfg = copy.deepcopy(preprocess_cfg) # new attr, packages the full preprocess cfg as one dict
908
+
909
+
910
+ def get_model_tokenize_cfg(model):
911
+ module = getattr(model, 'text', model)
912
+ cfg = {}
913
+ context_length = getattr(module, 'context_length', None)
914
+ if context_length is not None:
915
+ cfg['context_length'] = context_length
916
+ vocab_size = getattr(module, 'vocab_size', None)
917
+ if vocab_size is not None:
918
+ cfg['vocab_size'] = vocab_size
919
+ return cfg
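A small illustration of the tokenizer-config helper, assuming `model` was created as in the earlier sketches; the values line up with the model configs added below:

cfg = get_model_tokenize_cfg(model)
# e.g. {'context_length': 77, 'vocab_size': 49408} for the CLIP-tokenizer configs,
# or {'context_length': 64, 'vocab_size': 32000} for the SigLIP ones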
src/open_clip/model_configs/EVA01-g-14-plus.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva_giant_patch14_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 1024,
14
+ "heads": 16,
15
+ "layers": 24
16
+ },
17
+ "custom_text": true
18
+ }
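The EVA and MobileCLIP configs added here pair a timm-built vision tower (`timm_model_name`) with the package's own text transformer, so timm must be installed for them. Assuming the factory API in this repo mirrors upstream open_clip, a config can be instantiated by name, architecture only, with no pretrained weights:

import open_clip

model = open_clip.create_model("EVA01-g-14-plus", pretrained=None)
tokenizer = open_clip.get_tokenizer("EVA01-g-14-plus")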
src/open_clip/model_configs/EVA01-g-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva_giant_patch14_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ },
17
+ "custom_text": true
18
+ }
src/open_clip/model_configs/EVA02-B-16.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva02_base_patch16_clip_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 512,
14
+ "heads": 8,
15
+ "layers": 12
16
+ },
17
+ "custom_text": true
18
+ }
src/open_clip/model_configs/EVA02-E-14-plus.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva02_enormous_patch14_clip_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 1280,
14
+ "heads": 20,
15
+ "layers": 32
16
+ },
17
+ "custom_text": true
18
+ }
src/open_clip/model_configs/EVA02-E-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva02_enormous_patch14_clip_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 1024,
14
+ "heads": 16,
15
+ "layers": 24
16
+ },
17
+ "custom_text": true
18
+ }
src/open_clip/model_configs/EVA02-L-14-336.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 336,
5
+ "timm_model_name": "eva02_large_patch14_clip_336",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ },
17
+ "custom_text": true
18
+ }
src/open_clip/model_configs/EVA02-L-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva02_large_patch14_clip_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ },
17
+ "custom_text": true
18
+ }
src/open_clip/model_configs/MobileCLIP-B.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vit_base_mci_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "token",
7
+ "timm_proj": null,
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.0,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12,
18
+ "no_causal_mask": false
19
+ },
20
+ "custom_text": true
21
+ }
src/open_clip/model_configs/MobileCLIP-S1.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "fastvit_mci1",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "avg",
7
+ "timm_proj": null,
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.0,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12,
18
+ "no_causal_mask": true
19
+ },
20
+ "custom_text": true
21
+ }
src/open_clip/model_configs/MobileCLIP-S2.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "fastvit_mci2",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "avg",
7
+ "timm_proj": null,
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.0,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12,
18
+ "no_causal_mask": true
19
+ },
20
+ "custom_text": true
21
+ }
src/open_clip/model_configs/RN101-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": [
7
+ 3,
8
+ 4,
9
+ 23,
10
+ 3
11
+ ],
12
+ "width": 64,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 512,
19
+ "heads": 8,
20
+ "layers": 12
21
+ }
22
+ }
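This and the other `-quickgelu` configs that follow differ from their plain counterparts only by the `"quick_gelu": true` flag, matching the note in build_model_from_openai_state_dict above that OpenAI's released weights were trained with QuickGELU. For reference, that activation is simply (my own rendering, not code from this commit):

import torch
from torch import nn

class QuickGELU(nn.Module):
    # x * sigmoid(1.702 * x), the GELU approximation OpenAI's original CLIP was trained with
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x * torch.sigmoid(1.702 * x)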
src/open_clip/model_configs/RN101.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": [
6
+ 3,
7
+ 4,
8
+ 23,
9
+ 3
10
+ ],
11
+ "width": 64,
12
+ "patch_size": null
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 77,
16
+ "vocab_size": 49408,
17
+ "width": 512,
18
+ "heads": 8,
19
+ "layers": 12
20
+ }
21
+ }
src/open_clip/model_configs/RN50-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": [
7
+ 3,
8
+ 4,
9
+ 6,
10
+ 3
11
+ ],
12
+ "width": 64,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 512,
19
+ "heads": 8,
20
+ "layers": 12
21
+ }
22
+ }
src/open_clip/model_configs/RN50.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": [
6
+ 3,
7
+ 4,
8
+ 6,
9
+ 3
10
+ ],
11
+ "width": 64,
12
+ "patch_size": null
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 77,
16
+ "vocab_size": 49408,
17
+ "width": 512,
18
+ "heads": 8,
19
+ "layers": 12
20
+ }
21
+ }
src/open_clip/model_configs/RN50x16-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 384,
6
+ "layers": [
7
+ 6,
8
+ 8,
9
+ 18,
10
+ 8
11
+ ],
12
+ "width": 96,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 768,
19
+ "heads": 12,
20
+ "layers": 12
21
+ }
22
+ }
src/open_clip/model_configs/RN50x16.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 384,
5
+ "layers": [
6
+ 6,
7
+ 8,
8
+ 18,
9
+ 8
10
+ ],
11
+ "width": 96,
12
+ "patch_size": null
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 77,
16
+ "vocab_size": 49408,
17
+ "width": 768,
18
+ "heads": 12,
19
+ "layers": 12
20
+ }
21
+ }
src/open_clip/model_configs/RN50x4-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 640,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 288,
6
+ "layers": [
7
+ 4,
8
+ 6,
9
+ 10,
10
+ 6
11
+ ],
12
+ "width": 80,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 640,
19
+ "heads": 10,
20
+ "layers": 12
21
+ }
22
+ }
src/open_clip/model_configs/RN50x4.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 640,
3
+ "vision_cfg": {
4
+ "image_size": 288,
5
+ "layers": [
6
+ 4,
7
+ 6,
8
+ 10,
9
+ 6
10
+ ],
11
+ "width": 80,
12
+ "patch_size": null
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 77,
16
+ "vocab_size": 49408,
17
+ "width": 640,
18
+ "heads": 10,
19
+ "layers": 12
20
+ }
21
+ }
src/open_clip/model_configs/RN50x64-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 448,
6
+ "layers": [
7
+ 3,
8
+ 15,
9
+ 36,
10
+ 10
11
+ ],
12
+ "width": 128,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 1024,
19
+ "heads": 16,
20
+ "layers": 12
21
+ }
22
+ }
src/open_clip/model_configs/RN50x64.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 448,
5
+ "layers": [
6
+ 3,
7
+ 15,
8
+ 36,
9
+ 10
10
+ ],
11
+ "width": 128,
12
+ "patch_size": null
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 77,
16
+ "vocab_size": 49408,
17
+ "width": 1024,
18
+ "heads": 16,
19
+ "layers": 12
20
+ }
21
+ }
src/open_clip/model_configs/ViT-B-16-SigLIP-256.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 256,
7
+ "timm_model_name": "vit_base_patch16_siglip_256",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
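The SigLIP configs initialize `logit_bias` at -10 and disable the causal text mask; under a pairwise sigmoid objective, a large negative bias keeps the initial loss small because most image-text pairs in a batch are negatives. A minimal sketch of how such logits become pairwise probabilities (my own illustration, not part of this commit):

import torch
import torch.nn.functional as F

def siglip_pair_probs(img, txt, logit_scale, logit_bias):
    """Pairwise matching probabilities under a SigLIP-style sigmoid objective."""
    img = F.normalize(img, dim=-1)
    txt = F.normalize(txt, dim=-1)
    logits = logit_scale * img @ txt.T + logit_bias
    return torch.sigmoid(logits)  # each image-text pair scored independently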
src/open_clip/model_configs/ViT-B-16-SigLIP-384.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 384,
7
+ "timm_model_name": "vit_base_patch16_siglip_384",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
src/open_clip/model_configs/ViT-B-16-SigLIP-512.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 512,
7
+ "timm_model_name": "vit_base_patch16_siglip_512",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 256,
7
+ "timm_model_name": "vit_base_patch16_siglip_256",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 250000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP-i18n-256",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
src/open_clip/model_configs/ViT-B-16-SigLIP.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 224,
7
+ "timm_model_name": "vit_base_patch16_siglip_224",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
src/open_clip/model_configs/ViT-B-16-SigLIP2-256.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 256,
7
+ "timm_model_name": "vit_base_patch16_siglip_256",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 256000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP2-256",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ },
28
+ "act_kwargs": {
29
+ "approximate": "tanh"
30
+ }
31
+ }
32
+ }
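Relative to the SigLIP configs above, the SigLIP2 configs swap in a 256000-token multilingual vocabulary/tokenizer and add an `act_kwargs` block; presumably the latter selects PyTorch's tanh-approximate GELU:

from torch import nn

gelu_tanh = nn.GELU(approximate="tanh")  # what {"approximate": "tanh"} presumably maps to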
src/open_clip/model_configs/ViT-B-16-SigLIP2-384.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 384,
7
+ "timm_model_name": "vit_base_patch16_siglip_384",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 256000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP2-384",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ },
28
+ "act_kwargs": {
29
+ "approximate": "tanh"
30
+ }
31
+ }
32
+ }
src/open_clip/model_configs/ViT-B-16-SigLIP2-512.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 512,
7
+ "timm_model_name": "vit_base_patch16_siglip_512",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 256000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP2-512",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ },
28
+ "act_kwargs": {
29
+ "approximate": "tanh"
30
+ }
31
+ }
32
+ }