oKen38461 committed
Commit ac7cda5 · 1 Parent(s): d291c0c

Add files based on the initial commit

This view is limited to 50 files because the commit contains too many changes.

Files changed (50):
  1. .gitignore +43 -0
  2. LICENSE +201 -0
  3. README_ditto-talkinghead.md +232 -0
  4. core/atomic_components/audio2motion.py +196 -0
  5. core/atomic_components/avatar_registrar.py +102 -0
  6. core/atomic_components/cfg.py +111 -0
  7. core/atomic_components/condition_handler.py +168 -0
  8. core/atomic_components/decode_f3d.py +22 -0
  9. core/atomic_components/loader.py +133 -0
  10. core/atomic_components/motion_stitch.py +491 -0
  11. core/atomic_components/putback.py +60 -0
  12. core/atomic_components/source2info.py +155 -0
  13. core/atomic_components/warp_f3d.py +22 -0
  14. core/atomic_components/wav2feat.py +110 -0
  15. core/atomic_components/writer.py +36 -0
  16. core/aux_models/blaze_face.py +351 -0
  17. core/aux_models/face_mesh.py +101 -0
  18. core/aux_models/hubert_stream.py +29 -0
  19. core/aux_models/insightface_det.py +245 -0
  20. core/aux_models/insightface_landmark106.py +100 -0
  21. core/aux_models/landmark203.py +58 -0
  22. core/aux_models/mediapipe_landmark478.py +118 -0
  23. core/aux_models/modules/__init__.py +5 -0
  24. core/aux_models/modules/hubert_stream.py +21 -0
  25. core/aux_models/modules/landmark106.py +83 -0
  26. core/aux_models/modules/landmark203.py +42 -0
  27. core/aux_models/modules/landmark478.py +35 -0
  28. core/aux_models/modules/retinaface.py +215 -0
  29. core/models/appearance_extractor.py +29 -0
  30. core/models/decoder.py +30 -0
  31. core/models/lmdm.py +140 -0
  32. core/models/modules/LMDM.py +154 -0
  33. core/models/modules/__init__.py +6 -0
  34. core/models/modules/appearance_feature_extractor.py +74 -0
  35. core/models/modules/convnextv2.py +150 -0
  36. core/models/modules/dense_motion.py +104 -0
  37. core/models/modules/lmdm_modules/model.py +398 -0
  38. core/models/modules/lmdm_modules/rotary_embedding_torch.py +132 -0
  39. core/models/modules/lmdm_modules/utils.py +96 -0
  40. core/models/modules/motion_extractor.py +25 -0
  41. core/models/modules/spade_generator.py +87 -0
  42. core/models/modules/stitching_network.py +65 -0
  43. core/models/modules/util.py +452 -0
  44. core/models/modules/warping_network.py +87 -0
  45. core/models/motion_extractor.py +49 -0
  46. core/models/stitch_network.py +30 -0
  47. core/models/warp_network.py +35 -0
  48. core/utils/blend/__init__.py +4 -0
  49. core/utils/blend/blend.pyx +38 -0
  50. core/utils/blend/blend.pyxbld +11 -0
.gitignore ADDED
@@ -0,0 +1,43 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*__pycache__
**/__pycache__/
*.py[cod]
**/*.py[cod]
*$py.class

# Model weights
checkpoints
**/*.pth
**/*.onnx
**/*.pt
**/*.pth.tar

.idea
.vscode
.DS_Store
*.DS_Store

*.swp
tmp*

*build
*.egg-info/
*.mp4

log/*
*.mp4
*.png
*.jpg
*.wav
*.pth
*.pyc
*.jpeg

# Folders to ignore
example/
ToDo/

!example/audio.wav
!example/image.png

LICENSE ADDED
@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
README_ditto-talkinghead.md ADDED
@@ -0,0 +1,232 @@
<h2 align='center'>Ditto: Motion-Space Diffusion for Controllable Realtime Talking Head Synthesis</h2>

<div align='center'>
<a href=""><strong>Tianqi Li</strong></a>
·
<a href=""><strong>Ruobing Zheng</strong></a><sup>†</sup>
·
<a href=""><strong>Minghui Yang</strong></a>
·
<a href=""><strong>Jingdong Chen</strong></a>
·
<a href=""><strong>Ming Yang</strong></a>
</div>
<div align='center'>
Ant Group
</div>
<br>
<div align='center'>
<a href='https://arxiv.org/abs/2411.19509'><img src='https://img.shields.io/badge/Paper-arXiv-red'></a>
<a href='https://digital-avatar.github.io/ai/Ditto/'><img src='https://img.shields.io/badge/Project-Page-blue'></a>
<a href='https://huggingface.co/digital-avatar/ditto-talkinghead'><img src='https://img.shields.io/badge/Model-HuggingFace-yellow'></a>
<a href='https://github.com/antgroup/ditto-talkinghead'><img src='https://img.shields.io/badge/Code-GitHub-purple'></a>
<!-- <a href='https://github.com/antgroup/ditto-talkinghead'><img src='https://img.shields.io/github/stars/antgroup/ditto-talkinghead?style=social'></a> -->
<a href='https://colab.research.google.com/drive/19SUi1TiO32IS-Crmsu9wrkNspWE8tFbs?usp=sharing'><img src='https://img.shields.io/badge/Demo-Colab-orange'></a>
</div>
<br>
<div align="center">
<video style="width: 95%; object-fit: cover;" controls loop src="https://github.com/user-attachments/assets/ef1a0b08-bff3-4997-a6dd-62a7f51cdb40" muted="false"></video>
<p>
✨ For more results, visit our <a href="https://digital-avatar.github.io/ai/Ditto/"><strong>Project Page</strong></a> ✨
</p>
</div>


## 📌 Updates
* [2025.07.11] 🔥 The [PyTorch model](#-pytorch-model) is now available.
* [2025.07.07] 🔥 Ditto has been accepted to ACM MM 2025.
* [2025.01.21] 🔥 We updated the [Colab](https://colab.research.google.com/drive/19SUi1TiO32IS-Crmsu9wrkNspWE8tFbs?usp=sharing) demo; welcome to try it.
* [2025.01.10] 🔥 We released our inference [code](https://github.com/antgroup/ditto-talkinghead) and [models](https://huggingface.co/digital-avatar/ditto-talkinghead).
* [2024.11.29] 🔥 Our [paper](https://arxiv.org/abs/2411.19509) is now public on arXiv.



## 🛠️ Installation

Tested environment:
- System: CentOS 7.2
- GPU: A100
- Python: 3.10
- TensorRT: 8.6.1


Clone the code from [GitHub](https://github.com/antgroup/ditto-talkinghead):
```bash
git clone https://github.com/antgroup/ditto-talkinghead
cd ditto-talkinghead
```

### Conda
Create the `conda` environment:
```bash
conda env create -f environment.yaml
conda activate ditto
```

### Pip
If you have problems creating the conda environment, you can also refer to our [Colab](https://colab.research.google.com/drive/19SUi1TiO32IS-Crmsu9wrkNspWE8tFbs?usp=sharing) notebook.
After correctly installing `pytorch`, `cuda` and `cudnn`, you only need to install a few packages with pip:
```bash
pip install \
    tensorrt==8.6.1 \
    librosa \
    tqdm \
    filetype \
    imageio \
    opencv_python_headless \
    scikit-image \
    cython \
    cuda-python \
    imageio-ffmpeg \
    colored \
    polygraphy \
    numpy==2.0.1
```

If you don't use `conda`, you may also need to install `ffmpeg` according to the [official website](https://www.ffmpeg.org/download.html).

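For example, on a Debian/Ubuntu system it can usually be installed via the system package manager (this is only an illustrative sketch; other platforms should use their own package manager or a static build):

```bash
# Debian/Ubuntu example; adapt to your platform if needed
sudo apt-get update && sudo apt-get install -y ffmpeg

# Verify the installation
ffmpeg -version
```
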
## 📥 Download Checkpoints

Download the checkpoints from [HuggingFace](https://huggingface.co/digital-avatar/ditto-talkinghead) and put them in the `checkpoints` dir:
```bash
git lfs install
git clone https://huggingface.co/digital-avatar/ditto-talkinghead checkpoints
```

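Alternatively, if `git lfs` is not available, the same repository can be fetched with the `huggingface_hub` command-line tool (assuming it is installed, e.g. via `pip install -U huggingface_hub`):

```bash
huggingface-cli download digital-avatar/ditto-talkinghead --local-dir checkpoints
```
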
The `checkpoints` directory should look like this:
```text
./checkpoints/
├── ditto_cfg
│   ├── v0.4_hubert_cfg_trt.pkl
│   └── v0.4_hubert_cfg_trt_online.pkl
├── ditto_onnx
│   ├── appearance_extractor.onnx
│   ├── blaze_face.onnx
│   ├── decoder.onnx
│   ├── face_mesh.onnx
│   ├── hubert.onnx
│   ├── insightface_det.onnx
│   ├── landmark106.onnx
│   ├── landmark203.onnx
│   ├── libgrid_sample_3d_plugin.so
│   ├── lmdm_v0.4_hubert.onnx
│   ├── motion_extractor.onnx
│   ├── stitch_network.onnx
│   └── warp_network.onnx
└── ditto_trt_Ampere_Plus
    ├── appearance_extractor_fp16.engine
    ├── blaze_face_fp16.engine
    ├── decoder_fp16.engine
    ├── face_mesh_fp16.engine
    ├── hubert_fp32.engine
    ├── insightface_det_fp16.engine
    ├── landmark106_fp16.engine
    ├── landmark203_fp16.engine
    ├── lmdm_v0.4_hubert_fp32.engine
    ├── motion_extractor_fp32.engine
    ├── stitch_network_fp16.engine
    └── warp_network_fp16.engine
```

- `ditto_cfg/v0.4_hubert_cfg_trt_online.pkl` is the online (streaming) config
- `ditto_cfg/v0.4_hubert_cfg_trt.pkl` is the offline config


## 🚀 Inference

Run `inference.py`:

```shell
python inference.py \
    --data_root "<path-to-trt-model>" \
    --cfg_pkl "<path-to-cfg-pkl>" \
    --audio_path "<path-to-input-audio>" \
    --source_path "<path-to-input-image>" \
    --output_path "<path-to-output-mp4>"
```

For example:

```shell
python inference.py \
    --data_root "./checkpoints/ditto_trt_Ampere_Plus" \
    --cfg_pkl "./checkpoints/ditto_cfg/v0.4_hubert_cfg_trt.pkl" \
    --audio_path "./example/audio.wav" \
    --source_path "./example/image.png" \
    --output_path "./tmp/result.mp4"
```

❗Note:

We provide TensorRT engines built with `hardware-compatibility-level=Ampere_Plus` (`checkpoints/ditto_trt_Ampere_Plus/`). If your GPU does not support them, run the `cvt_onnx_to_trt.py` script to convert the general ONNX models (`checkpoints/ditto_onnx/`) into TensorRT engines for your GPU.

```bash
python scripts/cvt_onnx_to_trt.py --onnx_dir "./checkpoints/ditto_onnx" --trt_dir "./checkpoints/ditto_trt_custom"
```

Then run `inference.py` with `--data_root=./checkpoints/ditto_trt_custom`.

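For example, reusing the example inputs from above:

```shell
python inference.py \
    --data_root "./checkpoints/ditto_trt_custom" \
    --cfg_pkl "./checkpoints/ditto_cfg/v0.4_hubert_cfg_trt.pkl" \
    --audio_path "./example/audio.wav" \
    --source_path "./example/image.png" \
    --output_path "./tmp/result.mp4"
```
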
## ⚡ PyTorch Model
*Based on community interest and to better support further development, we are now open-sourcing the PyTorch version of the model.*


We have added the PyTorch model and the corresponding configuration files to the [HuggingFace](https://huggingface.co/digital-avatar/ditto-talkinghead) repository. Please refer to [Download Checkpoints](#-download-checkpoints) to prepare the model files.

The `checkpoints` directory should then look like this:
```text
./checkpoints/
├── ditto_cfg
│   ├── ...
│   └── v0.4_hubert_cfg_pytorch.pkl
├── ...
└── ditto_pytorch
    ├── aux_models
    │   ├── 2d106det.onnx
    │   ├── det_10g.onnx
    │   ├── face_landmarker.task
    │   ├── hubert_streaming_fix_kv.onnx
    │   └── landmark203.onnx
    └── models
        ├── appearance_extractor.pth
        ├── decoder.pth
        ├── lmdm_v0.4_hubert.pth
        ├── motion_extractor.pth
        ├── stitch_network.pth
        └── warp_network.pth
```

To run inference, execute the following command:

```shell
python inference.py \
    --data_root "./checkpoints/ditto_pytorch" \
    --cfg_pkl "./checkpoints/ditto_cfg/v0.4_hubert_cfg_pytorch.pkl" \
    --audio_path "./example/audio.wav" \
    --source_path "./example/image.png" \
    --output_path "./tmp/result.mp4"
```


## 📧 Acknowledgement
Our implementation is based on [S2G-MDDiffusion](https://github.com/thuhcsi/S2G-MDDiffusion) and [LivePortrait](https://github.com/KwaiVGI/LivePortrait). Thanks for their remarkable contributions and released code! If we have missed any open-source projects or related articles, we will add them to this acknowledgement promptly.

## ⚖️ License
This repository is released under the Apache-2.0 license as found in the [LICENSE](LICENSE) file.

## 📚 Citation
If you find this codebase useful for your research, please use the following entry.
```BibTeX
@article{li2024ditto,
    title={Ditto: Motion-Space Diffusion for Controllable Realtime Talking Head Synthesis},
    author={Li, Tianqi and Zheng, Ruobing and Yang, Minghui and Chen, Jingdong and Yang, Ming},
    journal={arXiv preprint arXiv:2411.19509},
    year={2024}
}
```


## 🌟 Star History

[![Star History Chart](https://api.star-history.com/svg?repos=antgroup/ditto-talkinghead&type=Date)](https://www.star-history.com/#antgroup/ditto-talkinghead&Date)
core/atomic_components/audio2motion.py ADDED
@@ -0,0 +1,196 @@
import numpy as np
from ..models.lmdm import LMDM


"""
lmdm_cfg = {
    "model_path": "",
    "device": "cuda",
    "motion_feat_dim": 265,
    "audio_feat_dim": 1024+35,
    "seq_frames": 80,
}
"""


def _cvt_LP_motion_info(inp, mode, ignore_keys=()):
    ks_shape_map = [
        ['scale', (1, 1), 1],
        ['pitch', (1, 66), 66],
        ['yaw', (1, 66), 66],
        ['roll', (1, 66), 66],
        ['t', (1, 3), 3],
        ['exp', (1, 63), 63],
        ['kp', (1, 63), 63],
    ]

    def _dic2arr(_dic):
        arr = []
        for k, _, ds in ks_shape_map:
            if k not in _dic or k in ignore_keys:
                continue
            v = _dic[k].reshape(ds)
            if k == 'scale':
                v = v - 1
            arr.append(v)
        arr = np.concatenate(arr, -1)  # (133)
        return arr

    def _arr2dic(_arr):
        dic = {}
        s = 0
        for k, ds, ss in ks_shape_map:
            if k in ignore_keys:
                continue
            v = _arr[s:s + ss].reshape(ds)
            if k == 'scale':
                v = v + 1
            dic[k] = v
            s += ss
            if s >= len(_arr):
                break
        return dic

    if mode == 'dic2arr':
        assert isinstance(inp, dict)
        return _dic2arr(inp)  # (dim)
    elif mode == 'arr2dic':
        assert inp.shape[0] >= 265, f"{inp.shape}"
        return _arr2dic(inp)  # {k: (1, dim)}
    else:
        raise ValueError()

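# Note: with ignore_keys={'kp'} (as used in Audio2Motion.setup below), the packed motion
# vector has 1 (scale) + 66 (pitch) + 66 (yaw) + 66 (roll) + 3 (t) + 63 (exp) = 265 dims,
# matching the "motion_feat_dim": 265 entry in the lmdm_cfg example at the top of this file.
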
class Audio2Motion:
    def __init__(
        self,
        lmdm_cfg,
    ):
        self.lmdm = LMDM(**lmdm_cfg)

    def setup(
        self,
        x_s_info,
        overlap_v2=10,
        fix_kp_cond=0,
        fix_kp_cond_dim=None,
        sampling_timesteps=50,
        online_mode=False,
        v_min_max_for_clip=None,
        smo_k_d=3,
    ):
        self.smo_k_d = smo_k_d
        self.overlap_v2 = overlap_v2
        self.seq_frames = self.lmdm.seq_frames
        self.valid_clip_len = self.seq_frames - self.overlap_v2

        # for fuse
        self.online_mode = online_mode
        if self.online_mode:
            self.fuse_length = min(self.overlap_v2, self.valid_clip_len)
        else:
            self.fuse_length = self.overlap_v2
        self.fuse_alpha = np.arange(self.fuse_length, dtype=np.float32).reshape(1, -1, 1) / self.fuse_length

        self.fix_kp_cond = fix_kp_cond
        self.fix_kp_cond_dim = fix_kp_cond_dim
        self.sampling_timesteps = sampling_timesteps

        self.v_min_max_for_clip = v_min_max_for_clip
        if self.v_min_max_for_clip is not None:
            self.v_min = self.v_min_max_for_clip[0][None]  # [dim, 1]
            self.v_max = self.v_min_max_for_clip[1][None]

        kp_source = _cvt_LP_motion_info(x_s_info, mode='dic2arr', ignore_keys={'kp'})[None]
        self.s_kp_cond = kp_source.copy().reshape(1, -1)
        self.kp_cond = self.s_kp_cond.copy()

        self.lmdm.setup(sampling_timesteps)

        self.clip_idx = 0

    def _fuse(self, res_kp_seq, pred_kp_seq):
        ## ========================
        ## offline fuse mode
        ## last clip: -------
        ## fuse part: *****
        ## curr clip: -------
        ## output: ^^
        #
        ## online fuse mode
        ## last clip: -------
        ## fuse part: **
        ## curr clip: -------
        ## output: ^^
        ## ========================

        fuse_r1_s = res_kp_seq.shape[1] - self.fuse_length
        fuse_r1_e = res_kp_seq.shape[1]
        fuse_r2_s = self.seq_frames - self.valid_clip_len - self.fuse_length
        fuse_r2_e = self.seq_frames - self.valid_clip_len

        r1 = res_kp_seq[:, fuse_r1_s:fuse_r1_e]  # [1, fuse_len, dim]
        r2 = pred_kp_seq[:, fuse_r2_s: fuse_r2_e]  # [1, fuse_len, dim]
        r_fuse = r1 * (1 - self.fuse_alpha) + r2 * self.fuse_alpha

        res_kp_seq[:, fuse_r1_s:fuse_r1_e] = r_fuse  # fuse last
        res_kp_seq = np.concatenate([res_kp_seq, pred_kp_seq[:, fuse_r2_e:]], 1)  # len(res_kp_seq) + valid_clip_len

        return res_kp_seq

    def _update_kp_cond(self, res_kp_seq, idx):
        if self.fix_kp_cond == 0:  # do not reset
            self.kp_cond = res_kp_seq[:, idx-1]
        elif self.fix_kp_cond > 0:
            if self.clip_idx % self.fix_kp_cond == 0:  # reset
                self.kp_cond = self.s_kp_cond.copy()  # reset all dims
                if self.fix_kp_cond_dim is not None:
                    ds, de = self.fix_kp_cond_dim
                    self.kp_cond[:, ds:de] = res_kp_seq[:, idx-1, ds:de]
            else:
                self.kp_cond = res_kp_seq[:, idx-1]

    def _smo(self, res_kp_seq, s, e):
        if self.smo_k_d <= 1:
            return res_kp_seq
        new_res_kp_seq = res_kp_seq.copy()
        n = res_kp_seq.shape[1]
        half_k = self.smo_k_d // 2
        for i in range(s, e):
            ss = max(0, i - half_k)
            ee = min(n, i + half_k + 1)
            res_kp_seq[:, i, :202] = np.mean(new_res_kp_seq[:, ss:ee, :202], axis=1)
        return res_kp_seq

    def __call__(self, aud_cond, res_kp_seq=None):
        """
        aud_cond: (1, seq_frames, dim)
        """

        pred_kp_seq = self.lmdm(self.kp_cond, aud_cond, self.sampling_timesteps)
        if res_kp_seq is None:
            res_kp_seq = pred_kp_seq  # [1, seq_frames, dim]
            res_kp_seq = self._smo(res_kp_seq, 0, res_kp_seq.shape[1])
        else:
            res_kp_seq = self._fuse(res_kp_seq, pred_kp_seq)  # len(res_kp_seq) + valid_clip_len
            res_kp_seq = self._smo(res_kp_seq, res_kp_seq.shape[1] - self.valid_clip_len - self.fuse_length, res_kp_seq.shape[1] - self.valid_clip_len + 1)

        self.clip_idx += 1

        idx = res_kp_seq.shape[1] - self.overlap_v2
        self._update_kp_cond(res_kp_seq, idx)

        return res_kp_seq

    def cvt_fmt(self, res_kp_seq):
        # res_kp_seq: [1, n, dim]
        if self.v_min_max_for_clip is not None:
            tmp_res_kp_seq = np.clip(res_kp_seq[0], self.v_min, self.v_max)
        else:
            tmp_res_kp_seq = res_kp_seq[0]

        x_d_info_list = []
        for i in range(tmp_res_kp_seq.shape[0]):
            x_d_info = _cvt_LP_motion_info(tmp_res_kp_seq[i], 'arr2dic')  # {k: (1, dim)}
            x_d_info_list.append(x_d_info)
        return x_d_info_list
core/atomic_components/avatar_registrar.py ADDED
@@ -0,0 +1,102 @@
import numpy as np

from .loader import load_source_frames
from .source2info import Source2Info


def _mean_filter(arr, k):
    n = arr.shape[0]
    half_k = k // 2
    res = []
    for i in range(n):
        s = max(0, i - half_k)
        e = min(n, i + half_k + 1)
        res.append(arr[s:e].mean(0))
    res = np.stack(res, 0)
    return res


def smooth_x_s_info_lst(x_s_info_list, ignore_keys=(), smo_k=13):
    keys = x_s_info_list[0].keys()
    N = len(x_s_info_list)
    smo_dict = {}
    for k in keys:
        _lst = [x_s_info_list[i][k] for i in range(N)]
        if k not in ignore_keys:
            _lst = np.stack(_lst, 0)
            _smo_lst = _mean_filter(_lst, smo_k)
        else:
            _smo_lst = _lst
        smo_dict[k] = _smo_lst

    smo_res = []
    for i in range(N):
        x_s_info = {k: smo_dict[k][i] for k in keys}
        smo_res.append(x_s_info)
    return smo_res

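# Note: smooth_x_s_info_lst applies _mean_filter, a centered moving average (window smo_k,
# truncated at the sequence boundaries), to every per-frame motion field except ignore_keys.
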
class AvatarRegistrar:
    """
    source image|video -> rgb_list -> source_info
    """
    def __init__(
        self,
        insightface_det_cfg,
        landmark106_cfg,
        landmark203_cfg,
        landmark478_cfg,
        appearance_extractor_cfg,
        motion_extractor_cfg,
    ):
        self.source2info = Source2Info(
            insightface_det_cfg,
            landmark106_cfg,
            landmark203_cfg,
            landmark478_cfg,
            appearance_extractor_cfg,
            motion_extractor_cfg,
        )

    def register(
        self,
        source_path,  # image | video
        max_dim=1920,
        n_frames=-1,
        **kwargs,
    ):
        """
        kwargs:
            crop_scale: 2.3
            crop_vx_ratio: 0
            crop_vy_ratio: -0.125
            crop_flag_do_rot: True
        """
        rgb_list, is_image_flag = load_source_frames(source_path, max_dim=max_dim, n_frames=n_frames)
        source_info = {
            "x_s_info_lst": [],
            "f_s_lst": [],
            "M_c2o_lst": [],
            "eye_open_lst": [],
            "eye_ball_lst": [],
        }
        keys = ["x_s_info", "f_s", "M_c2o", "eye_open", "eye_ball"]
        last_lmk = None
        for rgb in rgb_list:
            info = self.source2info(rgb, last_lmk, **kwargs)
            for k in keys:
                source_info[f"{k}_lst"].append(info[k])

            last_lmk = info["lmk203"]

        sc_f0 = source_info['x_s_info_lst'][0]['kp'].flatten()

        source_info["sc"] = sc_f0
        source_info["is_image_flag"] = is_image_flag
        source_info["img_rgb_lst"] = rgb_list

        return source_info

    def __call__(self, *args, **kwargs):
        return self.register(*args, **kwargs)

core/atomic_components/cfg.py ADDED
@@ -0,0 +1,111 @@
import os
import pickle
import numpy as np


def load_pkl(pkl):
    with open(pkl, "rb") as f:
        return pickle.load(f)


def parse_cfg(cfg_pkl, data_root, replace_cfg=None):

    def _check_path(p):
        if os.path.isfile(p):
            return p
        else:
            return os.path.join(data_root, p)

    cfg = load_pkl(cfg_pkl)

    # ---
    # replace cfg for debug
    if isinstance(replace_cfg, dict):
        for k, v in replace_cfg.items():
            if not isinstance(v, dict):
                continue
            for kk, vv in v.items():
                cfg[k][kk] = vv
    # ---

    base_cfg = cfg["base_cfg"]
    audio2motion_cfg = cfg["audio2motion_cfg"]
    default_kwargs = cfg["default_kwargs"]

    for k in base_cfg:
        if k == "landmark478_cfg":
            for kk in ["task_path", "blaze_face_model_path", "face_mesh_model_path"]:
                if kk in base_cfg[k] and base_cfg[k][kk]:
                    base_cfg[k][kk] = _check_path(base_cfg[k][kk])
        else:
            base_cfg[k]["model_path"] = _check_path(base_cfg[k]["model_path"])

    audio2motion_cfg["model_path"] = _check_path(audio2motion_cfg["model_path"])

    avatar_registrar_cfg = {
        k: base_cfg[k]
        for k in [
            "insightface_det_cfg",
            "landmark106_cfg",
            "landmark203_cfg",
            "landmark478_cfg",
            "appearance_extractor_cfg",
            "motion_extractor_cfg",
        ]
    }

    stitch_network_cfg = base_cfg["stitch_network_cfg"]
    warp_network_cfg = base_cfg["warp_network_cfg"]
    decoder_cfg = base_cfg["decoder_cfg"]

    condition_handler_cfg = {
        k: audio2motion_cfg[k]
        for k in [
            "use_emo",
            "use_sc",
            "use_eye_open",
            "use_eye_ball",
            "seq_frames",
        ]
    }

    lmdm_cfg = {
        k: audio2motion_cfg[k]
        for k in [
            "model_path",
            "device",
            "motion_feat_dim",
            "audio_feat_dim",
            "seq_frames",
        ]
    }

    w2f_type = audio2motion_cfg["w2f_type"]
    wav2feat_cfg = {
        "w2f_cfg": base_cfg["hubert_cfg"] if w2f_type == "hubert" else base_cfg["wavlm_cfg"],
        "w2f_type": w2f_type,
    }

    return [
        avatar_registrar_cfg,
        condition_handler_cfg,
        lmdm_cfg,
        stitch_network_cfg,
        warp_network_cfg,
        decoder_cfg,
        wav2feat_cfg,
        default_kwargs,
    ]

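# Illustrative usage (the paths are examples taken from the README, not values hard-coded here):
#
#   (avatar_registrar_cfg, condition_handler_cfg, lmdm_cfg, stitch_network_cfg,
#    warp_network_cfg, decoder_cfg, wav2feat_cfg, default_kwargs) = parse_cfg(
#       cfg_pkl="./checkpoints/ditto_cfg/v0.4_hubert_cfg_trt.pkl",
#       data_root="./checkpoints/ditto_trt_Ampere_Plus",
#   )
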
def print_cfg(**kwargs):
    for k, v in kwargs.items():
        if k == "ch_info":
            print(k, type(v))
        elif k == "ctrl_info":
            print(k, type(v), len(v))
        else:
            if isinstance(v, np.ndarray):
                print(k, type(v), v.shape)
            else:
                print(k, type(v), v)
core/atomic_components/condition_handler.py ADDED
@@ -0,0 +1,168 @@
import numpy as np
from scipy.special import softmax
import copy


def _get_emo_avg(idx=6):
    emo_avg = np.zeros(8, dtype=np.float32)
    if isinstance(idx, (list, tuple)):
        for i in idx:
            emo_avg[i] = 8
    else:
        emo_avg[idx] = 8
    emo_avg = softmax(emo_avg)
    #emo_avg = None
    # 'Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise', 'Contempt'
    return emo_avg


def _mirror_index(index, size):
    turn = index // size
    res = index % size
    if turn % 2 == 0:
        return res
    else:
        return size - res - 1

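# Note: _mirror_index ping-pongs over a sequence instead of wrapping around; e.g. with
# size=3, indices 0,1,2,3,4,5,6,... map to 0,1,2,2,1,0,0,... It is used below to loop the
# per-frame eye references of a source video back and forth.
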
class ConditionHandler:
    """
    aud_feat, emo_seq, eye_seq, sc_seq -> cond_seq
    """
    def __init__(
        self,
        use_emo=True,
        use_sc=True,
        use_eye_open=True,
        use_eye_ball=True,
        seq_frames=80,
    ):
        self.use_emo = use_emo
        self.use_sc = use_sc
        self.use_eye_open = use_eye_open
        self.use_eye_ball = use_eye_ball

        self.seq_frames = seq_frames

    def setup(self, setup_info, emo, eye_f0_mode=False, ch_info=None):
        """
        emo: int | [int] | [[int]] | numpy
        """
        if ch_info is None:
            source_info = copy.deepcopy(setup_info)
        else:
            source_info = ch_info

        self.eye_f0_mode = eye_f0_mode
        self.x_s_info_0 = source_info['x_s_info_lst'][0]

        if self.use_sc:
            self.sc = source_info["sc"]  # 63
            self.sc_seq = np.stack([self.sc] * self.seq_frames, 0)

        if self.use_eye_open:
            self.eye_open_lst = np.concatenate(source_info["eye_open_lst"], 0)  # [n, 2]
            self.num_eye_open = len(self.eye_open_lst)
            if self.num_eye_open == 1 or self.eye_f0_mode:
                self.eye_open_seq = np.stack([self.eye_open_lst[0]] * self.seq_frames, 0)
            else:
                self.eye_open_seq = None

        if self.use_eye_ball:
            self.eye_ball_lst = np.concatenate(source_info["eye_ball_lst"], 0)  # [n, 6]
            self.num_eye_ball = len(self.eye_ball_lst)
            if self.num_eye_ball == 1 or self.eye_f0_mode:
                self.eye_ball_seq = np.stack([self.eye_ball_lst[0]] * self.seq_frames, 0)
            else:
                self.eye_ball_seq = None

        if self.use_emo:
            self.emo_lst = self._parse_emo_seq(emo)
            self.num_emo = len(self.emo_lst)
            if self.num_emo == 1:
                self.emo_seq = np.concatenate([self.emo_lst] * self.seq_frames, 0)
            else:
                self.emo_seq = None

    @staticmethod
    def _parse_emo_seq(emo, seq_len=-1):
        if isinstance(emo, np.ndarray) and emo.ndim == 2 and emo.shape[1] == 8:
            # emo arr, e.g. real
            emo_seq = emo  # [m, 8]
        elif isinstance(emo, int) and 0 <= emo < 8:
            # emo label, e.g. 4
            emo_seq = _get_emo_avg(emo).reshape(1, 8)  # [1, 8]
        elif isinstance(emo, (list, tuple)) and 0 < len(emo) < 8 and isinstance(emo[0], int):
            # emo labels, e.g. [3,4]
            emo_seq = _get_emo_avg(emo).reshape(1, 8)  # [1, 8]
        elif isinstance(emo, list) and emo and isinstance(emo[0], (list, tuple)):
            # emo label list, e.g. [[4], [3,4], [3],[3,4,5], ...]
            emo_seq = np.stack([_get_emo_avg(i) for i in emo], 0)  # [m, 8]
        else:
            raise ValueError(f"Unsupported emo type: {emo}")

        if seq_len > 0:
            if len(emo_seq) == seq_len:
                return emo_seq
            elif len(emo_seq) == 1:
                return np.concatenate([emo_seq] * seq_len, 0)
            elif len(emo_seq) > seq_len:
                return emo_seq[:seq_len]
            else:
                raise ValueError(f"emo len {len(emo_seq)} can not match seq len ({seq_len})")
        else:
            return emo_seq

    def __call__(self, aud_feat, idx, emo=None):
        """
        aud_feat: [n, 1024]
        idx: int, <0 means pad (first clip buffer)
        """

        frame_num = len(aud_feat)
        more_cond = [aud_feat]
        if self.use_emo:
            if emo is not None:
                emo_seq = self._parse_emo_seq(emo, frame_num)
            elif self.emo_seq is not None and len(self.emo_seq) == frame_num:
                emo_seq = self.emo_seq
            else:
                emo_idx_list = [max(i, 0) % self.num_emo for i in range(idx, idx + frame_num)]
                emo_seq = self.emo_lst[emo_idx_list]
            more_cond.append(emo_seq)

        if self.use_eye_open:
            if self.eye_open_seq is not None and len(self.eye_open_seq) == frame_num:
                eye_open_seq = self.eye_open_seq
            else:
                if self.eye_f0_mode:
                    eye_idx_list = [0] * frame_num
                else:
                    eye_idx_list = [_mirror_index(max(i, 0), self.num_eye_open) for i in range(idx, idx + frame_num)]
                eye_open_seq = self.eye_open_lst[eye_idx_list]
            more_cond.append(eye_open_seq)

        if self.use_eye_ball:
            if self.eye_ball_seq is not None and len(self.eye_ball_seq) == frame_num:
                eye_ball_seq = self.eye_ball_seq
            else:
                if self.eye_f0_mode:
                    eye_idx_list = [0] * frame_num
                else:
                    eye_idx_list = [_mirror_index(max(i, 0), self.num_eye_ball) for i in range(idx, idx + frame_num)]
                eye_ball_seq = self.eye_ball_lst[eye_idx_list]
            more_cond.append(eye_ball_seq)

        if self.use_sc:
            if len(self.sc_seq) == frame_num:
                sc_seq = self.sc_seq
            else:
                sc_seq = np.stack([self.sc] * frame_num, 0)
            more_cond.append(sc_seq)

        if len(more_cond) > 1:
            cond_seq = np.concatenate(more_cond, -1)  # [n, dim_cond]
        else:
            cond_seq = aud_feat

        return cond_seq
core/atomic_components/decode_f3d.py ADDED
@@ -0,0 +1,22 @@
from ..models.decoder import Decoder


"""
# __init__
decoder_cfg = {
    "model_path": "",
    "device": "cuda",
}
"""

class DecodeF3D:
    def __init__(
        self,
        decoder_cfg,
    ):
        self.decoder = Decoder(**decoder_cfg)

    def __call__(self, f_s):
        out = self.decoder(f_s)
        return out

core/atomic_components/loader.py ADDED
@@ -0,0 +1,133 @@
import filetype
import imageio
import cv2


def is_image(file_path):
    return filetype.is_image(file_path)


def is_video(file_path):
    return filetype.is_video(file_path)


def check_resize(h, w, max_dim=1920, division=2):
    rsz_flag = False
    # adjust the size of the image according to the maximum dimension
    if max_dim > 0 and max(h, w) > max_dim:
        rsz_flag = True
        if h > w:
            new_h = max_dim
            new_w = int(round(w * max_dim / h))
        else:
            new_w = max_dim
            new_h = int(round(h * max_dim / w))
    else:
        new_h = h
        new_w = w

    # ensure that the image dimensions are multiples of n
    if new_h % division != 0:
        new_h = new_h - (new_h % division)
        rsz_flag = True
    if new_w % division != 0:
        new_w = new_w - (new_w % division)
        rsz_flag = True

    return new_h, new_w, rsz_flag


def load_image(image_path, max_dim=-1):
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    h, w = img.shape[:2]
    new_h, new_w, rsz_flag = check_resize(h, w, max_dim)
    if rsz_flag:
        img = cv2.resize(img, (new_w, new_h))
    return img


def load_video(video_path, n_frames=-1, max_dim=-1):
    reader = imageio.get_reader(video_path, "ffmpeg")

    new_h, new_w, rsz_flag = None, None, None

    ret = []
    for idx, frame_rgb in enumerate(reader):
        if n_frames > 0 and idx >= n_frames:
            break

        if rsz_flag is None:
            h, w = frame_rgb.shape[:2]
            new_h, new_w, rsz_flag = check_resize(h, w, max_dim)

        if rsz_flag:
            frame_rgb = cv2.resize(frame_rgb, (new_w, new_h))

        ret.append(frame_rgb)

    reader.close()
    return ret


def load_source_frames(source_path, max_dim=-1, n_frames=-1):
    if is_image(source_path):
        rgb = load_image(source_path, max_dim)
        rgb_list = [rgb]
        is_image_flag = True
    elif is_video(source_path):
        rgb_list = load_video(source_path, n_frames, max_dim)
        is_image_flag = False
    else:
        raise ValueError(f"Unsupported source type: {source_path}")
    return rgb_list, is_image_flag


def _mirror_index(index, size):
    turn = index // size
    res = index % size
    if turn % 2 == 0:
        return res
    else:
        return size - res - 1


class LoopLoader:
    def __init__(self, item_list, max_iter_num=-1, mirror_loop=True):
        self.item_list = item_list
        self.idx = 0
        self.item_num = len(self.item_list)
        self.max_iter_num = max_iter_num if max_iter_num > 0 else self.item_num
        self.mirror_loop = mirror_loop

    def __len__(self):
        return self.max_iter_num

    def __iter__(self):
        return self

    def __next__(self):
        if self.idx >= self.max_iter_num:
            raise StopIteration

        if self.mirror_loop:
            idx = _mirror_index(self.idx, self.item_num)
        else:
            idx = self.idx % self.item_num
        item = self.item_list[idx]

        self.idx += 1
        return item

    def __call__(self):
        return self.__iter__()

    def reset(self, max_iter_num=-1):
        self.idx = 0
        self.max_iter_num = max_iter_num if max_iter_num > 0 else self.item_num

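# Example: LoopLoader(["f0", "f1", "f2"], max_iter_num=7) yields
# f0, f1, f2, f2, f1, f0, f0; with mirror_loop=True the items are traversed
# back and forth via _mirror_index rather than wrapping to the start.
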
core/atomic_components/motion_stitch.py ADDED
@@ -0,0 +1,491 @@
import copy
import random
import numpy as np
from scipy.special import softmax

from ..models.stitch_network import StitchNetwork


"""
# __init__
stitch_network_cfg = {
    "model_path": "",
    "device": "cuda",
}

# __call__
kwargs:
    fade_alpha
    fade_out_keys

    delta_pitch
    delta_yaw
    delta_roll

"""


def ctrl_motion(x_d_info, **kwargs):
    # pose + offset
    for kk in ["delta_pitch", "delta_yaw", "delta_roll"]:
        if kk in kwargs:
            k = kk[6:]
            x_d_info[k] = bin66_to_degree(x_d_info[k]) + kwargs[kk]

    # pose * alpha
    for kk in ["alpha_pitch", "alpha_yaw", "alpha_roll"]:
        if kk in kwargs:
            k = kk[6:]
            x_d_info[k] = x_d_info[k] * kwargs[kk]

    # exp + offset
    if "delta_exp" in kwargs:
        k = "exp"
        x_d_info[k] = x_d_info[k] + kwargs["delta_exp"]

    return x_d_info


def fade(x_d_info, dst, alpha, keys=None):
    if keys is None:
        keys = x_d_info.keys()
    for k in keys:
        if k == 'kp':
            continue
        x_d_info[k] = x_d_info[k] * alpha + dst[k] * (1 - alpha)
    return x_d_info


def ctrl_vad(x_d_info, dst, alpha):
    exp = x_d_info["exp"]
    exp_dst = dst["exp"]

    _lip = [6, 12, 14, 17, 19, 20]
    _a1 = np.zeros((21, 3), dtype=np.float32)
    _a1[_lip] = alpha
    _a1 = _a1.reshape(1, -1)
    x_d_info["exp"] = exp * alpha + exp_dst * (1 - alpha)

    return x_d_info



def _mix_s_d_info(
    x_s_info,
    x_d_info,
    use_d_keys=("exp", "pitch", "yaw", "roll", "t"),
    d0=None,
):
    if d0 is not None:
        if isinstance(use_d_keys, dict):
            x_d_info = {
                k: x_s_info[k] + (v - d0[k]) * use_d_keys.get(k, 1)
                for k, v in x_d_info.items()
            }
        else:
            x_d_info = {k: x_s_info[k] + (v - d0[k]) for k, v in x_d_info.items()}

    for k, v in x_s_info.items():
        if k not in x_d_info or k not in use_d_keys:
            x_d_info[k] = v

    if isinstance(use_d_keys, dict) and d0 is None:
        for k, alpha in use_d_keys.items():
            x_d_info[k] *= alpha
    return x_d_info


def _set_eye_blink_idx(N, blink_n=15, open_n=-1):
    """
    open_n:
        -1: no blink
        0: random open_n
        >0: fix open_n
        list: loop open_n
    """
    OPEN_MIN = 60
    OPEN_MAX = 100

    idx = [0] * N
    if isinstance(open_n, int):
        if open_n < 0:  # no blink
            return idx
        elif open_n > 0:  # fix open_n
            open_ns = [open_n]
        else:  # open_n == 0: # random open_n, 60-100
            open_ns = []
    elif isinstance(open_n, list):
        open_ns = open_n  # loop open_n
    else:
        raise ValueError()

    blink_idx = list(range(blink_n))

    start_n = open_ns[0] if open_ns else random.randint(OPEN_MIN, OPEN_MAX)
    end_n = open_ns[-1] if open_ns else random.randint(OPEN_MIN, OPEN_MAX)
    max_i = N - max(end_n, blink_n)
    cur_i = start_n
    cur_n_i = 1
    while cur_i < max_i:
        idx[cur_i : cur_i + blink_n] = blink_idx

        if open_ns:
            cur_n = open_ns[cur_n_i % len(open_ns)]
            cur_n_i += 1
        else:
            cur_n = random.randint(OPEN_MIN, OPEN_MAX)

        cur_i = cur_i + blink_n + cur_n

    return idx


def _fix_exp_for_x_d_info(x_d_info, x_s_info, delta_eye=None, drive_eye=True):
    _eye = [11, 13, 15, 16, 18]
    _lip = [6, 12, 14, 17, 19, 20]
    alpha = np.zeros((21, 3), dtype=x_d_info["exp"].dtype)
    alpha[_lip] = 1
    if delta_eye is None and drive_eye:  # use d eye
        alpha[_eye] = 1
    alpha = alpha.reshape(1, -1)
    x_d_info["exp"] = x_d_info["exp"] * alpha + x_s_info["exp"] * (1 - alpha)

    if delta_eye is not None and drive_eye:
        alpha = np.zeros((21, 3), dtype=x_d_info["exp"].dtype)
        alpha[_eye] = 1
        alpha = alpha.reshape(1, -1)
        x_d_info["exp"] = (delta_eye + x_s_info["exp"]) * alpha + x_d_info["exp"] * (
            1 - alpha
        )

    return x_d_info


def _fix_exp_for_x_d_info_v2(x_d_info, x_s_info, delta_eye, a1, a2, a3):
    x_d_info["exp"] = x_d_info["exp"] * a1 + x_s_info["exp"] * a2 + delta_eye * a3
    return x_d_info


def bin66_to_degree(pred):
    if pred.ndim > 1 and pred.shape[1] == 66:
        idx = np.arange(66).astype(np.float32)
        pred = softmax(pred, axis=1)
        degree = np.sum(pred * idx, axis=1) * 3 - 97.5
        return degree
    return pred


def _eye_delta(exp, dx=0, dy=0):
    if dx > 0:
        exp[0, 33] += dx * 0.0007
        exp[0, 45] += dx * 0.001
    else:
        exp[0, 33] += dx * 0.001
        exp[0, 45] += dx * 0.0007

    exp[0, 34] += dy * -0.001
    exp[0, 46] += dy * -0.001
    return exp

def _fix_gaze(pose_s, x_d_info):
    x_ratio = 0.26
    y_ratio = 0.28

    yaw_s, pitch_s = pose_s
    yaw_d = bin66_to_degree(x_d_info['yaw']).item()
    pitch_d = bin66_to_degree(x_d_info['pitch']).item()

    delta_yaw = yaw_d - yaw_s
    delta_pitch = pitch_d - pitch_s

    dx = delta_yaw * x_ratio
    dy = delta_pitch * y_ratio

    x_d_info['exp'] = _eye_delta(x_d_info['exp'], dx, dy)
    return x_d_info


def get_rotation_matrix(pitch_, yaw_, roll_):
    """ the input is in degree
    """
    # transform to radian
    pitch = pitch_ / 180 * np.pi
    yaw = yaw_ / 180 * np.pi
    roll = roll_ / 180 * np.pi

    if pitch.ndim == 1:
        pitch = pitch[:, None]
    if yaw.ndim == 1:
        yaw = yaw[:, None]
    if roll.ndim == 1:
        roll = roll[:, None]

    # calculate the euler matrix
    bs = pitch.shape[0]
    ones = np.ones((bs, 1), dtype=np.float32)
    zeros = np.zeros((bs, 1), dtype=np.float32)
    x, y, z = pitch, yaw, roll

    rot_x = np.concatenate([
        ones, zeros, zeros,
        zeros, np.cos(x), -np.sin(x),
        zeros, np.sin(x), np.cos(x)
    ], axis=1).reshape(bs, 3, 3)

    rot_y = np.concatenate([
        np.cos(y), zeros, np.sin(y),
        zeros, ones, zeros,
        -np.sin(y), zeros, np.cos(y)
    ], axis=1).reshape(bs, 3, 3)

    rot_z = np.concatenate([
        np.cos(z), -np.sin(z), zeros,
        np.sin(z), np.cos(z), zeros,
        zeros, zeros, ones
    ], axis=1).reshape(bs, 3, 3)

    rot = np.matmul(np.matmul(rot_z, rot_y), rot_x)
    return np.transpose(rot, (0, 2, 1))


def transform_keypoint(kp_info: dict):
    """
    transform the implicit keypoints with the pose, shift, and expression deformation
    kp: BxNx3
    """
    kp = kp_info['kp']  # (bs, k, 3)
    pitch, yaw, roll = kp_info['pitch'], kp_info['yaw'], kp_info['roll']

    t, exp = kp_info['t'], kp_info['exp']
    scale = kp_info['scale']

    pitch = bin66_to_degree(pitch)
    yaw = bin66_to_degree(yaw)
    roll = bin66_to_degree(roll)

    bs = kp.shape[0]
    if kp.ndim == 2:
        num_kp = kp.shape[1] // 3  # Bx(num_kpx3)
    else:
        num_kp = kp.shape[1]  # Bxnum_kpx3

    rot_mat = get_rotation_matrix(pitch, yaw, roll)  # (bs, 3, 3)

    # Eqn.2: s * (R * x_c,s + exp) + t
    kp_transformed = np.matmul(kp.reshape(bs, num_kp, 3), rot_mat) + exp.reshape(bs, num_kp, 3)
    kp_transformed *= scale[..., None]  # (bs, k, 3) * (bs, 1, 1) = (bs, k, 3)
    kp_transformed[:, :, 0:2] += t[:, None, 0:2]  # remove z, only apply tx ty

    return kp_transformed


class MotionStitch:
    def __init__(
        self,
        stitch_network_cfg,
    ):
        self.stitch_net = StitchNetwork(**stitch_network_cfg)

    def set_Nd(self, N_d=-1):
        # only for offline (make start|end eye open)
        if N_d == self.N_d:
            return

        self.N_d = N_d
        if self.drive_eye and self.delta_eye_arr is not None:
            N = 3000 if self.N_d == -1 else self.N_d
            self.delta_eye_idx_list = _set_eye_blink_idx(
                N, len(self.delta_eye_arr), self.delta_eye_open_n
            )

    def setup(
        self,
        N_d=-1,
        use_d_keys=None,
        relative_d=True,
        drive_eye=None,  # use d eye or s eye
        delta_eye_arr=None,  # fix eye
        delta_eye_open_n=-1,  # int|list
        fade_out_keys=("exp",),
        fade_type="",  # "" | "d0" | "s"
        flag_stitching=True,
        is_image_flag=True,
        x_s_info=None,
        d0=None,
        ch_info=None,
        overall_ctrl_info=None,
    ):
        self.is_image_flag = is_image_flag
        if use_d_keys is None:
            if self.is_image_flag:
                self.use_d_keys = ("exp", "pitch", "yaw", "roll", "t")
            else:
                self.use_d_keys = ("exp", )
        else:
            self.use_d_keys = use_d_keys

        if drive_eye is None:
            if self.is_image_flag:
                self.drive_eye = True
            else:
                self.drive_eye = False
        else:
            self.drive_eye = drive_eye

        self.N_d = N_d
        self.relative_d = relative_d
        self.delta_eye_arr = delta_eye_arr
        self.delta_eye_open_n = delta_eye_open_n
        self.fade_out_keys = fade_out_keys
        self.fade_type = fade_type
        self.flag_stitching = flag_stitching

        _eye = [11, 13, 15, 16, 18]
        _lip = [6, 12, 14, 17, 19, 20]
        _a1 = np.zeros((21, 3), dtype=np.float32)
        _a1[_lip] = 1
        _a2 = 0
        if self.drive_eye:
            if self.delta_eye_arr is None:
                _a1[_eye] = 1
            else:
                _a2 = np.zeros((21, 3), dtype=np.float32)
                _a2[_eye] = 1
                _a2 = _a2.reshape(1, -1)
        _a1 = _a1.reshape(1, -1)

        self.fix_exp_a1 = _a1 * (1 - _a2)
        self.fix_exp_a2 = (1 - _a1) + _a1 * _a2
        self.fix_exp_a3 = _a2

        if self.drive_eye and self.delta_eye_arr is not None:
            N = 3000 if self.N_d == -1 else self.N_d
            self.delta_eye_idx_list = _set_eye_blink_idx(
                N, len(self.delta_eye_arr), self.delta_eye_open_n
            )

        self.pose_s = None
        self.x_s = None
        self.fade_dst = None
        if self.is_image_flag and x_s_info is not None:
            yaw_s = bin66_to_degree(x_s_info['yaw']).item()
            pitch_s = bin66_to_degree(x_s_info['pitch']).item()
            self.pose_s = [yaw_s, pitch_s]
            self.x_s = transform_keypoint(x_s_info)

            if self.fade_type == "s":
                self.fade_dst = copy.deepcopy(x_s_info)

        if ch_info is not None:
            self.scale_a = ch_info['x_s_info_lst'][0]['scale'].item()
            if x_s_info is not None:
                self.scale_b = x_s_info['scale'].item()
                self.scale_ratio = self.scale_a / self.scale_b
                self._set_scale_ratio(self.scale_ratio)
            else:
                self.scale_ratio = None
        else:
            self.scale_ratio = 1

        self.overall_ctrl_info = overall_ctrl_info

        self.d0 = d0
        self.idx = 0

    def _set_scale_ratio(self, scale_ratio=1):
        if scale_ratio == 1:
            return
        if isinstance(self.use_d_keys, dict):
            self.use_d_keys = {k: v * (scale_ratio if k in {"exp", "pitch", "yaw", "roll"} else 1) for k, v in self.use_d_keys.items()}
        else:
            self.use_d_keys = {k: scale_ratio if k in {"exp", "pitch", "yaw", "roll"} else 1 for k in self.use_d_keys}

    @staticmethod
    def _merge_kwargs(default_kwargs, run_kwargs):
        if default_kwargs is None:
            return run_kwargs

        for k, v in default_kwargs.items():
            if k not in run_kwargs:
                run_kwargs[k] = v
        return run_kwargs

    def __call__(self, x_s_info, x_d_info, **kwargs):
        # return x_s, x_d

        kwargs = self._merge_kwargs(self.overall_ctrl_info, kwargs)

        if self.scale_ratio is None:
            self.scale_b = x_s_info['scale'].item()
            self.scale_ratio = self.scale_a / self.scale_b
            self._set_scale_ratio(self.scale_ratio)

        if self.relative_d and self.d0 is None:
            self.d0 = copy.deepcopy(x_d_info)

        x_d_info = _mix_s_d_info(
            x_s_info,
            x_d_info,
            self.use_d_keys,
            self.d0,
        )

        delta_eye = 0
        if self.drive_eye and self.delta_eye_arr is not None:
            delta_eye = self.delta_eye_arr[
                self.delta_eye_idx_list[self.idx % len(self.delta_eye_idx_list)]
            ][None]
        x_d_info = _fix_exp_for_x_d_info_v2(
            x_d_info,
            x_s_info,
            delta_eye,
            self.fix_exp_a1,
            self.fix_exp_a2,
            self.fix_exp_a3,
        )

        if kwargs.get("vad_alpha", 1) < 1:
            x_d_info = ctrl_vad(x_d_info, x_s_info, kwargs.get("vad_alpha", 1))

        x_d_info = ctrl_motion(x_d_info, **kwargs)

        if self.fade_type == "d0" and self.fade_dst is None:
            self.fade_dst = copy.deepcopy(x_d_info)

        # fade
        if "fade_alpha" in kwargs and self.fade_type in ["d0", "s"]:
            fade_alpha = kwargs["fade_alpha"]
            fade_keys = kwargs.get("fade_out_keys", self.fade_out_keys)
            if self.fade_type == "d0":
                fade_dst = self.fade_dst
            elif self.fade_type == "s":
                if self.fade_dst is not None:
463
+ fade_dst = self.fade_dst
464
+ else:
465
+ fade_dst = copy.deepcopy(x_s_info)
466
+ if self.is_image_flag:
467
+ self.fade_dst = fade_dst
468
+ x_d_info = fade(x_d_info, fade_dst, fade_alpha, fade_keys)
469
+
470
+ if self.drive_eye:
471
+ if self.pose_s is None:
472
+ yaw_s = bin66_to_degree(x_s_info['yaw']).item()
473
+ pitch_s = bin66_to_degree(x_s_info['pitch']).item()
474
+ self.pose_s = [yaw_s, pitch_s]
475
+ x_d_info = _fix_gaze(self.pose_s, x_d_info)
476
+
477
+ if self.x_s is not None:
478
+ x_s = self.x_s
479
+ else:
480
+ x_s = transform_keypoint(x_s_info)
481
+ if self.is_image_flag:
482
+ self.x_s = x_s
483
+
484
+ x_d = transform_keypoint(x_d_info)
485
+
486
+ if self.flag_stitching:
487
+ x_d = self.stitch_net(x_s, x_d)
488
+
489
+ self.idx += 1
490
+
491
+ return x_s, x_d
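For reference, `transform_keypoint` above realises Eqn.2, `s * (R * x_c,s + exp) + t` (with only tx, ty applied). A minimal, self-contained NumPy sketch of that transform, using dummy values in place of real motion-extractor outputs:

```python
import numpy as np

def euler_to_rot(pitch, yaw, roll):
    # degrees -> batched (bs, 3, 3) transposed rotation matrices,
    # mirroring get_rotation_matrix above
    x, y, z = np.radians(pitch), np.radians(yaw), np.radians(roll)
    bs = x.shape[0]
    zeros, ones = np.zeros_like(x), np.ones_like(x)
    rot_x = np.stack([ones, zeros, zeros,
                      zeros, np.cos(x), -np.sin(x),
                      zeros, np.sin(x), np.cos(x)], axis=1).reshape(bs, 3, 3)
    rot_y = np.stack([np.cos(y), zeros, np.sin(y),
                      zeros, ones, zeros,
                      -np.sin(y), zeros, np.cos(y)], axis=1).reshape(bs, 3, 3)
    rot_z = np.stack([np.cos(z), -np.sin(z), zeros,
                      np.sin(z), np.cos(z), zeros,
                      zeros, zeros, ones], axis=1).reshape(bs, 3, 3)
    return np.transpose(rot_z @ rot_y @ rot_x, (0, 2, 1))

bs, num_kp = 1, 21
kp = np.random.randn(bs, num_kp, 3).astype(np.float32)          # canonical keypoints
exp = 0.05 * np.random.randn(bs, num_kp, 3).astype(np.float32)  # expression deltas
rot = euler_to_rot(np.array([10.0]), np.array([-5.0]), np.array([0.0]))
scale = np.array([[1.2]], dtype=np.float32)                     # (bs, 1)
t = np.array([[0.01, -0.02, 0.0]], dtype=np.float32)            # (bs, 3)

kp_t = (kp @ rot + exp) * scale[..., None]   # s * (R * x_c,s + exp)
kp_t[:, :, 0:2] += t[:, None, 0:2]           # + (tx, ty); z translation is dropped
print(kp_t.shape)                            # (1, 21, 3)
```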
core/atomic_components/putback.py ADDED
@@ -0,0 +1,60 @@
1
+ import cv2
2
+ import numpy as np
3
+ from ..utils.blend import blend_images_cy
4
+ from ..utils.get_mask import get_mask
5
+
6
+
7
+ class PutBackNumpy:
8
+ def __init__(
9
+ self,
10
+ mask_template_path=None,
11
+ ):
12
+ if mask_template_path is None:
13
+ mask = get_mask(512, 512, 0.9, 0.9)
14
+ self.mask_ori_float = np.concatenate([mask] * 3, 2)
15
+ else:
16
+ mask = cv2.imread(mask_template_path, cv2.IMREAD_COLOR)
17
+ self.mask_ori_float = mask.astype(np.float32) / 255.0
18
+
19
+ def __call__(self, frame_rgb, render_image, M_c2o):
20
+ h, w = frame_rgb.shape[:2]
21
+ mask_warped = cv2.warpAffine(
22
+ self.mask_ori_float, M_c2o[:2, :], dsize=(w, h), flags=cv2.INTER_LINEAR
23
+ ).clip(0, 1)
24
+ frame_warped = cv2.warpAffine(
25
+ render_image, M_c2o[:2, :], dsize=(w, h), flags=cv2.INTER_LINEAR
26
+ )
27
+ result = mask_warped * frame_warped + (1 - mask_warped) * frame_rgb
28
+ result = np.clip(result, 0, 255)
29
+ result = result.astype(np.uint8)
30
+ return result
31
+
32
+
33
+ class PutBack:
34
+ def __init__(
35
+ self,
36
+ mask_template_path=None,
37
+ ):
38
+ if mask_template_path is None:
39
+ mask = get_mask(512, 512, 0.9, 0.9)
40
+ mask = np.concatenate([mask] * 3, 2)
41
+ else:
42
+ mask = cv2.imread(mask_template_path, cv2.IMREAD_COLOR).astype(np.float32) / 255.0
43
+
44
+ self.mask_ori_float = np.ascontiguousarray(mask)[:,:,0]
45
+ self.result_buffer = None
46
+
47
+ def __call__(self, frame_rgb, render_image, M_c2o):
48
+ h, w = frame_rgb.shape[:2]
49
+ mask_warped = cv2.warpAffine(
50
+ self.mask_ori_float, M_c2o[:2, :], dsize=(w, h), flags=cv2.INTER_LINEAR
51
+ ).clip(0, 1)
52
+ frame_warped = cv2.warpAffine(
53
+ render_image, M_c2o[:2, :], dsize=(w, h), flags=cv2.INTER_LINEAR
54
+ )
55
+ self.result_buffer = np.empty((h, w, 3), dtype=np.uint8)
56
+
57
+ # Use Cython implementation for blending
58
+ blend_images_cy(mask_warped, frame_warped, frame_rgb, self.result_buffer)
59
+
60
+ return self.result_buffer
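A NumPy-only sketch of the paste-back math with synthetic inputs; the Cython kernel `blend_images_cy` is assumed to produce the same per-pixel result as the pure-NumPy path in `PutBackNumpy`:

```python
import cv2
import numpy as np

h, w = 256, 256
frame_rgb = np.full((h, w, 3), 40, dtype=np.uint8)            # original frame
render_image = np.full((512, 512, 3), 200, dtype=np.uint8)    # rendered 512x512 face crop
M_c2o = np.array([[0.4, 0.0, 20.0],                           # crop -> original affine
                  [0.0, 0.4, 30.0],
                  [0.0, 0.0, 1.0]], dtype=np.float32)
mask = np.ones((512, 512, 3), dtype=np.float32)               # stand-in for get_mask()

mask_warped = cv2.warpAffine(mask, M_c2o[:2, :], (w, h)).clip(0, 1)
frame_warped = cv2.warpAffine(render_image, M_c2o[:2, :], (w, h))

out = mask_warped * frame_warped + (1 - mask_warped) * frame_rgb
out = np.clip(out, 0, 255).astype(np.uint8)
print(out.shape, out.dtype)  # (256, 256, 3) uint8
```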
core/atomic_components/source2info.py ADDED
@@ -0,0 +1,155 @@
1
+ import numpy as np
2
+ import cv2
3
+
4
+ from ..aux_models.insightface_det import InsightFaceDet
5
+ from ..aux_models.insightface_landmark106 import Landmark106
6
+ from ..aux_models.landmark203 import Landmark203
7
+ from ..aux_models.mediapipe_landmark478 import Landmark478
8
+ from ..models.appearance_extractor import AppearanceExtractor
9
+ from ..models.motion_extractor import MotionExtractor
10
+
11
+ from ..utils.crop import crop_image
12
+ from ..utils.eye_info import EyeAttrUtilsByMP
13
+
14
+
15
+ """
16
+ insightface_det_cfg = {
17
+ "model_path": "",
18
+ "device": "cuda",
19
+ "force_ori_type": False,
20
+ }
21
+ landmark106_cfg = {
22
+ "model_path": "",
23
+ "device": "cuda",
24
+ "force_ori_type": False,
25
+ }
26
+ landmark203_cfg = {
27
+ "model_path": "",
28
+ "device": "cuda",
29
+ "force_ori_type": False,
30
+ }
31
+ landmark478_cfg = {
32
+ "blaze_face_model_path": "",
33
+ "face_mesh_model_path": "",
34
+ "device": "cuda",
35
+ "force_ori_type": False,
36
+ "task_path": "",
37
+ }
38
+ appearance_extractor_cfg = {
39
+ "model_path": "",
40
+ "device": "cuda",
41
+ }
42
+ motion_extractor_cfg = {
43
+ "model_path": "",
44
+ "device": "cuda",
45
+ }
46
+ """
47
+
48
+
49
+ class Source2Info:
50
+ def __init__(
51
+ self,
52
+ insightface_det_cfg,
53
+ landmark106_cfg,
54
+ landmark203_cfg,
55
+ landmark478_cfg,
56
+ appearance_extractor_cfg,
57
+ motion_extractor_cfg,
58
+ ):
59
+ self.insightface_det = InsightFaceDet(**insightface_det_cfg)
60
+ self.landmark106 = Landmark106(**landmark106_cfg)
61
+ self.landmark203 = Landmark203(**landmark203_cfg)
62
+ self.landmark478 = Landmark478(**landmark478_cfg)
63
+
64
+ self.appearance_extractor = AppearanceExtractor(**appearance_extractor_cfg)
65
+ self.motion_extractor = MotionExtractor(**motion_extractor_cfg)
66
+
67
+ def _crop(self, img, last_lmk=None, **kwargs):
68
+ # img_rgb -> det->landmark106->landmark203->crop
69
+
70
+ if last_lmk is None: # det for first frame or image
71
+ det, _ = self.insightface_det(img)
72
+ boxes = det[np.argsort(-(det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1]))]
73
+ if len(boxes) == 0:
74
+ return None
75
+ lmk_for_track = self.landmark106(img, boxes[0]) # 106
76
+ else: # track for video frames
77
+ lmk_for_track = last_lmk # 203
78
+
79
+ crop_dct = crop_image(
80
+ img,
81
+ lmk_for_track,
82
+ dsize=self.landmark203.dsize,
83
+ scale=1.5,
84
+ vy_ratio=-0.1,
85
+ pt_crop_flag=False,
86
+ )
87
+ lmk203 = self.landmark203(crop_dct["img_crop"], crop_dct["M_c2o"])
88
+
89
+ ret_dct = crop_image(
90
+ img,
91
+ lmk203,
92
+ dsize=512,
93
+ scale=kwargs.get("crop_scale", 2.3),
94
+ vx_ratio=kwargs.get("crop_vx_ratio", 0),
95
+ vy_ratio=kwargs.get("crop_vy_ratio", -0.125),
96
+ flag_do_rot=kwargs.get("crop_flag_do_rot", True),
97
+ pt_crop_flag=False,
98
+ )
99
+
100
+ img_crop = ret_dct["img_crop"]
101
+ M_c2o = ret_dct["M_c2o"]
102
+
103
+ return img_crop, M_c2o, lmk203
104
+
105
+ @staticmethod
106
+ def _img_crop_to_bchw256(img_crop):
107
+ rgb_256 = cv2.resize(img_crop, (256, 256), interpolation=cv2.INTER_AREA)
108
+ rgb_256_bchw = (rgb_256.astype(np.float32) / 255.0)[None].transpose(0, 3, 1, 2)
109
+ return rgb_256_bchw
110
+
111
+ def _get_kp_info(self, img):
112
+ # rgb_256_bchw_norm01
113
+ kp_info = self.motion_extractor(img)
114
+ return kp_info
115
+
116
+ def _get_f3d(self, img):
117
+ # rgb_256_bchw_norm01
118
+ fs = self.appearance_extractor(img)
119
+ return fs
120
+
121
+ def _get_eye_info(self, img):
122
+ # rgb uint8
123
+ lmk478 = self.landmark478(img) # [1, 478, 3]
124
+ attr = EyeAttrUtilsByMP(lmk478)
125
+ lr_open = attr.LR_open().reshape(-1, 2) # [1, 2]
126
+ lr_ball = attr.LR_ball_move().reshape(-1, 6) # [1, 3, 2] -> [1, 6]
127
+ return [lr_open, lr_ball]
128
+
129
+ def __call__(self, img, last_lmk=None, **kwargs):
130
+ """
131
+ img: rgb, uint8
132
+ last_lmk: last frame lmk203, for video tracking
133
+ kwargs: optional crop cfg
134
+ crop_scale: 2.3
135
+ crop_vx_ratio: 0
136
+ crop_vy_ratio: -0.125
137
+ crop_flag_do_rot: True
138
+ """
139
+ img_crop, M_c2o, lmk203 = self._crop(img, last_lmk=last_lmk, **kwargs)
140
+
141
+ eye_open, eye_ball = self._get_eye_info(img_crop)
142
+
143
+ rgb_256_bchw = self._img_crop_to_bchw256(img_crop)
144
+ kp_info = self._get_kp_info(rgb_256_bchw)
145
+ fs = self._get_f3d(rgb_256_bchw)
146
+
147
+ source_info = {
148
+ "x_s_info": kp_info,
149
+ "f_s": fs,
150
+ "M_c2o": M_c2o,
151
+ "eye_open": eye_open, # [1, 2]
152
+ "eye_ball": eye_ball, # [1, 6]
153
+ "lmk203": lmk203, # for track
154
+ }
155
+ return source_info
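If the checkpoints referenced in the config docstring are available, wiring `Source2Info` up for a single image could look roughly like the sketch below; every model path is a placeholder, and the import assumes the repo root is on `PYTHONPATH`:

```python
import cv2
from core.atomic_components.source2info import Source2Info  # import path assumed

device = "cuda"
s2i = Source2Info(
    insightface_det_cfg={"model_path": "checkpoints/det.onnx", "device": device},
    landmark106_cfg={"model_path": "checkpoints/lmk106.onnx", "device": device},
    landmark203_cfg={"model_path": "checkpoints/lmk203.onnx", "device": device},
    landmark478_cfg={
        "blaze_face_model_path": "checkpoints/blaze_face.onnx",
        "face_mesh_model_path": "checkpoints/face_mesh.onnx",
        "device": device,
    },
    appearance_extractor_cfg={"model_path": "checkpoints/appearance.onnx", "device": device},
    motion_extractor_cfg={"model_path": "checkpoints/motion.onnx", "device": device},
)

img_rgb = cv2.cvtColor(cv2.imread("example/image.png"), cv2.COLOR_BGR2RGB)
info = s2i(img_rgb)                 # image / first video frame: detection path
# info["lmk203"] can be passed back as last_lmk when tracking later video frames
```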
core/atomic_components/warp_f3d.py ADDED
@@ -0,0 +1,22 @@
1
+ from ..models.warp_network import WarpNetwork
2
+
3
+
4
+ """
5
+ # __init__
6
+ warp_network_cfg = {
7
+ "model_path": "",
8
+ "device": "cuda",
9
+ }
10
+ """
11
+
12
+ class WarpF3D:
13
+ def __init__(
14
+ self,
15
+ warp_network_cfg,
16
+ ):
17
+ self.warp_net = WarpNetwork(**warp_network_cfg)
18
+
19
+ def __call__(self, f_s, x_s, x_d):
20
+ out = self.warp_net(f_s, x_s, x_d)
21
+ return out
22
+
core/atomic_components/wav2feat.py ADDED
@@ -0,0 +1,110 @@
1
+ import librosa
2
+ import numpy as np
3
+ import math
4
+
5
+ from ..aux_models.hubert_stream import HubertStreaming
6
+
7
+ """
8
+ wavlm_cfg = {
9
+ "model_path": "",
10
+ "device": "cuda",
11
+ "force_ori_type": False,
12
+ }
13
+ hubert_cfg = {
14
+ "model_path": "",
15
+ "device": "cuda",
16
+ "force_ori_type": False,
17
+ }
18
+ """
19
+
20
+
21
+ class Wav2Feat:
22
+ def __init__(self, w2f_cfg, w2f_type="hubert"):
23
+ self.w2f_type = w2f_type.lower()
24
+ if self.w2f_type == "hubert":
25
+ self.w2f = Wav2FeatHubert(hubert_cfg=w2f_cfg)
26
+ self.feat_dim = 1024
27
+ self.support_streaming = True
28
+ else:
29
+ raise ValueError(f"Unsupported w2f_type: {w2f_type}")
30
+
31
+ def __call__(
32
+ self,
33
+ audio,
34
+ sr=16000,
35
+ norm_mean_std=None, # for s2g
36
+ chunksize=(3, 5, 2), # for hubert
37
+ ):
38
+ if self.w2f_type == "hubert":
39
+ feat = self.w2f(audio, chunksize=chunksize)
40
+ elif self.w2f_type == "s2g":
41
+ feat = self.w2f(audio, sr=sr, norm_mean_std=norm_mean_std)
42
+ else:
43
+ raise ValueError(f"Unsupported w2f_type: {self.w2f_type}")
44
+ return feat
45
+
46
+ def wav2feat(
47
+ self,
48
+ audio,
49
+ sr=16000,
50
+ norm_mean_std=None, # for s2g
51
+ chunksize=(3, 5, 2),
52
+ ):
53
+ # for offline
54
+ if self.w2f_type == "hubert":
55
+ feat = self.w2f.wav2feat(audio, sr=sr, chunksize=chunksize)
56
+ elif self.w2f_type == "s2g":
57
+ feat = self.w2f(audio, sr=sr, norm_mean_std=norm_mean_std)
58
+ else:
59
+ raise ValueError(f"Unsupported w2f_type: {self.w2f_type}")
60
+ return feat
61
+
62
+
63
+ class Wav2FeatHubert:
64
+ def __init__(
65
+ self,
66
+ hubert_cfg,
67
+ ):
68
+ self.hubert = HubertStreaming(**hubert_cfg)
69
+
70
+ def __call__(self, audio_chunk, chunksize=(3, 5, 2)):
71
+ """
72
+ audio_chunk: int(sum(chunksize) * 0.04 * 16000) + 80 # 6480
73
+ """
74
+ valid_feat_s = - sum(chunksize[1:]) * 2 # -(5+2)*2 = -14
75
+ valid_feat_e = - chunksize[2] * 2 # -2*2 = -4
76
+
77
+ encoding_chunk = self.hubert(audio_chunk)
78
+ valid_encoding = encoding_chunk[valid_feat_s:valid_feat_e]
79
+ valid_feat = valid_encoding.reshape(chunksize[1], 2, 1024).mean(1) # [5, 1024]
80
+ return valid_feat
81
+
82
+ def wav2feat(self, audio, sr, chunksize=(3, 5, 2)):
83
+ # for offline
84
+ if sr != 16000:
85
+ audio_16k = librosa.resample(audio, orig_sr=sr, target_sr=16000)
86
+ else:
87
+ audio_16k = audio
88
+
89
+ num_f = math.ceil(len(audio_16k) / 16000 * 25)
90
+ split_len = int(sum(chunksize) * 0.04 * 16000) + 80 # 6480
91
+
92
+ speech_pad = np.concatenate([
93
+ np.zeros((split_len - int(sum(chunksize[1:]) * 0.04 * 16000),), dtype=audio_16k.dtype),
94
+ audio_16k,
95
+ np.zeros((split_len,), dtype=audio_16k.dtype),
96
+ ], 0)
97
+
98
+ i = 0
99
+ res_lst = []
100
+ while i < num_f:
101
+ sss = int(i * 0.04 * 16000)
102
+ eee = sss + split_len
103
+ audio_chunk = speech_pad[sss:eee]
104
+ valid_feat = self.__call__(audio_chunk, chunksize)
105
+ res_lst.append(valid_feat)
106
+ i += chunksize[1]
107
+
108
+ ret = np.concatenate(res_lst, 0)
109
+ ret = ret[:num_f]
110
+ return ret
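The hard-coded 6480-sample chunk and the `[valid_feat_s:valid_feat_e]` slice follow directly from the default `chunksize=(3, 5, 2)`; a small arithmetic check:

```python
chunksize = (3, 5, 2)           # (past, current, future) 25-fps frames
sr, frame_dur = 16000, 0.04     # 16 kHz audio, 40 ms per video frame

split_len = int(sum(chunksize) * frame_dur * sr) + 80   # 6480 samples per chunk
valid_feat_s = -sum(chunksize[1:]) * 2                  # -14: drop right-context features
valid_feat_e = -chunksize[2] * 2                        # -4
n_kept = valid_feat_e - valid_feat_s                    # 10 HuBERT frames at 50 fps

print(split_len, valid_feat_s, valid_feat_e, n_kept)    # 6480 -14 -4 10
# Averaging the 10 kept 50-fps features in pairs yields chunksize[1] = 5
# features per chunk, i.e. one 1024-d feature per 25-fps video frame.
```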
core/atomic_components/writer.py ADDED
@@ -0,0 +1,36 @@
1
+ import imageio
2
+ import os
3
+
4
+
5
+ class VideoWriterByImageIO:
6
+ def __init__(self, video_path, fps=25, **kwargs):
7
+ video_format = kwargs.get("format", "mp4") # default is mp4 format
8
+ codec = kwargs.get("vcodec", "libx264") # default is libx264 encoding
9
+ quality = kwargs.get("quality") # video quality
10
+ pixelformat = kwargs.get("pixelformat", "yuv420p") # video pixel format
11
+ macro_block_size = kwargs.get("macro_block_size", 2)
12
+ ffmpeg_params = ["-crf", str(kwargs.get("crf", 18))]
13
+
14
+ os.makedirs(os.path.dirname(video_path), exist_ok=True)
15
+
16
+ writer = imageio.get_writer(
17
+ video_path,
18
+ fps=fps,
19
+ format=video_format,
20
+ codec=codec,
21
+ quality=quality,
22
+ ffmpeg_params=ffmpeg_params,
23
+ pixelformat=pixelformat,
24
+ macro_block_size=macro_block_size,
25
+ )
26
+ self.writer = writer
27
+
28
+ def __call__(self, img, fmt="bgr"):
29
+ if fmt == "bgr":
30
+ frame = img[..., ::-1]
31
+ else:
32
+ frame = img
33
+ self.writer.append_data(frame)
34
+
35
+ def close(self):
36
+ self.writer.close()
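A quick usage sketch (the import path is assumed; the keyword arguments shown in `__init__` pass through to imageio):

```python
import numpy as np
from core.atomic_components.writer import VideoWriterByImageIO  # import path assumed

writer = VideoWriterByImageIO("tmp_out/demo.mp4", fps=25, crf=18)
for i in range(25):
    frame_bgr = np.full((256, 256, 3), i * 10, dtype=np.uint8)  # dummy BGR frame
    writer(frame_bgr, fmt="bgr")   # channels flipped to RGB before writing
writer.close()
```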
core/aux_models/blaze_face.py ADDED
@@ -0,0 +1,351 @@
1
+ import numpy as np
2
+ import cv2
3
+ from ..utils.load_model import load_model
4
+
5
+
6
+ def intersect(box_a, box_b):
7
+ """We resize both arrays to [A,B,2] without new malloc:
8
+ [A,2] -> [A,1,2] -> [A,B,2]
9
+ [B,2] -> [1,B,2] -> [A,B,2]
10
+ Then we compute the area of intersect between box_a and box_b.
11
+ Args:
12
+ box_a: (array) bounding boxes, Shape: [A,4].
13
+ box_b: (array) bounding boxes, Shape: [B,4].
14
+ Return:
15
+ (array) intersection area, Shape: [A,B].
16
+ """
17
+ A = box_a.shape[0]
18
+ B = box_b.shape[0]
19
+ max_xy = np.minimum(
20
+ np.expand_dims(box_a[:, 2:], axis=1).repeat(B, axis=1),
21
+ np.expand_dims(box_b[:, 2:], axis=0).repeat(A, axis=0),
22
+ )
23
+ min_xy = np.maximum(
24
+ np.expand_dims(box_a[:, :2], axis=1).repeat(B, axis=1),
25
+ np.expand_dims(box_b[:, :2], axis=0).repeat(A, axis=0),
26
+ )
27
+ inter = np.clip((max_xy - min_xy), a_min=0, a_max=None)
28
+ return inter[:, :, 0] * inter[:, :, 1]
29
+
30
+
31
+ def jaccard(box_a, box_b):
32
+ """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
33
+ is simply the intersection over union of two boxes. Here we operate on
34
+ ground truth boxes and default boxes.
35
+ E.g.:
36
+ A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
37
+ Args:
38
+ box_a: (array) Ground truth bounding boxes, Shape: [num_objects,4]
39
+ box_b: (array) Prior boxes from priorbox layers, Shape: [num_priors,4]
40
+ Return:
41
+ jaccard overlap: (array) Shape: [box_a.size(0), box_b.size(0)]
42
+ """
43
+ inter = intersect(box_a, box_b)
44
+ area_a = (
45
+ ((box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1]))
46
+ .reshape(-1, 1)
47
+ .repeat(box_b.shape[0], axis=1)
48
+ ) # [A,B]
49
+ area_b = (
50
+ ((box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1]))
51
+ .reshape(1, -1)
52
+ .repeat(box_a.shape[0], axis=0)
53
+ ) # [A,B]
54
+ union = area_a + area_b - inter
55
+ return inter / union # [A,B]
56
+
57
+
58
+ def overlap_similarity(box, other_boxes):
59
+ """Computes the IOU between a bounding box and set of other boxes."""
60
+ box = np.expand_dims(box, axis=0) # Equivalent to unsqueeze(0) in PyTorch
61
+ iou = jaccard(box, other_boxes)
62
+ return np.squeeze(iou, axis=0) # Equivalent to squeeze(0) in PyTorch
63
+
64
+
65
+ class BlazeFace:
66
+ def __init__(self, model_path, device="cuda"):
67
+ self.anchor_options = {
68
+ "num_layers": 4,
69
+ "min_scale": 0.1484375,
70
+ "max_scale": 0.75,
71
+ "input_size_height": 128,
72
+ "input_size_width": 128,
73
+ "anchor_offset_x": 0.5,
74
+ "anchor_offset_y": 0.5,
75
+ "strides": [8, 16, 16, 16],
76
+ "aspect_ratios": [1.0],
77
+ "reduce_boxes_in_lowest_layer": False,
78
+ "interpolated_scale_aspect_ratio": 1.0,
79
+ "fixed_anchor_size": True,
80
+ }
81
+ self.num_classes = 1
82
+ self.num_anchors = 896
83
+ self.num_coords = 16
84
+ self.x_scale = 128.0
85
+ self.y_scale = 128.0
86
+ self.h_scale = 128.0
87
+ self.w_scale = 128.0
88
+ self.min_score_thresh = 0.5
89
+ self.min_suppression_threshold = 0.3
90
+ self.anchors = self.generate_anchors(self.anchor_options)
91
+ self.anchors = np.array(self.anchors)
92
+ assert len(self.anchors) == 896
93
+ self.model, self.model_type = load_model(model_path, device=device)
94
+ self.output_names = ["regressors", "classificators"]
95
+
96
+ def __call__(self, image: np.ndarray):
97
+ """
98
+ image: RGB image
99
+ """
100
+ image = cv2.resize(image, (128, 128))
101
+ image = image[np.newaxis, :, :, :].astype(np.float32)
102
+ image = image / 127.5 - 1.0
103
+ outputs = {}
104
+ if self.model_type == "onnx":
105
+ out_list = self.model.run(None, {"input": image})
106
+ for i, name in enumerate(self.output_names):
107
+ outputs[name] = out_list[i]
108
+ elif self.model_type == "tensorrt":
109
+ self.model.setup({"input": image})
110
+ self.model.infer()
111
+ for name in self.output_names:
112
+ outputs[name] = self.model.buffer[name][0]
113
+ else:
114
+ raise ValueError(f"Unsupported model type: {self.model_type}")
115
+ boxes = self.postprocess(outputs["regressors"], outputs["classificators"])
116
+ return boxes
117
+
118
+ def calculate_scale(self, min_scale, max_scale, stride_index, num_strides):
119
+ return min_scale + (max_scale - min_scale) * stride_index / (num_strides - 1.0)
120
+
121
+ def generate_anchors(self, options):
122
+ strides_size = len(options["strides"])
123
+ assert options["num_layers"] == strides_size
124
+
125
+ anchors = []
126
+ layer_id = 0
127
+ while layer_id < strides_size:
128
+ anchor_height = []
129
+ anchor_width = []
130
+ aspect_ratios = []
131
+ scales = []
132
+
133
+ # For same strides, we merge the anchors in the same order.
134
+ last_same_stride_layer = layer_id
135
+ while (last_same_stride_layer < strides_size) and (
136
+ options["strides"][last_same_stride_layer]
137
+ == options["strides"][layer_id]
138
+ ):
139
+ scale = self.calculate_scale(
140
+ options["min_scale"],
141
+ options["max_scale"],
142
+ last_same_stride_layer,
143
+ strides_size,
144
+ )
145
+
146
+ if (
147
+ last_same_stride_layer == 0
148
+ and options["reduce_boxes_in_lowest_layer"]
149
+ ):
150
+ # For first layer, it can be specified to use predefined anchors.
151
+ aspect_ratios.append(1.0)
152
+ aspect_ratios.append(2.0)
153
+ aspect_ratios.append(0.5)
154
+ scales.append(0.1)
155
+ scales.append(scale)
156
+ scales.append(scale)
157
+ else:
158
+ for aspect_ratio in options["aspect_ratios"]:
159
+ aspect_ratios.append(aspect_ratio)
160
+ scales.append(scale)
161
+
162
+ if options["interpolated_scale_aspect_ratio"] > 0.0:
163
+ scale_next = (
164
+ 1.0
165
+ if last_same_stride_layer == strides_size - 1
166
+ else self.calculate_scale(
167
+ options["min_scale"],
168
+ options["max_scale"],
169
+ last_same_stride_layer + 1,
170
+ strides_size,
171
+ )
172
+ )
173
+ scales.append(np.sqrt(scale * scale_next))
174
+ aspect_ratios.append(options["interpolated_scale_aspect_ratio"])
175
+
176
+ last_same_stride_layer += 1
177
+
178
+ for i in range(len(aspect_ratios)):
179
+ ratio_sqrts = np.sqrt(aspect_ratios[i])
180
+ anchor_height.append(scales[i] / ratio_sqrts)
181
+ anchor_width.append(scales[i] * ratio_sqrts)
182
+
183
+ stride = options["strides"][layer_id]
184
+ feature_map_height = int(np.ceil(options["input_size_height"] / stride))
185
+ feature_map_width = int(np.ceil(options["input_size_width"] / stride))
186
+
187
+ for y in range(feature_map_height):
188
+ for x in range(feature_map_width):
189
+ for anchor_id in range(len(anchor_height)):
190
+ x_center = (x + options["anchor_offset_x"]) / feature_map_width
191
+ y_center = (y + options["anchor_offset_y"]) / feature_map_height
192
+
193
+ new_anchor = [x_center, y_center, 0, 0]
194
+ if options["fixed_anchor_size"]:
195
+ new_anchor[2] = 1.0
196
+ new_anchor[3] = 1.0
197
+ else:
198
+ new_anchor[2] = anchor_width[anchor_id]
199
+ new_anchor[3] = anchor_height[anchor_id]
200
+ anchors.append(new_anchor)
201
+
202
+ layer_id = last_same_stride_layer
203
+
204
+ return anchors
205
+
206
+ def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
207
+ """The output of the neural network is a tensor of shape (b, 896, 16)
208
+ containing the bounding box regressor predictions, as well as a tensor
209
+ of shape (b, 896, 1) with the classification confidences.
210
+
211
+ This function converts these two "raw" tensors into proper detections.
212
+ Returns a list of (num_detections, 17) tensors, one for each image in
213
+ the batch.
214
+
215
+ This is based on the source code from:
216
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.cc
217
+ mediapipe/calculators/tflite/tflite_tensors_to_detections_calculator.proto
218
+ """
219
+ assert raw_box_tensor.ndim == 3
220
+ assert raw_box_tensor.shape[1] == self.num_anchors
221
+ assert raw_box_tensor.shape[2] == self.num_coords
222
+
223
+ assert raw_score_tensor.ndim == 3
224
+ assert raw_score_tensor.shape[1] == self.num_anchors
225
+ assert raw_score_tensor.shape[2] == self.num_classes
226
+
227
+ assert raw_box_tensor.shape[0] == raw_score_tensor.shape[0]
228
+
229
+ detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
230
+
231
+ raw_score_tensor = np.clip(raw_score_tensor, -50, 100)
232
+ detection_scores = 1 / (1 + np.exp(-raw_score_tensor))
233
+ mask = detection_scores >= self.min_score_thresh
234
+ mask = mask[0, :, 0]
235
+ boxes = detection_boxes[0, mask, :]
236
+ scores = detection_scores[0, mask, :]
237
+ return np.concatenate((boxes, scores), axis=-1)
238
+
239
+ def _decode_boxes(self, raw_boxes, anchors):
240
+ """Converts the predictions into actual coordinates using
241
+ the anchor boxes. Processes the entire batch at once.
242
+ """
243
+ boxes = np.zeros_like(raw_boxes)
244
+
245
+ x_center = raw_boxes[..., 0] / self.x_scale * anchors[:, 2] + anchors[:, 0]
246
+ y_center = raw_boxes[..., 1] / self.y_scale * anchors[:, 3] + anchors[:, 1]
247
+
248
+ w = raw_boxes[..., 2] / self.w_scale * anchors[:, 2]
249
+ h = raw_boxes[..., 3] / self.h_scale * anchors[:, 3]
250
+
251
+ boxes[..., 0] = self.x_scale * (x_center - w / 2.0) # xmin
252
+ boxes[..., 1] = self.y_scale * (y_center - h / 2.0) # ymin
253
+ boxes[..., 2] = self.w_scale * (x_center + w / 2.0) # xmax
254
+ boxes[..., 3] = self.h_scale * (y_center + h / 2.0) # ymax
255
+
256
+ for k in range(6):
257
+ offset = 4 + k * 2
258
+ keypoint_x = (
259
+ raw_boxes[..., offset] / self.x_scale * anchors[:, 2] + anchors[:, 0]
260
+ )
261
+ keypoint_y = (
262
+ raw_boxes[..., offset + 1] / self.y_scale * anchors[:, 3]
263
+ + anchors[:, 1]
264
+ )
265
+ boxes[..., offset] = keypoint_x
266
+ boxes[..., offset + 1] = keypoint_y
267
+
268
+ return boxes
269
+
270
+ def _weighted_non_max_suppression(self, detections):
271
+ """The alternative NMS method as mentioned in the BlazeFace paper:
272
+
273
+ "We replace the suppression algorithm with a blending strategy that
274
+ estimates the regression parameters of a bounding box as a weighted
275
+ mean between the overlapping predictions."
276
+
277
+ The original MediaPipe code assigns the score of the most confident
278
+ detection to the weighted detection, but we take the average score
279
+ of the overlapping detections.
280
+
281
+ The input detections should be a NumPy array of shape (count, 17).
282
+
283
+ Returns a list of NumPy arrays, one for each detected face.
284
+
285
+ This is based on the source code from:
286
+ mediapipe/calculators/util/non_max_suppression_calculator.cc
287
+ mediapipe/calculators/util/non_max_suppression_calculator.proto
288
+ """
289
+ if len(detections) == 0:
290
+ return []
291
+
292
+ output_detections = []
293
+
294
+ # Sort the detections from highest to lowest score.
295
+ remaining = np.argsort(detections[:, 16])[::-1]
296
+
297
+ while len(remaining) > 0:
298
+ detection = detections[remaining[0]]
299
+
300
+ # Compute the overlap between the first box and the other
301
+ # remaining boxes. (Note that the other_boxes also include
302
+ # the first_box.)
303
+ first_box = detection[:4]
304
+ other_boxes = detections[remaining, :4]
305
+ ious = overlap_similarity(first_box, other_boxes)
306
+
307
+ # If two detections don't overlap enough, they are considered
308
+ # to be from different faces.
309
+ mask = ious > self.min_suppression_threshold
310
+ overlapping = remaining[mask]
311
+ remaining = remaining[~mask]
312
+
313
+ # Take an average of the coordinates from the overlapping
314
+ # detections, weighted by their confidence scores.
315
+ weighted_detection = detection.copy()
316
+ if len(overlapping) > 1:
317
+ coordinates = detections[overlapping, :16]
318
+ scores = detections[overlapping, 16:17]
319
+ total_score = scores.sum()
320
+ weighted = (coordinates * scores).sum(axis=0) / total_score
321
+ weighted_detection[:16] = weighted
322
+ weighted_detection[16] = total_score / len(overlapping)
323
+
324
+ output_detections.append(weighted_detection)
325
+
326
+ return output_detections
327
+
328
+ def postprocess(self, raw_boxes, scores):
329
+ detections = self._tensors_to_detections(raw_boxes, scores, self.anchors)
330
+
331
+ detections = self._weighted_non_max_suppression(detections)
332
+ detections = np.array(detections)
333
+ return detections
334
+
335
+
336
+ if __name__ == "__main__":
337
+ import argparse
338
+
339
+ parser = argparse.ArgumentParser()
340
+ parser.add_argument("--model", type=str, default="")
341
+ parser.add_argument("--image", type=str, default=None)
342
+ args = parser.parse_args()
343
+
344
+ blaze_face = BlazeFace(args.model)
345
+ image = cv2.imread(args.image)
346
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
347
+ image = cv2.resize(image, (128, 128))
348
+ # __call__ resizes to 128x128 and normalizes to [-1, 1] internally,
349
+ # so the RGB image is passed in as uint8 without extra preprocessing.
350
+ boxes = blaze_face(image)
351
+ print(boxes)
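The `_weighted_non_max_suppression` step merges overlapping detections into a score-weighted average rather than discarding them; a toy, model-free illustration of that blend:

```python
import numpy as np

dets = np.array([
    # xmin, ymin, xmax, ymax, 12 keypoint coords (zeros here), score
    [10, 10, 50, 50] + [0] * 12 + [0.90],
    [12, 11, 52, 49] + [0] * 12 + [0.80],    # overlaps the first detection
    [80, 80, 120, 120] + [0] * 12 + [0.70],  # a separate face
], dtype=np.float32)

scores = dets[:2, 16:17]                     # the two overlapping detections
coords = dets[:2, :16]
blended = (coords * scores).sum(axis=0) / scores.sum()
print(blended[:4])                           # blended box ~ [10.9, 10.5, 50.9, 49.5]
print(scores.mean())                         # averaged confidence 0.85
```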
core/aux_models/face_mesh.py ADDED
@@ -0,0 +1,101 @@
1
+ import cv2
2
+ import numpy as np
3
+
4
+ from ..utils.load_model import load_model
5
+
6
+
7
+ class FaceMesh:
8
+ def __init__(self, model_path, device="cuda"):
9
+ self.model, self.model_type = load_model(model_path, device=device)
10
+ self.input_size = (256, 256) # (w, h)
11
+ self.output_names = [
12
+ "Identity",
13
+ "Identity_1",
14
+ "Identity_2",
15
+ ] # Identity is the mesh
16
+
17
+ def project_landmarks(self, points, roi):
18
+ width, height = self.input_size
19
+ points /= (width, height, width)
20
+ sin, cos = np.sin(roi[4]), np.cos(roi[4])
21
+ matrix = np.array([[cos, sin, 0.0], [-sin, cos, 0.0], [1.0, 1.0, 1.0]])
22
+ points -= (0.5, 0.5, 0.0)
23
+ rotated = np.matmul(points * (1, 1, 0), matrix)
24
+ points *= (0, 0, 1)
25
+ points += rotated
26
+ points *= (roi[2], roi[3], roi[2])
27
+ points += (roi[0], roi[1], 0.0)
28
+ return points
29
+
30
+ def __call__(self, image, roi):
31
+ """
32
+ image: np.ndarray, RGB, (H, W, C), [0, 255]
33
+ roi: np.ndarray, (cx, cy, w, h, rotation), rotation in radian
34
+ """
35
+ cx, cy, w, h = roi[:4]
36
+ w_half, h_half = w / 2, h / 2
37
+ pts = [
38
+ (cx - w_half, cy - h_half),
39
+ (cx + w_half, cy - h_half),
40
+ (cx + w_half, cy + h_half),
41
+ (cx - w_half, cy + h_half),
42
+ ]
43
+ rotation = roi[4]
44
+ s, c = np.sin(rotation), np.cos(rotation)
45
+ t = np.array(pts) - (cx, cy)
46
+ r = np.array([[c, s], [-s, c]])
47
+ src_pts = np.matmul(t, r) + (cx, cy)
48
+ src_pts = src_pts.astype(np.float32)
49
+
50
+ dst_pts = np.array(
51
+ [
52
+ [0.0, 0.0],
53
+ [self.input_size[0], 0.0],
54
+ [self.input_size[0], self.input_size[1]],
55
+ [0.0, self.input_size[1]],
56
+ ]
57
+ ).astype(np.float32)
58
+ M = cv2.getPerspectiveTransform(src_pts, dst_pts)
59
+ roi_image = cv2.warpPerspective(
60
+ image, M, self.input_size, flags=cv2.INTER_LINEAR
61
+ )
62
+ # cv2.imwrite('test.jpg', cv2.cvtColor(roi_image, cv2.COLOR_RGB2BGR))
63
+ roi_image = roi_image / 255.0
64
+ roi_image = roi_image.astype(np.float32)
65
+ roi_image = roi_image[np.newaxis, :, :, :]
66
+
67
+ outputs = {}
68
+ if self.model_type == "onnx":
69
+ out_list = self.model.run(None, {"input": roi_image})
70
+ for i, name in enumerate(self.output_names):
71
+ outputs[name] = out_list[i]
72
+ elif self.model_type == "tensorrt":
73
+ self.model.setup({"input": roi_image})
74
+ self.model.infer()
75
+ for name in self.output_names:
76
+ outputs[name] = self.model.buffer[name][0]
77
+ else:
78
+ raise ValueError(f"Unsupported model type: {self.model_type}")
79
+ points = outputs["Identity"].reshape(1434 // 3, 3)
80
+ points = self.project_landmarks(points, roi)
81
+ return points
82
+
83
+
84
+ if __name__ == "__main__":
85
+ import argparse
86
+
87
+ parser = argparse.ArgumentParser()
88
+ parser.add_argument("--model", type=str, help="model path")
89
+ parser.add_argument("--image", type=str, help="image path")
90
+ parser.add_argument("--device", type=str, default="cuda", help="device")
91
+ args = parser.parse_args()
92
+
93
+ face_mesh = FaceMesh(args.model, args.device)
94
+ image = cv2.imread(args.image, cv2.IMREAD_COLOR)
95
+ image = cv2.resize(image, (256, 256))
96
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
97
+
99
+ roi = np.array([128, 128, 256, 256, np.pi / 2])
100
+ mesh = face_mesh(image, roi)
101
+ print(mesh.shape)
core/aux_models/hubert_stream.py ADDED
@@ -0,0 +1,29 @@
1
+ from ..utils.load_model import load_model
2
+
3
+
4
+ class HubertStreaming:
5
+ def __init__(self, model_path, device="cuda", **kwargs):
6
+ kwargs["model_file"] = model_path
7
+ kwargs["module_name"] = "HubertStreamingONNX"
8
+ kwargs["package_name"] = "..aux_models.modules"
9
+
10
+ self.model, self.model_type = load_model(model_path, device=device, **kwargs)
11
+ self.device = device
12
+
13
+ def forward_chunk(self, audio_chunk):
14
+ if self.model_type == "onnx":
15
+ output = self.model.run(None, {"input_values": audio_chunk.reshape(1, -1)})[0]
16
+ elif self.model_type == "tensorrt":
17
+ self.model.setup({"input_values": audio_chunk.reshape(1, -1)})
18
+ self.model.infer()
19
+ output = self.model.buffer["encoding_out"][0]
20
+ else:
21
+ raise ValueError(f"Unsupported model type: {self.model_type}")
22
+ return output
23
+
24
+ def __call__(self, audio_chunk):
25
+ if self.model_type == "ori":
26
+ output = self.model.forward_chunk(audio_chunk)
27
+ else:
28
+ output = self.forward_chunk(audio_chunk)
29
+ return output
core/aux_models/insightface_det.py ADDED
@@ -0,0 +1,245 @@
1
+ from __future__ import division
2
+ import numpy as np
3
+ import cv2
4
+
5
+ from ..utils.load_model import load_model
6
+
7
+
8
+ def distance2bbox(points, distance, max_shape=None):
9
+ """Decode distance prediction to bounding box.
10
+
11
+ Args:
12
+ points (Tensor): Shape (n, 2), [x, y].
13
+ distance (Tensor): Distance from the given point to 4
14
+ boundaries (left, top, right, bottom).
15
+ max_shape (tuple): Shape of the image.
16
+
17
+ Returns:
18
+ Tensor: Decoded bboxes.
19
+ """
20
+ x1 = points[:, 0] - distance[:, 0]
21
+ y1 = points[:, 1] - distance[:, 1]
22
+ x2 = points[:, 0] + distance[:, 2]
23
+ y2 = points[:, 1] + distance[:, 3]
24
+ if max_shape is not None:
25
+ x1 = x1.clamp(min=0, max=max_shape[1])
26
+ y1 = y1.clamp(min=0, max=max_shape[0])
27
+ x2 = x2.clamp(min=0, max=max_shape[1])
28
+ y2 = y2.clamp(min=0, max=max_shape[0])
29
+ return np.stack([x1, y1, x2, y2], axis=-1)
30
+
31
+
32
+ def distance2kps(points, distance, max_shape=None):
33
+ """Decode distance prediction to keypoints.
34
+
35
+ Args:
36
+ points (Tensor): Shape (n, 2), [x, y].
37
+ distance (Tensor): Offsets from the given point to the keypoints,
38
+ interleaved as (dx1, dy1, dx2, dy2, ...).
39
+ max_shape (tuple): Shape of the image.
40
+
41
+ Returns:
42
+ Tensor: Decoded keypoints.
43
+ """
44
+ preds = []
45
+ for i in range(0, distance.shape[1], 2):
46
+ px = points[:, i%2] + distance[:, i]
47
+ py = points[:, i%2+1] + distance[:, i+1]
48
+ if max_shape is not None:
49
+ px = px.clamp(min=0, max=max_shape[1])
50
+ py = py.clamp(min=0, max=max_shape[0])
51
+ preds.append(px)
52
+ preds.append(py)
53
+ return np.stack(preds, axis=-1)
54
+
55
+
56
+ class InsightFaceDet:
57
+ def __init__(self, model_path, device="cuda", **kwargs):
58
+ kwargs["model_file"] = model_path
59
+ kwargs["module_name"] = "RetinaFace"
60
+ kwargs["package_name"] = "..aux_models.modules"
61
+
62
+ self.model, self.model_type = load_model(model_path, device=device, **kwargs)
63
+ self.device = device
64
+
65
+ if self.model_type != "ori":
66
+ self._init_vars()
67
+
68
+ def _init_vars(self):
69
+ self.center_cache = {}
70
+
71
+ self.nms_thresh = 0.4
72
+ self.det_thresh = 0.5
73
+
74
+ self.input_size = (512, 512)
75
+ self.input_mean = 127.5
76
+ self.input_std = 128.0
77
+ self._anchor_ratio = 1.0
78
+ self.fmc = 3
79
+ self._feat_stride_fpn = [8, 16, 32]
80
+ self._num_anchors = 2
81
+ self.use_kps = True
82
+
83
+ self.output_names = [
84
+ "scores1",
85
+ "scores2",
86
+ "scores3",
87
+ "boxes1",
88
+ "boxes2",
89
+ "boxes3",
90
+ "kps1",
91
+ "kps2",
92
+ "kps3",
93
+ ]
94
+
95
+ def _run_model(self, blob):
96
+ if self.model_type == "onnx":
97
+ net_outs = self.model.run(None, {"image": blob})
98
+ elif self.model_type == "tensorrt":
99
+ self.model.setup({"image": blob})
100
+ self.model.infer()
101
+ net_outs = [self.model.buffer[name][0] for name in self.output_names]
102
+ else:
103
+ raise ValueError(f"Unsupported model type: {self.model_type}")
104
+ return net_outs
105
+
106
+ def _forward(self, img, threshold):
107
+ """
108
+ img: np.ndarray, shape (h, w, 3)
109
+ """
110
+ scores_list = []
111
+ bboxes_list = []
112
+ kpss_list = []
113
+ input_size = tuple(img.shape[0:2][::-1])
114
+ blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
115
+ # (1, 3, 512, 512)
116
+ net_outs = self._run_model(blob)
117
+
118
+ input_height = blob.shape[2]
119
+ input_width = blob.shape[3]
120
+ fmc = self.fmc
121
+ for idx, stride in enumerate(self._feat_stride_fpn):
122
+ scores = net_outs[idx]
123
+ bbox_preds = net_outs[idx+fmc]
124
+ bbox_preds = bbox_preds * stride
125
+ if self.use_kps:
126
+ kps_preds = net_outs[idx+fmc*2] * stride
127
+ height = input_height // stride
128
+ width = input_width // stride
129
+ # K = height * width
130
+ key = (height, width, stride)
131
+ if key in self.center_cache:
132
+ anchor_centers = self.center_cache[key]
133
+ else:
134
+ #solution-3:
135
+ anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
136
+ anchor_centers = (anchor_centers * stride).reshape( (-1, 2) )
137
+ if self._num_anchors>1:
138
+ anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) )
139
+ if len(self.center_cache)<100:
140
+ self.center_cache[key] = anchor_centers
141
+
142
+ pos_inds = np.where(scores>=threshold)[0]
143
+ bboxes = distance2bbox(anchor_centers, bbox_preds)
144
+ pos_scores = scores[pos_inds]
145
+ pos_bboxes = bboxes[pos_inds]
146
+ scores_list.append(pos_scores)
147
+ bboxes_list.append(pos_bboxes)
148
+ if self.use_kps:
149
+ kpss = distance2kps(anchor_centers, kps_preds)
150
+ kpss = kpss.reshape( (kpss.shape[0], -1, 2) )
151
+ pos_kpss = kpss[pos_inds]
152
+ kpss_list.append(pos_kpss)
153
+ return scores_list, bboxes_list, kpss_list
154
+
155
+ def detect(self, img, input_size=None, max_num=0, metric='default', det_thresh=None):
156
+ input_size = self.input_size if input_size is None else input_size
157
+ det_thresh = self.det_thresh if det_thresh is None else det_thresh
158
+
159
+ im_ratio = float(img.shape[0]) / img.shape[1]
160
+ model_ratio = float(input_size[1]) / input_size[0]
161
+ if im_ratio>model_ratio:
162
+ new_height = input_size[1]
163
+ new_width = int(new_height / im_ratio)
164
+ else:
165
+ new_width = input_size[0]
166
+ new_height = int(new_width * im_ratio)
167
+ det_scale = float(new_height) / img.shape[0]
168
+ resized_img = cv2.resize(img, (new_width, new_height))
169
+ det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 )
170
+ det_img[:new_height, :new_width, :] = resized_img
171
+
172
+ scores_list, bboxes_list, kpss_list = self._forward(det_img, det_thresh)
173
+
174
+ scores = np.vstack(scores_list)
175
+ scores_ravel = scores.ravel()
176
+ order = scores_ravel.argsort()[::-1]
177
+ bboxes = np.vstack(bboxes_list) / det_scale
178
+ if self.use_kps:
179
+ kpss = np.vstack(kpss_list) / det_scale
180
+ pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False)
181
+ pre_det = pre_det[order, :]
182
+ keep = self.nms(pre_det)
183
+ det = pre_det[keep, :]
184
+ if self.use_kps:
185
+ kpss = kpss[order,:,:]
186
+ kpss = kpss[keep,:,:]
187
+ else:
188
+ kpss = None
189
+ if max_num > 0 and det.shape[0] > max_num:
190
+ area = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1])
191
+ img_center = img.shape[0] // 2, img.shape[1] // 2
192
+ offsets = np.vstack([
193
+ (det[:, 0] + det[:, 2]) / 2 - img_center[1],
194
+ (det[:, 1] + det[:, 3]) / 2 - img_center[0]
195
+ ])
196
+ offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
197
+ if metric=='max':
198
+ values = area
199
+ else:
200
+ values = area - offset_dist_squared * 2.0 # some extra weight on the centering
201
+ bindex = np.argsort(values)[::-1] # some extra weight on the centering
202
+ bindex = bindex[0:max_num]
203
+ det = det[bindex, :]
204
+ if kpss is not None:
205
+ kpss = kpss[bindex, :]
206
+ return det, kpss
207
+
208
+ def nms(self, dets):
209
+ thresh = self.nms_thresh
210
+ x1 = dets[:, 0]
211
+ y1 = dets[:, 1]
212
+ x2 = dets[:, 2]
213
+ y2 = dets[:, 3]
214
+ scores = dets[:, 4]
215
+
216
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
217
+ order = scores.argsort()[::-1]
218
+
219
+ keep = []
220
+ while order.size > 0:
221
+ i = order[0]
222
+ keep.append(i)
223
+ xx1 = np.maximum(x1[i], x1[order[1:]])
224
+ yy1 = np.maximum(y1[i], y1[order[1:]])
225
+ xx2 = np.minimum(x2[i], x2[order[1:]])
226
+ yy2 = np.minimum(y2[i], y2[order[1:]])
227
+
228
+ w = np.maximum(0.0, xx2 - xx1 + 1)
229
+ h = np.maximum(0.0, yy2 - yy1 + 1)
230
+ inter = w * h
231
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
232
+
233
+ inds = np.where(ovr <= thresh)[0]
234
+ order = order[inds + 1]
235
+
236
+ return keep
237
+
238
+ def __call__(self, img, **kwargs):
239
+ if self.model_type == "ori":
240
+ det, kpss = self.model.detect(img, **kwargs)
241
+ else:
242
+ det, kpss = self.detect(img, **kwargs)
243
+
244
+ return det, kpss
245
+
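A toy check of the anchor-free box decoding in `distance2bbox`: each anchor centre predicts stride-scaled distances to the four box edges:

```python
import numpy as np

stride = 8
anchor_centers = np.array([[16.0, 16.0], [24.0, 16.0]])   # (n, 2) pixel centres
bbox_preds = np.array([[1.0, 1.0, 2.0, 2.0],
                       [0.5, 0.5, 0.5, 0.5]]) * stride     # (n, 4) edge distances

x1 = anchor_centers[:, 0] - bbox_preds[:, 0]
y1 = anchor_centers[:, 1] - bbox_preds[:, 1]
x2 = anchor_centers[:, 0] + bbox_preds[:, 2]
y2 = anchor_centers[:, 1] + bbox_preds[:, 3]
print(np.stack([x1, y1, x2, y2], axis=-1))
# [[ 8.  8. 32. 32.]
#  [20. 12. 28. 20.]]
```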
core/aux_models/insightface_landmark106.py ADDED
@@ -0,0 +1,100 @@
1
+ from __future__ import division
2
+ import numpy as np
3
+ import torch
4
+ import cv2
5
+ from skimage import transform as trans
6
+
7
+ from ..utils.load_model import load_model
8
+
9
+
10
+ def transform(data, center, output_size, scale, rotation):
11
+ scale_ratio = scale
12
+ rot = float(rotation) * np.pi / 180.0
13
+
14
+ t1 = trans.SimilarityTransform(scale=scale_ratio)
15
+ cx = center[0] * scale_ratio
16
+ cy = center[1] * scale_ratio
17
+ t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy))
18
+ t3 = trans.SimilarityTransform(rotation=rot)
19
+ t4 = trans.SimilarityTransform(translation=(output_size / 2,
20
+ output_size / 2))
21
+ t = t1 + t2 + t3 + t4
22
+ M = t.params[0:2]
23
+ cropped = cv2.warpAffine(data,
24
+ M, (output_size, output_size),
25
+ borderValue=0.0)
26
+ return cropped, M
27
+
28
+
29
+ def trans_points2d(pts, M):
30
+ new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
31
+ for i in range(pts.shape[0]):
32
+ pt = pts[i]
33
+ new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
34
+ new_pt = np.dot(M, new_pt)
35
+ new_pts[i] = new_pt[0:2]
36
+
37
+ return new_pts
38
+
39
+
40
+ class Landmark106:
41
+ def __init__(self, model_path, device="cuda", **kwargs):
42
+ kwargs["model_file"] = model_path
43
+ kwargs["module_name"] = "Landmark106"
44
+ kwargs["package_name"] = "..aux_models.modules"
45
+
46
+ self.model, self.model_type = load_model(model_path, device=device, **kwargs)
47
+ self.device = device
48
+
49
+ if self.model_type != "ori":
50
+ self._init_vars()
51
+
52
+ def _init_vars(self):
53
+ self.input_mean = 0.0
54
+ self.input_std = 1.0
55
+ self.input_size = (192, 192)
56
+ self.lmk_num = 106
57
+
58
+ self.output_names = ["fc1"]
59
+
60
+ def _run_model(self, blob):
61
+ if self.model_type == "onnx":
62
+ pred = self.model.run(None, {"data": blob})[0]
63
+ elif self.model_type == "tensorrt":
64
+ self.model.setup({"data": blob})
65
+ self.model.infer()
66
+ pred = self.model.buffer[self.output_names[0]][0]
67
+ else:
68
+ raise ValueError(f"Unsupported model type: {self.model_type}")
69
+ return pred
70
+
71
+ def get(self, img, bbox):
72
+ w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1])
73
+ center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2
74
+ rotate = 0
75
+ _scale = self.input_size[0] / (max(w, h)*1.5)
76
+
77
+ aimg, M = transform(img, center, self.input_size[0], _scale, rotate)
78
+ input_size = tuple(aimg.shape[0:2][::-1])
79
+
80
+ blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
81
+
82
+ pred = self._run_model(blob)
83
+
84
+ pred = pred.reshape((-1, 2))
85
+ if self.lmk_num < pred.shape[0]:
86
+ pred = pred[self.lmk_num*-1:,:]
87
+ pred[:, 0:2] += 1
88
+ pred[:, 0:2] *= (self.input_size[0] // 2)
89
+
90
+ IM = cv2.invertAffineTransform(M)
91
+ pred = trans_points2d(pred, IM)
92
+ return pred
93
+
94
+ def __call__(self, img, bbox):
95
+ if self.model_type == "ori":
96
+ pred = self.model.get(img, bbox)
97
+ else:
98
+ pred = self.get(img, bbox)
99
+
100
+ return pred
core/aux_models/landmark203.py ADDED
@@ -0,0 +1,58 @@
1
+ import numpy as np
2
+ from ..utils.load_model import load_model
3
+
4
+
5
+ def _transform_pts(pts, M):
6
+ """ conduct similarity or affine transformation to the pts
7
+ pts: Nx2 ndarray
8
+ M: 2x3 matrix or 3x3 matrix
9
+ return: Nx2
10
+ """
11
+ return pts @ M[:2, :2].T + M[:2, 2]
12
+
13
+
14
+ class Landmark203:
15
+ def __init__(self, model_path, device="cuda", **kwargs):
16
+ kwargs["model_file"] = model_path
17
+ kwargs["module_name"] = "Landmark203"
18
+ kwargs["package_name"] = "..aux_models.modules"
19
+
20
+ self.model, self.model_type = load_model(model_path, device=device, **kwargs)
21
+ self.device = device
22
+
23
+ self.output_names = ["landmarks"]
24
+ self.dsize = 224
25
+
26
+ def _run_model(self, inp):
27
+ if self.model_type == "onnx":
28
+ out_pts = self.model.run(None, {"input": inp})[0]
29
+ elif self.model_type == "tensorrt":
30
+ self.model.setup({"input": inp})
31
+ self.model.infer()
32
+ out_pts = self.model.buffer[self.output_names[0]][0]
33
+ else:
34
+ raise ValueError(f"Unsupported model type: {self.model_type}")
35
+ return out_pts
36
+
37
+ def run(self, img_crop_rgb, M_c2o=None):
38
+ # img_crop_rgb: 224x224
39
+
40
+ inp = (img_crop_rgb.astype(np.float32) / 255.).transpose(2, 0, 1)[None, ...] # HxWx3 RGB uint8 -> 1x3xHxW float32 in [0, 1]
41
+
42
+ out_pts = self._run_model(inp)
43
+
44
+ # 2d landmarks 203 points
45
+ lmk = out_pts[0].reshape(-1, 2) * self.dsize # scale to 0-224
46
+ if M_c2o is not None:
47
+ lmk = _transform_pts(lmk, M=M_c2o)
48
+
49
+ return lmk
50
+
51
+ def __call__(self, img_crop_rgb, M_c2o=None):
52
+ if self.model_type == "ori":
53
+ lmk = self.model.run(img_crop_rgb, M_c2o)
54
+ else:
55
+ lmk = self.run(img_crop_rgb, M_c2o)
56
+
57
+ return lmk
58
+
core/aux_models/mediapipe_landmark478.py ADDED
@@ -0,0 +1,118 @@
1
+ from enum import Enum
2
+ import numpy as np
3
+
4
+ from ..utils.load_model import load_model
5
+ from .blaze_face import BlazeFace
6
+ from .face_mesh import FaceMesh
7
+
8
+
9
+ class SizeMode(Enum):
10
+ DEFAULT = 0
11
+ SQUARE_LONG = 1
12
+ SQUARE_SHORT = 2
13
+
14
+
15
+ def _select_roi_size(
16
+ bbox: np.ndarray, image_size, size_mode: SizeMode # x1, y1, x2, y2 # w,h
17
+ ):
18
+ """Return the size of an ROI based on bounding box, image size and mode"""
19
+ width, height = bbox[2] - bbox[0], bbox[3] - bbox[1]
20
+ image_width, image_height = image_size
21
+ if size_mode == SizeMode.SQUARE_LONG:
22
+ long_size = max(width, height)
23
+ width, height = long_size, long_size
24
+ elif size_mode == SizeMode.SQUARE_SHORT:
25
+ short_side = min(width, height)
26
+ width, height = short_side, short_side
27
+ return width, height
28
+
29
+
30
+ def bbox_to_roi(
31
+ bbox: np.ndarray,
32
+ image_size, # w,h
33
+ rotation_keypoints=None,
34
+ scale=(1.0, 1.0), # w, h
35
+ size_mode: SizeMode = SizeMode.SQUARE_LONG,
36
+ ):
37
+ PI = np.pi
38
+ TWO_PI = 2 * np.pi
39
+ # select ROI dimensions
40
+ width, height = _select_roi_size(bbox, image_size, size_mode)
41
+ scale_x, scale_y = scale
42
+ # calculate ROI size and -centre
43
+ width, height = width * scale_x, height * scale_y
44
+ cx = (bbox[0] + bbox[2]) / 2
45
+ cy = (bbox[1] + bbox[3]) / 2
46
+ # calculate rotation of required
47
+ if rotation_keypoints is None or len(rotation_keypoints) < 2:
48
+ return np.array([cx, cy, width, height, 0])
49
+ x0, y0 = rotation_keypoints[0]
50
+ x1, y1 = rotation_keypoints[1]
51
+ angle = -np.arctan2(y0 - y1, x1 - x0)
52
+ # normalise to [0, 2*PI]
53
+ rotation = angle - TWO_PI * np.floor((angle + PI) / TWO_PI)
54
+ return np.array([cx, cy, width, height, rotation])
55
+
56
+
57
+ class Landmark478:
58
+ def __init__(self, blaze_face_model_path="", face_mesh_model_path="", device="cuda", **kwargs):
59
+ if kwargs.get("force_ori_type", False):
60
+ assert "task_path" in kwargs
61
+ kwargs["module_name"] = "Landmark478"
62
+ kwargs["package_name"] = "..aux_models.modules"
63
+ self.model, self.model_type = load_model("", device=device, **kwargs)
64
+ else:
65
+ self.blaze_face = BlazeFace(blaze_face_model_path, device)
66
+ self.face_mesh = FaceMesh(face_mesh_model_path, device)
67
+ self.model_type = ""
68
+
69
+ def get(self, image):
70
+ bboxes = self.blaze_face(image)
71
+ if len(bboxes) == 0:
72
+ return None
73
+ bbox = bboxes[0]
74
+ scale = (image.shape[1] / 128.0, image.shape[0] / 128.0)
75
+
76
+ # The first 4 numbers describe the bounding box corners:
77
+ #
78
+ # ymin, xmin, ymax, xmax
79
+ # These are normalized coordinates (between 0 and 1).
80
+ # The next 12 numbers are the x,y-coordinates of the 6 facial landmark keypoints:
81
+ #
82
+ # right_eye_x, right_eye_y
83
+ # left_eye_x, left_eye_y
84
+ # nose_x, nose_y
85
+ # mouth_x, mouth_y
86
+ # right_ear_x, right_ear_y
87
+ # left_ear_x, left_ear_y
88
+ # Tip: these labeled as seen from the perspective of the person, so their right is your left.
89
+ # The final number is the confidence score that this detection really is a face.
90
+
91
+ bbox[0] = bbox[0] * scale[1]
92
+ bbox[1] = bbox[1] * scale[0]
93
+ bbox[2] = bbox[2] * scale[1]
94
+ bbox[3] = bbox[3] * scale[0]
95
+ left_eye = (bbox[4], bbox[5])
96
+ right_eye = (bbox[6], bbox[7])
97
+
98
+ roi = bbox_to_roi(
99
+ bbox,
100
+ (image.shape[1], image.shape[0]),
101
+ rotation_keypoints=[left_eye, right_eye],
102
+ scale=(1.5, 1.5),
103
+ size_mode=SizeMode.SQUARE_LONG,
104
+ )
105
+
106
+ mesh = self.face_mesh(image, roi)
107
+ mesh = mesh / (image.shape[1], image.shape[0], image.shape[1])
108
+ return mesh
109
+
110
+ def __call__(self, image):
111
+ if self.model_type == "ori":
112
+ det = self.model.detect_from_npimage(image.copy())
113
+ lmk = self.model.mplmk_to_nplmk(det)
114
+ return lmk
115
+ else:
116
+ lmk = self.get(image)
117
+ lmk = lmk.reshape(1, -1, 3).astype(np.float32)
118
+ return lmk
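The ROI rotation handed to FaceMesh comes from the eye line, so the crop is roughly upright; a small numeric check of that angle computation with dummy eye coordinates:

```python
import numpy as np

left_eye, right_eye = (40.0, 52.0), (80.0, 48.0)          # dummy (x, y) keypoints
x0, y0 = left_eye
x1, y1 = right_eye
angle = -np.arctan2(y0 - y1, x1 - x0)                     # same formula as bbox_to_roi
rotation = angle - 2 * np.pi * np.floor((angle + np.pi) / (2 * np.pi))
print(np.degrees(rotation))                               # ~ -5.7 degrees of roll
```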
core/aux_models/modules/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .retinaface import RetinaFace
2
+ from .landmark106 import Landmark106
3
+ from .landmark203 import Landmark203
4
+ from .landmark478 import Landmark478
5
+ from .hubert_stream import HubertStreamingONNX
core/aux_models/modules/hubert_stream.py ADDED
@@ -0,0 +1,21 @@
1
+
2
+ import onnxruntime
3
+
4
+
5
+ class HubertStreamingONNX:
6
+ def __init__(self, model_file, device="cuda"):
7
+ if device == "cuda":
8
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
9
+ else:
10
+ providers = ["CPUExecutionProvider"]
11
+
12
+ self.session = onnxruntime.InferenceSession(model_file, providers=providers)
13
+
14
+ def forward_chunk(self, input_values):
15
+ encoding_out = self.session.run(
16
+ None,
17
+ {"input_values": input_values.reshape(1, -1)}
18
+ )[0]
19
+ return encoding_out
20
+
21
+
core/aux_models/modules/landmark106.py ADDED
@@ -0,0 +1,83 @@
1
+ # insightface
2
+ from __future__ import division
3
+ import onnxruntime
4
+ import cv2
5
+ import numpy as np
6
+ from skimage import transform as trans
7
+
8
+
9
+ def transform(data, center, output_size, scale, rotation):
10
+ scale_ratio = scale
11
+ rot = float(rotation) * np.pi / 180.0
12
+
13
+ t1 = trans.SimilarityTransform(scale=scale_ratio)
14
+ cx = center[0] * scale_ratio
15
+ cy = center[1] * scale_ratio
16
+ t2 = trans.SimilarityTransform(translation=(-1 * cx, -1 * cy))
17
+ t3 = trans.SimilarityTransform(rotation=rot)
18
+ t4 = trans.SimilarityTransform(translation=(output_size / 2,
19
+ output_size / 2))
20
+ t = t1 + t2 + t3 + t4
21
+ M = t.params[0:2]
22
+ cropped = cv2.warpAffine(data,
23
+ M, (output_size, output_size),
24
+ borderValue=0.0)
25
+ return cropped, M
26
+
27
+
28
+ def trans_points2d(pts, M):
29
+ new_pts = np.zeros(shape=pts.shape, dtype=np.float32)
30
+ for i in range(pts.shape[0]):
31
+ pt = pts[i]
32
+ new_pt = np.array([pt[0], pt[1], 1.], dtype=np.float32)
33
+ new_pt = np.dot(M, new_pt)
34
+ new_pts[i] = new_pt[0:2]
35
+
36
+ return new_pts
37
+
38
+
39
+
40
+ class Landmark106:
41
+ def __init__(self, model_file, device="cuda"):
42
+ if device == "cuda":
43
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
44
+ else:
45
+ providers = ["CPUExecutionProvider"]
46
+ self.session = onnxruntime.InferenceSession(model_file, providers=providers)
47
+
48
+ self.input_mean = 0.0
49
+ self.input_std = 1.0
50
+ self.input_size = (192, 192)
51
+ input_cfg = self.session.get_inputs()[0]
52
+ input_name = input_cfg.name
53
+ outputs = self.session.get_outputs()
54
+ output_names = []
55
+ for out in outputs:
56
+ output_names.append(out.name)
57
+ self.input_name = input_name
58
+ self.output_names = output_names
59
+ self.lmk_num = 106
60
+
61
+ def get(self, img, bbox):
62
+ w, h = (bbox[2] - bbox[0]), (bbox[3] - bbox[1])
63
+ center = (bbox[2] + bbox[0]) / 2, (bbox[3] + bbox[1]) / 2
64
+ rotate = 0
65
+ _scale = self.input_size[0] / (max(w, h)*1.5)
66
+
67
+ aimg, M = transform(img, center, self.input_size[0], _scale, rotate)
68
+ input_size = tuple(aimg.shape[0:2][::-1])
69
+
70
+ blob = cv2.dnn.blobFromImage(aimg, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
71
+
72
+ pred = self.session.run(self.output_names, {self.input_name : blob})[0][0]
73
+
74
+ pred = pred.reshape((-1, 2))
75
+ if self.lmk_num < pred.shape[0]:
76
+ pred = pred[self.lmk_num*-1:,:]
77
+ pred[:, 0:2] += 1
78
+ pred[:, 0:2] *= (self.input_size[0] // 2)
79
+
80
+ IM = cv2.invertAffineTransform(M)
81
+ pred = trans_points2d(pred, IM)
82
+ return pred
83
+
core/aux_models/modules/landmark203.py ADDED
@@ -0,0 +1,42 @@
+ import onnxruntime
+ import numpy as np
+
+
+ def _transform_pts(pts, M):
+     """ conduct similarity or affine transformation to the pts
+     pts: Nx2 ndarray
+     M: 2x3 matrix or 3x3 matrix
+     return: Nx2
+     """
+     return pts @ M[:2, :2].T + M[:2, 2]
+
+
+ class Landmark203:
+     def __init__(self, model_file, device="cuda"):
+         if device == "cuda":
+             providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
+         else:
+             providers = ["CPUExecutionProvider"]
+         self.session = onnxruntime.InferenceSession(model_file, providers=providers)
+
+         self.dsize = 224
+
+     def _run(self, inp):
+         out = self.session.run(None, {'input': inp})
+         return out
+
+     def run(self, img_crop_rgb, M_c2o=None):
+         # img_crop_rgb: 224x224
+
+         inp = (img_crop_rgb.astype(np.float32) / 255.).transpose(2, 0, 1)[None, ...]  # HxWx3 (RGB) -> 1x3xHxW
+
+         out_lst = self._run(inp)
+         out_pts = out_lst[2]
+
+         # 2d landmarks 203 points
+         lmk = out_pts[0].reshape(-1, 2) * self.dsize  # scale to 0-224
+         if M_c2o is not None:
+             lmk = _transform_pts(lmk, M=M_c2o)
+
+         return lmk
+
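A usage sketch, assuming the 203-point ONNX model and a 224x224 RGB face crop (the resize below is only a stand-in for a real crop); `M_c2o`, the crop-to-original transform, is optional and only needed to map landmarks back to the full frame:

    import cv2

    lmk203 = Landmark203("checkpoints/landmark203.onnx", device="cpu")  # hypothetical path
    crop_bgr = cv2.resize(cv2.imread("example/image.png"), (224, 224))  # stand-in for a face crop
    crop_rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
    lmk = lmk203.run(crop_rgb)           # (203, 2) in crop coordinates, range 0-224
    # lmk = lmk203.run(crop_rgb, M_c2o)  # with a 2x3/3x3 matrix, coordinates land in the original frame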
core/aux_models/modules/landmark478.py ADDED
@@ -0,0 +1,35 @@
+ import numpy as np
+ import mediapipe as mp
+ from mediapipe.tasks.python import vision, BaseOptions
+
+
+ class Landmark478:
+     def __init__(self, task_path):
+         base_options = BaseOptions(model_asset_path=task_path)
+         options = vision.FaceLandmarkerOptions(
+             base_options=base_options,
+             output_face_blendshapes=True,
+             output_facial_transformation_matrixes=True,
+             num_faces=1,
+         )
+         detector = vision.FaceLandmarker.create_from_options(options)
+         self.detector = detector
+
+     def detect_from_imp(self, imp):
+         image = mp.Image.create_from_file(imp)
+         detection_result = self.detector.detect(image)
+         return detection_result
+
+     def detect_from_npimage(self, img):
+         image = mp.Image(image_format=mp.ImageFormat.SRGB, data=img)
+         detection_result = self.detector.detect(image)
+         return detection_result
+
+     @staticmethod
+     def mplmk_to_nplmk(results):
+         face_landmarks_list = results.face_landmarks
+         np_lms = []
+         for face_lms in face_landmarks_list:
+             lms = [[lm.x, lm.y, lm.z] for lm in face_lms]
+             np_lms.append(lms)
+         return np.array(np_lms).astype(np.float32)
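A usage sketch, assuming the MediaPipe FaceLandmarker `.task` asset is available (the path is a placeholder); MediaPipe returns normalized coordinates:

    import cv2

    lmk478 = Landmark478("checkpoints/face_landmarker.task")  # hypothetical path
    img_rgb = cv2.cvtColor(cv2.imread("example/image.png"), cv2.COLOR_BGR2RGB)
    result = lmk478.detect_from_npimage(img_rgb)
    lms = Landmark478.mplmk_to_nplmk(result)
    print(lms.shape)  # (num_faces, 478, 3), x/y normalized to [0, 1]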
core/aux_models/modules/retinaface.py ADDED
@@ -0,0 +1,215 @@
1
+ # insightface
2
+ from __future__ import division
3
+ import onnxruntime
4
+ import cv2
5
+ import numpy as np
6
+
7
+
8
+ def distance2bbox(points, distance, max_shape=None):
9
+ """Decode distance prediction to bounding box.
10
+
11
+ Args:
12
+ points (Tensor): Shape (n, 2), [x, y].
13
+ distance (Tensor): Distance from the given point to 4
14
+ boundaries (left, top, right, bottom).
15
+ max_shape (tuple): Shape of the image.
16
+
17
+ Returns:
18
+ Tensor: Decoded bboxes.
19
+ """
20
+ x1 = points[:, 0] - distance[:, 0]
21
+ y1 = points[:, 1] - distance[:, 1]
22
+ x2 = points[:, 0] + distance[:, 2]
23
+ y2 = points[:, 1] + distance[:, 3]
24
+ if max_shape is not None:
25
+ x1 = x1.clamp(min=0, max=max_shape[1])
26
+ y1 = y1.clamp(min=0, max=max_shape[0])
27
+ x2 = x2.clamp(min=0, max=max_shape[1])
28
+ y2 = y2.clamp(min=0, max=max_shape[0])
29
+ return np.stack([x1, y1, x2, y2], axis=-1)
30
+
31
+
32
+ def distance2kps(points, distance, max_shape=None):
33
+ """Decode distance prediction to bounding box.
34
+
35
+ Args:
36
+ points (Tensor): Shape (n, 2), [x, y].
37
+ distance (Tensor): Distance from the given point to 4
38
+ boundaries (left, top, right, bottom).
39
+ max_shape (tuple): Shape of the image.
40
+
41
+ Returns:
42
+ Tensor: Decoded keypoints.
43
+ """
44
+ preds = []
45
+ for i in range(0, distance.shape[1], 2):
46
+ px = points[:, i%2] + distance[:, i]
47
+ py = points[:, i%2+1] + distance[:, i+1]
48
+ if max_shape is not None:
49
+ px = px.clamp(min=0, max=max_shape[1])
50
+ py = py.clamp(min=0, max=max_shape[0])
51
+ preds.append(px)
52
+ preds.append(py)
53
+ return np.stack(preds, axis=-1)
54
+
55
+
56
+ class RetinaFace:
57
+ def __init__(self, model_file, device="cuda"):
58
+ if device == "cuda":
59
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
60
+ else:
61
+ providers = ["CPUExecutionProvider"]
62
+ self.session = onnxruntime.InferenceSession(model_file, providers=providers)
63
+
64
+ self.center_cache = {}
65
+ self.nms_thresh = 0.4
66
+ self.det_thresh = 0.5
67
+ self._init_vars()
68
+
69
+ def _init_vars(self):
70
+ self.input_size = (512, 512)
71
+ input_cfg = self.session.get_inputs()[0]
72
+ input_name = input_cfg.name
73
+ outputs = self.session.get_outputs()
74
+ output_names = []
75
+ for o in outputs:
76
+ output_names.append(o.name)
77
+ self.input_name = input_name
78
+ self.output_names = output_names
79
+ self.input_mean = 127.5
80
+ self.input_std = 128.0
81
+ self._anchor_ratio = 1.0
82
+ self.fmc = 3
83
+ self._feat_stride_fpn = [8, 16, 32]
84
+ self._num_anchors = 2
85
+ self.use_kps = True
86
+
87
+ def forward(self, img, threshold):
88
+ scores_list = []
89
+ bboxes_list = []
90
+ kpss_list = []
91
+ input_size = tuple(img.shape[0:2][::-1])
92
+ blob = cv2.dnn.blobFromImage(img, 1.0/self.input_std, input_size, (self.input_mean, self.input_mean, self.input_mean), swapRB=True)
93
+ net_outs = self.session.run(self.output_names, {self.input_name : blob})
94
+
95
+ input_height = blob.shape[2]
96
+ input_width = blob.shape[3]
97
+ fmc = self.fmc
98
+ for idx, stride in enumerate(self._feat_stride_fpn):
99
+ scores = net_outs[idx]
100
+ bbox_preds = net_outs[idx+fmc]
101
+ bbox_preds = bbox_preds * stride
102
+ if self.use_kps:
103
+ kps_preds = net_outs[idx+fmc*2] * stride
104
+ height = input_height // stride
105
+ width = input_width // stride
106
+ # K = height * width
107
+ key = (height, width, stride)
108
+ if key in self.center_cache:
109
+ anchor_centers = self.center_cache[key]
110
+ else:
111
+ #solution-3:
112
+ anchor_centers = np.stack(np.mgrid[:height, :width][::-1], axis=-1).astype(np.float32)
113
+ anchor_centers = (anchor_centers * stride).reshape( (-1, 2) )
114
+ if self._num_anchors>1:
115
+ anchor_centers = np.stack([anchor_centers]*self._num_anchors, axis=1).reshape( (-1,2) )
116
+ if len(self.center_cache)<100:
117
+ self.center_cache[key] = anchor_centers
118
+
119
+ pos_inds = np.where(scores>=threshold)[0]
120
+ bboxes = distance2bbox(anchor_centers, bbox_preds)
121
+ pos_scores = scores[pos_inds]
122
+ pos_bboxes = bboxes[pos_inds]
123
+ scores_list.append(pos_scores)
124
+ bboxes_list.append(pos_bboxes)
125
+ if self.use_kps:
126
+ kpss = distance2kps(anchor_centers, kps_preds)
127
+ kpss = kpss.reshape( (kpss.shape[0], -1, 2) )
128
+ pos_kpss = kpss[pos_inds]
129
+ kpss_list.append(pos_kpss)
130
+ return scores_list, bboxes_list, kpss_list
131
+
132
+
133
+ def detect(self, img, input_size=None, max_num=0, metric='default', det_thresh=None):
134
+ input_size = self.input_size if input_size is None else input_size
135
+ det_thresh = self.det_thresh if det_thresh is None else det_thresh
136
+
137
+ im_ratio = float(img.shape[0]) / img.shape[1]
138
+ model_ratio = float(input_size[1]) / input_size[0]
139
+ if im_ratio>model_ratio:
140
+ new_height = input_size[1]
141
+ new_width = int(new_height / im_ratio)
142
+ else:
143
+ new_width = input_size[0]
144
+ new_height = int(new_width * im_ratio)
145
+ det_scale = float(new_height) / img.shape[0]
146
+ resized_img = cv2.resize(img, (new_width, new_height))
147
+ det_img = np.zeros( (input_size[1], input_size[0], 3), dtype=np.uint8 )
148
+ det_img[:new_height, :new_width, :] = resized_img
149
+
150
+ scores_list, bboxes_list, kpss_list = self.forward(det_img, det_thresh)
151
+
152
+ scores = np.vstack(scores_list)
153
+ scores_ravel = scores.ravel()
154
+ order = scores_ravel.argsort()[::-1]
155
+ bboxes = np.vstack(bboxes_list) / det_scale
156
+ if self.use_kps:
157
+ kpss = np.vstack(kpss_list) / det_scale
158
+ pre_det = np.hstack((bboxes, scores)).astype(np.float32, copy=False)
159
+ pre_det = pre_det[order, :]
160
+ keep = self.nms(pre_det)
161
+ det = pre_det[keep, :]
162
+ if self.use_kps:
163
+ kpss = kpss[order,:,:]
164
+ kpss = kpss[keep,:,:]
165
+ else:
166
+ kpss = None
167
+ if max_num > 0 and det.shape[0] > max_num:
168
+ area = (det[:, 2] - det[:, 0]) * (det[:, 3] - det[:, 1])
169
+ img_center = img.shape[0] // 2, img.shape[1] // 2
170
+ offsets = np.vstack([
171
+ (det[:, 0] + det[:, 2]) / 2 - img_center[1],
172
+ (det[:, 1] + det[:, 3]) / 2 - img_center[0]
173
+ ])
174
+ offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
175
+ if metric=='max':
176
+ values = area
177
+ else:
178
+ values = area - offset_dist_squared * 2.0 # some extra weight on the centering
179
+ bindex = np.argsort(values)[::-1] # some extra weight on the centering
180
+ bindex = bindex[0:max_num]
181
+ det = det[bindex, :]
182
+ if kpss is not None:
183
+ kpss = kpss[bindex, :]
184
+ return det, kpss
185
+
186
+ def nms(self, dets):
187
+ thresh = self.nms_thresh
188
+ x1 = dets[:, 0]
189
+ y1 = dets[:, 1]
190
+ x2 = dets[:, 2]
191
+ y2 = dets[:, 3]
192
+ scores = dets[:, 4]
193
+
194
+ areas = (x2 - x1 + 1) * (y2 - y1 + 1)
195
+ order = scores.argsort()[::-1]
196
+
197
+ keep = []
198
+ while order.size > 0:
199
+ i = order[0]
200
+ keep.append(i)
201
+ xx1 = np.maximum(x1[i], x1[order[1:]])
202
+ yy1 = np.maximum(y1[i], y1[order[1:]])
203
+ xx2 = np.minimum(x2[i], x2[order[1:]])
204
+ yy2 = np.minimum(y2[i], y2[order[1:]])
205
+
206
+ w = np.maximum(0.0, xx2 - xx1 + 1)
207
+ h = np.maximum(0.0, yy2 - yy1 + 1)
208
+ inter = w * h
209
+ ovr = inter / (areas[i] + areas[order[1:]] - inter)
210
+
211
+ inds = np.where(ovr <= thresh)[0]
212
+ order = order[inds + 1]
213
+
214
+ return keep
215
+
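A usage sketch, assuming an insightface SCRFD/RetinaFace detection ONNX file (the path is a placeholder):

    import cv2

    det = RetinaFace("checkpoints/det_10g.onnx", device="cpu")  # hypothetical path
    img = cv2.imread("example/image.png")                       # BGR, any resolution
    dets, kpss = det.detect(img, max_num=1)
    # dets: (N, 5) -> x1, y1, x2, y2, score; kpss: (N, 5, 2) five facial keypoints per face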
core/models/appearance_extractor.py ADDED
@@ -0,0 +1,29 @@
+ import numpy as np
+ import torch
+ from ..utils.load_model import load_model
+
+
+ class AppearanceExtractor:
+     def __init__(self, model_path, device="cuda"):
+         kwargs = {
+             "module_name": "AppearanceFeatureExtractor",
+         }
+         self.model, self.model_type = load_model(model_path, device=device, **kwargs)
+         self.device = device
+
+     def __call__(self, image):
+         """
+         image: np.ndarray, shape (1, 3, 256, 256), float32, range [0, 1]
+         """
+         if self.model_type == "onnx":
+             pred = self.model.run(None, {"image": image})[0]
+         elif self.model_type == "tensorrt":
+             self.model.setup({"image": image})
+             self.model.infer()
+             pred = self.model.buffer["pred"][0].copy()
+         elif self.model_type == 'pytorch':
+             with torch.no_grad(), torch.autocast(device_type=self.device[:4], dtype=torch.float16, enabled=True):
+                 pred = self.model(torch.from_numpy(image).to(self.device)).float().cpu().numpy()
+         else:
+             raise ValueError(f"Unsupported model type: {self.model_type}")
+         return pred
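A usage sketch, assuming `load_model` resolves the given path to an ONNX / TensorRT / PyTorch backend (the path is a placeholder); shapes follow the docstring above and the feature volume produced by AppearanceFeatureExtractor further below:

    import numpy as np

    extractor = AppearanceExtractor("checkpoints/appearance_extractor.onnx", device="cuda")  # hypothetical path
    image = np.random.rand(1, 3, 256, 256).astype(np.float32)  # source crop in [0, 1]
    f_3d = extractor(image)
    print(f_3d.shape)  # expected (1, 32, 16, 64, 64) appearance feature volume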
core/models/decoder.py ADDED
@@ -0,0 +1,30 @@
+ import numpy as np
+ import torch
+ from ..utils.load_model import load_model
+
+
+ class Decoder:
+     def __init__(self, model_path, device="cuda"):
+         kwargs = {
+             "module_name": "SPADEDecoder",
+         }
+         self.model, self.model_type = load_model(model_path, device=device, **kwargs)
+         self.device = device
+
+     def __call__(self, feature):
+
+         if self.model_type == "onnx":
+             pred = self.model.run(None, {"feature": feature})[0]
+         elif self.model_type == "tensorrt":
+             self.model.setup({"feature": feature})
+             self.model.infer()
+             pred = self.model.buffer["output"][0].copy()
+         elif self.model_type == 'pytorch':
+             with torch.no_grad(), torch.autocast(device_type=self.device[:4], dtype=torch.float16, enabled=True):
+                 pred = self.model(torch.from_numpy(feature).to(self.device)).float().cpu().numpy()
+         else:
+             raise ValueError(f"Unsupported model type: {self.model_type}")
+
+         pred = np.transpose(pred[0], [1, 2, 0]).clip(0, 1) * 255  # [h, w, c]
+
+         return pred
core/models/lmdm.py ADDED
@@ -0,0 +1,140 @@
1
+ import numpy as np
2
+ import torch
3
+ from ..utils.load_model import load_model
4
+
5
+
6
+ def make_beta(n_timestep, cosine_s=8e-3):
7
+ timesteps = (
8
+ torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
9
+ )
10
+ alphas = timesteps / (1 + cosine_s) * np.pi / 2
11
+ alphas = torch.cos(alphas).pow(2)
12
+ alphas = alphas / alphas[0]
13
+ betas = 1 - alphas[1:] / alphas[:-1]
14
+ betas = np.clip(betas, a_min=0, a_max=0.999)
15
+ return betas.numpy()
16
+
17
+
18
+ class LMDM:
19
+ def __init__(self, model_path, device="cuda", **kwargs):
20
+ kwargs["module_name"] = "LMDM"
21
+
22
+ self.model, self.model_type = load_model(model_path, device=device, **kwargs)
23
+ self.device = device
24
+
25
+ self.motion_feat_dim = kwargs.get("motion_feat_dim", 265)
26
+ self.audio_feat_dim = kwargs.get("audio_feat_dim", 1024+35)
27
+ self.seq_frames = kwargs.get("seq_frames", 80)
28
+
29
+ if self.model_type == "pytorch":
30
+ pass
31
+ else:
32
+ self._init_np()
33
+
34
+ def setup(self, sampling_timesteps):
35
+ if self.model_type == "pytorch":
36
+ self.model.setup(sampling_timesteps)
37
+ else:
38
+ self._setup_np(sampling_timesteps)
39
+
40
+ def _init_np(self):
41
+ self.sampling_timesteps = None
42
+ self.n_timestep = 1000
43
+
44
+ betas = torch.Tensor(make_beta(n_timestep=self.n_timestep))
45
+ alphas = 1.0 - betas
46
+ self.alphas_cumprod = torch.cumprod(alphas, axis=0).cpu().numpy()
47
+
48
+ def _setup_np(self, sampling_timesteps=50):
49
+ if self.sampling_timesteps == sampling_timesteps:
50
+ return
51
+
52
+ self.sampling_timesteps = sampling_timesteps
53
+
54
+ total_timesteps = self.n_timestep
55
+ eta = 1
56
+ shape = (1, self.seq_frames, self.motion_feat_dim)
57
+
58
+ times = torch.linspace(-1, total_timesteps - 1, steps=sampling_timesteps + 1) # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
59
+ times = list(reversed(times.int().tolist()))
60
+ self.time_pairs = list(zip(times[:-1], times[1:])) # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)]
61
+
62
+ self.time_cond_list = []
63
+ self.alpha_next_sqrt_list = []
64
+ self.sigma_list = []
65
+ self.c_list = []
66
+ self.noise_list = []
67
+
68
+ for time, time_next in self.time_pairs:
69
+ time_cond = np.full((1,), time, dtype=np.int64)
70
+ self.time_cond_list.append(time_cond)
71
+ if time_next < 0:
72
+ continue
73
+
74
+ alpha = self.alphas_cumprod[time]
75
+ alpha_next = self.alphas_cumprod[time_next]
76
+
77
+ sigma = eta * np.sqrt((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha))
78
+ c = np.sqrt(1 - alpha_next - sigma ** 2)
79
+ noise = np.random.randn(*shape).astype(np.float32)
80
+
81
+ self.alpha_next_sqrt_list.append(np.sqrt(alpha_next))
82
+ self.sigma_list.append(sigma)
83
+ self.c_list.append(c)
84
+ self.noise_list.append(noise)
85
+
86
+ def _one_step(self, x, cond_frame, cond, time_cond):
87
+ if self.model_type == "onnx":
88
+ pred = self.model.run(None, {"x": x, "cond_frame": cond_frame, "cond": cond, "time_cond": time_cond})
89
+ pred_noise, x_start = pred[0], pred[1]
90
+ elif self.model_type == "tensorrt":
91
+ self.model.setup({"x": x, "cond_frame": cond_frame, "cond": cond, "time_cond": time_cond})
92
+ self.model.infer()
93
+ pred_noise, x_start = self.model.buffer["pred_noise"][0], self.model.buffer["x_start"][0]
94
+ elif self.model_type == "pytorch":
95
+ with torch.no_grad():
96
+ pred_noise, x_start = self.model(x, cond_frame, cond, time_cond)
97
+ else:
98
+ raise ValueError(f"Unsupported model type: {self.model_type}")
99
+
100
+ return pred_noise, x_start
101
+
102
+ def _call_np(self, kp_cond, aud_cond, sampling_timesteps):
103
+ self._setup_np(sampling_timesteps)
104
+
105
+ cond_frame = kp_cond
106
+ cond = aud_cond
107
+
108
+ x = np.random.randn(1, self.seq_frames, self.motion_feat_dim).astype(np.float32)
109
+
110
+ x_start = None
111
+ i = 0
112
+ for _, time_next in self.time_pairs:
113
+ time_cond = self.time_cond_list[i]
114
+ pred_noise, x_start = self._one_step(x, cond_frame, cond, time_cond)
115
+ if time_next < 0:
116
+ x = x_start
117
+ continue
118
+
119
+ alpha_next_sqrt = self.alpha_next_sqrt_list[i]
120
+ c = self.c_list[i]
121
+ sigma = self.sigma_list[i]
122
+ noise = self.noise_list[i]
123
+ x = x_start * alpha_next_sqrt + c * pred_noise + sigma * noise
124
+
125
+ i += 1
126
+
127
+ return x
128
+
129
+ def __call__(self, kp_cond, aud_cond, sampling_timesteps):
130
+ if self.model_type == "pytorch":
131
+ pred_kp_seq = self.model.ddim_sample(
132
+ torch.from_numpy(kp_cond).to(self.device),
133
+ torch.from_numpy(aud_cond).to(self.device),
134
+ sampling_timesteps,
135
+ ).cpu().numpy()
136
+ else:
137
+ pred_kp_seq = self._call_np(kp_cond, aud_cond, sampling_timesteps)
138
+ return pred_kp_seq
139
+
140
+
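A rough driving sketch for the wrapper above, assuming a single 265-d conditioning motion frame and 80 frames of 1024+35-d audio features; the checkpoint path and the exact feature layout are assumptions:

    import numpy as np

    lmdm = LMDM("checkpoints/lmdm.onnx", device="cuda")  # hypothetical path
    kp_cond = np.random.randn(1, 265).astype(np.float32)             # conditioning motion frame
    aud_cond = np.random.randn(1, 80, 1024 + 35).astype(np.float32)  # audio condition for one window
    pred_kp_seq = lmdm(kp_cond, aud_cond, sampling_timesteps=50)
    print(pred_kp_seq.shape)  # (1, 80, 265) denoised motion sequence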
core/models/modules/LMDM.py ADDED
@@ -0,0 +1,154 @@
1
+ # Latent Motion Diffusion Model
2
+ import torch
3
+ import torch.nn as nn
4
+ from .lmdm_modules.model import MotionDecoder
5
+ from .lmdm_modules.utils import extract, make_beta_schedule
6
+
7
+
8
+ class LMDM(nn.Module):
9
+ def __init__(
10
+ self,
11
+ motion_feat_dim=265,
12
+ audio_feat_dim=1024+35,
13
+ seq_frames=80,
14
+ checkpoint='',
15
+ device='cuda',
16
+ clip_denoised=False, # clip denoised (-1,1)
17
+ multi_cond_frame=False,
18
+ ):
19
+ super().__init__()
20
+
21
+ self.motion_feat_dim = motion_feat_dim
22
+ self.audio_feat_dim = audio_feat_dim
23
+ self.seq_frames = seq_frames
24
+ self.device = device
25
+
26
+ self.n_timestep = 1000
27
+ self.clip_denoised = clip_denoised
28
+ self.guidance_weight = 2
29
+
30
+ self.model = MotionDecoder(
31
+ nfeats=motion_feat_dim,
32
+ seq_len=seq_frames,
33
+ latent_dim=512,
34
+ ff_size=1024,
35
+ num_layers=8,
36
+ num_heads=8,
37
+ dropout=0.1,
38
+ cond_feature_dim=audio_feat_dim,
39
+ multi_cond_frame=multi_cond_frame,
40
+ )
41
+
42
+ self.init_diff()
43
+
44
+ self.sampling_timesteps = None
45
+
46
+ def init_diff(self):
47
+ n_timestep = self.n_timestep
48
+ betas = torch.Tensor(
49
+ make_beta_schedule(schedule="cosine", n_timestep=n_timestep)
50
+ )
51
+ alphas = 1.0 - betas
52
+ alphas_cumprod = torch.cumprod(alphas, axis=0)
53
+
54
+ self.register_buffer("alphas_cumprod", alphas_cumprod)
55
+ self.register_buffer(
56
+ "sqrt_recipm1_alphas_cumprod", torch.sqrt(1.0 / alphas_cumprod - 1)
57
+ )
58
+ self.register_buffer("sqrt_recip1m_alphas_cumprod", torch.sqrt(1.0 / (1.0 - alphas_cumprod)))
59
+
60
+ def predict_noise_from_start(self, x_t, t, x0):
61
+ a = extract(self.sqrt_recip1m_alphas_cumprod, t, x_t.shape)
62
+ b = extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
63
+ return (a * x_t - x0 / b)
64
+
65
+ def maybe_clip(self, x):
66
+ if self.clip_denoised:
67
+ return torch.clamp(x, min=-1., max=1.)
68
+ else:
69
+ return x
70
+
71
+ def model_predictions(self, x, cond_frame, cond, t):
72
+ weight = self.guidance_weight
73
+ x_start = self.model.guided_forward(x, cond_frame, cond, t, weight)
74
+ x_start = self.maybe_clip(x_start)
75
+ pred_noise = self.predict_noise_from_start(x, t, x_start)
76
+ return pred_noise, x_start
77
+
78
+ @torch.no_grad()
79
+ def forward(self, x, cond_frame, cond, time_cond):
80
+ pred_noise, x_start = self.model_predictions(x, cond_frame, cond, time_cond)
81
+ return pred_noise, x_start
82
+
83
+ def load_model(self, ckpt_path):
84
+ checkpoint = torch.load(ckpt_path, map_location='cpu')
85
+ self.model.load_state_dict(checkpoint["model_state_dict"])
86
+ self.eval()
87
+ return self
88
+
89
+ def setup(self, sampling_timesteps=50):
90
+ if self.sampling_timesteps == sampling_timesteps:
91
+ return
92
+
93
+ self.sampling_timesteps = sampling_timesteps
94
+
95
+ total_timesteps = self.n_timestep
96
+ device = self.device
97
+ eta = 1
98
+ shape = (1, self.seq_frames, self.motion_feat_dim)
99
+
100
+ times = torch.linspace(-1, total_timesteps - 1, steps=sampling_timesteps + 1) # [-1, 0, 1, 2, ..., T-1] when sampling_timesteps == total_timesteps
101
+ times = list(reversed(times.int().tolist()))
102
+ self.time_pairs = list(zip(times[:-1], times[1:])) # [(T-1, T-2), (T-2, T-3), ..., (1, 0), (0, -1)]
103
+
104
+ self.time_cond_list = []
105
+ self.alpha_next_sqrt_list = []
106
+ self.sigma_list = []
107
+ self.c_list = []
108
+ self.noise_list = []
109
+
110
+ for time, time_next in self.time_pairs:
111
+ time_cond = torch.full((1,), time, device=device, dtype=torch.long)
112
+ self.time_cond_list.append(time_cond)
113
+ if time_next < 0:
114
+ continue
115
+ alpha = self.alphas_cumprod[time]
116
+ alpha_next = self.alphas_cumprod[time_next]
117
+
118
+ sigma = eta * ((1 - alpha / alpha_next) * (1 - alpha_next) / (1 - alpha)).sqrt()
119
+ c = (1 - alpha_next - sigma ** 2).sqrt()
120
+ noise = torch.randn(shape, device=device)
121
+
122
+ self.alpha_next_sqrt_list.append(alpha_next.sqrt())
123
+ self.sigma_list.append(sigma)
124
+ self.c_list.append(c)
125
+ self.noise_list.append(noise)
126
+
127
+ @torch.no_grad()
128
+ def ddim_sample(self, kp_cond, aud_cond, sampling_timesteps):
129
+ self.setup(sampling_timesteps)
130
+
131
+ cond_frame = kp_cond
132
+ cond = aud_cond
133
+
134
+ shape = (1, self.seq_frames, self.motion_feat_dim)
135
+ x = torch.randn(shape, device=self.device)
136
+
137
+ x_start = None
138
+ i = 0
139
+ for _, time_next in self.time_pairs:
140
+ time_cond = self.time_cond_list[i]
141
+ pred_noise, x_start = self.model_predictions(x, cond_frame, cond, time_cond)
142
+ if time_next < 0:
143
+ x = x_start
144
+ continue
145
+
146
+ alpha_next_sqrt = self.alpha_next_sqrt_list[i]
147
+ c = self.c_list[i]
148
+ sigma = self.sigma_list[i]
149
+ noise = self.noise_list[i]
150
+ x = x_start * alpha_next_sqrt + c * pred_noise + sigma * noise
151
+
152
+ i += 1
153
+ return x # pred_kp_seq
154
+
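For reference, each iteration of `ddim_sample` above applies the DDIM update with eta = 1 (the stochastic, DDPM-like end of the family), where \bar\alpha denotes `alphas_cumprod`, t' is the next (smaller) timestep, and the per-step noise z is pre-sampled once in `setup`:

    \sigma_t = \eta \sqrt{ \frac{(1 - \bar\alpha_t/\bar\alpha_{t'}) (1 - \bar\alpha_{t'})}{1 - \bar\alpha_t} }, \qquad
    x_{t'} = \sqrt{\bar\alpha_{t'}} \, \hat{x}_0 + \sqrt{1 - \bar\alpha_{t'} - \sigma_t^2} \, \hat{\epsilon} + \sigma_t z, \quad z \sim \mathcal{N}(0, I)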
core/models/modules/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from .appearance_feature_extractor import AppearanceFeatureExtractor
+ from .motion_extractor import MotionExtractor
+ from .warping_network import WarpingNetwork
+ from .spade_generator import SPADEDecoder
+ from .stitching_network import StitchingNetwork
+ from .LMDM import LMDM
core/models/modules/appearance_feature_extractor.py ADDED
@@ -0,0 +1,74 @@
1
+ # coding: utf-8
2
+
3
+ """
4
+ Appearance extractor (F) defined in the paper, which maps the source image s to a 3D appearance feature volume.
5
+ """
6
+
7
+ import torch
8
+ from torch import nn
9
+ from .util import SameBlock2d, DownBlock2d, ResBlock3d
10
+
11
+
12
+ class AppearanceFeatureExtractor(nn.Module):
13
+
14
+ def __init__(
15
+ self,
16
+ image_channel=3,
17
+ block_expansion=64,
18
+ num_down_blocks=2,
19
+ max_features=512,
20
+ reshape_channel=32,
21
+ reshape_depth=16,
22
+ num_resblocks=6,
23
+ ):
24
+ super(AppearanceFeatureExtractor, self).__init__()
25
+ self.image_channel = image_channel
26
+ self.block_expansion = block_expansion
27
+ self.num_down_blocks = num_down_blocks
28
+ self.max_features = max_features
29
+ self.reshape_channel = reshape_channel
30
+ self.reshape_depth = reshape_depth
31
+
32
+ self.first = SameBlock2d(
33
+ image_channel, block_expansion, kernel_size=(3, 3), padding=(1, 1)
34
+ )
35
+
36
+ down_blocks = []
37
+ for i in range(num_down_blocks):
38
+ in_features = min(max_features, block_expansion * (2**i))
39
+ out_features = min(max_features, block_expansion * (2 ** (i + 1)))
40
+ down_blocks.append(
41
+ DownBlock2d(
42
+ in_features, out_features, kernel_size=(3, 3), padding=(1, 1)
43
+ )
44
+ )
45
+ self.down_blocks = nn.ModuleList(down_blocks)
46
+
47
+ self.second = nn.Conv2d(
48
+ in_channels=out_features, out_channels=max_features, kernel_size=1, stride=1
49
+ )
50
+
51
+ self.resblocks_3d = torch.nn.Sequential()
52
+ for i in range(num_resblocks):
53
+ self.resblocks_3d.add_module(
54
+ "3dr" + str(i), ResBlock3d(reshape_channel, kernel_size=3, padding=1)
55
+ )
56
+
57
+ def forward(self, source_image):
58
+ out = self.first(source_image) # Bx3x256x256 -> Bx64x256x256
59
+
60
+ for i in range(len(self.down_blocks)):
61
+ out = self.down_blocks[i](out)
62
+ out = self.second(out)
63
+ bs, c, h, w = out.shape # ->Bx512x64x64
64
+
65
+ f_s = out.view(
66
+ bs, self.reshape_channel, self.reshape_depth, h, w
67
+ ) # ->Bx32x16x64x64
68
+ f_s = self.resblocks_3d(f_s) # ->Bx32x16x64x64
69
+ return f_s
70
+
71
+ def load_model(self, ckpt_path):
72
+ self.load_state_dict(torch.load(ckpt_path, map_location=lambda storage, loc: storage))
73
+ self.eval()
74
+ return self
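A quick shape check of the module as defined (random weights, no checkpoint), reproducing the Bx3x256x256 -> Bx32x16x64x64 path noted in the comments above:

    import torch

    net = AppearanceFeatureExtractor().eval()
    with torch.no_grad():
        f_s = net(torch.rand(1, 3, 256, 256))
    print(f_s.shape)  # torch.Size([1, 32, 16, 64, 64])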
core/models/modules/convnextv2.py ADDED
@@ -0,0 +1,150 @@
1
+ # coding: utf-8
2
+
3
+ """
4
+ This module is adapted to the ConvNeXtV2 version for the extraction of implicit keypoints, poses, and expression deformation.
5
+ """
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ # from timm.models.layers import trunc_normal_, DropPath
10
+ from .util import LayerNorm, DropPath, trunc_normal_, GRN
11
+
12
+ __all__ = ['convnextv2_tiny']
13
+
14
+
15
+ class Block(nn.Module):
16
+ """ ConvNeXtV2 Block.
17
+
18
+ Args:
19
+ dim (int): Number of input channels.
20
+ drop_path (float): Stochastic depth rate. Default: 0.0
21
+ """
22
+
23
+ def __init__(self, dim, drop_path=0.):
24
+ super().__init__()
25
+ self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
26
+ self.norm = LayerNorm(dim, eps=1e-6)
27
+ self.pwconv1 = nn.Linear(dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
28
+ self.act = nn.GELU()
29
+ self.grn = GRN(4 * dim)
30
+ self.pwconv2 = nn.Linear(4 * dim, dim)
31
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
32
+
33
+ def forward(self, x):
34
+ input = x
35
+ x = self.dwconv(x)
36
+ x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C)
37
+ x = self.norm(x)
38
+ x = self.pwconv1(x)
39
+ x = self.act(x)
40
+ x = self.grn(x)
41
+ x = self.pwconv2(x)
42
+ x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W)
43
+
44
+ x = input + self.drop_path(x)
45
+ return x
46
+
47
+
48
+ class ConvNeXtV2(nn.Module):
49
+ """ ConvNeXt V2
50
+
51
+ Args:
52
+ in_chans (int): Number of input image channels. Default: 3
53
+ num_classes (int): Number of classes for classification head. Default: 1000
54
+ depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
55
+ dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
56
+ drop_path_rate (float): Stochastic depth rate. Default: 0.
57
+ head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1.
58
+ """
59
+
60
+ def __init__(
61
+ self,
62
+ in_chans=3,
63
+ depths=[3, 3, 9, 3],
64
+ dims=[96, 192, 384, 768],
65
+ drop_path_rate=0.,
66
+ **kwargs
67
+ ):
68
+ super().__init__()
69
+ self.depths = depths
70
+ self.downsample_layers = nn.ModuleList() # stem and 3 intermediate downsampling conv layers
71
+ stem = nn.Sequential(
72
+ nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
73
+ LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
74
+ )
75
+ self.downsample_layers.append(stem)
76
+ for i in range(3):
77
+ downsample_layer = nn.Sequential(
78
+ LayerNorm(dims[i], eps=1e-6, data_format="channels_first"),
79
+ nn.Conv2d(dims[i], dims[i+1], kernel_size=2, stride=2),
80
+ )
81
+ self.downsample_layers.append(downsample_layer)
82
+
83
+ self.stages = nn.ModuleList() # 4 feature resolution stages, each consisting of multiple residual blocks
84
+ dp_rates = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]
85
+ cur = 0
86
+ for i in range(4):
87
+ stage = nn.Sequential(
88
+ *[Block(dim=dims[i], drop_path=dp_rates[cur + j]) for j in range(depths[i])]
89
+ )
90
+ self.stages.append(stage)
91
+ cur += depths[i]
92
+
93
+ self.norm = nn.LayerNorm(dims[-1], eps=1e-6) # final norm layer
94
+
95
+ # NOTE: the output semantic items
96
+ num_bins = kwargs.get('num_bins', 66)
97
+ num_kp = kwargs.get('num_kp', 24) # the number of implicit keypoints
98
+ self.fc_kp = nn.Linear(dims[-1], 3 * num_kp) # implicit keypoints
99
+
100
+ # print('dims[-1]: ', dims[-1])
101
+ self.fc_scale = nn.Linear(dims[-1], 1) # scale
102
+ self.fc_pitch = nn.Linear(dims[-1], num_bins) # pitch bins
103
+ self.fc_yaw = nn.Linear(dims[-1], num_bins) # yaw bins
104
+ self.fc_roll = nn.Linear(dims[-1], num_bins) # roll bins
105
+ self.fc_t = nn.Linear(dims[-1], 3) # translation
106
+ self.fc_exp = nn.Linear(dims[-1], 3 * num_kp) # expression / delta
107
+
108
+ def _init_weights(self, m):
109
+ if isinstance(m, (nn.Conv2d, nn.Linear)):
110
+ trunc_normal_(m.weight, std=.02)
111
+ nn.init.constant_(m.bias, 0)
112
+
113
+ def forward_features(self, x):
114
+ for i in range(4):
115
+ x = self.downsample_layers[i](x)
116
+ x = self.stages[i](x)
117
+ return self.norm(x.mean([-2, -1])) # global average pooling, (N, C, H, W) -> (N, C)
118
+
119
+ def forward(self, x):
120
+ x = self.forward_features(x)
121
+
122
+ # implicit keypoints
123
+ kp = self.fc_kp(x)
124
+
125
+ # pose and expression deformation
126
+ pitch = self.fc_pitch(x)
127
+ yaw = self.fc_yaw(x)
128
+ roll = self.fc_roll(x)
129
+ t = self.fc_t(x)
130
+ exp = self.fc_exp(x)
131
+ scale = self.fc_scale(x)
132
+
133
+ # ret_dct = {
134
+ # 'pitch': pitch,
135
+ # 'yaw': yaw,
136
+ # 'roll': roll,
137
+ # 't': t,
138
+ # 'exp': exp,
139
+ # 'scale': scale,
140
+
141
+ # 'kp': kp, # canonical keypoint
142
+ # }
143
+
144
+ # return ret_dct
145
+ return pitch, yaw, roll, t, exp, scale, kp
146
+
147
+
148
+ def convnextv2_tiny(**kwargs):
149
+ model = ConvNeXtV2(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs)
150
+ return model
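A quick shape check with random weights; `num_kp=21` mirrors how MotionExtractor below instantiates the backbone, and `num_bins` defaults to 66:

    import torch

    net = convnextv2_tiny(num_kp=21).eval()
    with torch.no_grad():
        pitch, yaw, roll, t, exp, scale, kp = net(torch.rand(1, 3, 256, 256))
    print(pitch.shape, t.shape, exp.shape, scale.shape, kp.shape)
    # (1, 66) bins per angle, (1, 3), (1, 63), (1, 1), (1, 63)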
core/models/modules/dense_motion.py ADDED
@@ -0,0 +1,104 @@
1
+ # coding: utf-8
2
+
3
+ """
4
+ The module that predicts a dense motion field from the sparse motion representation given by kp_source and kp_driving
5
+ """
6
+
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+ import torch
10
+ from .util import Hourglass, make_coordinate_grid, kp2gaussian
11
+
12
+
13
+ class DenseMotionNetwork(nn.Module):
14
+ def __init__(self, block_expansion, num_blocks, max_features, num_kp, feature_channel, reshape_depth, compress, estimate_occlusion_map=True):
15
+ super(DenseMotionNetwork, self).__init__()
16
+ self.hourglass = Hourglass(block_expansion=block_expansion, in_features=(num_kp+1)*(compress+1), max_features=max_features, num_blocks=num_blocks) # ~60+G
17
+
18
+ self.mask = nn.Conv3d(self.hourglass.out_filters, num_kp + 1, kernel_size=7, padding=3) # 65G! NOTE: computation cost is large
19
+ self.compress = nn.Conv3d(feature_channel, compress, kernel_size=1) # 0.8G
20
+ self.norm = nn.BatchNorm3d(compress, affine=True)
21
+ self.num_kp = num_kp
22
+ self.flag_estimate_occlusion_map = estimate_occlusion_map
23
+
24
+ if self.flag_estimate_occlusion_map:
25
+ self.occlusion = nn.Conv2d(self.hourglass.out_filters*reshape_depth, 1, kernel_size=7, padding=3)
26
+ else:
27
+ self.occlusion = None
28
+
29
+ def create_sparse_motions(self, feature, kp_driving, kp_source):
30
+ bs, _, d, h, w = feature.shape # (bs, 4, 16, 64, 64)
31
+ identity_grid = make_coordinate_grid((d, h, w), ref=kp_source) # (16, 64, 64, 3)
32
+ identity_grid = identity_grid.view(1, 1, d, h, w, 3) # (1, 1, d=16, h=64, w=64, 3)
33
+ coordinate_grid = identity_grid - kp_driving.view(bs, self.num_kp, 1, 1, 1, 3)
34
+
35
+ k = coordinate_grid.shape[1]
36
+
37
+ # NOTE: a first-order flow is missing here
38
+ driving_to_source = coordinate_grid + kp_source.view(bs, self.num_kp, 1, 1, 1, 3) # (bs, num_kp, d, h, w, 3)
39
+
40
+ # adding background feature
41
+ identity_grid = identity_grid.repeat(bs, 1, 1, 1, 1, 1)
42
+ sparse_motions = torch.cat([identity_grid, driving_to_source], dim=1) # (bs, 1+num_kp, d, h, w, 3)
43
+ return sparse_motions
44
+
45
+ def create_deformed_feature(self, feature, sparse_motions):
46
+ bs, _, d, h, w = feature.shape
47
+ feature_repeat = feature.unsqueeze(1).unsqueeze(1).repeat(1, self.num_kp+1, 1, 1, 1, 1, 1) # (bs, num_kp+1, 1, c, d, h, w)
48
+ feature_repeat = feature_repeat.view(bs * (self.num_kp+1), -1, d, h, w) # (bs*(num_kp+1), c, d, h, w)
49
+ sparse_motions = sparse_motions.view((bs * (self.num_kp+1), d, h, w, -1)) # (bs*(num_kp+1), d, h, w, 3)
50
+ sparse_deformed = F.grid_sample(feature_repeat, sparse_motions, align_corners=False)
51
+ sparse_deformed = sparse_deformed.view((bs, self.num_kp+1, -1, d, h, w)) # (bs, num_kp+1, c, d, h, w)
52
+
53
+ return sparse_deformed
54
+
55
+ def create_heatmap_representations(self, feature, kp_driving, kp_source):
56
+ spatial_size = feature.shape[3:] # (d=16, h=64, w=64)
57
+ gaussian_driving = kp2gaussian(kp_driving, spatial_size=spatial_size, kp_variance=0.01) # (bs, num_kp, d, h, w)
58
+ gaussian_source = kp2gaussian(kp_source, spatial_size=spatial_size, kp_variance=0.01) # (bs, num_kp, d, h, w)
59
+ heatmap = gaussian_driving - gaussian_source # (bs, num_kp, d, h, w)
60
+
61
+ # adding background feature
62
+ zeros = torch.zeros(heatmap.shape[0], 1, spatial_size[0], spatial_size[1], spatial_size[2]).type(heatmap.dtype).to(heatmap.device)
63
+ heatmap = torch.cat([zeros, heatmap], dim=1)
64
+ heatmap = heatmap.unsqueeze(2) # (bs, 1+num_kp, 1, d, h, w)
65
+ return heatmap
66
+
67
+ def forward(self, feature, kp_driving, kp_source):
68
+ bs, _, d, h, w = feature.shape # (bs, 32, 16, 64, 64)
69
+
70
+ feature = self.compress(feature) # (bs, 4, 16, 64, 64)
71
+ feature = self.norm(feature) # (bs, 4, 16, 64, 64)
72
+ feature = F.relu(feature) # (bs, 4, 16, 64, 64)
73
+
74
+ out_dict = dict()
75
+
76
+ # 1. deform 3d feature
77
+ sparse_motion = self.create_sparse_motions(feature, kp_driving, kp_source) # (bs, 1+num_kp, d, h, w, 3)
78
+ deformed_feature = self.create_deformed_feature(feature, sparse_motion) # (bs, 1+num_kp, c=4, d=16, h=64, w=64)
79
+
80
+ # 2. (bs, 1+num_kp, d, h, w)
81
+ heatmap = self.create_heatmap_representations(deformed_feature, kp_driving, kp_source) # (bs, 1+num_kp, 1, d, h, w)
82
+
83
+ input = torch.cat([heatmap, deformed_feature], dim=2) # (bs, 1+num_kp, c=5, d=16, h=64, w=64)
84
+ input = input.view(bs, -1, d, h, w) # (bs, (1+num_kp)*c=105, d=16, h=64, w=64)
85
+
86
+ prediction = self.hourglass(input)
87
+
88
+ mask = self.mask(prediction)
89
+ mask = F.softmax(mask, dim=1) # (bs, 1+num_kp, d=16, h=64, w=64)
90
+ out_dict['mask'] = mask
91
+ mask = mask.unsqueeze(2) # (bs, num_kp+1, 1, d, h, w)
92
+ sparse_motion = sparse_motion.permute(0, 1, 5, 2, 3, 4) # (bs, num_kp+1, 3, d, h, w)
93
+ deformation = (sparse_motion * mask).sum(dim=1) # (bs, 3, d, h, w) mask take effect in this place
94
+ deformation = deformation.permute(0, 2, 3, 4, 1) # (bs, d, h, w, 3)
95
+
96
+ out_dict['deformation'] = deformation
97
+
98
+ if self.flag_estimate_occlusion_map:
99
+ bs, _, d, h, w = prediction.shape
100
+ prediction_reshape = prediction.view(bs, -1, h, w)
101
+ occlusion_map = torch.sigmoid(self.occlusion(prediction_reshape)) # Bx1x64x64
102
+ out_dict['occlusion_map'] = occlusion_map
103
+
104
+ return out_dict
core/models/modules/lmdm_modules/model.py ADDED
@@ -0,0 +1,398 @@
1
+ from typing import Callable, Optional, Union
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange
5
+ from einops.layers.torch import Rearrange
6
+ from torch import Tensor
7
+ from torch.nn import functional as F
8
+
9
+ from .rotary_embedding_torch import RotaryEmbedding
10
+ from .utils import PositionalEncoding, SinusoidalPosEmb, prob_mask_like
11
+
12
+
13
+ class DenseFiLM(nn.Module):
14
+ """Feature-wise linear modulation (FiLM) generator."""
15
+
16
+ def __init__(self, embed_channels):
17
+ super().__init__()
18
+ self.embed_channels = embed_channels
19
+ self.block = nn.Sequential(
20
+ nn.Mish(), nn.Linear(embed_channels, embed_channels * 2)
21
+ )
22
+
23
+ def forward(self, position):
24
+ pos_encoding = self.block(position)
25
+ pos_encoding = rearrange(pos_encoding, "b c -> b 1 c")
26
+ scale_shift = pos_encoding.chunk(2, dim=-1)
27
+ return scale_shift
28
+
29
+
30
+ def featurewise_affine(x, scale_shift):
31
+ scale, shift = scale_shift
32
+ return (scale + 1) * x + shift
33
+
34
+
35
+ class TransformerEncoderLayer(nn.Module):
36
+ def __init__(
37
+ self,
38
+ d_model: int,
39
+ nhead: int,
40
+ dim_feedforward: int = 2048,
41
+ dropout: float = 0.1,
42
+ activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
43
+ layer_norm_eps: float = 1e-5,
44
+ batch_first: bool = False,
45
+ norm_first: bool = True,
46
+ device=None,
47
+ dtype=None,
48
+ rotary=None,
49
+ ) -> None:
50
+ super().__init__()
51
+ self.self_attn = nn.MultiheadAttention(
52
+ d_model, nhead, dropout=dropout, batch_first=batch_first
53
+ )
54
+ # Implementation of Feedforward model
55
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
56
+ self.dropout = nn.Dropout(dropout)
57
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
58
+
59
+ self.norm_first = norm_first
60
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
61
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
62
+ self.dropout1 = nn.Dropout(dropout)
63
+ self.dropout2 = nn.Dropout(dropout)
64
+ self.activation = activation
65
+
66
+ self.rotary = rotary
67
+ self.use_rotary = rotary is not None
68
+
69
+ def forward(
70
+ self,
71
+ src: Tensor,
72
+ src_mask: Optional[Tensor] = None,
73
+ src_key_padding_mask: Optional[Tensor] = None,
74
+ ) -> Tensor:
75
+ x = src
76
+ if self.norm_first:
77
+ x = x + self._sa_block(self.norm1(x), src_mask, src_key_padding_mask)
78
+ x = x + self._ff_block(self.norm2(x))
79
+ else:
80
+ x = self.norm1(x + self._sa_block(x, src_mask, src_key_padding_mask))
81
+ x = self.norm2(x + self._ff_block(x))
82
+
83
+ return x
84
+
85
+ # self-attention block
86
+ def _sa_block(
87
+ self, x: Tensor, attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]
88
+ ) -> Tensor:
89
+ qk = self.rotary.rotate_queries_or_keys(x) if self.use_rotary else x
90
+ x = self.self_attn(
91
+ qk,
92
+ qk,
93
+ x,
94
+ attn_mask=attn_mask,
95
+ key_padding_mask=key_padding_mask,
96
+ need_weights=False,
97
+ )[0]
98
+ return self.dropout1(x)
99
+
100
+ # feed forward block
101
+ def _ff_block(self, x: Tensor) -> Tensor:
102
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
103
+ return self.dropout2(x)
104
+
105
+
106
+ class FiLMTransformerDecoderLayer(nn.Module):
107
+ def __init__(
108
+ self,
109
+ d_model: int,
110
+ nhead: int,
111
+ dim_feedforward=2048,
112
+ dropout=0.1,
113
+ activation=F.relu,
114
+ layer_norm_eps=1e-5,
115
+ batch_first=False,
116
+ norm_first=True,
117
+ device=None,
118
+ dtype=None,
119
+ rotary=None,
120
+ ):
121
+ super().__init__()
122
+ self.self_attn = nn.MultiheadAttention(
123
+ d_model, nhead, dropout=dropout, batch_first=batch_first
124
+ )
125
+ self.multihead_attn = nn.MultiheadAttention(
126
+ d_model, nhead, dropout=dropout, batch_first=batch_first
127
+ )
128
+ # Feedforward
129
+ self.linear1 = nn.Linear(d_model, dim_feedforward)
130
+ self.dropout = nn.Dropout(dropout)
131
+ self.linear2 = nn.Linear(dim_feedforward, d_model)
132
+
133
+ self.norm_first = norm_first
134
+ self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
135
+ self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
136
+ self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
137
+ self.dropout1 = nn.Dropout(dropout)
138
+ self.dropout2 = nn.Dropout(dropout)
139
+ self.dropout3 = nn.Dropout(dropout)
140
+ self.activation = activation
141
+
142
+ self.film1 = DenseFiLM(d_model)
143
+ self.film2 = DenseFiLM(d_model)
144
+ self.film3 = DenseFiLM(d_model)
145
+
146
+ self.rotary = rotary
147
+ self.use_rotary = rotary is not None
148
+
149
+ # x, cond, t
150
+ def forward(
151
+ self,
152
+ tgt,
153
+ memory,
154
+ t,
155
+ tgt_mask=None,
156
+ memory_mask=None,
157
+ tgt_key_padding_mask=None,
158
+ memory_key_padding_mask=None,
159
+ ):
160
+ x = tgt
161
+ if self.norm_first:
162
+ # self-attention -> film -> residual
163
+ x_1 = self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask)
164
+ x = x + featurewise_affine(x_1, self.film1(t))
165
+ # cross-attention -> film -> residual
166
+ x_2 = self._mha_block(
167
+ self.norm2(x), memory, memory_mask, memory_key_padding_mask
168
+ )
169
+ x = x + featurewise_affine(x_2, self.film2(t))
170
+ # feedforward -> film -> residual
171
+ x_3 = self._ff_block(self.norm3(x))
172
+ x = x + featurewise_affine(x_3, self.film3(t))
173
+ else:
174
+ x = self.norm1(
175
+ x
176
+ + featurewise_affine(
177
+ self._sa_block(x, tgt_mask, tgt_key_padding_mask), self.film1(t)
178
+ )
179
+ )
180
+ x = self.norm2(
181
+ x
182
+ + featurewise_affine(
183
+ self._mha_block(x, memory, memory_mask, memory_key_padding_mask),
184
+ self.film2(t),
185
+ )
186
+ )
187
+ x = self.norm3(x + featurewise_affine(self._ff_block(x), self.film3(t)))
188
+ return x
189
+
190
+ # self-attention block
191
+ # qkv
192
+ def _sa_block(self, x, attn_mask, key_padding_mask):
193
+ qk = self.rotary.rotate_queries_or_keys(x) if self.use_rotary else x
194
+ x = self.self_attn(
195
+ qk,
196
+ qk,
197
+ x,
198
+ attn_mask=attn_mask,
199
+ key_padding_mask=key_padding_mask,
200
+ need_weights=False,
201
+ )[0]
202
+ return self.dropout1(x)
203
+
204
+ # multihead attention block
205
+ # qkv
206
+ def _mha_block(self, x, mem, attn_mask, key_padding_mask):
207
+ q = self.rotary.rotate_queries_or_keys(x) if self.use_rotary else x
208
+ k = self.rotary.rotate_queries_or_keys(mem) if self.use_rotary else mem
209
+ x = self.multihead_attn(
210
+ q,
211
+ k,
212
+ mem,
213
+ attn_mask=attn_mask,
214
+ key_padding_mask=key_padding_mask,
215
+ need_weights=False,
216
+ )[0]
217
+ return self.dropout2(x)
218
+
219
+ # feed forward block
220
+ def _ff_block(self, x):
221
+ x = self.linear2(self.dropout(self.activation(self.linear1(x))))
222
+ return self.dropout3(x)
223
+
224
+
225
+ class DecoderLayerStack(nn.Module):
226
+ def __init__(self, stack):
227
+ super().__init__()
228
+ self.stack = stack
229
+
230
+ def forward(self, x, cond, t):
231
+ for layer in self.stack:
232
+ x = layer(x, cond, t)
233
+ return x
234
+
235
+
236
+ class MotionDecoder(nn.Module):
237
+ def __init__(
238
+ self,
239
+ nfeats: int,
240
+ seq_len: int = 100, # 4 seconds, 25 fps
241
+ latent_dim: int = 256,
242
+ ff_size: int = 1024,
243
+ num_layers: int = 4,
244
+ num_heads: int = 4,
245
+ dropout: float = 0.1,
246
+ cond_feature_dim: int = 4800,
247
+ activation: Callable[[Tensor], Tensor] = F.gelu,
248
+ use_rotary=True,
249
+ multi_cond_frame=False,
250
+ **kwargs
251
+ ) -> None:
252
+
253
+ super().__init__()
254
+
255
+ self.multi_cond_frame = multi_cond_frame
256
+
257
+ output_feats = nfeats
258
+
259
+ # positional embeddings
260
+ self.rotary = None
261
+ self.abs_pos_encoding = nn.Identity()
262
+ # if rotary, replace absolute embedding with a rotary embedding instance (absolute becomes an identity)
263
+ if use_rotary:
264
+ self.rotary = RotaryEmbedding(dim=latent_dim)
265
+ else:
266
+ self.abs_pos_encoding = PositionalEncoding(
267
+ latent_dim, dropout, batch_first=True
268
+ )
269
+
270
+ # time embedding processing
271
+ self.time_mlp = nn.Sequential(
272
+ SinusoidalPosEmb(latent_dim), # learned?
273
+ nn.Linear(latent_dim, latent_dim * 4),
274
+ nn.Mish(),
275
+ )
276
+
277
+ self.to_time_cond = nn.Sequential(nn.Linear(latent_dim * 4, latent_dim),)
278
+
279
+ self.to_time_tokens = nn.Sequential(
280
+ nn.Linear(latent_dim * 4, latent_dim * 2), # 2 time tokens
281
+ Rearrange("b (r d) -> b r d", r=2),
282
+ )
283
+
284
+ # null embeddings for guidance dropout
285
+ self.null_cond_embed = nn.Parameter(torch.randn(1, seq_len, latent_dim))
286
+ self.null_cond_hidden = nn.Parameter(torch.randn(1, latent_dim))
287
+
288
+ self.norm_cond = nn.LayerNorm(latent_dim)
289
+
290
+ # input projection
291
+ if self.multi_cond_frame:
292
+ self.input_projection = nn.Linear(nfeats * 2 + 1, latent_dim)
293
+ else:
294
+ self.input_projection = nn.Linear(nfeats * 2, latent_dim)
295
+ self.cond_encoder = nn.Sequential()
296
+ for _ in range(2):
297
+ self.cond_encoder.append(
298
+ TransformerEncoderLayer(
299
+ d_model=latent_dim,
300
+ nhead=num_heads,
301
+ dim_feedforward=ff_size,
302
+ dropout=dropout,
303
+ activation=activation,
304
+ batch_first=True,
305
+ rotary=self.rotary,
306
+ )
307
+ )
308
+ # conditional projection
309
+ self.cond_projection = nn.Linear(cond_feature_dim, latent_dim)
310
+ self.non_attn_cond_projection = nn.Sequential(
311
+ nn.LayerNorm(latent_dim),
312
+ nn.Linear(latent_dim, latent_dim),
313
+ nn.SiLU(),
314
+ nn.Linear(latent_dim, latent_dim),
315
+ )
316
+ # decoder
317
+ decoderstack = nn.ModuleList([])
318
+ for _ in range(num_layers):
319
+ decoderstack.append(
320
+ FiLMTransformerDecoderLayer(
321
+ latent_dim,
322
+ num_heads,
323
+ dim_feedforward=ff_size,
324
+ dropout=dropout,
325
+ activation=activation,
326
+ batch_first=True,
327
+ rotary=self.rotary,
328
+ )
329
+ )
330
+
331
+ self.seqTransDecoder = DecoderLayerStack(decoderstack)
332
+
333
+ self.final_layer = nn.Linear(latent_dim, output_feats)
334
+
335
+ self.epsilon = 0.00001
336
+
337
+ def guided_forward(self, x, cond_frame, cond_embed, times, guidance_weight):
338
+ unc = self.forward(x, cond_frame, cond_embed, times, cond_drop_prob=1)
339
+ conditioned = self.forward(x, cond_frame, cond_embed, times, cond_drop_prob=0)
340
+
341
+ return unc + (conditioned - unc) * guidance_weight
342
+
343
+ def forward(
344
+ self, x: Tensor, cond_frame: Tensor, cond_embed: Tensor, times: Tensor, cond_drop_prob: float = 0.0
345
+ ):
346
+ batch_size, device = x.shape[0], x.device
347
+
348
+ # concat last frame, project to latent space
349
+ # cond_frame: [b, dim] | [b, n, dim+1]
350
+ if self.multi_cond_frame:
351
+ # [b, n, dim+1] (+1 mask)
352
+ x = torch.cat([x, cond_frame], dim=-1)
353
+ else:
354
+ # [b, dim]
355
+ x = torch.cat([x, cond_frame.unsqueeze(1).repeat(1, x.shape[1], 1)], dim=-1)
356
+ x = self.input_projection(x)
357
+ # add the positional embeddings of the input sequence to provide temporal information
358
+ x = self.abs_pos_encoding(x)
359
+
360
+ # create audio conditional embedding with conditional dropout
361
+ keep_mask = prob_mask_like((batch_size,), 1 - cond_drop_prob, device=device)
362
+ keep_mask_embed = rearrange(keep_mask, "b -> b 1 1")
363
+ keep_mask_hidden = rearrange(keep_mask, "b -> b 1")
364
+
365
+ cond_tokens = self.cond_projection(cond_embed)
366
+ # encode tokens
367
+ cond_tokens = self.abs_pos_encoding(cond_tokens)
368
+ cond_tokens = self.cond_encoder(cond_tokens)
369
+
370
+ null_cond_embed = self.null_cond_embed.to(cond_tokens.dtype)
371
+ cond_tokens = torch.where(keep_mask_embed, cond_tokens, null_cond_embed)
372
+
373
+ mean_pooled_cond_tokens = cond_tokens.mean(dim=-2)
374
+ cond_hidden = self.non_attn_cond_projection(mean_pooled_cond_tokens)
375
+
376
+ # create the diffusion timestep embedding, add the extra audio projection
377
+ t_hidden = self.time_mlp(times)
378
+
379
+ # project to attention and FiLM conditioning
380
+ t = self.to_time_cond(t_hidden)
381
+ t_tokens = self.to_time_tokens(t_hidden)
382
+
383
+ # FiLM conditioning
384
+ null_cond_hidden = self.null_cond_hidden.to(t.dtype)
385
+ cond_hidden = torch.where(keep_mask_hidden, cond_hidden, null_cond_hidden)
386
+ t += cond_hidden
387
+
388
+ # cross-attention conditioning
389
+ c = torch.cat((cond_tokens, t_tokens), dim=-2)
390
+ cond_tokens = self.norm_cond(c)
391
+
392
+ # Pass through the transformer decoder
393
+ # attending to the conditional embedding
394
+ output = self.seqTransDecoder(x, cond_tokens, t)
395
+
396
+ output = self.final_layer(output)
397
+
398
+ return output
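`guided_forward` above is standard classifier-free guidance applied to the denoised prediction: one pass with the audio condition dropped, one with it kept, blended as

    \hat{x}_0 = \hat{x}_0^{\text{uncond}} + w \, (\hat{x}_0^{\text{cond}} - \hat{x}_0^{\text{uncond}})

with w = guidance_weight (set to 2 in the LMDM wrapper); w = 0 is fully unconditional and w = 1 fully conditional.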
core/models/modules/lmdm_modules/rotary_embedding_torch.py ADDED
@@ -0,0 +1,132 @@
1
+ from inspect import isfunction
2
+ from math import log, pi
3
+
4
+ import torch
5
+ from einops import rearrange, repeat
6
+ from torch import einsum, nn
7
+
8
+ # helper functions
9
+
10
+
11
+ def exists(val):
12
+ return val is not None
13
+
14
+
15
+ def broadcat(tensors, dim=-1):
16
+ num_tensors = len(tensors)
17
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
18
+ assert len(shape_lens) == 1, "tensors must all have the same number of dimensions"
19
+ shape_len = list(shape_lens)[0]
20
+
21
+ dim = (dim + shape_len) if dim < 0 else dim
22
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
23
+
24
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
25
+ assert all(
26
+ [*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]
27
+ ), "invalid dimensions for broadcastable concatentation"
28
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
29
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
30
+ expanded_dims.insert(dim, (dim, dims[dim]))
31
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
32
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
33
+ return torch.cat(tensors, dim=dim)
34
+
35
+
36
+ # rotary embedding helper functions
37
+
38
+
39
+ def rotate_half(x):
40
+ x = rearrange(x, "... (d r) -> ... d r", r=2)
41
+ x1, x2 = x.unbind(dim=-1)
42
+ x = torch.stack((-x2, x1), dim=-1)
43
+ return rearrange(x, "... d r -> ... (d r)")
44
+
45
+
46
+ def apply_rotary_emb(freqs, t, start_index=0):
47
+ freqs = freqs.to(t)
48
+ rot_dim = freqs.shape[-1]
49
+ end_index = start_index + rot_dim
50
+ assert (
51
+ rot_dim <= t.shape[-1]
52
+ ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"
53
+ t_left, t, t_right = (
54
+ t[..., :start_index],
55
+ t[..., start_index:end_index],
56
+ t[..., end_index:],
57
+ )
58
+ t = (t * freqs.cos()) + (rotate_half(t) * freqs.sin())
59
+ return torch.cat((t_left, t, t_right), dim=-1)
60
+
61
+
62
+ # learned rotation helpers
63
+
64
+
65
+ def apply_learned_rotations(rotations, t, start_index=0, freq_ranges=None):
66
+ if exists(freq_ranges):
67
+ rotations = einsum("..., f -> ... f", rotations, freq_ranges)
68
+ rotations = rearrange(rotations, "... r f -> ... (r f)")
69
+
70
+ rotations = repeat(rotations, "... n -> ... (n r)", r=2)
71
+ return apply_rotary_emb(rotations, t, start_index=start_index)
72
+
73
+
74
+ # classes
75
+
76
+
77
+ class RotaryEmbedding(nn.Module):
78
+ def __init__(
79
+ self,
80
+ dim,
81
+ custom_freqs=None,
82
+ freqs_for="lang",
83
+ theta=10000,
84
+ max_freq=10,
85
+ num_freqs=1,
86
+ learned_freq=False,
87
+ ):
88
+ super().__init__()
89
+ if exists(custom_freqs):
90
+ freqs = custom_freqs
91
+ elif freqs_for == "lang":
92
+ freqs = 1.0 / (
93
+ theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)
94
+ )
95
+ elif freqs_for == "pixel":
96
+ freqs = torch.linspace(1.0, max_freq / 2, dim // 2) * pi
97
+ elif freqs_for == "constant":
98
+ freqs = torch.ones(num_freqs).float()
99
+ else:
100
+ raise ValueError(f"unknown modality {freqs_for}")
101
+
102
+ self.cache = dict()
103
+
104
+ if learned_freq:
105
+ self.freqs = nn.Parameter(freqs)
106
+ else:
107
+ self.register_buffer("freqs", freqs)
108
+
109
+ def rotate_queries_or_keys(self, t, seq_dim=-2):
110
+ device = t.device
111
+ seq_len = t.shape[seq_dim]
112
+ freqs = self.forward(
113
+ lambda: torch.arange(seq_len, device=device), cache_key=seq_len
114
+ )
115
+ return apply_rotary_emb(freqs, t)
116
+
117
+ def forward(self, t, cache_key=None):
118
+ if exists(cache_key) and cache_key in self.cache:
119
+ return self.cache[cache_key]
120
+
121
+ if isfunction(t):
122
+ t = t()
123
+
124
+ freqs = self.freqs
125
+
126
+ freqs = torch.einsum("..., f -> ... f", t.type(freqs.dtype), freqs)
127
+ freqs = repeat(freqs, "... n -> ... (n r)", r=2)
128
+
129
+ if exists(cache_key):
130
+ self.cache[cache_key] = freqs
131
+
132
+ return freqs
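A minimal sketch of how the attention layers above use this module: queries/keys are rotated in place of adding absolute positional encodings, so relative position is carried by the rotation itself:

    import torch

    rotary = RotaryEmbedding(dim=64)
    q = torch.randn(2, 10, 64)             # (batch, seq, dim)
    q_rot = rotary.rotate_queries_or_keys(q)
    print(q_rot.shape)                     # torch.Size([2, 10, 64])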
core/models/modules/lmdm_modules/utils.py ADDED
@@ -0,0 +1,96 @@
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+
6
+
7
+ # absolute positional embedding used for vanilla transformer sequential data
8
+ class PositionalEncoding(nn.Module):
9
+ def __init__(self, d_model, dropout=0.1, max_len=500, batch_first=False):
10
+ super().__init__()
11
+ self.batch_first = batch_first
12
+
13
+ self.dropout = nn.Dropout(p=dropout)
14
+
15
+ pe = torch.zeros(max_len, d_model)
16
+ position = torch.arange(0, max_len).unsqueeze(1)
17
+ div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
18
+ pe[:, 0::2] = torch.sin(position * div_term)
19
+ pe[:, 1::2] = torch.cos(position * div_term)
20
+ pe = pe.unsqueeze(0).transpose(0, 1)
21
+
22
+ self.register_buffer("pe", pe)
23
+
24
+ def forward(self, x):
25
+ if self.batch_first:
26
+ x = x + self.pe.permute(1, 0, 2)[:, : x.shape[1], :]
27
+ else:
28
+ x = x + self.pe[: x.shape[0], :]
29
+ return self.dropout(x)
30
+
31
+
32
+ # very similar positional embedding used for diffusion timesteps
33
+ class SinusoidalPosEmb(nn.Module):
34
+ def __init__(self, dim):
35
+ super().__init__()
36
+ self.dim = dim
37
+
38
+ def forward(self, x):
39
+ device = x.device
40
+ half_dim = self.dim // 2
41
+ emb = math.log(10000) / (half_dim - 1)
42
+ emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
43
+ emb = x[:, None] * emb[None, :]
44
+ emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
45
+ return emb
46
+
47
+
48
+ # dropout mask
49
+ def prob_mask_like(shape, prob, device):
50
+ if prob == 1:
51
+ return torch.ones(shape, device=device, dtype=torch.bool)
52
+ elif prob == 0:
53
+ return torch.zeros(shape, device=device, dtype=torch.bool)
54
+ else:
55
+ return torch.zeros(shape, device=device).float().uniform_(0, 1) < prob
56
+
57
+
58
+ def extract(a, t, x_shape):
59
+ b, *_ = t.shape
60
+ out = a.gather(-1, t)
61
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
62
+
63
+
64
+ def make_beta_schedule(
65
+ schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3
66
+ ):
67
+ if schedule == "linear":
68
+ betas = (
69
+ torch.linspace(
70
+ linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64
71
+ )
72
+ ** 2
73
+ )
74
+
75
+ elif schedule == "cosine":
76
+ timesteps = (
77
+ torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
78
+ )
79
+ alphas = timesteps / (1 + cosine_s) * np.pi / 2
80
+ alphas = torch.cos(alphas).pow(2)
81
+ alphas = alphas / alphas[0]
82
+ betas = 1 - alphas[1:] / alphas[:-1]
83
+ betas = np.clip(betas, a_min=0, a_max=0.999)
84
+
85
+ elif schedule == "sqrt_linear":
86
+ betas = torch.linspace(
87
+ linear_start, linear_end, n_timestep, dtype=torch.float64
88
+ )
89
+ elif schedule == "sqrt":
90
+ betas = (
91
+ torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
92
+ ** 0.5
93
+ )
94
+ else:
95
+ raise ValueError(f"schedule '{schedule}' unknown.")
96
+ return betas.numpy()
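
A short sketch of how these diffusion helpers typically fit together (illustrative, not part of the commit): `make_beta_schedule` builds the noise schedule and `extract` gathers per-timestep coefficients for a batch; the tensor shapes below are made up for the example.

import torch
from core.models.modules.lmdm_modules.utils import make_beta_schedule, extract

betas = torch.from_numpy(make_beta_schedule("linear", n_timestep=1000)).float()
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)          # cumulative alpha_bar_t

x0 = torch.randn(4, 80, 265)                                # hypothetical clean motion sequences
t = torch.randint(0, 1000, (4,))                            # one diffusion step per sample
noise = torch.randn_like(x0)

# forward noising q(x_t | x_0), with coefficients broadcast by `extract`
x_t = (
    extract(alphas_cumprod.sqrt(), t, x0.shape) * x0
    + extract((1.0 - alphas_cumprod).sqrt(), t, x0.shape) * noise
)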
core/models/modules/motion_extractor.py ADDED
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+
3
+ """
4
+ Motion extractor(M), which directly predicts the canonical keypoints, head pose and expression deformation of the input image
5
+ """
6
+
7
+ from torch import nn
8
+ import torch
9
+
10
+ from .convnextv2 import convnextv2_tiny
11
+
12
+
13
+ class MotionExtractor(nn.Module):
14
+ def __init__(self, num_kp=21, backbone="convnextv2_tiny"):
15
+ super(MotionExtractor, self).__init__()
16
+ self.detector = convnextv2_tiny(num_kp=num_kp, backbone=backbone)
17
+
18
+ def forward(self, x):
19
+ out = self.detector(x)
20
+ return out # pitch, yaw, roll, t, exp, scale, kp
21
+
22
+ def load_model(self, ckpt_path):
23
+ self.load_state_dict(torch.load(ckpt_path, map_location=lambda storage, loc: storage))
24
+ self.eval()
25
+ return self
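
A minimal shape sketch for the module above (not part of the commit): the checkpoint path is a placeholder, and the tuple unpacking assumes the ConvNeXtV2 backbone returns the seven tensors in the order noted in forward().

import torch
from core.models.modules.motion_extractor import MotionExtractor

net = MotionExtractor(num_kp=21).load_model("./checkpoints/motion_extractor.pth")  # placeholder path

img = torch.rand(1, 3, 256, 256)    # RGB image in [0, 1]
with torch.no_grad():
    pitch, yaw, roll, t, exp, scale, kp = net(img)
# kp and exp hold the canonical keypoints and their expression offsets (num_kp * 3 values each)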
core/models/modules/spade_generator.py ADDED
@@ -0,0 +1,87 @@
1
+ # coding: utf-8
2
+
3
+ """
4
+ Spade decoder(G) defined in the paper, which takes the warped feature as input and generates the animated image.
5
+ """
6
+
7
+ import torch
8
+ from torch import nn
9
+ import torch.nn.functional as F
10
+ from .util import SPADEResnetBlock
11
+
12
+
13
+ class SPADEDecoder(nn.Module):
14
+ def __init__(
15
+ self,
16
+ upscale=2,
17
+ max_features=512,
18
+ block_expansion=64,
19
+ out_channels=64,
20
+ num_down_blocks=2,
21
+ ):
22
+ for i in range(num_down_blocks):
23
+ input_channels = min(max_features, block_expansion * (2 ** (i + 1)))
24
+ self.upscale = upscale
25
+ super().__init__()
26
+ norm_G = "spadespectralinstance"
27
+ label_num_channels = input_channels # 256
28
+
29
+ self.fc = nn.Conv2d(input_channels, 2 * input_channels, 3, padding=1)
30
+ self.G_middle_0 = SPADEResnetBlock(
31
+ 2 * input_channels, 2 * input_channels, norm_G, label_num_channels
32
+ )
33
+ self.G_middle_1 = SPADEResnetBlock(
34
+ 2 * input_channels, 2 * input_channels, norm_G, label_num_channels
35
+ )
36
+ self.G_middle_2 = SPADEResnetBlock(
37
+ 2 * input_channels, 2 * input_channels, norm_G, label_num_channels
38
+ )
39
+ self.G_middle_3 = SPADEResnetBlock(
40
+ 2 * input_channels, 2 * input_channels, norm_G, label_num_channels
41
+ )
42
+ self.G_middle_4 = SPADEResnetBlock(
43
+ 2 * input_channels, 2 * input_channels, norm_G, label_num_channels
44
+ )
45
+ self.G_middle_5 = SPADEResnetBlock(
46
+ 2 * input_channels, 2 * input_channels, norm_G, label_num_channels
47
+ )
48
+ self.up_0 = SPADEResnetBlock(
49
+ 2 * input_channels, input_channels, norm_G, label_num_channels
50
+ )
51
+ self.up_1 = SPADEResnetBlock(
52
+ input_channels, out_channels, norm_G, label_num_channels
53
+ )
54
+ self.up = nn.Upsample(scale_factor=2)
55
+
56
+ if self.upscale is None or self.upscale <= 1:
57
+ self.conv_img = nn.Conv2d(out_channels, 3, 3, padding=1)
58
+ else:
59
+ self.conv_img = nn.Sequential(
60
+ nn.Conv2d(out_channels, 3 * (2 * 2), kernel_size=3, padding=1),
61
+ nn.PixelShuffle(upscale_factor=2),
62
+ )
63
+
64
+ def forward(self, feature):
65
+ seg = feature # Bx256x64x64
66
+ x = self.fc(feature) # Bx512x64x64
67
+ x = self.G_middle_0(x, seg)
68
+ x = self.G_middle_1(x, seg)
69
+ x = self.G_middle_2(x, seg)
70
+ x = self.G_middle_3(x, seg)
71
+ x = self.G_middle_4(x, seg)
72
+ x = self.G_middle_5(x, seg)
73
+
74
+ x = self.up(x) # Bx512x64x64 -> Bx512x128x128
75
+ x = self.up_0(x, seg) # Bx512x128x128 -> Bx256x128x128
76
+ x = self.up(x) # Bx256x128x128 -> Bx256x256x256
77
+ x = self.up_1(x, seg) # Bx256x256x256 -> Bx64x256x256
78
+
79
+ x = self.conv_img(F.leaky_relu(x, 2e-1)) # Bx64x256x256 -> Bx3xHxW
80
+ x = torch.sigmoid(x) # Bx3xHxW
81
+
82
+ return x
83
+
84
+ def load_model(self, ckpt_path):
85
+ self.load_state_dict(torch.load(ckpt_path, map_location=lambda storage, loc: storage))
86
+ self.eval()
87
+ return self
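
A shape sketch for the decoder above (illustrative, randomly initialized): with the default configuration the input is the 256-channel warped feature at 64x64, and the pixel-shuffle head on top of the two x2 upsamples yields a 512x512 RGB frame in (0, 1).

import torch
from core.models.modules.spade_generator import SPADEDecoder

decoder = SPADEDecoder(upscale=2)
feature = torch.randn(1, 256, 64, 64)   # warped feature from the warping network
with torch.no_grad():
    img = decoder(feature)
print(img.shape)                        # torch.Size([1, 3, 512, 512])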
core/models/modules/stitching_network.py ADDED
@@ -0,0 +1,65 @@
1
+ # coding: utf-8
2
+
3
+ """
4
+ Stitching module(S) and two retargeting modules(R) defined in the paper.
5
+
6
+ - The stitching module pastes the animated portrait back into the original image space without pixel misalignment, such as in
7
+ the stitching region.
8
+
9
+ - The eyes retargeting module is designed to address the issue of incomplete eye closure during cross-id reenactment, especially
10
+ when a person with small eyes drives a person with larger eyes.
11
+
12
+ - The lip retargeting module is designed similarly to the eye retargeting module, and can also normalize the input by ensuring that
13
+ the lips are in a closed state, which facilitates better animation driving.
14
+ """
15
+ import torch
16
+ from torch import nn
17
+
18
+
19
+ def remove_ddp_dumplicate_key(state_dict):
20
+ from collections import OrderedDict
21
+ state_dict_new = OrderedDict()
22
+ for key in state_dict.keys():
23
+ state_dict_new[key.replace('module.', '')] = state_dict[key]
24
+ return state_dict_new
25
+
26
+
27
+ class StitchingNetwork(nn.Module):
28
+ def __init__(self, input_size=126, hidden_sizes=[128, 128, 64], output_size=65):
29
+ super(StitchingNetwork, self).__init__()
30
+ layers = []
31
+ for i in range(len(hidden_sizes)):
32
+ if i == 0:
33
+ layers.append(nn.Linear(input_size, hidden_sizes[i]))
34
+ else:
35
+ layers.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i]))
36
+ layers.append(nn.ReLU(inplace=True))
37
+ layers.append(nn.Linear(hidden_sizes[-1], output_size))
38
+ self.mlp = nn.Sequential(*layers)
39
+
40
+ def _forward(self, x):
41
+ return self.mlp(x)
42
+
43
+ def load_model(self, ckpt_path):
44
+ checkpoint = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
45
+ self.load_state_dict(remove_ddp_dumplicate_key(checkpoint['retarget_shoulder']))
46
+ self.eval()
47
+ return self
48
+
49
+ def stitching(self, kp_source: torch.Tensor, kp_driving: torch.Tensor) -> torch.Tensor:
50
+ """ conduct the stitching
51
+ kp_source: Bxnum_kpx3
52
+ kp_driving: Bxnum_kpx3
53
+ """
54
+ bs, num_kp = kp_source.shape[:2]
55
+ kp_driving_new = kp_driving.clone()
56
+ delta = self._forward(torch.cat([kp_source.view(bs, -1), kp_driving_new.view(bs, -1)], dim=1))
57
+ delta_exp = delta[..., :3*num_kp].reshape(bs, num_kp, 3) # 1x20x3
58
+ delta_tx_ty = delta[..., 3*num_kp:3*num_kp+2].reshape(bs, 1, 2) # 1x1x2
59
+ kp_driving_new += delta_exp
60
+ kp_driving_new[..., :2] += delta_tx_ty
61
+ return kp_driving_new
62
+
63
+ def forward(self, kp_source, kp_driving):
64
+ out = self.stitching(kp_source, kp_driving)
65
+ return out
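
A shape sketch for the stitching module above (illustrative, randomly initialized; load_model() restores the trained weights): the MLP consumes the concatenated, flattened source and driving keypoints (126 = 2 x 21 x 3) and predicts 65 values, i.e. 63 per-keypoint offsets plus a global (tx, ty) shift.

import torch
from core.models.modules.stitching_network import StitchingNetwork

stitcher = StitchingNetwork()
kp_source = torch.randn(1, 21, 3)
kp_driving = torch.randn(1, 21, 3)
with torch.no_grad():
    kp_stitched = stitcher(kp_source, kp_driving)   # (1, 21, 3), driving keypoints nudged toward the source frame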
core/models/modules/util.py ADDED
@@ -0,0 +1,452 @@
1
+ # coding: utf-8
2
+
3
+ """
4
+ This file defines various neural network modules and utility functions, including convolutional and residual blocks,
5
+ normalizations, and functions for spatial transformation and tensor manipulation.
6
+ """
7
+
8
+ from torch import nn
9
+ import torch.nn.functional as F
10
+ import torch
11
+ import torch.nn.utils.spectral_norm as spectral_norm
12
+ import math
13
+ import warnings
14
+ import collections.abc
15
+ from itertools import repeat
16
+
17
+ def kp2gaussian(kp, spatial_size, kp_variance):
18
+ """
19
+ Transform a keypoint into gaussian like representation
20
+ """
21
+ mean = kp
22
+
23
+ coordinate_grid = make_coordinate_grid(spatial_size, mean)
24
+ number_of_leading_dimensions = len(mean.shape) - 1
25
+ shape = (1,) * number_of_leading_dimensions + coordinate_grid.shape
26
+ coordinate_grid = coordinate_grid.view(*shape)
27
+ repeats = mean.shape[:number_of_leading_dimensions] + (1, 1, 1, 1)
28
+ coordinate_grid = coordinate_grid.repeat(*repeats)
29
+
30
+ # Preprocess kp shape
31
+ shape = mean.shape[:number_of_leading_dimensions] + (1, 1, 1, 3)
32
+ mean = mean.view(*shape)
33
+
34
+ mean_sub = (coordinate_grid - mean)
35
+
36
+ out = torch.exp(-0.5 * (mean_sub ** 2).sum(-1) / kp_variance)
37
+
38
+ return out
39
+
40
+
41
+ def make_coordinate_grid(spatial_size, ref, **kwargs):
42
+ d, h, w = spatial_size
43
+ x = torch.arange(w).type(ref.dtype).to(ref.device)
44
+ y = torch.arange(h).type(ref.dtype).to(ref.device)
45
+ z = torch.arange(d).type(ref.dtype).to(ref.device)
46
+
47
+ # NOTE: must be right-down-in
48
+ x = (2 * (x / (w - 1)) - 1) # the x axis faces to the right
49
+ y = (2 * (y / (h - 1)) - 1) # the y axis faces to the bottom
50
+ z = (2 * (z / (d - 1)) - 1) # the z axis faces to the inner
51
+
52
+ yy = y.view(1, -1, 1).repeat(d, 1, w)
53
+ xx = x.view(1, 1, -1).repeat(d, h, 1)
54
+ zz = z.view(-1, 1, 1).repeat(1, h, w)
55
+
56
+ meshed = torch.cat([xx.unsqueeze_(3), yy.unsqueeze_(3), zz.unsqueeze_(3)], 3)
57
+
58
+ return meshed
59
+
60
+
61
+ class ConvT2d(nn.Module):
62
+ """
63
+ Upsampling block for use in decoder.
64
+ """
65
+
66
+ def __init__(self, in_features, out_features, kernel_size=3, stride=2, padding=1, output_padding=1):
67
+ super(ConvT2d, self).__init__()
68
+
69
+ self.convT = nn.ConvTranspose2d(in_features, out_features, kernel_size=kernel_size, stride=stride,
70
+ padding=padding, output_padding=output_padding)
71
+ self.norm = nn.InstanceNorm2d(out_features)
72
+
73
+ def forward(self, x):
74
+ out = self.convT(x)
75
+ out = self.norm(out)
76
+ out = F.leaky_relu(out)
77
+ return out
78
+
79
+
80
+ class ResBlock3d(nn.Module):
81
+ """
82
+ Res block, preserve spatial resolution.
83
+ """
84
+
85
+ def __init__(self, in_features, kernel_size, padding):
86
+ super(ResBlock3d, self).__init__()
87
+ self.conv1 = nn.Conv3d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size, padding=padding)
88
+ self.conv2 = nn.Conv3d(in_channels=in_features, out_channels=in_features, kernel_size=kernel_size, padding=padding)
89
+ self.norm1 = nn.BatchNorm3d(in_features, affine=True)
90
+ self.norm2 = nn.BatchNorm3d(in_features, affine=True)
91
+
92
+ def forward(self, x):
93
+ out = self.norm1(x)
94
+ out = F.relu(out)
95
+ out = self.conv1(out)
96
+ out = self.norm2(out)
97
+ out = F.relu(out)
98
+ out = self.conv2(out)
99
+ out += x
100
+ return out
101
+
102
+
103
+ class UpBlock3d(nn.Module):
104
+ """
105
+ Upsampling block for use in decoder.
106
+ """
107
+
108
+ def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
109
+ super(UpBlock3d, self).__init__()
110
+
111
+ self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
112
+ padding=padding, groups=groups)
113
+ self.norm = nn.BatchNorm3d(out_features, affine=True)
114
+
115
+ def forward(self, x):
116
+ out = F.interpolate(x, scale_factor=(1, 2, 2))
117
+ out = self.conv(out)
118
+ out = self.norm(out)
119
+ out = F.relu(out)
120
+ return out
121
+
122
+
123
+ class DownBlock2d(nn.Module):
124
+ """
125
+ Downsampling block for use in encoder.
126
+ """
127
+
128
+ def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
129
+ super(DownBlock2d, self).__init__()
130
+ self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size, padding=padding, groups=groups)
131
+ self.norm = nn.BatchNorm2d(out_features, affine=True)
132
+ self.pool = nn.AvgPool2d(kernel_size=(2, 2))
133
+
134
+ def forward(self, x):
135
+ out = self.conv(x)
136
+ out = self.norm(out)
137
+ out = F.relu(out)
138
+ out = self.pool(out)
139
+ return out
140
+
141
+
142
+ class DownBlock3d(nn.Module):
143
+ """
144
+ Downsampling block for use in encoder.
145
+ """
146
+
147
+ def __init__(self, in_features, out_features, kernel_size=3, padding=1, groups=1):
148
+ super(DownBlock3d, self).__init__()
149
+ '''
150
+ self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
151
+ padding=padding, groups=groups, stride=(1, 2, 2))
152
+ '''
153
+ self.conv = nn.Conv3d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size,
154
+ padding=padding, groups=groups)
155
+ self.norm = nn.BatchNorm3d(out_features, affine=True)
156
+ self.pool = nn.AvgPool3d(kernel_size=(1, 2, 2))
157
+
158
+ def forward(self, x):
159
+ out = self.conv(x)
160
+ out = self.norm(out)
161
+ out = F.relu(out)
162
+ out = self.pool(out)
163
+ return out
164
+
165
+
166
+ class SameBlock2d(nn.Module):
167
+ """
168
+ Simple block, preserve spatial resolution.
169
+ """
170
+
171
+ def __init__(self, in_features, out_features, groups=1, kernel_size=3, padding=1, lrelu=False):
172
+ super(SameBlock2d, self).__init__()
173
+ self.conv = nn.Conv2d(in_channels=in_features, out_channels=out_features, kernel_size=kernel_size, padding=padding, groups=groups)
174
+ self.norm = nn.BatchNorm2d(out_features, affine=True)
175
+ if lrelu:
176
+ self.ac = nn.LeakyReLU()
177
+ else:
178
+ self.ac = nn.ReLU()
179
+
180
+ def forward(self, x):
181
+ out = self.conv(x)
182
+ out = self.norm(out)
183
+ out = self.ac(out)
184
+ return out
185
+
186
+
187
+ class Encoder(nn.Module):
188
+ """
189
+ Hourglass Encoder
190
+ """
191
+
192
+ def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
193
+ super(Encoder, self).__init__()
194
+
195
+ down_blocks = []
196
+ for i in range(num_blocks):
197
+ down_blocks.append(DownBlock3d(in_features if i == 0 else min(max_features, block_expansion * (2 ** i)), min(max_features, block_expansion * (2 ** (i + 1))), kernel_size=3, padding=1))
198
+ self.down_blocks = nn.ModuleList(down_blocks)
199
+
200
+ def forward(self, x):
201
+ outs = [x]
202
+ for down_block in self.down_blocks:
203
+ outs.append(down_block(outs[-1]))
204
+ return outs
205
+
206
+
207
+ class Decoder(nn.Module):
208
+ """
209
+ Hourglass Decoder
210
+ """
211
+
212
+ def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
213
+ super(Decoder, self).__init__()
214
+
215
+ up_blocks = []
216
+
217
+ for i in range(num_blocks)[::-1]:
218
+ in_filters = (1 if i == num_blocks - 1 else 2) * min(max_features, block_expansion * (2 ** (i + 1)))
219
+ out_filters = min(max_features, block_expansion * (2 ** i))
220
+ up_blocks.append(UpBlock3d(in_filters, out_filters, kernel_size=3, padding=1))
221
+
222
+ self.up_blocks = nn.ModuleList(up_blocks)
223
+ self.out_filters = block_expansion + in_features
224
+
225
+ self.conv = nn.Conv3d(in_channels=self.out_filters, out_channels=self.out_filters, kernel_size=3, padding=1)
226
+ self.norm = nn.BatchNorm3d(self.out_filters, affine=True)
227
+
228
+ def forward(self, x):
229
+ out = x.pop()
230
+ for up_block in self.up_blocks:
231
+ out = up_block(out)
232
+ skip = x.pop()
233
+ out = torch.cat([out, skip], dim=1)
234
+ out = self.conv(out)
235
+ out = self.norm(out)
236
+ out = F.relu(out)
237
+ return out
238
+
239
+
240
+ class Hourglass(nn.Module):
241
+ """
242
+ Hourglass architecture.
243
+ """
244
+
245
+ def __init__(self, block_expansion, in_features, num_blocks=3, max_features=256):
246
+ super(Hourglass, self).__init__()
247
+ self.encoder = Encoder(block_expansion, in_features, num_blocks, max_features)
248
+ self.decoder = Decoder(block_expansion, in_features, num_blocks, max_features)
249
+ self.out_filters = self.decoder.out_filters
250
+
251
+ def forward(self, x):
252
+ return self.decoder(self.encoder(x))
253
+
254
+
255
+ class SPADE(nn.Module):
256
+ def __init__(self, norm_nc, label_nc):
257
+ super().__init__()
258
+
259
+ self.param_free_norm = nn.InstanceNorm2d(norm_nc, affine=False)
260
+ nhidden = 128
261
+
262
+ self.mlp_shared = nn.Sequential(
263
+ nn.Conv2d(label_nc, nhidden, kernel_size=3, padding=1),
264
+ nn.ReLU())
265
+ self.mlp_gamma = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)
266
+ self.mlp_beta = nn.Conv2d(nhidden, norm_nc, kernel_size=3, padding=1)
267
+
268
+ def forward(self, x, segmap):
269
+ normalized = self.param_free_norm(x)
270
+ segmap = F.interpolate(segmap, size=x.size()[2:], mode='nearest')
271
+ actv = self.mlp_shared(segmap)
272
+ gamma = self.mlp_gamma(actv)
273
+ beta = self.mlp_beta(actv)
274
+ out = normalized * (1 + gamma) + beta
275
+ return out
276
+
277
+
278
+ class SPADEResnetBlock(nn.Module):
279
+ def __init__(self, fin, fout, norm_G, label_nc, use_se=False, dilation=1):
280
+ super().__init__()
281
+ # Attributes
282
+ self.learned_shortcut = (fin != fout)
283
+ fmiddle = min(fin, fout)
284
+ self.use_se = use_se
285
+ # create conv layers
286
+ self.conv_0 = nn.Conv2d(fin, fmiddle, kernel_size=3, padding=dilation, dilation=dilation)
287
+ self.conv_1 = nn.Conv2d(fmiddle, fout, kernel_size=3, padding=dilation, dilation=dilation)
288
+ if self.learned_shortcut:
289
+ self.conv_s = nn.Conv2d(fin, fout, kernel_size=1, bias=False)
290
+ # apply spectral norm if specified
291
+ if 'spectral' in norm_G:
292
+ self.conv_0 = spectral_norm(self.conv_0)
293
+ self.conv_1 = spectral_norm(self.conv_1)
294
+ if self.learned_shortcut:
295
+ self.conv_s = spectral_norm(self.conv_s)
296
+ # define normalization layers
297
+ self.norm_0 = SPADE(fin, label_nc)
298
+ self.norm_1 = SPADE(fmiddle, label_nc)
299
+ if self.learned_shortcut:
300
+ self.norm_s = SPADE(fin, label_nc)
301
+
302
+ def forward(self, x, seg1):
303
+ x_s = self.shortcut(x, seg1)
304
+ dx = self.conv_0(self.actvn(self.norm_0(x, seg1)))
305
+ dx = self.conv_1(self.actvn(self.norm_1(dx, seg1)))
306
+ out = x_s + dx
307
+ return out
308
+
309
+ def shortcut(self, x, seg1):
310
+ if self.learned_shortcut:
311
+ x_s = self.conv_s(self.norm_s(x, seg1))
312
+ else:
313
+ x_s = x
314
+ return x_s
315
+
316
+ def actvn(self, x):
317
+ return F.leaky_relu(x, 2e-1)
318
+
319
+
320
+ def filter_state_dict(state_dict, remove_name='fc'):
321
+ new_state_dict = {}
322
+ for key in state_dict:
323
+ if remove_name in key:
324
+ continue
325
+ new_state_dict[key] = state_dict[key]
326
+ return new_state_dict
327
+
328
+
329
+ class GRN(nn.Module):
330
+ """ GRN (Global Response Normalization) layer
331
+ """
332
+
333
+ def __init__(self, dim):
334
+ super().__init__()
335
+ self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim))
336
+ self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim))
337
+
338
+ def forward(self, x):
339
+ Gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
340
+ Nx = Gx / (Gx.mean(dim=-1, keepdim=True) + 1e-6)
341
+ return self.gamma * (x * Nx) + self.beta + x
342
+
343
+
344
+ class LayerNorm(nn.Module):
345
+ r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
346
+ The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
347
+ shape (batch_size, height, width, channels) while channels_first corresponds to inputs
348
+ with shape (batch_size, channels, height, width).
349
+ """
350
+
351
+ def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
352
+ super().__init__()
353
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
354
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
355
+ self.eps = eps
356
+ self.data_format = data_format
357
+ if self.data_format not in ["channels_last", "channels_first"]:
358
+ raise NotImplementedError
359
+ self.normalized_shape = (normalized_shape, )
360
+
361
+ def forward(self, x):
362
+ if self.data_format == "channels_last":
363
+ return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
364
+ elif self.data_format == "channels_first":
365
+ u = x.mean(1, keepdim=True)
366
+ s = (x - u).pow(2).mean(1, keepdim=True)
367
+ x = (x - u) / torch.sqrt(s + self.eps)
368
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
369
+ return x
370
+
371
+
372
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
373
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
374
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
375
+ def norm_cdf(x):
376
+ # Computes standard normal cumulative distribution function
377
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
378
+
379
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
380
+ warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
381
+ "The distribution of values may be incorrect.",
382
+ stacklevel=2)
383
+
384
+ with torch.no_grad():
385
+ # Values are generated by using a truncated uniform distribution and
386
+ # then using the inverse CDF for the normal distribution.
387
+ # Get upper and lower cdf values
388
+ l = norm_cdf((a - mean) / std)
389
+ u = norm_cdf((b - mean) / std)
390
+
391
+ # Uniformly fill tensor with values from [l, u], then translate to
392
+ # [2l-1, 2u-1].
393
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
394
+
395
+ # Use inverse cdf transform for normal distribution to get truncated
396
+ # standard normal
397
+ tensor.erfinv_()
398
+
399
+ # Transform to proper mean, std
400
+ tensor.mul_(std * math.sqrt(2.))
401
+ tensor.add_(mean)
402
+
403
+ # Clamp to ensure it's in the proper range
404
+ tensor.clamp_(min=a, max=b)
405
+ return tensor
406
+
407
+
408
+ def drop_path(x, drop_prob=0., training=False, scale_by_keep=True):
409
+ """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
410
+
411
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
412
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
413
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
414
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
415
+ 'survival rate' as the argument.
416
+
417
+ """
418
+ if drop_prob == 0. or not training:
419
+ return x
420
+ keep_prob = 1 - drop_prob
421
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
422
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
423
+ if keep_prob > 0.0 and scale_by_keep:
424
+ random_tensor.div_(keep_prob)
425
+ return x * random_tensor
426
+
427
+
428
+ class DropPath(nn.Module):
429
+ """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
430
+ """
431
+
432
+ def __init__(self, drop_prob=None, scale_by_keep=True):
433
+ super(DropPath, self).__init__()
434
+ self.drop_prob = drop_prob
435
+ self.scale_by_keep = scale_by_keep
436
+
437
+ def forward(self, x):
438
+ return drop_path(x, self.drop_prob, self.training, self.scale_by_keep)
439
+
440
+
441
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
442
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
443
+
444
+ # From PyTorch internals
445
+ def _ntuple(n):
446
+ def parse(x):
447
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
448
+ return tuple(x)
449
+ return tuple(repeat(x, n))
450
+ return parse
451
+
452
+ to_2tuple = _ntuple(2)
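
A small sketch of `kp2gaussian` from this file (illustrative): it rasterizes keypoints in the normalized [-1, 1] cube into Gaussian heatmaps on the grid built by `make_coordinate_grid`, which is how the dense-motion network consumes keypoints.

import torch
from core.models.modules.util import kp2gaussian

kp = torch.rand(1, 21, 3) * 2 - 1                       # 21 keypoints in [-1, 1]^3
heatmaps = kp2gaussian(kp, spatial_size=(16, 64, 64), kp_variance=0.01)
print(heatmaps.shape)                                   # torch.Size([1, 21, 16, 64, 64])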
core/models/modules/warping_network.py ADDED
@@ -0,0 +1,87 @@
1
+ # coding: utf-8
2
+
3
+ """
4
+ Warping field estimator(W) defined in the paper, which generates a warping field using the implicit
5
+ keypoint representations x_s and x_d, and employs this flow field to warp the source feature volume f_s.
6
+ """
7
+ import torch
8
+ from torch import nn
9
+ import torch.nn.functional as F
10
+ from .util import SameBlock2d
11
+ from .dense_motion import DenseMotionNetwork
12
+
13
+
14
+ class WarpingNetwork(nn.Module):
15
+ def __init__(
16
+ self,
17
+ num_kp=21,
18
+ block_expansion=64,
19
+ max_features=512,
20
+ num_down_blocks=2,
21
+ reshape_channel=32,
22
+ estimate_occlusion_map=True,
23
+ **kwargs
24
+ ):
25
+ super(WarpingNetwork, self).__init__()
26
+
27
+ self.upscale = kwargs.get('upscale', 1)
28
+ self.flag_use_occlusion_map = kwargs.get('flag_use_occlusion_map', True)
29
+
30
+ dense_motion_params = {
31
+ "block_expansion": 32,
32
+ "max_features": 1024,
33
+ "num_blocks": 5,
34
+ "reshape_depth": 16,
35
+ "compress": 4,
36
+ }
37
+
38
+ self.dense_motion_network = DenseMotionNetwork(
39
+ num_kp=num_kp,
40
+ feature_channel=reshape_channel,
41
+ estimate_occlusion_map=estimate_occlusion_map,
42
+ **dense_motion_params
43
+ )
44
+
45
+ self.third = SameBlock2d(max_features, block_expansion * (2 ** num_down_blocks), kernel_size=(3, 3), padding=(1, 1), lrelu=True)
46
+ self.fourth = nn.Conv2d(in_channels=block_expansion * (2 ** num_down_blocks), out_channels=block_expansion * (2 ** num_down_blocks), kernel_size=1, stride=1)
47
+
48
+ self.estimate_occlusion_map = estimate_occlusion_map
49
+
50
+ def deform_input(self, inp, deformation):
51
+ return F.grid_sample(inp, deformation, align_corners=False)
52
+
53
+ def forward(self, feature_3d, kp_source, kp_driving):
54
+ # Feature warper, Transforming feature representation according to deformation and occlusion
55
+ dense_motion = self.dense_motion_network(
56
+ feature=feature_3d, kp_driving=kp_driving, kp_source=kp_source
57
+ )
58
+ if 'occlusion_map' in dense_motion:
59
+ occlusion_map = dense_motion['occlusion_map'] # Bx1x64x64
60
+ else:
61
+ occlusion_map = None
62
+
63
+ deformation = dense_motion['deformation'] # Bx16x64x64x3
64
+ out = self.deform_input(feature_3d, deformation) # Bx32x16x64x64
65
+
66
+ bs, c, d, h, w = out.shape # Bx32x16x64x64
67
+ out = out.view(bs, c * d, h, w) # -> Bx512x64x64
68
+ out = self.third(out) # -> Bx256x64x64
69
+ out = self.fourth(out) # -> Bx256x64x64
70
+
71
+ if self.flag_use_occlusion_map and (occlusion_map is not None):
72
+ out = out * occlusion_map
73
+
74
+ # ret_dct = {
75
+ # 'occlusion_map': occlusion_map,
76
+ # 'deformation': deformation,
77
+ # 'out': out,
78
+ # }
79
+
80
+ # return ret_dct
81
+
82
+ return out
83
+
84
+ def load_model(self, ckpt_path):
85
+ self.load_state_dict(torch.load(ckpt_path, map_location=lambda storage, loc: storage))
86
+ self.eval()
87
+ return self
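
A shape sketch chaining the warping network with the SPADE decoder above (illustrative, randomly initialized weights; in practice trained checkpoints are restored via load_model()).

import torch
from core.models.modules.warping_network import WarpingNetwork
from core.models.modules.spade_generator import SPADEDecoder

warper = WarpingNetwork(num_kp=21, reshape_channel=32)
decoder = SPADEDecoder(upscale=2)

feature_3d = torch.randn(1, 32, 16, 64, 64)             # appearance feature volume f_s
kp_source = torch.randn(1, 21, 3)
kp_driving = torch.randn(1, 21, 3)

with torch.no_grad():
    warped = warper(feature_3d, kp_source, kp_driving)  # (1, 256, 64, 64)
    frame = decoder(warped)                             # (1, 3, 512, 512)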
core/models/motion_extractor.py ADDED
@@ -0,0 +1,49 @@
1
+ import numpy as np
2
+ import torch
3
+ from ..utils.load_model import load_model
4
+
5
+
6
+ class MotionExtractor:
7
+ def __init__(self, model_path, device="cuda"):
8
+ kwargs = {
9
+ "module_name": "MotionExtractor",
10
+ }
11
+ self.model, self.model_type = load_model(model_path, device=device, **kwargs)
12
+ self.device = device
13
+
14
+ self.output_names = [
15
+ "pitch",
16
+ "yaw",
17
+ "roll",
18
+ "t",
19
+ "exp",
20
+ "scale",
21
+ "kp",
22
+ ]
23
+
24
+ def __call__(self, image):
25
+ """
26
+ image: np.ndarray, shape (1, 3, 256, 256), RGB, 0-1
27
+ """
28
+ outputs = {}
29
+ if self.model_type == "onnx":
30
+ out_list = self.model.run(None, {"image": image})
31
+ for i, name in enumerate(self.output_names):
32
+ outputs[name] = out_list[i]
33
+ elif self.model_type == "tensorrt":
34
+ self.model.setup({"image": image})
35
+ self.model.infer()
36
+ for name in self.output_names:
37
+ outputs[name] = self.model.buffer[name][0].copy()
38
+ elif self.model_type == "pytorch":
39
+ with torch.no_grad(), torch.autocast(device_type=self.device[:4], dtype=torch.float16, enabled=True):
40
+ pred = self.model(torch.from_numpy(image).to(self.device))
41
+ for i, name in enumerate(self.output_names):
42
+ outputs[name] = pred[i].float().cpu().numpy()
43
+ else:
44
+ raise ValueError(f"Unsupported model type: {self.model_type}")
45
+ outputs["exp"] = outputs["exp"].reshape(1, -1)
46
+ outputs["kp"] = outputs["kp"].reshape(1, -1)
47
+ return outputs
48
+
49
+
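
A usage sketch for this runtime wrapper (not part of the commit; the checkpoint path is a placeholder): the same call works for the ONNX, TensorRT and PyTorch backends and takes/returns NumPy arrays.

import numpy as np
from core.models.motion_extractor import MotionExtractor

motion_extractor = MotionExtractor("./checkpoints/motion_extractor.onnx", device="cuda")  # placeholder path

image = np.random.rand(1, 3, 256, 256).astype(np.float32)   # RGB, values in [0, 1]
motion = motion_extractor(image)
# dict with keys pitch, yaw, roll, t, exp, scale, kp; exp and kp are flattened to (1, num_kp * 3)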
core/models/stitch_network.py ADDED
@@ -0,0 +1,30 @@
1
+ import numpy as np
2
+ import torch
3
+ from ..utils.load_model import load_model
4
+
5
+
6
+ class StitchNetwork:
7
+ def __init__(self, model_path, device="cuda"):
8
+ kwargs = {
9
+ "module_name": "StitchingNetwork",
10
+ }
11
+ self.model, self.model_type = load_model(model_path, device=device, **kwargs)
12
+ self.device = device
13
+
14
+ def __call__(self, kp_source, kp_driving):
15
+ if self.model_type == "onnx":
16
+ pred = self.model.run(None, {"kp_source": kp_source, "kp_driving": kp_driving})[0]
17
+ elif self.model_type == "tensorrt":
18
+ self.model.setup({"kp_source": kp_source, "kp_driving": kp_driving})
19
+ self.model.infer()
20
+ pred = self.model.buffer["out"][0].copy()
21
+ elif self.model_type == 'pytorch':
22
+ with torch.no_grad():
23
+ pred = self.model(
24
+ torch.from_numpy(kp_source).to(self.device),
25
+ torch.from_numpy(kp_driving).to(self.device)
26
+ ).cpu().numpy()
27
+ else:
28
+ raise ValueError(f"Unsupported model type: {self.model_type}")
29
+
30
+ return pred
core/models/warp_network.py ADDED
@@ -0,0 +1,35 @@
1
+ import numpy as np
2
+ import torch
3
+ from ..utils.load_model import load_model
4
+
5
+
6
+ class WarpNetwork:
7
+ def __init__(self, model_path, device="cuda"):
8
+ kwargs = {
9
+ "module_name": "WarpingNetwork",
10
+ }
11
+ self.model, self.model_type = load_model(model_path, device=device, **kwargs)
12
+ self.device = device
13
+
14
+ def __call__(self, feature_3d, kp_source, kp_driving):
15
+ """
16
+ feature_3d: np.ndarray, shape (1, 32, 16, 64, 64)
17
+ kp_source | kp_driving: np.ndarray, shape (1, 21, 3)
18
+ """
19
+ if self.model_type == "onnx":
20
+ pred = self.model.run(None, {"feature_3d": feature_3d, "kp_source": kp_source, "kp_driving": kp_driving})[0]
21
+ elif self.model_type == "tensorrt":
22
+ self.model.setup({"feature_3d": feature_3d, "kp_source": kp_source, "kp_driving": kp_driving})
23
+ self.model.infer()
24
+ pred = self.model.buffer["out"][0].copy()
25
+ elif self.model_type == 'pytorch':
26
+ with torch.no_grad(), torch.autocast(device_type=self.device[:4], dtype=torch.float16, enabled=True):
27
+ pred = self.model(
28
+ torch.from_numpy(feature_3d).to(self.device),
29
+ torch.from_numpy(kp_source).to(self.device),
30
+ torch.from_numpy(kp_driving).to(self.device)
31
+ ).float().cpu().numpy()
32
+ else:
33
+ raise ValueError(f"Unsupported model type: {self.model_type}")
34
+
35
+ return pred
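
A usage sketch for the warping wrapper (not part of the commit; paths are placeholders): like the other runtime wrappers it is backend-agnostic and operates on NumPy arrays, so it can be chained directly after the motion extractor and before the decoder model.

import numpy as np
from core.models.warp_network import WarpNetwork

warp_net = WarpNetwork("./checkpoints/warp_network.onnx", device="cuda")   # placeholder path

feature_3d = np.random.randn(1, 32, 16, 64, 64).astype(np.float32)
kp_source = np.random.randn(1, 21, 3).astype(np.float32)
kp_driving = np.random.randn(1, 21, 3).astype(np.float32)

warped = warp_net(feature_3d, kp_source, kp_driving)    # np.ndarray, fed to the decoder wrapper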
core/utils/blend/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ import pyximport
2
+ pyximport.install()
3
+
4
+ from .blend import blend_images_cy
core/utils/blend/blend.pyx ADDED
@@ -0,0 +1,38 @@
1
+ #cython: language_level=3
2
+ import numpy as np
3
+ cimport numpy as np
4
+
5
+ cdef extern from "blend_impl.h":
6
+ void _blend_images_cy_impl(
7
+ const float* mask_warped,
8
+ const float* frame_warped,
9
+ const unsigned char* frame_rgb,
10
+ const int height,
11
+ const int width,
12
+ unsigned char* result
13
+ ) noexcept nogil
14
+
15
+ def blend_images_cy(
16
+ np.ndarray[np.float32_t, ndim=2] mask_warped,
17
+ np.ndarray[np.float32_t, ndim=3] frame_warped,
18
+ np.ndarray[np.uint8_t, ndim=3] frame_rgb,
19
+ np.ndarray[np.uint8_t, ndim=3] result
20
+ ):
21
+ cdef int h = mask_warped.shape[0]
22
+ cdef int w = mask_warped.shape[1]
23
+
24
+ if not mask_warped.flags['C_CONTIGUOUS']:
25
+ mask_warped = np.ascontiguousarray(mask_warped)
26
+ if not frame_warped.flags['C_CONTIGUOUS']:
27
+ frame_warped = np.ascontiguousarray(frame_warped)
28
+ if not frame_rgb.flags['C_CONTIGUOUS']:
29
+ frame_rgb = np.ascontiguousarray(frame_rgb)
30
+
31
+ with nogil:
32
+ _blend_images_cy_impl(
33
+ <const float*>mask_warped.data,
34
+ <const float*>frame_warped.data,
35
+ <const unsigned char*>frame_rgb.data,
36
+ h, w,
37
+ <unsigned char*>result.data
38
+ )
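
A call sketch for the Cython blend kernel (illustrative): the C implementation in blend_impl.c is not shown in this view, but the declaration above fixes the expected shapes and dtypes. The first import compiles the extension on the fly via pyximport and blend.pyxbld, so blend_impl.c/.h must sit next to blend.pyx.

import numpy as np
from core.utils.blend import blend_images_cy

h, w = 512, 512
mask_warped = np.random.rand(h, w).astype(np.float32)              # per-pixel blend weights
frame_warped = (np.random.rand(h, w, 3) * 255).astype(np.float32)  # warped/generated frame
frame_rgb = np.random.randint(0, 256, (h, w, 3), dtype=np.uint8)   # original frame
result = np.empty_like(frame_rgb)

blend_images_cy(mask_warped, frame_warped, frame_rgb, result)      # blended frame written into `result`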
core/utils/blend/blend.pyxbld ADDED
@@ -0,0 +1,11 @@
1
+ import numpy as np
2
+ import os
3
+
4
+ def make_ext(modname, pyxfilename):
5
+ from distutils.extension import Extension
6
+
7
+ return Extension(name=modname,
8
+ sources=[pyxfilename, os.path.join(os.path.dirname(pyxfilename), "blend_impl.c")],
9
+ include_dirs=[np.get_include(), os.path.dirname(pyxfilename)],
10
+ extra_compile_args=["-O3", "-std=c99", "-march=native", "-ffast-math"],
11
+ )