import os

import gradio as gr
import numpy as np

from inference import tango

SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))

examples_audio = [
    ["./datasets/cached_audio/example_male_voice_9_seconds.wav"],
    # ["./datasets/cached_audio/example_female_voice_9_seconds.wav"],
]

examples_video = [
    # ["./datasets/cached_audio/speaker8_jjRWaMCWs44_00-00-30.16_00-00-33.32.mp4"],
    # ["./datasets/cached_audio/speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4"],
    ["./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4"],
    # ["./datasets/cached_audio/1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4"],
    # ["./datasets/cached_audio/101099-00_18_09-00_18_19.mp4"],
]

combined_examples = [
    [
        "./datasets/cached_audio/example_female_voice_9_seconds.wav",
        "./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4",
        2024,
    ],
]


def tango_wrapper(audio_path, character_name, seed=2024, create_graph=False, video_folder_path=None):
    """Normalize Gradio-style audio input, then run TANGO inference."""
    if isinstance(audio_path, tuple):
        # gr.Audio in numpy mode yields (sample_rate, waveform); anything that
        # is not already float32 is assumed to be 16-bit PCM and rescaled to
        # float32 in [-1, 1] before being handed to the pipeline.
        sample_rate, audio_waveform = audio_path
        if audio_waveform.dtype != np.float32:
            audio_waveform = audio_waveform.astype(np.float32) / 32768.0
        audio_path = (sample_rate, audio_waveform)
    return tango(audio_path, character_name, seed=seed, create_graph=create_graph, video_folder_path=video_folder_path)
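
# A minimal sketch of calling tango_wrapper directly, bypassing the UI. The
# sine-wave input is hypothetical test data (not part of this repository), and
# the call assumes `tango` accepts the same (sample_rate, waveform) tuple that
# the Gradio callback forwards. Left uncalled on purpose; invoke it manually
# if you want a quick smoke test.
def _example_direct_call():
    sr = 16000
    t = np.linspace(0.0, 2.0, 2 * sr, endpoint=False)
    # int16 PCM, as gr.Audio delivers it, to exercise the float32 rescaling
    wav_int16 = (0.5 * np.sin(2 * np.pi * 220.0 * t) * 32767).astype(np.int16)
    return tango_wrapper(
        (sr, wav_int16),
        "./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4",
        seed=2024,
    )
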

def make_demo():
    with gr.Blocks(analytics_enabled=False) as Interface:
        gr.Markdown(
            """
            # TANGO
            ### Generating full-body talking videos from audio and reference video

            Haiyang Liu, Xingchao Yang, Tomoya Akiyama, Yuantian Huang, Qiaoge Li, Shigeru Kuriyama, Takafumi Taketomi

            Project Page
            """
        )
        # Gallery of cached results: two rows of five pre-rendered demo videos
        with gr.Row():
            gr.Video(value="./datasets/cached_audio/demo1.mp4", label="Demo 0")
            gr.Video(value="./datasets/cached_audio/demo2.mp4", label="Demo 1")
            gr.Video(value="./datasets/cached_audio/demo3.mp4", label="Demo 2")
            gr.Video(value="./datasets/cached_audio/demo4.mp4", label="Demo 3")
            gr.Video(value="./datasets/cached_audio/demo5.mp4", label="Demo 4")
        with gr.Row():
            gr.Video(value="./datasets/cached_audio/demo6.mp4", label="Demo 5")
            gr.Video(value="./datasets/cached_audio/demo0.mp4", label="Demo 6")
            gr.Video(value="./datasets/cached_audio/demo7.mp4", label="Demo 7")
            gr.Video(value="./datasets/cached_audio/demo8.mp4", label="Demo 8")
            gr.Video(value="./datasets/cached_audio/demo9.mp4", label="Demo 9")
        with gr.Row():
            gr.Markdown(
                """
                This open-source project runs locally and operates in low-quality mode. Some results generated in high-quality mode are shown above.

                News:

                [10/15]: Added a watermark, fixed custom-character bugs by downgrading to Python 3.9, and fixed support for audio shorter than 4 s.
                """
            )
        with gr.Row():
            with gr.Column(scale=4):
                video_output_1 = gr.Video(
                    label="Generated video - 1",
                    interactive=False,
                    autoplay=False,
                    loop=False,
                    show_share_button=True,
                )
            with gr.Column(scale=4):
                video_output_2 = gr.Video(
                    label="Generated video - 2",
                    interactive=False,
                    autoplay=False,
                    loop=False,
                    show_share_button=True,
                )
            with gr.Column(scale=1):
                file_output_1 = gr.File(label="Download 3D Motion and Visualize in Blender")
                file_output_2 = gr.File(label="Download 3D Motion and Visualize in Blender")
        gr.Markdown(
            """
            Details of the low-quality mode:

            1. Lower resolution: the video is resized so that its long side is 512 pixels, keeping the aspect ratio.
            2. A subgraph is used instead of the full graph, which causes noticeable "frame jumps".
            3. Only the first 8 s of your input audio are used.
            4. Only the first 20 s of your input video are used for a custom character. With a custom character, a single video result is generated, without "smoothing", to save time.
            5. Open-source tools such as SMPLerX-s-model, Wav2Lip, and FiLM are used for faster processing.

            Feel free to open an issue on GitHub or contact the authors if this does not meet your needs.
            """
        )
        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(label="Upload your audio")
                seed_input = gr.Number(label="Seed", value=2024, interactive=True)
            with gr.Column(scale=2):
                gr.Examples(
                    examples=examples_audio,
                    inputs=[audio_input],
                    outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
                    label="Select existing Audio examples",
                    cache_examples=False,
                )
            with gr.Column(scale=1):
                video_input = gr.Video(
                    label="Default Character",
                    value="./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4",
                    interactive=False,
                    elem_classes="video",
                )
                gr.Markdown(
                    """
                    Custom character upload is not supported with Gradio 5.x (Python 3.10).
                    To use it, run the project locally in a Python 3.9 environment set up for SMPLerX and mmcv.
                    """
                )
            with gr.Column(scale=2):
                gr.Markdown(
                    """
                    The character is fixed to the default one on the left.
                    """
                )
        # Fourth row: the Generate Video button
        with gr.Row():
            run_button = gr.Button("Generate Video")
        # Wire the button to the inference wrapper
        run_button.click(
            fn=tango_wrapper,
            inputs=[audio_input, video_input, seed_input],
            outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
        )
        with gr.Row():
            with gr.Column(scale=4):
                gr.Examples(
                    examples=combined_examples,
                    inputs=[audio_input, video_input, seed_input],  # both audio and video as inputs
                    outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
                    fn=tango_wrapper,  # processes both audio and video inputs
                    label="Select Combined Audio and Video Examples (Cached)",
                    cache_examples=True,  # outputs are pre-computed and cached by Gradio
                )
    return Interface


if __name__ == "__main__":
    # Standard torch.distributed rendezvous variables, expected by the
    # inference pipeline.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "8675"
    demo = make_demo()
    demo.launch(share=True)
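
# A minimal sketch of the long-side-512 resize rule described in the UI text
# above. Illustrative only: the actual resizing happens inside the inference
# pipeline, not in this file, and `_long_side_512` is a hypothetical helper,
# not part of the TANGO API.
def _long_side_512(width, height):
    # Scale so that the longer side becomes 512 while the aspect ratio is
    # preserved, e.g. 1920x1080 -> 512x288.
    scale = 512 / max(width, height)
    return round(width * scale), round(height * scale)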