import os

import gradio as gr
import numpy as np

from inference import tango

SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
examples_audio = [
["./datasets/cached_audio/example_male_voice_9_seconds.wav"],
# ["./datasets/cached_audio/example_female_voice_9_seconds.wav"],
]
examples_video = [
# ["./datasets/cached_audio/speaker8_jjRWaMCWs44_00-00-30.16_00-00-33.32.mp4"],
# ["./datasets/cached_audio/speaker7_iuYlGRnC7J8_00-00-0.00_00-00-3.25.mp4"],
["./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4"],
# ["./datasets/cached_audio/1wrQ6Msp7wM_00-00-39.69_00-00-45.68.mp4"],
# ["./datasets/cached_audio/101099-00_18_09-00_18_19.mp4"],
]
combined_examples = [
["./datasets/cached_audio/example_female_voice_9_seconds.wav", "./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4", 2024],
]
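# Each combined-example row mirrors the Examples inputs wired up below:
# [audio path, reference-video path, seed].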
def tango_wrapper(audio_path, character_name, seed=2024, create_graph=False, video_folder_path=None):
    # gr.Audio may deliver a (sample_rate, waveform) tuple instead of a file
    # path; convert int16 PCM to float32 in [-1.0, 1.0] before inference.
    if isinstance(audio_path, tuple):
        sample_rate, audio_waveform = audio_path
        if audio_waveform.dtype != np.float32:
            audio_waveform = audio_waveform.astype(np.float32) / 32768.0
        audio_path = (sample_rate, audio_waveform)
    return tango(audio_path, character_name, seed=seed, create_graph=create_graph, video_folder_path=video_folder_path)
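# A minimal sketch of the normalization above in isolation, assuming Gradio's
# default numpy audio format of a (sample_rate, int16 ndarray) tuple:
#
#     sr, wav = 16000, np.array([0, 16384, -32768], dtype=np.int16)
#     wav = wav.astype(np.float32) / 32768.0  # -> [0.0, 0.5, -1.0]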
def make_demo():
with gr.Blocks(analytics_enabled=False) as Interface:
gr.Markdown(
"""
# TANGO
Generating full-body talking videos from audio and a reference video.
"""
)
# Two rows of cached demo videos, five per row
with gr.Row():
gr.Video(value="./datasets/cached_audio/demo1.mp4", label="Demo 0")
gr.Video(value="./datasets/cached_audio/demo2.mp4", label="Demo 1")
gr.Video(value="./datasets/cached_audio/demo3.mp4", label="Demo 2")
gr.Video(value="./datasets/cached_audio/demo4.mp4", label="Demo 3")
gr.Video(value="./datasets/cached_audio/demo5.mp4", label="Demo 4")
with gr.Row():
gr.Video(value="./datasets/cached_audio/demo6.mp4", label="Demo 5")
gr.Video(value="./datasets/cached_audio/demo0.mp4", label="Demo 6")
gr.Video(value="./datasets/cached_audio/demo7.mp4", label="Demo 7")
gr.Video(value="./datasets/cached_audio/demo8.mp4", label="Demo 8")
gr.Video(value="./datasets/cached_audio/demo9.mp4", label="Demo 9")
with gr.Row():
gr.Markdown(
"""
This open-source project runs locally and operates in low-quality mode; the demo videos above were generated in high-quality mode.

**News**: [10/15] Added a watermark, fixed custom-character bugs by downgrading to Python 3.9, and fixed support for audio shorter than 4 s.
"""
)
with gr.Row():
with gr.Column(scale=4):
video_output_1 = gr.Video(
label="Generated video - 1",
interactive=False,
autoplay=False,
loop=False,
show_share_button=True,
)
with gr.Column(scale=4):
video_output_2 = gr.Video(
label="Generated video - 2",
interactive=False,
autoplay=False,
loop=False,
show_share_button=True,
)
with gr.Column(scale=1):
file_output_1 = gr.File(label="Download 3D Motion and Visualize in Blender")
file_output_2 = gr.File(label="Download 3D Motion and Visualize in Blender")
gr.Markdown("""
Details of the low-quality mode:

1. Lower resolution: the video is resized so that its long side is 512 px, preserving the aspect ratio (see the sketch after this block).
2. A subgraph is used instead of the full graph, which causes noticeable "frame jumps".
3. Only the first 8 s of the input audio are used.
4. Only the first 20 s of the input video are used for a custom character, and a custom character produces a single video result without "smoothing", to save time.
5. Open-source tools such as SMPLer-X (s-model), Wav2Lip, and FiLM are used for faster processing.

Feel free to open an issue on GitHub or contact the authors if this does not meet your needs.
""")
with gr.Row():
with gr.Column(scale=1):
audio_input = gr.Audio(label="Upload your audio")
seed_input = gr.Number(label="Seed", value=2024, interactive=True)
with gr.Column(scale=2):
gr.Examples(
examples=examples_audio,
inputs=[audio_input],
outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
label="Select existing Audio examples",
cache_examples=False,
)
with gr.Column(scale=1):
video_input = gr.Video(label="Default Character", value="./datasets/cached_audio/speaker9_o7Ik1OB4TaE_00-00-38.15_00-00-42.33.mp4", interactive=False, elem_classes="video")
gr.Markdown(
"""
Custom character upload is not supported under Gradio 5.x (Python 3.10).
To use it, run the repo locally with a Python 3.9 environment set up for SMPLer-X and mmcv.
"""
)
with gr.Column(scale=2):
gr.Markdown(
"""
The character is fixed to the default one on the left.
"""
)
# Generate-video button
with gr.Row():
run_button = gr.Button("Generate Video")
# Define button click behavior
run_button.click(
fn=tango_wrapper,
inputs=[audio_input, video_input, seed_input],
outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
)
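# The click handler is expected to return four values matching the outputs
# above: two rendered videos and two downloadable motion files (an assumption
# inferred from the component wiring; video_input supplies the reference
# character clip).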
with gr.Row():
with gr.Column(scale=4):
gr.Examples(
examples=combined_examples,
inputs=[audio_input, video_input, seed_input], # Both audio and video as inputs
outputs=[video_output_1, video_output_2, file_output_1, file_output_2],
fn=tango_wrapper, # Function that processes both audio and video inputs
label="Select Combined Audio and Video Examples (Cached)",
cache_examples=True,
)
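# With cache_examples=True, Gradio precomputes and stores the outputs for
# these example rows, so selecting one replays the cached result instead of
# rerunning tango_wrapper.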
return Interface
if __name__ == "__main__":
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "8675"
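# MASTER_ADDR / MASTER_PORT are the standard torch.distributed rendezvous
# variables; presumably the inference code initializes a single-process
# group on localhost (an assumption -- this file never touches torch
# directly).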
demo = make_demo()
demo.launch(share=True)  # share=True also serves a temporary public *.gradio.live link