Spaces:

alvinichi
/

imageToVideo

Running

App Files Community

alvinichi committed on Apr 17

Commit

1a447e6

1 Parent(s): 7a842d4

update lib

Browse files

Files changed (3) hide show

README.md +15 -18
app.py +136 -136
requirements.txt +8 -5

README.md CHANGED Viewed

@@ -1,30 +1,27 @@
 ---
-title: Animate Person From Image
-emoji: 🎭
-colorFrom: pink
-colorTo: purple
 sdk: gradio
 sdk_version: 4.0.2
 app_file: app.py
 pinned: false
 ---
-# Ứng dụng tạo video người chuyển động từ ảnh
-Ứng dụng này sử dụng AI để tạo video người chuyển động từ một ảnh tĩnh.
 ## Cách sử dụng
-1. Tải lên ảnh chứa người
-2. Nhập mô tả cho kiểu chuyển động mong muốn
-3. Điều chỉnh các tham số (tùy chọn)
-4. Nhấn "Tạo video"
-## Các tham số
-- **Mô tả chuyển động**: Mô tả bằng lời cách người sẽ chuyển động
-- **Mức độ chuyển động**: Điều chỉnh cường độ chuyển động (1-255)
-- **FPS**: Số khung hình mỗi giây của video kết quả
-## Mẹo sử dụng
-- Ảnh nên có nền đơn giản để có kết quả tốt nhất
-- Người trong ảnh nên ở tư thế tự nhiên, không quá phức tạp
-- Thử nghiệm với các mô tả khác nhau để có hiệu quả tốt nhất

 ---
+title: First Order Motion Model Animation
+emoji: 🎬
+colorFrom: blue
+colorTo: indigo
 sdk: gradio
 sdk_version: 4.0.2
 app_file: app.py
 pinned: false
 ---
+# First Order Motion Model
+Ứng dụng này sử dụng First Order Motion Model để tạo video người chuyển động từ một ảnh tĩnh.
 ## Cách sử dụng
+1. Tải lên ảnh nguồn chứa người/đối tượng bạn muốn làm chuyển động
+2. Tải lên video tham chiếu có chuyển động bạn muốn áp dụng
+3. Điều chỉnh các thiết lập (không bắt buộc)
+4. Nhấn "Tạo video" và chờ kết quả
+## Mô hình được sử dụng
+First Order Motion Model (FOMM) là một mô hình deep learning cho phép tạo chuyển động cho một đối tượng trong ảnh tĩnh dựa trên chuyển động từ video tham chiếu.
+## Paper & Code
+- Paper: [First Order Motion Model for Image Animation](https://arxiv.org/abs/2003.00196)
+- Code gốc: [AliaksandrSiarohin/first-order-model](https://github.com/AliaksandrSiarohin/first-order-model)

app.py CHANGED Viewed

@@ -1,156 +1,151 @@
 import gradio as gr
-import torch
 import numpy as np
 import imageio
-import os
-from PIL import Image
-import cv2
-# Hàm tách đối tượng khỏi nền
-def segment_person(image):
-    # Trong thực tế, bạn sẽ sử dụng một mô hình phân đoạn như U2Net
-    # Đây là một phiên bản đơn giản sử dụng phân ngưỡng màu
-    # Chuyển sang không gian màu HSV
-    image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2HSV)
-    # Tạo mặt nạ đơn giản (trong thực tế cần mô hình phân đoạn thật)
-    # Giả sử nền sáng hơn đối tượng
-    _, mask = cv2.threshold(image_cv[:, :, 2], 127, 255, cv2.THRESH_BINARY_INV)
-    # Xử lý mặt nạ
-    kernel = np.ones((5, 5), np.uint8)
-    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
-    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
-    return mask
-# Tạo video người chuyển động
-def animate_person(image, movement_type, num_frames=24):
-    if image is None:
-        return None, "Vui lòng tải lên một hình ảnh."
     try:
-        # Đảm bảo hình ảnh là định dạng RGB
-        if image.mode != "RGB":
-            image = image.convert("RGB")
-        # Thay đổi kích thước hình ảnh
-        image = image.resize((512, 512))
-        image_array = np.array(image)
-        # Tách người từ nền
-        mask = segment_person(image)
-        # Tạo ảnh nền và ảnh người
-        background = image_array.copy()
-        person = image_array.copy()
-        # Áp dụng mặt nạ
-        person_mask = np.stack([mask, mask, mask], axis=2) / 255.0
-        person = person * person_mask
-        background = background * (1 - person_mask)
-        # Tạo frames dựa vào loại chuyển động
-        frames = []
-        if movement_type == "Đi bộ":
-            # Mô phỏng đi bộ - di chuyển lên xuống và sang ngang
-            for i in range(num_frames):
-                y_offset = int(np.sin(i/8 * 2 * np.pi) * 10)
-                x_offset = i % 4 - 2  # Nhịp bước nhỏ
-                # Tạo frame mới với nền tĩnh
-                frame = background.copy()
-                # Thêm người với offset
-                M = np.float32([[1, 0, x_offset], [0, 1, y_offset]])
-                moved_person = cv2.warpAffine(person, M, (512, 512))
-                # Kết hợp nền và người
-                frame = frame + moved_person
-                frames.append(frame.astype(np.uint8))
-        elif movement_type == "Vẫy tay":
-            # Mô phỏng vẫy tay - xoay nhẹ phần trên
-            for i in range(num_frames):
-                angle = np.sin(i/6 * 2 * np.pi) * 5  # Xoay ±5 độ
-                # Tạo ma trận xoay
-                center = (256, 200)  # Giả sử tâm xoay ở phần trên của người
-                M = cv2.getRotationMatrix2D(center, angle, 1.0)
-                # Xoay người
-                rotated_person = cv2.warpAffine(person, M, (512, 512))
-                # Kết hợp nền và người đã xoay
-                frame = background.copy() + rotated_person
-                frames.append(frame.astype(np.uint8))
-        elif movement_type == "Nhảy múa":
-            # Mô phỏng nhảy múa - kết hợp chuyển động
-            for i in range(num_frames):
-                y_offset = int(np.sin(i/6 * 2 * np.pi) * 15)
-                x_offset = int(np.sin(i/4 * 2 * np.pi) * 10)
-                angle = np.sin(i/8 * 2 * np.pi) * 3
-                # Xoay người
-                center = (256, 256)
-                M_rot = cv2.getRotationMatrix2D(center, angle, 1.0)
-                rotated_person = cv2.warpAffine(person, M_rot, (512, 512))
-                # Di chuyển người đã xoay
-                M_trans = np.float32([[1, 0, x_offset], [0, 1, y_offset]])
-                moved_person = cv2.warpAffine(rotated_person, M_trans, (512, 512))
-                # Kết hợp nền và người đã di chuyển
-                frame = background.copy() + moved_person
-                frames.append(frame.astype(np.uint8))
-        else:  # Chuyển động nhẹ
-            for i in range(num_frames):
-                angle = np.sin(i/12 * 2 * np.pi) * 2
-                y_offset = int(np.sin(i/10 * 2 * np.pi) * 5)
-                # Xoay người
-                center = (256, 256)
-                M_rot = cv2.getRotationMatrix2D(center, angle, 1.0)
-                rotated_person = cv2.warpAffine(person, M_rot, (512, 512))
-                # Di chuyển người đã xoay
-                M_trans = np.float32([[1, 0, 0], [0, 1, y_offset]])
-                moved_person = cv2.warpAffine(rotated_person, M_trans, (512, 512))
-                # Kết hợp nền và người đã di chuyển
-                frame = background.copy() + moved_person
-                frames.append(frame.astype(np.uint8))
-        # Lưu video
-        output_path = "animated_person.mp4"
-        imageio.mimsave(output_path, frames, fps=8)
-        return output_path, "Video được tạo thành công!"
     except Exception as e:
         return None, f"Lỗi: {str(e)}"
 # Tạo giao diện Gradio
-with gr.Blocks(title="Ứng dụng tạo chuyển động cho người trong ảnh") as demo:
-    gr.Markdown("# Tạo video người chuyển động từ ảnh")
-    gr.Markdown("Tạo video trong đó chỉ người trong ảnh chuyển động, nền vẫn giữ nguyên")
     with gr.Row():
         with gr.Column():
-            image_input = gr.Image(type="pil", label="Tải lên ảnh người")
-            movement_type = gr.Radio(
-                ["Đi bộ", "Vẫy tay", "Nhảy múa", "Chuyển động nhẹ"],
-                label="Loại chuyển động",
-                value="Chuyển động nhẹ"
-            )
-            num_frames = gr.Slider(
-                minimum=12, maximum=36, value=24, step=4,
-                label="Số khung hình"
-            )
             submit_btn = gr.Button("Tạo video")
         with gr.Column():
@@ -158,14 +153,19 @@ with gr.Blocks(title="Ứng dụng tạo chuyển động cho người trong ả
             output_message = gr.Textbox(label="Thông báo")
     submit_btn.click(
-        fn=animate_person,
-        inputs=[image_input, movement_type, num_frames],
         outputs=[output_video, output_message]
     )
     gr.Markdown("### Lưu ý")
-    gr.Markdown("- Sử dụng ảnh có người trên nền đơn giản để có kết quả tốt nhất")
-    gr.Markdown("- Phương pháp này tách người và nền, chỉ làm chuyển động người")
-    gr.Markdown("- Đây là phiên bản đơn giản, kết quả thực tế sẽ phụ thuộc vào chất lượng hình ảnh")
 demo.launch()

 import gradio as gr
+import os
 import numpy as np
+import torch
 import imageio
+import subprocess
+from skimage.transform import resize
+from skimage import img_as_ubyte
+# Clone repo nếu chưa có
+if not os.path.exists('first-order-model'):
+    subprocess.call(['git', 'clone', 'https://github.com/AliaksandrSiarohin/first-order-model.git'])
+    os.rename('first-order-model', 'first_order_model')
+# Thêm đường dẫn vào PYTHONPATH
+import sys
+sys.path.append('first_order_model')
+# Import các module cần thiết từ repo
+from first_order_model.demo import load_checkpoints
+from first_order_model.animate import normalize_kp
+# Tải mô hình pre-trained
+def download_model():
+    model_path = 'checkpoints/vox-cpk.pth.tar'
+    if not os.path.exists(model_path):
+        os.makedirs('checkpoints', exist_ok=True)
+        subprocess.call([
+            'wget', 'https://drive.google.com/uc?export=download&id=1PyQJmkdCsAkOYwUyaj_l-l0as-iLDgeH',
+            '-O', model_path
+        ])
+    config_path = 'first_order_model/config/vox-256.yaml'
+    if not os.path.exists(config_path):
+        os.makedirs('first_order_model/config', exist_ok=True)
+        subprocess.call([
+            'wget', 'https://drive.google.com/uc?export=download&id=1pZUMNRjkBiuBEM68oj9nskuWgJR-5QMn',
+            '-O', config_path
+        ])
+    return config_path, model_path
+# Hàm tạo animation
+def make_animation(source_image, driving_video, relative=True, adapt_movement_scale=True):
+    config_path, checkpoint_path = download_model()
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    # Tải mô hình và cấu hình
+    generator, kp_detector = load_checkpoints(config_path, checkpoint_path, device=device)
+    # Đọc source_image và driving_video
+    source = imageio.imread(source_image)
+    reader = imageio.get_reader(driving_video)
+    fps = reader.get_meta_data()['fps']
+    driving = []
+    try:
+        for im in reader:
+            driving.append(im)
+    except RuntimeError:
+        pass
+    reader.close()
+    # Tiền xử lý
+    source = resize(source, (256, 256))[..., :3]
+    driving = [resize(frame, (256, 256))[..., :3] for frame in driving]
+    # Chuyển đổi thành tensor
+    source = torch.tensor(source[np.newaxis].astype(np.float32)).permute(0, 3, 1, 2).to(device)
+    driving = torch.tensor(np.array(driving).astype(np.float32)).permute(0, 3, 1, 2).to(device)
+    # Trích xuất keypoints
+    kp_source = kp_detector(source)
+    kp_driving_initial = kp_detector(driving[0:1])
+    # Tạo animation
+    with torch.no_grad():
+        predictions = []
+        for frame_idx in range(driving.shape[0]):
+            driving_frame = driving[frame_idx:frame_idx+1]
+            kp_driving = kp_detector(driving_frame)
+            # Chuẩn hóa keypoints
+            if relative:
+                kp_norm = normalize_kp(
+                    kp_source=kp_source,
+                    kp_driving=kp_driving,
+                    kp_driving_initial=kp_driving_initial,
+                    use_relative_movement=relative,
+                    use_relative_jacobian=relative,
+                    adapt_movement_scale=adapt_movement_scale
+                )
+            else:
+                kp_norm = kp_driving
+            # Tạo frame
+            out = generator(source, kp_source=kp_source, kp_driving=kp_norm)
+            predictions.append(np.transpose(out['prediction'].data.cpu().numpy(), [0, 2, 3, 1])[0])
+    # Lưu video kết quả
+    output_path = 'result.mp4'
+    imageio.mimsave(output_path, [img_as_ubyte(frame) for frame in predictions], fps=fps)
+    return output_path
+# Định nghĩa giao diện Gradio
+def animate_fomm(source_image, driving_video, relative=True, adapt_scale=True):
+    if source_image is None or driving_video is None:
+        return None, "Vui lòng tải lên cả ảnh nguồn và video tham chiếu."
     try:
+        # Lưu tạm ảnh và video tải lên
+        source_path = "source_image.jpg"
+        driving_path = "driving_video.mp4"
+        # Lưu ảnh nguồn
+        source_image.save(source_path)
+        # Lưu video tham chiếu
+        with open(driving_path, 'wb') as f:
+            f.write(driving_video)
+        # Tạo animation
+        result_path = make_animation(
+            source_path,
+            driving_path,
+            relative=relative,
+            adapt_movement_scale=adapt_scale
+        )
+        return result_path, "Video được tạo thành công!"
     except Exception as e:
         return None, f"Lỗi: {str(e)}"
 # Tạo giao diện Gradio
+with gr.Blocks(title="First Order Motion Model - Tạo video người chuyển động") as demo:
+    gr.Markdown("# First Order Motion Model")
+    gr.Markdown("Tạo video người chuyển động từ một ảnh tĩnh và video tham chiếu")
     with gr.Row():
         with gr.Column():
+            source_image = gr.Image(type="pil", label="Tải lên ảnh nguồn")
+            driving_video = gr.Video(label="Tải lên video tham chiếu")
+            with gr.Row():
+                relative = gr.Checkbox(value=True, label="Chuyển động tương đối")
+                adapt_scale = gr.Checkbox(value=True, label="Điều chỉnh tỷ lệ chuyển động")
             submit_btn = gr.Button("Tạo video")
         with gr.Column():
             output_message = gr.Textbox(label="Thông báo")
     submit_btn.click(
+        fn=animate_fomm,
+        inputs=[source_image, driving_video, relative, adapt_scale],
         outputs=[output_video, output_message]
     )
+    gr.Markdown("### Cách sử dụng")
+    gr.Markdown("1. Tải lên **ảnh nguồn** - ảnh chứa người/đối tượng bạn muốn làm chuyển động")
+    gr.Markdown("2. Tải lên **video tham chiếu** - video có chuyển động bạn muốn áp dụng")
+    gr.Markdown("3. Nhấn **Tạo video** và chờ kết quả")
     gr.Markdown("### Lưu ý")
+    gr.Markdown("- Ảnh nguồn và video tham chiếu nên có đối tượng tương tự (người với người, mặt với mặt)")
+    gr.Markdown("- Đối tượng nên ở vị trí tương tự trong cả ảnh nguồn và khung đầu tiên của video tham chiếu")
+    gr.Markdown("- Quá trình tạo video có thể mất vài phút")
 demo.launch()

requirements.txt CHANGED Viewed

@@ -1,7 +1,10 @@
 gradio==4.0.2
-torch
 numpy
-Pillow
-imageio==2.31.1
-imageio-ffmpeg
-opencv-python

 gradio==4.0.2
+torch==1.13.1
+torchvision==0.14.1
 numpy
+imageio==2.9.0
+imageio-ffmpeg==0.4.5
+scikit-image==0.19.3
+matplotlib
+PyYAML==5.3.1
+face-alignment==1.3.5