Commit

Merge branch 'fudan-generative-vision:main' into main
sdbds authored Jun 21, 2024
2 parents 8fecd8c + d8e2ccc commit b4d429d
Showing 5 changed files with 69 additions and 5 deletions.
10 changes: 9 additions & 1 deletion README.md
@@ -33,7 +33,7 @@
## 📸 Showcase


-https://github.com/fudan-generative-vision/hallo/assets/17402682/294e78ef-c60d-4c32-8e3c-7f8d6934c6bd
+https://github.com/fudan-generative-vision/hallo/assets/17402682/9d1a0de4-3470-4d38-9e4f-412f517f834c

### 🎬 Honoring Classic Films

@@ -75,6 +75,8 @@ Explore the resources developed by our community to enhance your experience with
- [hallo-webui](https://github.com/daswer123/hallo-webui) - Explore the WebUI created by [@daswer123](https://github.com/daswer123).
- [hallo-for-windows](https://github.com/sdbds/hallo-for-windows) - Utilize Hallo on Windows with the guide by [@sdbds](https://github.com/sdbds).
- [ComfyUI-Hallo](https://github.com/AIFSH/ComfyUI-Hallo) - Integrate Hallo with the ComfyUI tool by [@AIFSH](https://github.com/AIFSH).
+- [hallo-docker](https://github.com/ashleykleynhans/hallo-docker) - Docker image for Hallo by [@ashleykleynhans](https://github.com/ashleykleynhans).
+- [RunPod Template](https://runpod.io/console/deploy?template=aeyibwyvzy&ref=2xxro4syy) - Deploy Hallo to RunPod by [@ashleykleynhans](https://github.com/ashleykleynhans).

Thanks to all of them.

@@ -292,6 +294,12 @@ Interested individuals are encouraged to contact us at [[email protected]](ma

The development of portrait image animation technologies driven by audio inputs poses social risks, such as the ethical implications of creating realistic portraits that could be misused for deepfakes. To mitigate these risks, it is crucial to establish ethical guidelines and responsible use practices. Privacy and consent concerns also arise from using individuals' images and voices. Addressing these involves transparent data usage policies, informed consent, and safeguarding privacy rights. By addressing these risks and implementing mitigations, the research aims to ensure the responsible and ethical development of this technology.

+## 🤗 Acknowledgements
+
+We would like to thank the contributors to the [magic-animate](https://github.com/magic-research/magic-animate), [AnimateDiff](https://github.com/guoyww/AnimateDiff), [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui), [AniPortrait](https://github.com/Zejun-Yang/AniPortrait) and [Moore-AnimateAnyone](https://github.com/MooreThreads/Moore-AnimateAnyone) repositories for their open research and exploration.
+
+If we have missed any open-source projects or related articles, we will gladly update the acknowledgements to include them.
+
## 👏 Community Contributors

Thank you to all the contributors who have helped to make this project better!
2 changes: 1 addition & 1 deletion hallo/utils/util.py
@@ -315,7 +315,7 @@ def make_frame(t):
new_video_clip = VideoClip(make_frame, duration=tensor.shape[0] / fps)
audio_clip = AudioFileClip(audio_source).subclip(0, tensor.shape[0] / fps)
new_video_clip = new_video_clip.set_audio(audio_clip)
-new_video_clip.write_videofile(output_video_file, fps=fps)
+new_video_clip.write_videofile(output_video_file, fps=fps, audio_codec='aac')


silhouette_ids = [
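The one-line change above pins the audio stream to AAC. When `audio_codec` is omitted, MoviePy infers a codec from the output filename, and an MP4 whose audio track is not AAC can play back without sound in some common players. A minimal sketch of the resulting pattern, assuming MoviePy 1.x and illustrative file names:

```python
# Sketch only: "frames.mp4" and "voice.wav" stand in for the real inputs.
from moviepy.editor import AudioFileClip, VideoFileClip

video = VideoFileClip("frames.mp4")
audio = AudioFileClip("voice.wav").subclip(0, video.duration)

# Forcing AAC keeps the .mp4 playable in players that reject other
# audio codecs inside an MP4 container.
video.set_audio(audio).write_videofile("out.mp4", fps=25, audio_codec="aac")
```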
9 changes: 6 additions & 3 deletions requirements.txt
@@ -1,3 +1,5 @@
+--find-links https://download.pytorch.org/whl/torch_stable.html
+
accelerate==0.28.0
audio-separator==0.17.2
av==12.1.0
@@ -20,11 +22,12 @@ opencv-python-headless==4.9.0.80
opencv-python==4.9.0.80
pillow==10.3.0
setuptools==70.0.0
-torch==2.2.2
-torchvision==0.17.2
+torch==2.2.2+cu121
+torchvision==0.17.2+cu121
tqdm==4.66.4
transformers==4.39.2
xformers==0.0.25.post1
isort==5.13.2
pylint==3.2.2
-pre-commit==3.7.1
+pre-commit==3.7.1
+gradio==4.36.1
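The `--find-links` line added at the top is what lets the `+cu121` local-version pins resolve: pip fetches those CUDA 12.1 wheels from PyTorch's wheel index rather than PyPI, which only hosts the default builds. After `pip install -r requirements.txt` succeeds, a quick sanity check (a sketch, assuming a CUDA-capable machine) confirms the GPU build is the one in use:

```python
# Verify the CUDA wheels were installed rather than CPU-only fallbacks.
import torch

print(torch.__version__)          # expected to end with "+cu121"
print(torch.version.cuda)         # expected: "12.1"
print(torch.cuda.is_available())  # True when the NVIDIA driver is working
```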
51 changes: 51 additions & 0 deletions scripts/app.py
@@ -0,0 +1,51 @@
"""
This script is a gradio web ui.
The script takes an image and an audio clip, and lets you configure all the
variables such as cfg_scale, pose_weight, face_weight, lip_weight, etc.
Usage:
This script can be run from the command line with the following command:
python scripts/app.py
"""
import argparse

import gradio as gr
from inference import inference_process


def predict(image, audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
"""
Run inference with the settings collected from the UI and return the output video path.
"""
_ = progress
config = {
'source_image': image,
'driving_audio': audio,
'pose_weight': pose_weight,
'face_weight': face_weight,
'lip_weight': lip_weight,
'face_expand_ratio': face_expand_ratio,
'config': 'configs/inference/default.yaml',
'checkpoint': None,
'output': ".cache/output.mp4"
}
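# Mirror the CLI entry point: expose the dict entries as attributes so
# inference_process() can consume them like parsed command-line args.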
args = argparse.Namespace()
for key, value in config.items():
setattr(args, key, value)
return inference_process(args)

app = gr.Interface(
fn=predict,
inputs=[
gr.Image(label="source image (no webp)", type="filepath", format="jpeg"),
gr.Audio(label="source audio", type="filepath"),
gr.Number(label="pose weight", value=1.0),
gr.Number(label="face weight", value=1.0),
gr.Number(label="lip weight", value=1.0),
gr.Number(label="face expand ratio", value=1.2),
],
outputs=[gr.Video()],
)
app.launch()
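A side note on the dict-to-`Namespace` loop in `predict`: `argparse.Namespace` also accepts keyword arguments directly, so the same wiring can be written in one line. A small sketch of the equivalent pattern, with illustrative values:

```python
import argparse

config = {"pose_weight": 1.0, "face_weight": 1.0, "lip_weight": 1.0}
args = argparse.Namespace(**config)  # then: args.pose_weight == 1.0
```

By default `app.launch()` serves the interface on http://127.0.0.1:7860, so running `python scripts/app.py` and opening that address is enough to try it.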
2 changes: 2 additions & 0 deletions scripts/inference.py
@@ -288,6 +288,7 @@ def inference_process(args: argparse.Namespace):
generator = torch.manual_seed(42)

for t in range(times):
+print(f"[{t+1}/{times}]")

if len(tensor_result) == 0:
# The first iteration
@@ -342,6 +343,7 @@
output_file = config.output
# save the result after all iteration
tensor_to_video(tensor_result, output_file, driving_audio_path)
+return output_file


if __name__ == "__main__":
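Both additions here serve the new web UI: the `print` call reports per-segment progress in the terminal (the in-browser bar comes from `gr.Progress(track_tqdm=True)`), and returning `output_file` is what lets `predict` in `scripts/app.py` hand the finished video to its `gr.Video` output, since Gradio renders a returned filepath directly. A stripped-down sketch of that contract, with a stand-in function:

```python
import gradio as gr

def fake_predict(image, audio):
    # A real run would call inference_process(); returning the output
    # path is all gr.Video needs in order to display the result.
    return ".cache/output.mp4"

demo = gr.Interface(
    fn=fake_predict,
    inputs=[gr.Image(type="filepath"), gr.Audio(type="filepath")],
    outputs=gr.Video(),
)
```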
