From cfb9dd8a620abf8cf4cb5a488538f90e6d7a7116 Mon Sep 17 00:00:00 2001
From: AricGamma
Date: Wed, 19 Jun 2024 23:19:50 +0800
Subject: [PATCH 1/6] docs: Update head video in README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dff2e71..9c6b6c8 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@
 
 ## 📸 Showcase
 
-https://github.com/fudan-generative-vision/hallo/assets/17402682/294e78ef-c60d-4c32-8e3c-7f8d6934c6bd
+https://github.com/fudan-generative-vision/hallo/assets/17402682/9d1a0de4-3470-4d38-9e4f-412f517f834c
 
 ### 🎬 Honoring Classic Films

From ec80fdf3a86ff6c990aca7ab6018bf60dcf0e5d4 Mon Sep 17 00:00:00 2001
From: crystallee <120543703+crystallee-ai@users.noreply.github.com>
Date: Thu, 20 Jun 2024 11:59:01 +0800
Subject: [PATCH 2/6] docs: update Acknowledgements (#65)

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index dff2e71..9109778 100644
--- a/README.md
+++ b/README.md
@@ -282,6 +282,12 @@ Interested individuals are encouraged to contact us at [siyuzhu@fudan.edu.cn](ma
 
 The development of portrait image animation technologies driven by audio inputs poses social risks, such as the ethical implications of creating realistic portraits that could be misused for deepfakes. To mitigate these risks, it is crucial to establish ethical guidelines and responsible use practices. Privacy and consent concerns also arise from using individuals' images and voices. Addressing these involves transparent data usage policies, informed consent, and safeguarding privacy rights. By addressing these risks and implementing mitigations, the research aims to ensure the responsible and ethical development of this technology.
 
+## 🤗 Acknowledgements
+
+We would like to thank the contributors to the [magic-animate](https://github.com/magic-research/magic-animate), [AnimateDiff](https://github.com/guoyww/AnimateDiff), [ultimatevocalremovergui](https://github.com/Anjok07/ultimatevocalremovergui), [AniPortrait](https://github.com/Zejun-Yang/AniPortrait) and [Moore-AnimateAnyone](https://github.com/MooreThreads/Moore-AnimateAnyone) repositories for their open research and exploration.
+
+If we have missed any open-source projects or related articles, please let us know and we will add them to the acknowledgements promptly.
+
 ## 👏 Community Contributors
 
 Thank you to all the contributors who have helped to make this project better!

From 8464f720e6f00089af8f6893e557e1ed13eb171a Mon Sep 17 00:00:00 2001
From: "leeway.zlw"
Date: Thu, 20 Jun 2024 13:59:19 +0800
Subject: [PATCH 3/6] fix: use pytorch+cuda version. test on Ubuntu & Windows

---
 requirements.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 40eff18..36d7869 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+--find-links https://download.pytorch.org/whl/torch_stable.html
+
 accelerate==0.28.0
 audio-separator==0.17.2
 av==12.1.0
@@ -20,8 +22,8 @@ opencv-python-headless==4.9.0.80
 opencv-python==4.9.0.80
 pillow==10.3.0
 setuptools==70.0.0
-torch==2.2.2
-torchvision==0.17.2
+torch==2.2.2+cu121
+torchvision==0.17.2+cu121
 tqdm==4.66.4
 transformers==4.39.2
 xformers==0.0.25.post1

From 07ffd49627f20638cb0d98cc6ad6ca5e832e538f Mon Sep 17 00:00:00 2001
From: cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
Date: Thu, 20 Jun 2024 02:01:24 -0400
Subject: [PATCH 4/6] feat: gradio WebUI (#51)

* WebUI + Audio Fix

1.
audio fix: explicitly specify the audio codec in `util.py`, otherwise the video is technically corrupt and doesn't play sound 2. web ui: gradio web ui 3. print the current step while running inference gradio * lint * update --- hallo/utils/util.py | 2 +- requirements.txt | 3 +- scripts/app.py | 65 ++++++++++++++++++++++++++++++++++++++++++++ scripts/inference.py | 2 ++ 4 files changed, 70 insertions(+), 2 deletions(-) create mode 100644 scripts/app.py diff --git a/hallo/utils/util.py b/hallo/utils/util.py index 3a460f7..f4b6563 100644 --- a/hallo/utils/util.py +++ b/hallo/utils/util.py @@ -315,7 +315,7 @@ def make_frame(t): new_video_clip = VideoClip(make_frame, duration=tensor.shape[0] / fps) audio_clip = AudioFileClip(audio_source).subclip(0, tensor.shape[0] / fps) new_video_clip = new_video_clip.set_audio(audio_clip) - new_video_clip.write_videofile(output_video_file, fps=fps) + new_video_clip.write_videofile(output_video_file, fps=fps, audio_codec='aac') silhouette_ids = [ diff --git a/requirements.txt b/requirements.txt index 40eff18..7c3c5dc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -27,4 +27,5 @@ transformers==4.39.2 xformers==0.0.25.post1 isort==5.13.2 pylint==3.2.2 -pre-commit==3.7.1 \ No newline at end of file +pre-commit==3.7.1 +gradio==4.36.1 diff --git a/scripts/app.py b/scripts/app.py new file mode 100644 index 0000000..e106c02 --- /dev/null +++ b/scripts/app.py @@ -0,0 +1,65 @@ +""" +This script is a gradio web ui. + +The script takes an image and an audio clip, and lets you configure all the +variables such as cfg_scale, pose_weight, face_weight, lip_weight, etc. + +Usage: +This script can be run from the command line with the following command: + +python scripts/app.py +""" +import argparse + +import gradio as gr +from inference import inference_process + + +def predict(image, audio, size, steps, fps, cfg, pose_weight, face_weight, lip_weight, face_expand_ratio): + """ + Create a gradio interface with the configs. 
+ """ + config = { + 'data': { + 'source_image': { + 'width': size, + 'height': size + }, + 'export_video': { + 'fps': fps + } + }, + 'cfg_scale': cfg, + 'source_image': image, + 'driving_audio': audio, + 'pose_weight': pose_weight, + 'face_weight': face_weight, + 'lip_weight': lip_weight, + 'face_expand_ratio': face_expand_ratio, + 'config': 'configs/inference/default.yaml', + 'checkpoint': None, + 'output': ".cache/output.mp4", + 'inference_steps': steps + } + args = argparse.Namespace() + for key, value in config.items(): + setattr(args, key, value) + return inference_process(args) + +app = gr.Interface( + fn=predict, + inputs=[ + gr.Image(label="source image (no webp)", type="filepath", format="jpeg"), + gr.Audio(label="source audio", type="filepath"), + gr.Number(label="size", value=512, minimum=256, maximum=512, step=64, precision=0), + gr.Number(label="steps", value=40, minimum=1, step=1, precision=0), + gr.Number(label="fps", value=25, minimum=1, step=1, precision=0), + gr.Slider(label="CFG Scale", value=3.5, minimum=0, maximum=10, step=0.01), + gr.Number(label="pose weight", value=1.0), + gr.Number(label="face weight", value=1.0), + gr.Number(label="lip weight", value=1.0), + gr.Number(label="face expand ratio", value=1.2), + ], + outputs=[gr.Video()], +) +app.launch() diff --git a/scripts/inference.py b/scripts/inference.py index 8bbc5cc..c2ef0bb 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -288,6 +288,7 @@ def inference_process(args: argparse.Namespace): generator = torch.manual_seed(42) for t in range(times): + print(f"[{t+1}/{times}]") if len(tensor_result) == 0: # The first iteration @@ -342,6 +343,7 @@ def inference_process(args: argparse.Namespace): output_file = config.output # save the result after all iteration tensor_to_video(tensor_result, output_file, driving_audio_path) + return output_file if __name__ == "__main__": From 4f99075c78ee840b32c7101e45840673d28073f6 Mon Sep 17 00:00:00 2001 From: Ashley Kleynhans Date: Fri, 21 Jun 2024 03:52:39 +0200 Subject: [PATCH 5/6] docs: Added link to Docker image and RunPod template in README (#71) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6cd9e9e..e9735ad 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,8 @@ Explore the resources developed by our community to enhance your experience with - [hallo-webui](https://github.com/daswer123/hallo-webui) - Explore the WebUI created by [@daswer123](https://github.com/daswer123). - [hallo-for-windows](https://github.com/sdbds/hallo-for-windows) - Utilize Hallo on Windows with the guide by [@sdbds](https://github.com/sdbds). - [ComfyUI-Hallo](https://github.com/AIFSH/ComfyUI-Hallo) - Integrate Hallo with the ComfyUI tool by [@AIFSH](https://github.com/AIFSH). +- [hallo-docker](https://github.com/ashleykleynhans/hallo-docker) - Docker image for Hallo by [@ashleykleynhans](https://github.com/ashleykleynhans). +- [RunPod Template](https://runpod.io/console/deploy?template=aeyibwyvzy&ref=2xxro4syy) - Deploy Hallo to RunPod by [@ashleykleynhans](https://github.com/ashleykleynhans). Thanks to all of them. From d8e2ccc8333929f5ddbc544c04ba526e55d3560a Mon Sep 17 00:00:00 2001 From: AricGamma Date: Fri, 21 Jun 2024 13:08:47 +0800 Subject: [PATCH 6/6] fix: optimize gradio demo. 
remove unnecessary options (#76) Co-authored-by: leeway.zlw --- scripts/app.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/scripts/app.py b/scripts/app.py index e106c02..770ffcb 100644 --- a/scripts/app.py +++ b/scripts/app.py @@ -15,21 +15,12 @@ from inference import inference_process -def predict(image, audio, size, steps, fps, cfg, pose_weight, face_weight, lip_weight, face_expand_ratio): +def predict(image, audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)): """ Create a gradio interface with the configs. """ + _ = progress config = { - 'data': { - 'source_image': { - 'width': size, - 'height': size - }, - 'export_video': { - 'fps': fps - } - }, - 'cfg_scale': cfg, 'source_image': image, 'driving_audio': audio, 'pose_weight': pose_weight, @@ -38,8 +29,7 @@ def predict(image, audio, size, steps, fps, cfg, pose_weight, face_weight, lip_w 'face_expand_ratio': face_expand_ratio, 'config': 'configs/inference/default.yaml', 'checkpoint': None, - 'output': ".cache/output.mp4", - 'inference_steps': steps + 'output': ".cache/output.mp4" } args = argparse.Namespace() for key, value in config.items(): @@ -51,10 +41,6 @@ def predict(image, audio, size, steps, fps, cfg, pose_weight, face_weight, lip_w inputs=[ gr.Image(label="source image (no webp)", type="filepath", format="jpeg"), gr.Audio(label="source audio", type="filepath"), - gr.Number(label="size", value=512, minimum=256, maximum=512, step=64, precision=0), - gr.Number(label="steps", value=40, minimum=1, step=1, precision=0), - gr.Number(label="fps", value=25, minimum=1, step=1, precision=0), - gr.Slider(label="CFG Scale", value=3.5, minimum=0, maximum=10, step=0.01), gr.Number(label="pose weight", value=1.0), gr.Number(label="face weight", value=1.0), gr.Number(label="lip weight", value=1.0),
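Taken together, patch 4 and patch 6 leave `scripts/app.py` as a thin Gradio wrapper: the UI inputs are packed into an `argparse.Namespace` and handed straight to `inference_process`, which (after patch 4) returns the path of the rendered video. The sketch below shows the same call made without the web UI. It is a minimal illustration and not part of the patch series: the script name and the image/audio paths are placeholders, and it assumes the file lives in `scripts/` and is launched from the repository root (like `python scripts/app.py` in the docstring), so the `inference` import and the relative `configs/inference/default.yaml` path resolve the same way they do for the Gradio app.

```python
# run_inference_once.py -- hypothetical helper, not part of the patch series.
# Mirrors what the simplified predict() in scripts/app.py does after patch 6,
# minus the Gradio UI. Save in scripts/ and run from the repository root.
import argparse

from inference import inference_process  # same import scripts/app.py uses

config = {
    "source_image": "my_portrait.jpg",  # placeholder: your reference image
    "driving_audio": "my_speech.wav",   # placeholder: your driving audio clip
    "pose_weight": 1.0,
    "face_weight": 1.0,
    "lip_weight": 1.0,
    "face_expand_ratio": 1.2,
    "config": "configs/inference/default.yaml",
    "checkpoint": None,                 # same value app.py passes
    "output": ".cache/output.mp4",
}

# app.py builds an argparse.Namespace by hand; do the same here.
args = argparse.Namespace()
for key, value in config.items():
    setattr(args, key, value)

# After patch 4, inference_process returns the output path, and the video it
# writes carries an AAC audio track thanks to the util.py audio_codec fix.
print(inference_process(args))
```

The design choice in patch 6 is simply to stop exposing options such as size, steps, fps and CFG scale in the UI, presumably because the inference YAML config already covers them, so the wrapper (and this sketch) only passes the weights and the face expand ratio.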