feat: gradio WebUI (fudan-generative-vision#51)
* WebUI + Audio Fix

1. audio fix: explicitly specify the audio codec in `util.py`; without it the written video is technically corrupt and plays no sound (see the ffprobe check under the `util.py` diff below)
2. web ui: add a gradio web UI (`scripts/app.py`)
3. print the current step while running inference

* lint

* update
cocktailpeanut authored Jun 20, 2024
1 parent ec80fdf commit 07ffd49
Showing 4 changed files with 70 additions and 2 deletions.
hallo/utils/util.py (1 addition, 1 deletion)
@@ -315,7 +315,7 @@ def make_frame(t):
     new_video_clip = VideoClip(make_frame, duration=tensor.shape[0] / fps)
     audio_clip = AudioFileClip(audio_source).subclip(0, tensor.shape[0] / fps)
     new_video_clip = new_video_clip.set_audio(audio_clip)
-    new_video_clip.write_videofile(output_video_file, fps=fps)
+    new_video_clip.write_videofile(output_video_file, fps=fps, audio_codec='aac')
 
 
 silhouette_ids = [
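For context on the fix, here is a minimal standalone sketch (synthetic clip, not from this repo). In moviepy 1.x, leaving audio_codec unset for an .mp4 falls back to MP3 audio, which several players silently drop; pinning 'aac' matches what the container expects.

import numpy as np
from moviepy.editor import AudioClip, VideoClip

fps, duration = 25, 2.0

def make_frame(t):
    # Gray frame that brightens over the clip, 256x256 RGB.
    return np.full((256, 256, 3), int(255 * t / duration), dtype=np.uint8)

# A 440 Hz sine tone stands in for the driving audio.
audio = AudioClip(lambda t: np.sin(2 * np.pi * 440 * t), duration=duration, fps=44100)
video = VideoClip(make_frame, duration=duration).set_audio(audio)
video.write_videofile("demo.mp4", fps=fps, audio_codec="aac")  # the fixed call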
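And a quick sanity check (my suggestion, not part of the commit): ask ffprobe for the codec of the audio stream in the written file. The path assumes the default output used by scripts/app.py, and ffprobe must be on PATH.

import subprocess

def audio_codec(path: str) -> str:
    # Returns only the codec name of the first audio stream, e.g. "aac".
    result = subprocess.run(
        ["ffprobe", "-v", "error", "-select_streams", "a:0",
         "-show_entries", "stream=codec_name",
         "-of", "default=noprint_wrappers=1:nokey=1", path],
        capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()

print(audio_codec(".cache/output.mp4"))  # expect "aac" after this fix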
requirements.txt (2 additions, 1 deletion)
@@ -27,4 +27,5 @@ transformers==4.39.2
 xformers==0.0.25.post1
 isort==5.13.2
 pylint==3.2.2
-pre-commit==3.7.1
+pre-commit==3.7.1
+gradio==4.36.1
scripts/app.py (65 additions, 0 deletions)
@@ -0,0 +1,65 @@
"""
This script is a gradio web ui.
The script takes an image and an audio clip, and lets you configure all the
variables such as cfg_scale, pose_weight, face_weight, lip_weight, etc.
Usage:
This script can be run from the command line with the following command:
python scripts/app.py
"""
import argparse

import gradio as gr
from inference import inference_process


def predict(image, audio, size, steps, fps, cfg, pose_weight, face_weight, lip_weight, face_expand_ratio):
"""
Create a gradio interface with the configs.
"""
config = {
'data': {
'source_image': {
'width': size,
'height': size
},
'export_video': {
'fps': fps
}
},
'cfg_scale': cfg,
'source_image': image,
'driving_audio': audio,
'pose_weight': pose_weight,
'face_weight': face_weight,
'lip_weight': lip_weight,
'face_expand_ratio': face_expand_ratio,
'config': 'configs/inference/default.yaml',
'checkpoint': None,
'output': ".cache/output.mp4",
'inference_steps': steps
}
args = argparse.Namespace()
for key, value in config.items():
setattr(args, key, value)
return inference_process(args)

app = gr.Interface(
fn=predict,
inputs=[
gr.Image(label="source image (no webp)", type="filepath", format="jpeg"),
gr.Audio(label="source audio", type="filepath"),
gr.Number(label="size", value=512, minimum=256, maximum=512, step=64, precision=0),
gr.Number(label="steps", value=40, minimum=1, step=1, precision=0),
gr.Number(label="fps", value=25, minimum=1, step=1, precision=0),
gr.Slider(label="CFG Scale", value=3.5, minimum=0, maximum=10, step=0.01),
gr.Number(label="pose weight", value=1.0),
gr.Number(label="face weight", value=1.0),
gr.Number(label="lip weight", value=1.0),
gr.Number(label="face expand ratio", value=1.2),
],
outputs=[gr.Video()],
)
app.launch()
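A usage note on the last line (standard gradio options, assumptions about deployment rather than part of this commit): launch() serves the UI at http://127.0.0.1:7860 by default and accepts the usual overrides.

# Alternatives to the plain app.launch() above:
app.launch(server_name="0.0.0.0", server_port=7860)   # reachable from other machines
# app.launch(share=True)                              # temporary public *.gradio.live URL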
scripts/inference.py (2 additions, 0 deletions)
@@ -288,6 +288,7 @@ def inference_process(args: argparse.Namespace):
     generator = torch.manual_seed(42)
 
     for t in range(times):
+        print(f"[{t+1}/{times}]")
 
         if len(tensor_result) == 0:
             # The first iteration
@@ -342,6 +343,7 @@
     output_file = config.output
     # save the result after all iteration
     tensor_to_video(tensor_result, output_file, driving_audio_path)
+    return output_file
 
 
 if __name__ == "__main__":
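The added return is what closes the loop with the web UI: gradio treats a filepath returned by the wrapped function as the file to render in the gr.Video output. A minimal standalone illustration of that pattern (not from this repo):

import gradio as gr

def make_video(path: str) -> str:
    # Stand-in for inference_process(); echoes an existing video's path.
    return path

demo = gr.Interface(fn=make_video,
                    inputs=gr.Textbox(label="path to an .mp4"),
                    outputs=gr.Video())

if __name__ == "__main__":
    demo.launch()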
