feat: merge config with cli args

fudan-generative-vision · Jun 24, 2024 · 685bedf
1 parent 01d7fa2
commit 685bedf
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 16 deletions.
diff --git a/configs/inference/default.yaml b/configs/inference/default.yaml
@@ -1,5 +1,5 @@
-source_image: ./default.png
-driving_audio: default.wav
+source_image: examples/reference_images/1.jpg
+driving_audio: examples/driving_audios/1.wav
 
 weight_dtype: fp16
 
@@ -38,10 +38,10 @@ vae:
 
 save_path: ./.cache
 
-face_expand_ratio: 1.1
-pose_weight: 1.1
-face_weight: 1.1
-lip_weight: 1.1
+face_expand_ratio: 1.2
+pose_weight: 1.0
+face_weight: 1.0
+lip_weight: 1.0
 
 unet_additional_kwargs:
   use_inflated_groupnorm: true

diff --git a/hallo/utils/config.py b/hallo/utils/config.py
@@ -0,0 +1,8 @@
+from typing import Dict
+
+
+def filter_non_none(dict_obj: Dict):
+    """Remove every None-valued entry from dict_obj in place and return that same dict."""
+    for none_key in [k for k, v in dict_obj.items() if v is None]:
+        del dict_obj[none_key]
+    return dict_obj
diff --git a/scripts/inference.py b/scripts/inference.py
@@ -44,6 +44,7 @@
 from hallo.models.image_proj import ImageProjModel
 from hallo.models.unet_2d_condition import UNet2DConditionModel
 from hallo.models.unet_3d import UNet3DConditionModel
+from hallo.utils.config import filter_non_none
 from hallo.utils.util import tensor_to_video
 
 
@@ -125,16 +126,16 @@ def inference_process(args: argparse.Namespace):
     modules and variables to prepare for the upcoming inference steps.
     """
     # 1. init config
+    cli_args = filter_non_none(vars(args))
     config = OmegaConf.load(args.config)
-    config = OmegaConf.merge(config, vars(args))
+    config = OmegaConf.merge(config, cli_args)
     source_image_path = config.source_image
     driving_audio_path = config.driving_audio
     save_path = config.save_path
     if not os.path.exists(save_path):
         os.makedirs(save_path)
     motion_scale = [config.pose_weight, config.face_weight, config.lip_weight]
-    if args.checkpoint is not None:
-        config.audio_ckpt_dir = args.checkpoint
+
     # 2. runtime variables
     device = torch.device(
         "cuda") if torch.cuda.is_available() else torch.device("cpu")
@@ -353,21 +354,21 @@ def inference_process(args: argparse.Namespace):
     parser.add_argument(
         "-c", "--config", default="configs/inference/default.yaml")
     parser.add_argument("--source_image", type=str, required=False,
-                        help="source image", default="test_data/source_images/6.jpg")
+                        help="source image")
     parser.add_argument("--driving_audio", type=str, required=False,
-                        help="driving audio", default="test_data/driving_audios/singing/sing_4.wav")
+                        help="driving audio")
     parser.add_argument(
         "--output", type=str, help="output video file name", default=".cache/output.mp4")
     parser.add_argument(
-        "--pose_weight", type=float, help="weight of pose", default=1.0)
+        "--pose_weight", type=float, help="weight of pose", required=False)
     parser.add_argument(
-        "--face_weight", type=float, help="weight of face", default=1.0)
+        "--face_weight", type=float, help="weight of face", required=False)
     parser.add_argument(
-        "--lip_weight", type=float, help="weight of lip", default=1.0)
+        "--lip_weight", type=float, help="weight of lip", required=False)
     parser.add_argument(
-        "--face_expand_ratio", type=float, help="face region", default=1.2)
+        "--face_expand_ratio", type=float, help="face region", required=False)
     parser.add_argument(
-        "--checkpoint", type=str, help="which checkpoint", default=None)
+        "--audio_ckpt_dir", "--checkpoint", type=str, help="specific checkpoint dir", required=False)
 
 
     command_line_args = parser.parse_args()