
Add files via upload
XiangGao1102 authored Mar 10, 2024
1 parent 8375227 commit cb5ec44
Showing 100 changed files with 16,331 additions and 0 deletions.
84 changes: 84 additions & 0 deletions configs/model_config.yaml
@@ -0,0 +1,84 @@
model:
  target: fcdiffusion.fcdiffusion.FCDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 2000
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False
    control_mode: "mini_pass"  # must be in ['mini_pass', 'low_pass', 'mid_pass', 'high_pass']

    control_stage_config:
      target: fcdiffusion.fcdiffusion.FreqControlNet
      params:
        use_checkpoint: True
        image_size: 64  # unused
        in_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64  # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    unet_config:
      target: fcdiffusion.fcdiffusion.ControlledUnetModel
      params:
        use_checkpoint: True
        image_size: 64  # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64  # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          # attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
63 changes: 63 additions & 0 deletions fcdiffusion/dataset.py
@@ -0,0 +1,63 @@
import json
import cv2
import numpy as np
from torch.utils.data import Dataset


class TrainDataset(Dataset):
    def __init__(self):
        self.data = []
        # Each line of training_data.json is a standalone JSON object (JSON Lines).
        with open('datasets/training_data.json', 'rt') as f:
            for line in f:
                self.data.append(json.loads(line))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        img_path = item['img_path']
        prompt = item['prompt']

        img = cv2.imread(img_path)

        # Resize the image to 512 x 512.
        img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_AREA)

        # Do not forget that OpenCV reads images in BGR order.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Normalize images to [-1, 1].
        img = (img.astype(np.float32) / 127.5) - 1.0

        # Keep only the part of the path after the substring '6.5'
        # (this assumes the dataset root directory name contains '6.5').
        return dict(jpg=img, txt=prompt, path=img_path.split('6.5')[1])


class TestDataset(Dataset):
    def __init__(self, img_path: str, prompt: str, res_num: int):
        # Repeat the same (image, prompt) pair res_num times so that
        # res_num results are sampled for a single source image.
        self.data = []
        self.img_path = img_path
        self.prompt = prompt
        self.res_num = res_num

        for i in range(self.res_num):
            self.data.append({'jpg': img_path,
                              'txt': prompt})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        img_path = item['jpg']
        prompt = item['txt']

        # Same preprocessing as TrainDataset: resize, BGR -> RGB, scale to [-1, 1].
        img = cv2.imread(img_path)
        img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_AREA)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = (img.astype(np.float32) / 127.5) - 1.0

        return dict(jpg=img, txt=prompt, path=img_path)
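
As a quick sanity check, both datasets plug straight into a standard PyTorch DataLoader. The batch size, worker count, and the sample JSON line below are illustrative assumptions, not values taken from this commit.

# Minimal usage sketch; batch_size/num_workers and the JSON Lines record
# shown below are illustrative assumptions.
from torch.utils.data import DataLoader
from fcdiffusion.dataset import TrainDataset

# TrainDataset expects datasets/training_data.json in JSON Lines form, e.g.:
# {"img_path": "datasets/images/000001.jpg", "prompt": "a photo of a cat"}
dataset = TrainDataset()
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)

batch = next(iter(loader))
# batch['jpg'] is a (4, 512, 512, 3) float32 tensor in [-1, 1] (HWC layout);
# batch['txt'] is a list of 4 prompt strings.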
