
Add files via upload
XiangGao1102 authored Mar 10, 2024
1 parent 8375227 commit cb5ec44
Showing 100 changed files with 16,331 additions and 0 deletions.
84 changes: 84 additions & 0 deletions configs/model_config.yaml
@@ -0,0 +1,84 @@
model:
  target: fcdiffusion.fcdiffusion.FCDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 2000
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False
    control_mode: "mini_pass"  # must be in ['mini_pass', 'low_pass', 'mid_pass', 'high_pass']

    control_stage_config:
      target: fcdiffusion.fcdiffusion.FreqControlNet
      params:
        use_checkpoint: True
        image_size: 64  # unused
        in_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64  # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    unet_config:
      target: fcdiffusion.fcdiffusion.ControlledUnetModel
      params:
        use_checkpoint: True
        image_size: 64  # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64  # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          # attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
63 changes: 63 additions & 0 deletions fcdiffusion/dataset.py
@@ -0,0 +1,63 @@
import json
import cv2
import numpy as np
from torch.utils.data import Dataset


class TrainDataset(Dataset):
    def __init__(self):
        self.data = []
        # Each line of training_data.json is a standalone JSON object (JSON Lines).
        with open('datasets/training_data.json', 'rt') as f:
            for line in f:
                self.data.append(json.loads(line))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        img_path = item['img_path']
        prompt = item['prompt']

        img = cv2.imread(img_path)

        # Resize the image to 512 x 512.
        img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_AREA)

        # Do not forget that OpenCV reads images in BGR order.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Normalize images to [-1, 1].
        img = (img.astype(np.float32) / 127.5) - 1.0

        # Keep only the part of the path after the substring '6.5'
        # (this assumes the dataset root directory name contains '6.5').
        return dict(jpg=img, txt=prompt, path=img_path.split('6.5')[1])


class TestDataset(Dataset):
    def __init__(self, img_path: str, prompt: str, res_num: int):
        # Repeat the same (image, prompt) pair res_num times so that
        # res_num results are sampled for a single source image.
        self.data = []
        self.img_path = img_path
        self.prompt = prompt
        self.res_num = res_num

        for i in range(self.res_num):
            self.data.append({'jpg': img_path,
                              'txt': prompt})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        img_path = item['jpg']
        prompt = item['txt']

        # Same preprocessing as TrainDataset: resize, BGR -> RGB, scale to [-1, 1].
        img = cv2.imread(img_path)
        img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_AREA)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = (img.astype(np.float32) / 127.5) - 1.0

        return dict(jpg=img, txt=prompt, path=img_path)
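
As a quick sanity check, both datasets plug straight into a standard PyTorch DataLoader. The batch size, worker count, and the sample JSON line below are illustrative assumptions, not values taken from this commit.

# Minimal usage sketch; batch_size/num_workers and the JSON Lines record
# shown below are illustrative assumptions.
from torch.utils.data import DataLoader
from fcdiffusion.dataset import TrainDataset

# TrainDataset expects datasets/training_data.json in JSON Lines form, e.g.:
# {"img_path": "datasets/images/000001.jpg", "prompt": "a photo of a cat"}
dataset = TrainDataset()
loader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)

batch = next(iter(loader))
# batch['jpg'] is a (4, 512, 512, 3) float32 tensor in [-1, 1] (HWC layout);
# batch['txt'] is a list of 4 prompt strings.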
