Code
https://github.com/CompVis/stable-diffusion
https://github.com/huggingface/diffusers
import torch
from ldm.util import instantiate_from_config

def load_model_from_config(config, ckpt, verbose=False):
    """Instantiate the LatentDiffusion model described by config and load the checkpoint weights."""
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt, map_location="cpu")      # PyTorch Lightning checkpoint
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)     # build the model from the YAML config
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)
    model.cuda()
    model.eval()
    return model
model = load_model_from_config(config, opt["ckpt"])
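For context, a minimal sketch of how config and opt are typically built before this call; the YAML and checkpoint paths follow the CompVis repo layout (scripts/txt2img.py) and are assumptions, not values from this document:

from omegaconf import OmegaConf

# Assumed paths, mirroring the CompVis repo layout.
config = OmegaConf.load("configs/stable-diffusion/v1-inference.yaml")  # model architecture and hyperparameters
opt = {"ckpt": "models/ldm/stable-diffusion-v1/sd-v1-4.ckpt"}          # pretrained Stable Diffusion weights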
Key methods of the loaded CompVis model:
model.decode_first_stage()        # VAE decoder: latents -> image
model.encode_first_stage()        # VAE encoder: image -> latent distribution
model.get_learned_conditioning()  # CLIP encoding of the text prompt
model.apply_model()               # U-Net predicts the noise to remove from the current image
p_sample_ddim(...)                # sampler computes the image for the next denoising iteration
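How these pieces chain together in one text-to-image pass (a sketch following scripts/txt2img.py in the CompVis repo; the prompt, step count, and guidance scale are illustrative assumptions):

import torch
from ldm.models.diffusion.ddim import DDIMSampler

sampler = DDIMSampler(model)  # drives the denoising loop via p_sample_ddim
with torch.no_grad(), model.ema_scope():
    cond = model.get_learned_conditioning(["a photograph of an astronaut riding a horse"])  # CLIP text embedding
    uncond = model.get_learned_conditioning([""])  # empty prompt for classifier-free guidance
    # sample() iterates p_sample_ddim, which calls model.apply_model for each noise prediction
    latents, _ = sampler.sample(S=50, batch_size=1, shape=[4, 64, 64],
                                conditioning=cond, verbose=False,
                                unconditional_guidance_scale=7.5,
                                unconditional_conditioning=uncond)
    images = model.decode_first_stage(latents)  # (1, 3, 512, 512), values in [-1, 1]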
Equivalent operations with the huggingface/diffusers API:
pipe = load_stable_diffusion(sd_version=str(opt["model_id"]), precision_t=opt["dtype"])  # loads the pipeline (project helper, not a diffusers API)
image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]  # VAE decode: latents -> image
latents = pipe.vae.encode(image).latent_dist.sample(generator) * pipe.vae.config.scaling_factor  # VAE encode: image -> latents
pipe.encode_prompt()                            # CLIP encoding of the text prompt
pipe.unet(..., return_dict=False)[0]            # U-Net predicts the noise to remove at the current timestep
pipe.scheduler.step(..., return_dict=False)[0]  # scheduler computes the latents for the next denoising iteration
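Put together, a hand-rolled denoising loop with diffusers looks roughly like this (a sketch assuming pipe is a StableDiffusionPipeline; the model id, prompt, step count, and guidance scale are illustrative assumptions):

import torch
from diffusers import StableDiffusionPipeline

# Stand-in for the load_stable_diffusion helper above (model id is an assumption).
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5",
                                               torch_dtype=torch.float16).to("cuda")

# CLIP encoding of the prompt (and of the empty prompt, for classifier-free guidance)
prompt_embeds, negative_embeds = pipe.encode_prompt(
    "a photograph of an astronaut riding a horse", device="cuda",
    num_images_per_prompt=1, do_classifier_free_guidance=True)

pipe.scheduler.set_timesteps(50, device="cuda")
latents = torch.randn(1, 4, 64, 64, device="cuda", dtype=torch.float16)
latents = latents * pipe.scheduler.init_noise_sigma  # scale initial noise for the chosen scheduler

for t in pipe.scheduler.timesteps:
    latent_in = torch.cat([latents] * 2)  # one batch for the unconditional + conditional passes
    latent_in = pipe.scheduler.scale_model_input(latent_in, t)
    # U-Net predicts the noise present in the current latents
    noise_pred = pipe.unet(latent_in, t,
                           encoder_hidden_states=torch.cat([negative_embeds, prompt_embeds]),
                           return_dict=False)[0]
    noise_uncond, noise_cond = noise_pred.chunk(2)
    noise_pred = noise_uncond + 7.5 * (noise_cond - noise_uncond)  # classifier-free guidance
    # scheduler computes the latents for the next denoising iteration
    latents = pipe.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

# VAE decode: latents -> image tensor in [-1, 1]
image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]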