Code

The original CompVis implementation: https://github.com/CompVis/stable-diffusion

The Hugging Face diffusers library: https://github.com/huggingface/diffusers

From scripts/txt2img.py in the CompVis repo, loading the model from a checkpoint:

import torch
from ldm.util import instantiate_from_config

def load_model_from_config(config, ckpt, verbose=False):
    """Instantiate the latent-diffusion model from the config and load checkpoint weights."""
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)

    model.cuda()
    model.eval()
    return model
model = load_model_from_config(config, opt["ckpt"])

The key entry points on the loaded model:

model.decode_first_stage(...)        # first-stage VAE decoder: latent -> image
model.encode_first_stage(...)        # first-stage VAE encoder: image -> latent
model.get_learned_conditioning(...)  # CLIP text encoding of the prompt
model.apply_model(...)               # U-Net predicts the noise to remove from the current latent
p_sample_ddim(...)                   # sampler computes the latent for the next denoising step
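
A minimal sketch of how these methods compose into a full DDIM text-to-image run. The prompt, step count, guidance scale, and latent shape below are illustrative; DDIMSampler.sample wraps the apply_model / p_sample_ddim loop internally:

from ldm.models.diffusion.ddim import DDIMSampler

sampler = DDIMSampler(model)

with torch.no_grad():
    # prompt conditioning and empty-prompt conditioning for classifier-free guidance
    c = model.get_learned_conditioning(["a photo of an astronaut riding a horse"])
    uc = model.get_learned_conditioning([""])

    # DDIM loop in latent space; internally this calls model.apply_model(...)
    # and p_sample_ddim(...) at every step
    samples, _ = sampler.sample(S=50,                      # number of DDIM steps
                                conditioning=c,
                                batch_size=1,
                                shape=[4, 64, 64],         # latent channels, H/8, W/8 for a 512x512 image
                                unconditional_guidance_scale=7.5,
                                unconditional_conditioning=uc)

    # decode the final latent back to pixel space with the first-stage VAE
    x = model.decode_first_stage(samples)
    x = torch.clamp((x + 1.0) / 2.0, min=0.0, max=1.0)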
The corresponding entry points in diffusers:

pipe = load_stable_diffusion(sd_version=str(opt["model_id"]), precision_t=opt["dtype"])

# VAE decode (latent -> image) and encode (image -> latent)
image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
latents = pipe.vae.encode(image).latent_dist.sample(generator) * pipe.vae.config.scaling_factor

pipe.encode_prompt(...)                         # CLIP text encoding of the prompt
pipe.unet(..., return_dict=False)[0]            # U-Net predicts the noise to remove from the current latent
pipe.scheduler.step(..., return_dict=False)[0]  # scheduler computes the latent for the next denoising step
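
A minimal sketch of how these calls fit together in a text-to-image denoising loop. The model id, prompt, guidance scale, and step count are illustrative, and StableDiffusionPipeline.from_pretrained stands in for the load_stable_diffusion helper above:

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# encode_prompt returns (prompt_embeds, negative_prompt_embeds) in recent diffusers versions
prompt_embeds, negative_embeds = pipe.encode_prompt(
    "a photo of an astronaut riding a horse",
    device="cuda", num_images_per_prompt=1, do_classifier_free_guidance=True,
)
text_embeds = torch.cat([negative_embeds, prompt_embeds])

pipe.scheduler.set_timesteps(50)
latents = torch.randn(1, pipe.unet.config.in_channels, 64, 64,
                      device="cuda", dtype=torch.float16)
latents = latents * pipe.scheduler.init_noise_sigma

guidance_scale = 7.5
with torch.no_grad():
    for t in pipe.scheduler.timesteps:
        # duplicate latents for classifier-free guidance (unconditional + conditional)
        latent_model_input = torch.cat([latents] * 2)
        latent_model_input = pipe.scheduler.scale_model_input(latent_model_input, t)

        # U-Net predicts the noise residual
        noise_pred = pipe.unet(latent_model_input, t,
                               encoder_hidden_states=text_embeds,
                               return_dict=False)[0]
        noise_uncond, noise_text = noise_pred.chunk(2)
        noise_pred = noise_uncond + guidance_scale * (noise_text - noise_uncond)

        # scheduler computes the latent for the next denoising step
        latents = pipe.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

    # VAE decode back to pixel space
    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]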