Autoencoders
Conv autoencoder (MNIST-scale)
import torch
import torch.nn as nn
class ConvAE(nn.Module):
    """Convolutional autoencoder for single-channel 32x32 images.

    The encoder applies three stride-2 convolutions (32 -> 16 -> 8 -> 4),
    flattens the 128x4x4 feature map and projects it to ``latent_dim``
    values. The decoder expands a latent vector back to 128x4x4 and
    upsamples with three stride-2 transposed convolutions; the final
    sigmoid keeps reconstructions in [0, 1].

    Args:
        latent_dim: Length of the bottleneck vector produced by ``encode``.
    """

    def __init__(self, latent_dim=32):
        super().__init__()
        bottleneck_shape = (128, 4, 4)
        flat_features = 128 * 4 * 4

        # Encoder: one (in_channels, out_channels) pair per stride-2 stage.
        encoder_layers = []
        for c_in, c_out in ((1, 32), (32, 64), (64, 128)):
            encoder_layers.append(
                nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)
            )
            encoder_layers.append(nn.ReLU(inplace=True))
        encoder_layers.append(nn.Flatten())
        encoder_layers.append(nn.Linear(flat_features, latent_dim))
        self.enc = nn.Sequential(*encoder_layers)

        # Latent vector -> flat feature map consumed by the decoder.
        self.dec_fc = nn.Linear(latent_dim, flat_features)

        # Decoder: kernel 4 / stride 2 / padding 1 doubles the spatial size.
        decoder_layers = [nn.Unflatten(1, bottleneck_shape)]
        for c_in, c_out in ((128, 64), (64, 32)):
            decoder_layers.append(
                nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=2, padding=1)
            )
            decoder_layers.append(nn.ReLU(inplace=True))
        decoder_layers.append(
            nn.ConvTranspose2d(32, 1, kernel_size=4, stride=2, padding=1)
        )
        decoder_layers.append(nn.Sigmoid())
        self.dec = nn.Sequential(*decoder_layers)

    def encode(self, x):
        """Return the latent codes for image batch ``x``."""
        latent = self.enc(x)
        return latent

    def decode(self, z):
        """Reconstruct images from latent codes ``z``."""
        feature_map = self.dec_fc(z)
        return self.dec(feature_map)

    def forward(self, x):
        """Encode ``x`` and decode it again (full reconstruction pass)."""
        z = self.encode(x)
        return self.decode(z)
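Assumes input [B,1,32,32] so that the three stride-2 stages land at 4×4 (32→16→8→4); adjust channels/sizes for your resolution. Native 28×28 MNIST images must be padded or resized to 32×32 first: the encoder would still reach 4×4 (28→14→7→4), but the decoder always outputs 32×32, so the reconstruction would not match a 28×28 target.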
Training step (MSE)
# Module-level model and optimizer shared by the training steps in this file.
model = ConvAE(latent_dim=32)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)


def train_step(x):
    """Run one reconstruction update on batch ``x``.

    The model reconstructs ``x`` and is trained to minimise the mean squared
    error between the reconstruction and the input itself.

    Returns:
        The loss for this batch as a Python float.
    """
    opt.zero_grad()
    reconstruction = model(x)
    recon_loss = nn.functional.mse_loss(reconstruction, x)
    recon_loss.backward()
    opt.step()
    return recon_loss.item()
Denoising variant
def add_noise(x, sigma=0.2):
    """Corrupt ``x`` with zero-mean Gaussian noise and clip to [0, 1].

    Args:
        x: Tensor to corrupt.
        sigma: Standard deviation of the added noise.

    Returns:
        A tensor shaped like ``x`` with every value clamped to [0, 1].
    """
    gaussian = torch.randn_like(x)
    corrupted = x + sigma * gaussian
    return torch.clamp(corrupted, 0, 1)
def denoise_step(x_clean):
    """Run one denoising-autoencoder update.

    The model sees a noise-corrupted copy of ``x_clean`` but is scored
    against the clean batch, so it has to learn to undo the corruption.

    Returns:
        The MSE loss for this batch as a Python float.
    """
    corrupted = add_noise(x_clean)
    opt.zero_grad()
    denoised = model(corrupted)
    # The target is the clean input, not the corrupted one fed to the model.
    denoise_loss = nn.functional.mse_loss(denoised, x_clean)
    denoise_loss.backward()
    opt.step()
    return denoise_loss.item()
Latent inspection
# Inference only: switch the model to eval mode and disable autograd tracking.
model.eval()
with torch.no_grad():
    z = model.encode(batch)  # latent codes for `batch`
    recon = model.decode(z)  # reconstructions decoded from those codes
Use t-SNE/UMAP on z for 2D visualization; anomalies often have high reconstruction error.
VAE (contrast)
Encoder outputs μ, log σ²; sample z = μ + σ * ε; decoder reconstructs. Loss = reconstruction + KL divergence to standard normal. Enables sampling new images from z ~ N(0,I)—standard AE does not define a proper generative density without extra assumptions.
Takeaways
- AE: compress and reconstruct; bottleneck controls capacity.
- Denoising: learn invariances to corruption.
- VAE: generative latent with KL regularization.
Quick FAQ
GANs — introduction
Generator & discriminator (DCGAN-style sketch)
import torch
import torch.nn as nn
nz = 100  # noise dim


class G(nn.Module):
    """DCGAN-style generator mapping noise [B, nz] to images [B, 1, 28, 28].

    A linear layer projects the noise to a 256x7x7 feature map, then two
    stride-2 transposed convolutions upsample 7 -> 14 -> 28. The final tanh
    bounds the output to [-1, 1].
    """

    def __init__(self):
        super().__init__()
        project = nn.Linear(nz, 256 * 7 * 7)
        to_feature_map = nn.Unflatten(1, (256, 7, 7))
        # bias=False: the following BatchNorm supplies its own shift term.
        upsample_to_14 = nn.ConvTranspose2d(
            256, 128, kernel_size=4, stride=2, padding=1, bias=False
        )
        normalise = nn.BatchNorm2d(128)
        upsample_to_28 = nn.ConvTranspose2d(
            128, 1, kernel_size=4, stride=2, padding=1, bias=False
        )
        self.net = nn.Sequential(
            project,
            to_feature_map,
            upsample_to_14,
            normalise,
            nn.ReLU(inplace=True),
            upsample_to_28,
            nn.Tanh(),
        )

    def forward(self, z):
        """Generate an image batch from noise ``z``."""
        images = self.net(z)
        return images
class D(nn.Module):
    """DCGAN-style discriminator mapping images [B, 1, 28, 28] to one logit.

    Three stride-2 convolutions shrink the image 28 -> 14 -> 7 -> 3, after
    which the 256x3x3 feature map is flattened and scored by a linear layer.
    The output is a raw logit (no sigmoid is applied here).
    """

    def __init__(self):
        super().__init__()
        # First stage has no BatchNorm; the later stages do.
        layers = [
            nn.Conv2d(1, 64, kernel_size=4, stride=2, padding=1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        for c_in, c_out in ((64, 128), (128, 256)):
            layers.append(
                nn.Conv2d(c_in, c_out, kernel_size=4, stride=2, padding=1, bias=False)
            )
            layers.append(nn.BatchNorm2d(c_out))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        layers.append(nn.Flatten())
        layers.append(nn.Linear(256 * 3 * 3, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the real/fake logit for each image in ``x``."""
        logits = self.net(x)
        return logits
Two stride-2 transposed convs: 7×7→14×14→28×28. D outputs a logit (use BCEWithLogits). Normalize real images to [-1,1] to match Tanh.
One training iteration (non-saturating G loss)
# Prefer the GPU when one is available; the step functions create their
# tensors on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# One generator and one discriminator, each with its own Adam optimizer.
G_m = G().to(device)
D_m = D().to(device)
optG = torch.optim.Adam(G_m.parameters(), lr=2e-4, betas=(0.5, 0.999))
optD = torch.optim.Adam(D_m.parameters(), lr=2e-4, betas=(0.5, 0.999))

# The discriminator emits raw logits, so the sigmoid lives inside the loss.
bce = nn.BCEWithLogitsLoss()
def step_D(real):
    """Run one discriminator update on a batch of real images.

    The discriminator is trained to output "real" (target 1) for ``real``
    and "fake" (target 0) for freshly generated samples.

    Args:
        real: Batch of real images; its first dimension is the batch size.

    Returns:
        The summed real + fake BCE loss as a Python float.
    """
    b = real.size(0)
    z = torch.randn(b, nz, device=device)
    # The generator is not updated in this step, so generate the fakes
    # without recording an autograd graph instead of building one and
    # detaching it afterwards.
    with torch.no_grad():
        fake = G_m(z)
    real_loss = bce(D_m(real), torch.ones(b, 1, device=device))
    fake_loss = bce(D_m(fake), torch.zeros(b, 1, device=device))
    loss = real_loss + fake_loss
    optD.zero_grad()
    loss.backward()
    optD.step()
    return loss.item()
def step_G(b):
    """Run one generator update using the non-saturating loss.

    ``b`` noise vectors are generated and scored by the discriminator; the
    generator is trained so that those scores are classified as "real"
    (target 1). Gradients also flow into the discriminator's parameters
    during the backward pass, but only ``optG`` takes a step here.

    Args:
        b: Number of samples to generate for this update.

    Returns:
        The generator loss as a Python float.
    """
    optG.zero_grad()
    noise = torch.randn(b, nz, device=device)
    fake_logits = D_m(G_m(noise))
    real_targets = torch.ones(b, 1, device=device)
    g_loss = bce(fake_logits, real_targets)
    g_loss.backward()
    optG.step()
    return g_loss.item()
Practical tips
- Train D more often than G early on if D is too weak—or the reverse if D dominates.
- Use spectral norm / WGAN-GP in harder setups for stability.
- Mode collapse: generator outputs limited variety; detect via sample diversity metrics.
Takeaways
- Minimax game between G and D; non-saturating loss common for G.
- DCGAN: strided conv D, transposed conv G, BN, no pooling in core blocks.
- Modern CV often uses diffusion or autoregressive models for higher fidelity—GANs still teach the adversarial idea.
Quick FAQ
Conditional GANs: feed the class label into z and/or intermediate features so generation is class-controlled.
Diffusion models
Forward process (concept)
Given clean x_0, define q(x_t | x_{t-1}) = N(√(1-β_t) x_{t-1}, β_t I). With reparameterization, sample x_t = √(α̅_t) x_0 + √(1-α̅_t) ε with ε ~ N(0,I), where α̅_t is cumulative product of 1-β_t. Training picks random t and teaches a net ε_θ(x_t, t) to predict ε.
# Pseudocode: sample x_t in closed form (DDPM-style)
import torch
def q_sample(x0, t, alphas_cumprod, noise=None):
    """Sample x_t from the forward process q(x_t | x_0) in closed form.

    Computes ``sqrt(acp[t]) * x0 + sqrt(1 - acp[t]) * noise`` where ``acp``
    is ``alphas_cumprod``.

    Args:
        x0: Clean data with the batch on dimension 0. Any number of trailing
            dimensions is supported (images, vectors, sequences, ...).
        t: Integer timestep indices into ``alphas_cumprod``, one per sample.
        alphas_cumprod: 1-D tensor of cumulative products of ``1 - beta``.
        noise: Optional noise tensor shaped like ``x0``; standard Gaussian
            noise is drawn when omitted.

    Returns:
        The noised sample ``x_t``, same shape as ``x0``.
    """
    if noise is None:
        noise = torch.randn_like(x0)
    acp_t = alphas_cumprod[t]
    # One coefficient per sample, broadcast over every non-batch dimension.
    # Deriving the shape from x0 avoids silently mis-broadcasting inputs
    # that are not 4-D.
    coef_shape = (-1,) + (1,) * (x0.dim() - 1)
    sqrt_acp = acp_t.sqrt().reshape(coef_shape)
    sqrt_om = (1 - acp_t).sqrt().reshape(coef_shape)
    return sqrt_acp * x0 + sqrt_om * noise
alphas_cumprod is precomputed on device; t shape [B] indexes per-sample timestep.
Training step (predict noise)
# unet(x_t, t) -> predicted noise, same shape as x_t
def train_step(unet, x0, optimizer, alphas_cumprod):
    """Run one noise-prediction training update.

    A random timestep is drawn for every sample in ``x0``, the sample is
    noised to that timestep with ``q_sample``, and ``unet`` is trained to
    predict the noise that was added (MSE loss).

    Args:
        unet: Callable ``unet(x_t, t)`` returning predicted noise shaped
            like ``x_t``.
        x0: Batch of clean data; dimension 0 is the batch.
        optimizer: Optimizer over ``unet``'s parameters.
        alphas_cumprod: Noise schedule indexed by timestep; its length is
            the number of timesteps.

    Returns:
        The loss for this batch as a Python float.
    """
    batch_size = x0.size(0)
    num_timesteps = len(alphas_cumprod)
    t = torch.randint(0, num_timesteps, (batch_size,), device=x0.device)
    eps = torch.randn_like(x0)
    x_t = q_sample(x0, t, alphas_cumprod, eps)

    optimizer.zero_grad()
    eps_pred = unet(x_t, t)
    noise_loss = torch.nn.functional.mse_loss(eps_pred, eps)
    noise_loss.backward()
    optimizer.step()
    return noise_loss.item()
Sampling (ancestral DDPM outline)
@torch.no_grad()
def p_sample_loop(unet, shape, betas, alphas, alphas_cumprod):
    """Generate samples by DDPM ancestral sampling.

    Starts from standard Gaussian noise and walks the timesteps from
    ``len(betas) - 1`` down to 0. At each step the predicted noise gives the
    mean of p(x_{t-1} | x_t):

        mean = (x_t - beta_t / sqrt(1 - acp_t) * eps_pred) / sqrt(alpha_t)

    and, for t > 0, fresh noise scaled by ``sqrt(beta_t)`` is added (the
    sigma_t^2 = beta_t variance choice). No noise is added at t = 0.

    Args:
        unet: Callable ``unet(x_t, t)`` returning predicted noise shaped
            like ``x_t``; ``t`` is a long tensor of shape ``[shape[0]]``.
        shape: Shape of the batch to generate; ``shape[0]`` is the batch size.
        betas: 1-D tensor of per-step noise variances; samples are created
            on its device.
        alphas: 1-D tensor, expected to equal ``1 - betas``.
        alphas_cumprod: 1-D tensor, expected to be the cumulative product of
            ``alphas``.

    Returns:
        The final sample ``x_0`` with the requested ``shape``.
    """
    x = torch.randn(shape, device=betas.device)
    for t in reversed(range(len(betas))):
        ts = torch.full((shape[0],), t, device=x.device, dtype=torch.long)
        pred_noise = unet(x, ts)

        beta_t = betas[t]
        noise_coef = beta_t / (1 - alphas_cumprod[t]).sqrt()
        mean = (x - noise_coef * pred_noise) / alphas[t].sqrt()

        if t > 0:
            x = mean + beta_t.sqrt() * torch.randn_like(x)
        else:
            # Final step: return the mean without adding noise.
            x = mean
    return x
Full coefficients are in DDPM papers/cheatsheets; libraries implement them exactly.
Stable Diffusion (stack)
- VAE encoder/decoder: operate in lower-dimensional latent images.
- UNet: denoises latents with cross-attention to text tokens.
- Text encoder: CLIP-like transformer turns prompt into conditioning.
Hugging Face diffusers (optional)
# pip install diffusers transformers accelerate torch
from diffusers import StableDiffusionPipeline
import torch
# Load the pretrained pipeline with half-precision weights.
# NOTE(review): confirm this repository id is still hosted on the Hub.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
)
# Move the whole pipeline to the GPU before generating.
pipe = pipe.to("cuda")

# The pipeline output exposes a list of images; keep the first one.
img = pipe("a photo of an astronaut riding a horse").images[0]
Requires GPU with sufficient VRAM for default resolution; use smaller models or CPU offload for constraints.
Takeaways
- Forward: add noise; learn reverse denoising transitions.
- UNet + timestep (and optional text) conditioning is standard for images.
- Latent diffusion + text encoder = efficient high-res generation (Stable Diffusion class).
Quick FAQ
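Classifier-free guidance: ε̂ = ε_u + w (ε_c - ε_u), where ε_c and ε_u are the conditional and unconditional noise predictions of ε_θ and w is the guidance scale.
Chapter FAQ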
Quick FAQ
Quick FAQ
Conditional GANs: feed the class label into z and/or intermediate features so generation is class-controlled.