Complete, annotated, runnable implementations of every major architecture.
Each connects directly to the math from Parts I–III.
import torch # โโ Creating tensors โโ x = torch.tensor([1.0, 2.0, 3.0]) # from list X = torch.randn(3, 4) # random normal (3ร4) W = torch.zeros(4, 2) # zeros (4ร2) I = torch.eye(3) # identity matrix # โโ Key operations (map to Part I math) โโ dot = torch.dot(x, x) # dot product: xยทx matmul = X @ W # matrix multiply: (3ร4)(4ร2) โ (3ร2) norm = torch.norm(x) # L2 norm: ||x|| cosine = torch.nn.functional.cosine_similarity( X[0:1], X[1:2]) # cosine similarity # โโ Broadcasting: scalar ops apply element-wise โโ scaled = X * 2.0 + 1.0 # every element: 2x + 1 # โโ Reshaping โโ flat = X.view(-1) # flatten to 1D (12 elements) batch = X.unsqueeze(0) # add batch dim: (3,4) โ (1,3,4) # โโ Device management โโ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") X = X.to(device) # move to GPU
When you call loss.backward(), PyTorch traverses this graph in reverse (backpropagation), computing gradients via the chain rule. You never write gradient code manually.
# โโ Autograd tracks operations on tensors with requires_grad=True โโ w = torch.tensor([2.0, -1.0], requires_grad=True) x = torch.tensor([3.0, 4.0]) y_true = torch.tensor(1.0) # Forward pass z = torch.dot(w, x) # z = 2(3) + (-1)(4) = 2 y_hat = torch.sigmoid(z) # ฯ(2) โ 0.8808 loss = -y_true * torch.log(y_hat) - (1 - y_true) * torch.log(1 - y_hat) # Backward pass โ computes all gradients automatically! loss.backward() print(w.grad) # โL/โw โ the gradient we derived by hand in Part I # tensor([-0.3574, -0.4765]) โ (ลท - y) * x = (0.881-1)*[3,4] # โโ CRITICAL: zero gradients before next backward pass โโ w.grad.zero_()
What .backward() actually does: it walks the computational graph (stored as a DAG of Function nodes), calling each operation's .backward() method, which implements the local derivative. For sigmoid: the SigmoidBackward node computes \(\sigma(z)(1-\sigma(z)) \times \text{incoming gradient}\). For matrix multiply: MmBackward computes \(\mathbf{X}^T \delta\) and \(\delta \mathbf{W}^T\). This is exactly the chain rule from Part I §1.3.
import torch.nn as nn class TwoLayerNet(nn.Module): def __init__(self, d_in, d_hidden, d_out): super().__init__() self.layer1 = nn.Linear(d_in, d_hidden) # Wโx + bโ self.relu = nn.ReLU() # activation self.layer2 = nn.Linear(d_hidden, d_out) # Wโh + bโ def forward(self, x): h = self.relu(self.layer1(x)) # aโฝยนโพ = ReLU(Wโx + bโ) return self.layer2(h) # zโฝยฒโพ = Wโaโฝยนโพ + bโ model = TwoLayerNet(784, 128, 10) print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}") # 784*128 + 128 + 128*10 + 10 = 101,770
import torch import torch.nn as nn from torch.utils.data import DataLoader, TensorDataset # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ # LINEAR REGRESSION โ maps to Part I ยง2.1 # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ # Generate synthetic data: y = 3xโ - 2xโ + 1 + noise torch.manual_seed(42) X = torch.randn(200, 2) y = X @ torch.tensor([3.0, -2.0]) + 1.0 + 0.3 * torch.randn(200) # Model: exactly ลท = Wx + b (Part I equation) model = nn.Linear(2, 1) # Loss: MSE = MLE under Gaussian noise (Part I ยง2.1) loss_fn = nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.05) for epoch in range(100): y_hat = model(X).squeeze() loss = loss_fn(y_hat, y) optimizer.zero_grad() # clear old gradients loss.backward() # compute โL/โW, โL/โb optimizer.step() # W โ W - ฮทยทโL print(f"Learned weights: {model.weight.data}") # Should be close to [3.0, -2.0] print(f"Learned bias: {model.bias.data}") # Should be close to 1.0 # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ # LOGISTIC REGRESSION โ maps to Part I ยง2.2 # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ # Generate 2-class data X_pos = torch.randn(100, 2) + torch.tensor([2.0, 2.0]) X_neg = torch.randn(100, 2) + torch.tensor([-2.0, -2.0]) X_cls = torch.cat([X_pos, X_neg]) y_cls = torch.cat([torch.ones(100), torch.zeros(100)]) model_cls = nn.Sequential( nn.Linear(2, 1), nn.Sigmoid() # P(y=1|x) = ฯ(Wx + b) ) # Loss: BCE = MLE under Bernoulli (Part I ยง2.2) loss_fn_cls = nn.BCELoss() optimizer_cls = torch.optim.SGD(model_cls.parameters(), lr=0.1) for epoch in range(200): p_hat = model_cls(X_cls).squeeze() loss = loss_fn_cls(p_hat, y_cls) optimizer_cls.zero_grad() loss.backward() optimizer_cls.step() accuracy = ((p_hat > 0.5).float() == y_cls).float().mean() print(f"Accuracy: {accuracy:.1%}") # Should be ~99%+
This implementation computes backprop manually โ no autograd โ so you can verify every gradient matches Part II's walkthrough.
import numpy as np class ManualMLP: """2-layer MLP with manual forward + backward. No frameworks.""" def __init__(self, d_in, d_hidden, d_out): # He initialization (Part I ยง3.1) self.W1 = np.random.randn(d_in, d_hidden) * np.sqrt(2.0 / d_in) self.b1 = np.zeros((1, d_hidden)) self.W2 = np.random.randn(d_hidden, d_out) * np.sqrt(2.0 / d_hidden) self.b2 = np.zeros((1, d_out)) def forward(self, X): # Cache everything โ backprop needs these (Part II ยงI) self.X = X # (N, d_in) self.z1 = X @ self.W1 + self.b1 # (N, d_hidden) self.a1 = np.maximum(0, self.z1) # ReLU self.z2 = self.a1 @ self.W2 + self.b2 # (N, d_out) self.a2 = 1 / (1 + np.exp(-self.z2)) # sigmoid return self.a2 def backward(self, y): N = y.shape[0] # โโ Output layer (Part II ยงI Step 4-5) โโ dz2 = self.a2 - y # ฮดโฝยฒโพ = ลท - y self.dW2 = self.a1.T @ dz2 / N # โL/โWยฒ = aโฝยนโพแต ยท ฮดโฝยฒโพ self.db2 = np.mean(dz2, axis=0, keepdims=True) # โโ Hidden layer (Part II ยงI Step 6-7) โโ da1 = dz2 @ self.W2.T # propagate error back dz1 = da1 * (self.z1 > 0).astype(float) # ReLU derivative self.dW1 = self.X.T @ dz1 / N # โL/โWยน = xแต ยท ฮดโฝยนโพ self.db1 = np.mean(dz1, axis=0, keepdims=True) def update(self, lr): self.W2 -= lr * self.dW2; self.b2 -= lr * self.db2 self.W1 -= lr * self.dW1; self.b1 -= lr * self.db1 # โโ Train on MNIST (simplified: 784 โ 128 โ 10) โโ # To verify: compare gradients against PyTorch autograd net = ManualMLP(784, 128, 10) # ... training loop omitted for brevity
# Gradient checking: compare analytical vs numerical gradient def numerical_gradient(f, param, eps=1e-5): grad = np.zeros_like(param) for idx in np.ndindex(param.shape): old_val = param[idx] param[idx] = old_val + eps loss_plus = f() param[idx] = old_val - eps loss_minus = f() grad[idx] = (loss_plus - loss_minus) / (2 * eps) param[idx] = old_val return grad
import torch import torch.nn as nn class SimpleCNN(nn.Module): """ LeNet-style CNN (Part I ยง4.1). Conv โ ReLU โ Pool โ Conv โ ReLU โ Pool โ FC โ FC """ def __init__(self, num_classes=10): super().__init__() self.features = nn.Sequential( # Conv block 1: detect low-level features (edges, textures) nn.Conv2d(1, 32, kernel_size=3, padding=1), # (1,28,28)โ(32,28,28) nn.ReLU(), nn.MaxPool2d(2), # (32,28,28)โ(32,14,14) # Conv block 2: detect mid-level features (shapes, parts) nn.Conv2d(32, 64, kernel_size=3, padding=1), # (32,14,14)โ(64,14,14) nn.ReLU(), nn.MaxPool2d(2), # (64,14,14)โ(64,7,7) ) self.classifier = nn.Sequential( nn.Flatten(), # (64,7,7)โ(3136) nn.Linear(64 * 7 * 7, 128), nn.ReLU(), nn.Dropout(0.5), # regularization (Part II ยงVII) nn.Linear(128, num_classes), # logits (no softmax โ inside loss) ) def forward(self, x): return self.classifier(self.features(x)) # โโ Training on MNIST โโ from torchvision import datasets, transforms from torch.utils.data import DataLoader transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ]) train_data = datasets.MNIST("./data", train=True, download=True, transform=transform) train_loader = DataLoader(train_data, batch_size=64, shuffle=True) model = SimpleCNN() optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) loss_fn = nn.CrossEntropyLoss() # combines LogSoftmax + NLLLoss for epoch in range(5): for X_batch, y_batch in train_loader: logits = model(X_batch) loss = loss_fn(logits, y_batch) optimizer.zero_grad() loss.backward() optimizer.step() print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}") # Reaches ~99% accuracy in 5 epochs
nn.CrossEntropyLoss expects raw logits, not probabilities. It applies softmax internally. If you apply softmax yourself and then pass to CrossEntropyLoss, you're doing double-softmax โ a common bug that makes training fail silently.
import torch import torch.nn as nn class CharLSTM(nn.Module): """ Character-level language model (Part I ยง4.2). Predicts next character given previous characters. """ def __init__(self, vocab_size, embed_dim=64, hidden_dim=128): super().__init__() self.embed = nn.Embedding(vocab_size, embed_dim) self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=2, batch_first=True, dropout=0.2) self.head = nn.Linear(hidden_dim, vocab_size) def forward(self, x, hidden=None): emb = self.embed(x) # (B, T) โ (B, T, embed_dim) out, hidden = self.lstm(emb, hidden) # (B, T, hidden_dim) logits = self.head(out) # (B, T, vocab_size) return logits, hidden # โโ Generate text โโ @torch.no_grad() def generate(model, start_str, length=200, temperature=0.8): model.eval() chars = [char_to_idx[c] for c in start_str] input_seq = torch.tensor([chars]) hidden = None for _ in range(length): logits, hidden = model(input_seq, hidden) # Temperature sampling (Part II ยงV) probs = torch.softmax(logits[0, -1] / temperature, dim=-1) next_char = torch.multinomial(probs, 1) input_seq = next_char.unsqueeze(0) chars.append(next_char.item()) return "".join(idx_to_char[i] for i in chars)
import torch import torch.nn as nn class Word2Vec(nn.Module): """ Skip-gram with negative sampling (Part I ยง5.1). Maximize: log ฯ(u_o ยท v_c) + ฮฃ log ฯ(-u_neg ยท v_c) """ def __init__(self, vocab_size, embed_dim): super().__init__() self.center_embed = nn.Embedding(vocab_size, embed_dim) # v_c self.context_embed = nn.Embedding(vocab_size, embed_dim) # u_o # Initialize small (not zero โ need symmetry breaking) nn.init.uniform_(self.center_embed.weight, -0.5/embed_dim, 0.5/embed_dim) nn.init.zeros_(self.context_embed.weight) def forward(self, center, context, negatives): """ center: (B,) โ center word indices context: (B,) โ positive context word indices negatives: (B, num_neg) โ negative sample indices """ v_c = self.center_embed(center) # (B, d) u_pos = self.context_embed(context) # (B, d) u_neg = self.context_embed(negatives) # (B, num_neg, d) # Positive score: dot product โ sigmoid โ log pos_score = torch.sum(v_c * u_pos, dim=1) # (B,) pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-8) # Negative scores: dot product โ sigmoid(-score) โ log neg_score = torch.bmm(u_neg, v_c.unsqueeze(2)).squeeze() # (B, num_neg) neg_loss = -torch.log(torch.sigmoid(-neg_score) + 1e-8).sum(dim=1) return (pos_loss + neg_loss).mean() # After training, king - man + woman โ queen: # v = model.center_embed.weight.data # result = v[king] - v[man] + v[woman] # nearest = cosine_similarity(result, v).argmax()
import torch
import torch.nn as nn
import math


class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention (Part I §6.1-6.2).

    Attention(Q,K,V) = softmax(QKᵀ/√d_k)V
    """

    def __init__(self, d_model, d_k):
        super().__init__()
        self.d_k = d_k
        self.W_q = nn.Linear(d_model, d_k, bias=False)
        self.W_k = nn.Linear(d_model, d_k, bias=False)
        self.W_v = nn.Linear(d_model, d_k, bias=False)

    def forward(self, x, mask=None):
        Q, K, V = self.W_q(x), self.W_k(x), self.W_v(x)  # each (B, T, d_k)
        # Scores scaled by √d_k to keep softmax gradients well-behaved
        scores = Q @ K.transpose(-2, -1) / math.sqrt(self.d_k)  # (B, T, T)
        if mask is not None:
            # Zeros in the mask mean "may not attend here" (e.g. future tokens)
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = torch.softmax(scores, dim=-1)  # (B, T, T) attention weights
        return weights @ V                       # weighted value sum (B, T, d_k)


class MultiHeadAttention(nn.Module):
    """Multi-head attention (Part I §6.3).

    Run h attention heads in parallel, concatenate, project.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_k = d_model // n_heads
        self.n_heads = n_heads
        # One fused projection for Q, K, V — cheaper than three matmuls
        self.W_qkv = nn.Linear(d_model, 3 * d_model, bias=False)
        self.W_o = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x, mask=None):
        B, T, C = x.shape

        def split_heads(t):
            # (B, T, C) → (B, n_heads, T, d_k)
            return t.view(B, T, self.n_heads, self.d_k).transpose(1, 2)

        Q, K, V = (split_heads(t) for t in self.W_qkv(x).chunk(3, dim=-1))

        # Scaled dot-product attention, independently per head
        scores = Q @ K.transpose(-2, -1) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = torch.softmax(scores, dim=-1)
        heads = weights @ V                      # (B, h, T, d_k)

        # Merge heads back: (B, h, T, d_k) → (B, T, C), then project
        merged = heads.transpose(1, 2).contiguous().view(B, T, C)
        return self.W_o(merged)
A minimal but complete decoder-only transformer. Every component maps directly to Part I §6.4 and §7.1.
import torch
import torch.nn as nn
import math


# ──────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────
class GPTConfig:
    """Hyperparameters; attributes are read directly off the class or instance."""
    vocab_size: int = 50257
    block_size: int = 256   # max sequence length
    n_layer: int = 6
    n_head: int = 6
    n_embd: int = 384
    dropout: float = 0.2


# ──────────────────────────────────────────────────────────
# COMPONENTS
# ──────────────────────────────────────────────────────────
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention (Part I §6.3 + §6.4)."""

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.n_head = config.n_head
        self.d_k = config.n_embd // config.n_head
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)  # fused Q,K,V
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.attn_drop = nn.Dropout(config.dropout)
        self.resid_drop = nn.Dropout(config.dropout)
        # Lower-triangular causal mask (Part I §6.4) as a buffer: moves with
        # the module across devices but is not a trainable parameter.
        tril = torch.tril(torch.ones(config.block_size, config.block_size))
        self.register_buffer(
            "mask", tril.view(1, 1, config.block_size, config.block_size)
        )

    def forward(self, x):
        B, T, C = x.shape
        q, k, v = self.c_attn(x).split(C, dim=2)
        # (B, T, C) → (B, n_head, T, d_k)
        q = q.view(B, T, self.n_head, self.d_k).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.d_k).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.d_k).transpose(1, 2)
        # Attention: softmax(QKᵀ/√d_k + mask) V
        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.d_k)
        att = att.masked_fill(self.mask[:, :, :T, :T] == 0, float('-inf'))
        att = self.attn_drop(torch.softmax(att, dim=-1))
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_drop(self.c_proj(y))


class FeedForward(nn.Module):
    """Position-wise FFN (Part I §6.4): expand d → 4d, GELU, project back."""

    def __init__(self, config):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),  # modern choice over ReLU
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        return self.net(x)


class TransformerBlock(nn.Module):
    """Pre-norm transformer block (Part II §VII): LayerNorm before each sublayer."""

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.ffn = FeedForward(config)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))  # residual around attention
        x = x + self.ffn(self.ln2(x))   # residual around FFN
        return x


# ──────────────────────────────────────────────────────────
# GPT MODEL
# ──────────────────────────────────────────────────────────
class GPT(nn.Module):
    """Full GPT: Embedding + N × TransformerBlock + LM Head (Part I §7.1)."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
        self.pos_emb = nn.Embedding(config.block_size, config.n_embd)
        self.drop = nn.Dropout(config.dropout)
        self.blocks = nn.Sequential(
            *[TransformerBlock(config) for _ in range(config.n_layer)]
        )
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: embedding and output projection share one matrix
        # (Part I §7.1)
        self.tok_emb.weight = self.lm_head.weight

        n_params = sum(p.numel() for p in self.parameters())
        print(f"GPT model: {n_params/1e6:.1f}M parameters")

    def forward(self, idx, targets=None):
        """idx: (B, T) token ids. Returns (logits, loss); loss is None
        unless `targets` of shape (B, T) is supplied."""
        B, T = idx.shape
        assert T <= self.config.block_size

        # Token + position embeddings
        pos = torch.arange(T, device=idx.device)
        x = self.drop(self.tok_emb(idx) + self.pos_emb(pos))

        # Transformer blocks, then final LayerNorm
        x = self.ln_f(self.blocks(x))

        # Language-model head → logits over the vocabulary
        logits = self.lm_head(x)  # (B, T, vocab_size)

        loss = None
        if targets is not None:
            loss = nn.functional.cross_entropy(
                logits.view(-1, logits.size(-1)), targets.view(-1)
            )
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """Autoregressive generation (Part II §V); appends sampled tokens to idx."""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.config.block_size:]  # crop to context window
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature      # last position only
            if top_k is not None:
                # Keep only the k largest logits; the rest can never be sampled
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = float('-inf')
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_next], dim=1)
        return idx
import math

import torch
from torch.optim import AdamW


def train_gpt(model, train_loader, val_loader, config):
    """Train a GPT model: AdamW with selective weight decay, warmup+cosine
    LR schedule, gradient accumulation, and gradient clipping.

    config must provide: epochs, grad_accum_steps.
    FIX: `math` was used (cosine schedule, perplexity) but never imported —
    the original raised NameError at runtime. The unused CosineAnnealingLR
    import was dropped (a hand-rolled LambdaLR schedule is used instead).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # ── Optimizer: AdamW with weight decay (Part II §VIII) ──
    # Decay only matrices (dim >= 2); biases and norm params stay undecayed.
    decay_params = [p for n, p in model.named_parameters() if p.dim() >= 2]
    no_decay_params = [p for n, p in model.named_parameters() if p.dim() < 2]
    optim_groups = [
        {"params": decay_params, "weight_decay": 0.1},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    optimizer = AdamW(optim_groups, lr=3e-4, betas=(0.9, 0.999))

    # ── LR schedule: linear warmup then cosine decay (Part II §VIII) ──
    total_steps = len(train_loader) * config.epochs
    warmup_steps = int(0.05 * total_steps)

    def lr_lambda(step):
        if step < warmup_steps:
            return step / warmup_steps
        progress = (step - warmup_steps) / (total_steps - warmup_steps)
        return 0.5 * (1 + math.cos(math.pi * progress))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    # ── Training loop ──
    for epoch in range(config.epochs):
        model.train()
        total_loss = 0
        for batch_idx, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)
            logits, loss = model(x, targets=y)
            # Scale so accumulated grads average over the micro-batches
            loss = loss / config.grad_accum_steps
            loss.backward()
            if (batch_idx + 1) % config.grad_accum_steps == 0:
                # Clip to unit norm — prevents exploding gradients
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            total_loss += loss.item() * config.grad_accum_steps

        # ── Validation ──
        val_loss = evaluate(model, val_loader, device)
        ppl = math.exp(val_loss)  # perplexity (Part III §VII)
        print(f"Epoch {epoch+1} | Train loss: {total_loss/len(train_loader):.4f}"
              f" | Val loss: {val_loss:.4f} | Val PPL: {ppl:.1f}")


@torch.no_grad()
def evaluate(model, val_loader, device):
    """Mean per-token cross-entropy over the validation set."""
    model.eval()
    total_loss, total_tokens = 0, 0
    for x, y in val_loader:
        x, y = x.to(device), y.to(device)
        _, loss = model(x, targets=y)
        # model returns the mean loss; weight by token count to aggregate
        total_loss += loss.item() * y.numel()
        total_tokens += y.numel()
    return total_loss / total_tokens
import torch import torch.nn as nn class LoRALinear(nn.Module): """ LoRA adapter (Part III ยงII). W_new = W_frozen + (ฮฑ/r) ยท B @ A Only B and A are trainable โ base weights are frozen. """ def __init__(self, base_layer: nn.Linear, r: int = 8, alpha: float = 16.0): super().__init__() self.base = base_layer self.r = r self.alpha = alpha d_in, d_out = base_layer.in_features, base_layer.out_features # LoRA matrices self.A = nn.Parameter(torch.randn(d_in, r) * 0.01) # Gaussian init self.B = nn.Parameter(torch.zeros(r, d_out)) # Zero init โ ฮW=0 at start # Freeze base weights self.base.weight.requires_grad = False if self.base.bias is not None: self.base.bias.requires_grad = False def forward(self, x): # h = Wโx + (ฮฑ/r) ยท B(Ax) [Part III ยงII equation] base_out = self.base(x) lora_out = (x @ self.A) @ self.B * (self.alpha / self.r) return base_out + lora_out def merge(self): """Merge LoRA weights into base for zero-cost inference.""" self.base.weight.data += (self.alpha / self.r) * (self.A @ self.B).T def apply_lora(model, r=8, alpha=16.0, target_modules=["c_attn", "c_proj"]): """Replace target Linear layers with LoRA-wrapped versions.""" for name, module in model.named_modules(): for target in target_modules: if hasattr(module, target): original = getattr(module, target) if isinstance(original, nn.Linear): setattr(module, target, LoRALinear(original, r, alpha)) # Freeze all non-LoRA parameters for name, param in model.named_parameters(): if "lora" not in name.lower() and ".A" not in name and ".B" not in name: param.requires_grad = False trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) total = sum(p.numel() for p in model.parameters()) print(f"LoRA: {trainable:,} trainable / {total:,} total ({100*trainable/total:.2f}%)") # Usage: # model = GPT(config) # model.load_state_dict(pretrained_weights) # load base model # apply_lora(model, r=16, alpha=32) # # Now train normally โ only LoRA params update
datasets library). Generate 500 characters and evaluate the perplexity.