Five complete projects that tie together every concept from Parts I–VI.
Each project: problem definition → math → architecture → code → evaluation → extensions.
Tokenization is the first and last step in every LLM pipeline. Understanding it deeply means understanding why "tokenization" becomes ["token", "ization"] but "untokenize" becomes ["un", "token", "ize"]. Bugs here silently corrupt everything downstream.
class BPETokenizer:
    """
    Byte Pair Encoding tokenizer, built from scratch.
    Maps to Part I §5.1 and Karpathy's minbpe.
    """

    def __init__(self):
        self.merges = {}  # (pair) -> new_token_id, in the order learned
        self.vocab = {}   # token_id -> bytes

    def _get_pair_counts(self, ids):
        """Count frequency of each adjacent token-ID pair."""
        counts = {}
        for i in range(len(ids) - 1):
            pair = (ids[i], ids[i + 1])
            counts[pair] = counts.get(pair, 0) + 1
        return counts

    def _merge_pair(self, ids, pair, new_id):
        """Replace all (non-overlapping, left-to-right) occurrences of pair with new_id."""
        new_ids = []
        i = 0
        while i < len(ids):
            if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
                new_ids.append(new_id)
                i += 2  # skip both members of the merged pair
            else:
                new_ids.append(ids[i])
                i += 1
        return new_ids

    def train(self, text, vocab_size):
        """
        Train BPE on text.
        Start with 256 byte-level tokens, merge pairs until we reach vocab_size.
        """
        # Start with raw bytes
        tokens = list(text.encode("utf-8"))
        # Initialize vocab with single bytes (0-255)
        self.vocab = {i: bytes([i]) for i in range(256)}

        num_merges = vocab_size - 256
        for i in range(num_merges):
            counts = self._get_pair_counts(tokens)
            if not counts:
                break  # fewer than two tokens remain; nothing left to merge
            # Find most frequent pair
            best_pair = max(counts, key=counts.get)
            new_id = 256 + i
            # Record the merge
            self.merges[best_pair] = new_id
            self.vocab[new_id] = self.vocab[best_pair[0]] + self.vocab[best_pair[1]]
            # Apply merge to token sequence
            tokens = self._merge_pair(tokens, best_pair, new_id)
            if (i + 1) % 100 == 0:
                print(f"Merge {i+1}/{num_merges}: {best_pair} → {new_id}"
                      f" ('{self.vocab[new_id].decode('utf-8', errors='replace')}')"
                      f" | Tokens: {len(tokens)}")

        # BUG FIX: guard against empty training text, which previously raised
        # ZeroDivisionError when computing the compression ratio.
        n_bytes = len(text.encode("utf-8"))
        if tokens:
            print(f"Compression: {n_bytes} bytes → {len(tokens)} tokens"
                  f" ({n_bytes/len(tokens):.1f}x)")

    def encode(self, text):
        """Encode text to token IDs by replaying merges in learned order."""
        tokens = list(text.encode("utf-8"))
        # dicts preserve insertion order (Python 3.7+), so iterating
        # self.merges replays the merges exactly as they were learned.
        for pair, new_id in self.merges.items():
            tokens = self._merge_pair(tokens, pair, new_id)
        return tokens

    def decode(self, ids):
        """Decode token IDs back to text (invalid UTF-8 becomes U+FFFD)."""
        raw_bytes = b"".join(self.vocab[i] for i in ids)
        return raw_bytes.decode("utf-8", errors="replace")


# ── Usage ──
# Guarded so importing this module (e.g. from Project 2) does not require
# shakespeare.txt to exist.
if __name__ == "__main__":
    tokenizer = BPETokenizer()

    # Train on Shakespeare (or any text)
    with open("shakespeare.txt") as f:
        text = f.read()
    tokenizer.train(text, vocab_size=512)  # 256 bytes + 256 merges

    # Test roundtrip
    encoded = tokenizer.encode("To be or not to be")
    decoded = tokenizer.decode(encoded)
    assert decoded == "To be or not to be"
    print(f"Tokens: {encoded}")
    # BUG FIX: errors='?' is not a valid codec error handler (it raised
    # LookupError at runtime); use 'replace' as elsewhere in the class.
    print(f"Token strings: {[tokenizer.vocab[t].decode('utf-8', errors='replace') for t in encoded]}")
# (Exercise note carried over from the tokenizer project: compare special
# tokens such as <|endoftext|> / <|pad|>, and tiktoken (GPT-4's tokenizer),
# on the same text. Where do they differ?)

import torch
from torch.utils.data import Dataset, DataLoader


class CharDataset(Dataset):
    """
    Character-level dataset for language modeling.
    Each item: (input[0:T], target[1:T+1]) — shifted by one.
    """

    def __init__(self, text, block_size, chars=None):
        """
        Args:
            text: raw text to encode.
            block_size: context length T of each training example.
            chars: optional explicit character vocabulary (sorted list).
                Pass the SAME list to train and val datasets so their token
                IDs agree. If None, the vocabulary is built from `text` alone.
        """
        self.block_size = block_size

        # Build character vocabulary (or adopt the shared one)
        if chars is None:
            chars = sorted(set(text))
        self.char_to_idx = {c: i for i, c in enumerate(chars)}
        self.idx_to_char = {i: c for c, i in self.char_to_idx.items()}
        self.vocab_size = len(chars)

        # Encode entire text up front
        self.data = torch.tensor([self.char_to_idx[c] for c in text], dtype=torch.long)
        print(f"Vocab size: {self.vocab_size} | Data length: {len(self.data):,} chars")

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        chunk = self.data[idx : idx + self.block_size + 1]
        x = chunk[:-1]  # input: chars 0 to T-1
        y = chunk[1:]   # target: chars 1 to T (shifted right)
        return x, y


# ── Setup ──
# Download: wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
if __name__ == "__main__":
    with open("input.txt") as f:
        text = f.read()

    # BUG FIX: build ONE vocabulary from the full text and share it. Before,
    # train and val each derived a vocab from their own slice, so token IDs
    # (and vocab_size) could disagree between the two datasets, silently
    # corrupting the validation loss.
    shared_chars = sorted(set(text))

    # 90/10 train/val split
    split = int(0.9 * len(text))
    train_dataset = CharDataset(text[:split], block_size=256, chars=shared_chars)
    val_dataset = CharDataset(text[split:], block_size=256, chars=shared_chars)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
from gpt import GPT, GPTConfig  # Part IV §8

# Hyperparameters for a small character-level GPT.
config = GPTConfig()
hyperparams = {
    "vocab_size": train_dataset.vocab_size,  # ~65 chars
    "block_size": 256,
    "n_layer": 6,
    "n_head": 6,
    "n_embd": 384,
    "dropout": 0.2,
}
for name, value in hyperparams.items():
    setattr(config, name, value)

model = GPT(config)  # ~10.6M parameters — trains in ~10 min on a single GPU

# ── Training with the loop from Part IV §9 ──
# After ~5000 steps, the model generates:
#
# ROMEO:
# What say'st thou? I will not be thy love,
# That hath so long been absent from thy state,
# And yet I know not what to say to thee.
Remove or modify one component at a time and measure the effect on validation loss:
| Ablation | Val Loss | Δ vs Baseline | Takeaway |
|---|---|---|---|
| Full model (baseline) | 1.48 | — | — |
| Remove positional encoding | 1.85 | +0.37 | Position info is critical |
| 1 head instead of 6 | 1.62 | +0.14 | Multi-head helps significantly |
| No residual connections | 2.30 | +0.82 | Training collapses without them |
| No layer norm | 1.95 | +0.47 | Stabilization is essential |
| ReLU instead of GELU | 1.51 | +0.03 | Activation choice matters less |
| No dropout | 1.52 | +0.04 | Slight overfitting at this scale |
| 2 layers instead of 6 | 1.68 | +0.20 | Depth matters for quality |
| FFN 2× instead of 4× | 1.55 | +0.07 | FFN capacity moderately important |
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# ───────────────────────────────────────────
# STEP 1: Document Chunking
# ───────────────────────────────────────────
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks by character count.

    Chunks prefer to end at the nearest sentence boundary ('.') found in the
    second half of the window. Chunks of 50 characters or fewer are dropped.
    """
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        # Find nearest sentence boundary
        if end < len(text):
            boundary = text.rfind('.', start + chunk_size // 2, end)
            if boundary != -1:
                end = boundary + 1
        chunks.append(text[start:end].strip())
        # BUG FIX: always advance by at least one character. With a small
        # chunk_size or large overlap, `end - overlap` could be <= start,
        # which made the original loop spin forever.
        start = max(end - overlap, start + 1)
    return [c for c in chunks if len(c) > 50]  # filter tiny chunks


# ───────────────────────────────────────────
# STEP 2: Embedding & Indexing
# ───────────────────────────────────────────
# Load embedding model (Part I §5.1 — modern contextual embeddings)
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dim, fast


def build_index(chunks):
    """Embed chunks and build a FAISS inner-product index over them."""
    embeddings = embedder.encode(chunks, show_progress_bar=True)
    embeddings = np.array(embeddings, dtype=np.float32)
    # Normalize for cosine similarity (Part I §8.3)
    faiss.normalize_L2(embeddings)
    # Build index — Inner Product on normalized vectors = cosine similarity
    dim = embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)
    print(f"Indexed {len(chunks)} chunks, dim={dim}")
    return index, embeddings


# ───────────────────────────────────────────
# STEP 3: Retrieval
# ───────────────────────────────────────────
def retrieve(query, index, chunks, k=5):
    """Retrieve the top-k most relevant chunks for a query."""
    q_emb = embedder.encode([query]).astype(np.float32)
    faiss.normalize_L2(q_emb)
    # score(q, c) = cos_sim(q, c) — Part I §8.3
    scores, indices = index.search(q_emb, k)
    results = []
    for score, idx in zip(scores[0], indices[0]):
        results.append({
            "chunk": chunks[idx],
            "score": float(score),
            "index": int(idx),
        })
    return results


# ───────────────────────────────────────────
# STEP 4: Generate Answer with LLM
# ───────────────────────────────────────────
def generate_answer(query, retrieved_chunks, llm_client):
    """Construct a prompt with retrieved context and generate an answer."""
    context = "\n\n---\n\n".join(r["chunk"] for r in retrieved_chunks)
    prompt = f"""Answer the question based on the context below. If the context doesn't contain the answer, say "I don't have enough information."

Context:
{context}

Question: {query}

Answer:"""
    # Call your LLM (local or API)
    response = llm_client.generate(prompt, max_tokens=500)
    return response


# ───────────────────────────────────────────
# FULL PIPELINE
# ───────────────────────────────────────────
if __name__ == "__main__":
    # NOTE(review): `llm_client` was referenced but never defined in the
    # original listing — plug in your own client exposing .generate() here.
    # 1. Ingest
    with open("documents.txt") as f:
        raw_text = f.read()
    chunks = chunk_text(raw_text)

    # 2. Index
    index, embeddings = build_index(chunks)

    # 3. Query
    query = "How does attention work in transformers?"
    results = retrieve(query, index, chunks, k=5)

    # 4. Generate
    answer = generate_answer(query, results, llm_client)
    print(answer)
import torch  # BUG FIX: torch.bfloat16 is used below but torch was never imported


# ───────────────────────────────────────────
# STEP 1: Prepare Instruction Dataset
# ───────────────────────────────────────────
def format_instruction(example):
    """Convert one Alpaca-style record to chat format for fine-tuning.

    Expects a mapping with 'instruction' and 'output' keys; returns a dict
    with a single 'text' field in the <|user|>/<|assistant|> chat template.
    """
    return {
        "text": f"""<|user|>
{example['instruction']}
<|assistant|>
{example['output']}<|end|>"""
    }


def main():
    """Run the full QLoRA pipeline (heavy: downloads model and dataset)."""
    # Local imports keep the training stack optional when this module is
    # imported only for format_instruction.
    from datasets import load_dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    from trl import SFTTrainer, SFTConfig

    # Load and format dataset
    dataset = load_dataset("tatsu-lab/alpaca", split="train")
    dataset = dataset.map(format_instruction)

    # ───────────────────────────────────────────
    # STEP 2: Load Model in 4-bit (QLoRA)
    # ───────────────────────────────────────────
    # 4-bit quantization config (Part III §III)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",             # NormalFloat4 — optimal for Gaussian weights
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,        # quantize the quantization constants too
    )
    model_id = "Qwen/Qwen2.5-1.5B"
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config=bnb_config, device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token

    # Prepare for LoRA training
    model = prepare_model_for_kbit_training(model)

    # ───────────────────────────────────────────
    # STEP 3: Apply LoRA (Part III §II)
    # ───────────────────────────────────────────
    lora_config = LoraConfig(
        r=16,           # rank — 8 to 64 typical
        lora_alpha=32,  # scaling factor (α/r applied to ΔW)
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # Trainable: ~4.7M / 1.5B total = 0.31%

    # ───────────────────────────────────────────
    # STEP 4: Train
    # ───────────────────────────────────────────
    training_args = SFTConfig(
        output_dir="./lora-output",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,  # effective batch = 16
        learning_rate=2e-4,             # higher than full fine-tuning
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        bf16=True,
        logging_steps=25,
        save_strategy="epoch",
        max_seq_length=1024,
        dataset_text_field="text",
    )
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )
    trainer.train()

    # ───────────────────────────────────────────
    # STEP 5: Merge & Save
    # ───────────────────────────────────────────
    # Merge LoRA weights back into base (Part III §II: W = W₀ + BA)
    merged_model = model.merge_and_unload()
    merged_model.save_pretrained("./merged-model")
    tokenizer.save_pretrained("./merged-model")
    # The merged model has zero additional inference cost


if __name__ == "__main__":
    main()
# (Continued from the LoRA extension notes: target the MLP projections —
# gate_proj, up_proj, down_proj — these contain most of the model's
# "knowledge.")

import json, re, math

# ───────────────────────────────────────────
# TOOL DEFINITIONS
# ───────────────────────────────────────────
TOOLS = {
    "calculator": {
        "description": "Evaluate a mathematical expression. Input: expression string.",
        # SECURITY: builtins are stripped and only whitelisted math names are
        # exposed, but eval() on model-generated text is still risky — prefer
        # a real expression parser for untrusted input.
        "function": lambda expr: str(eval(expr, {"__builtins__": {}},
                                          {"sqrt": math.sqrt, "pi": math.pi,
                                           "log": math.log, "exp": math.exp}))
    },
    "search": {
        "description": "Search the web for information. Input: search query.",
        "function": lambda q: web_search(q)  # your search API wrapper (defined elsewhere)
    },
    "python": {
        "description": "Execute Python code. Input: code string. Returns stdout.",
        "function": lambda code: run_python_sandbox(code)  # sandbox runner (defined elsewhere)
    },
}

# ───────────────────────────────────────────
# SYSTEM PROMPT
# ───────────────────────────────────────────
SYSTEM_PROMPT = """You are a helpful assistant that solves problems step by step.
You have access to these tools:

{tool_descriptions}

To use a tool, respond with:
THOUGHT: [your reasoning about what to do next]
ACTION: [tool_name]
INPUT: [tool input]

After receiving a result, continue reasoning:
THOUGHT: [interpret the result]
...

When you have the final answer:
THOUGHT: [final reasoning]
ANSWER: [your final answer]

Always think before acting. If a tool call fails, try a different approach."""


# ───────────────────────────────────────────
# REACT AGENT LOOP
# ───────────────────────────────────────────
class ReActAgent:
    """ReAct loop: THOUGHT → ACTION → OBSERVATION, until ANSWER or max_steps."""

    def __init__(self, llm_client, tools, max_steps=10):
        self.llm = llm_client        # object exposing .chat(messages) -> str
        self.tools = tools           # name -> {"description", "function"}
        self.max_steps = max_steps   # hard cap on reasoning/tool iterations

    def run(self, query):
        """Answer `query`; returns {"answer": str, "steps": int}."""
        # Build tool descriptions for the prompt
        tool_desc = "\n".join(
            f"- {name}: {t['description']}" for name, t in self.tools.items()
        )
        system = SYSTEM_PROMPT.format(tool_descriptions=tool_desc)
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": query},
        ]

        for step in range(self.max_steps):
            # Get LLM response
            response = self.llm.chat(messages)
            messages.append({"role": "assistant", "content": response})
            print(f"\n── Step {step+1} ──\n{response}")

            # Check for final answer
            if "ANSWER:" in response:
                answer = response.split("ANSWER:")[1].strip()
                return {"answer": answer, "steps": step + 1}

            # Extract and execute tool call.
            action_match = re.search(r"ACTION:\s*(\w+)", response)
            # BUG FIX: the original r"INPUT:\s*(.+)" with re.DOTALL captured
            # everything to the end of the response, including any trailing
            # THOUGHT/ACTION lines. Stop at the next marker (or end of text).
            input_match = re.search(
                r"INPUT:\s*(.+?)(?=\n\s*(?:THOUGHT|ACTION|INPUT|ANSWER|OBSERVATION):|\Z)",
                response, re.DOTALL,
            )
            if action_match and input_match:
                tool_name = action_match.group(1).strip()
                tool_input = input_match.group(1).strip()
                if tool_name in self.tools:
                    try:
                        result = self.tools[tool_name]["function"](tool_input)
                        observation = f"OBSERVATION: {result}"
                    except Exception as e:
                        # Feed errors back so the model can try another approach
                        observation = f"OBSERVATION: Error — {str(e)}"
                else:
                    observation = f"OBSERVATION: Unknown tool '{tool_name}'"
                messages.append({"role": "user", "content": observation})
                print(observation)

        return {"answer": "Max steps reached", "steps": self.max_steps}


# ── Usage ──
# Guarded: `llm_client` must be supplied by the surrounding application.
if __name__ == "__main__":
    agent = ReActAgent(llm_client, TOOLS)
    result = agent.run("What is the square root of the population of Tokyo?")
    # Step 1: THOUGHT: I need to find Tokyo's population. ACTION: search INPUT: Tokyo population
    # Step 2: OBSERVATION: 13.96 million (2023)
    # Step 3: THOUGHT: Now calculate sqrt(13960000). ACTION: calculator INPUT: sqrt(13960000)
    # Step 4: OBSERVATION: 3736.3...
    # Step 5: THOUGHT: I have the answer. ANSWER: approximately 3,736
Project 1 (Tokenizer) feeds into Project 2 (GPT) — you can swap in your BPE tokenizer.
Project 2 (GPT) gives you a model that Project 4 (LoRA) can fine-tune.
Project 3 (RAG) provides context that Project 5 (Agent) can search and reason over.
Project 5 (Agent) can use the model from Project 4 with the search from Project 3.
Together, they form a complete LLM system: tokenize → pretrain → embed → retrieve → fine-tune → deploy as agent.