PyTorch as a Framework
If you've worked with NumPy, PyTorch's tensor API will feel immediately familiar — and if you've used a system that tracks dependencies between computations (React's rendering, a spreadsheet), its gradient machinery will too. The key mental model: PyTorch builds a computation graph as you run code, then walks it backwards to compute gradients.
This is called define-by-run (or dynamic computation graphs), and it means PyTorch code is just Python — you can use if statements, loops, and print inside your model.
Tensors: NumPy Arrays with Superpowers
import torch
import numpy as np
# Creating tensors
x = torch.tensor([1.0, 2.0, 3.0])
zeros = torch.zeros(3, 4)    # 3x4 matrix of zeros
ones = torch.ones(2, 3, 4)   # 3D tensor of ones, shape (2, 3, 4)
rand = torch.randn(100, 50)  # Samples from the standard normal distribution
# From NumPy (shares memory — no copy; mutating np_array also mutates tensor)
np_array = np.array([1.0, 2.0, 3.0])
tensor = torch.from_numpy(np_array)
# Shape and dtype
print(rand.shape)  # torch.Size([100, 50])
print(rand.dtype)  # torch.float32 — PyTorch's default floating-point dtype
print(rand.device) # cpu — tensors live on a device; move with .to(...)
Operations
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
b = torch.tensor([[5.0, 6.0], [7.0, 8.0]])
# Element-wise
a + b # [[6, 8], [10, 12]]
a * b # Element-wise (Hadamard) product: [[5, 12], [21, 32]]
# Matrix operations
a @ b # Matrix multiply: [[19, 22], [43, 50]]
a.T # Transpose
# Reductions
a.sum() # 10.0
a.mean(dim=0) # Reduce over dim 0 (per-column means): [2.0, 3.0]
a.max() # 4.0
# Reshaping
a.view(4) # Flatten to 1D: [1, 2, 3, 4] (requires contiguous storage; reshape() also handles non-contiguous)
a.unsqueeze(0) # Insert a size-1 dim at position 0 -> shape (1, 2, 2): [[[1, 2], [3, 4]]]
a.squeeze() # Remove all size-1 dimensions
Autograd: Automatic Differentiation
This is PyTorch's core innovation for ML. Every tensor operation is recorded so gradients can be computed automatically.
# Enable gradient tracking on these leaf tensors
x = torch.tensor(3.0, requires_grad=True)
y = torch.tensor(4.0, requires_grad=True)
# Forward pass: compute some function (every op is recorded in the graph)
z = x**2 + 2*x*y + y**2 # (x+y)^2 = 49
# Backward pass: compute gradients of z w.r.t. every requires_grad leaf
z.backward()
# Gradients: dz/dx = 2x + 2y = 14, dz/dy = 2x + 2y = 14
print(x.grad) # tensor(14.)
print(y.grad) # tensor(14.)
The Computation Graph
Under the hood, PyTorch builds this graph:
x (3.0) ──┐
├──> x**2 ──┐
└──> 2*x*y ──┼──> z (49.0)
y (4.0) ──┤ │
└──> y**2 ──┘
When you call .backward(), PyTorch traverses this graph in reverse, applying the chain rule at each node.
Detaching and No-Grad
# During inference — don't track gradients (saves memory, faster)
with torch.no_grad():
    predictions = model(inputs)  # `model` and `inputs` are defined elsewhere in the guide
# Detach a tensor from the graph: z_detached shares storage with z but
# has requires_grad=False and carries no autograd history
z_detached = z.detach()
Always use torch.no_grad() during evaluation. It's a common bug to forget this and waste memory.
Building Models with nn.Module
import torch.nn as nn
class MLP(nn.Module):
    """Multi-layer perceptron — the simplest neural network.

    Two affine layers with a ReLU nonlinearity and dropout between them.
    Maps (batch, input_dim) float tensors to (batch, output_dim) logits.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        # Submodules assigned as attributes are auto-registered, so their
        # weights show up in .parameters() and .state_dict()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass — autograd records every op applied here."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)
# Instantiate
model = MLP(input_dim=784, hidden_dim=256, output_dim=10)
# Inspect
print(model)
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
# Parameters: 203,530
nn.Sequential for Simple Models
# For linear stacks, Sequential is more concise
model = nn.Sequential(
nn.Linear(784, 256),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(256, 10)
)
Datasets and DataLoaders
from torch.utils.data import Dataset, DataLoader
class TabularDataset(Dataset):
    """Wrap (features, labels) NumPy arrays as an indexable PyTorch Dataset.

    Features are stored as float32 and labels as int64 — the dtypes that
    nn.Linear and nn.CrossEntropyLoss expect.
    """

    def __init__(self, features: np.ndarray, labels: np.ndarray):
        # torch.as_tensor replaces the legacy FloatTensor/LongTensor type
        # constructors: same resulting dtypes, and it avoids a copy when
        # the input already has the requested dtype.
        self.features = torch.as_tensor(features, dtype=torch.float32)
        self.labels = torch.as_tensor(labels, dtype=torch.int64)

    def __len__(self) -> int:
        """Number of samples (one label per sample)."""
        return len(self.labels)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the (features, label) pair at position idx."""
        return self.features[idx], self.labels[idx]
# Create dataset and loader
dataset = TabularDataset(X_train, y_train)
loader = DataLoader(
dataset,
batch_size=64,
shuffle=True, # Shuffle for training
num_workers=4, # Parallel data loading
pin_memory=True # Faster GPU transfer
)
# Iterate
for features, labels in loader:
# features: (64, input_dim), labels: (64,)
pass
The Training Loop
def train_epoch(model, loader, optimizer, loss_fn, device):
    """Run one optimization pass over `loader`; return the mean batch loss."""
    model.train()  # training-mode behavior: dropout active, batch norm updates stats
    running_loss = 0.0
    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # Forward pass: logits -> scalar loss for this mini-batch
        loss = loss_fn(model(batch_x), batch_y)

        # Backward pass: clear stale gradients first — they accumulate otherwise
        optimizer.zero_grad()
        loss.backward()

        # Cap the global gradient norm at 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()  # apply the update

        running_loss += loss.item()
    return running_loss / len(loader)
def evaluate(model, loader, loss_fn, device):
    """Return (mean batch loss, accuracy) over `loader` without touching weights."""
    model.eval()  # inference-mode behavior: dropout off, batch norm uses running stats
    loss_sum = 0.0
    n_correct = 0
    with torch.no_grad():  # skip graph construction: less memory, faster
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            loss_sum += loss_fn(logits, batch_y).item()
            # prediction = argmax over the class dimension
            n_correct += (logits.argmax(1) == batch_y).sum().item()
    return loss_sum / len(loader), n_correct / len(loader.dataset)
# Full training
# Full training — prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(784, 256, 10).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()  # expects raw logits, not softmax probabilities
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
for epoch in range(50):
    # train_loader / val_loader: DataLoaders built as shown in the previous section
    train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device)
    val_loss, val_acc = evaluate(model, val_loader, loss_fn, device)
    scheduler.step()  # anneal the learning rate once per epoch
    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")
Saving and Loading Models
# Save
torch.save({
"epoch": epoch,
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"val_loss": val_loss,
}, "checkpoint.pt")
# Load
checkpoint = torch.load("checkpoint.pt")
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
Always save the optimizer state if you plan to resume training. Always save the epoch for bookkeeping.
Common Bugs and How to Avoid Them
Forgetting optimizer.zero_grad(): Gradients accumulate across calls. Always zero before backward().
Forgetting model.eval() and torch.no_grad(): Dropout is active during training. Eval mode disables it. Forgetting this gives inconsistent inference results.
Shape mismatches: Use print(x.shape) liberally. Get comfortable with broadcasting rules.
Device mismatches: All tensors in an operation must be on the same device. Move model and data to the same device.
# Systematic device handling: build one `device` object and route everything through it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
for batch in loader:
    # Move all tensors to device
    # assumes each batch is a dict of tensors — confirm against the loader's collate_fn
    batch = {k: v.to(device) for k, v in batch.items()}
Next step: apply this foundation to a real problem. See our guide on fine-tuning a pretrained model with Hugging Face.