PyTorch as a Framework
If you've worked with NumPy, PyTorch's tensor API will feel immediately familiar — and if you've used a system that tracks dependencies between computations (React's rendering, a spreadsheet), its gradient machinery will too. The key mental model: PyTorch builds a computation graph as you run code, then walks it backwards to compute gradients.
This is called define-by-run (or dynamic computation graphs), and it means PyTorch code is just Python — you can use if statements, loops, and print inside your model.
Tensors: NumPy Arrays with Superpowers
import torch
import numpy as np
# Creating tensors
x = torch.tensor([1.0, 2.0, 3.0])
zeros = torch.zeros(3, 4)    # 3x4 matrix of zeros
ones = torch.ones(2, 3, 4)   # 3D tensor of ones, shape (2, 3, 4)
rand = torch.randn(100, 50)  # Samples from the standard normal distribution
# From NumPy (shares memory — no copy; mutating np_array also mutates tensor)
np_array = np.array([1.0, 2.0, 3.0])
tensor = torch.from_numpy(np_array)
# Shape and dtype
print(rand.shape)  # torch.Size([100, 50])
print(rand.dtype)  # torch.float32 — PyTorch's default floating-point dtype
print(rand.device) # cpu — tensors live on a device; move with .to(...)
Operations
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
b = torch.tensor([[5.0, 6.0], [7.0, 8.0]])
# Element-wise
a + b # [[6, 8], [10, 12]]
a * b # Element-wise (Hadamard) product: [[5, 12], [21, 32]]
# Matrix operations
a @ b # Matrix multiply: [[19, 22], [43, 50]]
a.T # Transpose
# Reductions
a.sum() # 10.0
a.mean(dim=0) # Reduce over dim 0 (per-column means): [2.0, 3.0]
a.max() # 4.0
# Reshaping
a.view(4) # Flatten to 1D: [1, 2, 3, 4] (requires contiguous storage; reshape() also handles non-contiguous)
a.unsqueeze(0) # Insert a size-1 dim at position 0 -> shape (1, 2, 2): [[[1, 2], [3, 4]]]
a.squeeze() # Remove all size-1 dimensions
Autograd: Automatic Differentiation
This is PyTorch's core innovation for ML. Every tensor operation is recorded so gradients can be computed automatically.
# Enable gradient tracking on these leaf tensors
x = torch.tensor(3.0, requires_grad=True)
y = torch.tensor(4.0, requires_grad=True)
# Forward pass: compute some function (every op is recorded in the graph)
z = x**2 + 2*x*y + y**2 # (x+y)^2 = 49
# Backward pass: compute gradients of z w.r.t. every requires_grad leaf
z.backward()
# Gradients: dz/dx = 2x + 2y = 14, dz/dy = 2x + 2y = 14
print(x.grad) # tensor(14.)
print(y.grad) # tensor(14.)
The Computation Graph
Under the hood, PyTorch builds this graph:
x (3.0) ──┐
├──> x**2 ──┐
└──> 2*x*y ──┼──> z (49.0)
y (4.0) ──┤ │
└──> y**2 ──┘
When you call .backward(), PyTorch traverses this graph in reverse, applying the chain rule at each node.
Detaching and No-Grad
# During inference — don't track gradients (saves memory, faster)
with torch.no_grad():
    predictions = model(inputs)  # `model` and `inputs` are defined elsewhere in the guide
# Detach a tensor from the graph: z_detached shares storage with z but
# has requires_grad=False and carries no autograd history
z_detached = z.detach()
Always use torch.no_grad() during evaluation. It's a common bug to forget this and waste memory.
Building Models with nn.Module
import torch.nn as nn
class MLP(nn.Module):
    """Multi-layer perceptron — the simplest neural network.

    Two affine layers with a ReLU nonlinearity and dropout between them.
    Maps (batch, input_dim) float tensors to (batch, output_dim) logits.
    """

    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super().__init__()
        # Submodules assigned as attributes are auto-registered, so their
        # weights show up in .parameters() and .state_dict()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass — autograd records every op applied here."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(hidden)
# Instantiate
model = MLP(input_dim=784, hidden_dim=256, output_dim=10)
# Inspect
print(model)
print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
# Parameters: 203,530
nn.Sequential for Simple Models
# For linear stacks, Sequential is more concise
model = nn.Sequential(
nn.Linear(784, 256),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(256, 10)
)
Datasets and DataLoaders
from torch.utils.data import Dataset, DataLoader
class TabularDataset(Dataset):
    """Wrap (features, labels) NumPy arrays as an indexable PyTorch Dataset.

    Features are stored as float32 and labels as int64 — the dtypes that
    nn.Linear and nn.CrossEntropyLoss expect.
    """

    def __init__(self, features: np.ndarray, labels: np.ndarray):
        # torch.as_tensor replaces the legacy FloatTensor/LongTensor type
        # constructors: same resulting dtypes, and it avoids a copy when
        # the input already has the requested dtype.
        self.features = torch.as_tensor(features, dtype=torch.float32)
        self.labels = torch.as_tensor(labels, dtype=torch.int64)

    def __len__(self) -> int:
        """Number of samples (one label per sample)."""
        return len(self.labels)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the (features, label) pair at position idx."""
        return self.features[idx], self.labels[idx]
# Create dataset and loader
dataset = TabularDataset(X_train, y_train)
loader = DataLoader(
dataset,
batch_size=64,
shuffle=True, # Shuffle for training
num_workers=4, # Parallel data loading
pin_memory=True # Faster GPU transfer
)
# Iterate
for features, labels in loader:
# features: (64, input_dim), labels: (64,)
pass
The Training Loop
def train_epoch(model, loader, optimizer, loss_fn, device):
    """Run one optimization pass over `loader`; return the mean batch loss."""
    model.train()  # training-mode behavior: dropout active, batch norm updates stats
    running_loss = 0.0
    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # Forward pass: logits -> scalar loss for this mini-batch
        loss = loss_fn(model(batch_x), batch_y)

        # Backward pass: clear stale gradients first — they accumulate otherwise
        optimizer.zero_grad()
        loss.backward()

        # Cap the global gradient norm at 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()  # apply the update

        running_loss += loss.item()
    return running_loss / len(loader)
def evaluate(model, loader, loss_fn, device):
    """Return (mean batch loss, accuracy) over `loader` without touching weights."""
    model.eval()  # inference-mode behavior: dropout off, batch norm uses running stats
    loss_sum = 0.0
    n_correct = 0
    with torch.no_grad():  # skip graph construction: less memory, faster
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            loss_sum += loss_fn(logits, batch_y).item()
            # prediction = argmax over the class dimension
            n_correct += (logits.argmax(1) == batch_y).sum().item()
    return loss_sum / len(loader), n_correct / len(loader.dataset)
# Full training
# Full training — prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLP(784, 256, 10).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()  # expects raw logits, not softmax probabilities
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
for epoch in range(50):
    # train_loader / val_loader: DataLoaders built as shown in the previous section
    train_loss = train_epoch(model, train_loader, optimizer, loss_fn, device)
    val_loss, val_acc = evaluate(model, val_loader, loss_fn, device)
    scheduler.step()  # anneal the learning rate once per epoch
    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")
Saving and Loading Models
# Save
torch.save({
"epoch": epoch,
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
"val_loss": val_loss,
}, "checkpoint.pt")
# Load
checkpoint = torch.load("checkpoint.pt")
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
Always save the optimizer state if you plan to resume training. Always save the epoch for bookkeeping.
Common Bugs and How to Avoid Them
Forgetting optimizer.zero_grad(): Gradients accumulate across calls. Always zero before backward().
Forgetting model.eval() and torch.no_grad(): Dropout is active during training. Eval mode disables it. Forgetting this gives inconsistent inference results.
Shape mismatches: Use print(x.shape) liberally. Get comfortable with broadcasting rules.
Device mismatches: All tensors in an operation must be on the same device. Move model and data to the same device.
# Systematic device handling: build one `device` object and route everything through it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
for batch in loader:
    # Move all tensors to device
    # assumes each batch is a dict of tensors — confirm against the loader's collate_fn
    batch = {k: v.to(device) for k, v in batch.items()}
Next step: apply this foundation to a real problem. See our guide on fine-tuning a pretrained model with Hugging Face.