PyTorch 学习笔记 (6): 卷积神经网络 (CNN)
2025-11-05·12 min read
#PyTorch#Deep Learning#CNN#Computer Vision
MNIST 是深度学习的 "Hello World"!
- 60,000 张训练图片,10,000 张测试图片
- 每张图片是 28x28 的灰度图
- 任务:识别图片中的数字(0-9)
CNN 核心概念
- 卷积层 (Conv):提取图像特征
- 池化层 (Pool):降低维度,减少计算
- 全连接层 (FC):分类决策
数据准备
python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
# Pick the fastest available device: CUDA GPU, then Apple-Silicon MPS, then CPU.
device = torch.device('cuda' if torch.cuda.is_available() else
                      'mps' if torch.backends.mps.is_available() else 'cpu')
print(f"使用设备: {device}")

# Preprocessing pipeline applied to every image.
transform = transforms.Compose([
    transforms.ToTensor(),                      # to tensor; scales pixel values [0,255] -> [0,1]
    transforms.Normalize((0.1307,), (0.3081,))  # standardize with MNIST's dataset mean / std
])

# Download (if not cached under ./data) and load the training split.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

# Download (if not cached) and load the test split.
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform
)

# Batched loaders; shuffle only the training data so epochs see different orderings.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"训练集大小: {len(train_dataset)}")
print(f"测试集大小: {len(test_dataset)}")
print(f"批次大小: {batch_size}")
数据维度说明
text
images shape: (batch_size, channels, height, width)
即 (64, 1, 28, 28) = 64张图片,1个颜色通道,28x28像素
定义 CNN 模型
python
class SimpleCNN(nn.Module):
    """
    Simple CNN for 28x28 grayscale MNIST digits.

    Layer-by-layer shapes (stride-1 convs, no padding):
        input              (1, 28, 28)
        conv1 1->16, 3x3   (16, 26, 26)  + ReLU
        maxpool 2x2        (16, 13, 13)
        conv2 16->32, 3x3  (32, 11, 11)  + ReLU
        maxpool 2x2        (32, 5, 5)
        flatten            (800,)
        fc1 800->128       + ReLU
        fc2 128->10        raw logits, one per digit class
    """

    def __init__(self):
        super().__init__()
        # Conv layer 1: low-level features (edges, corners, ...).
        self.conv1 = nn.Conv2d(
            in_channels=1,    # grayscale input -> 1 channel
            out_channels=16,  # number of learned filters
            kernel_size=3,    # 3x3 kernel
            stride=1,
            padding=0
        )
        # Conv layer 2: higher-level features.
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3)
        # 2x2 max-pool halves each spatial dimension; it has no weights,
        # so one module can safely be reused after both conv layers.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Classifier head.
        self.fc1 = nn.Linear(32 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)
        # Activation (stateless, reused everywhere).
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of images (N, 1, 28, 28) to logits (N, 10)."""
        # First conv block
        x = self.conv1(x)        # (N, 16, 26, 26)
        x = self.relu(x)
        x = self.pool(x)         # (N, 16, 13, 13)
        # Second conv block
        x = self.conv2(x)        # (N, 32, 11, 11)
        x = self.relu(x)
        x = self.pool(x)         # (N, 32, 5, 5)
        # Flatten all dims except batch. torch.flatten(x, 1) is preferred over
        # x.view(-1, 32*5*5): the latter silently "re-batches" the tensor if the
        # spatial dimensions are ever different, whereas flatten keeps the batch
        # dimension intact and lets fc1 fail loudly on a size mismatch.
        x = torch.flatten(x, 1)  # (N, 800)
        # Fully connected classifier
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)          # (N, 10) logits; CrossEntropyLoss applies softmax itself
        return x
# Instantiate the model and move its parameters to the selected device.
model = SimpleCNN().to(device)
print(model)

# Total parameter count across all layers.
total_params = sum(p.numel() for p in model.parameters())
print(f"总参数量: {total_params:,}")
损失函数和优化器
python
# Cross-entropy loss for multi-class classification (expects raw logits).
criterion = nn.CrossEntropyLoss()
# Adam optimizer — adaptive per-parameter learning rates, typically converges faster than plain SGD.
optimizer = optim.Adam(model.parameters(), lr=0.001)
训练函数
python
def train(model, device, train_loader, optimizer, criterion, epoch):
    """Run one training epoch; return (mean batch loss, accuracy in %)."""
    model.train()
    loss_sum = 0.0
    hits = 0
    seen = 0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # The standard five-step update: zero grads, forward, loss, backward, step.
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()
        # Bookkeeping for the epoch-level metrics.
        loss_sum += batch_loss.item()
        preds = logits.argmax(dim=1)
        seen += labels.size(0)
        hits += (preds == labels).sum().item()
    return loss_sum / len(train_loader), 100. * hits / seen
def test(model, device, test_loader, criterion):
    """Evaluate the model on test_loader; return (mean batch loss, accuracy in %)."""
    model.eval()
    total_loss = 0.0
    n_correct = 0
    # Gradients are not needed for evaluation — saves memory and time.
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            logits = model(inputs)
            total_loss += criterion(logits, labels).item()
            n_correct += (logits.argmax(dim=1) == labels).sum().item()
    return total_loss / len(test_loader), 100. * n_correct / len(test_loader.dataset)
训练循环
python
epochs = 5
# Full training run: one train pass + one evaluation per epoch.
for epoch in range(1, epochs + 1):
    train_loss, train_acc = train(model, device, train_loader, optimizer, criterion, epoch)
    test_loss, test_acc = test(model, device, test_loader, criterion)
    print(f"Epoch {epoch}/{epochs}")
    print(f" 训练 - Loss: {train_loss:.4f}, Acc: {train_acc:.2f}%")
    print(f" 测试 - Loss: {test_loss:.4f}, Acc: {test_acc:.2f}%")
CNN 核心概念总结
1. 卷积层 (Conv2d)
- 使用卷积核在图像上滑动,提取特征
- 参数:
in_channels(输入通道数)、out_channels(输出通道数)、kernel_size(卷积核大小)
- 输出大小 = (输入大小 - 卷积核大小 + 2*padding) / stride + 1
2. 池化层 (MaxPool2d)
- 降低特征图尺寸,减少计算量
- 常用 2x2 池化,尺寸减半
3. 激活函数 (ReLU)
- 引入非线性,使网络能学习复杂模式
- ReLU(x) = max(0, x)
4. 全连接层 (Linear)
- 将特征展平后进行分类
- 最后输出节点数 = 类别数
5. 数据维度变化
text
输入: (batch, 1, 28, 28)
Conv1: (batch, 16, 26, 26)
Pool: (batch, 16, 13, 13)
Conv2: (batch, 32, 11, 11)
Pool: (batch, 32, 5, 5)
Flatten: (batch, 800)
FC1: (batch, 128)
FC2: (batch, 10)
总结
CNN 是图像处理的核心技术:
| 层类型 | 作用 | 常用参数 |
|---|---|---|
| Conv2d | 提取特征 | kernel_size=3, stride=1 |
| MaxPool2d | 降维 | kernel_size=2 |
| ReLU | 非线性 | - |
| Linear | 分类 | output=类别数 |