动手学深度学习1

博主： GhTy
发布时间：2026年03月20日
160 次浏览
暂无评论
22673字数
分类：人工智能

%%HTML

修改MarkDown字体

如上Python代码块运行即可，推荐等宽字体：Menlo, Monaco, monospace。

Jupyter运行bash语句

在一行bash语句前添加'!'/'%'，或在多行bash语句前添加"%%sh"。

MacBook-Anaconda环境准备

# 激活Anaconda(base)环境。
source /opt/homebrew/anaconda3/bin/activate
# conda初始化，zsh永久使用。
conda init zsh
    # 禁用base环境自动激活。
    conda config --set auto_activate_base false
    # 启用base环境自动激活。
    conda config --set auto_activate_base true
    # source生效。
    source ~/.zshrc
# 创建一个名为"advance"的基础环境，指定Python版本及包。
conda create --name advance python=3.12
# 激活"advance"环境。
conda activate advance
    # PyTorch官方推荐在MacARM64平台的安装方式。
    conda install pytorch torchvision torchaudio -c pytorch
    # 或者使用pip安装PyTorch相关包。
    pip install torch torchvision torchaudio asitop
    # 终端酷炫监测GPU使用率。
    sudo asitop
# 退出"advance"环境。
conda deactivate
# 查看所有环境。
conda env list
# 删除"advance"环境。
conda env remove --name advance

记得在VSCode-notebook，右上角点击，Select Another Kernel ｜ Python Environments，选择"advance"环境。
%%sh
pwd
conda activate advance
conda install matplotlib
python -m pip install ipython ipykernel
'''0_0'''
import torch

测试PyTorch是否正常工作。

print(torch.__version__)

在MacARM64平台应该显示False，不支持CUDA，使用Metal加速。

print(torch.cuda.is_available())

"Metal Performance Shaders"，“高性能图形和计算框架”可用。

print(torch.backends.mps.is_available())

创建一个张量。

print(torch.rand(3, 4))
'''0_1'''

$ x = [1.0, 2.0, 3.0, 4.0] $

x = torch.arange(4.0, requires_grad=True)
x.grad
y = 2 * torch.dot(x, x)
print(y)

调用反向传播函数来计算y关于x每个分量的梯度。

y.backward()
print(x.grad)
print(x.grad == 4 * x)

清除之前的梯度，防止梯度累加。

x.grad.zero_()

$ y = x1 + x2 + x3 + x4 $

y = x.sum()
y.backward()
print(x.grad)

清除之前的梯度，防止梯度累加。

x.grad.zero_()

$ y = [x1^2, x2^2, x3^2, x4^2] $

y = x * x

$ y.sum() = x1^2+x2^2+x3^2+x4^2 $

y.sum().backward()
print(x.grad)

$$ \begin{aligned} &样本\mathbf{X}，标签\mathbf{y}，权重矩阵\mathbf{W}，偏置b。 \\ &样本数量m个，每个样本有n个特征，也就是说，\mathbf{X}的形状是(m，n)，标签\mathbf{y}的形状是(m，1)，\\ &每个特征都要有1个权重，也就是说，\mathbf{W}的形状是(n，1)， \\ &偏置b是一个标量，可通过广播机制扩展为(m，1)。 \\ &预测值\qquad\qquad\quad \hat{y} = \mathbf{X} \cdot \mathbf{W} + \vec{b} \\ &平均平方误差\qquad \mathcal{Loss(W, b)} = \frac{1}{2m} \sum_{i=1}^{m} (\hat{y}_{[i]} - y_{[i]})^2 \\ &\hat{y}_{[i]}是第i个样本的预测值，y_{[i]}是第i个样本的真实值。 \\ &梯度下降更新权重，对偏置类似： \\ &W := W - \mathscr{lr} \times \frac{\partial Loss(W, b)}{\partial W} \\ &对于每个参数 W_{[j]}： \\ &W_{[j]} := W_{[j]} - \mathscr{lr} \times \frac{1}{m} \sum_{i=1}^{m} (\hat{y}_{[i]} - y_{[i]}) X_{[i][j]} \\ \end{aligned} $$

$$ \begin{aligned} 举个例子，很简单的模型， \\ 1个样本，2个特征，\hat{y}=w_1x_1+w_2x_2+b， \\ 损失函数，\mathcal{J}=\frac{1}{2}(\hat{y}-y)^2， \\ \frac{∂J}{∂\hat{y}}=\hat{y}-y,\frac{∂J}{∂w_1}=\frac{∂J}{∂\hat{y}}\cdot\frac{∂\hat{y}}{∂w_1}=(\hat{y}-y)\cdot{x_1}, \\ \frac{∂J}{∂w_2}=(\hat{y}-y)\cdot{x_2},\frac{∂J}{∂b}=(\hat{y}-y)， \\ 参数初始值，w_1=0.5,w_2=-0.3,b=0.1,𝓁𝓇=0.1， \\ 训练样本x_1=1.0,x_2=1.0，标签y=1， \\ \\ \textbf{前向传播}： \\ 预测值为\hat{y}=0.5×1.0-0.3×1.0+0.1=0.3， \\ 损失值为\mathcal{J}=0.5×(0.3-1.0)^2=0.245， \\ \textbf{反向传播}： \\ \frac{∂J}{∂\hat{y}}=0.3-1.0=-0.7， \\ \frac{∂J}{∂w_1}=-0.7×1.0=-0.7， \\ \frac{∂J}{∂w_2}=-0.7×1.0=-0.7， \\ \frac{∂J}{∂b}=-0.7， \\ w_1=w_1-𝓁𝓇×\frac{∂J}{∂w_1}=0.5+0.1×0.7=0.57， \\ w_2=w_2-𝓁𝓇×\frac{∂J}{∂w_2}=-0.3+0.1×0.7=-0.23， \\ b=b-𝓁𝓇×\frac{∂J}{∂b}=0.1+0.1×0.7=0.17， \\ \textbf{参数更新后再前向传播看看损失}： \\ 预测值为\hat{y}=0.57×1.0-0.23×1.0+0.17=0.51， \\ 损失值为\mathcal{J}=0.5×(0.51-1.0)^2=0.12005， \\ \end{aligned} $$

	更新前	更新后	改善
预测值	0.3	0.51	+70.0%
损失值	0.245	0.12005	-51.0%

线性回归的从零开始实现

'''1_0'''

%pip install d2l==2.0.0

%matplotlib inline

import random
import torch
import matplotlib.pyplot as plt
'''1_1'''
def synthetic_data_linear_noise(args_w, args_b, num_examples):
    """Generate a synthetic dataset following y = Xw + b + noise.

    Args:
        args_w: 1-D tensor of true weights; its length sets the feature count.
        args_b: True bias, added to every sample.
        num_examples: Number of samples to draw.

    Returns:
        A tuple ``(X, y)``: ``X`` has shape ``(num_examples, len(args_w))``
        and ``y`` is reshaped to a column of shape ``(num_examples, 1)``.
    """
    # Features are drawn from a standard normal distribution.
    X = torch.normal(0, 1, (num_examples, len(args_w)))
    # With a 1-D args_w the product is 1-D, i.e. shape (num_examples,).
    y = torch.matmul(X, args_w) + args_b
    # Add small Gaussian noise (std 0.01) so the data is not perfectly linear.
    y += torch.normal(0, 0.01, y.shape)
    # Debug aid, disabled in the notebook:
    # print('', args_w, '=args_w\n', args_b, '=args_b\n', X, '=X\n', y, '=y')
    # Return the labels as a 2-D column vector.
    return X, y.reshape((-1, 1))
my_w = torch.tensor([2, -3.4])
my_b = 4.2
features, labels = synthetic_data_linear_noise(my_w, my_b, 1000)
print('features:', features[0], '\nlabels:', labels[0])

绘制数据集。

plt.figure(figsize=(8, 6))
plt.scatter(features[:, 1].detach().numpy(), labels.detach().numpy(), s=1)
plt.xlabel('Feature 1')
plt.ylabel('Labels')
plt.title('Synthetic Data Visualization')
plt.show()

'''1_2'''
def iterate_data(batch_size, features, labels):
    """Yield mini-batches of ``(features, labels)`` in random order.

    Args:
        batch_size: Maximum number of samples per batch.
        features: Tensor of inputs, indexed along its first dimension.
        labels: Tensor of targets aligned with ``features``.

    Yields:
        ``(features[idx], labels[idx])`` for successive index chunks; the
        last batch is shorter when the sample count is not a multiple of
        ``batch_size``.
    """
    num_examples = len(features)
    indices = list(range(num_examples))
    # Visit the samples in a fresh random order on every call.
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        batch_indices = torch.tensor(indices[i:i + batch_size])
        # Generator: hand back one mini-batch at a time.
        yield features[batch_indices], labels[batch_indices]

batch_size = 10
# Peek at the first mini-batch only, then stop iterating.
for X, y in iterate_data(batch_size, features, labels):
    print(X, '=X\n', y, '=y', sep='')
    break

'''1_3'''
its_w = torch.normal(0, 0.01, size=(2, 1), requires_grad=True)
its_b = torch.zeros(1, requires_grad=True)
def linear_regression(X, w, b):
    """Linear regression model: return ``X @ w + b`` (``b`` is broadcast)."""
    return torch.matmul(X, w) + b
def squared_error(o, y):
    """Per-sample squared error ``(o - y)**2 / 2``.

    Args:
        o: Predictions.
        y: Targets with the same number of elements as ``o``.

    Returns:
        A tensor shaped like ``o``; it is not reduced to a scalar.
    """
    # y may arrive 1-D while o is a column; reshape y so the subtraction is
    # element-wise instead of broadcasting to a matrix.
    return (o - y.reshape(o.shape))**2 / 2
def stochastic_gradient_descent(parameters, learning_rate, batch_size):
    """Apply one mini-batch SGD step in place and reset the gradients.

    Args:
        parameters: Iterable of tensors to update (here the weights and bias).
        learning_rate: Step size.
        batch_size: Divisor that turns a summed-loss gradient into a mean.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in parameters:
            if param.grad is None:
                # No backward pass has reached this parameter; nothing to apply.
                continue
            # Gradient descent: step against the averaged gradient.
            param -= learning_rate * param.grad / batch_size
            # Clear the gradient so the next backward() does not accumulate.
            param.grad.zero_()

'''1_4'''

训练过程，给定学习率learn_rate和训练轮数train_epochs。

learn_rate = 0.03
train_epochs = 3

for epoch in range(train_epochs):
    for X, y in iterate_data(batch_size, features, labels):
        SE = squared_error(linear_regression(X, its_w, its_b), y)
        # Sum the per-sample losses into a scalar and back-propagate it.
        SE.sum().backward()
        # The division that turns the sum into a mean happens inside
        # stochastic_gradient_descent(); pass the real batch length so a
        # short final batch is averaged correctly.
        stochastic_gradient_descent([its_w, its_b], learn_rate, len(X))
    with torch.no_grad():
        # Report the mean loss of the current parameters over the full dataset.
        train_loss = squared_error(linear_regression(features, its_w, its_b), labels)
        print(f"epoch {epoch}, loss {float(train_loss.mean()):f}.")

本次训练误差收敛在0.00005以下，说明模型训练效果优秀。

'''1_5'''

比较真实参数my_w, my_b与线性回归模型训练学习到的参数its_w, its_b。

print(my_w, my_b, its_w, its_b)
# its_w is a (2, 1) column while my_w is 1-D of length 2; without the reshape
# the subtraction would broadcast to a 2x2 matrix instead of two errors.
print(f"w的估计误差: {my_w - its_w.reshape(my_w.shape)}.")
print(f"b的估计误差: {my_b - its_b}.")

线性回归的简洁实现

'''2_1'''
from torch import nn
from torch import optim
from torch.utils import data

batch_size = 10

特征和标签打包成一个数据集。

data_set = data.TensorDataset(features, labels)

用于批量加载数据，支持随机打乱和多进程加载。

shuffle=True，在每个epoch开始时打乱数据顺序，防止模型记住数据顺序。

iterate_data = data.DataLoader(data_set, batch_size, shuffle=True)

获取第一个批次数据的方法，得到1个X1个y。

next(iter(iterate_data))

线性回归模型。

your_model = nn.Sequential(nn.Linear(2, 1))

初始化模型参数。

your_model[0].weight.data.normal_(0, 0.01)
your_model[0].bias.data.fill_(0)

平均平方误差损失函数。

loss_function = nn.MSELoss()

随机梯度下降优化器。

train_optimizer = optim.SGD(your_model.parameters(), lr=0.03)

训练过程与从零开始实现相似。

train_epochs = 3
for epoch in range(train_epochs):
    for X, y in iterate_data:
        o = your_model(X)
        SE = loss_function(o, y)
        # Clear gradients from the previous iteration so they do not accumulate.
        train_optimizer.zero_grad()
        # nn.MSELoss() returns a scalar by default, so backward() can be
        # called on it directly.
        SE.backward()
        # Update weights and bias: parameter -= learning rate * gradient.
        train_optimizer.step()
    # Report the loss over the full dataset after each epoch; no gradients
    # are needed for this evaluation.
    with torch.no_grad():
        SE = loss_function(your_model(features), labels)
    print(f"epoch {epoch}, loss {SE:f}.")

Fashion-MNIST数据集

MNIST: Modified National Institute of Standards and Technology database
- “修改后的美国国家标准与技术研究院数据库”的缩写，深度学习界的Hello World！
- 开发集：60,000张图片，测试集：10,000张图片，每张图片都是28×28的灰度图像，内容是0到9的手写数字。
Extended-MNIST:
- MNIST的扩展版本，包含了手写大小写英文字母，多用于OCR任务。
Fashion-MNIST:
- MNIST太简单，现代神经网络在MNIST上随便跑跑就能达到99%以上的准确率。
- 包含了10个类别的时尚单品，图像大小、数量都与MNIST一致。
- 0: T恤/上衣 1: 裤子 2: 套头衫 3: 裙子 4: 外套 5: 凉鞋 6: 衬衫 7: 运动鞋 8: 包包 9: 踝靴

'''3_1'''
%matplotlib inline

⬆️内联显示图像，自动show()。

from torchvision import transforms, datasets

将图像转换为张量，从PIL类型变换为32位浮点数类型，并除以255，将像素值缩放到[0,1]。

megatron = transforms.ToTensor()
'''
origin_mnist_dev = datasets.MNIST(
root="~/Public", train=True, transform=megatron, download=True)
origin_mnist_test = datasets.MNIST(
root="~/Public", train=False, transform=megatron, download=True)
'''

训练数据集，下载到~/Public/FashionMNIST/raw/。

fashion_mnist_dev = datasets.FashionMNIST(
root="~/Public", train=True, transform=megatron, download=True)

测试数据集，下载到~/Public/FashionMNIST/raw/。

fashion_mnist_test = datasets.FashionMNIST(
root="~/Public", train=False, transform=megatron, download=True)

文件的数据类型都是UnsignedByte，没法用普通图片查看器打开。

print(len(fashion_mnist_dev), len(fashion_mnist_test))

第一维表示i号样本，第二维[0]表示图像数据、[1]表示标签。

灰度图像，channel=1, height=28, width=28。彩色图像，channel=3。

# dataset[i] is an (image, label) pair, so [0][0] is the image tensor of the
# first sample. The bracketed indices were swallowed by the page renderer.
print(fashion_mnist_dev[0][0].shape, fashion_mnist_test[0][0].shape)

'''3_2'''
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader

Fashion-MNIST数据集下载好之后，可以直接使用，就不要download=True了。

fashion_mnist_dev = torchvision.datasets.FashionMNIST(
root="~/Public", train=True, transform=megatron, download=False)
fashion_mnist_test = torchvision.datasets.FashionMNIST(
root="~/Public", train=False, transform=megatron, download=False)

'''3_3'''
import torch
import numpy as np
import matplotlib.pyplot as plt

def get_label_list(idx_list):
    """Map Fashion-MNIST numeric labels to their text names.

    Args:
        idx_list: Iterable of class indices (anything ``int()`` accepts).

    Returns:
        A list of label strings in the same order as ``idx_list``.
    """
    label_text = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                  'sandal', 'shirt', 'sneaker', 'bag', 'ankle-boot']
    return [label_text[int(i)] for i in idx_list]
def paint_images(images, rows, cols, titles=None, scale=1.5):
    """Draw a grid of images, one per subplot.

    Args:
        images: Iterable of images (torch tensors, numpy arrays or PIL images).
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        titles: Optional sequence of title strings, indexed like ``images``.
        scale: Size of each cell in inches; the figure is
            ``(cols * scale, rows * scale)`` (width first, then height).
    """
    # squeeze=False always returns a 2-D array of Axes, so flatten() also
    # works for a 1x1 grid.
    _, axes = plt.subplots(
        rows, cols, figsize=(cols * scale, rows * scale), squeeze=False)
    # Flatten the axes grid so it can be walked in step with the images.
    for i, (ax, img) in enumerate(zip(axes.flatten(), images)):
        ax.axis('off')
        if torch.is_tensor(img):
            # Convert torch tensors to numpy before handing them to imshow.
            ax.imshow(img.numpy())
        else:
            # Numpy arrays and PIL images are passed to imshow as they are.
            ax.imshow(img)
        if titles:
            ax.set_title(titles[i] + '⬇️')

'''3_4'''
"""
plt.subplots()的用法，
参数：
rows表示行数，cols表示列数，figsize表示整个画布大小，先宽度后高度，单位为英寸inch。
注意，cols与宽度成正比，rows与高度成正比，figsize顺序不能反，子图的尺寸自动计算。
返回：
fig@matplotlib.figure.Figure，整个画布对象，可以设置标题、保存图片等；
axes@numpy.ndarray，单个子图对象，维度为(rows, cols)，可以设置标题、坐标轴等。
创建 $ rows × cols $ 的子图网格。
"""
fig, axes = plt.subplots(2, 5, figsize=(10, 4))

fig.suptitle('Fashion-MNIST10K', fontfamily='PT mono', fontsize=14)
fig.patch.set_facecolor('lightgray')
fig.tight_layout(pad=1.0, h_pad=3.0, w_pad=3.0)
'''
fig.subplots_adjust(
top=0.9, # 顶部边距，0.9表示顶部留10%空间。
bottom=0.1, # 底部边距，0.1表示底部留10%空间。
left=0.1, # 左侧边距，0.1表示左侧留10%空间。
right=0.9, # 右侧边距，0.9表示右侧留10%空间。
hspace=0.4, # 行间距。
wspace=0.3 # 列间距。
) # 单位是图形尺寸的比例，范围[0,1]。
'''

fig.savefig('./example.png', dpi=300, bbox_inches='tight')

# NOTE(review): the page renderer collapsed the `[i][j]` subplot indices to a
# single digit. Only the row index survived; the column indices below are a
# best guess -- confirm against the original notebook.
axes[0][0].set_title('t-shirt')
axes[0][0].axis('off')
axes[0][0].set_xlabel('X-Ray', fontfamily='PT mono')
axes[0][0].set_ylabel('Y-Ray', fontfamily='PT mono')
axes[1][0].grid(True, alpha=0.3)

# Draw a random scatter plot.
axes[1][0].scatter(np.random.randn(100), np.random.randn(100), alpha=0.6)

# Third test image (index 2): drop the channel dimension and show it in
# grayscale.
axes[1][1].imshow(fashion_mnist_test[2][0].numpy().squeeze(), cmap='gray')
'''
Axes.bar(x, height, width=0.8, bottom=None, **kwargs)
用于绘制垂直柱形图。

Axes.plot(x, y, [fmt], **kwargs)
用于绘制折线图。
fmt: 线条样式，[marker][line][color]，顺序无所谓，
color: 颜色，('b', 'g', 'r', 'c', 'm', 'y', 'k', 'w')；
line: 线型，（'-', '--', '-.', ':', '')；
marker: 数据点标记，（'.', 'o', 'v', '^', '<', '>', 's', '*', 'h', 'H','d', 'D', 'p', 'P', 'x', 'X')；

Axes.pie(y, explode=None, labels=None, colors=None, autopct=None,
shadow=False, startangle=0, counterclock=True, wedgeprops=None, **kwargs)
用于绘制饼状图。
'''
'''3_5'''
from torch.utils.data import DataLoader

X, y = next(iter(DataLoader(fashion_mnist_dev, batch_size=10)))

X.shape=(10, 1, 28, 28)，第2个维度是channel，需要reshape成(10, 28, 28)。

paint_images(X.reshape(-1, 28, 28), 2, 5, titles=get_label_list(y))
'''3_6'''

import os

import time
from contextlib import contextmanager
from torch.utils.data import DataLoader

os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'

@contextmanager
def secs_count(description='操作'):
    """Context manager that prints how long the wrapped block took.

    Args:
        description: Label printed in front of the elapsed time.
    """
    begin = time.time()
    try:
        yield
    finally:
        # Report the duration even when the wrapped block raises.
        end = time.time()
        print(f"{description} 耗时 {end - begin:.2f}秒。")

小批量，多进程。

batch_size, num_workers = 256, 4
iterate_devset = DataLoader(
    fashion_mnist_dev, batch_size=batch_size, shuffle=True, num_workers=num_workers)
# Time one full pass over the training set; the loop body does nothing on
# purpose, so only data loading is measured.
with secs_count('训练数据载入'):
    for X, y in iterate_devset:
        continue

会输出4个子进程的调试信息。

'''3_7'''
from torchvision import transforms, datasets
from torch.utils.data import DataLoader

def load_data_fashion_mnist(batch_size, reset_size: tuple | None = None):
    """Build DataLoaders for the Fashion-MNIST training and test splits.

    The dataset is read from ``~/Public`` with ``download=False``, so it must
    already have been downloaded there.

    Args:
        batch_size: Number of samples per batch for both loaders.
        reset_size: Optional target image size, e.g. ``(32, 32)``. When given,
            images are resized before being converted to tensors.

    Returns:
        A tuple ``(train_loader, test_loader)``; only the training loader
        shuffles.
    """
    tf_list = [transforms.ToTensor()]
    if reset_size:
        # Resize first, then convert the image to a torch tensor.
        tf_list.insert(0, transforms.Resize(reset_size))
    # Chain the transforms into a single callable pipeline.
    megatron = transforms.Compose(tf_list)
    fashion_mnist_dev = datasets.FashionMNIST(
        root="~/Public", train=True, transform=megatron, download=False)
    fashion_mnist_test = datasets.FashionMNIST(
        root="~/Public", train=False, transform=megatron, download=False)
    # Author's note: on Apple-silicon MacBooks with the MPS backend,
    # multi-process loading has shared-memory compatibility problems, so
    # num_workers=0 (no worker subprocesses) is the safest choice.
    return (DataLoader(fashion_mnist_dev, batch_size, shuffle=True, num_workers=0),
            DataLoader(fashion_mnist_test, batch_size, shuffle=False, num_workers=0))

SoftMax 分类问题

回归问题与分类问题的区别：

输出：回归问题输出是一个连续的数值，而分类问题输出是一个离散的类别。

示例：房价预测；
示例：图像识别；

目标：回归问题预测一个连续的数值，而分类问题预测一个样本所属的类别。

损失函数：平均平方误差（MSE）；
损失函数：交叉熵（Cross-Entropy）；

SoftMax：柔性最大值。

$$ \begin{align} &\mathrm{SoftMax}(\mathbf{X})_{ij} = \frac{\exp(X_{ij})}{\sum_{k=1}^{N} \exp(X_{ik})} \\ &\mathrm{CrossEntropy}(y, \hat{y}) = - \sum_{j=1}^{N} y_j \log(\hat{y}_j) \\ \end{align} $$

软大分类的从零开始实现

'''4_1'''
import os
import torch

os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'
batch_size = 256

load_data_fashion_mnist()定义在3_7里。

iterate_devset, iterate_testset = load_data_fashion_mnist(batch_size)

每个图像展平成长度为28×28=784的向量，10个类别，10个输出。

num_inputs, num_outputs = 784, 10

定义模型参数，权重W是一个784×10的矩阵，偏置b是一个10维的列向量。

初始化，权重W从均值为0、标准差为0.01的正态分布中采样，偏置b以0填充。

W = torch.normal(0, 0.01, (num_inputs, num_outputs), requires_grad=True)
b = torch.zeros(num_outputs, requires_grad=True)

'''
X_exp = torch.tensor([
[1.0, 2.0, 3.0, 4.0], # 样本[0]
[0.5, 1.5, 2.5, 3.5], # 样本[1]
[2.0, 3.0, 4.0, 5.0] # 样本[2]
])

partition = X_exp.sum(1, keepdim=True)
< tensor([[10.],
[ 8.],
[14.]])
'''
def calc_softmax(X):
    """Row-wise softmax of a 2-D tensor.

    Args:
        X: Tensor of logits with one sample per row.

    Returns:
        A tensor of the same shape whose rows are non-negative and sum to 1.
    """
    # Subtracting each row's maximum leaves the result unchanged but keeps
    # exp() from overflowing to inf (and the division from yielding NaN).
    shifted = X - X.max(dim=1, keepdim=True).values
    X_exp = torch.exp(shifted)
    # Sum along dim 1 and keep the dimension so the division broadcasts per row.
    partition = X_exp.sum(1, keepdim=True)
    return X_exp / partition
def softmax_classification(X):
    """Softmax classification model built on the module-level ``W`` and ``b``.

    Args:
        X: Batch of inputs; it is reshaped to ``(-1, W.shape[0])`` so each
            sample becomes one flat row.

    Returns:
        Row-wise class probabilities from ``calc_softmax``.
    """
    # Flatten every sample to a row before the affine transform.
    return calc_softmax(torch.matmul(X.reshape(-1, W.shape[0]), W) + b)
def cross_entropy(o, y):
    """Per-sample cross-entropy loss.

    Args:
        o: Predicted class probabilities, one row per sample.
        y: Integer class index of the true label for each sample.

    Returns:
        A 1-D tensor of ``-log(probability of the true class)``; it is not
        summed or averaged.
    """
    # range(len(o)) supplies the row indices, so each row contributes the
    # probability at its own label column.
    return -torch.log(o[range(len(o)), y])
def stochastic_gradient_descent(parameters, learning_rate, batch_size):
    """Apply one mini-batch SGD step in place and reset the gradients.

    Args:
        parameters: Iterable of tensors to update (here ``W`` and ``b``).
        learning_rate: Step size.
        batch_size: Divisor that turns a summed-loss gradient into a mean.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in parameters:
            if param.grad is None:
                # No backward pass has reached this parameter; nothing to apply.
                continue
            # Gradient descent: step against the averaged gradient.
            param -= learning_rate * param.grad / batch_size
            # Clear the gradient so the next backward() does not accumulate.
            param.grad.zero_()
'''
torch.tensor的花式索引，
o = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5], [0.7, 0.2, 0.1]])
样本[0]取下标0类别概率，样本[1]取下标0类别概率。
o[[0, 1], [0, 0]]
< tensor([0.1000, 0.3000])
o[[0, 1, 2], [1, 2, 0]]
< tensor([0.3000, 0.5000, 0.7000])
'''
def count_accurate(o, y):
    """Count how many predictions match the labels.

    Args:
        o: Either per-class scores of shape ``(n, classes)`` or already
            predicted class indices of shape ``(n,)``.
        y: True class indices.

    Returns:
        The number of correct predictions as a float.
    """
    if len(o.shape) > 1 and o.shape[1] > 1:
        # Multi-column scores: take the index of the highest score per sample.
        o = o.argmax(axis=1)
    # Boolean tensor marking which predictions are correct.
    comp = o.type(y.dtype) == y
    # True counts as 1 and False as 0 when summed.
    return float(comp.sum())
# Accuracy is then count_accurate(o, y) / y.numel().
def evaluate_accuracy(your_model, iterate_data):
    """Compute a model's accuracy over a whole dataset.

    Args:
        your_model: Callable mapping a batch ``X`` to predictions; may be an
            ``nn.Module`` or a plain function.
        iterate_data: Iterable yielding ``(X, y)`` batches.

    Returns:
        Correct predictions divided by the total number of labels seen.
    """
    if isinstance(your_model, torch.nn.Module):
        # Switch nn.Module models to evaluation mode.
        your_model.eval()
    # Two running totals: correct predictions and samples seen.
    metric = [0.0, 0.0]
    # No gradients are needed while evaluating.
    with torch.no_grad():
        for X, y in iterate_data:
            metric[0] += count_accurate(your_model(X), y)
            metric[1] += y.numel()
    return metric[0] / metric[1]

iterate_testset@torch.utils.data.dataloader.DataLoader，定义在cell4_1上方。

evaluate_accuracy(softmax_classification, iterate_testset)

记得我们的模型吗？随机初始化的参数，跟瞎猜差不多，对于10个分类准确率在1/10左右。

此时交叉熵损失值大约是 $ -log(0.1) \approx 2.3 $ 。

'''4_2'''

使用SVG格式显示。

%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
from IPython import display

class ZigzagChartAnimator:
    """Line chart that is redrawn in the notebook each time points are added."""

    def __init__(self, title='', legend=(), info_xy_=('', '', (), (), 'linear', 'linear'),
                 rows=1, cols=1, figsize=(5, 2.5)):
        """Create the figure and one empty data series per legend entry.

        Args:
            title: Chart title.
            legend: Sequence of legend labels; its length is the number of
                lines to draw.
            info_xy_: 6-tuple ``(xlabel, ylabel, xlimit, ylimit, xscale,
                yscale)``. Labels are strings, limits are 2-tuples (an empty
                tuple leaves the limit automatic) and scales are
                ``'linear'`` or ``'log'``.
            rows: Number of subplot rows.
            cols: Number of subplot columns.
            figsize: Figure size in inches.
        """
        # Font and minus-sign settings so Chinese text and negative numbers
        # render correctly.
        plt.rcParams['font.family'] = ['DejaVu Sans', 'Arial Unicode MS']
        plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial Unicode MS']
        plt.rcParams['axes.unicode_minus'] = False

        # Create the subplot grid.
        self.fig, self.axes = plt.subplots(rows, cols, figsize=figsize)
        if rows * cols == 1:
            # A single subplot comes back as one Axes object; wrap it in a
            # list so it can be indexed like the multi-subplot case.
            self.axes = [self.axes]
        # Re-applied after every redraw, because clear() resets the axes.
        self.config_axes = lambda: self._set_axes(self.axes[0], title, legend, info_xy_)
        # m: number of lines to draw.
        m = len(legend)
        self.X_2dli = [[] for _ in range(m)]
        self.Y_2dli = [[] for _ in range(m)]

    def _set_axes(self, ax, title, legend, info_xy_):
        """Apply labels, limits, scales, legend, title and grid to ``ax``.

        Args:
            ax: The Axes object to configure.
            title: Chart title.
            legend: Sequence of legend labels.
            info_xy_: 6-tuple ``(xlabel, ylabel, xlimit, ylimit, xscale,
                yscale)``.
        """
        xlabel, ylabel, xlimit, ylimit, xscale, yscale = info_xy_
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        # The default limits are empty tuples, which set_xlim/set_ylim cannot
        # unpack; leave the limits automatic in that case.
        if xlimit:
            ax.set_xlim(xlimit)
        if ylimit:
            ax.set_ylim(ylimit)
        ax.set_xscale(xscale)
        ax.set_yscale(yscale)
        # Pin the legend box's lower-left corner to the anchor (1.0, 0.7),
        # i.e. just outside the right edge of the axes.
        ax.legend(legend, loc='lower left', bbox_to_anchor=(1.0, 0.7))
        ax.set_title(title)
        ax.grid(True)

    def insert_point(self, x_li, y_li):
        """Append one data point per line and redraw the chart.

        Args:
            x_li: x value, a number or a list with one entry per line.
            y_li: y value(s), a number or a list with one entry per line.
        """
        if not hasattr(y_li, "__len__"):
            # A bare number: wrap it in a list so it can be iterated.
            y_li = [y_li]
        m = len(y_li)
        if not hasattr(x_li, "__len__"):
            # A bare number: repeat it once per line, e.g. x_li=1 with
            # y_li=[0.2, 0.3, 0.4] becomes [1, 1, 1].
            x_li = [x_li] * m

        # Append the data points.
        for i, (a, b) in enumerate(zip(x_li, y_li)):
            # Skip only missing values; a truthiness test would also drop
            # legitimate zeros such as x=0 or an accuracy of 0.0.
            if a is not None and b is not None:
                self.X_2dli[i].append(a)
                self.Y_2dli[i].append(b)
        # Clear the chart and redraw every line from the stored points.
        self.axes[0].clear()
        # Line styles, cycled so charts with more than three lines still
        # draw all of them.
        fmts = ('-', 'm--', 'g-.')
        for i, (xs, ys) in enumerate(zip(self.X_2dli, self.Y_2dli)):
            self.axes[0].plot(xs, ys, fmts[i % len(fmts)])
        # Re-apply the axes configuration wiped by clear().
        self.config_axes()
        # Show the figure and replace the previous cell output.
        display.display(self.fig)
        display.clear_output(wait=True)

'''4_3'''

训练过程，给定学习率learn_rate和训练轮数train_epochs。

learn_rate = 0.1
train_epochs = 10

info_xy_ = ("训练轮数","",(1,10),(0,1),'linear','linear')
animator = ZigzagChartAnimator("训练可视化",("损失值","开发数据集准确率","测试数据集准确率"),info_xy_)

for epoch in range(train_epochs):
    # Running totals: summed loss, correct predictions, samples seen.
    metric = [0.0, 0.0, 0.0]
    for X, y in iterate_devset:
        o = softmax_classification(X)
        CE = cross_entropy(o, y)
        # cross_entropy() returns one loss per sample; sum to a scalar
        # before back-propagating.
        CE.sum().backward()
        # The division that turns the sum into a mean happens inside
        # stochastic_gradient_descent().
        stochastic_gradient_descent([W, b], learn_rate, X.shape[0])
        # Summed cross-entropy loss.
        metric[0] += float(CE.sum())
        # Number of correct predictions.
        metric[1] += float(count_accurate(o, y))
        # Number of samples.
        metric[2] += y.numel()
    train_loss, train_accu = metric[0] / metric[2], metric[1] / metric[2]
    infer_accu = evaluate_accuracy(softmax_classification, iterate_testset)
    animator.insert_point(epoch+1, [train_loss, train_accu, infer_accu])
    print(f"devset loss: {train_loss:.4f}, devset accuracy: {train_accu:.4f}, testset accuracy: {infer_accu:.4f}.")

训练轮次结束，在测试数据集上的准确率收敛在0.84以上，说明模型训练效果良好。

'''4_4'''

用已经训练好的模型对测试集进行预测，画图对比。

# Take just the first batch of the test set.
X, y = next(iter(iterate_testset))

# get_label_list() and paint_images() are defined in cell 3_3.
actual = get_label_list(y)
predi = get_label_list(softmax_classification(X).argmax(axis=1))
# Each title shows the true label on the first line and the prediction on
# the second.
titles = [f'{a}\n{p}' for a, p in zip(actual, predi)]
m, n = 1, 8
paint_images(X[0:n].reshape(n, 28, 28), m, n, titles[0:n])

软大分类的简洁实现

'''5_1'''
import torch
from torch import nn, optim

batch_size = 256

load_data_fashion_mnist()定义在3_7里。

iterate_devset, iterate_testset = load_data_fashion_mnist(batch_size)

软大分类模型。

nn.Flatten()将输入的Fashion-MNIST图像(batch_size, 1, 28, 28)展平成(batch_size, 784)。

(batch_size, 784)才适合输入到nn.Linear(784, 10)。

your_model = nn.Sequential(nn.Flatten(), nn.Linear(784, 10))

初始化模型参数。

nn.init.normal_(your_model[1].weight, mean=0, std=0.01)

交叉熵损失函数。

loss_function = nn.CrossEntropyLoss()

随机梯度下降优化器。

train_optimizer = optim.SGD(your_model.parameters(), lr=0.1)

训练过程与从零开始实现相似。

train_epochs = 10
for epoch in range(train_epochs):
    # evaluate_accuracy() leaves the model in eval mode, so switch back to
    # training mode at the start of every epoch.
    your_model.train()
    # Running totals: summed loss, correct predictions, samples seen.
    metric = [0.0, 0.0, 0.0]
    for X, y in iterate_devset:
        o = your_model(X)
        CE = loss_function(o, y)
        # Clear gradients from the previous iteration so they do not accumulate.
        train_optimizer.zero_grad()
        # nn.CrossEntropyLoss() already returns the batch mean as a scalar,
        # so it can be back-propagated directly.
        CE.backward()
        # Update weights and bias: parameter -= learning rate * gradient.
        train_optimizer.step()
        # CE is a per-batch mean; multiply by the batch size to recover the
        # summed loss, otherwise dividing by the sample count below would
        # under-report the loss by roughly a factor of batch_size.
        metric[0] += float(CE) * y.numel()
        # Number of correct predictions.
        metric[1] += float(count_accurate(o, y))
        # Number of samples.
        metric[2] += y.numel()
    train_loss, train_accu = metric[0] / metric[2], metric[1] / metric[2]
    # Check test accuracy; evaluate_accuracy() switches the model to eval mode.
    infer_accu = evaluate_accuracy(your_model, iterate_testset)
    print(f"devset loss: {train_loss:.4f}, devset accuracy: {train_accu:.4f}, testset accuracy: {infer_accu:.4f}.")

最后修改于：2026年03月20日

如果觉得我的文章对你有用请狠狠地打赏我

发表评论取消回复
会利用Cookie保留一些您的个人信息以便您下次到来可以快速评论，如有错字或内容有误敬请指出！

评论 *

私密评论

名称 *

🎲

邮箱 *

网址

欢迎欢迎
浏览次数： 192307
关于🤗👏
浏览次数： 108358
《01》极限与函数✅
浏览次数： 2900
《03》一元函数积分❌
浏览次数： 2496
《02》一元函数微分❌
浏览次数： 2312

2013年计算机专业综合小题
浏览次数： 1143
动手学深度学习4
浏览次数： 175
《12》二维随机变量❌
浏览次数： 1095
动手学深度学习2
浏览次数： 159
《04》微分方程✅
浏览次数： 1404

动手学深度学习1

GhTy • 2026 年 03 月 20 日

<p>%%HTML</p><p><style></p><pre><code>body{--vscode-font-family: &quot;Menlo&quot;;}</code></pre><p></style></p><h1>修改MarkDown字体</h1><p>如上Python代码块运行即可，推荐等宽字体：Menlo, Monaco, monospace.。</p><h1>Jupyter运行bash语句</h1><p>在一行bash语句前添加'!'/'%'，或在多行bash语句前添加"%%sh"。</p><h1>MacBook-Anaconda环境准备</h1><pre><code class="lang-bash"># 激活Anaconda(base)环境。
source /opt/homebrew/anaconda3/bin/activate
# conda初始化，zsh永久使用。
conda init zsh
    # 禁用base环境自动激活。
    conda config --set auto_activate_base false
    # 启用base环境自动激活。
    conda config --set auto_activate_base true
    # source生效。
    source ~/.zshrc
# 创建一个名为&quot;advance&quot;的基础环境，指定Python版本及包。
conda create --name advance python=3.12
# 激活&quot;advance&quot;环境。
conda activate advance
    # PyTorch官方推荐在MacARM64平台的安装方式。
    conda install pytorch torchvision torchaudio -c pytorch
    # 或者使用pip安装PyTorch相关包。
    pip install torch torchvision torchaudio asitop
    # 终端酷炫监测GPU使用率。
    sudo asitop
# 退出&quot;advance&quot;环境。
conda deactivate
# 查看所有环境。
conda env list
# 删除&quot;advance&quot;环境。
conda env remove --name advance</code></pre><p>记得在VSCode-notebook，右上角点击，Select Another Kernel ｜ Python Environments，选择"advance"环境。<br>%%sh<br>pwd<br>conda activate advance<br>conda install matplotlib<br>python -m pip install ipython ipykernel<br>'''0_0'''<br>import torch</p><h1>测试PyTorch是否正常工作。</h1><p>print(torch.__version__)</p><h1>在MacARM64平台应该显示False，不支持CUDA，使用Metal加速。</h1><p>print(torch.cuda.is_available())</p><h1>"Metal Performance Shaders"，“高性能图形和计算框架”可用。</h1><p>print(torch.backends.mps.is_available())</p><h1>创建一个张量。</h1><p>print(torch.rand(3, 4))<br>'''0_1'''</p><h1>$ x = [1.0, 2.0, 3.0, 4.0] $</h1><p>x = torch.arange(4.0, requires_grad=True)<br>x.grad<br>y = 2 * torch.dot(x, x)<br>print(y)</p><h1>调用反向传播函数来计算y关于x每个分量的梯度。</h1><p>y.backward()<br>print(x.grad)<br>print(x.grad == 4 * x)</p><h1>清除之前的梯度，防止梯度累加。</h1><p>x.grad.zero_()</p><h1>$ y = x1 + x2 + x3 + x4 $</h1><p>y = x.sum()<br>y.backward()<br>print(x.grad)</p><h1>清除之前的梯度，防止梯度累加。</h1><p>x.grad.zero_()</p><h1>$ y = [x1^2, x2^2, x3^2, x4^2] $</h1><p>y = x * x</p><h1>$ y.sum() = x1^2+x2^2+x3^2+x4^2 $</h1><p>y.sum().backward()<br>print(x.grad)</p><p>$$
\begin{aligned}

&amp;样本\mathbf{X}，标签\mathbf{y}，权重矩阵\mathbf{W}，偏置b。 \\

&amp;样本数量m个，每个样本有n个特征，也就是说，\mathbf{X}的形状是(m，n)，标签\mathbf{y}的形状是(m，1)，\\

&amp;每个特征都要有1个权重，也就是说，\mathbf{W}的形状是(n，1)， \\

&amp;偏置b是一个标量，可通过广播机制扩展为(m，1)。 \\

&amp;预测值\qquad\qquad\quad \hat{y} = \mathbf{X} \cdot \mathbf{W} + \vec{b} \\

&amp;平均平方误差\qquad \mathcal{Loss(W, b)} = \frac{1}{2m} \sum_{i=1}^{m} (\hat{y}_{[i]} - y_{[i]})^2 \\

&amp;\hat{y}_{[i]}是第i个样本的预测值，y_{[i]}是第i个样本的真实值。 \\

&amp;梯度下降更新权重，对偏置类似： \\

&amp;W := W - \mathscr{lr} \times \frac{\partial Loss(W, b)}{\partial W} \\

&amp;对于每个参数 W_{[j]}： \\

&amp;W_{[j]} := W_{[j]} - \mathscr{lr} \times \frac{1}{m} \sum_{i=1}^{m} (\hat{y}_{[i]} - y_{[i]}) X_{[i][j]} \\

\end{aligned}
$$</p><p>$$
\begin{aligned}

举个例子，很简单的模型， \\
1个样本，2个特征，\hat{y}=w_1x_1+w_2x_2+b， \\
损失函数，\mathcal{J}=\frac{1}{2}(\hat{y}-y)^2， \\
\frac{∂J}{∂\hat{y}}=\hat{y}-y,\frac{∂J}{∂w_1}=\frac{∂J}{∂\hat{y}}\cdot\frac{∂\hat{y}}{∂w_1}=(\hat{y}-y)\cdot{x_1}, \\
\frac{∂J}{∂w_2}=(\hat{y}-y)\cdot{x_2},\frac{∂J}{∂b}=(\hat{y}-y)， \\
参数初始值，w_1=0.5,w_2=-0.3,b=0.1,𝓁𝓇=0.1， \\
训练样本x_1=1.0,x_2=1.0，标签y=1， \\
\\
\textbf{前向传播}： \\
预测值为\hat{y}=0.5×1.0-0.3×1.0+0.1=0.3， \\
损失值为\mathcal{J}=0.5×(0.3-1.0)^2=0.245， \\
\textbf{反向传播}： \\
\frac{∂J}{∂\hat{y}}=0.3-1.0=-0.7， \\
\frac{∂J}{∂w_1}=-0.7×1.0=-0.7， \\
\frac{∂J}{∂w_2}=-0.7×1.0=-0.7， \\
\frac{∂J}{∂b}=-0.7， \\
w_1=w_1-𝓁𝓇×\frac{∂J}{∂w_1}=0.5+0.1×0.7=0.57， \\
w_2=w_2-𝓁𝓇×\frac{∂J}{∂w_2}=-0.3+0.1×0.7=-0.23， \\
b=b-𝓁𝓇×\frac{∂J}{∂b}=0.1+0.1×0.7=0.17， \\
\textbf{参数更新后再前向传播看看损失}： \\
预测值为\hat{y}=0.57×1.0-0.23×1.0+0.1=0.44， \\
损失值为\mathcal{J}=0.5×(0.44-1.0)^2=0.1568， \\

\end{aligned}
$$</p><table><thead><tr><th> </th><th>更新前</th><th>更新后</th><th>改善</th></tr></thead><tbody><tr><td>预测值</td><td>0.3</td><td>0.44</td><td>+46.7%</td></tr><tr><td>损失值</td><td>0.245</td><td>0.1568</td><td>-36.0%</td></tr></tbody></table><h1>线性回归的从零开始实现</h1><p>'''1_0'''</p><h1>%pip install d2l==2.0.0</h1><h1>%matplotlib inline</h1><p>import random<br>import torch<br>import matplotlib.pyplot as plt<br>'''1_1'''<br>def synthetic_data_linear_noise(args_w, args_b, num_examples):<br>"""合成num_examples个 $ y = Xw + b + 噪声 $ 数据集。"""<br>X = torch.normal(0, 1, (num_examples, len(args_w)))<br>y = torch.matmul(X, args_w) + args_b</p><h1>X.shape=(1000, 2), y.shape=(1000, 1).。</h1><p>y += torch.normal(0, 0.01, y.shape)</p><h1>print('', args_w, '=args_w\n', args_b, '=args_b\n', X, '=X\n', y, '=y')</h1><h1>返回y变成一个一维列向量。</h1><p>return X, y.reshape((-1, 1))<br>my_w = torch.tensor([2, -3.4])<br>my_b = 4.2<br>features, labels = synthetic_data_linear_noise(my_w, my_b, 1000)<br>print('features:', features[0], '\nlabels:', labels[0])</p><h1>绘制数据集。</h1><p>plt.figure(figsize=(8, 6))<br>plt.scatter(features[:, 1].detach().numpy(), labels.detach().numpy(), s=1)<br>plt.xlabel('Feature 1')<br>plt.ylabel('Labels')<br>plt.title('Synthetic Data Visualization')<br>plt.show()</p><p>'''1_2'''<br>def iterate_data(batch_size, features, labels):<br>"""features, labels是数据集的输入和标签，此处随机读取样本小批量。"""<br>num_examples = len(features)<br>indices = list(range(num_examples))</p><h1>随机读取样本，没有顺序。</h1><p>random.shuffle(indices)<br>for i in range(0, num_examples, batch_size):<br>j = min(i + batch_size, num_examples)<br>batch_indices = torch.tensor(indices[i:j])</p><h1>生成器，每次返回一个小批量样本。</h1><p>yield features[batch_indices], labels[batch_indices]</p><p>batch_size = 10<br>for X, y in iterate_data(batch_size, features, labels):<br>print(X, '=X\n', y, '=y', sep='')<br>break</p><p>'''1_3'''<br>its_w = torch.normal(0, 0.01, size=(2, 1), requires_grad=True)<br>its_b = torch.zeros(1, requires_grad=True)<br>def linear_regression(X, w, 
b):<br>"""线性回归模型。"""<br>return torch.matmul(X, w) + b<br>def squared_error(o, y):<br>"""平方误差损失。"""</p><h1>y.shape=(batch_size,)一维数组，o.shape=(batch_size,1)列向量。</h1><p>return (o - y.reshape(o.shape))**2 / 2<br>def stochastic_gradient_descent(parameters, learning_rate, batch_size):<br>"""小批量随机梯度下降。"""<br>with torch.no_grad():<br>for param in parameters:</p><h1>param就是要学习的参数，w, b。</h1><h1>这就是梯度下降。</h1><p>param -= learning_rate * param.grad / batch_size<br>param.grad.zero_()</p><p>'''1_4'''</p><h1>训练过程，给定学习率learn_rate和训练轮数train_epochs。</h1><p>learn_rate = 0.03<br>train_epochs = 3</p><p>for epoch in range(train_epochs):<br>for X, y in iterate_data(batch_size, features, labels):<br>SE = squared_error(linear_regression(X, its_w, its_b), y)</p><h1>将批次中的所有样本的损失值求和，得到一个标量，对这个标量计算梯度。</h1><p>SE.sum().backward()</p><h1>SE再除以batch_size取平均，在stochastic_gradient_descent()里实现了。</h1><p>stochastic_gradient_descent([its_w, its_b], learn_rate, batch_size)<br>with torch.no_grad():</p><h1>看一下训练效果，训练出来的参数对数据集的预测效果，以误差为指标。</h1><p>train_loss = squared_error(linear_regression(features, its_w, its_b), labels)<br>print(f"epoch {epoch}, loss {float(train_loss.mean()):f}.")</p><h1>本次训练误差收敛在0.00005以下，说明模型训练效果优秀。</h1><p>'''1_5'''</p><h1>比较真实参数my_w, my_b与线性回归模型训练学习到的参数its_w, its_b。</h1><p>print(my_w, my_b, its_w, its_b)<br>print(f"w的估计误差: {my_w - its_w}.")<br>print(f"b的估计误差: {my_b - its_b}.")</p><h1>线性回归的简洁实现</h1><p>'''2_1'''<br>from torch import nn<br>from torch import optim<br>from torch.utils import data</p><p>batch_size = 10</p><h1>特征和标签打包成一个数据集。</h1><p>data_set = data.TensorDataset(features, labels)</p><h1>用于批量加载数据，支持随机打乱和多进程加载。</h1><h1>shuffle=True，在每个epoch开始时打乱数据顺序，防止模型记住数据顺序。</h1><p>iterate_data = data.DataLoader(data_set, batch_size, shuffle=True)</p><h1>获取第一个批次数据的方法，得到1个X1个y。</h1><p>next(iter(iterate_data))</p><h1>线性回归模型。</h1><p>your_model = nn.Sequential(nn.Linear(2, 1))</p><h1>初始化模型参数。</h1><p>your_model[0].weight.data.normal_(0, 
0.01)<br>your_model[0].bias.data.fill_(0)</p><h1>平均平方误差损失函数。</h1><p>loss_function = nn.MSELoss()</p><h1>随机梯度下降优化器。</h1><p>train_optimizer = optim.SGD(your_model.parameters(), lr=0.03)</p><h1>训练过程与从零开始实现相似。</h1><p>train_epochs = 3<br>for epoch in range(train_epochs):<br>for X, y in iterate_data:<br>o = your_model(X)<br>SE = loss_function(o, y)</p><h1>每次迭代，都要清空之前的梯度，防止累加。</h1><p>train_optimizer.zero_grad()</p><h1>nn.MSELoss(reduction='mean')返回值就是标量，可以直接backward()，计算权重和偏置的梯度值。</h1><p>SE.backward()</p><h1>更新权重和偏置，参数 -= 学习率 × 梯度。</h1><p>train_optimizer.step()</p><h1>看一下训练效果。</h1><p>SE = loss_function(your_model(features), labels)<br>print(f"epoch {epoch}, loss {SE:f}.")</p><h1>Fashion-MNIST数据集</h1><ol><li><p>MNIST: Modified National Institute of Standards and Technology database</p><ul><li>“修改后的美国国家标准与技术研究院数据库”的缩写，深度学习界的Hello World！</li><li>开发集：60,000张图片，测试集：10,000张图片，每张图片都是28×28的灰度图像，内容是0到9的手写数字。</li></ul></li><li><p>Extended-MNIST:</p><ul><li>MNIST的扩展版本，包含了手写大小写英文字母，多用于OCR任务。</li></ul></li><li><p>Fashion-MNIST:</p><ul><li>MNIST太简单，现代神经网络在MNIST上随便跑跑就能达到99%以上的准确率。</li><li>包含了10个类别的时尚单品，图像大小、数量都与MNIST一致。</li><li>0: T恤/上衣 1: 裤子 2: 衬衫 3: 裙子 4: 外套 5: 凉鞋 6: 衬衫 7: 运动鞋 8: 包包 9: 踝靴</li></ul></li></ol><p>'''3_1'''<br>%matplotlib inline</p><h1>⬆️内联显示图像，自动show()。</h1><p>from torchvision import transforms, datasets</p><h1>将图像转换为张量，从PIL类型变换为32位浮点数类型，并除以255，将像素值缩放到[0,1]。</h1><p>megatron = transforms.ToTensor()<br>'''<br>origin_mnist_dev = datasets.MNIST(<br>root="~/Public", train=True, transform=megatron, download=True)<br>origin_mnist_test = datasets.MNIST(<br>root="~/Public", train=False, transform=megatron, download=True)<br>'''</p><h1>训练数据集，下载到~/Public/FashionMNIST/raw/。</h1><p>fashion_mnist_dev = datasets.FashionMNIST(<br>root="~/Public", train=True, transform=megatron, download=True)</p><h1>测试数据集，下载到~/Public/FashionMNIST/raw/。</h1><p>fashion_mnist_test = datasets.FashionMNIST(<br>root="~/Public", train=False, transform=megatron, 
download=True)</p><h1>文件的数据类型都是UnsignedByte，没法用普通图片查看器打开。</h1><p>print(len(fashion_mnist_dev), len(fashion_mnist_test))</p><h1>第一维表示i号样本，第二维[0]表示图像数据、[1]表示标签。</h1><h1>灰度图像，channel=1, height=28, width=28。彩色图像，channel=3。</h1><p>print(fashion_mnist_dev0.shape, fashion_mnist_test0.shape)</p><p>'''3_2'''<br>import torchvision<br>from torchvision import transforms<br>from torch.utils.data import DataLoader</p><h1>Fashion-MNIST数据集下载好之后，可以直接使用，就不要download=True了。</h1><p>fashion_mnist_dev = torchvision.datasets.FashionMNIST(<br>root="~/Public", train=True, transform=megatron, download=False)<br>fashion_mnist_test = torchvision.datasets.FashionMNIST(<br>root="~/Public", train=False, transform=megatron, download=False)</p><p>'''3_3'''<br>import torch<br>import numpy as np<br>import matplotlib.pyplot as plt</p><p>def get_label_list(idx_list):<br>"""将Fashion-MNIST数据集中数字标签列表转换为对应的文本标签列表。"""<br>label_text = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle-boot']<br>return [label_text[int(i)] for i in idx_list]<br>def paint_images(images, rows, cols, titles=None, scale=1.5):<br>"""批量绘制Fashion-MNIST数据集中的图像。"""<br>fig, axes = plt.subplots(rows, cols, figsize=(cols <em> scale, rows </em> scale))</p><h1>二维的坐标轴数组展平成一维，方便后续遍历。</h1><p>axes = axes.flatten()<br>for i, (ax, img) in enumerate(zip(axes, images)):<br>ax.axis('off')<br>if torch.is_tensor(img):</p><h1>将torch张量转换为numpy数组显示。</h1><p>ax.imshow(img.numpy())<br>else:</p><h1>直接显示numpy数组或PIL图像。</h1><p>ax.imshow(img)<br>if titles:<br>ax.set_title(titles[i] + '⬇️')</p><p>'''3_4'''<br>"""<br>plt.subplots()的用法，<br>参数：<br>rows表示行数，cols表示列数，figsize表示整个画布大小，先宽度后高度，单位为英寸inch。<br>注意，cols与宽度成正比，rows与高度成正比，figsize顺序不能反，子图的尺寸自动计算。<br>返回：<br><a href="mailto:fig@matplotlib.figure">fig@matplotlib.figure</a>.Figure，整个画布对象，可以设置标题、保存图片等；<br><a href="mailto:axes@numpy.ndarray">axes@numpy.ndarray</a>，单个子图对象，维度为(rows, cols)，可以设置标题、坐标轴等。<br>创建 $ rows × cols $ 的子图网格。<br>"""<br>fig, axes = plt.subplots(2, 5, 
figsize=(10, 4))</p><p>fig.suptitle('Fashion-MNIST10K', fontfamily='PT mono', fontsize=14)<br>fig.patch.set_facecolor('lightgray')<br>fig.tight_layout(pad=1.0, h_pad=3.0, w_pad=3.0)<br>'''<br>fig.subplots_adjust(<br>top=0.9,      # 顶部边距，0.9表示顶部留10%空间。<br>bottom=0.1,   # 底部边距，0.1表示底部留10%空间。<br>left=0.1,     # 左侧边距，0.1表示左侧留10%空间。<br>right=0.9,    # 右侧边距，0.9表示右侧留10%空间。<br>hspace=0.4,   # 行间距。<br>wspace=0.3    # 列间距。<br>) # 单位是图形尺寸的比例，范围[0,1]。<br>'''</p><h1>fig.savefig('./example.png', dpi=300, bbox_inches='tight')</h1><p>axes0.set_title('t-shirt')<br>axes0.axis('off')<br>axes0.set_xlabel('X-Ray', fontfamily='PT mono')<br>axes0.set_ylabel('Y-Ray', fontfamily='PT mono')<br>axes1.grid(True, alpha=0.3)</p><h1>绘制随机散点图。</h1><p>axes1.scatter(np.random.randn(100), np.random.randn(100), alpha=0.6)</p><h1>测试数据集3号图片，去除单通道维度，显示为灰度图。</h1><p>axes1.imshow(fashion_mnist_test2.numpy().squeeze(), cmap='gray')<br>'''<br>Axes.bar(x, height, width=0.8, bottom=None, **kwargs)<br>用于绘制垂直柱形图。</p><p>Axes.plot(x, y, [fmt], **kwargs)<br>用于绘制折线图。<br>fmt: 线条样式，color[makrer]，顺序无所谓，<br>color:  颜色，('b', 'g', 'r', 'c', 'm', 'y', 'k', 'w')；<br>line:   线型，（'-', '--', '-.', ':', '')；<br>marker: 数据点标记，（'.', 'o', 'v', '^', '&lt;', '&gt;', 's', '*', 'h', 'H','d', 'D', 'p', 'P', 'x', 'X')；</p><p>Axes.pie(y, explode=None, labels=None, colors=None, autopct=None,<br>shadow=False, startangle=0, counterclock=True, wedgeprops=None, **kwargs)<br>用于绘制饼状图。<br>'''<br>'''3_5'''<br>from torch.utils.data import DataLoader</p><p>X, y = next(iter(DataLoader(fashion_mnist_dev, batch_size=10)))</p><h1>X.shape=(10, 1, 28, 28)，第2个维度是channel，需要reshape成(10, 28, 28)。</h1><p>paint_images(X.reshape(-1, 28, 28), 2, 5, titles=get_label_list(y))<br>'''3_6'''</p><h1>import os</h1><p>import time<br>from contextlib import contextmanager<br>from torch.utils.data import DataLoader</p><h1>os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'</h1><p>@contextmanager<br>def secs_count(description='操作'):<br>"""记录操作耗时。"""<br>begin = 
time.time()<br>try:<br>yield<br>finally:<br>end = time.time()<br>print(f"{description} 耗时 {end - begin:.2f}秒。")</p><h1>小批量，多进程。</h1><p>batch_size, num_workers = 256, 4<br>iterate_devset = DataLoader(<br>fashion_mnist_dev, batch_size=batch_size, shuffle=True, num_workers=num_workers)<br>with secs_count('训练数据载入'):<br>for X, y in iterate_devset:<br>continue</p><h1>会输出4个子进程的调试信息。</h1><p>'''3_7'''<br>from torchvision import transforms, datasets<br>from torch.utils.data import DataLoader</p><p>def load_data_fashion_mnist(batch_size, reset_size:tuple=None):<br>"""加载Fashion-MNIST数据集到内存里。"""</p><h1>例如提供reset_size=(32, 32)。</h1><p>if reset_size:</p><h1>先调整图像尺寸，再将PIL图像或numpy数组转换为torch张量。</h1><p>tf_list = [transforms.Resize(reset_size), transforms.ToTensor()]<br>else:<br>tf_list = [transforms.ToTensor()]</p><h1>将转换列表组合成一个可执行的转换管道。</h1><p>megatron = transforms.Compose(tf_list)<br>fashion_mnist_dev = datasets.FashionMNIST(<br>root="~/Public", train=True, transform=megatron, download=False)<br>fashion_mnist_test = datasets.FashionMNIST(<br>root="~/Public", train=False, transform=megatron, download=False)</p><h1>MacBookARM64平台，使用MPS后端，多进程在共享内存方面存在兼容性问题，num_workers=0是当前最稳妥的选择，创建0个子进程。</h1><p>return (DataLoader(fashion_mnist_dev, batch_size, shuffle=True, num_workers=0),<br>DataLoader(fashion_mnist_test, batch_size, shuffle=False, num_workers=0))</p><h1>SoftMax 分类问题</h1><p>回归问题与分类问题的区别：</p><ol><li>输出：回归问题输出是一个连续的数值，而分类问题输出是一个离散的类别。</li></ol><ul><li>示例：房价预测；</li><li>示例：图像识别；</li></ul><ol start="2"><li>目标：回归问题预测一个连续的数值 ，而分类问题预测一个样本所属的类别。</li></ol><ul><li>损失函数：平均平方误差（MSE）；</li><li>损失函数：交叉熵（Cross-Entropy）；</li></ul><p>SoftMax：柔性最大值。</p><p>$$
\begin{align}
% aligned取消编号。
&amp;\mathcal{SoftMax}(\mathbf{X})_{i,j} = \frac{\exp(\mathbf{X}_{i,j})}{\sum_{k=1}^N \exp(\mathbf{X}_{i,k})} \\
&amp;\mathcal{CrossEntropy}(y, \hat{y}) = - \sum_{j=1}^N y_j \log(\hat{y}_j) \\
\end{align}
$$</p><h1>软大分类的从零开始实现</h1><p>'''4_1'''<br>import os<br>import torch</p><p>os.environ['PYDEVD_DISABLE_FILE_VALIDATION'] = '1'<br>batch_size = 256</p><h1>load_data_fashion_mnist()定义在3_7里。</h1><p>iterate_devset, iterate_testset = load_data_fashion_mnist(batch_size)</p><h1>每个图像展平成长度为28×28=784的向量，10个类别，10个输出。</h1><p>num_inputs, num_outputs = 784, 10</p><h1>定义模型参数，权重W是一个784×10的矩阵，偏置b是一个10维的列向量。</h1><h1>初始化，权重W从均值为0、标准差为0.01的正态分布中采样，偏置b以0填充。</h1><p>W = torch.normal(0, 0.01, (num_inputs, num_outputs), requires_grad=True)<br>b = torch.zeros(num_outputs, requires_grad=True)</p><p>'''<br>X_exp = torch.tensor([<br>[1.0, 2.0, 3.0, 4.0],  # 样本[0]<br>[0.5, 1.5, 2.5, 3.5],  # 样本[1]<br>[2.0, 3.0, 4.0, 5.0]   # 样本[2]<br>])</p><blockquote><p>partition = X_exp.sum(1, keepdim=True)<br>&lt; tensor([[10.],<br>[ 8.],<br>[14.]])<br>'''<br>def calc_softmax(X):<br>X_exp = torch.exp(X)</p><h1>第2维上求和，行向量压缩，并保持维度，避免广播错误。</h1><p>partition = X_exp.sum(1, keepdim=True)<br>return X_exp / partition<br>def softmax_classification(X):<br>"""软大分类模型。"""</p><h1>X.shape=(batch_size, 1, 28, 28)，重塑成(batch_size, num_inputs)。</h1><p>return calc_softmax(torch.matmul(X.reshape(-1, W.shape[0]), W) + b)<br>def cross_entropy(o, y):<br>"""交叉熵损失。"""</p><h1>range(len(o))生成下标整数序列，返回值是一个向量，还未求和。</h1><p>return -torch.log(o[range(len(o)), y])<br>def stochastic_gradient_descent(parameters, learning_rate, batch_size):<br>"""小批量随机梯度下降。"""<br>with torch.no_grad():<br>for param in parameters:</p><h1>param就是要学习的参数，W, b。</h1><h1>这就是梯度下降。</h1><p>param -= learning_rate * param.grad / batch_size<br>param.grad.zero_()<br>'''<br>torch.tensor的花式索引，<br>o = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5], [0.7, 0.2, 0.1]])<br>样本[0]取下标0类别概率，样本[1]取下标0类别概率。<br>o[[0, 1], [0, 0]]<br>&lt; tensor([0.1000, 0.3000])<br>o[[0, 1, 2], [1, 2, 0]]<br>&lt; tensor([0.3000, 0.5000, 0.7000])<br>'''<br>def count_accurate(o, y):<br>"""计算预测正确的数量。"""<br>if len(o.shape) &gt; 1 and o.shape[1] &gt; 1:</p><h1>若o是多维的，且第2维大于1，则对每个样本取最大概率的类别索引，即下标。</h1><p>o = 
o.argmax(axis=1)</p><h1>创建bool张量，标记哪些预测正确。</h1><p>comp = o.type(y.dtype) == y</p><h1>True→1，False→0，求和。</h1><p>return float(comp.type(y.dtype).sum())</p><h1>准确率。</h1><p>count_accurate(o, y) / y.numel()<br>def evaluate_accuracy(your_model, iterate_data):<br>"""计算在指定数据集上模型的准确率。"""<br>if isinstance(your_model, torch.nn.Module):</p><h1>将模型转为评估模式，不要计算梯度。</h1><p>your_model.eval()</p><h1>2个数，分别是预测正确数、样本总数。</h1><p>metric = [0.0, 0.0]<br>with torch.no_grad():<br>for X, y in iterate_data:<br>metric[0] += count_accurate(your_model(X), y)<br>metric[1] += y.numel()<br>return metric[0] / metric[1]</p></blockquote><h1><a href="mailto:iterate_testset@torch.utils.data.dataloader">iterate_testset@torch.utils.data.dataloader</a>.DataLoader，定义在cell4_1上方。</h1><p>evaluate_accuracy(softmax_classification, iterate_testset)</p><h1>记得我们的模型吗？随机初始化的参数，跟瞎猜差不多，对于10个分类准确率在1/10以下。</h1><h1>此时交叉熵损失值大约是 $ -log(0.1) \approx 2.3 $ 。</h1><p>'''4_2'''</p><h1>使用SVG格式显示。</h1><p>%config InlineBackend.figure_format = 'svg'<br>import matplotlib.pyplot as plt<br>from IPython import display</p><p>class ZigzagChartAnimator:<br>"""折线图动画可视化类。"""<br>def __init__(self, title='', legend=(), info_xy_=('','',(),(),'linear','linear'),<br>rows=1, cols=1, figsize=(5, 2.5)):<br>"""<br>title: 图表标题<br>legend: 图例列表<br>_x_y_: 包含x轴和y轴共6个参数<br>xlabel: x轴标签 -字符串<br>ylabel: y轴标签 -字符串<br>xlimit: x轴范围 -元组含2个数<br>ylimit: y轴范围 -元组含2个数<br>xscale: x轴缩放 -可选('linear', 'log')<br>yscale: y轴缩放 -可选('linear', 'log')<br>rows: 子图行数<br>cols: 子图列数<br>figsize: 图表尺寸<br>"""</p><h1>正常显示中文字符，和负号。</h1><p>plt.rcParams['font.family'] = ['DejaVu Sans', 'Arial Unicode MS']<br>plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'Arial Unicode MS']<br>plt.rcParams['axes.unicode_minus'] = False</p><pre><code># 创建子图网格。
    self.fig, self.axes = plt.subplots(rows, cols, figsize=figsize)
    if rows * cols == 1:
        # 如果只有1个子图，返回的是1个Axes对象，转为列表。
        self.axes = [self.axes]
    self.config_axes = lambda: self._set_axes(self.axes[0], title, legend, info_xy_)
    # m: 要绘多少条线。
    m = len(legend)
    self.X_2dli = [[] for _ in range(m)]
    self.Y_2dli = [[] for _ in range(m)]

def _set_axes(self, ax, title, legend, info_xy_):
    """Apply labels, limits, scales, legend, title and grid to one subplot.

    Args:
        ax: Axes object of the subplot to configure.
        title: Title text of the subplot.
        legend: Sequence of legend labels; when empty, no legend is drawn.
        info_xy_: Sequence of six items, unpacked in this order as
            (xlabel, ylabel, xlimit, ylimit, xscale, yscale).
            A limit that is None or empty is skipped, leaving the axis
            range for the Axes to choose.
    """
    xlabel, ylabel, xlimit, ylimit, xscale, yscale = info_xy_
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # The constructor default for both limits is an empty tuple, which
    # set_xlim/set_ylim cannot unpack into (low, high); skip such limits
    # instead of letting the first redraw fail.
    for apply_limit, limit in ((ax.set_xlim, xlimit), (ax.set_ylim, ylimit)):
        if limit is None or (hasattr(limit, '__len__') and len(limit) == 0):
            continue
        apply_limit(limit)
    ax.set_xscale(xscale)
    ax.set_yscale(yscale)
    if legend:
        # Pin the legend box's lower-left corner to the anchor (1.0, 0.7):
        # the right edge of the subplot, 70% of the way up.
        ax.legend(legend, loc='lower left', bbox_to_anchor=(1.0, 0.7))
    ax.set_title(title)
    ax.grid(True)

def insert_point(self, x_li, y_li):
    """Append one data point per line and redraw the chart.

    Args:
        x_li: x coordinate(s). A single number is shared by every line;
            a sequence supplies one x value per line.
        y_li: y coordinate(s). A single number, or a sequence with one
            y value per line. A None entry (in x_li or y_li) skips that
            line for this call.
    """
    if not hasattr(y_li, '__len__'):
        # Scalar: wrap it so the loop below can treat it as a sequence.
        y_li = [y_li]
    m = len(y_li)
    if not hasattr(x_li, '__len__'):
        # Scalar x shared by all lines, e.g. x_li=1, y_li=[0.2, 0.3, 0.4]
        # becomes [1, 1, 1] paired with [0.2, 0.3, 0.4] (three lines).
        x_li = [x_li] * m

    for i, (a, b) in enumerate(zip(x_li, y_li)):
        # Compare against None rather than relying on truthiness: 0 and 0.0
        # are valid coordinates (e.g. an accuracy of 0.0) and must be kept.
        if a is not None and b is not None:
            self.X_2dli[i].append(a)
            self.Y_2dli[i].append(b)
    # Wipe the subplot and draw every line again from the stored points.
    self.axes[0].clear()
    # Line formats, reused cyclically when there are more lines than formats.
    fmts = ('-', 'm--', 'g-.')
    for i, (xs, ys) in enumerate(zip(self.X_2dli, self.Y_2dli)):
        self.axes[0].plot(xs, ys, fmts[i % len(fmts)])
    # The subplot was just cleared, so apply labels, limits and legend again.
    self.config_axes()
    # Render the refreshed figure in the notebook output.
    display.display(self.fig)
    display.clear_output(wait=True)</code></pre><p>'''4_3'''</p><h1>训练过程，给定学习率learn_rate和训练轮数train_epochs。</h1><p>learn_rate = 0.1<br>train_epochs = 10</p><p>info_xy_ = ("训练轮数","",(1,10),(0,1),'linear','linear')<br>animator = ZigzagChartAnimator("训练可视化",("损失值","开发数据集准确率","测试数据集准确率"),info_xy_)</p><p>for epoch in range(train_epochs):<br>metric = [0.0, 0.0, 0.0]<br>for X, y in iterate_devset:<br>o = softmax_classification(X)<br>CE = cross_entropy(o, y)</p><h1>函数定义时，返回的是个向量，需要求和。</h1><p>CE.sum().backward()</p><h1>CE再除以batch_size取平均，在stochastic_gradient_descent()里实现了。</h1><p>stochastic_gradient_descent([W, b], learn_rate, X.shape[0])</p><h1>交叉熵损失总和。</h1><p>metric[0] += float(CE.sum())</p><h1>预测正确数。</h1><p>metric[1] += float(count_accurate(o, y))</p><h1>样本总数。</h1><p>metric[2] += y.numel()<br>train_loss, train_accu = metric[0] / metric[2], metric[1] / metric[2]<br>infer_accu = evaluate_accuracy(softmax_classification, iterate_testset)<br>animator.insert_point(epoch+1, [train_loss, train_accu, infer_accu])<br>print(f"devset loss: {train_loss:.4f}, devset accuracy: {train_accu:.4f}, testset accuracy: {infer_accu:.4f}.")</p><h1>训练轮次结束，在测试数据集上的准确率收敛在0.84以上，说明模型训练效果良好。</h1><p>'''4_4'''</p><h1>用已经训练好的模型对测试集进行预测，画图对比。</h1><p>for X, y in iterate_testset:<br>break</p><h1>get_label_list(), paint_images()，定义在cell3_3里。</h1><p>actual = get_label_list(y)<br>predi = get_label_list(softmax_classification(X).argmax(axis=1))<br>titles = [f'{a}\n{p}' for a, p in zip(actual, predi)]<br>m, n = 1, 8<br>paint_images(X[0:n].reshape(n, 28, 28), m, n, titles[0:n])</p><h1>软大分类的简洁实现</h1><p>'''5_1'''<br>import torch<br>from torch import nn, optim</p><p>batch_size = 256</p><h1>load_data_fashion_mnist()定义在3_7里。</h1><p>iterate_devset, iterate_testset = load_data_fashion_mnist(batch_size)</p><h1>软大分类模型。</h1><h1>nn.Flatten()将输入的Fashion-MNIST图像(batch_size, 1, 28, 28)展平成(batch_size, 784)。</h1><h1>(batch_size, 784)才适合输入到nn.Linear(784, 10)。</h1><p>your_model = nn.Sequential(nn.Flatten(), nn.Linear(784, 
10))</p><h1>初始化模型参数。</h1><p>nn.init.normal_(your_model[1].weight, mean=0, std=0.01)</p><h1>交叉熵损失函数。</h1><p>loss_function = nn.CrossEntropyLoss()</p><h1>随机梯度下降优化器。</h1><p>train_optimizer = optim.SGD(your_model.parameters(), lr=0.1)</p><h1>训练过程与从零开始实现相似。</h1><p>train_epochs = 10<br>for epoch in range(train_epochs):</p><h1>模型切换到训练模式，调教。</h1><p>your_model.train()<br>metric = [0.0, 0.0, 0.0]<br>for X, y in iterate_devset:<br>o = your_model(X)<br>CE = loss_function(o, y)</p><h1>每次迭代，都要清空之前的梯度，防止累加。</h1><p>train_optimizer.zero_grad()</p><h1>一组batch_size的平均损失值，反向传播，计算权重和偏置的梯度值。</h1><p>CE.mean().backward()</p><h1>更新权重和偏置，参数 -= 学习率 × 梯度。</h1><p>train_optimizer.step()</p><h1>交叉熵损失总和。</h1><p>metric[0] += float(CE.sum())</p><h1>预测正确数。</h1><p>metric[1] += float(count_accurate(o, y))</p><h1>样本总数。</h1><p>metric[2] += y.numel()<br>train_loss, train_accu = metric[0] / metric[2], metric[1] / metric[2]</p><h1>看一下训练效果，在evaluate_accuracy()内会将模型切回评估模式。</h1><p>infer_accu = evaluate_accuracy(your_model, iterate_testset)<br>print(f"devset loss: {train_loss:.4f}, devset accuracy: {train_accu:.4f}, testset accuracy: {infer_accu:.4f}.")</p>