Linear Regression Implementation from Scratch

%matplotlib inline
import random
import torch
from d2l import torch as d2l
Generating the Dataset

For simplicity, we construct an artificial dataset according to a linear model with noise $\epsilon$. Our task is to recover the model's parameters from this finite sample of data. We use low-dimensional data so that it can be visualized easily.
We generate the dataset and its labels as $\mathbf{y} = \mathbf{X}\mathbf{w} + b + \epsilon$, using the linear model with $\mathbf{w}=\begin{bmatrix}2 & -3.4\end{bmatrix}^\top$, $b=4.2$, and noise term $\epsilon$.
def synthetic_data(w, b, num_examples):
    """Generate y = Xw + b + noise."""
    X = torch.normal(0, 1, (num_examples, len(w)))
    y = X @ w + b
    y += torch.normal(0, 0.01, y.shape)  # Gaussian noise with std 0.01
    return X, y.reshape((-1, 1))
true_w = torch.tensor([2, -3.4])
true_b = 4.2
features, labels = synthetic_data(true_w, true_b, 1000)
print('features:', features[0], '\nlabels:', labels[0])
features: tensor([-0.7822, -1.3131])
labels: tensor([7.0886])
By plotting the second feature features[:, 1] against labels in a scatter plot, we can directly observe the linear relationship between the two.
d2l.set_figsize()
d2l.plt.scatter(features[:, 1].detach().numpy(), labels.detach().numpy(), 1)  # trailing 1 is the marker size
[Figure: scatter plot of features[:, 1] against labels, showing a clear linear relationship]
Reading the Dataset

When training a model we iterate over the dataset, grabbing one minibatch of examples at a time and using it to update the model. Since this process is fundamental to training machine learning algorithms, it is worth defining a function that shuffles the examples in the dataset and returns them in minibatches.
def data_iter(batch_size, features, labels):
    """Shuffle the examples and yield them one minibatch at a time."""
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        # The last minibatch may be smaller than batch_size
        batch_indices = torch.tensor(indices[i:min(i + batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]
batch_size = 16

for X, y in data_iter(batch_size, features, labels):
    print(X, '\n', y)
    break
tensor([[-2.3705, 1.6151],
[-0.0978, -0.3523],
[-2.2586, 0.8220],
[-1.8264, 0.4761],
[-0.9017, 0.4296],
[ 0.1507, -0.8741],
[ 0.6576, 0.6147],
[-0.4884, -0.5035],
[-0.6032, 0.1846],
[ 0.8629, 0.8639],
[ 2.3865, 0.2038],
[ 2.1056, -0.4582],
[-0.9746, -0.6409],
[-0.3265, 0.7550],
[-0.4842, -2.5187],
[-0.7882, 1.0676]])
tensor([[-6.0368],
[ 5.1836],
[-3.1394],
[-1.0727],
[ 0.9296],
[ 7.4831],
[ 3.4451],
[ 4.9377],
[ 2.3702],
[ 2.9883],
[ 8.2696],
[ 9.9503],
[ 4.4302],
[ 0.9901],
[11.7968],
[-1.0145]])
Although the iterator implemented above is fine for teaching purposes, it is inefficient and can get us into trouble on real problems. For example, it requires loading all the data into memory and performing many random memory accesses. The built-in iterators implemented in deep learning frameworks are considerably more efficient; they can handle data stored in files as well as data fed through streams.
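For reference, a minimal sketch of the framework's built-in equivalent (the concise implementation later in this section wraps exactly this pattern in a helper; the name builtin_iter is ours):

from torch.utils import data

# Wrap the tensors in a dataset, then let DataLoader shuffle and batch them
dataset = data.TensorDataset(features, labels)
builtin_iter = data.DataLoader(dataset, batch_size=16, shuffle=True)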
Initializing Model Parameters

w = torch.normal(0, 0.01, size=(2, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)
Defining the Model

def linreg(X, w, b):
    """The linear regression model."""
    return torch.matmul(X, w) + b
Defining the Loss Function

def squared_loss(y_hat, y):
    """Squared loss."""
    return (y_hat - y.reshape(y_hat.shape)) ** 2 / 2
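For each example $i$ this computes $l^{(i)}(\mathbf{w}, b) = \frac{1}{2}\left(\hat{y}^{(i)} - y^{(i)}\right)^2$; the constant $\frac{1}{2}$ makes the derivative with respect to $\hat{y}^{(i)}$ simply $\hat{y}^{(i)} - y^{(i)}$.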
Defining the Optimization Algorithm

def sgd(params, lr, batch_size):
    """Minibatch stochastic gradient descent."""
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()
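Because we call backward() on the sum of the per-example losses, param.grad holds the gradient of the summed minibatch loss; dividing by batch_size converts this to the minibatch average, so each step performs

$(\mathbf{w}, b) \leftarrow (\mathbf{w}, b) - \frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \partial_{(\mathbf{w}, b)} l^{(i)}(\mathbf{w}, b)$

where $\eta$ is the learning rate and $\mathcal{B}$ is the sampled minibatch.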
Training

lr = 0.03
num_epochs = 5
net = linreg
loss = squared_loss
optim = sgd
for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        y_hat = net(X, w, b)
        l = loss(y_hat, y)
        l.sum().backward()
        optim([w, b], lr, batch_size)
    print(f"epoch {epoch + 1}, loss {float(l.mean()):f}")
epoch 1, loss 0.539183
epoch 2, loss 0.006921
epoch 3, loss 0.000593
epoch 4, loss 0.000057
epoch 5, loss 0.000130
Because we synthesized the dataset ourselves, we know exactly what the true parameters are. We can therefore evaluate how successful training was by comparing the true parameters with those learned through training. Indeed, they turn out to be very close.
print(f"error in estimating w: {true_w - w.reshape(true_w.shape)}")
print(f"error in estimating b: {true_b - b}")
error in estimating w: tensor([ 0.0013, -0.0004], grad_fn=<SubBackward0>)
error in estimating b: tensor([0.0008], grad_fn=<RsubBackward1>)
Concise Implementation Using PyTorch's Built-in Libraries

import numpy as np
import torch
from torch.utils import data
from d2l import torch as d2l
Generating the Dataset

true_w = torch.tensor([2, -3.4])
true_b = 4.2
features, labels = d2l.synthetic_data(true_w, true_b, 1000)
Reading the Dataset

def load_array(data_arrays, batch_size, is_train=True):
    """Construct a PyTorch data iterator."""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)
batch_size = 16
data_iter = load_array((features, labels), batch_size)
next(iter(data_iter))  # peek at the first minibatch, shown below
[tensor([[ 0.1293, 2.1394],
[-0.3502, 0.0696],
[ 0.1085, 1.3638],
[-0.3916, 0.7837],
[-1.4090, -0.9483],
[ 0.8252, -1.9988],
[-0.4100, 2.6340],
[ 0.3648, 0.5963],
[-0.1308, -0.9131],
[ 0.8162, 1.0117],
[ 0.8033, -0.5222],
[-0.3889, 0.0667],
[ 1.0680, -0.2176],
[-0.8264, 1.0166],
[-0.9629, 0.6252],
[ 1.7917, -0.4423]]),
tensor([[-2.8102],
[ 3.2723],
[-0.2239],
[ 0.7533],
[ 4.6051],
[12.6580],
[-5.5930],
[ 2.8950],
[ 7.0333],
[ 2.4028],
[ 7.5783],
[ 3.2041],
[ 7.0752],
[-0.9071],
[ 0.1483],
[ 9.2980]])]
Defining the Model

from torch import nn

net = nn.Sequential(nn.Linear(2, 1))
Initializing Model Parameters
net[0].weight.data.normal_(0, 0.01)
net[0].bias.data.fill_(0)
tensor([0.])
Defining the Loss Function

We use the mean squared error loss (this cell assumes the standard choice nn.MSELoss; its default reduction='mean' returns a scalar, which is what the training loop below prints):

loss = nn.MSELoss()
Defining the Optimization Algorithm

When we instantiate an SGD instance, we specify the parameters to optimize (obtained from our model via net.parameters()) along with the hyperparameters required by the optimization algorithm. Minibatch stochastic gradient descent only requires a value for lr, which we set to 0.03 here.
trainer = torch.optim.SGD(net.parameters(), lr=0.03)
Training the Model
num_epochs = 5
for epoch in range(num_epochs):
    for X, y in data_iter:
        y_pred = net(X)
        l = loss(y_pred, y)
        trainer.zero_grad()
        l.backward()
        trainer.step()
    l = loss(net(features), labels)
    print(f"epoch {epoch + 1}, loss {l:f}")
epoch 1, loss 0.008912
epoch 2, loss 0.000101
epoch 3, loss 0.000099
epoch 4, loss 0.000099
epoch 5, loss 0.000099
Examining the estimation errors of the learned parameters
w = net[0].weight.data
print('error in estimating w:', true_w - w.reshape(true_w.shape))
b = net[0].bias.data
print('error in estimating b:', true_b - b)
error in estimating w: tensor([-2.7442e-04, 1.6928e-05])
error in estimating b: tensor([-8.7261e-05])
The Image Classification Dataset

The MNIST dataset [LeCun et al., 1998] is one of the most widely used datasets for image classification, but it is too simple to serve as a benchmark. We will use the similar but more complex Fashion-MNIST dataset [Xiao et al., 2017].
%matplotlib inline
import torch
import torchvision
from torch.utils import data
from torchvision import transforms
from d2l import torch as d2l

d2l.use_svg_display()
Reading the Dataset

trans = transforms.ToTensor()
mnist_train = torchvision.datasets.FashionMNIST(
    root="../data", train=True, transform=trans, download=False)
mnist_test = torchvision.datasets.FashionMNIST(
    root="../data", train=False, transform=trans, download=False)
Fashion-MNIST consists of images from 10 categories, each represented by 6000 images in the training dataset and 1000 images in the test dataset. The training set and test set therefore contain 60000 and 10000 images, respectively.
len(mnist_train), len(mnist_test)
(60000, 10000)
mnist_train[0][0].shape
torch.Size([1, 28, 28])
The 10 categories in Fashion-MNIST are t-shirt, trouser, pullover, dress, coat, sandal, shirt, sneaker, bag, and ankle boot.
def get_fashion_mnist_labels(labels):
    """Return the text labels for the Fashion-MNIST dataset."""
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]
def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):
    """Plot a list of images."""
    figsize = (num_cols * scale, num_rows * scale)
    _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize)
    axes = axes.flatten()
    for i, (ax, img) in enumerate(zip(axes, imgs)):
        if torch.is_tensor(img):
            ax.imshow(img.numpy())  # tensor image
        else:
            ax.imshow(img)  # PIL image
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        if titles:
            ax.set_title(titles[i])
    return axes
X, y = next(iter(data.DataLoader(mnist_train, batch_size=12)))
show_images(X.reshape(12, 28, 28), 2, 6, titles=get_fashion_mnist_labels(y));
Reading a Minibatch

batch_size = 256

def get_dataloader_workers():
    """Use 4 processes to read the data."""
    return 4

train_iter = data.DataLoader(mnist_train, batch_size, shuffle=True,
                             num_workers=get_dataloader_workers())
timer = d2l.Timer()
for X, y in train_iter:
    continue
f'{timer.stop():.2f} sec'
'2.61 sec'
Putting It All Together

def load_data_fashion_mnist(batch_size, resize=None):
    """Load the Fashion-MNIST dataset into memory and return the train/test iterators."""
    trans = [transforms.ToTensor()]
    if resize:
        trans.insert(0, transforms.Resize(resize))
    trans = transforms.Compose(trans)
    mnist_train = torchvision.datasets.FashionMNIST(
        root="../data", train=True, transform=trans, download=False)
    mnist_test = torchvision.datasets.FashionMNIST(
        root="../data", train=False, transform=trans, download=False)
    return (data.DataLoader(mnist_train, batch_size, shuffle=True,
                            num_workers=get_dataloader_workers()),
            data.DataLoader(mnist_test, batch_size, shuffle=False,
                            num_workers=get_dataloader_workers()))
train_iter, test_iter = load_data_fashion_mnist(32, resize=64)
for X, y in train_iter:
    print(X.shape, X.dtype, '\n', y.shape, y.dtype)
    break
torch.Size([32, 1, 64, 64]) torch.float32
torch.Size([32]) torch.int64
Implementing Softmax Regression from Scratch

import torch
from IPython import display
from d2l import torch as d2l
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
Initializing Model Parameters

num_inputs = 784   # each 28x28 image is flattened into a vector of length 784
num_outputs = 10   # one output per class
w = torch.normal(0, 0.01, size=(num_inputs, num_outputs), requires_grad=True)
b = torch.zeros(num_outputs, requires_grad=True)
Defining the Softmax Operation

def softmax(X):
    X_exp = torch.exp(X)
    partition = X_exp.sum(1, keepdim=True)
    return X_exp / partition  # broadcasting applies here
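Row by row, this computes

$\mathrm{softmax}(\mathbf{X})_{ij} = \frac{\exp(X_{ij})}{\sum_k \exp(X_{ik})}$

so every row becomes a valid probability distribution: all entries are nonnegative and each row sums to 1.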
X = torch.normal(0, 1, (2, 5))
X_prob = softmax(X)
X_prob, X_prob.sum(1, keepdim=True)
(tensor([[0.1131, 0.4422, 0.2198, 0.1412, 0.0837],
[0.3142, 0.1550, 0.1359, 0.2847, 0.1103]]),
tensor([[1.],
[1.]]))
Note that while this looks mathematically correct, our implementation is a bit sloppy: very large or very small elements of the matrix can cause numerical overflow or underflow in the exponential, and we have taken no measures to guard against that.
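A common remedy, not used in this section, is to subtract each row's maximum before exponentiating; this leaves the result unchanged mathematically but keeps the arguments of exp bounded above by 0. A minimal sketch:

def stable_softmax(X):
    # softmax(X) == softmax(X - c) for any per-row constant c;
    # choosing c = row max prevents overflow in exp
    X = X - X.max(dim=1, keepdim=True).values
    X_exp = torch.exp(X)
    return X_exp / X_exp.sum(dim=1, keepdim=True)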
Defining the Model
def net(X):
    # Flatten each image into a length-784 row vector before the affine transformation
    return softmax(X.reshape((-1, w.shape[0])) @ w + b)
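To see the shapes involved, a quick check (the name X_batch is ours, taken from a Fashion-MNIST minibatch): each (1, 28, 28) image is flattened to 784 features and mapped to 10 class probabilities.

X_batch, _ = next(iter(train_iter))
net(X_batch).shape  # torch.Size([256, 10])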
Defining the Loss Function

def cross_entropy(y_hat, y):
    return -torch.log(y_hat[range(len(y_hat)), y])
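The indexing expression y_hat[range(len(y_hat)), y] picks out, for each row, the predicted probability of that example's true class. A small worked example with made-up probabilities (the _demo names are ours):

y_demo = torch.tensor([0, 2])
y_hat_demo = torch.tensor([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
# Row 0 -> column 0 (0.1), row 1 -> column 2 (0.5)
y_hat_demo[[0, 1], y_demo]  # tensor([0.1000, 0.5000])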
X, y = next(iter(train_iter))  # grab a minibatch so X and y are defined
cross_entropy(net(X[:2]), y[:2])
tensor([2.2568, 2.2025], grad_fn=<NegBackward>)
Classification Accuracy

def accuracy(y_hat, y):
    """Compute the number of correct predictions."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)  # take the index of the largest predicted probability
    cmp = y_hat.type(y.dtype) == y
    return float(cmp.type(y.dtype).sum())
accuracy(net(X[:100]), y[:100]) / len(y[:100])
0.11
class Accumulator:
    """Accumulate sums over n variables."""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
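A quick usage sketch: each call to add accumulates element-wise into the running sums.

metric = Accumulator(2)
metric.add(1, 2)
metric.add(3, 4)
metric[0], metric[1]  # (4.0, 6.0)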
def evaluate_accuracy(net, data_iter):
    """Compute the accuracy of a model on the given dataset."""
    if isinstance(net, torch.nn.Module):
        net.eval()  # set the model to evaluation mode
    metric = Accumulator(2)  # number of correct predictions, number of predictions
    with torch.no_grad():
        for X, y in data_iter:
            metric.add(accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]
evaluate_accuracy(net, test_iter)
0.099
Training

We refactor the training procedure so that it can be reused. First we define a function that trains for one epoch. Note that updater is a general function for updating the model parameters; it takes the batch size as an argument and can be the d2l.sgd function or a framework's built-in optimizer.
def train_epoch_ch3(net, train_iter, loss, updater):
    """Train a model for one epoch (defined in Chapter 3)."""
    if isinstance(net, torch.nn.Module):
        net.train()  # set the model to training mode
    # Sum of training loss, sum of correct predictions, number of examples
    metric = Accumulator(3)
    for X, y in train_iter:
        y_hat = net(X)
        l = loss(y_hat, y)
        if isinstance(updater, torch.optim.Optimizer):
            # Use the built-in optimizer
            updater.zero_grad()
            l.mean().backward()
            updater.step()
        else:
            # Use our custom-built optimizer
            l.sum().backward()
            updater(X.shape[0])
        metric.add(float(l.sum()), accuracy(y_hat, y), y.numel())
    # Return average training loss and training accuracy
    return metric[0] / metric[2], metric[1] / metric[2]
Before showing the implementation of the full training function, we define a utility class Animator that plots data in animation.
class Animator:
    """Plot data in animation."""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ]
        # Capture the axis configuration in a lambda for reuse after each update
        self.config_axes = lambda: d2l.set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        # Append new data points and redraw the figure
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):
    """Train a model (defined in Chapter 3)."""
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
                        legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc
lr = 0.1  # learning rate; assumed value (it is not set elsewhere in this notebook)

def updater(batch_size):
    return d2l.sgd([w, b], lr, batch_size)
We now train the model for 10 epochs. Note that both the number of epochs (num_epochs) and the learning rate (lr) are adjustable hyperparameters; by changing their values we may improve the model's classification accuracy.
num_epochs = 10
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, updater)
Prediction

def predict_ch3(net, test_iter, n=6):
    for X, y in test_iter:
        break  # take the first test minibatch
    trues = d2l.get_fashion_mnist_labels(y)
    y_pred = net(X).argmax(axis=1)
    preds = d2l.get_fashion_mnist_labels(y_pred)
    titles = [true + '\n' + pred for true, pred in zip(trues, preds)]
    d2l.show_images(X[:n].reshape((n, 28, 28)), 1, n, titles=titles[:n])
predict_ch3(net, test_iter, n=10)
Concise Implementation of Softmax Regression

import torch
from torch import nn
from d2l import torch as d2l
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
net = nn.Sequential(nn.Flatten(), nn.Linear(784, 10))

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

net.apply(init_weights);
loss = nn.CrossEntropyLoss(reduction='none')
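Note that nn.CrossEntropyLoss takes raw logits (there is no softmax layer in net) and applies a numerically stable log-softmax internally, which sidesteps the overflow issue discussed earlier; reduction='none' returns one loss per example, as the training code expects. A quick sanity check with made-up logits:

logits = torch.tensor([[100.0, 0.0], [0.0, 100.0]])
targets = torch.tensor([0, 1])
nn.CrossEntropyLoss(reduction='none')(logits, targets)
# tensor([0., 0.]): finite losses, no overflow even with logits of 100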
trainer = torch.optim.SGD(net.parameters(), lr=0.1)
num_epochs = 10
d2l.train_ch3(net, train_iter, test_iter, loss, num_epochs, trainer)