Source code for d2l.mxnet

# This file is generated automatically through:
#    d2lbook build lib
# Don't edit it directly

# Defined in file: ./chapter_preface/index.md
import collections
from collections import defaultdict
from IPython import display
import math
from matplotlib import pyplot as plt
import os
import pandas as pd
import random
import re
import shutil
import sys
import tarfile
import time
import requests
import zipfile
import hashlib
d2l = sys.modules[__name__]


# Defined in file: ./chapter_preface/index.md
from mxnet import autograd, context, gluon, image, init, np, npx
from mxnet.gluon import nn, rnn


# Defined in file: ./chapter_preliminaries/pandas.md
def mkdir_if_not_exist(path):  #@save
    """Make a directory if it does not exist."""
    if not isinstance(path, str):
        path = os.path.join(*path)
    if not os.path.exists(path):
        os.makedirs(path)
# Defined in file: ./chapter_preliminaries/calculus.md
def use_svg_display():  #@save
    """Use the svg format to display a plot in Jupyter."""
    display.set_matplotlib_formats('svg')
# Defined in file: ./chapter_preliminaries/calculus.md
def set_figsize(figsize=(3.5, 2.5)):  #@save
    """Set the figure size for matplotlib."""
    use_svg_display()
    d2l.plt.rcParams['figure.figsize'] = figsize
# Defined in file: ./chapter_preliminaries/calculus.md
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Set the axes for matplotlib."""
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    axes.grid()
# Defined in file: ./chapter_preliminaries/calculus.md
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """Plot data points."""
    if legend is None:
        legend = []

    set_figsize(figsize)
    axes = axes if axes else d2l.plt.gca()

    # Return True if `X` (tensor or list) has 1 axis
    def has_one_axis(X):
        return (hasattr(X, "ndim") and X.ndim == 1 or
                isinstance(X, list) and not hasattr(X[0], "__len__"))

    if has_one_axis(X):
        X = [X]
    if Y is None:
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        X = X * len(Y)
    axes.cla()
    for x, y, fmt in zip(X, Y, fmts):
        if len(x):
            axes.plot(x, y, fmt)
        else:
            axes.plot(y, fmt)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
# Defined in file: ./chapter_linear-networks/linear-regression.md
class Timer:  #@save
    """Record multiple running times."""
    def __init__(self):
        self.times = []
        self.start()

    def start(self):
        """Start the timer."""
        self.tik = time.time()

    def stop(self):
        """Stop the timer and record the time in a list."""
        self.times.append(time.time() - self.tik)
        return self.times[-1]

    def avg(self):
        """Return the average time."""
        return sum(self.times) / len(self.times)

    def sum(self):
        """Return the sum of time."""
        return sum(self.times)

    def cumsum(self):
        """Return the accumulated time."""
        return np.array(self.times).cumsum().tolist()
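# Usage sketch (illustrative only, not part of the generated module): timing
# a throwaway workload with `Timer`; the workload and counts are made up.
#
#     timer = Timer()
#     for _ in range(1000):
#         math.sqrt(12345.0)  # any workload
#     print(f'{timer.stop():.5f} sec')  # elapsed time since construction
#     print(f'avg {timer.avg():.5f} sec over {len(timer.times)} runs')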
# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def synthetic_data(w, b, num_examples):  #@save
    """Generate y = Xw + b + noise."""
    X = d2l.normal(0, 1, (num_examples, len(w)))
    y = d2l.matmul(X, w) + b
    y += d2l.normal(0, 0.01, y.shape)
    return X, d2l.reshape(y, (-1, 1))
# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def linreg(X, w, b):  #@save
    """The linear regression model."""
    return d2l.matmul(X, w) + b
# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def squared_loss(y_hat, y):  #@save
    """Squared loss."""
    return (y_hat - d2l.reshape(y, y_hat.shape)) ** 2 / 2
# Defined in file: ./chapter_linear-networks/linear-regression-scratch.md
def sgd(params, lr, batch_size):  #@save
    """Minibatch stochastic gradient descent."""
    for param in params:
        param[:] = param - lr * param.grad / batch_size
# Defined in file: ./chapter_linear-networks/linear-regression-concise.md
def load_array(data_arrays, batch_size, is_train=True):  #@save
    """Construct a Gluon data iterator."""
    dataset = gluon.data.ArrayDataset(*data_arrays)
    return gluon.data.DataLoader(dataset, batch_size, shuffle=is_train)
# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def get_fashion_mnist_labels(labels):  #@save
    """Return text labels for the Fashion-MNIST dataset."""
    text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat',
                   'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot']
    return [text_labels[int(i)] for i in labels]
# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5):  #@save
    """Plot a list of images."""
    figsize = (num_cols * scale, num_rows * scale)
    _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize)
    axes = axes.flatten()
    for i, (ax, img) in enumerate(zip(axes, imgs)):
        ax.imshow(d2l.numpy(img))
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        if titles:
            ax.set_title(titles[i])
    return axes
# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def get_dataloader_workers():  #@save
    """Use 4 processes to read the data except for Windows."""
    return 0 if sys.platform.startswith('win') else 4
# Defined in file: ./chapter_linear-networks/image-classification-dataset.md
def load_data_fashion_mnist(batch_size, resize=None):  #@save
    """Download the Fashion-MNIST dataset and then load it into memory."""
    dataset = gluon.data.vision
    trans = [dataset.transforms.ToTensor()]
    if resize:
        trans.insert(0, dataset.transforms.Resize(resize))
    trans = dataset.transforms.Compose(trans)
    mnist_train = dataset.FashionMNIST(train=True).transform_first(trans)
    mnist_test = dataset.FashionMNIST(train=False).transform_first(trans)
    return (gluon.data.DataLoader(mnist_train, batch_size, shuffle=True,
                                  num_workers=get_dataloader_workers()),
            gluon.data.DataLoader(mnist_test, batch_size, shuffle=False,
                                  num_workers=get_dataloader_workers()))
# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def accuracy(y_hat, y):  #@save
    """Compute the number of correct predictions."""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = d2l.argmax(y_hat, axis=1)
    cmp = d2l.astype(y_hat, y.dtype) == y
    return float(d2l.reduce_sum(d2l.astype(cmp, y.dtype)))
# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def evaluate_accuracy(net, data_iter):  #@save
    """Compute the accuracy for a model on a dataset."""
    metric = Accumulator(2)  # No. of correct predictions, no. of predictions
    for _, (X, y) in enumerate(data_iter):
        metric.add(accuracy(net(X), y), d2l.size(y))
    return metric[0] / metric[1]
# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
class Accumulator:  #@save
    """For accumulating sums over `n` variables."""
    def __init__(self, n):
        self.data = [0.0] * n

    def add(self, *args):
        self.data = [a + float(b) for a, b in zip(self.data, args)]

    def reset(self):
        self.data = [0.0] * len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
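# Usage sketch (illustrative only): `Accumulator` keeps running sums, e.g. a
# correct-count and a total-count that are later divided into an accuracy.
#
#     metric = Accumulator(2)
#     metric.add(8, 10)   # 8 correct out of 10
#     metric.add(9, 10)   # 9 correct out of 10
#     print(metric[0] / metric[1])  # 0.85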
# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def train_epoch_ch3(net, train_iter, loss, updater):  #@save
    """Train a model within one epoch (defined in Chapter 3)."""
    # Sum of training loss, sum of training accuracy, no. of examples
    metric = Accumulator(3)
    if isinstance(updater, gluon.Trainer):
        updater = updater.step
    for X, y in train_iter:
        # Compute gradients and update parameters
        with autograd.record():
            y_hat = net(X)
            l = loss(y_hat, y)
        l.backward()
        updater(X.shape[0])
        metric.add(float(l.sum()), accuracy(y_hat, y), y.size)
    # Return training loss and training accuracy
    return metric[0] / metric[2], metric[1] / metric[2]
# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
class Animator:  #@save
    """For plotting data in animation."""
    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(3.5, 2.5)):
        # Incrementally plot multiple lines
        if legend is None:
            legend = []
        d2l.use_svg_display()
        self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            self.axes = [self.axes, ]
        # Use a lambda function to capture arguments
        self.config_axes = lambda: d2l.set_axes(
            self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
        self.X, self.Y, self.fmts = None, None, fmts

    def add(self, x, y):
        # Add multiple data points into the figure
        if not hasattr(y, "__len__"):
            y = [y]
        n = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * n
        if not self.X:
            self.X = [[] for _ in range(n)]
        if not self.Y:
            self.Y = [[] for _ in range(n)]
        for i, (a, b) in enumerate(zip(x, y)):
            if a is not None and b is not None:
                self.X[i].append(a)
                self.Y[i].append(b)
        self.axes[0].cla()
        for x, y, fmt in zip(self.X, self.Y, self.fmts):
            self.axes[0].plot(x, y, fmt)
        self.config_axes()
        display.display(self.fig)
        display.clear_output(wait=True)
# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater):  #@save
    """Train a model (defined in Chapter 3)."""
    animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9],
                        legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        train_metrics = train_epoch_ch3(net, train_iter, loss, updater)
        test_acc = evaluate_accuracy(net, test_iter)
        animator.add(epoch + 1, train_metrics + (test_acc,))
    train_loss, train_acc = train_metrics
    assert train_loss < 0.5, train_loss
    assert train_acc <= 1 and train_acc > 0.7, train_acc
    assert test_acc <= 1 and test_acc > 0.7, test_acc
# Defined in file: ./chapter_linear-networks/softmax-regression-scratch.md
def predict_ch3(net, test_iter, n=6):  #@save
    """Predict labels (defined in Chapter 3)."""
    for X, y in test_iter:
        break
    trues = d2l.get_fashion_mnist_labels(y)
    preds = d2l.get_fashion_mnist_labels(d2l.argmax(net(X), axis=1))
    titles = [true + '\n' + pred for true, pred in zip(trues, preds)]
    d2l.show_images(d2l.reshape(X[0:n], (n, 28, 28)), 1, n,
                    titles=titles[0:n])
# Defined in file: ./chapter_multilayer-perceptrons/underfit-overfit.md
def evaluate_loss(net, data_iter, loss):  #@save
    """Evaluate the loss of a model on the given dataset."""
    metric = d2l.Accumulator(2)  # Sum of losses, no. of examples
    for X, y in data_iter:
        l = loss(net(X), y)
        metric.add(d2l.reduce_sum(l), d2l.size(l))
    return metric[0] / metric[1]
# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file inserted into DATA_HUB, return the local filename."""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}."
    url, sha1_hash = DATA_HUB[name]
    d2l.mkdir_if_not_exist(cache_dir)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # Hit cache
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname
# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download_extract(name, folder=None):  #@save
    """Download and extract a zip/tar file."""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, 'Only zip/tar files can be extracted.'
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir
# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
def download_all():  #@save
    """Download all files in the DATA_HUB."""
    for name in DATA_HUB:
        download(name)
# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')


# Defined in file: ./chapter_multilayer-perceptrons/kaggle-house-price.md
DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')


# Defined in file: ./chapter_deep-learning-computation/use-gpu.md
def try_gpu(i=0):  #@save
    """Return gpu(i) if exists, otherwise return cpu()."""
    return npx.gpu(i) if npx.num_gpus() >= i + 1 else npx.cpu()
# Defined in file: ./chapter_deep-learning-computation/use-gpu.md
def try_all_gpus():  #@save
    """Return all available GPUs, or [cpu()] if no GPU exists."""
    devices = [npx.gpu(i) for i in range(npx.num_gpus())]
    return devices if devices else [npx.cpu()]
# Defined in file: ./chapter_convolutional-neural-networks/conv-layer.md
def corr2d(X, K):  #@save
    """Compute 2D cross-correlation."""
    h, w = K.shape
    Y = d2l.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1))
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            Y[i, j] = d2l.reduce_sum((X[i: i + h, j: j + w] * K))
    return Y
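# Worked example (illustrative only): cross-correlating a 3x3 input with a
# 2x2 kernel yields a 2x2 output.
#
#     X = np.array([[0., 1., 2.], [3., 4., 5.], [6., 7., 8.]])
#     K = np.array([[0., 1.], [2., 3.]])
#     corr2d(X, K)
#     # array([[19., 25.],
#     #        [37., 43.]])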
# Defined in file: ./chapter_convolutional-neural-networks/lenet.md
def evaluate_accuracy_gpu(net, data_iter, device=None):  #@save
    """Compute the accuracy for a model on a dataset using a GPU."""
    if not device:  # Query the first device where the first parameter is on
        device = list(net.collect_params().values())[0].list_ctx()[0]
    # No. of correct predictions, no. of predictions
    metric = d2l.Accumulator(2)
    for X, y in data_iter:
        X, y = X.as_in_ctx(device), y.as_in_ctx(device)
        metric.add(d2l.accuracy(net(X), y), d2l.size(y))
    return metric[0] / metric[1]
# Defined in file: ./chapter_convolutional-neural-networks/lenet.md
def train_ch6(net, train_iter, test_iter, num_epochs, lr,
              device=d2l.try_gpu()):
    """Train a model with a GPU (defined in Chapter 6)."""
    net.initialize(force_reinit=True, ctx=device, init=init.Xavier())
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': lr})
    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer = d2l.Timer()
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, no. of examples
        metric = d2l.Accumulator(3)
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            # Here is the major difference compared with `d2l.train_epoch_ch3`
            X, y = X.as_in_ctx(device), y.as_in_ctx(device)
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y)
            l.backward()
            trainer.step(X.shape[0])
            metric.add(l.sum(), d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_loss = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % 50 == 0:
                animator.add(epoch + i / len(train_iter),
                             (train_loss, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_loss:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
# Defined in file: ./chapter_convolutional-modern/resnet.md
class Residual(nn.Block):  #@save
    def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs):
        super().__init__(**kwargs)
        self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1,
                               strides=strides)
        self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1)
        if use_1x1conv:
            self.conv3 = nn.Conv2D(num_channels, kernel_size=1,
                                   strides=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm()
        self.bn2 = nn.BatchNorm()

    def forward(self, X):
        Y = npx.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return npx.relu(Y + X)
# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
                                '090b5e7e70c295757f55df93cb0a180b9691891a')


# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def read_time_machine():  #@save
    """Load the time machine book into a list of text lines."""
    with open(d2l.download('time_machine'), 'r') as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower()
            for line in lines]
# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def tokenize(lines, token='word'):  #@save
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)
# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
class Vocab:  #@save
    """Vocabulary for text."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort according to frequencies
        counter = count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[0])
        self.token_freqs.sort(key=lambda x: x[1], reverse=True)
        # The index for the unknown token is 0
        self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
        uniq_tokens += [token for token, freq in self.token_freqs
                        if freq >= min_freq and token not in uniq_tokens]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
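# Usage sketch (illustrative only): building a `Vocab` from tokenized lines
# and mapping tokens to indices; index 0 is reserved for '<unk>' and more
# frequent tokens receive smaller nonzero indices.
#
#     tokens = tokenize(['the time machine', 'the time traveller'])
#     vocab = Vocab(tokens)
#     vocab['the']                 # an int index
#     vocab[['time', 'machine']]   # a list of indices
#     vocab.to_tokens([1, 2])      # back to tokens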
# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def count_corpus(tokens):  #@save
    """Count token frequencies."""
    # Here `tokens` is a 1D list or 2D list
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)
# Defined in file: ./chapter_recurrent-neural-networks/text-preprocessing.md
def load_corpus_time_machine(max_tokens=-1):  #@save
    lines = read_time_machine()
    tokens = tokenize(lines, 'char')
    vocab = Vocab(tokens)
    # Since each text line in the time machine dataset is not necessarily a
    # sentence or a paragraph, flatten all the text lines into a single list
    corpus = [vocab[token] for line in tokens for token in line]
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
def seq_data_iter_random(corpus, batch_size, num_steps):  #@save
    # Start with a random offset to partition a sequence
    corpus = corpus[random.randint(0, num_steps):]
    # Subtract 1 since we need to account for labels
    num_subseqs = (len(corpus) - 1) // num_steps
    # The starting indices for subsequences of length `num_steps`
    initial_indices = list(range(0, num_subseqs * num_steps, num_steps))
    # In random sampling, the subsequences from two adjacent random
    # minibatches during iteration are not necessarily adjacent on the
    # original sequence
    random.shuffle(initial_indices)

    def data(pos):
        # Return a sequence of length `num_steps` starting from `pos`
        return corpus[pos: pos + num_steps]

    num_subseqs_per_example = num_subseqs // batch_size
    for i in range(0, batch_size * num_subseqs_per_example, batch_size):
        # Here, `initial_indices` contains randomized starting indices for
        # subsequences
        initial_indices_per_batch = initial_indices[i: i + batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j + 1) for j in initial_indices_per_batch]
        yield d2l.tensor(X), d2l.tensor(Y)


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
def seq_data_iter_sequential(corpus, batch_size, num_steps):  #@save
    # Start with a random offset to partition a sequence
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size
    Xs = d2l.tensor(corpus[offset: offset + num_tokens])
    Ys = d2l.tensor(corpus[offset + 1: offset + 1 + num_tokens])
    Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1)
    num_batches = Xs.shape[1] // num_steps
    for i in range(0, num_batches * num_steps, num_steps):
        X = Xs[:, i: i + num_steps]
        Y = Ys[:, i: i + num_steps]
        yield X, Y


# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
class SeqDataLoader:  #@save
    """An iterator to load sequence data."""
    def __init__(self, batch_size, num_steps, use_random_iter, max_tokens):
        if use_random_iter:
            self.data_iter_fn = d2l.seq_data_iter_random
        else:
            self.data_iter_fn = d2l.seq_data_iter_sequential
        self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens)
        self.batch_size, self.num_steps = batch_size, num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
# Defined in file: ./chapter_recurrent-neural-networks/language-models-and-dataset.md
def load_data_time_machine(batch_size, num_steps,  #@save
                           use_random_iter=False, max_tokens=10000):
    data_iter = SeqDataLoader(
        batch_size, num_steps, use_random_iter, max_tokens)
    return data_iter, data_iter.vocab


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
class RNNModelScratch:  #@save
    """An RNN Model implemented from scratch."""
    def __init__(self, vocab_size, num_hiddens, device, get_params,
                 init_state, forward_fn):
        self.vocab_size, self.num_hiddens = vocab_size, num_hiddens
        self.params = get_params(vocab_size, num_hiddens, device)
        self.init_state, self.forward_fn = init_state, forward_fn

    def __call__(self, X, state):
        X = npx.one_hot(X.T, self.vocab_size)
        return self.forward_fn(X, state, self.params)

    def begin_state(self, batch_size, ctx):
        return self.init_state(batch_size, self.num_hiddens, ctx)
# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def predict_ch8(prefix, num_preds, model, vocab, device):  #@save
    state = model.begin_state(batch_size=1, ctx=device)
    outputs = [vocab[prefix[0]]]
    get_input = lambda: d2l.reshape(
        d2l.tensor([outputs[-1]], ctx=device), (1, 1))
    for y in prefix[1:]:  # Warm-up period
        _, state = model(get_input(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):  # Predict `num_preds` steps
        y, state = model(get_input(), state)
        outputs.append(int(y.argmax(axis=1).reshape(1)))
    return ''.join([vocab.idx_to_token[i] for i in outputs])


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def grad_clipping(model, theta):  #@save
    if isinstance(model, gluon.Block):
        params = [p.data() for p in model.collect_params().values()]
    else:
        params = model.params
    norm = math.sqrt(sum((p.grad ** 2).sum() for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm


# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def train_epoch_ch8(model, train_iter, loss, updater, device,  #@save
                    use_random_iter):
    """Train a model within one epoch (defined in Chapter 8)."""
    state, timer = None, d2l.Timer()
    metric = d2l.Accumulator(2)  # Sum of training loss, no. of tokens
    for X, Y in train_iter:
        if state is None or use_random_iter:
            # Initialize `state` when either it is the first iteration or
            # using random sampling
            state = model.begin_state(batch_size=X.shape[0], ctx=device)
        else:
            for s in state:
                s.detach()
        y = Y.T.reshape(-1)
        X, y = X.as_in_ctx(device), y.as_in_ctx(device)
        with autograd.record():
            y_hat, state = model(X, state)
            l = loss(y_hat, y).mean()
        l.backward()
        grad_clipping(model, 1)
        updater(batch_size=1)  # Since the `mean` function has been invoked
        metric.add(l * d2l.size(y), d2l.size(y))
    return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()
# Defined in file: ./chapter_recurrent-neural-networks/rnn-scratch.md
def train_ch8(model, train_iter, vocab, lr, num_epochs, device,  #@save
              use_random_iter=False):
    """Train a model (defined in Chapter 8)."""
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    animator = d2l.Animator(xlabel='epoch', ylabel='perplexity',
                            legend=['train'], xlim=[1, num_epochs])
    # Initialize
    if isinstance(model, gluon.Block):
        model.initialize(ctx=device, force_reinit=True,
                         init=init.Normal(0.01))
        trainer = gluon.Trainer(model.collect_params(), 'sgd',
                                {'learning_rate': lr})
        updater = lambda batch_size: trainer.step(batch_size)
    else:
        updater = lambda batch_size: d2l.sgd(model.params, lr, batch_size)
    predict = lambda prefix: predict_ch8(prefix, 50, model, vocab, device)
    # Train and predict
    for epoch in range(num_epochs):
        ppl, speed = train_epoch_ch8(
            model, train_iter, loss, updater, device, use_random_iter)
        if epoch % 10 == 0:
            print(predict('time traveller'))
            animator.add(epoch + 1, [ppl])
    print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}')
    print(predict('time traveller'))
    print(predict('traveller'))
# Defined in file: ./chapter_recurrent-neural-networks/rnn-concise.md
class RNNModel(nn.Block):
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.dense = nn.Dense(vocab_size)

    def forward(self, inputs, state):
        X = npx.one_hot(inputs.T, self.vocab_size)
        Y, state = self.rnn(X, state)
        # The fully connected layer will first change the shape of `Y` to
        # (`num_steps` * `batch_size`, `num_hiddens`). Its output shape is
        # (`num_steps` * `batch_size`, `vocab_size`).
        output = self.dense(Y.reshape(-1, Y.shape[-1]))
        return output, state

    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs)
# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip',
                           '94646ad1522d915e7b0f9296181140edcf86a4f5')


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def read_data_nmt():
    data_dir = d2l.download_extract('fra-eng')
    with open(os.path.join(data_dir, 'fra.txt'), 'r') as f:
        return f.read()


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def preprocess_nmt(text):
    def no_space(char, prev_char):
        return char in set(',.!') and prev_char != ' '

    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    out = [' ' + char if i > 0 and no_space(char, text[i-1]) else char
           for i, char in enumerate(text)]
    return ''.join(out)


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def tokenize_nmt(text, num_examples=None):
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def truncate_pad(line, num_steps, padding_token):
    if len(line) > num_steps:
        return line[:num_steps]  # Trim
    return line + [padding_token] * (num_steps - len(line))  # Pad


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def build_array(lines, vocab, num_steps, is_source):
    lines = [vocab[l] for l in lines]
    if not is_source:
        lines = [[vocab['<bos>']] + l + [vocab['<eos>']] for l in lines]
    array = np.array([truncate_pad(
        l, num_steps, vocab['<pad>']) for l in lines])
    valid_len = (array != vocab['<pad>']).sum(axis=1)
    return array, valid_len


# Defined in file: ./chapter_recurrent-modern/machine-translation-and-dataset.md
def load_data_nmt(batch_size, num_steps, num_examples=1000):
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = d2l.Vocab(source, min_freq=3,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = d2l.Vocab(target, min_freq=3,
                          reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array(
        source, src_vocab, num_steps, True)
    tgt_array, tgt_valid_len = build_array(
        target, tgt_vocab, num_steps, False)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    data_iter = d2l.load_array(data_arrays, batch_size)
    return src_vocab, tgt_vocab, data_iter


# Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
class Encoder(nn.Block):
    """The base encoder interface for the encoder-decoder architecture."""
    def __init__(self, **kwargs):
        super(Encoder, self).__init__(**kwargs)

    def forward(self, X, *args):
        raise NotImplementedError
# Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
class Decoder(nn.Block):
    """The base decoder interface for the encoder-decoder architecture."""
    def __init__(self, **kwargs):
        super(Decoder, self).__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        raise NotImplementedError

    def forward(self, X, state):
        raise NotImplementedError
# Defined in file: ./chapter_recurrent-modern/encoder-decoder.md
class EncoderDecoder(nn.Block):
    """The base class for the encoder-decoder architecture."""
    def __init__(self, encoder, decoder, **kwargs):
        super(EncoderDecoder, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        enc_outputs = self.encoder(enc_X, *args)
        dec_state = self.decoder.init_state(enc_outputs, *args)
        return self.decoder(dec_X, dec_state)
# Defined in file: ./chapter_recurrent-modern/seq2seq.md
class Seq2SeqEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.LSTM(num_hiddens, num_layers, dropout=dropout)

    def forward(self, X, *args):
        # `X` shape: (`batch_size`, `seq_len`, `embed_size`)
        X = self.embedding(X)
        # RNN needs first axes to be time step, i.e., `seq_len`
        X = X.swapaxes(0, 1)
        state = self.rnn.begin_state(batch_size=X.shape[1], ctx=X.ctx)
        out, state = self.rnn(X, state)
        # `out` shape: (`seq_len`, `batch_size`, `num_hiddens`)
        # `state` shape: (`num_layers`, `batch_size`, `num_hiddens`),
        # where "state" contains the hidden state and the memory cell
        return out, state
# Defined in file: ./chapter_recurrent-modern/seq2seq.md
class Seq2SeqDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqDecoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = rnn.LSTM(num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Dense(vocab_size, flatten=False)

    def init_state(self, enc_outputs, *args):
        return enc_outputs[1]

    def forward(self, X, state):
        X = self.embedding(X).swapaxes(0, 1)
        out, state = self.rnn(X, state)
        # Make the batch to be the first dimension to simplify loss
        # computation
        out = self.dense(out).swapaxes(0, 1)
        return out, state
# Defined in file: ./chapter_recurrent-modern/seq2seq.md
class MaskedSoftmaxCELoss(gluon.loss.SoftmaxCELoss):
    # `pred` shape: (`batch_size`, `seq_len`, `vocab_size`)
    # `label` shape: (`batch_size`, `seq_len`)
    # `valid_len` shape: (`batch_size`, )
    def forward(self, pred, label, valid_len):
        # weights shape: (batch_size, seq_len, 1)
        weights = np.expand_dims(np.ones_like(label), axis=-1)
        weights = npx.sequence_mask(weights, valid_len, True, axis=1)
        return super(MaskedSoftmaxCELoss, self).forward(pred, label, weights)
# Defined in file: ./chapter_recurrent-modern/seq2seq.md
def train_s2s_ch9(model, data_iter, lr, num_epochs, device):
    model.initialize(init.Xavier(), force_reinit=True, ctx=device)
    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': lr})
    loss = MaskedSoftmaxCELoss()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[1, num_epochs], ylim=[0, 0.25])
    for epoch in range(1, num_epochs + 1):
        timer = d2l.Timer()
        metric = d2l.Accumulator(2)  # loss_sum, num_tokens
        for batch in data_iter:
            X, X_vlen, Y, Y_vlen = [x.as_in_ctx(device) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen - 1
            with autograd.record():
                Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
                l = loss(Y_hat, Y_label, Y_vlen)
            l.backward()
            d2l.grad_clipping(model, 1)
            num_tokens = Y_vlen.sum()
            trainer.step(num_tokens)
            metric.add(l.sum(), num_tokens)
        if epoch % 10 == 0:
            animator.add(epoch, (metric[0] / metric[1],))
    print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} '
          f'tokens/sec on {str(device)}')


# Defined in file: ./chapter_recurrent-modern/seq2seq.md
def predict_s2s_ch9(model, src_sentence, src_vocab, tgt_vocab, num_steps,
                    device):
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    enc_valid_len = np.array([len(src_tokens)], ctx=device)
    src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    enc_X = np.array(src_tokens, ctx=device)
    # Add the batch size dimension
    enc_outputs = model.encoder(np.expand_dims(enc_X, axis=0),
                                enc_valid_len)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_len)
    dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=device),
                           axis=0)
    predict_tokens = []
    for _ in range(num_steps):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with highest score is used as the next time step input
        dec_X = Y.argmax(axis=2)
        py = dec_X.squeeze(axis=0).astype('int32').item()
        if py == tgt_vocab['<eos>']:
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))


# Defined in file: ./chapter_attention-mechanisms/attention.md
def masked_softmax(X, valid_len):
    """Perform softmax by filtering out some elements."""
    # X: 3-D tensor, valid_len: 1-D or 2-D tensor
    if valid_len is None:
        return npx.softmax(X)
    else:
        shape = X.shape
        if valid_len.ndim == 1:
            valid_len = valid_len.repeat(shape[1], axis=0)
        else:
            valid_len = valid_len.reshape(-1)
        # Fill masked elements with a large negative, whose exp is 0
        X = npx.sequence_mask(X.reshape(-1, shape[-1]), valid_len, True,
                              axis=1, value=-1e6)
        return npx.softmax(X).reshape(shape)
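# Usage sketch (illustrative only): with `valid_len = np.array([2, 3])`, the
# softmax over each row of a (2, 2, 4) score tensor covers only the first 2
# entries for the first example and the first 3 for the second; masked
# positions get (near) zero probability.
#
#     masked_softmax(np.random.uniform(size=(2, 2, 4)), np.array([2, 3]))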
# Defined in file: ./chapter_attention-mechanisms/attention.md
class DotProductAttention(nn.Block):
    def __init__(self, dropout, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # `query`: (`batch_size`, #queries, `d`)
    # `key`: (`batch_size`, #kv_pairs, `d`)
    # `value`: (`batch_size`, #kv_pairs, `dim_v`)
    # `valid_len`: either (`batch_size`, ) or (`batch_size`, xx)
    def forward(self, query, key, value, valid_len=None):
        d = query.shape[-1]
        # Set transpose_b=True to swap the last two dimensions of key
        scores = npx.batch_dot(query, key, transpose_b=True) / math.sqrt(d)
        attention_weights = self.dropout(masked_softmax(scores, valid_len))
        return npx.batch_dot(attention_weights, value)
# Defined in file: ./chapter_attention-mechanisms/attention.md
class MLPAttention(nn.Block):
    def __init__(self, units, dropout, **kwargs):
        super(MLPAttention, self).__init__(**kwargs)
        # Use flatten=False to keep query's and key's 3-D shapes
        self.W_k = nn.Dense(units, use_bias=False, flatten=False)
        self.W_q = nn.Dense(units, use_bias=False, flatten=False)
        self.v = nn.Dense(1, use_bias=False, flatten=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_len):
        query, key = self.W_q(query), self.W_k(key)
        # Expand query to (`batch_size`, #queries, 1, units), and key to
        # (`batch_size`, 1, #kv_pairs, units). Then plus them with broadcast
        features = np.expand_dims(query, axis=2) + np.expand_dims(key, axis=1)
        features = np.tanh(features)
        scores = np.squeeze(self.v(features), axis=-1)
        attention_weights = self.dropout(masked_softmax(scores, valid_len))
        return npx.batch_dot(attention_weights, value)
# Defined in file: ./chapter_attention-mechanisms/transformer.md
class MultiHeadAttention(nn.Block):
    def __init__(self, num_hiddens, num_heads, dropout, use_bias=False,
                 **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = d2l.DotProductAttention(dropout)
        self.W_q = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
        self.W_k = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
        self.W_v = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
        self.W_o = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)

    def forward(self, query, key, value, valid_len):
        # For self-attention, `query`, `key`, and `value` shape:
        # (`batch_size`, `seq_len`, `dim`), where `seq_len` is the length of
        # input sequence. `valid_len` shape is either (`batch_size`, ) or
        # (`batch_size`, `seq_len`).

        # Project and transpose `query`, `key`, and `value` from
        # (`batch_size`, `seq_len`, `num_hiddens`) to
        # (`batch_size` * `num_heads`, `seq_len`, `num_hiddens` / `num_heads`)
        query = transpose_qkv(self.W_q(query), self.num_heads)
        key = transpose_qkv(self.W_k(key), self.num_heads)
        value = transpose_qkv(self.W_v(value), self.num_heads)

        if valid_len is not None:
            # Copy `valid_len` by `num_heads` times
            if valid_len.ndim == 1:
                valid_len = np.tile(valid_len, self.num_heads)
            else:
                valid_len = np.tile(valid_len, (self.num_heads, 1))

        # For self-attention, `output` shape:
        # (`batch_size` * `num_heads`, `seq_len`, `num_hiddens` / `num_heads`)
        output = self.attention(query, key, value, valid_len)

        # `output_concat` shape: (`batch_size`, `seq_len`, `num_hiddens`)
        output_concat = transpose_output(output, self.num_heads)
        return self.W_o(output_concat)
# Defined in file: ./chapter_attention-mechanisms/transformer.md
def transpose_qkv(X, num_heads):
    # Input `X` shape: (`batch_size`, `seq_len`, `num_hiddens`).
    # Output `X` shape:
    # (`batch_size`, `seq_len`, `num_heads`, `num_hiddens` / `num_heads`)
    X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)

    # `X` shape:
    # (`batch_size`, `num_heads`, `seq_len`, `num_hiddens` / `num_heads`)
    X = X.transpose(0, 2, 1, 3)

    # `output` shape:
    # (`batch_size` * `num_heads`, `seq_len`, `num_hiddens` / `num_heads`)
    output = X.reshape(-1, X.shape[2], X.shape[3])
    return output


# Defined in file: ./chapter_attention-mechanisms/transformer.md
def transpose_output(X, num_heads):
    # A reversed version of `transpose_qkv`
    X = X.reshape(-1, num_heads, X.shape[1], X.shape[2])
    X = X.transpose(0, 2, 1, 3)
    return X.reshape(X.shape[0], X.shape[1], -1)


# Defined in file: ./chapter_attention-mechanisms/transformer.md
class PositionWiseFFN(nn.Block):
    def __init__(self, ffn_num_hiddens, pw_num_outputs, **kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.dense1 = nn.Dense(ffn_num_hiddens, flatten=False,
                               activation='relu')
        self.dense2 = nn.Dense(pw_num_outputs, flatten=False)

    def forward(self, X):
        return self.dense2(self.dense1(X))
# Defined in file: ./chapter_attention-mechanisms/transformer.md
class AddNorm(nn.Block):
    def __init__(self, dropout, **kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm()

    def forward(self, X, Y):
        return self.ln(self.dropout(Y) + X)
# Defined in file: ./chapter_attention-mechanisms/transformer.md
class PositionalEncoding(nn.Block):
    def __init__(self, num_hiddens, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # Create a long enough `P`
        self.P = np.zeros((1, max_len, num_hiddens))
        X = np.arange(0, max_len).reshape(-1, 1) / np.power(
            10000, np.arange(0, num_hiddens, 2) / num_hiddens)
        self.P[:, :, 0::2] = np.sin(X)
        self.P[:, :, 1::2] = np.cos(X)

    def forward(self, X):
        X = X + self.P[:, :X.shape[1], :].as_in_ctx(X.ctx)
        return self.dropout(X)
# Defined in file: ./chapter_attention-mechanisms/transformer.md
class EncoderBlock(nn.Block):
    def __init__(self, num_hiddens, ffn_num_hiddens, num_heads, dropout,
                 use_bias=False, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = MultiHeadAttention(num_hiddens, num_heads, dropout,
                                            use_bias)
        self.addnorm1 = AddNorm(dropout)
        self.ffn = PositionWiseFFN(ffn_num_hiddens, num_hiddens)
        self.addnorm2 = AddNorm(dropout)

    def forward(self, X, valid_len):
        Y = self.addnorm1(X, self.attention(X, X, X, valid_len))
        return self.addnorm2(Y, self.ffn(Y))
# Defined in file: ./chapter_attention-mechanisms/transformer.md
class TransformerEncoder(d2l.Encoder):
    def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens,
                 num_heads, num_layers, dropout, use_bias=False, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.embedding = nn.Embedding(vocab_size, num_hiddens)
        self.pos_encoding = PositionalEncoding(num_hiddens, dropout)
        self.blks = nn.Sequential()
        for _ in range(num_layers):
            self.blks.add(
                EncoderBlock(num_hiddens, ffn_num_hiddens, num_heads,
                             dropout, use_bias))

    def forward(self, X, valid_len, *args):
        X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens))
        for blk in self.blks:
            X = blk(X, valid_len)
        return X
# Defined in file: ./chapter_optimization/optimization-intro.md
def annotate(text, xy, xytext):  #@save
    d2l.plt.gca().annotate(text, xy=xy, xytext=xytext,
                           arrowprops=dict(arrowstyle='->'))


# Defined in file: ./chapter_optimization/gd.md
def train_2d(trainer, steps=20):  #@save
    """Optimize a 2-dim objective function with a customized trainer."""
    # s1 and s2 are internal state variables and will
    # be used later in the chapter
    x1, x2, s1, s2 = -5, -2, 0, 0
    results = [(x1, x2)]
    for i in range(steps):
        x1, x2, s1, s2 = trainer(x1, x2, s1, s2)
        results.append((x1, x2))
    return results
# Defined in file: ./chapter_optimization/gd.md
def show_trace_2d(f, results):  #@save
    """Show the trace of 2D variables during optimization."""
    d2l.set_figsize()
    d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e')
    x1, x2 = d2l.meshgrid(d2l.arange(-5.5, 1.0, 0.1),
                          d2l.arange(-3.0, 1.0, 0.1))
    d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4')
    d2l.plt.xlabel('x1')
    d2l.plt.ylabel('x2')
# Defined in file: ./chapter_optimization/minibatch-sgd.md
d2l.DATA_HUB['airfoil'] = (d2l.DATA_URL + 'airfoil_self_noise.dat',
                           '76e5be1548fd8222e5074cf0faae75edff8cf93f')


# Defined in file: ./chapter_optimization/minibatch-sgd.md
def get_data_ch11(batch_size=10, n=1500):
    data = np.genfromtxt(d2l.download('airfoil'),
                         dtype=np.float32, delimiter='\t')
    data = (data - data.mean(axis=0)) / data.std(axis=0)
    data_iter = d2l.load_array(
        (data[:n, :-1], data[:n, -1]), batch_size, is_train=True)
    return data_iter, data.shape[1] - 1


# Defined in file: ./chapter_optimization/minibatch-sgd.md
def train_ch11(trainer_fn, states, hyperparams, data_iter,
               feature_dim, num_epochs=2):
    # Initialization
    w = np.random.normal(scale=0.01, size=(feature_dim, 1))
    b = np.zeros(1)
    w.attach_grad()
    b.attach_grad()
    net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss
    # Train
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[0, num_epochs], ylim=[0.22, 0.35])
    n, timer = 0, d2l.Timer()
    for _ in range(num_epochs):
        for X, y in data_iter:
            with autograd.record():
                l = loss(net(X), y).mean()
            l.backward()
            trainer_fn([w, b], states, hyperparams)
            n += X.shape[0]
            if n % 200 == 0:
                timer.stop()
                animator.add(n/X.shape[0]/len(data_iter),
                             (d2l.evaluate_loss(net, data_iter, loss),))
                timer.start()
    print(f'loss: {animator.Y[0][-1]:.3f}, {timer.avg():.3f} sec/epoch')
    return timer.cumsum(), animator.Y[0]


# Defined in file: ./chapter_optimization/minibatch-sgd.md
def train_concise_ch11(tr_name, hyperparams, data_iter, num_epochs=2):
    # Initialization
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize(init.Normal(sigma=0.01))
    trainer = gluon.Trainer(net.collect_params(), tr_name, hyperparams)
    loss = gluon.loss.L2Loss()
    animator = d2l.Animator(xlabel='epoch', ylabel='loss',
                            xlim=[0, num_epochs], ylim=[0.22, 0.35])
    n, timer = 0, d2l.Timer()
    for _ in range(num_epochs):
        for X, y in data_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(X.shape[0])
            n += X.shape[0]
            if n % 200 == 0:
                timer.stop()
                animator.add(n/X.shape[0]/len(data_iter),
                             (d2l.evaluate_loss(net, data_iter, loss),))
                timer.start()
    print(f'loss: {animator.Y[0][-1]:.3f}, {timer.avg():.3f} sec/epoch')


# Defined in file: ./chapter_computational-performance/hybridize.md
class Benchmark:
    def __init__(self, description='Done'):
        self.description = description

    def __enter__(self):
        self.timer = d2l.Timer()
        return self

    def __exit__(self, *args):
        print(f'{self.description}: {self.timer.stop():.4f} sec')


# Defined in file: ./chapter_computational-performance/multiple-gpus.md
def split_batch(X, y, devices):
    """Split `X` and `y` into multiple devices."""
    assert X.shape[0] == y.shape[0]
    return (gluon.utils.split_and_load(X, devices),
            gluon.utils.split_and_load(y, devices))
# Defined in file: ./chapter_computational-performance/multiple-gpus-concise.md
def resnet18(num_classes):
    """A slightly modified ResNet-18 model."""
    def resnet_block(num_channels, num_residuals, first_block=False):
        blk = nn.Sequential()
        for i in range(num_residuals):
            if i == 0 and not first_block:
                blk.add(d2l.Residual(
                    num_channels, use_1x1conv=True, strides=2))
            else:
                blk.add(d2l.Residual(num_channels))
        return blk

    net = nn.Sequential()
    # This model uses a smaller convolution kernel, stride, and padding and
    # removes the maximum pooling layer
    net.add(nn.Conv2D(64, kernel_size=3, strides=1, padding=1),
            nn.BatchNorm(), nn.Activation('relu'))
    net.add(resnet_block(64, 2, first_block=True),
            resnet_block(128, 2),
            resnet_block(256, 2),
            resnet_block(512, 2))
    net.add(nn.GlobalAvgPool2D(), nn.Dense(num_classes))
    return net
# Defined in file: ./chapter_computational-performance/multiple-gpus-concise.md
def evaluate_accuracy_gpus(net, data_iter, split_f=d2l.split_batch):
    # Query the list of devices
    devices = list(net.collect_params().values())[0].list_ctx()
    metric = d2l.Accumulator(2)  # num_corrected_examples, num_examples
    for features, labels in data_iter:
        X_shards, y_shards = split_f(features, labels, devices)
        # Run in parallel
        pred_shards = [net(X_shard) for X_shard in X_shards]
        metric.add(sum(float(d2l.accuracy(pred_shard, y_shard)) for
                       pred_shard, y_shard in zip(
                           pred_shards, y_shards)), labels.size)
    return metric[0] / metric[1]


# Defined in file: ./chapter_computer-vision/image-augmentation.md
def train_batch_ch13(net, features, labels, loss, trainer, devices,
                     split_f=d2l.split_batch):
    X_shards, y_shards = split_f(features, labels, devices)
    with autograd.record():
        pred_shards = [net(X_shard) for X_shard in X_shards]
        ls = [loss(pred_shard, y_shard) for pred_shard, y_shard
              in zip(pred_shards, y_shards)]
    for l in ls:
        l.backward()
    # The True flag allows parameters with stale gradients, which is useful
    # later (e.g., in fine-tuning BERT)
    trainer.step(labels.shape[0], ignore_stale_grad=True)
    train_loss_sum = sum([float(l.sum()) for l in ls])
    train_acc_sum = sum(d2l.accuracy(pred_shard, y_shard)
                        for pred_shard, y_shard in zip(pred_shards, y_shards))
    return train_loss_sum, train_acc_sum


# Defined in file: ./chapter_computer-vision/image-augmentation.md
def train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs,
               devices=d2l.try_all_gpus(), split_f=d2l.split_batch):
    num_batches, timer = len(train_iter), d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[0, num_epochs], ylim=[0, 1],
                            legend=['train loss', 'train acc', 'test acc'])
    for epoch in range(num_epochs):
        # Store training_loss, training_accuracy, num_examples, num_features
        metric = d2l.Accumulator(4)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            l, acc = train_batch_ch13(
                net, features, labels, loss, trainer, devices, split_f)
            metric.add(l, acc, labels.shape[0], labels.size)
            timer.stop()
            if (i + 1) % (num_batches // 5) == 0:
                animator.add(epoch + i / num_batches,
                             (metric[0] / metric[2], metric[1] / metric[3],
                              None))
        test_acc = d2l.evaluate_accuracy_gpus(net, test_iter, split_f)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {metric[0] / metric[2]:.3f}, train acc '
          f'{metric[1] / metric[3]:.3f}, test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec on '
          f'{str(devices)}')


# Defined in file: ./chapter_computer-vision/fine-tuning.md
d2l.DATA_HUB['hotdog'] = (d2l.DATA_URL + 'hotdog.zip',
                          'fba480ffa8aa7e0febbb511d181409f899b9baa5')


# Defined in file: ./chapter_computer-vision/bounding-box.md
def bbox_to_rect(bbox, color):
    """Convert bounding box to matplotlib format."""
    # Convert the bounding box (top-left x, top-left y, bottom-right x,
    # bottom-right y) format to matplotlib format: ((upper-left x,
    # upper-left y), width, height)
    return d2l.plt.Rectangle(
        xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1],
        fill=False, edgecolor=color, linewidth=2)
# Defined in file: ./chapter_computer-vision/anchor.md
def show_bboxes(axes, bboxes, labels=None, colors=None):
    """Show bounding boxes."""
    def _make_list(obj, default_values=None):
        if obj is None:
            obj = default_values
        elif not isinstance(obj, (list, tuple)):
            obj = [obj]
        return obj

    labels = _make_list(labels)
    colors = _make_list(colors, ['b', 'g', 'r', 'm', 'c'])
    for i, bbox in enumerate(bboxes):
        color = colors[i % len(colors)]
        rect = d2l.bbox_to_rect(bbox.asnumpy(), color)
        axes.add_patch(rect)
        if labels and len(labels) > i:
            text_color = 'k' if color == 'w' else 'w'
            axes.text(rect.xy[0], rect.xy[1], labels[i],
                      va='center', ha='center', fontsize=9, color=text_color,
                      bbox=dict(facecolor=color, lw=0))
# Defined in file: ./chapter_computer-vision/object-detection-dataset.md
d2l.DATA_HUB['bananas'] = (d2l.DATA_URL + 'bananas.zip',
                           'aadfd1c4c5d7178616799dd1801c9a234ccdaf19')


# Defined in file: ./chapter_computer-vision/object-detection-dataset.md
def load_data_bananas(batch_size, edge_size=256):
    """Load the bananas dataset."""
    data_dir = d2l.download_extract('bananas')
    train_iter = image.ImageDetIter(
        path_imgrec=os.path.join(data_dir, 'train.rec'),
        path_imgidx=os.path.join(data_dir, 'train.idx'),
        batch_size=batch_size,
        data_shape=(3, edge_size, edge_size),  # The shape of the output image
        shuffle=True,  # Read the dataset in random order
        rand_crop=1,  # The probability of random cropping is 1
        min_object_covered=0.95, max_attempts=200)
    val_iter = image.ImageDetIter(
        path_imgrec=os.path.join(data_dir, 'val.rec'), batch_size=batch_size,
        data_shape=(3, edge_size, edge_size), shuffle=False)
    return train_iter, val_iter
# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
d2l.DATA_HUB['voc2012'] = (d2l.DATA_URL + 'VOCtrainval_11-May-2012.tar',
                           '4e443f8a2eca6b1dac8a6c57641b67dd40621a49')


# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
def read_voc_images(voc_dir, is_train=True):
    """Read all VOC feature and label images."""
    txt_fname = os.path.join(voc_dir, 'ImageSets', 'Segmentation',
                             'train.txt' if is_train else 'val.txt')
    with open(txt_fname, 'r') as f:
        images = f.read().split()
    features, labels = [], []
    for i, fname in enumerate(images):
        features.append(image.imread(os.path.join(
            voc_dir, 'JPEGImages', f'{fname}.jpg')))
        labels.append(image.imread(os.path.join(
            voc_dir, 'SegmentationClass', f'{fname}.png')))
    return features, labels
# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
VOC_COLORMAP = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0],
                [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128],
                [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0],
                [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128],
                [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0],
                [0, 64, 128]]


# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
VOC_CLASSES = ['background', 'aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat', 'chair', 'cow',
               'diningtable', 'dog', 'horse', 'motorbike', 'person',
               'potted plant', 'sheep', 'sofa', 'train', 'tv/monitor']


# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
def build_colormap2label():
    """Build an RGB color to label mapping for segmentation."""
    colormap2label = np.zeros(256 ** 3)
    for i, colormap in enumerate(VOC_COLORMAP):
        colormap2label[(colormap[0]*256 + colormap[1])*256 + colormap[2]] = i
    return colormap2label
# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
def voc_label_indices(colormap, colormap2label):
    """Map an RGB color to a label."""
    colormap = colormap.astype(np.int32)
    idx = ((colormap[:, :, 0] * 256 + colormap[:, :, 1]) * 256
           + colormap[:, :, 2])
    return colormap2label[idx]
# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
def voc_rand_crop(feature, label, height, width):
    """Randomly crop for both feature and label images."""
    feature, rect = image.random_crop(feature, (width, height))
    label = image.fixed_crop(label, *rect)
    return feature, label
# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
class VOCSegDataset(gluon.data.Dataset):
    """A customized dataset to load VOC dataset."""
    def __init__(self, is_train, crop_size, voc_dir):
        self.rgb_mean = np.array([0.485, 0.456, 0.406])
        self.rgb_std = np.array([0.229, 0.224, 0.225])
        self.crop_size = crop_size
        features, labels = read_voc_images(voc_dir, is_train=is_train)
        self.features = [self.normalize_image(feature)
                         for feature in self.filter(features)]
        self.labels = self.filter(labels)
        self.colormap2label = build_colormap2label()
        print('read ' + str(len(self.features)) + ' examples')

    def normalize_image(self, img):
        return (img.astype('float32') / 255 - self.rgb_mean) / self.rgb_std

    def filter(self, imgs):
        return [img for img in imgs if (
            img.shape[0] >= self.crop_size[0] and
            img.shape[1] >= self.crop_size[1])]

    def __getitem__(self, idx):
        feature, label = voc_rand_crop(self.features[idx], self.labels[idx],
                                       *self.crop_size)
        return (feature.transpose(2, 0, 1),
                voc_label_indices(label, self.colormap2label))

    def __len__(self):
        return len(self.features)
# Defined in file: ./chapter_computer-vision/semantic-segmentation-and-dataset.md
def load_data_voc(batch_size, crop_size):
    """Download and load the VOC2012 semantic dataset."""
    voc_dir = d2l.download_extract('voc2012', os.path.join(
        'VOCdevkit', 'VOC2012'))
    num_workers = d2l.get_dataloader_workers()
    train_iter = gluon.data.DataLoader(
        VOCSegDataset(True, crop_size, voc_dir), batch_size,
        shuffle=True, last_batch='discard', num_workers=num_workers)
    test_iter = gluon.data.DataLoader(
        VOCSegDataset(False, crop_size, voc_dir), batch_size,
        last_batch='discard', num_workers=num_workers)
    return train_iter, test_iter
# Defined in file: ./chapter_computer-vision/kaggle-cifar10.md
d2l.DATA_HUB['cifar10_tiny'] = (d2l.DATA_URL + 'kaggle_cifar10_tiny.zip',
                                '2068874e4b9a9f0fb07ebe0ad2b29754449ccacd')


# Defined in file: ./chapter_computer-vision/kaggle-cifar10.md
def read_csv_labels(fname):
    """Read fname to return a name to label dictionary."""
    with open(fname, 'r') as f:
        # Skip the file header line (column name)
        lines = f.readlines()[1:]
    tokens = [l.rstrip().split(',') for l in lines]
    return dict(((name, label) for name, label in tokens))
# Defined in file: ./chapter_computer-vision/kaggle-cifar10.md
def copyfile(filename, target_dir):
    """Copy a file into a target directory."""
    d2l.mkdir_if_not_exist(target_dir)
    shutil.copy(filename, target_dir)
# Defined in file: ./chapter_computer-vision/kaggle-cifar10.md
def reorg_train_valid(data_dir, labels, valid_ratio):
    # The number of examples of the class with the least examples in the
    # training dataset
    n = collections.Counter(labels.values()).most_common()[-1][1]
    # The number of examples per class for the validation set
    n_valid_per_label = max(1, math.floor(n * valid_ratio))
    label_count = {}
    for train_file in os.listdir(os.path.join(data_dir, 'train')):
        label = labels[train_file.split('.')[0]]
        fname = os.path.join(data_dir, 'train', train_file)
        # Copy to train_valid_test/train_valid with a subfolder per class
        copyfile(fname, os.path.join(data_dir, 'train_valid_test',
                                     'train_valid', label))
        if label not in label_count or label_count[label] < n_valid_per_label:
            # Copy to train_valid_test/valid
            copyfile(fname, os.path.join(data_dir, 'train_valid_test',
                                         'valid', label))
            label_count[label] = label_count.get(label, 0) + 1
        else:
            # Copy to train_valid_test/train
            copyfile(fname, os.path.join(data_dir, 'train_valid_test',
                                         'train', label))
    return n_valid_per_label


# Defined in file: ./chapter_computer-vision/kaggle-cifar10.md
def reorg_test(data_dir):
    for test_file in os.listdir(os.path.join(data_dir, 'test')):
        copyfile(os.path.join(data_dir, 'test', test_file),
                 os.path.join(data_dir, 'train_valid_test', 'test',
                              'unknown'))


# Defined in file: ./chapter_computer-vision/kaggle-dog.md
d2l.DATA_HUB['dog_tiny'] = (d2l.DATA_URL + 'kaggle_dog_tiny.zip',
                            '0cb91d09b814ecdc07b50f31f8dcad3e81d6a86d')


# Defined in file: ./chapter_natural-language-processing-pretraining/word-embedding-dataset.md
d2l.DATA_HUB['ptb'] = (d2l.DATA_URL + 'ptb.zip',
                       '319d85e578af0cdc590547f26231e4e31cdf1e42')


# Defined in file: ./chapter_natural-language-processing-pretraining/word-embedding-dataset.md
def read_ptb():
    data_dir = d2l.download_extract('ptb')
    with open(os.path.join(data_dir, 'ptb.train.txt')) as f:
        raw_text = f.read()
    return [line.split() for line in raw_text.split('\n')]


# Defined in file: ./chapter_natural-language-processing-pretraining/word-embedding-dataset.md
def subsampling(sentences, vocab):
    # Map low frequency words into <unk>
    sentences = [[vocab.idx_to_token[vocab[tk]] for tk in line]
                 for line in sentences]
    # Count the frequency for each word
    counter = d2l.count_corpus(sentences)
    num_tokens = sum(counter.values())

    # Return True if to keep this token during subsampling
    def keep(token):
        return(random.uniform(0, 1) <
               math.sqrt(1e-4 / counter[token] * num_tokens))

    # Now do the subsampling
    return [[tk for tk in line if keep(tk)] for line in sentences]


# Defined in file: ./chapter_natural-language-processing-pretraining/word-embedding-dataset.md
def get_centers_and_contexts(corpus, max_window_size):
    centers, contexts = [], []
    for line in corpus:
        # Each sentence needs at least 2 words to form a "central target word
        # - context word" pair
        if len(line) < 2:
            continue
        centers += line
        for i in range(len(line)):  # Context window centered at i
            window_size = random.randint(1, max_window_size)
            indices = list(range(max(0, i - window_size),
                                 min(len(line), i + 1 + window_size)))
            # Exclude the central target word from the context words
            indices.remove(i)
            contexts.append([line[idx] for idx in indices])
    return centers, contexts


# Defined in file: ./chapter_natural-language-processing-pretraining/word-embedding-dataset.md
# Defined in file: ./chapter_natural-language-processing-pretraining/word-embedding-dataset.md
class RandomGenerator:
    """Draw a random int in [0, n-1] according to n sampling weights."""
    def __init__(self, sampling_weights):
        self.population = list(range(len(sampling_weights)))
        self.sampling_weights = sampling_weights
        self.candidates = []
        self.i = 0

    def draw(self):
        if self.i == len(self.candidates):
            # Cache 10000 draws at a time to amortize the cost of
            # `random.choices`
            self.candidates = random.choices(
                self.population, self.sampling_weights, k=10000)
            self.i = 0
        self.i += 1
        return self.candidates[self.i - 1]
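# A minimal usage sketch (not part of the generated library); the weights are
# unnormalized and the exact draws are random.
#   generator = RandomGenerator([2, 3, 4])
#   [generator.draw() for _ in range(5)]   # e.g. [2, 1, 2, 0, 2]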
# Defined in file: ./chapter_natural-language-processing-pretraining/word-embedding-dataset.md
def get_negatives(all_contexts, corpus, K):
    counter = d2l.count_corpus(corpus)
    sampling_weights = [counter[i]**0.75 for i in range(len(counter))]
    all_negatives, generator = [], RandomGenerator(sampling_weights)
    for contexts in all_contexts:
        negatives = []
        while len(negatives) < len(contexts) * K:
            neg = generator.draw()
            # Noise words cannot be context words
            if neg not in contexts:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives


# Defined in file: ./chapter_natural-language-processing-pretraining/word-embedding-dataset.md
def batchify(data):
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        centers += [center]
        contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
        masks += [[1] * cur_len + [0] * (max_len - cur_len)]
        labels += [[1] * len(context) + [0] * (max_len - len(context))]
    return (np.array(centers).reshape(-1, 1), np.array(contexts_negatives),
            np.array(masks), np.array(labels))


# Defined in file: ./chapter_natural-language-processing-pretraining/word-embedding-dataset.md
def load_data_ptb(batch_size, max_window_size, num_noise_words):
    num_workers = d2l.get_dataloader_workers()
    sentences = read_ptb()
    vocab = d2l.Vocab(sentences, min_freq=10)
    subsampled = subsampling(sentences, vocab)
    corpus = [vocab[line] for line in subsampled]
    all_centers, all_contexts = get_centers_and_contexts(
        corpus, max_window_size)
    all_negatives = get_negatives(all_contexts, corpus, num_noise_words)
    dataset = gluon.data.ArrayDataset(
        all_centers, all_contexts, all_negatives)
    data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True,
                                      batchify_fn=batchify,
                                      num_workers=num_workers)
    return data_iter, vocab


# Defined in file: ./chapter_natural-language-processing-pretraining/similarity-analogy.md
d2l.DATA_HUB['glove.6b.50d'] = (d2l.DATA_URL + 'glove.6B.50d.zip',
                                '0b8703943ccdb6eb788e6f091b8946e82231bc4d')


# Defined in file: ./chapter_natural-language-processing-pretraining/similarity-analogy.md
d2l.DATA_HUB['glove.6b.100d'] = (d2l.DATA_URL + 'glove.6B.100d.zip',
                                 'cd43bfb07e44e6f27cbcc7bc9ae3d80284fdaf5a')


# Defined in file: ./chapter_natural-language-processing-pretraining/similarity-analogy.md
d2l.DATA_HUB['glove.42b.300d'] = (d2l.DATA_URL + 'glove.42B.300d.zip',
                                  'b5116e234e9eb9076672cfeabf5469f3eec904fa')


# Defined in file: ./chapter_natural-language-processing-pretraining/similarity-analogy.md
d2l.DATA_HUB['wiki.en'] = (d2l.DATA_URL + 'wiki.en.zip',
                           'c1816da3821ae9f43899be655002f6c723e91b88')
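# A minimal usage sketch (not part of the generated library); it assumes the
# PTB download registered above succeeds.
#   data_iter, vocab = load_data_ptb(512, max_window_size=5,
#                                    num_noise_words=5)
#   for centers, contexts_negatives, masks, labels in data_iter:
#       print(centers.shape, contexts_negatives.shape)  # (512, 1), (512, L)
#       break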
# Defined in file: ./chapter_natural-language-processing-pretraining/similarity-analogy.md
class TokenEmbedding:
    """Token Embedding."""
    def __init__(self, embedding_name):
        self.idx_to_token, self.idx_to_vec = self._load_embedding(
            embedding_name)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in
                             enumerate(self.idx_to_token)}

    def _load_embedding(self, embedding_name):
        idx_to_token, idx_to_vec = ['<unk>'], []
        data_dir = d2l.download_extract(embedding_name)
        # GloVe website: https://nlp.stanford.edu/projects/glove/
        # fastText website: https://fasttext.cc/
        with open(os.path.join(data_dir, 'vec.txt'), 'r') as f:
            for line in f:
                elems = line.rstrip().split(' ')
                token, elems = elems[0], [float(elem) for elem in elems[1:]]
                # Skip header information, such as the top row in fastText
                if len(elems) > 1:
                    idx_to_token.append(token)
                    idx_to_vec.append(elems)
        idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
        return idx_to_token, np.array(idx_to_vec)

    def __getitem__(self, tokens):
        indices = [self.token_to_idx.get(token, self.unknown_idx)
                   for token in tokens]
        vecs = self.idx_to_vec[np.array(indices)]
        return vecs

    def __len__(self):
        return len(self.idx_to_token)
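# A minimal usage sketch (not part of the generated library); it assumes the
# 'glove.6b.50d' archive registered above downloads and extracts to `vec.txt`.
#   glove_6b50d = TokenEmbedding('glove.6b.50d')
#   len(glove_6b50d)                       # vocabulary size including '<unk>'
#   glove_6b50d[['hello', 'world']].shape  # (2, 50)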
# Defined in file: ./chapter_natural-language-processing-pretraining/bert.md
def get_tokens_and_segments(tokens_a, tokens_b=None):
    tokens = ['<cls>'] + tokens_a + ['<sep>']
    # 0 and 1 are marking segment A and B, respectively
    segments = [0] * (len(tokens_a) + 2)
    if tokens_b is not None:
        tokens += tokens_b + ['<sep>']
        segments += [1] * (len(tokens_b) + 1)
    return tokens, segments
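# A minimal usage sketch (not part of the generated library), showing the
# '<cls>'/'<sep>' layout for a hypothetical sentence pair:
#   get_tokens_and_segments(['a', 'b'], ['c'])
#   # (['<cls>', 'a', 'b', '<sep>', 'c', '<sep>'], [0, 0, 0, 0, 1, 1])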
# Defined in file: ./chapter_natural-language-processing-pretraining/bert.md
class BERTEncoder(nn.Block):
    def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens, num_heads,
                 num_layers, dropout, max_len=1000, **kwargs):
        super(BERTEncoder, self).__init__(**kwargs)
        self.token_embedding = nn.Embedding(vocab_size, num_hiddens)
        self.segment_embedding = nn.Embedding(2, num_hiddens)
        self.blks = nn.Sequential()
        for _ in range(num_layers):
            self.blks.add(d2l.EncoderBlock(
                num_hiddens, ffn_num_hiddens, num_heads, dropout, True))
        # In BERT, positional embeddings are learnable, thus we create a
        # parameter of positional embeddings that are long enough
        self.pos_embedding = self.params.get('pos_embedding',
                                             shape=(1, max_len, num_hiddens))
    def forward(self, tokens, segments, valid_lens):
        # Shape of `X` remains unchanged in the following code snippet:
        # (batch size, max sequence length, `num_hiddens`)
        X = self.token_embedding(tokens) + self.segment_embedding(segments)
        X = X + self.pos_embedding.data(ctx=X.ctx)[:, :X.shape[1], :]
        for blk in self.blks:
            X = blk(X, valid_lens)
        return X
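# A minimal shape-check sketch (not part of the generated library); all sizes
# below are illustrative only.
#   encoder = BERTEncoder(vocab_size=10000, num_hiddens=768,
#                         ffn_num_hiddens=1024, num_heads=4, num_layers=2,
#                         dropout=0.2)
#   encoder.initialize()
#   tokens = np.random.randint(0, 10000, (2, 8))
#   segments = np.array([[0, 0, 0, 0, 1, 1, 1, 1]] * 2)
#   encoder(tokens, segments, None).shape   # (2, 8, 768)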
# Defined in file: ./chapter_natural-language-processing-pretraining/bert.md
class MaskLM(nn.Block):
    def __init__(self, vocab_size, num_hiddens, **kwargs):
        super(MaskLM, self).__init__(**kwargs)
        self.mlp = nn.Sequential()
        self.mlp.add(
            nn.Dense(num_hiddens, flatten=False, activation='relu'))
        self.mlp.add(nn.LayerNorm())
        self.mlp.add(nn.Dense(vocab_size, flatten=False))
    def forward(self, X, pred_positions):
        num_pred_positions = pred_positions.shape[1]
        pred_positions = pred_positions.reshape(-1)
        batch_size = X.shape[0]
        batch_idx = np.arange(0, batch_size)
        # Suppose that `batch_size` = 2, `num_pred_positions` = 3, then
        # `batch_idx` is `np.array([0, 0, 0, 1, 1, 1])`
        batch_idx = np.repeat(batch_idx, num_pred_positions)
        masked_X = X[batch_idx, pred_positions]
        masked_X = masked_X.reshape((batch_size, num_pred_positions, -1))
        mlm_Y_hat = self.mlp(masked_X)
        return mlm_Y_hat
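# A minimal shape-check sketch (not part of the generated library); it reuses
# the illustrative `encoder`, `tokens`, and `segments` from the sketch above.
#   mlm = MaskLM(vocab_size=10000, num_hiddens=768)
#   mlm.initialize()
#   mlm_positions = np.array([[1, 5, 2], [6, 1, 5]])
#   mlm(encoder(tokens, segments, None), mlm_positions).shape  # (2, 3, 10000)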
# Defined in file: ./chapter_natural-language-processing-pretraining/bert.md
class NextSentencePred(nn.Block):
    def __init__(self, **kwargs):
        super(NextSentencePred, self).__init__(**kwargs)
        self.output = nn.Dense(2)
    def forward(self, X):
        # `X` shape: (batch size, `num_hiddens`)
        return self.output(X)
# Defined in file: ./chapter_natural-language-processing-pretraining/bert.md
class BERTModel(nn.Block):
    def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens, num_heads,
                 num_layers, dropout, max_len=1000):
        super(BERTModel, self).__init__()
        self.encoder = BERTEncoder(vocab_size, num_hiddens, ffn_num_hiddens,
                                   num_heads, num_layers, dropout, max_len)
        self.hidden = nn.Dense(num_hiddens, activation='tanh')
        self.mlm = MaskLM(vocab_size, num_hiddens)
        self.nsp = NextSentencePred()
    def forward(self, tokens, segments, valid_lens=None, pred_positions=None):
        encoded_X = self.encoder(tokens, segments, valid_lens)
        if pred_positions is not None:
            mlm_Y_hat = self.mlm(encoded_X, pred_positions)
        else:
            mlm_Y_hat = None
        # The hidden layer of the MLP classifier for next sentence prediction.
        # 0 is the index of the '<cls>' token
        nsp_Y_hat = self.nsp(self.hidden(encoded_X[:, 0, :]))
        return encoded_X, mlm_Y_hat, nsp_Y_hat
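# A minimal end-to-end sketch (not part of the generated library); it reuses
# the illustrative `tokens`, `segments`, and `mlm_positions` defined in the
# sketches above, with sizes chosen only for illustration.
#   net = BERTModel(vocab_size=10000, num_hiddens=768, ffn_num_hiddens=1024,
#                   num_heads=4, num_layers=2, dropout=0.2)
#   net.initialize()
#   encoded_X, mlm_Y_hat, nsp_Y_hat = net(tokens, segments,
#                                         pred_positions=mlm_positions)
#   # encoded_X: (2, 8, 768), mlm_Y_hat: (2, 3, 10000), nsp_Y_hat: (2, 2)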
# Defined in file: ./chapter_natural-language-processing-pretraining/bert-dataset.md
d2l.DATA_HUB['wikitext-2'] = (
    'https://s3.amazonaws.com/research.metamind.io/wikitext/'
    'wikitext-2-v1.zip', '3c914d17d80b1459be871a5039ac23e752a53cbe')


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-dataset.md
def _read_wiki(data_dir):
    file_name = os.path.join(data_dir, 'wiki.train.tokens')
    with open(file_name, 'r') as f:
        lines = f.readlines()
    # Uppercase letters are converted to lowercase ones
    paragraphs = [line.strip().lower().split(' . ')
                  for line in lines if len(line.split(' . ')) >= 2]
    random.shuffle(paragraphs)
    return paragraphs


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-dataset.md
def _get_next_sentence(sentence, next_sentence, paragraphs):
    if random.random() < 0.5:
        is_next = True
    else:
        # `paragraphs` is a list of lists of lists
        next_sentence = random.choice(random.choice(paragraphs))
        is_next = False
    return sentence, next_sentence, is_next


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-dataset.md
def _get_nsp_data_from_paragraph(paragraph, paragraphs, vocab, max_len):
    nsp_data_from_paragraph = []
    for i in range(len(paragraph) - 1):
        tokens_a, tokens_b, is_next = _get_next_sentence(
            paragraph[i], paragraph[i + 1], paragraphs)
        # Consider 1 '<cls>' token and 2 '<sep>' tokens
        if len(tokens_a) + len(tokens_b) + 3 > max_len:
            continue
        tokens, segments = d2l.get_tokens_and_segments(tokens_a, tokens_b)
        nsp_data_from_paragraph.append((tokens, segments, is_next))
    return nsp_data_from_paragraph


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-dataset.md
def _replace_mlm_tokens(tokens, candidate_pred_positions, num_mlm_preds,
                        vocab):
    # Make a new copy of tokens for the input of a masked language model,
    # where the input may contain replaced '<mask>' or random tokens
    mlm_input_tokens = [token for token in tokens]
    pred_positions_and_labels = []
    # Shuffle for getting 15% random tokens for prediction in the masked
    # language modeling task
    random.shuffle(candidate_pred_positions)
    for mlm_pred_position in candidate_pred_positions:
        if len(pred_positions_and_labels) >= num_mlm_preds:
            break
        masked_token = None
        # 80% of the time: replace the word with the '<mask>' token
        if random.random() < 0.8:
            masked_token = '<mask>'
        else:
            # 10% of the time: keep the word unchanged
            if random.random() < 0.5:
                masked_token = tokens[mlm_pred_position]
            # 10% of the time: replace the word with a random word
            else:
                masked_token = random.randint(0, len(vocab) - 1)
        mlm_input_tokens[mlm_pred_position] = masked_token
        pred_positions_and_labels.append(
            (mlm_pred_position, tokens[mlm_pred_position]))
    return mlm_input_tokens, pred_positions_and_labels


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-dataset.md
def _get_mlm_data_from_tokens(tokens, vocab):
    candidate_pred_positions = []
    # `tokens` is a list of strings
    for i, token in enumerate(tokens):
        # Special tokens are not predicted in the masked language modeling
        # task
        if token in ['<cls>', '<sep>']:
            continue
        candidate_pred_positions.append(i)
    # 15% of random tokens are predicted in the masked language modeling task
    num_mlm_preds = max(1, round(len(tokens) * 0.15))
    mlm_input_tokens, pred_positions_and_labels = _replace_mlm_tokens(
        tokens, candidate_pred_positions, num_mlm_preds, vocab)
    pred_positions_and_labels = sorted(pred_positions_and_labels,
                                       key=lambda x: x[0])
    pred_positions = [v[0] for v in pred_positions_and_labels]
    mlm_pred_labels = [v[1] for v in pred_positions_and_labels]
    return vocab[mlm_input_tokens], pred_positions, vocab[mlm_pred_labels]


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-dataset.md
def _pad_bert_inputs(examples, max_len, vocab):
    max_num_mlm_preds = round(max_len * 0.15)
    all_token_ids, all_segments, valid_lens = [], [], []
    all_pred_positions, all_mlm_weights, all_mlm_labels = [], [], []
    nsp_labels = []
    for (token_ids, pred_positions, mlm_pred_label_ids, segments,
         is_next) in examples:
        all_token_ids.append(np.array(token_ids + [vocab['<pad>']] * (
            max_len - len(token_ids)), dtype='int32'))
        all_segments.append(np.array(segments + [0] * (
            max_len - len(segments)), dtype='int32'))
        # `valid_lens` excludes count of '<pad>' tokens
        valid_lens.append(np.array(len(token_ids), dtype='float32'))
        all_pred_positions.append(np.array(pred_positions + [0] * (
            max_num_mlm_preds - len(pred_positions)), dtype='int32'))
        # Predictions of padded tokens will be filtered out in the loss via
        # multiplication of 0 weights
        all_mlm_weights.append(
            np.array([1.0] * len(mlm_pred_label_ids) + [0.0] * (
                max_num_mlm_preds - len(pred_positions)), dtype='float32'))
        all_mlm_labels.append(np.array(mlm_pred_label_ids + [0] * (
            max_num_mlm_preds - len(mlm_pred_label_ids)), dtype='int32'))
        nsp_labels.append(np.array(is_next))
    return (all_token_ids, all_segments, valid_lens, all_pred_positions,
            all_mlm_weights, all_mlm_labels, nsp_labels)


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-dataset.md
class _WikiTextDataset(gluon.data.Dataset):
    def __init__(self, paragraphs, max_len):
        # Input `paragraphs[i]` is a list of sentence strings representing a
        # paragraph; while output `paragraphs[i]` is a list of sentences
        # representing a paragraph, where each sentence is a list of tokens
        paragraphs = [d2l.tokenize(
            paragraph, token='word') for paragraph in paragraphs]
        sentences = [sentence for paragraph in paragraphs
                     for sentence in paragraph]
        self.vocab = d2l.Vocab(sentences, min_freq=5, reserved_tokens=[
            '<pad>', '<mask>', '<cls>', '<sep>'])
        # Get data for the next sentence prediction task
        examples = []
        for paragraph in paragraphs:
            examples.extend(_get_nsp_data_from_paragraph(
                paragraph, paragraphs, self.vocab, max_len))
        # Get data for the masked language model task
        examples = [(_get_mlm_data_from_tokens(tokens, self.vocab)
                     + (segments, is_next))
                    for tokens, segments, is_next in examples]
        # Pad inputs
        (self.all_token_ids, self.all_segments, self.valid_lens,
         self.all_pred_positions, self.all_mlm_weights,
         self.all_mlm_labels, self.nsp_labels) = _pad_bert_inputs(
            examples, max_len, self.vocab)

    def __getitem__(self, idx):
        return (self.all_token_ids[idx], self.all_segments[idx],
                self.valid_lens[idx], self.all_pred_positions[idx],
                self.all_mlm_weights[idx], self.all_mlm_labels[idx],
                self.nsp_labels[idx])

    def __len__(self):
        return len(self.all_token_ids)


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-dataset.md
def load_data_wiki(batch_size, max_len):
    num_workers = d2l.get_dataloader_workers()
    data_dir = d2l.download_extract('wikitext-2', 'wikitext-2')
    paragraphs = _read_wiki(data_dir)
    train_set = _WikiTextDataset(paragraphs, max_len)
    train_iter = gluon.data.DataLoader(train_set, batch_size, shuffle=True,
                                       num_workers=num_workers)
    return train_iter, train_set.vocab


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-pretraining.md
def _get_batch_loss_bert(net, loss, vocab_size, tokens_X_shards,
                         segments_X_shards, valid_lens_x_shards,
                         pred_positions_X_shards, mlm_weights_X_shards,
                         mlm_Y_shards, nsp_y_shards):
    mlm_ls, nsp_ls, ls = [], [], []
    for (tokens_X_shard, segments_X_shard, valid_lens_x_shard,
         pred_positions_X_shard, mlm_weights_X_shard, mlm_Y_shard,
         nsp_y_shard) in zip(
            tokens_X_shards, segments_X_shards, valid_lens_x_shards,
            pred_positions_X_shards, mlm_weights_X_shards, mlm_Y_shards,
            nsp_y_shards):
        # Forward pass
        _, mlm_Y_hat, nsp_Y_hat = net(
            tokens_X_shard, segments_X_shard, valid_lens_x_shard.reshape(-1),
            pred_positions_X_shard)
        # Compute masked language model loss
        mlm_l = loss(
            mlm_Y_hat.reshape((-1, vocab_size)), mlm_Y_shard.reshape(-1),
            mlm_weights_X_shard.reshape((-1, 1)))
        mlm_l = mlm_l.sum() / (mlm_weights_X_shard.sum() + 1e-8)
        # Compute next sentence prediction loss
        nsp_l = loss(nsp_Y_hat, nsp_y_shard)
        nsp_l = nsp_l.mean()
        mlm_ls.append(mlm_l)
        nsp_ls.append(nsp_l)
        ls.append(mlm_l + nsp_l)
        npx.waitall()
    return mlm_ls, nsp_ls, ls


# Defined in file: ./chapter_natural-language-processing-pretraining/bert-pretraining.md
def train_bert(train_iter, net, loss, vocab_size, devices, log_interval,
               num_steps):
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': 1e-3})
    step, timer = 0, d2l.Timer()
    animator = d2l.Animator(xlabel='step', ylabel='loss',
                            xlim=[1, num_steps], legend=['mlm', 'nsp'])
    # Sum of masked language modeling losses, sum of next sentence prediction
    # losses, no. of sentence pairs, count
    metric = d2l.Accumulator(4)
    num_steps_reached = False
    while step < num_steps and not num_steps_reached:
        for batch in train_iter:
            (tokens_X_shards, segments_X_shards, valid_lens_x_shards,
             pred_positions_X_shards, mlm_weights_X_shards,
             mlm_Y_shards, nsp_y_shards) = [gluon.utils.split_and_load(
                elem, devices, even_split=False) for elem in batch]
            timer.start()
            with autograd.record():
                mlm_ls, nsp_ls, ls = _get_batch_loss_bert(
                    net, loss, vocab_size, tokens_X_shards, segments_X_shards,
                    valid_lens_x_shards, pred_positions_X_shards,
                    mlm_weights_X_shards, mlm_Y_shards, nsp_y_shards)
            for l in ls:
                l.backward()
            trainer.step(1)
            mlm_l_mean = sum([float(l) for l in mlm_ls]) / len(mlm_ls)
            nsp_l_mean = sum([float(l) for l in nsp_ls]) / len(nsp_ls)
            metric.add(mlm_l_mean, nsp_l_mean, batch[0].shape[0], 1)
            timer.stop()
            if (step + 1) % log_interval == 0:
                animator.add(step + 1,
                             (metric[0] / metric[3], metric[1] / metric[3]))
            step += 1
            if step == num_steps:
                num_steps_reached = True
                break
    print(f'MLM loss {metric[0] / metric[3]:.3f}, '
          f'NSP loss {metric[1] / metric[3]:.3f}')
    print(f'{metric[2] / timer.sum():.1f} sentence pairs/sec on '
          f'{str(devices)}')


# Defined in file: ./chapter_natural-language-processing-applications/sentiment-analysis-and-dataset.md
d2l.DATA_HUB['aclImdb'] = (
    'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
    '01ada507287d82875905620988597833ad4e0903')


# Defined in file: ./chapter_natural-language-processing-applications/sentiment-analysis-and-dataset.md
def read_imdb(data_dir, is_train):
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test',
                                   label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels


# Defined in file: ./chapter_natural-language-processing-applications/sentiment-analysis-and-dataset.md
def load_data_imdb(batch_size, num_steps=500):
    data_dir = d2l.download_extract('aclImdb', 'aclImdb')
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    vocab = d2l.Vocab(train_tokens, min_freq=5)
    train_features = np.array([d2l.truncate_pad(
        vocab[line], num_steps, vocab.unk) for line in train_tokens])
    test_features = np.array([d2l.truncate_pad(
        vocab[line], num_steps, vocab.unk) for line in test_tokens])
    train_iter = d2l.load_array((train_features, train_data[1]), batch_size)
    test_iter = d2l.load_array((test_features, test_data[1]), batch_size,
                               is_train=False)
    return train_iter, test_iter, vocab


# Defined in file: ./chapter_natural-language-processing-applications/sentiment-analysis-rnn.md
def predict_sentiment(net, vocab, sentence):
    sentence = np.array(vocab[sentence.split()], ctx=d2l.try_gpu())
    label = np.argmax(net(sentence.reshape(1, -1)), axis=1)
    return 'positive' if label == 1 else 'negative'


# Defined in file: ./chapter_natural-language-processing-applications/natural-language-inference-and-dataset.md
d2l.DATA_HUB['SNLI'] = (
    'https://nlp.stanford.edu/projects/snli/snli_1.0.zip',
    '9fcde07509c7e87ec61c640c1b2753d9041758e4')
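# A minimal usage sketch (not part of the generated library); `net` stands for
# any trained text classifier (e.g. a BiRNN) and is assumed to exist already.
#   train_iter, test_iter, vocab = load_data_imdb(64)
#   predict_sentiment(net, vocab, 'this movie is so great')  # 'positive'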
# Defined in file: ./chapter_natural-language-processing-applications/natural-language-inference-and-dataset.md
def read_snli(data_dir, is_train):
    """Read the SNLI dataset into premises, hypotheses, and labels."""
    def extract_text(s):
        # Remove information that will not be used by us
        s = re.sub('\\(', '', s)
        s = re.sub('\\)', '', s)
        # Substitute two or more consecutive whitespace with space
        s = re.sub('\\s{2,}', ' ', s)
        return s.strip()
    label_set = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    file_name = os.path.join(data_dir, 'snli_1.0_train.txt'
                             if is_train else 'snli_1.0_test.txt')
    with open(file_name, 'r') as f:
        rows = [row.split('\t') for row in f.readlines()[1:]]
    premises = [extract_text(row[1]) for row in rows if row[0] in label_set]
    hypotheses = [extract_text(row[2]) for row in rows if row[0] in label_set]
    labels = [label_set[row[0]] for row in rows if row[0] in label_set]
    return premises, hypotheses, labels
# Defined in file: ./chapter_natural-language-processing-applications/natural-language-inference-and-dataset.md
class SNLIDataset(gluon.data.Dataset):
    """A customized dataset to load the SNLI dataset."""
    def __init__(self, dataset, num_steps, vocab=None):
        self.num_steps = num_steps
        all_premise_tokens = d2l.tokenize(dataset[0])
        all_hypothesis_tokens = d2l.tokenize(dataset[1])
        if vocab is None:
            self.vocab = d2l.Vocab(all_premise_tokens + all_hypothesis_tokens,
                                   min_freq=5, reserved_tokens=['<pad>'])
        else:
            self.vocab = vocab
        self.premises = self._pad(all_premise_tokens)
        self.hypotheses = self._pad(all_hypothesis_tokens)
        self.labels = np.array(dataset[2])
        print('read ' + str(len(self.premises)) + ' examples')

    def _pad(self, lines):
        return np.array([d2l.truncate_pad(
            self.vocab[line], self.num_steps, self.vocab['<pad>'])
            for line in lines])

    def __getitem__(self, idx):
        return (self.premises[idx], self.hypotheses[idx]), self.labels[idx]

    def __len__(self):
        return len(self.premises)
# Defined in file: ./chapter_natural-language-processing-applications/natural-language-inference-and-dataset.md
def load_data_snli(batch_size, num_steps=50):
    """Download the SNLI dataset and return data iterators and vocabulary."""
    num_workers = d2l.get_dataloader_workers()
    data_dir = d2l.download_extract('SNLI')
    train_data = read_snli(data_dir, True)
    test_data = read_snli(data_dir, False)
    train_set = SNLIDataset(train_data, num_steps)
    test_set = SNLIDataset(test_data, num_steps, train_set.vocab)
    train_iter = gluon.data.DataLoader(train_set, batch_size, shuffle=True,
                                       num_workers=num_workers)
    test_iter = gluon.data.DataLoader(test_set, batch_size, shuffle=False,
                                      num_workers=num_workers)
    return train_iter, test_iter, train_set.vocab
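# A minimal usage sketch (not part of the generated library); it assumes the
# SNLI archive registered above downloads successfully.
#   train_iter, test_iter, vocab = load_data_snli(128, 50)
#   X, Y = next(iter(train_iter))
#   X[0].shape, X[1].shape, Y.shape   # ((128, 50), (128, 50), (128,))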
# Defined in file: ./chapter_natural-language-processing-applications/natural-language-inference-attention.md
def split_batch_multi_inputs(X, y, devices):
    """Split multi-input `X` and `y` into multiple devices."""
    X = list(zip(*[gluon.utils.split_and_load(
        feature, devices, even_split=False) for feature in X]))
    return (X, gluon.utils.split_and_load(y, devices, even_split=False))
# Defined in file: ./chapter_natural-language-processing-applications/natural-language-inference-attention.md
def predict_snli(net, vocab, premise, hypothesis):
    premise = np.array(vocab[premise], ctx=d2l.try_gpu())
    hypothesis = np.array(vocab[hypothesis], ctx=d2l.try_gpu())
    label = np.argmax(net([premise.reshape((1, -1)),
                           hypothesis.reshape((1, -1))]), axis=1)
    return 'entailment' if label == 0 else 'contradiction' if label == 1 \
        else 'neutral'


# Defined in file: ./chapter_recommender-systems/movielens.md
d2l.DATA_HUB['ml-100k'] = (
    'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
    'cd4dcac4241c8a4ad7badc7ca635da8a69dddb83')


# Defined in file: ./chapter_recommender-systems/movielens.md
def read_data_ml100k():
    data_dir = d2l.download_extract('ml-100k')
    names = ['user_id', 'item_id', 'rating', 'timestamp']
    data = pd.read_csv(os.path.join(data_dir, 'u.data'), '\t', names=names,
                       engine='python')
    num_users = data.user_id.unique().shape[0]
    num_items = data.item_id.unique().shape[0]
    return data, num_users, num_items


# Defined in file: ./chapter_recommender-systems/movielens.md
def split_data_ml100k(data, num_users, num_items,
                      split_mode='random', test_ratio=0.1):
    """Split the dataset in random mode or seq-aware mode."""
    if split_mode == 'seq-aware':
        train_items, test_items, train_list = {}, {}, []
        for line in data.itertuples():
            u, i, rating, time = line[1], line[2], line[3], line[4]
            train_items.setdefault(u, []).append((u, i, rating, time))
            if u not in test_items or test_items[u][-1] < time:
                test_items[u] = (i, rating, time)
        for u in range(1, num_users + 1):
            train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
        test_data = [(key, *value) for key, value in test_items.items()]
        train_data = [item for item in train_list if item not in test_data]
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
    else:
        mask = [True if x == 1 else False for x in np.random.uniform(
            0, 1, (len(data))) < 1 - test_ratio]
        neg_mask = [not x for x in mask]
        train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data
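# A minimal usage sketch (not part of the generated library); it assumes the
# MovieLens 100K download registered above succeeds.
#   data, num_users, num_items = read_data_ml100k()
#   train_data, test_data = split_data_ml100k(data, num_users, num_items,
#                                             'seq-aware')
#   train_data.shape, test_data.shape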
# Defined in file: ./chapter_recommender-systems/movielens.md
def load_data_ml100k(data, num_users, num_items, feedback='explicit'):
    users, items, scores = [], [], []
    inter = np.zeros((num_items, num_users)) if feedback == 'explicit' else {}
    for line in data.itertuples():
        user_index, item_index = int(line[1] - 1), int(line[2] - 1)
        score = int(line[3]) if feedback == 'explicit' else 1
        users.append(user_index)
        items.append(item_index)
        scores.append(score)
        if feedback == 'implicit':
            inter.setdefault(user_index, []).append(item_index)
        else:
            inter[item_index, user_index] = score
    return users, items, scores, inter


# Defined in file: ./chapter_recommender-systems/movielens.md
def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit',
                          test_ratio=0.1, batch_size=256):
    data, num_users, num_items = read_data_ml100k()
    train_data, test_data = split_data_ml100k(
        data, num_users, num_items, split_mode, test_ratio)
    train_u, train_i, train_r, _ = load_data_ml100k(
        train_data, num_users, num_items, feedback)
    test_u, test_i, test_r, _ = load_data_ml100k(
        test_data, num_users, num_items, feedback)
    train_set = gluon.data.ArrayDataset(
        np.array(train_u), np.array(train_i), np.array(train_r))
    test_set = gluon.data.ArrayDataset(
        np.array(test_u), np.array(test_i), np.array(test_r))
    train_iter = gluon.data.DataLoader(
        train_set, shuffle=True, last_batch='rollover',
        batch_size=batch_size)
    test_iter = gluon.data.DataLoader(
        test_set, batch_size=batch_size)
    return num_users, num_items, train_iter, test_iter


# Defined in file: ./chapter_recommender-systems/mf.md
def train_recsys_rating(net, train_iter, test_iter, loss, trainer,
                        num_epochs, devices=d2l.try_all_gpus(),
                        evaluator=None, **kwargs):
    timer = d2l.Timer()
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2],
                            legend=['train loss', 'test RMSE'])
    for epoch in range(num_epochs):
        metric, l = d2l.Accumulator(3), 0.
        for i, values in enumerate(train_iter):
            timer.start()
            input_data = []
            values = values if isinstance(values, list) else [values]
            for v in values:
                input_data.append(gluon.utils.split_and_load(v, devices))
            train_feat = input_data[0:-1] if len(values) > 1 else input_data
            train_label = input_data[-1]
            with autograd.record():
                preds = [net(*t) for t in zip(*train_feat)]
                ls = [loss(p, s) for p, s in zip(preds, train_label)]
            [l.backward() for l in ls]
            l += sum([l.asnumpy() for l in ls]).mean() / len(devices)
            trainer.step(values[0].shape[0])
            metric.add(l, values[0].shape[0], values[0].size)
            timer.stop()
        if len(kwargs) > 0:  # It will be used in section AutoRec
            test_rmse = evaluator(net, test_iter, kwargs['inter_mat'],
                                  devices)
        else:
            test_rmse = evaluator(net, test_iter, devices)
        train_l = l / (i + 1)
        animator.add(epoch + 1, (train_l, test_rmse))
    print(f'train loss {metric[0] / metric[1]:.3f}, '
          f'test RMSE {test_rmse:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')


# Defined in file: ./chapter_recommender-systems/ranking.md
class BPRLoss(gluon.loss.Loss):
    def __init__(self, weight=None, batch_axis=0, **kwargs):
        super(BPRLoss, self).__init__(weight=None, batch_axis=0, **kwargs)
    def forward(self, positive, negative):
        distances = positive - negative
        loss = - np.sum(np.log(npx.sigmoid(distances)), 0, keepdims=True)
        return loss
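# A minimal usage sketch (not part of the generated library), with toy scores:
#   loss = BPRLoss()
#   positive = np.array([1.0, 0.5])   # scores of observed items
#   negative = np.array([0.2, 0.1])   # scores of sampled negatives
#   loss(positive, negative)          # larger when negatives outscore positives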
# Defined in file: ./chapter_recommender-systems/ranking.md
class HingeLossbRec(gluon.loss.Loss):
    def __init__(self, weight=None, batch_axis=0, **kwargs):
        super(HingeLossbRec, self).__init__(weight=None, batch_axis=0,
                                            **kwargs)
    def forward(self, positive, negative, margin=1):
        distances = positive - negative
        loss = np.sum(np.maximum(- distances + margin, 0))
        return loss
# Defined in file: ./chapter_recommender-systems/neumf.md
def hit_and_auc(rankedlist, test_matrix, k):
    hits_k = [(idx, val) for idx, val in enumerate(rankedlist[:k])
              if val in set(test_matrix)]
    hits_all = [(idx, val) for idx, val in enumerate(rankedlist)
                if val in set(test_matrix)]
    max = len(rankedlist) - 1
    auc = 1.0 * (max - hits_all[0][0]) / max if len(hits_all) > 0 else 0
    return len(hits_k), auc


# Defined in file: ./chapter_recommender-systems/neumf.md
def evaluate_ranking(net, test_input, seq, candidates, num_users, num_items,
                     devices):
    ranked_list, ranked_items, hit_rate, auc = {}, {}, [], []
    all_items = set([i for i in range(num_users)])
    for u in range(num_users):
        neg_items = list(all_items - set(candidates[int(u)]))
        user_ids, item_ids, x, scores = [], [], [], []
        [item_ids.append(i) for i in neg_items]
        [user_ids.append(u) for _ in neg_items]
        x.extend([np.array(user_ids)])
        if seq is not None:
            x.append(seq[user_ids, :])
        x.extend([np.array(item_ids)])
        test_data_iter = gluon.data.DataLoader(
            gluon.data.ArrayDataset(*x), shuffle=False,
            last_batch="keep", batch_size=1024)
        for index, values in enumerate(test_data_iter):
            x = [gluon.utils.split_and_load(v, devices, even_split=False)
                 for v in values]
            scores.extend([list(net(*t).asnumpy()) for t in zip(*x)])
        scores = [item for sublist in scores for item in sublist]
        item_scores = list(zip(item_ids, scores))
        ranked_list[u] = sorted(item_scores, key=lambda t: t[1], reverse=True)
        ranked_items[u] = [r[0] for r in ranked_list[u]]
        temp = hit_and_auc(ranked_items[u], test_input[u], 50)
        hit_rate.append(temp[0])
        auc.append(temp[1])
    return np.mean(np.array(hit_rate)), np.mean(np.array(auc))


# Defined in file: ./chapter_recommender-systems/neumf.md
def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter,
                  num_users, num_items, num_epochs, devices, evaluator,
                  candidates, eval_step=1):
    timer, hit_rate, auc = d2l.Timer(), 0, 0
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
                            legend=['test hit rate', 'test AUC'])
    for epoch in range(num_epochs):
        metric, l = d2l.Accumulator(3), 0.
        for i, values in enumerate(train_iter):
            input_data = []
            for v in values:
                input_data.append(gluon.utils.split_and_load(v, devices))
            with autograd.record():
                p_pos = [net(*t) for t in zip(*input_data[0:-1])]
                p_neg = [net(*t) for t in zip(*input_data[0:-2],
                                              input_data[-1])]
                ls = [loss(p, n) for p, n in zip(p_pos, p_neg)]
            [l.backward(retain_graph=False) for l in ls]
            l += sum([l.asnumpy() for l in ls]).mean() / len(devices)
            trainer.step(values[0].shape[0])
            metric.add(l, values[0].shape[0], values[0].size)
            timer.stop()
        with autograd.predict_mode():
            if (epoch + 1) % eval_step == 0:
                hit_rate, auc = evaluator(net, test_iter, test_seq_iter,
                                          candidates, num_users, num_items,
                                          devices)
                animator.add(epoch + 1, (hit_rate, auc))
    print(f'train loss {metric[0] / metric[1]:.3f}, '
          f'test hit rate {float(hit_rate):.3f}, test AUC {float(auc):.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(devices)}')


# Defined in file: ./chapter_recommender-systems/ctr.md
d2l.DATA_HUB['ctr'] = (d2l.DATA_URL + 'ctr.zip',
                       'e18327c48c8e8e5c23da714dd614e390d369843f')


# Defined in file: ./chapter_recommender-systems/ctr.md
class CTRDataset(gluon.data.Dataset):
    def __init__(self, data_path, feat_mapper=None, defaults=None,
                 min_threshold=4, num_feat=34):
        self.NUM_FEATS, self.count, self.data = num_feat, 0, {}
        feat_cnts = defaultdict(lambda: defaultdict(int))
        self.feat_mapper, self.defaults = feat_mapper, defaults
        self.field_dims = np.zeros(self.NUM_FEATS, dtype=np.int64)
        with open(data_path) as f:
            for line in f:
                instance = {}
                values = line.rstrip('\n').split('\t')
                if len(values) != self.NUM_FEATS + 1:
                    continue
                label = np.float32([0, 0])
                label[int(values[0])] = 1
                instance['y'] = [np.float32(values[0])]
                for i in range(1, self.NUM_FEATS + 1):
                    feat_cnts[i][values[i]] += 1
                    instance.setdefault('x', []).append(values[i])
                self.data[self.count] = instance
                self.count = self.count + 1
        if self.feat_mapper is None and self.defaults is None:
            feat_mapper = {i: {feat for feat, c in cnt.items()
                               if c >= min_threshold}
                           for i, cnt in feat_cnts.items()}
            self.feat_mapper = {i: {feat: idx for idx, feat in enumerate(cnt)}
                                for i, cnt in feat_mapper.items()}
            self.defaults = {i: len(cnt) for i, cnt in feat_mapper.items()}
        for i, fm in self.feat_mapper.items():
            self.field_dims[i - 1] = len(fm) + 1
        self.offsets = np.array((0, *np.cumsum(self.field_dims).asnumpy()
                                 [:-1]))

    def __len__(self):
        return self.count

    def __getitem__(self, idx):
        feat = np.array([self.feat_mapper[i + 1].get(v, self.defaults[i + 1])
                         for i, v in enumerate(self.data[idx]['x'])])
        return feat + self.offsets, self.data[idx]['y']
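# A minimal usage sketch (not part of the generated library); it assumes the
# 'ctr' archive registered above extracts to a tab-separated `train.csv`.
#   data_dir = d2l.download_extract('ctr')
#   train_data = CTRDataset(os.path.join(data_dir, 'train.csv'))
#   train_data[0]   # (offset-encoded categorical features, [label])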
# Defined in file: ./chapter_generative-adversarial-networks/gan.md
def update_D(X, Z, net_D, net_G, loss, trainer_D):
    """Update discriminator."""
    batch_size = X.shape[0]
    ones = np.ones((batch_size,), ctx=X.ctx)
    zeros = np.zeros((batch_size,), ctx=X.ctx)
    with autograd.record():
        real_Y = net_D(X)
        fake_X = net_G(Z)
        # Do not need to compute gradient for `net_G`, detach it from
        # computing gradients.
        fake_Y = net_D(fake_X.detach())
        loss_D = (loss(real_Y, ones) + loss(fake_Y, zeros)) / 2
    loss_D.backward()
    trainer_D.step(batch_size)
    return float(loss_D.sum())
# Defined in file: ./chapter_generative-adversarial-networks/gan.md
def update_G(Z, net_D, net_G, loss, trainer_G):
    """Update generator."""
    batch_size = Z.shape[0]
    ones = np.ones((batch_size,), ctx=Z.ctx)
    with autograd.record():
        # We could reuse `fake_X` from `update_D` to save computation
        fake_X = net_G(Z)
        # Recomputing `fake_Y` is needed since `net_D` is changed
        fake_Y = net_D(fake_X)
        loss_G = loss(fake_Y, ones)
    loss_G.backward()
    trainer_G.step(batch_size)
    return float(loss_G.sum())
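# A minimal per-batch training sketch (not part of the generated library);
# `net_D`, `net_G`, `trainer_D`, `trainer_G`, `data_iter`, and `latent_dim`
# are assumed to be set up as in the GAN chapter.
#   loss = gluon.loss.SigmoidBCELoss()
#   for X in data_iter:
#       batch_size = X.shape[0]
#       Z = np.random.normal(0, 1, size=(batch_size, latent_dim))
#       d_loss = update_D(X, Z, net_D, net_G, loss, trainer_D)
#       g_loss = update_G(Z, net_D, net_G, loss, trainer_G)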
# Defined in file: ./chapter_generative-adversarial-networks/dcgan.md
d2l.DATA_HUB['pokemon'] = (d2l.DATA_URL + 'pokemon.zip',
                           'c065c0e2593b8b161a2d7873e42418bf6a21106c')


# Alias defined in config.ini
size = lambda a: a.size
transpose = lambda a: a.T
ones = np.ones
zeros = np.zeros
arange = np.arange
meshgrid = np.meshgrid
sin = np.sin
sinh = np.sinh
cos = np.cos
cosh = np.cosh
tanh = np.tanh
linspace = np.linspace
exp = np.exp
log = np.log
tensor = np.array
normal = np.random.normal
matmul = np.dot
int32 = np.int32
float32 = np.float32
concat = np.concatenate
stack = np.stack
abs = np.abs
numpy = lambda x, *args, **kwargs: x.asnumpy(*args, **kwargs)
reshape = lambda x, *args, **kwargs: x.reshape(*args, **kwargs)
to = lambda x, *args, **kwargs: x.as_in_context(*args, **kwargs)
reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs)
argmax = lambda x, *args, **kwargs: x.argmax(*args, **kwargs)
astype = lambda x, *args, **kwargs: x.astype(*args, **kwargs)