self.update_rule = kwargs.pop('update_rule', 'sgd')
...
# Make sure the update rule exists, then replace the string
# name with the actual function
if not hasattr(optim, self.update_rule):
    raise ValueError('Invalid update_rule "%s"' % self.update_rule)
self.update_rule = getattr(optim, self.update_rule)
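This works because every update rule in optim is expected to share the same call signature, so `getattr(optim, 'sgd')` hands back a function the Solver can call uniformly for every parameter. A minimal sketch of what such a rule might look like (the exact body of optim.sgd is not shown in this section, so treat the details as an assumption):

def sgd(w, dw, config=None):
    """Vanilla stochastic gradient descent.

    config format (assumed):
    - learning_rate: scalar learning rate.
    """
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)

    # Take a single gradient step and return the updated weights
    # together with the (possibly updated) config dict.
    w -= config['learning_rate'] * dw
    return w, config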
def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.

    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming data.
    During training we also keep an exponentially decaying running mean of the
    mean and variance of each feature, and these averages are used to normalize
    data at test-time.

    At each timestep we update the running averages for mean and variance using
    an exponential decay based on the momentum parameter:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var

    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step; the torch7
    implementation of batch normalization also uses running averages.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance.
      - running_mean: Array of shape (D,) giving running mean of features
      - running_var: Array of shape (D,) giving running variance of features

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: A tuple of values needed in the backward pass
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)
    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))
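The train/test branches of the forward pass are not reproduced above. A minimal sketch of how they could look, following the running-average update rule quoted in the docstring (the cache layout here is an assumption, chosen to match what the backward passes below unpack):

    if mode == 'train':
        # Normalize with the statistics of the current minibatch.
        sample_mean = np.mean(x, axis=0)
        sample_var = np.var(x, axis=0)
        x_norm = (x - sample_mean) / np.sqrt(sample_var + eps)
        out = gamma * x_norm + beta

        # Keep exponentially decaying running averages for test time.
        running_mean = momentum * running_mean + (1 - momentum) * sample_mean
        running_var = momentum * running_var + (1 - momentum) * sample_var

        cache = (x, x_norm, sample_mean, sample_var, gamma, beta, eps)
    elif mode == 'test':
        # Normalize with the stored running statistics.
        x_norm = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_norm + beta
        cache = None
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running statistics back into bn_param.
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache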
def batchnorm_backward(dout, cache):
    """
    Backward pass for batch normalization.

    For this implementation, you should write out a computation graph for
    batch normalization on paper and propagate gradients backward through
    intermediate nodes.

    Inputs:
    - dout: Upstream derivatives, of shape (N, D)
    - cache: Variable of intermediates from batchnorm_forward.

    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
    """
    dx, dgamma, dbeta = None, None, None

    x, x_norm, sample_mean, sample_var, gamma, beta, eps = cache
    N, D = x_norm.shape
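The remainder of batchnorm_backward is not shown above. A sketch of the staged, computation-graph style backward pass, assuming the cache layout from the forward sketch (each step undoes one node of the forward graph):

    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(x_norm * dout, axis=0)

    # Backprop through out = gamma * x_norm + beta.
    dx_norm = dout * gamma

    # Backprop through x_norm = (x - sample_mean) / sqrt(sample_var + eps).
    std_inv = 1.0 / np.sqrt(sample_var + eps)
    dvar = np.sum(dx_norm * (x - sample_mean), axis=0) * -0.5 * std_inv ** 3
    dmean = (np.sum(dx_norm * -std_inv, axis=0)
             + dvar * np.mean(-2.0 * (x - sample_mean), axis=0))

    # Combine the three paths that reach x.
    dx = dx_norm * std_inv + dvar * 2.0 * (x - sample_mean) / N + dmean / N

    return dx, dgamma, dbeta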
def batchnorm_backward_alt(dout, cache):
    """
    Alternative backward pass for batch normalization.

    For this implementation you should work out the derivatives for the batch
    normalization backward pass on paper and simplify as much as possible. You
    should be able to derive a simple expression for the backward pass.
    See the jupyter notebook for more hints.

    Note: This implementation should expect to receive the same cache variable
    as batchnorm_backward, but might not use all of the values in the cache.

    Inputs / outputs: Same as batchnorm_backward
    """
    dx, dgamma, dbeta = None, None, None

    x, x_hat, sample_mean, sample_var, gamma, beta, eps = cache
    N, D = x_hat.shape

    mid = 1 / np.sqrt(sample_var + eps)
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(x_hat * dout, axis=0)
    dxhat = dout * gamma
    dx = (1 / N) * mid * (N * dxhat - np.sum(dxhat, axis=0)
                          - x_hat * np.sum(dxhat * x_hat, axis=0))

    return dx, dgamma, dbeta
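As a quick consistency check, the staged and the simplified backward passes should agree up to floating-point rounding. A small script like the following (relying on the forward sketch above to build the cache) can verify that:

np.random.seed(1)
x = np.random.randn(100, 5)
gamma, beta = np.random.randn(5), np.random.randn(5)
dout = np.random.randn(100, 5)
bn_param = {'mode': 'train'}

_, cache = batchnorm_forward(x, gamma, beta, bn_param)
dx1, dg1, db1 = batchnorm_backward(dout, cache)
dx2, dg2, db2 = batchnorm_backward_alt(dout, cache)

# The difference should be tiny, close to machine precision.
print(np.max(np.abs(dx1 - dx2)))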
def layernorm_forward(x, gamma, beta, ln_param):
    """
    Forward pass for layer normalization.

    During both training and test-time, the incoming data is normalized per
    data-point, before being scaled by gamma and beta parameters identical to
    that of batch normalization.

    Note that in contrast to batch normalization, the behavior of layer
    normalization during train and test-time is identical, and we do not need
    to keep track of running averages of any sort.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - ln_param: Dictionary with the following keys:
      - eps: Constant for numeric stability

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: A tuple of values needed in the backward pass
    """
    out, cache = None, None
    eps = ln_param.get('eps', 1e-5)

    # Layer norm normalizes each row (data point); transposing x lets us
    # reuse the column-wise mean/variance pattern from batch normalization.
    sample_mean = np.mean(x.T, axis=0)
    sample_var = np.var(x.T, axis=0)
    x_norm = (x.T - sample_mean) / np.sqrt(sample_var + eps)
    out = gamma * x_norm.T + beta
    cache = x, x_norm.T, sample_mean, sample_var, gamma, beta, eps
    return out, cache
def layernorm_backward(dout, cache):
    """
    Backward pass for layer normalization.

    For this implementation, you can heavily rely on the work you've done
    already for batch normalization.

    Inputs:
    - dout: Upstream derivatives, of shape (N, D)
    - cache: Variable of intermediates from layernorm_forward.

    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
    """
    dx, dgamma, dbeta = None, None, None
    x, x_hat, sample_mean, sample_var, gamma, beta, eps = cache
    N, D = x_hat.shape
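The rest of layernorm_backward is not reproduced above. A minimal sketch that reuses the simplified batchnorm_backward_alt expression, transposed so the normalization axis is the feature dimension of each row (the cache layout follows layernorm_forward above):

    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(x_hat * dout, axis=0)

    # Work in the transposed frame used by layernorm_forward, where each
    # column is one data point; this lets us reuse the batchnorm_backward_alt
    # expression with the roles of N and D swapped.
    dxhat = (dout * gamma).T                    # shape (D, N)
    x_hat_t = x_hat.T                           # shape (D, N)
    std_inv = 1 / np.sqrt(sample_var + eps)     # shape (N,)

    dx = (1 / D) * std_inv * (D * dxhat - np.sum(dxhat, axis=0)
                              - x_hat_t * np.sum(dxhat * x_hat_t, axis=0))
    dx = dx.T

    return dx, dgamma, dbeta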
First, batch normalization (BN). BN uses mini-batch statistics to normalize the corresponding activations so that every output dimension has zero mean and unit variance (standardization). The final "scale and shift" step adds an affine transformation: since BN is inserted "deliberately" for the sake of training, the learnable parameters gamma and beta make it possible to recover the original input, and they also compensate for the information that the normalization might otherwise discard.
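A quick numerical illustration of that last point (the gamma and beta values below are chosen by hand purely to show that the affine step can undo the normalization; this is not part of the assignment code):

import numpy as np

np.random.seed(0)
x = np.random.randn(4, 3) * 5.0 + 2.0   # small toy batch, shape (N, D)
eps = 1e-5

mean, var = x.mean(axis=0), x.var(axis=0)
x_norm = (x - mean) / np.sqrt(var + eps)

# If gamma learns sqrt(var + eps) and beta learns the mean,
# the scale-and-shift exactly reverses the standardization.
gamma, beta = np.sqrt(var + eps), mean
out = gamma * x_norm + beta

print(np.allclose(out, x))  # True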
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'
    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.normalization == 'batchnorm':
        for bn_param in self.bn_params:
            bn_param['mode'] = mode
    scores = None
    caches = []
    scores = X
    for i in range(self.num_layers):
        W = self.params['W' + str(i + 1)]
        b = self.params['b' + str(i + 1)]
        if i == self.num_layers - 1:
            scores, cache = affine_forward(scores, W, b)
        else:
            if self.normalization is None:
                scores, cache = affine_relu_forward(scores, W, b)
            elif self.normalization == "batchnorm":
                gamma = self.params['gamma' + str(i + 1)]
                beta = self.params['beta' + str(i + 1)]
                scores, cache = affine_bn_relu_forward(scores, W, b, gamma, beta,
                                                       self.bn_params[i])
            elif self.normalization == "layernorm":
                gamma = self.params['gamma' + str(i + 1)]
                beta = self.params['beta' + str(i + 1)]
                scores, cache = affine_ln_relu_forward(scores, W, b, gamma, beta,
                                                       self.bn_params[i])
            else:
                cache = None
        caches.append(cache)
        if self.use_dropout and i != self.num_layers - 1:
            scores, cache = dropout_forward(scores, self.dropout_param)
            caches.append(cache)
    # If test mode return early
    if mode == 'test':
        return scores
    loss, grads = 0.0, {}
    reg = self.reg
    loss, dx = softmax_loss(scores, y)
    for i in reversed(range(self.num_layers)):
        w = 'W' + str(i + 1)
        b = 'b' + str(i + 1)
        gamma = 'gamma' + str(i + 1)
        beta = 'beta' + str(i + 1)
        # Add the regularization term for this layer's weights.
        loss += 0.5 * reg * np.sum(self.params[w] * self.params[w])
        if i == self.num_layers - 1:
            dx, grads[w], grads[b] = affine_backward(dx, caches.pop())
        else:
            if self.use_dropout:
                dx = dropout_backward(dx, caches.pop())
            if self.normalization is None:
                dx, grads[w], grads[b] = affine_relu_backward(dx, caches.pop())
            elif self.normalization == 'batchnorm':
                dx, grads[w], grads[b], grads[gamma], grads[beta] = \
                    affine_bn_relu_backward(dx, caches.pop())
            elif self.normalization == 'layernorm':
                dx, grads[w], grads[b], grads[gamma], grads[beta] = \
                    affine_ln_relu_backward(dx, caches.pop())
        grads[w] += reg * self.params[w]

    return loss, grads
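The affine_bn_relu_forward / affine_bn_relu_backward helpers called above are not shown in this section; they are assumed to be simple "sandwich" layers composed from the primitives defined earlier. A sketch of how they could be built (the layernorm variants would look the same with the batchnorm functions swapped out):

def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    # affine -> batch norm -> ReLU, caching each stage for the backward pass.
    a, fc_cache = affine_forward(x, w, b)
    a_bn, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(a_bn)
    return out, (fc_cache, bn_cache, relu_cache)


def affine_bn_relu_backward(dout, cache):
    # Undo the stages in reverse order.
    fc_cache, bn_cache, relu_cache = cache
    da_bn = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward_alt(da_bn, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta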
def conv_forward_naive(x, w, b, conv_param):
    """
    A naive implementation of the forward pass for a convolutional layer.

    The input consists of N data points, each with C channels, height H and
    width W. We convolve each input with F different filters, where each filter
    spans all C channels and has height HH and width WW.

    Input:
    - x: Input data of shape (N, C, H, W)
    - w: Filter weights of shape (F, C, HH, WW)
    - b: Biases, of shape (F,)
    - conv_param: A dictionary with the following keys:
      - 'stride': The number of pixels between adjacent receptive fields in the
        horizontal and vertical directions.
      - 'pad': The number of pixels that will be used to zero-pad the input.

    During padding, 'pad' zeros should be placed symmetrically (i.e. equally on
    both sides) along the height and width axes of the input. Be careful not to
    modify the original input x directly.

    Returns a tuple of:
    - out: Output data, of shape (N, F, H', W') where H' and W' are given by
      H' = 1 + (H + 2 * pad - HH) / stride
      W' = 1 + (W + 2 * pad - WW) / stride
    - cache: (x, w, b, conv_param)
    """
    out = None
    pad = conv_param['pad']
    stride = conv_param['stride']
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)),
                   mode='constant', constant_values=0)
    N, C, H, W = x.shape
    F, C, HH, WW = w.shape
    H_out = int(1 + (H + 2 * pad - HH) / stride)
    W_out = int(1 + (W + 2 * pad - WW) / stride)
    out = np.zeros((N, F, H_out, W_out))
    for n in range(N):
        for f in range(F):
            for h in range(H_out):
                for w_mid in range(W_out):
                    # Dot product between one filter and one receptive field.
                    out[n, f, h, w_mid] = np.sum(
                        x_pad[n, :, h * stride:h * stride + HH,
                              w_mid * stride:w_mid * stride + WW] * w[f, :, :, :]) + b[f]

    cache = (x, w, b, conv_param)
    return out, cache
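A quick sanity check of the output shape, using the formula from the docstring (the sizes here are arbitrary toy values):

import numpy as np

x = np.random.randn(2, 3, 7, 7)        # N=2 images, C=3 channels, 7x7
w = np.random.randn(4, 3, 3, 3)        # F=4 filters of size 3x3
b = np.zeros(4)
conv_param = {'stride': 2, 'pad': 1}

out, _ = conv_forward_naive(x, w, b, conv_param)
# H' = 1 + (7 + 2*1 - 3) / 2 = 4, same for W'
print(out.shape)  # (2, 4, 4, 4)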
def max_pool_forward_naive(x, pool_param):
    """
    A naive implementation of the forward pass for a max-pooling layer.

    Inputs:
    - x: Input data, of shape (N, C, H, W)
    - pool_param: dictionary with the following keys:
      - 'pool_height': The height of each pooling region
      - 'pool_width': The width of each pooling region
      - 'stride': The distance between adjacent pooling regions

    No padding is necessary here; the output size is given by the formulas
    below.

    Returns a tuple of:
    - out: Output data, of shape (N, C, H', W') where H' and W' are given by
      H' = 1 + (H - pool_height) / stride
      W' = 1 + (W - pool_width) / stride
    - cache: (x, pool_param)
    """
    out = None

    pool_height = pool_param['pool_height']
    pool_width = pool_param['pool_width']
    stride = pool_param['stride']
    N, C, H, W = x.shape
    H_out = int(1 + (H - pool_height) / stride)
    W_out = int(1 + (W - pool_width) / stride)
    out = np.zeros((N, C, H_out, W_out))
    for n in range(N):
        for c in range(C):
            for h in range(H_out):
                for w_mid in range(W_out):
                    # Take the maximum over one pooling window.
                    out[n, c, h, w_mid] = np.max(
                        x[n, c, h * stride:h * stride + pool_height,
                          w_mid * stride:w_mid * stride + pool_width])

    cache = (x, pool_param)
    return out, cache
def max_pool_backward_naive(dout, cache):
    """
    A naive implementation of the backward pass for a max-pooling layer.

    Inputs:
    - dout: Upstream derivatives
    - cache: A tuple of (x, pool_param) as in the forward pass.

    Returns:
    - dx: Gradient with respect to x
    """
    dx = None

    x, pool_param = cache
    pool_height = pool_param['pool_height']
    pool_width = pool_param['pool_width']
    stride = pool_param['stride']
    N, C, H_out, W_out = dout.shape
    dx = np.zeros_like(x)
    for n in range(N):
        for c in range(C):
            for h_mid in range(H_out):
                for w_mid in range(W_out):
                    window = x[n, c, stride * h_mid:stride * h_mid + pool_height,
                               stride * w_mid:stride * w_mid + pool_width]
                    # Route the upstream gradient to the max element(s) of the
                    # window; accumulate in case pooling regions overlap.
                    mask = window == np.max(window)
                    dx[n, c, stride * h_mid:stride * h_mid + pool_height,
                       stride * w_mid:stride * w_mid + pool_width] += mask * dout[n, c, h_mid, w_mid]

    return dx