def rnn_forward(x, h0, Wx, Wh, b):
    h, cache = None, None
    N, T, D = x.shape
    cache = []
    h = np.zeros((N, T, h0.shape[1]))
    for i in range(T):
        h0, c = rnn_step_forward(x[:, i, :], h0, Wx, Wh, b)
        h[:, i] += h0
        cache.append(c)
    return h, cache
def rnn_backward(dh, cache):
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    N, T, H = dh.shape
    D = cache[0][0].shape[0]
    dx = np.zeros((N, T, D))
    dh0 = np.zeros((N, H))
    dWx = np.zeros((D, H))
    dWh = np.zeros((H, H))
    db = np.zeros((H,))
    for i in reversed(range(T)):
        dx[:, i], dh0, dWx_mid, dWh_mid, db_mid = rnn_step_backward(dh[:, i] + dh0, cache.pop())
        dWx += dWx_mid
        dWh += dWh_mid
        db += db_mid
    return dx, dh0, dWx, dWh, db
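As a quick sanity check, rnn_forward should map an (N, T, D) input sequence to an (N, T, H) block of hidden states. A minimal sketch, assuming rnn_step_forward is defined earlier in the file:

import numpy as np

N, T, D, H = 2, 3, 4, 5
x = np.random.randn(N, T, D)
h0 = np.random.randn(N, H)
Wx = np.random.randn(D, H)
Wh = np.random.randn(H, H)
b = np.random.randn(H)

h, cache = rnn_forward(x, h0, Wx, Wh, b)
print(h.shape)      # (2, 3, 5)
print(len(cache))   # 3, one per-step cache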
def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN (or LSTM) to compute
    loss and gradients on all parameters.

    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T) where
      each element is in the range 0 <= y[i, t] < V

    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # Cut captions into two pieces: captions_in has everything but the last word
    # and will be input to the RNN; captions_out has everything but the first
    # word and this is what we will expect the RNN to generate. These are offset
    # by one relative to each other because the RNN should produce word (t+1)
    # after receiving word t. The first element of captions_in will be the START
    # token, and the first element of captions_out will be the first word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]
    # You'll need this
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to initial
    # hidden state
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

    # Word embedding matrix
    W_embed = self.params['W_embed']

    # Input-to-hidden, hidden-to-hidden, and biases for the RNN
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']
    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the forward and backward passes for the CaptioningRNN.   #
    # In the forward pass you will need to do the following:                   #
    # (1) Use an affine transformation to compute the initial hidden state     #
    #     from the image features. This should produce an array of shape (N, H)#
    # (2) Use a word embedding layer to transform the words in captions_in     #
    #     from indices to vectors, giving an array of shape (N, T, W).         #
    # (3) Use either a vanilla RNN or LSTM (depending on self.cell_type) to    #
    #     process the sequence of input word vectors and produce hidden state  #
    #     vectors for all timesteps, producing an array of shape (N, T, H).    #
    # (4) Use a (temporal) affine transformation to compute scores over the    #
    #     vocabulary at every timestep using the hidden states, giving an      #
    #     array of shape (N, T, V).                                            #
    # (5) Use (temporal) softmax to compute loss using captions_out, ignoring  #
    #     the points where the output word is <NULL> using the mask above.     #
    #                                                                          #
    # In the backward pass you will need to compute the gradient of the loss   #
    # with respect to all model parameters. Use the loss and grads variables   #
    # defined above to store loss and gradients; grads[k] should give the      #
    # gradients for self.params[k].                                            #
    #                                                                          #
    # Note also that you are allowed to make use of functions from layers.py   #
    # in your implementation, if needed.                                       #
    ############################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    caches = []
    out, cache = affine_forward(features, W_proj, b_proj)          # (1) initial hidden state
    caches.append(cache)
    word_in, cache = word_embedding_forward(captions_in, W_embed)  # (2) embed input words
    caches.append(cache)
    if self.cell_type == 'rnn':  # self.cell_type is 'rnn' or 'lstm'
        out, cache = rnn_forward(word_in, out, Wx, Wh, b)          # (3) run the recurrent cell
    else:
        out, cache = lstm_forward(word_in, out, Wx, Wh, b)
    caches.append(cache)
    out, cache = temporal_affine_forward(out, W_vocab, b_vocab)    # (4) vocabulary scores
    caches.append(cache)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
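The forward pass above still needs step (5) and the backward pass before the END marker. A rough sketch of those remaining steps, assuming the standard temporal_softmax_loss and *_backward helpers from the assignment's layers.py, and reusing the caches list in reverse order:

    # (5) temporal softmax loss, ignoring <NULL> positions via the mask
    loss, dout = temporal_softmax_loss(out, captions_out, mask)

    # Backward pass: pop the caches in reverse order of the forward pass.
    dout, grads['W_vocab'], grads['b_vocab'] = temporal_affine_backward(dout, caches.pop())
    if self.cell_type == 'rnn':
        dout, dh0, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(dout, caches.pop())
    else:
        dout, dh0, grads['Wx'], grads['Wh'], grads['b'] = lstm_backward(dout, caches.pop())
    grads['W_embed'] = word_embedding_backward(dout, caches.pop())
    _, grads['W_proj'], grads['b_proj'] = affine_backward(dh0, caches.pop())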
def sample(self, features, max_length=30):
    """
    Run a test-time forward pass for the model, sampling captions for input
    feature vectors.

    At each timestep, we embed the current word, pass it and the previous hidden
    state to the RNN to get the next hidden state, use the hidden state to get
    scores for all vocab words, and choose the word with the highest score as
    the next word. The initial hidden state is computed by applying an affine
    transform to the input image features, and the initial word is the <START>
    token.

    For LSTMs you will also have to keep track of the cell state; in that case
    the initial cell state should be zero.

    Inputs:
    - features: Array of input image features of shape (N, D).
    - max_length: Maximum length T of generated captions.

    Returns:
    - captions: Array of shape (N, max_length) giving sampled captions, where
      each element is an integer in the range [0, V). The first element of
      captions should be the first sampled word, not the <START> token.
    """
    N = features.shape[0]
    captions = self._null * np.ones((N, max_length), dtype=np.int32)
    ###########################################################################
    # TODO: Implement test-time sampling for the model. You will need to      #
    # initialize the hidden state of the RNN by applying the learned affine   #
    # transform to the input image features. The first word that you feed to  #
    # the RNN should be the <START> token; its value is stored in the         #
    # variable self._start. At each timestep you will need to:                #
    # (1) Embed the previous word using the learned word embeddings           #
    # (2) Make an RNN step using the previous hidden state and the embedded   #
    #     current word to get the next hidden state.                          #
    # (3) Apply the learned affine transformation to the next hidden state to #
    #     get scores for all words in the vocabulary                          #
    # (4) Select the word with the highest score as the next word, writing it #
    #     (the word index) to the appropriate slot in the captions variable   #
    #                                                                         #
    # For simplicity, you do not need to stop generating after an <END> token #
    # is sampled, but you can if you want to.                                 #
    #                                                                         #
    # HINT: You will not be able to use the rnn_forward or lstm_forward       #
    # functions; you'll need to call rnn_step_forward or lstm_step_forward in #
    # a loop.                                                                 #
    #                                                                         #
    # NOTE: we are still working over minibatches in this function. Also if   #
    # you are using an LSTM, initialize the first cell state to zeros.        #
    ###########################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Unpack the learned parameters.
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    W_embed = self.params['W_embed']
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    next_h, _ = affine_forward(features, W_proj, b_proj)   # initial hidden state
    next_c = np.zeros((N, W_proj.shape[1]))                # initial cell state (used by the LSTM)
    word = self._start * np.ones((N,), dtype=np.int32)     # start every caption with <START>
    for i in range(max_length):
        word, _ = word_embedding_forward(word, W_embed)    # embed the previous word
        if self.cell_type == 'rnn':
            next_h, _ = rnn_step_forward(word, next_h, Wx, Wh, b)
        else:
            next_h, next_c, _ = lstm_step_forward(word, next_h, next_c, Wx, Wh, b)
        out, _ = affine_forward(next_h, W_vocab, b_vocab)   # vocabulary scores
        word = out.argmax(axis=1)                           # greedily pick the highest-scoring word
        captions[:, i] = word
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    return captions
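The sampled captions are integer word indices; to read them, map the indices back to words. A small sketch, assuming a trained CaptioningRNN instance named model (a hypothetical name), an (N, D) features array, and the idx_to_word mapping the model stores:

sampled = model.sample(features, max_length=17)   # (N, 17) int array
for row in sampled:
    words = [model.idx_to_word[int(idx)] for idx in row]
    print(' '.join(w for w in words if w not in ('<NULL>', '<END>')))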
def lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b):
    """
    Forward pass for a single timestep of an LSTM.

    The input data has dimension D, the hidden state has dimension H, and we
    use a minibatch size of N.

    Note that a sigmoid() function has already been provided for you in this
    file.

    Inputs:
    - x: Input data, of shape (N, D)
    - prev_h: Previous hidden state, of shape (N, H)
    - prev_c: previous cell state, of shape (N, H)
    - Wx: Input-to-hidden weights, of shape (D, 4H)
    - Wh: Hidden-to-hidden weights, of shape (H, 4H)
    - b: Biases, of shape (4H,)

    Returns a tuple of:
    - next_h: Next hidden state, of shape (N, H)
    - next_c: Next cell state, of shape (N, H)
    - cache: Tuple of values needed for backward pass.
    """
    next_h, next_c, cache = None, None, None
    N, H = prev_h.shape
    a = np.dot(x, Wx) + np.dot(prev_h, Wh) + b   # pre-activations for all four gates, shape (N, 4H)
    i = sigmoid(a[:, :H])        # input gate
    f = sigmoid(a[:, H:2*H])     # forget gate
    o = sigmoid(a[:, 2*H:3*H])   # output gate
    g = np.tanh(a[:, 3*H:])      # candidate cell values
    next_c = f * prev_c + i * g
    next_h = o * np.tanh(next_c)
    cache = i, f, o, g, next_c, Wh, Wx, prev_c, prev_h, x
    return next_h, next_c, cache
def lstm_step_backward(dnext_h, dnext_c, cache):
    """
    Backward pass for a single timestep of an LSTM.

    Inputs:
    - dnext_h: Gradients of next hidden state, of shape (N, H)
    - dnext_c: Gradients of next cell state, of shape (N, H)
    - cache: Values from the forward pass

    Returns a tuple of:
    - dx: Gradient of input data, of shape (N, D)
    - dprev_h: Gradient of previous hidden state, of shape (N, H)
    - dprev_c: Gradient of previous cell state, of shape (N, H)
    - dWx: Gradient of input-to-hidden weights, of shape (D, 4H)
    - dWh: Gradient of hidden-to-hidden weights, of shape (H, 4H)
    - db: Gradient of biases, of shape (4H,)
    """
    dx, dprev_h, dprev_c, dWx, dWh, db = None, None, None, None, None, None
    i, f, o, g, next_c, Wh, Wx, prev_c, prev_h, x = cache
    dprev_c = dnext_c * f + dnext_h * o * f * (1 - np.tanh(next_c)**2)
    dc = dnext_c + (1 - np.tanh(next_c)**2) * o * dnext_h  # total gradient into next_c (this step was tricky to get right)
    di = dc * g * i * (1 - i)
    df = dc * prev_c * f * (1 - f)
    do = dnext_h * np.tanh(next_c) * o * (1 - o)
    dg = dc * i * (1 - g**2)
    # Concatenate the four gate gradients back into the pre-activation gradient,
    # then backpropagate through the affine transform a = x.Wx + prev_h.Wh + b.
    da = np.hstack((di, df, do, dg))
    dx = da.dot(Wx.T)
    dprev_h = da.dot(Wh.T)
    dWx = x.T.dot(da)
    dWh = prev_h.T.dot(da)
    db = da.sum(axis=0)
    return dx, dprev_h, dprev_c, dWx, dWh, db
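Before wiring the step functions into the full sequence, lstm_step_backward can be checked against a numerical gradient. A minimal finite-difference sketch for a single entry of dWx, assuming numpy is imported as np and the two step functions above:

np.random.seed(0)
N, D, H = 3, 4, 5
x = np.random.randn(N, D)
prev_h = np.random.randn(N, H)
prev_c = np.random.randn(N, H)
Wx = np.random.randn(D, 4 * H)
Wh = np.random.randn(H, 4 * H)
b = np.random.randn(4 * H)
dnext_h = np.random.randn(N, H)
dnext_c = np.random.randn(N, H)

_, _, cache = lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b)
dx, dprev_h, dprev_c, dWx, dWh, db = lstm_step_backward(dnext_h, dnext_c, cache)

# Numerically estimate d/dWx[0, 0] of sum(next_h * dnext_h) + sum(next_c * dnext_c)
eps = 1e-6
Wx[0, 0] += eps
h_p, c_p, _ = lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b)
Wx[0, 0] -= 2 * eps
h_m, c_m, _ = lstm_step_forward(x, prev_h, prev_c, Wx, Wh, b)
Wx[0, 0] += eps
numeric = (np.sum((h_p - h_m) * dnext_h) + np.sum((c_p - c_m) * dnext_c)) / (2 * eps)
print(numeric, dWx[0, 0])   # the two numbers should agree to several decimal places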
def lstm_forward(x, h0, Wx, Wh, b):
    """
    Forward pass for an LSTM over an entire sequence of data. We assume an input
    sequence composed of T vectors, each of dimension D. The LSTM uses a hidden
    size of H, and we work over a minibatch containing N sequences. After running
    the LSTM forward, we return the hidden states for all timesteps.

    Note that the initial cell state is not passed as an input; it is simply set
    to zero. Also note that the cell state is not returned; it is an internal
    variable to the LSTM and is not accessed from outside.

    Inputs:
    - x: Input data of shape (N, T, D)
    - h0: Initial hidden state of shape (N, H)
    - Wx: Weights for input-to-hidden connections, of shape (D, 4H)
    - Wh: Weights for hidden-to-hidden connections, of shape (H, 4H)
    - b: Biases of shape (4H,)

    Returns a tuple of:
    - h: Hidden states for all timesteps of all sequences, of shape (N, T, H)
    - cache: Values needed for the backward pass.
    """
    h, cache = None, None
    N, T, D = x.shape
    N, H = h0.shape
    h = np.zeros((N, T, H))
    cache = []
    c0 = np.zeros_like(h0)   # initial cell state is zero
    for i in range(T):
        h0, c0, c = lstm_step_forward(x[:, i, :], h0, c0, Wx, Wh, b)
        h[:, i, :] = h0
        cache.append(c)
    return h, cache
def lstm_backward(dh, cache):
    """
    Backward pass for an LSTM over an entire sequence of data.

    Inputs:
    - dh: Upstream gradients of hidden states, of shape (N, T, H)
    - cache: Values from the forward pass

    Returns a tuple of:
    - dx: Gradient of input data of shape (N, T, D)
    - dh0: Gradient of initial hidden state of shape (N, H)
    - dWx: Gradient of input-to-hidden weight matrix of shape (D, 4H)
    - dWh: Gradient of hidden-to-hidden weight matrix of shape (H, 4H)
    - db: Gradient of biases, of shape (4H,)
    """
    dx, dh0, dWx, dWh, db = None, None, None, None, None
    N, T, H = dh.shape
    _, D = cache[0][-1].shape   # cache[0][-1] is x for the first timestep
    dx = np.zeros((N, T, D))
    dc = np.zeros((N, H))
    dWx = np.zeros((D, 4 * H))
    dWh = np.zeros((H, 4 * H))
    db = np.zeros((4 * H,))
    dh0 = np.zeros((N, H))
    for i in reversed(range(T)):
        dx[:, i, :], dh0, dc, dWx_, dWh_, db_ = lstm_step_backward(dh[:, i, :] + dh0, dc, cache.pop())
        db += db_
        dWx += dWx_
        dWh += dWh_
    return dx, dh0, dWx, dWh, db
def content_loss(content_weight, content_current, content_original):
    """
    Compute the content loss for style transfer.

    Inputs:
    - content_weight: Scalar giving the weighting for the content loss.
    - content_current: features of the current image; this is a PyTorch Tensor
      of shape (1, C_l, H_l, W_l).
    - content_original: features of the content image, Tensor with shape
      (1, C_l, H_l, W_l).

    Returns:
    - scalar content loss
    """
    loss = content_weight * torch.sum(torch.square(content_current.squeeze() - content_original.squeeze()))
    return loss
def gram_matrix(features, normalize=True):
    """
    Compute the Gram matrix from features.

    Inputs:
    - features: PyTorch Tensor of shape (N, C, H, W) giving features for
      a batch of N images.
    - normalize: optional, whether to normalize the Gram matrix
      If True, divide the Gram matrix by the number of neurons (H * W * C)

    Returns:
    - gram: PyTorch Tensor of shape (N, C, C) giving the (optionally normalized)
      Gram matrices for the N input images.
    """
    N, C, H, W = features.size()
    new_features = features.reshape((N, C, H * W))
    # Batched outer product of the flattened feature maps with themselves.
    gram_mat = torch.bmm(new_features, new_features.permute(0, 2, 1))
    if normalize:
        return gram_mat / (H * W * C)
    else:
        return gram_mat
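For instance, a batch of two 8-channel feature maps yields two 8x8 Gram matrices; a quick shape check, assuming torch is imported:

feats = torch.randn(2, 8, 16, 16)   # (N, C, H, W)
G = gram_matrix(feats)
print(G.shape)                      # torch.Size([2, 8, 8])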
def style_loss(feats, style_layers, style_targets, style_weights):
    """
    Computes the style loss at a set of layers.

    Inputs:
    - feats: list of the features at every layer of the current image, as produced by
      the extract_features function.
    - style_layers: List of layer indices into feats giving the layers to include in the
      style loss.
    - style_targets: List of the same length as style_layers, where style_targets[i] is
      a PyTorch Tensor giving the Gram matrix of the source style image computed at
      layer style_layers[i].
    - style_weights: List of the same length as style_layers, where style_weights[i]
      is a scalar giving the weight for the style loss at layer style_layers[i].

    Returns:
    - style_loss: A PyTorch Tensor holding a scalar giving the style loss.
    """
    loss = torch.zeros(1)
    for i in range(len(style_layers)):
        gram_mat = gram_matrix(feats[style_layers[i]])
        loss += style_weights[i] * torch.sum((gram_mat - style_targets[i]).square())
    return loss
Total-variation regularization
This term is a regularization penalty on the generated image that encourages spatial smoothness. The formula is:

$$L_{tv} = w_t \sum_{c=1}^{3}\sum_{i=1}^{H-1}\sum_{j=1}^{W-1}\Big[(x_{i+1,j,c}-x_{i,j,c})^2 + (x_{i,j+1,c}-x_{i,j,c})^2\Big]$$
The code is as follows:
def tv_loss(img, tv_weight):
    """
    Compute total variation loss.

    Inputs:
    - img: PyTorch Variable of shape (1, 3, H, W) holding an input image.
    - tv_weight: Scalar giving the weight w_t to use for the TV loss.

    Returns:
    - loss: PyTorch Variable holding a scalar giving the total variation loss
      for img weighted by tv_weight.
    """
    # Sum of squared differences between vertically and horizontally adjacent pixels.
    loss = tv_weight * (torch.sum((img[0, :, 1:, :] - img[0, :, :-1, :]).square())
                        + torch.sum((img[0, :, :, 1:] - img[0, :, :, :-1]).square()))
    return loss
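As a quick sanity check (assuming torch is imported), a constant image has zero total variation:

img = torch.ones(1, 3, 32, 32)
print(tv_loss(img, tv_weight=1.0))   # tensor(0.)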