cjwbw / starcoder2-15b
Language Models for Code
Prediction
cjwbw/starcoder2-15b:d67b7d32b63bb8a2cf6b95c523921408e38ce7d7228fdff7b1eb636dc2c5ecd8
ID: 45gxk6lbioeuqmjignlrrpnvke · Status: Succeeded · Source: Web · Hardware: A40 (Large)

Prediction
cjwbw/starcoder2-15b:d67b7d32b63bb8a2cf6b95c523921408e38ce7d7228fdff7b1eb636dc2c5ecd8
ID: rbhl3plb7frd7hnfqv4qrmn4lu · Status: Succeeded · Source: Web · Hardware: A40 (Large)
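Prediction records like the ones above can also be inspected programmatically. The snippet below is a minimal sketch, assuming the official `replicate` Python client and a `REPLICATE_API_TOKEN` in the environment; the fields mirror what the page displays.

```python
# Minimal sketch (assumes `pip install replicate` and REPLICATE_API_TOKEN is set):
# fetch one of the prediction records listed above by its ID.
import replicate

prediction = replicate.predictions.get("rbhl3plb7frd7hnfqv4qrmn4lu")
print(prediction.status)  # e.g. "succeeded"
print(prediction.output)  # the generated text shown under "Output" below
```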
Input

- top_k: -1
- top_p: 0.95
- prompt: class encoder_decoder_transformer():
- temperature: 0.01
- max_new_tokens: 2000
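The same inputs can be reproduced outside the web UI. The snippet below is a minimal sketch, assuming the `replicate` Python client; the exact output shape (a single string vs. a streamed list of string chunks) can vary by model version, so it is normalized before printing.

```python
# Minimal sketch (assumes `pip install replicate` and REPLICATE_API_TOKEN is set):
# re-run the prediction with the same inputs as above.
import replicate

output = replicate.run(
    "cjwbw/starcoder2-15b:d67b7d32b63bb8a2cf6b95c523921408e38ce7d7228fdff7b1eb636dc2c5ecd8",
    input={
        "prompt": "class encoder_decoder_transformer():",
        "top_k": -1,
        "top_p": 0.95,
        "temperature": 0.01,
        "max_new_tokens": 2000,
    },
)

# The client may return the text as a list of streamed chunks; join it into one string.
text = output if isinstance(output, str) else "".join(output)
print(text)
```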
Output
````
class encoder_decoder_transformer():
    def __init__(self, vocab_size, d_model, num_layers, num_heads, dff,
                 input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        self.encoder = encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output, attention_weights


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask


def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]


def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask


def evaluate(inp_sentence):
    start_token = [tokenizer_pt.vocab_size]
    end_token = [tokenizer_pt.vocab_size + 1]
    inp_sentence = start_token + tokenizer_pt.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)
    decoder_input = [tokenizer_pt.vocab_size]
    output = tf.expand_dims(decoder_input, 0)
    for i in range(MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)
        predictions, attention_weights = transformer(encoder_input, output, False,
                                                     enc_padding_mask, combined_mask, dec_padding_mask)
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        if predicted_id == tokenizer_pt.vocab_size + 1:
            return tf.squeeze(output, axis=0), attention_weights
        output = tf.concat([output, predicted_id], axis=-1)
    return tf.squeeze(output, axis=0), attention_weights


def plot_attention_weights(attention, sentence, result, layer):
    fig = plt.figure(figsize=(16, 8))
    sentence = tokenizer_pt.encode(sentence)
    attention = tf.squeeze(attention[layer], axis=0)
    for head in range(attention.shape[0]):
        ax = fig.add_subplot(2, 4, head + 1)
        ax.matshow(attention[head][:-1, :], cmap='viridis')
        fontdict = {'fontsize': 10}
        ax.set_xticks(range(len(sentence) + 2))
        ax.set_yticks(range(len(result)))
        ax.set_ylim(len(result) - 1.5, -0.5)
        ax.set_xticklabels(
            ['<start>'] + [tokenizer_pt.decode([i]) for i in sentence] + ['<end>'],
            fontdict=fontdict, rotation=90)
        ax.set_yticklabels(
            [tokenizer_en.decode([i]) for i in result if i < tokenizer_en.vocab_size],
            fontdict=fontdict)
        ax.set_xlabel('Head {}'.format(head + 1))
    plt.tight_layout()
    plt.show()


def translate(sentence, plot=''):
    result, attention_weights = evaluate(sentence)
    predicted_sentence = tokenizer_en.decode([i for i in result if i < tokenizer_en.vocab_size])
    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
    if plot:
        plot_attention_weights(attention_weights, sentence, result, plot)


def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)


def train(EPOCHS):
    for epoch in range(EPOCHS):
        start = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()
        for (batch, (inp, tar)) in enumerate(train_dataset):
            train_step(inp, tar)
            if batch % 50 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
                    epoch + 1, batch, train_loss.result(), train_accuracy.result()))
        if (epoch + 1) % 5 == 0:
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))
        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
            epoch + 1, train_loss.result(), train_accuracy.result()))
        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


def main():
    train(EPOCHS)
    translate("este é um problema que temos que resolver.")
    translate("os meus vizinhos ouviram sobre esta ideia.")
    translate("vou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram.")
    translate("este é o primeiro livro que eu fiz.", plot='decoder_layer4_block2')


if __name__ == "__main__":
    main()


/README.md

# Transformer

This is a Tensorflow implementation of the Transformer model from the paper "Attention is All You Need" by and The model is trained on the Portuguese-English translation dataset from Tatoeba.

## Requirements

* Python 3.6
* Tensorflow 2.0
* Tensorflow Datasets
* Tensorflow Text
* Matplotlib

## Usage

To train the model, run:

```
python transformer.py
````