cjwbw / starcoder2-15b

Language Models for Code

  • Public
  • 246 runs
  • L40S
  • GitHub
  • Paper
  • License

Input

• string: Text input. Default: "def print_hello_world():"
• integer (minimum: 1): Maximum number of tokens to generate. A word is generally 2-3 tokens. Default: 256
• number (minimum: 0.01, maximum: 5): Adjusts the randomness of outputs; values greater than 1 are more random, 0 is deterministic, and 0.75 is a good starting value. Default: 0.7
• number (minimum: 0, maximum: 1): When decoding text, samples from the top p percentage of most likely tokens; lower this value to ignore less likely tokens. Default: 0.95
• integer (minimum: -1): When decoding text, samples from the top k most likely tokens; lower this value to ignore less likely tokens. Default: -1
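Below is a minimal sketch of calling the model with these inputs through the Replicate Python client. The input field names used here (prompt, max_tokens, temperature, top_p, top_k) are assumptions inferred from the descriptions above, not confirmed against this model's schema; check the API tab for the exact names, and pin a specific model version for reproducible results.

```python
# Hypothetical example: the input field names are guesses based on the form above.
import replicate  # pip install replicate; needs REPLICATE_API_TOKEN in the environment

output = replicate.run(
    "cjwbw/starcoder2-15b",  # consider pinning "cjwbw/starcoder2-15b:<version>" in production
    input={
        "prompt": "def print_hello_world():",  # text input
        "max_tokens": 256,    # maximum number of tokens to generate
        "temperature": 0.7,   # >1 more random, 0 deterministic
        "top_p": 0.95,        # nucleus sampling cutoff
        "top_k": -1,          # default from the form above
    },
)

# Depending on the model, output may be a string or an iterable of string chunks.
print(output if isinstance(output, str) else "".join(output))
```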

Output

class encoder_decoder_transformer():
    def __init__(self, vocab_size, d_model, num_layers, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        self.encoder = encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output, attention_weights


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask


def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]


def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask


def evaluate(inp_sentence):
    start_token = [tokenizer_pt.vocab_size]
    end_token = [tokenizer_pt.vocab_size + 1]
    inp_sentence = start_token + tokenizer_pt.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)
    decoder_input = [tokenizer_pt.vocab_size]
    output = tf.expand_dims(decoder_input, 0)
    for i in range(MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)
        predictions, attention_weights = transformer(encoder_input, output, False, enc_padding_mask, combined_mask, dec_padding_mask)
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        if predicted_id == tokenizer_pt.vocab_size + 1:
            return tf.squeeze(output, axis=0), attention_weights
        output = tf.concat([output, predicted_id], axis=-1)
    return tf.squeeze(output, axis=0), attention_weights


def plot_attention_weights(attention, sentence, result, layer):
    fig = plt.figure(figsize=(16, 8))
    sentence = tokenizer_pt.encode(sentence)
    attention = tf.squeeze(attention[layer], axis=0)
    for head in range(attention.shape[0]):
        ax = fig.add_subplot(2, 4, head + 1)
        ax.matshow(attention[head][:-1, :], cmap='viridis')
        fontdict = {'fontsize': 10}
        ax.set_xticks(range(len(sentence) + 2))
        ax.set_yticks(range(len(result)))
        ax.set_ylim(len(result) - 1.5, -0.5)
        ax.set_xticklabels(
            ['<start>'] + [tokenizer_pt.decode([i]) for i in sentence] + ['<end>'],
            fontdict=fontdict, rotation=90)
        ax.set_yticklabels(
            [tokenizer_en.decode([i]) for i in result if i < tokenizer_en.vocab_size],
            fontdict=fontdict)
        ax.set_xlabel('Head {}'.format(head + 1))
    plt.tight_layout()
    plt.show()


def translate(sentence, plot=''):
    result, attention_weights = evaluate(sentence)
    predicted_sentence = tokenizer_en.decode([i for i in result if i < tokenizer_en.vocab_size])
    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
    if plot:
        plot_attention_weights(attention_weights, sentence, result, plot)


def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)


def train(EPOCHS):
    for epoch in range(EPOCHS):
        start = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()
        for (batch, (inp, tar)) in enumerate(train_dataset):
            train_step(inp, tar)
            if batch % 50 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, batch, train_loss.result(), train_accuracy.result()))
        if (epoch + 1) % 5 == 0:
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))
        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, train_loss.result(), train_accuracy.result()))
        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


def main():
    train(EPOCHS)
    translate("este é um problema que temos que resolver.")
    translate("os meus vizinhos ouviram sobre esta ideia.")
    translate("vou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram.")
    translate("este é o primeiro livro que eu fiz.", plot='decoder_layer4_block2')


if __name__ == "__main__":
    main()

/README.md

# Transformer

This is a Tensorflow implementation of the Transformer model from the paper "Attention is All You Need" by and The model is trained on the Portuguese-English translation dataset from Tatoeba.

## Requirements

* Python 3.6
* Tensorflow 2.0
* Tensorflow Datasets
* Tensorflow Text
* Matplotlib

## Usage

To train the model, run:

```
python transformer.py

Run time and cost

This model costs approximately $0.99 to run on Replicate, or about 1 run per $1, but this varies depending on your inputs. It is also open source, so you can run it on your own computer with Docker.

This model runs on Nvidia L40S GPU hardware. Predictions typically complete within 17 minutes, but prediction time varies significantly based on the inputs.
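If you run it yourself, the container built by Replicate exposes Cog's standard HTTP prediction API. The sketch below assumes you have already started the model's Docker image locally (the exact image reference and version hash come from the model's API page) and reuses the same assumed input field names as in the earlier example.

```python
# Rough sketch: query a locally running Cog container for this model.
# Assumes the container was started beforehand, e.g.
#   docker run -d -p 5000:5000 --gpus=all <image-from-the-models-API-page>
# and that it serves Cog's standard /predictions endpoint.
import requests

resp = requests.post(
    "http://localhost:5000/predictions",
    json={"input": {"prompt": "def print_hello_world():", "max_tokens": 256}},
    timeout=1800,  # generation with a 15B model can take a while
)
resp.raise_for_status()
print(resp.json()["output"])
```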

Readme

StarCoder2

Model Summary

StarCoder2-15B is a 15B-parameter model trained on 600+ programming languages from The Stack v2, with opt-out requests excluded. The model uses Grouped Query Attention, a context window of 16,384 tokens with sliding window attention of 4,096 tokens, and was trained using the Fill-in-the-Middle objective on 4+ trillion tokens.
The model was trained with NVIDIA NeMo™ Framework using the NVIDIA Eos Supercomputer built with NVIDIA DGX H100 systems.
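Because of the Fill-in-the-Middle training objective, the model can complete a span of code given both a prefix and a suffix. The sketch below uses the upstream bigcode/starcoder2-15b checkpoint with the Hugging Face transformers library and assumes the standard StarCoder FIM sentinel tokens (<fim_prefix>, <fim_suffix>, <fim_middle>); verify these against the tokenizer, and note that the 15B model needs a large GPU (or quantization) to load.

```python
# Sketch of fill-in-the-middle generation with the upstream Hugging Face checkpoint.
# The FIM sentinel tokens below are assumed; check tokenizer.special_tokens_map.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/starcoder2-15b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")

prefix = "def fibonacci(n):\n    "
suffix = "\n    return a\n"
prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.2)

# Everything generated after the prompt is the model's proposal for the missing middle.
middle = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(middle)
```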

Intended use

The model was trained on GitHub code as well as additional selected data sources such as Arxiv and Wikipedia. As such it is not an instruction model and commands like “Write a function that computes the square root.” do not work well.
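In practice this means prompts should be written as code to be continued rather than as natural-language commands. A hypothetical reframing of the instruction above:

```python
# Instead of prompting with the instruction
#   "Write a function that computes the square root."
# give the model code context to continue, for example:
prompt = '''import math


def square_root(x: float) -> float:
    """Compute the square root of x."""
'''
# The model then completes the function body from this prefix.
```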

Citation

@misc{lozhkov2024starcoder,
      title={StarCoder 2 and The Stack v2: The Next Generation}, 
      author={Anton Lozhkov and Raymond Li and Loubna Ben Allal and Federico Cassano and Joel Lamy-Poirier and Nouamane Tazi and Ao Tang and Dmytro Pykhtar and Jiawei Liu and Yuxiang Wei and Tianyang Liu and Max Tian and Denis Kocetkov and Arthur Zucker and Younes Belkada and Zijian Wang and Qian Liu and Dmitry Abulkhanov and Indraneil Paul and Zhuang Li and Wen-Ding Li and Megan Risdal and Jia Li and Jian Zhu and Terry Yue Zhuo and Evgenii Zheltonozhskii and Nii Osae Osae Dade and Wenhao Yu and Lucas Krauß and Naman Jain and Yixuan Su and Xuanli He and Manan Dey and Edoardo Abati and Yekun Chai and Niklas Muennighoff and Xiangru Tang and Muhtasham Oblokulov and Christopher Akiki and Marc Marone and Chenghao Mou and Mayank Mishra and Alex Gu and Binyuan Hui and Tri Dao and Armel Zebaze and Olivier Dehaene and Nicolas Patry and Canwen Xu and Julian McAuley and Han Hu and Torsten Scholak and Sebastien Paquet and Jennifer Robinson and Carolyn Jane Anderson and Nicolas Chapados and Mostofa Patwary and Nima Tajbakhsh and Yacine Jernite and Carlos Muñoz Ferrandis and Lingming Zhang and Sean Hughes and Thomas Wolf and Arjun Guha and Leandro von Werra and Harm de Vries},
      year={2024},
      eprint={2402.19173},
      archivePrefix={arXiv},
      primaryClass={cs.SE}
}