cjwbw / starcoder2-15b

Language Models for Code

  • Public
  • 246 runs
  • L40S
  • GitHub
  • Paper
  • License

Input

• string: Text input. Default: "def print_hello_world():"
• integer (minimum: 1): Maximum number of tokens to generate. A word is generally 2-3 tokens. Default: 256
• number (minimum: 0.01, maximum: 5): Adjusts the randomness of outputs; values greater than 1 are more random, 0 is deterministic, and 0.75 is a good starting value. Default: 0.7
• number (minimum: 0, maximum: 1): When decoding text, samples from the top p percentage of most likely tokens; lower this value to ignore less likely tokens. Default: 0.95
• integer (minimum: -1): When decoding text, samples from the top k most likely tokens; lower this value to ignore less likely tokens. Default: -1
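Below is a minimal sketch of calling the model with these inputs through the Replicate Python client. The input field names used here (prompt, max_tokens, temperature, top_p, top_k) are assumptions inferred from the descriptions above, not confirmed against this model's schema; check the API tab for the exact names, and pin a specific model version for reproducible results.

```python
# Hypothetical example: the input field names are guesses based on the form above.
import replicate  # pip install replicate; needs REPLICATE_API_TOKEN in the environment

output = replicate.run(
    "cjwbw/starcoder2-15b",  # consider pinning "cjwbw/starcoder2-15b:<version>" in production
    input={
        "prompt": "def print_hello_world():",  # text input
        "max_tokens": 256,    # maximum number of tokens to generate
        "temperature": 0.7,   # >1 more random, 0 deterministic
        "top_p": 0.95,        # nucleus sampling cutoff
        "top_k": -1,          # default from the form above
    },
)

# Depending on the model, output may be a string or an iterable of string chunks.
print(output if isinstance(output, str) else "".join(output))
```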

Output

class encoder_decoder_transformer():
    def __init__(self, vocab_size, d_model, num_layers, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        self.encoder = encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        self.decoder = decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        self.final_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output, attention_weights


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)


def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
    return enc_padding_mask, combined_mask, dec_padding_mask


def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]


def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask


def evaluate(inp_sentence):
    start_token = [tokenizer_pt.vocab_size]
    end_token = [tokenizer_pt.vocab_size + 1]
    inp_sentence = start_token + tokenizer_pt.encode(inp_sentence) + end_token
    encoder_input = tf.expand_dims(inp_sentence, 0)
    decoder_input = [tokenizer_pt.vocab_size]
    output = tf.expand_dims(decoder_input, 0)
    for i in range(MAX_LENGTH):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)
        predictions, attention_weights = transformer(encoder_input, output, False, enc_padding_mask, combined_mask, dec_padding_mask)
        predictions = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        if predicted_id == tokenizer_pt.vocab_size + 1:
            return tf.squeeze(output, axis=0), attention_weights
        output = tf.concat([output, predicted_id], axis=-1)
    return tf.squeeze(output, axis=0), attention_weights


def plot_attention_weights(attention, sentence, result, layer):
    fig = plt.figure(figsize=(16, 8))
    sentence = tokenizer_pt.encode(sentence)
    attention = tf.squeeze(attention[layer], axis=0)
    for head in range(attention.shape[0]):
        ax = fig.add_subplot(2, 4, head + 1)
        ax.matshow(attention[head][:-1, :], cmap='viridis')
        fontdict = {'fontsize': 10}
        ax.set_xticks(range(len(sentence) + 2))
        ax.set_yticks(range(len(result)))
        ax.set_ylim(len(result) - 1.5, -0.5)
        ax.set_xticklabels(
            ['<start>'] + [tokenizer_pt.decode([i]) for i in sentence] + ['<end>'],
            fontdict=fontdict, rotation=90)
        ax.set_yticklabels(
            [tokenizer_en.decode([i]) for i in result if i < tokenizer_en.vocab_size],
            fontdict=fontdict)
        ax.set_xlabel('Head {}'.format(head + 1))
    plt.tight_layout()
    plt.show()


def translate(sentence, plot=''):
    result, attention_weights = evaluate(sentence)
    predicted_sentence = tokenizer_en.decode([i for i in result if i < tokenizer_en.vocab_size])
    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(predicted_sentence))
    if plot:
        plot_attention_weights(attention_weights, sentence, result, plot)


def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True, enc_padding_mask, combined_mask, dec_padding_mask)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)


def train(EPOCHS):
    for epoch in range(EPOCHS):
        start = time.time()
        train_loss.reset_states()
        train_accuracy.reset_states()
        for (batch, (inp, tar)) in enumerate(train_dataset):
            train_step(inp, tar)
            if batch % 50 == 0:
                print('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, batch, train_loss.result(), train_accuracy.result()))
        if (epoch + 1) % 5 == 0:
            ckpt_save_path = ckpt_manager.save()
            print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))
        print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, train_loss.result(), train_accuracy.result()))
        print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))


def main():
    train(EPOCHS)
    translate("este é um problema que temos que resolver.")
    translate("os meus vizinhos ouviram sobre esta ideia.")
    translate("vou então muito rapidamente partilhar convosco algumas histórias de algumas coisas mágicas que aconteceram.")
    translate("este é o primeiro livro que eu fiz.", plot='decoder_layer4_block2')


if __name__ == "__main__":
    main()

/README.md

# Transformer

This is a Tensorflow implementation of the Transformer model from the paper "Attention is All You Need" by and The model is trained on the Portuguese-English translation dataset from Tatoeba.

## Requirements

* Python 3.6
* Tensorflow 2.0
* Tensorflow Datasets
* Tensorflow Text
* Matplotlib

## Usage

To train the model, run:

```
python transformer.py

Run time and cost

This model costs approximately $0.99 to run on Replicate, or about 1 run per $1, but this varies depending on your inputs. It is also open source, so you can run it on your own computer with Docker.

This model runs on Nvidia L40S GPU hardware. Predictions typically complete within 17 minutes, but prediction time varies significantly based on the inputs.
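If you run it yourself, the container built by Replicate exposes Cog's standard HTTP prediction API. The sketch below assumes you have already started the model's Docker image locally (the exact image reference and version hash come from the model's API page) and reuses the same assumed input field names as in the earlier example.

```python
# Rough sketch: query a locally running Cog container for this model.
# Assumes the container was started beforehand, e.g.
#   docker run -d -p 5000:5000 --gpus=all <image-from-the-models-API-page>
# and that it serves Cog's standard /predictions endpoint.
import requests

resp = requests.post(
    "http://localhost:5000/predictions",
    json={"input": {"prompt": "def print_hello_world():", "max_tokens": 256}},
    timeout=1800,  # generation with a 15B model can take a while
)
resp.raise_for_status()
print(resp.json()["output"])
```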

Readme

StarCoder2

Model Summary

StarCoder2-15B is a 15B-parameter model trained on 600+ programming languages from The Stack v2, with opt-out requests excluded. The model uses Grouped Query Attention, a context window of 16,384 tokens with sliding window attention of 4,096 tokens, and was trained using the Fill-in-the-Middle objective on 4+ trillion tokens.
The model was trained with NVIDIA NeMo™ Framework using the NVIDIA Eos Supercomputer built with NVIDIA DGX H100 systems.
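Because of the Fill-in-the-Middle training objective, the model can complete a span of code given both a prefix and a suffix. The sketch below uses the upstream bigcode/starcoder2-15b checkpoint with the Hugging Face transformers library and assumes the standard StarCoder FIM sentinel tokens (<fim_prefix>, <fim_suffix>, <fim_middle>); verify these against the tokenizer, and note that the 15B model needs a large GPU (or quantization) to load.

```python
# Sketch of fill-in-the-middle generation with the upstream Hugging Face checkpoint.
# The FIM sentinel tokens below are assumed; check tokenizer.special_tokens_map.
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/starcoder2-15b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")

prefix = "def fibonacci(n):\n    "
suffix = "\n    return a\n"
prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.2)

# Everything generated after the prompt is the model's proposal for the missing middle.
middle = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(middle)
```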

Intended use

The model was trained on GitHub code as well as additional selected data sources such as Arxiv and Wikipedia. As such it is not an instruction model and commands like “Write a function that computes the square root.” do not work well.
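In practice this means prompts should be written as code to be continued rather than as natural-language commands. A hypothetical reframing of the instruction above:

```python
# Instead of prompting with the instruction
#   "Write a function that computes the square root."
# give the model code context to continue, for example:
prompt = '''import math


def square_root(x: float) -> float:
    """Compute the square root of x."""
'''
# The model then completes the function body from this prefix.
```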

Citation

@misc{lozhkov2024starcoder,
      title={StarCoder 2 and The Stack v2: The Next Generation}, 
      author={Anton Lozhkov and Raymond Li and Loubna Ben Allal and Federico Cassano and Joel Lamy-Poirier and Nouamane Tazi and Ao Tang and Dmytro Pykhtar and Jiawei Liu and Yuxiang Wei and Tianyang Liu and Max Tian and Denis Kocetkov and Arthur Zucker and Younes Belkada and Zijian Wang and Qian Liu and Dmitry Abulkhanov and Indraneil Paul and Zhuang Li and Wen-Ding Li and Megan Risdal and Jia Li and Jian Zhu and Terry Yue Zhuo and Evgenii Zheltonozhskii and Nii Osae Osae Dade and Wenhao Yu and Lucas Krauß and Naman Jain and Yixuan Su and Xuanli He and Manan Dey and Edoardo Abati and Yekun Chai and Niklas Muennighoff and Xiangru Tang and Muhtasham Oblokulov and Christopher Akiki and Marc Marone and Chenghao Mou and Mayank Mishra and Alex Gu and Binyuan Hui and Tri Dao and Armel Zebaze and Olivier Dehaene and Nicolas Patry and Canwen Xu and Julian McAuley and Han Hu and Torsten Scholak and Sebastien Paquet and Jennifer Robinson and Carolyn Jane Anderson and Nicolas Chapados and Mostofa Patwary and Nima Tajbakhsh and Yacine Jernite and Carlos Muñoz Ferrandis and Lingming Zhang and Sean Hughes and Thomas Wolf and Arjun Guha and Leandro von Werra and Harm de Vries},
      year={2024},
      eprint={2402.19173},
      archivePrefix={arXiv},
      primaryClass={cs.SE}
}