geopti/chatterbox-multilingual

Public

566 runs

Run geopti/chatterbox-multilingual with an API

Use one of our client libraries to get started quickly. Clicking on a library will take you to the Playground tab where you can tweak different inputs, see the results, and copy the corresponding code to use in your own project.

Input schema

The fields you can use to run this model with an API. If you don't give a value for a field, its default value will be used.

Field	Type	Default value	Description
text	string		The text you want spoken. Can be a single sentence or a long paragraph — long inputs are automatically split into chunks.
language	string	en	Language of the text. Use the two-letter code (en=English, fr=French, de=German, es=Spanish, ja=Japanese, zh=Chinese, ar=Arabic, el=Greek, etc.).
audio_prompt	string		Optional reference voice clip (.wav/.mp3). The output will mimic this voice. If left empty, a default voice is used.
cfg_weight	number	0.5 Min: 0 Max: 1	How closely the speech follows the text. Higher = sticks to the text more strictly. Lower = more freedom (but can hallucinate or get stuck).
exaggeration	number	0.5 Min: 0 Max: 1	How expressive the voice is. Higher = more emotional / dramatic. Lower = more flat / neutral.
temperature	number	0.8 Min: 0 Max: 2	Randomness of the voice. Higher = more variation between runs. Lower = more consistent / robotic.
repetition_penalty	number	2 Min: 1 Max: 5	Penalty for repeating the same sounds. Higher = less repetition.
top_p	number	1 Min: 0 Max: 1	Top-p (nucleus) sampling. Restricts the model to the most likely tokens. 1.0 = no restriction.
pause_between_sentences	number	0.1 Min: 0 Max: 5	Length of the silence (in seconds) inserted between sentences.
max_words_per_chunk	integer	60 Min: 10 Max: 200	Long texts are split into chunks before generation. This is the max number of words per chunk. Smaller = safer for tricky languages, but slower.
repeated_token_threshold	integer	3 Min: 2 Max: 10	If the model repeats the same sound this many times in a row, the chunk is cut off (prevents the model from getting stuck looping). Raise this if too much real speech is being cut.
garbage_trim_buffer	integer	25 Min: 0 Max: 200	Number of audio frames kept after the model finishes saying the sentence (each frame = ~40ms). Lower = trims garbage tails more aggressively but may cut off the last syllable.

{
  "type": "object",
  "title": "Input",
  "required": [
    "text"
  ],
  "properties": {
    "text": {
      "type": "string",
      "title": "Text",
      "x-order": 0,
      "description": "The text you want spoken. Can be a single sentence or a long paragraph \u2014 long inputs are automatically split into chunks."
    },
    "top_p": {
      "type": "number",
      "title": "Top P",
      "default": 1,
      "maximum": 1,
      "minimum": 0,
      "x-order": 7,
      "description": "Top-p (nucleus) sampling. Restricts the model to the most likely tokens. 1.0 = no restriction."
    },
    "language": {
      "enum": [
        "ar",
        "da",
        "de",
        "el",
        "en",
        "es",
        "fi",
        "fr",
        "he",
        "hi",
        "it",
        "ja",
        "ko",
        "ms",
        "nl",
        "no",
        "pl",
        "pt",
        "ru",
        "sv",
        "sw",
        "tr",
        "zh"
      ],
      "type": "string",
      "title": "language",
      "description": "Language of the text. Use the two-letter code (en=English, fr=French, de=German, es=Spanish, ja=Japanese, zh=Chinese, ar=Arabic, el=Greek, etc.).",
      "default": "en",
      "x-order": 1
    },
    "cfg_weight": {
      "type": "number",
      "title": "Cfg Weight",
      "default": 0.5,
      "maximum": 1,
      "minimum": 0,
      "x-order": 3,
      "description": "How closely the speech follows the text. Higher = sticks to the text more strictly. Lower = more freedom (but can hallucinate or get stuck)."
    },
    "temperature": {
      "type": "number",
      "title": "Temperature",
      "default": 0.8,
      "maximum": 2,
      "minimum": 0,
      "x-order": 5,
      "description": "Randomness of the voice. Higher = more variation between runs. Lower = more consistent / robotic."
    },
    "audio_prompt": {
      "type": "string",
      "title": "Audio Prompt",
      "format": "uri",
      "x-order": 2,
      "nullable": true,
      "description": "Optional reference voice clip (.wav/.mp3). The output will mimic this voice. If left empty, a default voice is used."
    },
    "exaggeration": {
      "type": "number",
      "title": "Exaggeration",
      "default": 0.5,
      "maximum": 1,
      "minimum": 0,
      "x-order": 4,
      "description": "How expressive the voice is. Higher = more emotional / dramatic. Lower = more flat / neutral."
    },
    "repetition_penalty": {
      "type": "number",
      "title": "Repetition Penalty",
      "default": 2,
      "maximum": 5,
      "minimum": 1,
      "x-order": 6,
      "description": "Penalty for repeating the same sounds. Higher = less repetition."
    },
    "garbage_trim_buffer": {
      "type": "integer",
      "title": "Garbage Trim Buffer",
      "default": 25,
      "maximum": 200,
      "minimum": 0,
      "x-order": 11,
      "description": "Number of audio frames kept after the model finishes saying the sentence (each frame = ~40ms). Lower = trims garbage tails more aggressively but may cut off the last syllable."
    },
    "max_words_per_chunk": {
      "type": "integer",
      "title": "Max Words Per Chunk",
      "default": 60,
      "maximum": 200,
      "minimum": 10,
      "x-order": 9,
      "description": "Long texts are split into chunks before generation. This is the max number of words per chunk. Smaller = safer for tricky languages, but slower."
    },
    "pause_between_sentences": {
      "type": "number",
      "title": "Pause Between Sentences",
      "default": 0.1,
      "maximum": 5,
      "minimum": 0,
      "x-order": 8,
      "description": "Length of the silence (in seconds) inserted between sentences."
    },
    "repeated_token_threshold": {
      "type": "integer",
      "title": "Repeated Token Threshold",
      "default": 3,
      "maximum": 10,
      "minimum": 2,
      "x-order": 10,
      "description": "If the model repeats the same sound this many times in a row, the chunk is cut off (prevents the model from getting stuck looping). Raise this if too much real speech is being cut."
    }
  }
}

Output schema

The shape of the response you’ll get when you run this model with an API.

Schema

{
  "type": "string",
  "title": "Output",
  "format": "uri"
}