geopti/chatterbox-multilingual
Run geopti/chatterbox-multilingual with an API
Use one of our client libraries to get started quickly. Clicking on a library will take you to the Playground tab where you can tweak different inputs, see the results, and copy the corresponding code to use in your own project.
Input schema
The fields you can use to run this model with an API. If you don't give a value for a field, its default value will be used.
| Field | Type | Default value | Description |
|---|---|---|---|
| text | string | | The text you want spoken. Can be a single sentence or a long paragraph — long inputs are automatically split into chunks. |
| language | string | en | Language of the text. Use the two-letter code (en=English, fr=French, de=German, es=Spanish, ja=Japanese, zh=Chinese, ar=Arabic, el=Greek, etc.). |
| audio_prompt | string | | Optional reference voice clip (.wav/.mp3). The output will mimic this voice. If left empty, a default voice is used. |
| cfg_weight | number | 0.5 (Min: 0, Max: 1) | How closely the speech follows the text. Higher = sticks to the text more strictly. Lower = more freedom (but can hallucinate or get stuck). |
| exaggeration | number | 0.5 (Min: 0, Max: 1) | How expressive the voice is. Higher = more emotional / dramatic. Lower = more flat / neutral. |
| temperature | number | 0.8 (Min: 0, Max: 2) | Randomness of the voice. Higher = more variation between runs. Lower = more consistent / robotic. |
| repetition_penalty | number | 2 (Min: 1, Max: 5) | Penalty for repeating the same sounds. Higher = less repetition. |
| top_p | number | 1 (Min: 0, Max: 1) | Top-p (nucleus) sampling. Restricts the model to the most likely tokens. 1.0 = no restriction. |
| pause_between_sentences | number | 0.1 (Min: 0, Max: 5) | Length of the silence (in seconds) inserted between sentences. |
| max_words_per_chunk | integer | 60 (Min: 10, Max: 200) | Long texts are split into chunks before generation. This is the max number of words per chunk. Smaller = safer for tricky languages, but slower. |
| repeated_token_threshold | integer | 3 (Min: 2, Max: 10) | If the model repeats the same sound this many times in a row, the chunk is cut off (prevents the model from getting stuck looping). Raise this if too much real speech is being cut. |
| garbage_trim_buffer | integer | 25 (Min: 0, Max: 200) | Number of audio frames kept after the model finishes saying the sentence (each frame = ~40ms). Lower = trims garbage tails more aggressively but may cut off the last syllable. |
{
  "type": "object",
  "title": "Input",
  "required": ["text"],
  "properties": {
    "text": {
      "type": "string",
      "title": "Text",
      "x-order": 0,
      "description": "The text you want spoken. Can be a single sentence or a long paragraph \u2014 long inputs are automatically split into chunks."
    },
    "top_p": {
      "type": "number",
      "title": "Top P",
      "default": 1,
      "maximum": 1,
      "minimum": 0,
      "x-order": 7,
      "description": "Top-p (nucleus) sampling. Restricts the model to the most likely tokens. 1.0 = no restriction."
    },
    "language": {
      "enum": [
        "ar",
        "da",
        "de",
        "el",
        "en",
        "es",
        "fi",
        "fr",
        "he",
        "hi",
        "it",
        "ja",
        "ko",
        "ms",
        "nl",
        "no",
        "pl",
        "pt",
        "ru",
        "sv",
        "sw",
        "tr",
        "zh"
      ],
      "type": "string",
      "title": "language",
      "description": "Language of the text. Use the two-letter code (en=English, fr=French, de=German, es=Spanish, ja=Japanese, zh=Chinese, ar=Arabic, el=Greek, etc.).",
      "default": "en",
      "x-order": 1
    },
    "cfg_weight": {
      "type": "number",
      "title": "Cfg Weight",
      "default": 0.5,
      "maximum": 1,
      "minimum": 0,
      "x-order": 3,
      "description": "How closely the speech follows the text. Higher = sticks to the text more strictly. Lower = more freedom (but can hallucinate or get stuck)."
    },
    "temperature": {
      "type": "number",
      "title": "Temperature",
      "default": 0.8,
      "maximum": 2,
      "minimum": 0,
      "x-order": 5,
      "description": "Randomness of the voice. Higher = more variation between runs. Lower = more consistent / robotic."
    },
    "audio_prompt": {
      "type": "string",
      "title": "Audio Prompt",
      "format": "uri",
      "x-order": 2,
      "nullable": true,
      "description": "Optional reference voice clip (.wav/.mp3). The output will mimic this voice. If left empty, a default voice is used."
    },
    "exaggeration": {
      "type": "number",
      "title": "Exaggeration",
      "default": 0.5,
      "maximum": 1,
      "minimum": 0,
      "x-order": 4,
      "description": "How expressive the voice is. Higher = more emotional / dramatic. Lower = more flat / neutral."
    },
    "repetition_penalty": {
      "type": "number",
      "title": "Repetition Penalty",
      "default": 2,
      "maximum": 5,
      "minimum": 1,
      "x-order": 6,
      "description": "Penalty for repeating the same sounds. Higher = less repetition."
    },
    "garbage_trim_buffer": {
      "type": "integer",
      "title": "Garbage Trim Buffer",
      "default": 25,
      "maximum": 200,
      "minimum": 0,
      "x-order": 11,
      "description": "Number of audio frames kept after the model finishes saying the sentence (each frame = ~40ms). Lower = trims garbage tails more aggressively but may cut off the last syllable."
    },
    "max_words_per_chunk": {
      "type": "integer",
      "title": "Max Words Per Chunk",
      "default": 60,
      "maximum": 200,
      "minimum": 10,
      "x-order": 9,
      "description": "Long texts are split into chunks before generation. This is the max number of words per chunk. Smaller = safer for tricky languages, but slower."
    },
    "pause_between_sentences": {
      "type": "number",
      "title": "Pause Between Sentences",
      "default": 0.1,
      "maximum": 5,
      "minimum": 0,
      "x-order": 8,
      "description": "Length of the silence (in seconds) inserted between sentences."
    },
    "repeated_token_threshold": {
      "type": "integer",
      "title": "Repeated Token Threshold",
      "default": 3,
      "maximum": 10,
      "minimum": 2,
      "x-order": 10,
      "description": "If the model repeats the same sound this many times in a row, the chunk is cut off (prevents the model from getting stuck looping). Raise this if too much real speech is being cut."
    }
  }
}
Output schema
The shape of the response you’ll get when you run this model with an API.
{
  "type": "string",
  "title": "Output",
  "format": "uri"
}