suminhthanh/whisperx-custom
Public · 1.2K runs
Run suminhthanh/whisperx-custom with an API
Use one of our client libraries to get started quickly. Clicking on a library will take you to the Playground tab where you can tweak different inputs, see the results, and copy the corresponding code to use in your own project.
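For example, with the Python client, a minimal call looks like the sketch below. The audio URL is a placeholder, and depending on your client version you may need to pin the model version shown on this page (e.g. `suminhthanh/whisperx-custom:<version-id>`).

```python
import replicate  # pip install replicate; requires REPLICATE_API_TOKEN in the environment

# Minimal sketch: run the model by reference with a placeholder audio URL.
output = replicate.run(
    "suminhthanh/whisperx-custom",
    input={"url": "https://example.com/audio.mp3"},
)
print(output)
```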
Input schema
The fields you can use to run this model with an API. If you don't give a value for a field, its default value will be used.
Field | Type | Default value | Description |
---|---|---|---|
audio_file | string | | Audio file |
url | string | | URL of the audio file (if audio_file is not provided) |
language | string | | ISO code of the language spoken in the audio, specify None to perform language detection |
language_detection_min_prob | number | 0 | If language is not specified, then the language will be detected recursively on different parts of the file until it reaches the given probability |
language_detection_max_tries | integer | 5 | If language is not specified, then the language will be detected following the logic of language_detection_min_prob parameter, but will stop after the given max retries. If max retries is reached, the most probable language is kept. |
initial_prompt | string | | Optional text to provide as a prompt for the first window |
batch_size | integer | 64 | Parallelization of input audio transcription |
temperature | number | 0 | Temperature to use for sampling |
vad_onset | number | 0.5 | VAD onset |
vad_offset | number | 0.363 | VAD offset |
align_output | boolean | False | Aligns whisper output to get accurate word-level timestamps |
diarization | boolean | False | Assign speaker ID labels |
huggingface_access_token | string | | To enable diarization, please enter your HuggingFace token (read). You need to accept the user agreement for the models specified in the README. |
min_speakers | integer | | Minimum number of speakers if diarization is activated (leave blank if unknown) |
max_speakers | integer | | Maximum number of speakers if diarization is activated (leave blank if unknown) |
debug | boolean | False | Print out compute/inference times and memory usage information |
keep_audio | boolean | False | Keep the downloaded audio file |
openai_api_key | string | | OpenAI API key |
is_get_video_info | boolean | False | Get video info |
cleanup_voice | boolean | False | Cleanup voice |
deep_filter | boolean | False | Deep filter |
chunk_size | integer | 4 | Chunk size |
cdn_upload_url | string | | CDN UPLOAD URL |
cdn_download_url | string | | CDN DOWNLOAD URL |
{
  "type": "object",
  "title": "Input",
  "properties": {
    "url": {
      "type": "string",
      "title": "Url",
      "x-order": 1,
      "description": "URL of the audio file (if audio_file is not provided)"
    },
    "debug": {
      "type": "boolean",
      "title": "Debug",
      "default": false,
      "x-order": 15,
      "description": "Print out compute/inference times and memory usage information"
    },
    "language": {
      "type": "string",
      "title": "Language",
      "x-order": 2,
      "description": "ISO code of the language spoken in the audio, specify None to perform language detection"
    },
    "vad_onset": {
      "type": "number",
      "title": "Vad Onset",
      "default": 0.5,
      "x-order": 8,
      "description": "VAD onset"
    },
    "audio_file": {
      "type": "string",
      "title": "Audio File",
      "format": "uri",
      "x-order": 0,
      "description": "Audio file"
    },
    "batch_size": {
      "type": "integer",
      "title": "Batch Size",
      "default": 64,
      "x-order": 6,
      "description": "Parallelization of input audio transcription"
    },
    "chunk_size": {
      "type": "integer",
      "title": "Chunk Size",
      "default": 4,
      "x-order": 21,
      "description": "Chunk size"
    },
    "keep_audio": {
      "type": "boolean",
      "title": "Keep Audio",
      "default": false,
      "x-order": 16,
      "description": "Keep the downloaded audio file"
    },
    "vad_offset": {
      "type": "number",
      "title": "Vad Offset",
      "default": 0.363,
      "x-order": 9,
      "description": "VAD offset"
    },
    "deep_filter": {
      "type": "boolean",
      "title": "Deep Filter",
      "default": false,
      "x-order": 20,
      "description": "Deep filter"
    },
    "diarization": {
      "type": "boolean",
      "title": "Diarization",
      "default": false,
      "x-order": 11,
      "description": "Assign speaker ID labels"
    },
    "temperature": {
      "type": "number",
      "title": "Temperature",
      "default": 0,
      "x-order": 7,
      "description": "Temperature to use for sampling"
    },
    "align_output": {
      "type": "boolean",
      "title": "Align Output",
      "default": false,
      "x-order": 10,
      "description": "Aligns whisper output to get accurate word-level timestamps"
    },
    "max_speakers": {
      "type": "integer",
      "title": "Max Speakers",
      "x-order": 14,
      "description": "Maximum number of speakers if diarization is activated (leave blank if unknown)"
    },
    "min_speakers": {
      "type": "integer",
      "title": "Min Speakers",
      "x-order": 13,
      "description": "Minimum number of speakers if diarization is activated (leave blank if unknown)"
    },
    "cleanup_voice": {
      "type": "boolean",
      "title": "Cleanup Voice",
      "default": false,
      "x-order": 19,
      "description": "Cleanup voice"
    },
    "cdn_upload_url": {
      "type": "string",
      "title": "Cdn Upload Url",
      "x-order": 22,
      "description": "CDN UPLOAD URL"
    },
    "initial_prompt": {
      "type": "string",
      "title": "Initial Prompt",
      "x-order": 5,
      "description": "Optional text to provide as a prompt for the first window"
    },
    "openai_api_key": {
      "type": "string",
      "title": "Openai Api Key",
      "x-order": 17,
      "description": "OpenAI API key"
    },
    "cdn_download_url": {
      "type": "string",
      "title": "Cdn Download Url",
      "x-order": 23,
      "description": "CDN DOWNLOAD URL"
    },
    "is_get_video_info": {
      "type": "boolean",
      "title": "Is Get Video Info",
      "default": false,
      "x-order": 18,
      "description": "Get video info"
    },
    "huggingface_access_token": {
      "type": "string",
      "title": "Huggingface Access Token",
      "x-order": 12,
      "description": "To enable diarization, please enter your HuggingFace token (read). You need to accept the user agreement for the models specified in the README."
    },
    "language_detection_min_prob": {
      "type": "number",
      "title": "Language Detection Min Prob",
      "default": 0,
      "x-order": 3,
      "description": "If language is not specified, then the language will be detected recursively on different parts of the file until it reaches the given probability"
    },
    "language_detection_max_tries": {
      "type": "integer",
      "title": "Language Detection Max Tries",
      "default": 5,
      "x-order": 4,
      "description": "If language is not specified, then the language will be detected following the logic of language_detection_min_prob parameter, but will stop after the given max retries. If max retries is reached, the most probable language is kept."
    }
  }
}
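To show how these fields fit together, here is a hedged sketch of a fuller request using the Python client: the audio URL and HuggingFace token are placeholders, `language` is left unset so detection runs under the `language_detection_min_prob` and `language_detection_max_tries` rules, and `align_output` plus `diarization` request word-level timestamps and speaker labels.

```python
import replicate

# Sketch only: the URL and token below are placeholders, not real values.
output = replicate.run(
    "suminhthanh/whisperx-custom",
    input={
        "url": "https://example.com/interview.mp3",
        # "language" omitted: detection runs, governed by
        # language_detection_min_prob (default 0) and language_detection_max_tries (default 5).
        "align_output": True,                   # word-level timestamps
        "diarization": True,                    # speaker ID labels
        "huggingface_access_token": "hf_xxx",   # placeholder read-access token
        "min_speakers": 2,
        "max_speakers": 4,
        "batch_size": 64,
        "temperature": 0,
    },
)
```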
Output schema
The shape of the response you’ll get when you run this model with an API.
Schema
{
  "type": "object",
  "title": "Output",
  "required": [
    "detected_language"
  ],
  "properties": {
    "srt": {
      "title": "Srt"
    },
    "text": {
      "title": "Text"
    },
    "score": {
      "title": "Score"
    },
    "title": {
      "title": "Title"
    },
    "segments": {
      "title": "Segments"
    },
    "video_id": {
      "title": "Video Id"
    },
    "video_info": {
      "title": "Video Info"
    },
    "view_count": {
      "title": "View Count"
    },
    "detected_language": {
      "type": "string",
      "title": "Detected Language"
    }
  }
}
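Only `detected_language` is required; the remaining fields are untyped in the schema and may be absent depending on the inputs used. A minimal sketch of consuming the result, assuming it arrives as a plain dict from the calls above:

```python
result = output  # dict returned by replicate.run in the earlier sketches

print("Detected language:", result["detected_language"])

# Fields other than detected_language are optional; guard before using them.
if result.get("text"):
    print(result["text"])

if result.get("srt"):
    with open("transcript.srt", "w", encoding="utf-8") as f:
        f.write(result["srt"])

for segment in result.get("segments") or []:
    # The segment structure is not specified by the schema; inspect it before relying on keys.
    print(segment)
```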