wordscenes/whisper-stable-ts

Public

37.9K runs

Run wordscenes/whisper-stable-ts with an API

Use one of our client libraries to get started quickly. Clicking on a library will take you to the Playground tab where you can tweak different inputs, see the results, and copy the corresponding code to use in your own project.

Input schema

The fields you can use to run this model with an API. If you don't give a value for a field, its default value will be used.

Field	Type	Default value	Description
audio_path	string		Audio to transcribe or align
mode	string	transcribe	Mode: 'transcribe' to generate transcript, 'align' to align provided text
text	string		Text to align with audio (required when mode='align')
language	string	en	Language to transcribe
denoiser	string	none	The denoiser to use.
vad	boolean	True	Whether to use Silero VAD to generate timestamp suppression mask.
beam_size	integer	5	Number of beams in beam search, only applicable when temperature is zero (transcribe mode only).
best_of	integer	5	Number of candidates when sampling with non-zero temperature (transcribe mode only).
regroup	boolean	True	Whether to regroup all words into segments with more natural boundaries.
initial_prompt	string		Text to provide as a prompt for the first window (transcribe mode only).
aligner	string	new	The aligner to use.
suppress_arabic_numerals	boolean	True	Whether to suppress Arabic numerals.
suppress_pronounceable_symbols	boolean	True	Whether to suppress pronounceable symbols.

{
  "type": "object",
  "title": "Input",
  "required": [
    "audio_path"
  ],
  "properties": {
    "vad": {
      "type": "boolean",
      "title": "Vad",
      "default": true,
      "x-order": 5,
      "description": "Whether to use Silero VAD to generate timestamp suppression mask."
    },
    "mode": {
      "enum": [
        "transcribe",
        "align"
      ],
      "type": "string",
      "title": "mode",
      "description": "Mode: 'transcribe' to generate transcript, 'align' to align provided text",
      "default": "transcribe",
      "x-order": 1
    },
    "text": {
      "type": "string",
      "title": "Text",
      "default": "",
      "x-order": 2,
      "description": "Text to align with audio (required when mode='align')"
    },
    "aligner": {
      "enum": [
        "new",
        "legacy"
      ],
      "type": "string",
      "title": "aligner",
      "description": "The aligner to use.",
      "default": "new",
      "x-order": 10
    },
    "best_of": {
      "type": "integer",
      "title": "Best Of",
      "default": 5,
      "x-order": 7,
      "description": "Number of candidates when sampling with non-zero temperature (transcribe mode only)."
    },
    "regroup": {
      "type": "boolean",
      "title": "Regroup",
      "default": true,
      "x-order": 8,
      "description": "Whether to regroup all words into segments with more natural boundaries."
    },
    "denoiser": {
      "enum": [
        "none",
        "demucs",
        "dfnet",
        "noisereduce"
      ],
      "type": "string",
      "title": "denoiser",
      "description": "The denoiser to use.",
      "default": "none",
      "x-order": 4
    },
    "language": {
      "type": "string",
      "title": "Language",
      "default": "en",
      "x-order": 3,
      "description": "Language to transcribe"
    },
    "beam_size": {
      "type": "integer",
      "title": "Beam Size",
      "default": 5,
      "x-order": 6,
      "description": "Number of beams in beam search, only applicable when temperature is zero (transcribe mode only)."
    },
    "audio_path": {
      "type": "string",
      "title": "Audio Path",
      "format": "uri",
      "x-order": 0,
      "description": "Audio to transcribe or align"
    },
    "initial_prompt": {
      "type": "string",
      "title": "Initial Prompt",
      "x-order": 9,
      "description": "Text to provide as a prompt for the first window (transcribe mode only)."
    },
    "suppress_arabic_numerals": {
      "type": "boolean",
      "title": "Suppress Arabic Numerals",
      "default": true,
      "x-order": 11,
      "description": "Whether to suppress Arabic numerals."
    },
    "suppress_pronounceable_symbols": {
      "type": "boolean",
      "title": "Suppress Pronounceable Symbols",
      "default": true,
      "x-order": 12,
      "description": "Whether to suppress pronounceable symbols."
    }
  }
}

Output schema

The shape of the response you’ll get when you run this model with an API.

Schema

{
  "type": "string",
  "title": "Output"
}