vllm.config.speech_to_text

SpeechToTextConfig

Configuration for speech-to-text models.

Source code in vllm/config/speech_to_text.py
@config
class SpeechToTextConfig:
    """Configuration for speech-to-text models."""

    sample_rate: float = 16_000
    """Sample rate (Hz) to resample input audio to. Most speech models expect
    16kHz audio input. The input audio will be automatically resampled to this
    rate before processing."""

    max_audio_clip_s: int | None = 30
    """Maximum duration in seconds for a single audio clip without chunking.
    Audio longer than this will be split into smaller chunks if
    `allow_audio_chunking` evaluates to True, otherwise it will be rejected. 
    `None` means audio duration can be unlimited and won't be chunked."""

    overlap_chunk_second: int = 1
    """Overlap duration in seconds between consecutive audio chunks when
    splitting long audio. This helps maintain context across chunk boundaries
    and improves transcription quality at split points."""

    min_energy_split_window_size: int | None = 1600
    """Window size in samples for finding low-energy (quiet) regions to split
    audio chunks. The algorithm looks for the quietest moment within this
    window to minimize cutting through speech. Default 1600 samples ≈ 100ms
    at 16kHz. If None, no chunking will be done."""

    @property
    def allow_audio_chunking(self) -> bool:
        return (
            self.min_energy_split_window_size is not None
            and self.max_audio_clip_s is not None
        )
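
A quick illustration of how these fields interact (a sketch, assuming the class accepts keyword arguments for its fields, as a dataclass-style config typically does):

from vllm.config.speech_to_text import SpeechToTextConfig

# Default config: both max_audio_clip_s and min_energy_split_window_size
# are set, so chunking of long audio is allowed.
cfg = SpeechToTextConfig()
print(cfg.allow_audio_chunking)  # True

# Setting either field to None disables chunking entirely.
print(SpeechToTextConfig(max_audio_clip_s=None).allow_audio_chunking)  # False
print(SpeechToTextConfig(min_energy_split_window_size=None).allow_audio_chunking)  # False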

max_audio_clip_s class-attribute instance-attribute

max_audio_clip_s: int | None = 30

Maximum duration in seconds for a single audio clip without chunking. Audio longer than this will be split into smaller chunks if allow_audio_chunking evaluates to True, otherwise it will be rejected. None means audio duration can be unlimited and won't be chunked.
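
For intuition about how this limit combines with overlap_chunk_second (described below), consider a fixed-stride split: each chunk after the first contributes max_audio_clip_s minus the overlap seconds of new audio. The following is an illustrative sketch of that arithmetic, not vLLM's actual chunking code, whose exact boundaries may differ:

import math

def num_chunks(duration_s: float, max_clip_s: int = 30, overlap_s: int = 1) -> int:
    """Rough chunk count under a fixed-stride splitting scheme (illustrative)."""
    if duration_s <= max_clip_s:
        return 1
    stride = max_clip_s - overlap_s  # new audio contributed per extra chunk
    return 1 + math.ceil((duration_s - max_clip_s) / stride)

print(num_chunks(95.0))  # 4 chunks, e.g. [0, 30], [29, 59], [58, 88], [87, 95]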

min_energy_split_window_size class-attribute instance-attribute

min_energy_split_window_size: int | None = 1600

Window size in samples for finding low-energy (quiet) regions to split audio chunks. The algorithm looks for the quietest moment within this window to minimize cutting through speech. Default 1600 samples ≈ 100ms at 16kHz. If None, no chunking will be done.
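
To make the search concrete, the sketch below slides a 1600-sample window over the neighborhood of a desired split point and returns the start of the quietest window. It illustrates the idea only; vLLM's internal implementation may differ, and the search_radius parameter is invented for the example:

import numpy as np

def quietest_split(audio: np.ndarray, target: int, window: int = 1600,
                   search_radius: int = 8000) -> int:
    """Sample index near `target` starting the lowest-energy window (sketch)."""
    lo = max(0, target - search_radius)
    hi = max(lo, min(len(audio) - window, target + search_radius))
    # Short-time energy of each candidate window via a cumulative sum of squares.
    sq = np.concatenate(([0.0], np.cumsum(audio.astype(np.float64) ** 2)))
    energies = sq[lo + window : hi + window + 1] - sq[lo : hi + 1]
    return lo + int(np.argmin(energies))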

overlap_chunk_second class-attribute instance-attribute

overlap_chunk_second: int = 1

Overlap duration in seconds between consecutive audio chunks when splitting long audio. This helps maintain context across chunk boundaries and improves transcription quality at split points.

sample_rate class-attribute instance-attribute

sample_rate: float = 16000

Sample rate (Hz) to resample input audio to. Most speech models expect 16kHz audio input. The input audio will be automatically resampled to this rate before processing.
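
For example, a clip recorded at 44.1kHz can be resampled to the configured rate on the client side instead of relying on the server's automatic resampling. The sketch below uses librosa as an assumption about client-side tooling; it is not a statement about vLLM's internal audio path:

import librosa

# Load at the file's native rate, then resample to the configured 16kHz.
audio, sr = librosa.load("speech.wav", sr=None)
if sr != 16_000:
    audio = librosa.resample(audio, orig_sr=sr, target_sr=16_000)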

SpeechToTextParams dataclass

All parameters consumed by get_generation_prompt().

TranscriptionRequest.build_stt_params() constructs this object, mapping API-level fields into typed attributes. Models receive only this object, so new parameters can be added here without changing the get_generation_prompt signature.

Source code in vllm/config/speech_to_text.py
@dataclass
class SpeechToTextParams:
    """All parameters consumed by ``get_generation_prompt()``.

    ``TranscriptionRequest.build_stt_params()`` constructs this object,
    mapping API-level fields into typed attributes. Models receive only
    this object, so new parameters can be added here without changing the
    ``get_generation_prompt`` signature.
    """

    audio: np.ndarray
    """Resampled audio waveform for a single chunk."""

    stt_config: SpeechToTextConfig
    """Server-level speech-to-text configuration."""

    model_config: ModelConfig
    """Model configuration."""

    language: str | None = None
    """ISO 639-1 language code (validated / auto-detected)."""

    hotwords: str | None = None
    """
    hotwords refers to a list of important words or phrases that the model
    should pay extra attention to during transcription.
    """

    task_type: str = "transcribe"
    """``"transcribe"`` or ``"translate"``."""

    request_prompt: str = ""
    """Optional text prompt to guide the model."""

    to_language: str | None = None
    """Target language for translation (model-dependent)."""

audio instance-attribute

audio: ndarray

Resampled audio waveform for a single chunk.

hotwords class-attribute instance-attribute

hotwords: str | None = None

Important words or phrases that the model should pay extra attention to during transcription.

language class-attribute instance-attribute

language: str | None = None

ISO 639-1 language code (validated / auto-detected).

model_config instance-attribute

model_config: ModelConfig

Model configuration.

request_prompt class-attribute instance-attribute

request_prompt: str = ''

Optional text prompt to guide the model.

stt_config instance-attribute

stt_config: SpeechToTextConfig

Server-level speech-to-text configuration.

task_type class-attribute instance-attribute

task_type: str = 'transcribe'

"transcribe" or "translate".

to_language class-attribute instance-attribute

to_language: str | None = None

Target language for translation (model-dependent).