hezar.models.speech_recognition.whisper.whisper_tokenizer module

class hezar.models.speech_recognition.whisper.whisper_tokenizer.WhisperBPEConfig(max_length: 'int' = 'deprecated', truncation: 'str' = 'deprecated', truncation_side: str = 'right', padding: 'str' = 'deprecated', padding_side: str = 'right', stride: int = 0, pad_to_multiple_of: int = 0, pad_token_type_id: 'int' = 0, bos_token: str = '<|endoftext|>', eos_token: str = '<|endoftext|>', unk_token: str = '<|endoftext|>', sep_token: str = '<sep>', pad_token: str = '<|endoftext|>', cls_token: str = '<cls>', mask_token: str = '<mask>', additional_special_tokens: List = <factory>, dropout: float = None, continuing_subword_prefix: str = '', end_of_word_suffix: str = '', fuse_unk: bool = False, vocab_size: int = 30000, min_frequency: int = 2, limit_alphabet: int = 1000, initial_alphabet: list = <factory>, show_progress: bool = True, translate_token: str = '<|translate|>', transcribe_token: str = '<|transcribe|>', notimestamps_token: str = '<|notimestamps|>', add_prefix_space: bool = False, add_bos_token: bool = False, model_max_length: int = 1024, language: str = None, task: str = None, predict_timestamps: str = False)[source]

Bases: BPEConfig

add_bos_token: bool = False
add_prefix_space: bool = False
bos_token: str = '<|endoftext|>'
eos_token: str = '<|endoftext|>'
language: str = None
model_max_length: int = 1024
name: str = 'whisper_bpe_tokenizer'
notimestamps_token: str = '<|notimestamps|>'
pad_to_multiple_of: int = 0
pad_token: str = '<|endoftext|>'
padding_side: str = 'right'
predict_timestamps: bool = False
show_progress: bool = True
stride: int = 0
task: str = None
transcribe_token: str = '<|transcribe|>'
translate_token: str = '<|translate|>'
truncation_side: str = 'right'
unk_token: str = '<|endoftext|>'
class hezar.models.speech_recognition.whisper.whisper_tokenizer.WhisperBPETokenizer(config, tokenizer_file=None, **kwargs)[source]

Bases: BPETokenizer

decode(token_ids, skip_special_tokens: bool = False, output_offsets: bool = False, time_precision=0.02, decode_with_timestamps: bool = False, **kwargs)[source]

Override decode method to enable timestamps and offsets.

get_decoder_prompt_ids(task=None, language=None, no_timestamps=True)[source]
get_prompt_ids(text: str, return_tensors='numpy')[source]

Converts prompt text to IDs that can be passed to `WhisperForConditionalGeneration.generate`.

property prefix_tokens: List[int]
required_backends: List[str | Backends] = [Backends.TOKENIZERS]
set_prefix_tokens(language: str | None = None, task: str | None = None, predict_timestamps: bool | None = None)[source]