hezar.preprocessors.tokenizers package
Submodules
- hezar.preprocessors.tokenizers.bpe module
BPEConfig
BPEConfig.additional_special_tokens
BPEConfig.bos_token
BPEConfig.cls_token
BPEConfig.continuing_subword_prefix
BPEConfig.dropout
BPEConfig.end_of_word_suffix
BPEConfig.eos_token
BPEConfig.fuse_unk
BPEConfig.initial_alphabet
BPEConfig.limit_alphabet
BPEConfig.mask_token
BPEConfig.min_frequency
BPEConfig.name
BPEConfig.pad_to_multiple_of
BPEConfig.pad_token
BPEConfig.padding_side
BPEConfig.sep_token
BPEConfig.show_progress
BPEConfig.stride
BPEConfig.truncation_side
BPEConfig.unk_token
BPEConfig.vocab_size
BPETokenizer
- hezar.preprocessors.tokenizers.sentencepiece_bpe module
SentencePieceBPEConfig
SentencePieceBPEConfig.add_prefix_space
SentencePieceBPEConfig.additional_special_tokens
SentencePieceBPEConfig.bos_token
SentencePieceBPEConfig.cls_token
SentencePieceBPEConfig.continuing_subword_prefix
SentencePieceBPEConfig.dropout
SentencePieceBPEConfig.end_of_word_suffix
SentencePieceBPEConfig.eos_token
SentencePieceBPEConfig.fuse_unk
SentencePieceBPEConfig.initial_alphabet
SentencePieceBPEConfig.limit_alphabet
SentencePieceBPEConfig.mask_token
SentencePieceBPEConfig.min_frequency
SentencePieceBPEConfig.name
SentencePieceBPEConfig.pad_to_multiple_of
SentencePieceBPEConfig.pad_token
SentencePieceBPEConfig.padding_side
SentencePieceBPEConfig.replacement
SentencePieceBPEConfig.sep_token
SentencePieceBPEConfig.show_progress
SentencePieceBPEConfig.stride
SentencePieceBPEConfig.truncation_side
SentencePieceBPEConfig.unk_token
SentencePieceBPEConfig.vocab_size
SentencePieceBPETokenizer
- hezar.preprocessors.tokenizers.sentencepiece_unigram module
SentencePieceUnigramConfig
SentencePieceUnigramConfig.add_prefix_space
SentencePieceUnigramConfig.additional_special_tokens
SentencePieceUnigramConfig.bos_token
SentencePieceUnigramConfig.cls_token
SentencePieceUnigramConfig.continuing_subword_prefix
SentencePieceUnigramConfig.dropout
SentencePieceUnigramConfig.end_of_word_suffix
SentencePieceUnigramConfig.eos_token
SentencePieceUnigramConfig.fuse_unk
SentencePieceUnigramConfig.initial_alphabet
SentencePieceUnigramConfig.limit_alphabet
SentencePieceUnigramConfig.mask_token
SentencePieceUnigramConfig.min_frequency
SentencePieceUnigramConfig.name
SentencePieceUnigramConfig.pad_to_multiple_of
SentencePieceUnigramConfig.pad_token
SentencePieceUnigramConfig.padding_side
SentencePieceUnigramConfig.replacement
SentencePieceUnigramConfig.sep_token
SentencePieceUnigramConfig.show_progress
SentencePieceUnigramConfig.stride
SentencePieceUnigramConfig.truncation_side
SentencePieceUnigramConfig.unk_token
SentencePieceUnigramConfig.vocab_size
SentencePieceUnigramTokenizer
SentencePieceUnigramTokenizer.build()
SentencePieceUnigramTokenizer.required_backends
SentencePieceUnigramTokenizer.token_ids_name
SentencePieceUnigramTokenizer.tokenizer_config_filename
SentencePieceUnigramTokenizer.tokenizer_filename
SentencePieceUnigramTokenizer.train()
SentencePieceUnigramTokenizer.train_from_iterator()
- hezar.preprocessors.tokenizers.tokenizer module
Tokenizer
Tokenizer.add_special_tokens()
Tokenizer.add_tokens()
Tokenizer.bos_token
Tokenizer.bos_token_id
Tokenizer.build()
Tokenizer.cls_token
Tokenizer.cls_token_id
Tokenizer.convert_ids_to_tokens()
Tokenizer.convert_tokens_to_ids()
Tokenizer.decode()
Tokenizer.decoder
Tokenizer.enable_padding()
Tokenizer.enable_truncation()
Tokenizer.encode()
Tokenizer.eos_token
Tokenizer.eos_token_id
Tokenizer.from_file()
Tokenizer.get_added_vocab()
Tokenizer.get_tokens_from_offsets()
Tokenizer.get_vocab()
Tokenizer.get_vocab_size()
Tokenizer.id_to_token()
Tokenizer.load()
Tokenizer.mask_token
Tokenizer.mask_token_id
Tokenizer.model
Tokenizer.no_padding()
Tokenizer.no_truncation()
Tokenizer.num_special_tokens_to_add()
Tokenizer.pad_encoded_batch()
Tokenizer.pad_token
Tokenizer.pad_token_id
Tokenizer.padding
Tokenizer.push_to_hub()
Tokenizer.required_backends
Tokenizer.save()
Tokenizer.sep_token
Tokenizer.sep_token_id
Tokenizer.set_truncation_and_padding()
Tokenizer.special_ids
Tokenizer.token_ids_name
Tokenizer.token_to_id()
Tokenizer.tokenizer_config_filename
Tokenizer.tokenizer_filename
Tokenizer.truncation
Tokenizer.uncastable_keys
Tokenizer.unk_token
Tokenizer.unk_token_id
Tokenizer.vocab
Tokenizer.vocab_size
TokenizerConfig
TokenizerConfig.additional_special_tokens
TokenizerConfig.bos_token
TokenizerConfig.cls_token
TokenizerConfig.eos_token
TokenizerConfig.mask_token
TokenizerConfig.max_length
TokenizerConfig.name
TokenizerConfig.pad_to_multiple_of
TokenizerConfig.pad_token
TokenizerConfig.pad_token_type_id
TokenizerConfig.padding
TokenizerConfig.padding_side
TokenizerConfig.sep_token
TokenizerConfig.stride
TokenizerConfig.truncation
TokenizerConfig.truncation_side
TokenizerConfig.unk_token
- hezar.preprocessors.tokenizers.wordpiece module
WordPieceConfig
WordPieceConfig.additional_special_tokens
WordPieceConfig.cls_token
WordPieceConfig.initial_alphabet
WordPieceConfig.limit_alphabet
WordPieceConfig.mask_token
WordPieceConfig.min_frequency
WordPieceConfig.name
WordPieceConfig.pad_to_multiple_of
WordPieceConfig.pad_token
WordPieceConfig.pad_token_type_id
WordPieceConfig.padding_side
WordPieceConfig.sep_token
WordPieceConfig.show_progress
WordPieceConfig.stride
WordPieceConfig.truncation_side
WordPieceConfig.unk_token
WordPieceConfig.vocab_size
WordPieceConfig.wordpieces_prefix
WordPieceTokenizer