@register_preprocessor("wordpiece_tokenizer", config_class=WordPieceConfig)
class WordPieceTokenizer(Tokenizer):
    """
    A standard WordPiece tokenizer using 🤗HuggingFace Tokenizers

    Args:
        config: Preprocessor config for the tokenizer
        **kwargs: Extra/manual config parameters
    """

    required_backends = _required_backends

    tokenizer_filename = DEFAULT_TOKENIZER_FILE
    tokenizer_config_filename = DEFAULT_TOKENIZER_CONFIG_FILE
    token_ids_name = "token_ids"

    def __init__(self, config, tokenizer_file=None, **kwargs):
        super().__init__(config, tokenizer_file=tokenizer_file, **kwargs)
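
    # A minimal construction sketch (hypothetical values; assumes `WordPieceConfig`
    # exposes the fields read in `train` below, e.g. `vocab_size`):
    #
    #   config = WordPieceConfig(vocab_size=30000)
    #   tokenizer = WordPieceTokenizer(config)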
    def train(self, files: List[str], **train_kwargs):
        """Train the model using the given files"""
        self.config.update(train_kwargs)
        trainer = trainers.WordPieceTrainer(
            vocab_size=self.config.vocab_size,
            min_frequency=self.config.min_frequency,
            limit_alphabet=self.config.limit_alphabet,
            initial_alphabet=self.config.initial_alphabet,
            special_tokens=self.config.special_tokens,
            show_progress=self.config.show_progress,
            continuing_subword_prefix=self.config.wordpieces_prefix,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)
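
    # Usage sketch for `train` (hypothetical file paths). Extra keyword arguments
    # are merged into the config, and a single path may be passed as a plain
    # string since it is wrapped in a list above:
    #
    #   tokenizer.train(["corpus_part1.txt", "corpus_part2.txt"], min_frequency=2)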
    def train_from_iterator(self, dataset: List[str], **train_kwargs):
        """Train the model using the given dataset of raw text samples"""
        self.config.update(train_kwargs)
        trainer = trainers.WordPieceTrainer(
            vocab_size=self.config.vocab_size,
            min_frequency=self.config.min_frequency,
            limit_alphabet=self.config.limit_alphabet,
            initial_alphabet=self.config.initial_alphabet,
            special_tokens=self.config.special_tokens,
            show_progress=self.config.show_progress,
            continuing_subword_prefix=self.config.wordpieces_prefix,
        )
        self._tokenizer.train_from_iterator(dataset, trainer=trainer, length=len(dataset))
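

# A minimal end-to-end sketch for `train_from_iterator`, assuming the default
# `WordPieceConfig` is constructible without arguments. The corpus below is
# hypothetical, and because `length=len(dataset)` is passed above, the iterable
# must be a sized container such as a list.
if __name__ == "__main__":
    samples = [
        "hello world",
        "wordpiece splits unknown words into subword units",
    ]
    tokenizer = WordPieceTokenizer(WordPieceConfig())
    tokenizer.train_from_iterator(samples, show_progress=False)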