| """ | |
| 超牛逼,没有OOV | |
| """ | |
from tokenizers import Tokenizer

from data_sample.oov_base import space_tokens, jd_vocab_tokens, docs

tokenizer = Tokenizer.from_file("20B_tokenizer.json")
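
# Added note (not in the original script): 20B_tokenizer.json is presumably a
# byte-level BPE tokenizer (as in GPT-NeoX-20B), so any unseen character falls
# back to single-byte tokens. encode() therefore never fails, and
# decode(encode(x)) is expected to reproduce x exactly, which is what the test
# below verifies. Minimal illustration, assuming a byte-level vocab:
demo = "emoji 🦙 + 中文 mixed"
print("demo round-trip ok:", tokenizer.decode(tokenizer.encode(demo).ids) == demo)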

def test_oov():
    """Print every line that fails to round-trip through encode/decode."""
    for line in space_tokens + jd_vocab_tokens + docs:
        tokens = tokenizer.encode(line)
        decoded_line = tokenizer.decode(tokens.ids)
        if line != decoded_line:
            print("original:", line)
            print("decoded: ", decoded_line)

if __name__ == "__main__":
    test_oov()