| """ | |
| 最简单的tokenizer | |
| """ | |
| import json | |
| from vocab.gpt_nexo_20b.tokenizer.tokenizer import HFTokenizer | |
| tokenizer = HFTokenizer("20B_tokenizer.json") | |
| print("vocab_size with added_tokens:", tokenizer.vocab_size) | |
| vocab = tokenizer.vocab | |
| def test_single_token(): | |
| """ | |
| 单个字符的编码(一个字符可能会编码成多个id) | |
| """ | |
| for word in "中国解决方法黑白侗鸩,。!?;": | |
| encoding = tokenizer.tokenize(word) | |
| for token_id in encoding: | |
| decode_str = tokenizer.detokenize([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd" | |
| # token = tokenizer.tokenizer.id_to_token(token_id) | |
| print(word, token_id, decode_str, json.dumps(decode_str), ) | |
| # print(word, token_id, decode_str, json.dumps(decode_str), token, json.dumps(token)) | |
| def test_encode(): | |
| text = "中国解决方法黑白侗鸩,。!?;一个人去哪里 一 个" | |
| encoding = tokenizer.tokenize(text) | |
| for token_id in encoding: | |
| decode_str = tokenizer.detokenize([token_id]) # 特殊字符解码后会统一变成 �,对应 "\ufffd" | |
| token = tokenizer.tokenizer.id_to_token(token_id) | |
| print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token)) | |
| test_encode() | |
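

# A minimal alternative sketch, assuming 20B_tokenizer.json is a standard Hugging Face
# `tokenizers` JSON file: the same per-token round-trip using the `tokenizers` library
# directly, without the vocab.gpt_nexo_20b wrapper. The function name is illustrative
# and it is not called by default.
def test_encode_hf_tokenizers():
    from tokenizers import Tokenizer

    hf_tokenizer = Tokenizer.from_file("20B_tokenizer.json")
    encoding = hf_tokenizer.encode("中国解决方法黑白侗鸩,。!?;一个人去哪里 一 个")
    for token_id, token in zip(encoding.ids, encoding.tokens):
        # Decoding a single token in isolation can yield � ("\ufffd") for partial byte sequences.
        decode_str = hf_tokenizer.decode([token_id])
        print(token_id, decode_str, json.dumps(decode_str), token, json.dumps(token))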