"""Print how the GPT-NeoX-20B tokenizer splits a mixed English/Chinese string.

The script encodes a short sample text, prints the resulting token ids,
prints the text obtained by decoding those ids again, and finally prints
each token id next to the text it decodes to on its own.

See also:
    https://github.com/EleutherAI/gpt-neox/blob/main/tools/corpora.py
"""
from transformers import AutoTokenizer, AutoModelForCausalLM

# Alternative checkpoint that can be swapped in for comparison:
# tokenizer = AutoTokenizer.from_pretrained("togethercomputer/GPT-NeoXT-Chat-Base-20B")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")

# Sample text mixes ASCII words, a newline, and Chinese characters.
tokens = tokenizer.encode("good night\n中国 ss一个人去哪里")
print(tokens)

# Decode the whole id sequence back to text in one call.
print(tokenizer.decode(tokens))

# Decode one id at a time to show the text piece behind each token.
for token in tokens:
    print(token, tokenizer.decode([token]))