add character glm

- vocab/__init__.py  +65 -53
- vocab/character_glm_6b/__init__.py  +4 -0

vocab/__init__.py  CHANGED
@@ -19,32 +19,37 @@ tokenizer.implementation = TokenizerImpl.SentencePiece.name  # https://github.c
  - Features
  - Vocabulary: tokens starting with ## indicate subwords
  - Examples:
- - google/sentencepiece:
+ - bpe-google/sentencepiece:
  - Features:
  - Training:
- - Files: *.sp_model or *.model (optional .vocab file)
+ - Files: *.sp_model or *.model (optional .vocab file); "spm" for short
  - Implementation:
+ - Dependency: protobuf
  - Training: `import sentencepiece as spm; spm.SentencePieceTrainer.train` or `spm_train`
  - Loading: `import sentencepiece as spm; spm.SentencePieceProcessor().Load(vocab_file)`
  - Methods: the loaded object is of type SentencePieceProcessor; sp_model.id_to_piece; comes with tokenizer.json and tokenizer.model
+ - Tokenization:
+ - pre_tokenizers.ByteLevel(add_prefix_space=True, use_regex=False)
  - Vocabulary: contains the character ▁ (U+2581), which marks a space or the start of a sentence
  - Examples: google-t5, llama, baichuan, orion
  - icetk: a fork of sentencepiece that supports an image_tokenizer
  - glm, chatglm1, chatglm2
  - openai/tiktoken
- - hf_tokenizer
+ - bpe-hf_tokenizer
+ - ss
  - Features:
  - Files: tokenizer.json (contains the content of the next two files), merges.txt, vocab.json
  - added_tokens are not necessarily present in the vocab
  - Implementation:
- - Training:
+ - Training: `from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer`
  - Loading:
- - Methods:
+ - Methods: .model.from_file, .model.save, .model.token_to_id, .model.tokenize
  - .model is of type tokenizer.models.BPE
  - Vocabulary: tokens start with Ġ "\u0120"
- -
+ - Advantages
+ -
  - Examples: gpt2, gpt_neox_20b, moss, bloom, qwen2
- -
+ - Advantages: compared with sentencepiece, hf_tokenizer supports regex-based pre-tokenization and handles tabs and newlines better
  - ss
  - tiktoken
  - Features: a space is just a space
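The sentencepiece notes above boil down to a short train/load round trip. A minimal sketch, assuming the sentencepiece package is installed; the corpus path, model prefix, and vocab size are illustrative placeholders, not values from this repo:

# Illustrative sketch of the sentencepiece workflow summarized above.
import sentencepiece as spm

# Train (the Python equivalent of the `spm_train` CLI); input is one sentence per line.
spm.SentencePieceTrainer.train(
    input="corpus.txt",           # placeholder corpus file
    model_prefix="example_spm",   # writes example_spm.model and example_spm.vocab
    vocab_size=8000,
    model_type="bpe",             # sentencepiece also supports unigram / char / word
)

# Load and inspect: pieces use ▁ (U+2581) to mark a space / word start.
sp_model = spm.SentencePieceProcessor()
sp_model.Load("example_spm.model")
print(sp_model.encode("Hello world", out_type=str))
print(sp_model.id_to_piece(0))    # map an id back to its piece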
@@ -65,71 +70,72 @@ uniq_tokenizers = [
     ""
 ]
 
-# TODO: alias/abbr, hf_path, tokenizer_class, comments,
+# TODO: alias/abbr, hf_path, tokenizer_class/type, comments,
 all_tokenizers = [
     ##### bert family
-    ("bert_base_cased", "", ""),
-    ("bert_base_uncased","",),
-    ("bert_base_chinese",),
-    ("roberta_chinese_clue",),
+    ("bert_base_cased", "", "bert"),
+    ("bert_base_uncased", "", "bert"),
+    ("bert_base_chinese", "", "bert"),
+    ("roberta_chinese_clue", "", "bert"),
     ("kplug",),
     ("gpt2_chinese",),
 
     ##### GPT2Tokenizer
-    ("gpt2",),
-    ("moss",),
-    ("bloom",),
+    ("gpt2", "", "GPT2Tokenizer",),  #
+    ("moss", "", "GPT2Tokenizer",),
+    ("bloom", "", "GPT2Tokenizer",),
     # ("bloomz_6b4_zh",
     # ("belle_7b_2m",  # model and vocab are both based on bloom
     #
-    ("gpt_nexo_20b",),
-    ("qwen1_5_14b_chat",),  # 150k vocab, a bit slow
-    ("starchat_alpha",),
+    ("gpt_nexo_20b", "", "GPT2Tokenizer",),  # 50k vocab
+    ("qwen1_5_14b_chat", "", "GPT2Tokenizer",),  # 150k vocab, a bit slow
+    ("starchat_alpha", "", "GPT2Tokenizer",),
 
     ####### google/sentencepiece tokenizer:
     # T5 llama internlm
-    ("t5_small",),
-    ("t5_base",),
-    ("t5_large",),
-    ("chatyuan_large_v2",),
-    ("prompt_clue",),
-
-    ("llama",),  # single-character Chinese tokens: 700, multi-character Chinese tokens: 0
-    ("llama2",),
-    ("chinese_llama",),  #
-    ("chinese_llama2",),  #
+    ("t5_small", "", "sentencepiece"),
+    ("t5_base", "", "sentencepiece"),
+    ("t5_large", "", "sentencepiece"),
+    ("chatyuan_large_v2", "", "sentencepiece"),
+    ("prompt_clue", "", "sentencepiece"),
+
+    ("llama", "", "sentencepiece"),  # single-character Chinese tokens: 700, multi-character Chinese tokens: 0
+    ("llama2", "", "sentencepiece"),
+    ("chinese_llama", "", "sentencepiece"),  #
+    ("chinese_llama2", "", "sentencepiece"),  #
     # ("chinese_alpaca_lora_7b",  # the Chinese Alpaca model further fine-tunes the Chinese LLaMA model above on instruction data
     # ("belle_llama_ext_7b",
     # ("alpaca_7b",
-    ("baichuan",),
-    ("baichuan2",),
-    ("internlm_chat_7b",),
-    ("internlm2_chat_7b",),
-    ("internlm2_math_7b",),
-    ("internlm_xcomposer_7b",),
-    ("falcon_7b",),
-    ("falcon_180b",),
+    ("baichuan", "", "sentencepiece"),
+    ("baichuan2", "", "sentencepiece"),
+    ("internlm_chat_7b", "", "sentencepiece"),
+    ("internlm2_chat_7b", "", "sentencepiece"),
+    ("internlm2_math_7b", "", "sentencepiece"),
+    ("internlm_xcomposer_7b", "", "sentencepiece"),
+    ("falcon_7b", "", "sentencepiece"),
+    ("falcon_180b", "", "sentencepiece"),
+    ("skywork_13b_base",),
+    ("skywork_13b_math",),
     # "goat",
 
     # ##### glm family
     # "glm_chinese",),
-    ("chatglm_6b",),
-    ("chatglm2_6b",),
-    ("chatglm3_6b",),
-
+    ("chatglm_6b", "", "sentencepiece"),
+    ("chatglm2_6b", "", "sentencepiece"),
+    ("chatglm3_6b", "", "sentencepiece"),
+    ("character_glm_6b", "", "sentencepiece"),
 
     # tiktoken family
-    ("qwen_1_8b_chat",),
-    ("qwen_7b_chat",),
-    ("qwen_72b_chat",),
-    ("text_davinci_003",),
-    ("code_davinci_002",),
-    ("gpt_35_turbo",),
-    ("gpt_4",),
+    ("qwen_1_8b_chat", "", "tiktoken"),
+    ("qwen_7b_chat", "", "tiktoken"),
+    ("qwen_72b_chat", "", "tiktoken"),
+    ("text_davinci_003", "", "tiktoken"),
+    ("code_davinci_002", "", "tiktoken"),
+    ("gpt_35_turbo", "", "tiktoken"),
+    ("gpt_4", "", "tiktoken"),
 
     # uncategorized
-
-    ("skywork_13b_math",),
+
     ("mistral_7b",),
     ("mixtral_8_7b",),
@@ -205,15 +211,21 @@ class TokenizerType(Enum):
 
 
 class TokenizerImpl(Enum):
+
     """
+    https://github.com/google/sentencepiece, supports sentencepiece (BPE, unigram, char, word) and wordpiece
+    spm_train --model_type unigram/bpe/char/word
     """
-    SentencePiece = auto()
+    SentencePiece = auto()
 
     # https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/gpt2/tokenization_gpt2.py#L104
     # vocabulary construction:
-    #
-
-
+    # GPT2Tokenizer = auto()
+    # BertTokenizer = auto()  #
+
+    """
+    """
+    HFTokenizer = auto()  # https://github.com/huggingface/tokenizers, supports
 
 
 def load_tokener(model_name):
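For the HFTokenizer branch of this enum, the behavior described in the first hunk (tokenizer.json, the .model attribute, the Ġ / "\u0120" space marker) can be reproduced with the huggingface/tokenizers package. A minimal sketch; the gpt2 checkpoint is used only as a convenient public example and is not tied to this repo's wrappers:

# Illustrative only: load a tokenizers-based tokenizer and inspect it.
from tokenizers import Tokenizer

tok = Tokenizer.from_pretrained("gpt2")      # or Tokenizer.from_file("tokenizer.json") for a local file
print(type(tok.model))                       # tokenizers.models.BPE
print(tok.encode("hello world").tokens)      # byte-level BPE marks the leading space with Ġ (\u0120)
print(tok.token_to_id("Ġworld"))             # look a token up in the vocab

Tokenizer.from_file is the path the notes refer to when a local tokenizer.json is available.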
vocab/character_glm_6b/__init__.py  ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("thu-coai/CharacterGLM-6B", trust_remote_code=True)