update
- app.py +7 -7
- requirements.txt +1 -0
- vocab/chatglm_6b/__init__.py +3 -0
- vocab/llama2/README.md +0 -0
app.py
CHANGED
@@ -40,7 +40,7 @@ example_text = """Replace this text in the input field to see how tokenization w
 
 # llama  chatglm_6b  gpt_nexo_20b  baichuan  baichuan_7b
 examples = [
-    ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
+    # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
     ["标点测试:,。!?;", "baichuan_7b", "llama"],
     ["符号测试:🦙", "baichuan_7b", "llama"],
     ["中文测试:🦙", "baichuan_7b", "llama"],
@@ -83,10 +83,10 @@ def tokenize(text, tokenizer_type, color_num=5):
         return
 
 
-
+    # ⭐
     table.append(
         {"TokenID": token_id,
-         "
+         "Token": token_str,  # utf-8 decoded string; why are some shown as <0xE7>, and what does that mean? e.g. llama
          "Text": decode_text,  #
          # "Bytes": token_bytes,  # bytes get decoded to str on the gradio frontend, e.g. b'\xe4\xb8\xad' still renders as "中"; hence str(token_bytes)
          "Bytes": str(token_bytes),
@@ -212,13 +212,13 @@ with gr.Blocks(css=css) as demo:
 
         with gr.Row():
             output_table_1 = gr.Dataframe(
-                headers=["TokenID", "Byte", "Text"],
-                datatype=["str", "str", "str"],
+                # headers=["TokenID", "Byte", "Text"],
+                # datatype=["str", "str", "str"],
                 # elem_classes="space-show",  # applying this css to the whole Dataframe has no effect, so cell-wrap is modified directly
             )
             output_table_2 = gr.Dataframe(
-                headers=["TokenID", "Token", "Text"],
-                datatype=["str", "str", "str"],
+                # headers=["TokenID", "Token", "Text"],
+                # datatype=["str", "str", "str"],
             )
 
         tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
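Context for the app.py change above: the new "Token" column exposes the raw vocab piece alongside the decoded text. A minimal sketch of the idea, assuming the "huggyllama/llama-7b" checkpoint and reusing the PR's variable names (token_str, decode_text, token_bytes); this is an illustration, not the Space's actual code:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")  # assumed checkpoint

# llama's SentencePiece vocab falls back to byte pieces for characters it does
# not store directly, which is why some tokens render as "<0xE7>"-style strings.
for token_id in tokenizer.encode("🦙", add_special_tokens=False):
    token_str = tokenizer.convert_ids_to_tokens(token_id)  # raw piece, e.g. "<0xF0>"
    decode_text = tokenizer.decode(token_id)               # decoded text; a lone byte piece may show as "�"
    token_bytes = decode_text.encode("utf-8")
    # str(token_bytes) keeps the b'\x..' escapes, so the frontend shows the raw
    # bytes instead of re-decoding them back into the character itself
    print(token_id, token_str, decode_text, str(token_bytes))

Commenting out headers= and datatype= in the two gr.Dataframe components presumably lets each table take its columns from whatever tokenize returns, so the new "Token" column appears without further UI changes.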
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
 transformers>=4.21.1
 sentencepiece
 tiktoken
+icetk
 torch
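The new icetk pin supports the chatglm_6b vocab below. A quick guard like the following (a sketch, not in the PR; the error message is hypothetical) makes the failure mode explicit:

# icetk is imported by ChatGLM-6B's remote tokenizer code at load time,
# so the Space would crash on startup without it.
try:
    import icetk  # noqa: F401
except ImportError as err:
    raise SystemExit("icetk missing; install it via requirements.txt") from err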
vocab/chatglm_6b/__init__.py
CHANGED
@@ -1,3 +1,6 @@
+"""
+Depends on icetk
+"""
 
 import os
 from transformers import AutoTokenizer
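Why this docstring matters: loading the ChatGLM-6B tokenizer pulls icetk in through the model's remote code. A minimal sketch under that assumption (the "THUDM/chatglm-6b" id and trust_remote_code=True are the usual way to load it; neither appears in this diff):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

# chatglm's vocab includes <|blank_n|> tokens that fold runs of spaces into a
# single token, which is what the disabled 空格测试 (space test) example
# exercised; a run of spaces is expected to come back as e.g. "<|blank_2|>".
print(tokenizer.tokenize("2个空格  8个空格"))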
vocab/llama2/README.md
ADDED
(new empty file)