update
- app.py +7 -7
- requirements.txt +1 -0
- vocab/chatglm_6b/__init__.py +3 -0
- vocab/llama2/README.md +0 -0
app.py
CHANGED
@@ -40,7 +40,7 @@ example_text = """Replace this text in the input field to see how tokenization w
 
 # llama  chatglm_6b  gpt_nexo_20b  baichuan  baichuan_7b
 examples = [
-    ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
+    # ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"],  # chatglm has blank_n tokens
     ["标点测试:,。!?;", "baichuan_7b", "llama"],
     ["符号测试:🦙", "baichuan_7b", "llama"],
     ["中文测试:🦙", "baichuan_7b", "llama"],
@@ -83,10 +83,10 @@ def tokenize(text, tokenizer_type, color_num=5):
         return
 
 
-
+    # ⭐
     table.append(
         {"TokenID": token_id,
-         "
+         "Token": token_str,  # utf-8 decoded string; why are some shown as <0xE7>, and what does that mean? e.g. llama
          "Text": decode_text,  #
          # "Bytes": token_bytes,  # bytes get decoded to str on the gradio frontend, e.g. b'\xe4\xb8\xad' still renders as "中"; hence str(token_bytes)
          "Bytes": str(token_bytes),
@@ -212,13 +212,13 @@ with gr.Blocks(css=css) as demo:
 
         with gr.Row():
             output_table_1 = gr.Dataframe(
-                headers=["TokenID", "Byte", "Text"],
-                datatype=["str", "str", "str"],
+                # headers=["TokenID", "Byte", "Text"],
+                # datatype=["str", "str", "str"],
                 # elem_classes="space-show",  # applying this css to the whole Dataframe has no effect, so cell-wrap is modified directly
             )
             output_table_2 = gr.Dataframe(
-                headers=["TokenID", "Token", "Text"],
-                datatype=["str", "str", "str"],
+                # headers=["TokenID", "Token", "Text"],
+                # datatype=["str", "str", "str"],
             )
 
         tokenizer_type_1.change(tokenize, [user_input, tokenizer_type_1], [output_text_1, output_table_1, stats_token_size_1])
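Context for the app.py change above: the new "Token" column exposes the raw vocab piece alongside the decoded text. A minimal sketch of the idea, assuming the "huggyllama/llama-7b" checkpoint and reusing the PR's variable names (token_str, decode_text, token_bytes); this is an illustration, not the Space's actual code:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")  # assumed checkpoint

# llama's SentencePiece vocab falls back to byte pieces for characters it does
# not store directly, which is why some tokens render as "<0xE7>"-style strings.
for token_id in tokenizer.encode("🦙", add_special_tokens=False):
    token_str = tokenizer.convert_ids_to_tokens(token_id)  # raw piece, e.g. "<0xF0>"
    decode_text = tokenizer.decode(token_id)               # decoded text; a lone byte piece may show as "�"
    token_bytes = decode_text.encode("utf-8")
    # str(token_bytes) keeps the b'\x..' escapes, so the frontend shows the raw
    # bytes instead of re-decoding them back into the character itself
    print(token_id, token_str, decode_text, str(token_bytes))

Commenting out headers= and datatype= in the two gr.Dataframe components presumably lets each table take its columns from whatever tokenize returns, so the new "Token" column appears without further UI changes.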
requirements.txt
CHANGED
@@ -1,4 +1,5 @@
 transformers>=4.21.1
 sentencepiece
 tiktoken
+icetk
 torch
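The new icetk pin supports the chatglm_6b vocab below. A quick guard like the following (a sketch, not in the PR; the error message is hypothetical) makes the failure mode explicit:

# icetk is imported by ChatGLM-6B's remote tokenizer code at load time,
# so the Space would crash on startup without it.
try:
    import icetk  # noqa: F401
except ImportError as err:
    raise SystemExit("icetk missing; install it via requirements.txt") from err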
vocab/chatglm_6b/__init__.py
CHANGED
@@ -1,3 +1,6 @@
+"""
+Depends on icetk
+"""
 
 import os
 from transformers import AutoTokenizer
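Why this docstring matters: loading the ChatGLM-6B tokenizer pulls icetk in through the model's remote code. A minimal sketch under that assumption (the "THUDM/chatglm-6b" id and trust_remote_code=True are the usual way to load it; neither appears in this diff):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

# chatglm's vocab includes <|blank_n|> tokens that fold runs of spaces into a
# single token, which is what the disabled 空格测试 (space test) example
# exercised; a run of spaces is expected to come back as e.g. "<|blank_2|>".
print(tokenizer.tokenize("2个空格  8个空格"))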
vocab/llama2/README.md
ADDED
(new empty file)