update
README.md CHANGED

@@ -1,6 +1,5 @@
 ---
-title: Llamacpp-
-emoji: ⚡
+title: Llamacpp-t5-query-reformulation-RL
 colorFrom: indigo
 colorTo: blue
 sdk: gradio
@@ -8,7 +7,7 @@ sdk_version: 5.20.1
 app_file: app.py
 pinned: false
 license: mit
-short_description:
+short_description: t5-query-reformulation-RL on Llama.cpp CPU
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED

@@ -25,16 +25,20 @@ from exception import CustomExceptionHandling
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 os.makedirs("models",exist_ok=True)

+
+
 hf_hub_download(
     repo_id="AnanyaPathak/t5-query-reformulation-RL-GGUF",
     filename="t5-query-reformulation-RL-q8_0.gguf",
     local_dir="./models",
 )

+
+
 # Set the title and description
 title = "t5-query-reformulation-RL Llama.cpp"
 description = """
-I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5), I'm not sure current llama-cpp-python support t5
+I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5), I'm not sure current llama-cpp-python server support t5

 [Model-Q8_0-GGUF](https://huggingface.co/AnanyaPathak/t5-query-reformulation-RL-GGUF), [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp)
 """
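Aside (not part of the commit): `hf_hub_download` returns the resolved local path of the fetched file, so the download step above can be sanity-checked before the model is constructed. A minimal sketch, reusing the repo and filename from the hunk:

```python
# Sketch only: repo_id and filename are taken from the hunk above; the
# assertion is an illustrative check, not part of the Space's app.py.
import os

from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="AnanyaPathak/t5-query-reformulation-RL-GGUF",
    filename="t5-query-reformulation-RL-q8_0.gguf",
    local_dir="./models",
)
assert os.path.isfile(model_path)
print(model_path)  # e.g. ./models/t5-query-reformulation-RL-q8_0.gguf
```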
@@ -77,16 +81,18 @@ def respond(
     Returns:
         str: The response to the message.
     """
+
     try:
         global llama
         if llama == None:
-
+            model_id = "t5-query-reformulation-RL-q8_0.gguf"
+            llama = Llama(f"models/{model_id}",flash_attn=False,
             n_gpu_layers=0,
-            n_batch=
+            #n_batch=16,#batch sometime make error
             n_ctx=512,
             n_threads=2,
             n_threads_batch=2)
-
+
         tokens = llama.tokenize(f"{message}".encode("utf-8"))
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
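This hunk lazy-loads the GGUF on the first request and then drives T5's encoder/decoder split by hand: tokenize and encode the source once, then decode starting from `decoder_start_token()`. A minimal sketch of the complete loop, assuming the fairydreaming T5 branch (`encode` and `decoder_start_token` come from that branch; `eval`, `sample`, and `detokenize` are stock llama-cpp-python; the 64-token cap and temperature are illustrative):

```python
# Sketch of the encoder-decoder loop, assuming the fairydreaming T5 branch
# of llama-cpp-python; model path and sampling settings mirror the diff.
from llama_cpp import Llama

llama = Llama("models/t5-query-reformulation-RL-q8_0.gguf",
              n_gpu_layers=0, n_ctx=512, n_threads=2, n_threads_batch=2)

prompt = "who invented the telephone"
llama.encode(llama.tokenize(prompt.encode("utf-8")))  # run the encoder once

out = []
step = [llama.decoder_start_token()]  # decoder begins from its own start token
while len(out) < 64:
    llama.eval(step)                  # one decoder step over the new token(s)
    tok = llama.sample(temp=0.4)
    if tok == llama.token_eos():
        break
    out.append(tok)
    step = [tok]

print(llama.detokenize(out).decode("utf-8", errors="ignore"))
```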
@@ -114,7 +120,7 @@ def respond(
 # Create a chat interface
 demo = gr.ChatInterface(
     respond,
-    examples=[["What is the capital of France?"], ["
+    examples=[["What is the capital of France?"], ["What real child was raised by wolves?"], ["What is gravity?"]],
     additional_inputs_accordion=gr.Accordion(
         label="⚙️ Parameters", open=False, render=False
     ),
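For context, a generic sketch (not the Space's actual app.py) of how `gr.ChatInterface` feeds each additional input's value to the callback after `message` and `history`; the placeholder `respond` body and slider values are illustrative:

```python
# Generic gr.ChatInterface wiring; the echo body is a placeholder, and the
# sliders mirror the diff's Max Tokens / Temperature controls.
import gradio as gr

def respond(message, history, max_tokens, temperature):
    return f"echo ({temperature=}, {max_tokens=}): {message}"

demo = gr.ChatInterface(
    respond,
    examples=[["What is the capital of France?"]],
    additional_inputs=[
        gr.Slider(minimum=1, maximum=2048, value=1024, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.4, step=0.1, label="Temperature"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```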
@@ -140,12 +146,12 @@ demo = gr.ChatInterface(
         value=1024,
         step=1,
         label="Max Tokens",
-        info="Maximum length of response (higher = longer replies)",
+        info="Maximum length of response (higher = longer replies)",visible=False
     ),
     gr.Slider(
         minimum=0.1,
         maximum=2.0,
-        value=0.
+        value=0.4,
         step=0.1,
         label="Temperature",
         info="Creativity level (higher = more creative, lower = more focused)",
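Note on the Max Tokens change: a Gradio component created with `visible=False` is hidden from the UI but still passes its value to the callback, so the response length stays pinned at 1024 while only Temperature remains user-adjustable.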