monurcan committed
Commit 8178d81 · 1 Parent(s): d0df48e
Files changed (1):
  1. app.py +50 -48
app.py CHANGED
@@ -105,55 +105,57 @@ def model_inference(input_dict, history, *additional_inputs):
         yield "Please login with a Hugging Face account (use the Login button in the sidebar)."
         return
 
-    try:
-        client = InferenceClient(token=hf_token.token, model=model_name)
-
-        response = ""
-        yield progress_bar_html("Processing...")
-
-        # The API may stream tokens. Try to iterate the streaming generator and extract token deltas.
-        try:
-            stream = client.chat.completions.create(messages=messages, stream=True)
-        except TypeError:
-            # older/newer client variants: try the alternative method name
-            stream = client.chat_completion(messages=messages, stream=True)
-
-        for chunk in stream:
-            # chunk can be an object with attributes or a dict depending on client version
-            token = ""
-            try:
-                # attempt dict-style
-                if isinstance(chunk, dict):
-                    choices = chunk.get("choices")
-                    if choices and len(choices) > 0:
-                        delta = choices[0].get("delta", {})
-                        token = delta.get("content") or ""
-                else:
-                    # attribute-style
-                    choices = getattr(chunk, "choices", None)
-                    if choices and len(choices) > 0:
-                        delta = getattr(choices[0], "delta", None)
-                        if isinstance(delta, dict):
-                            token = delta.get("content") or ""
-                        else:
-                            token = getattr(delta, "content", "")
-            except Exception:
-                token = ""
-
-            if token:
-                # escape incremental token to avoid raw HTML breaking the chat box
-                response += html.escape(token)
-                time.sleep(0.001)
-                yield response
-
-        # ensure we yield at least one final message so the async iterator doesn't see StopIteration
-        if response:
-            yield response
-        else:
-            yield "(no text was returned by the model)"
-    except Exception as e:
-        # don't let exceptions escape the generator; yield them so Gradio can display them
-        yield f"Error during inference: {e}"
+    client = InferenceClient(
+        token=hf_token.token, model=model_name, provider="hf-inference"
+    )
+
+    response = ""
+    for message in client.chat_completion(
+        messages,
+        max_tokens=1024,
+        stream=True,
+    ):
+        choices = message.choices
+        token = ""
+        if len(choices) and choices[0].delta.content:
+            token = choices[0].delta.content
+
+        response += token
+        yield response
+
+    # for chunk in stream:
+    #     # chunk can be an object with attributes or a dict depending on client version
+    #     token = ""
+    #     try:
+    #         # attempt dict-style
+    #         if isinstance(chunk, dict):
+    #             choices = chunk.get("choices")
+    #             if choices and len(choices) > 0:
+    #                 delta = choices[0].get("delta", {})
+    #                 token = delta.get("content") or ""
+    #         else:
+    #             # attribute-style
+    #             choices = getattr(chunk, "choices", None)
+    #             if choices and len(choices) > 0:
+    #                 delta = getattr(choices[0], "delta", None)
+    #                 if isinstance(delta, dict):
+    #                     token = delta.get("content") or ""
+    #                 else:
+    #                     token = getattr(delta, "content", "")
+    #     except Exception:
+    #         token = ""
+
+    #     if token:
+    #         # escape incremental token to avoid raw HTML breaking the chat box
+    #         response += html.escape(token)
+    #         time.sleep(0.001)
+    #         yield response
+
+    # # ensure we yield at least one final message so the async iterator doesn't see StopIteration
+    # if response:
+    #     yield response
+    # else:
+    #     yield "(no text was returned by the model)"
 
 
 examples = [
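
For reference, the streaming pattern this commit switches to can be exercised on its own. The sketch below is a minimal, self-contained version of the new loop; the model id and the HF_TOKEN environment variable are placeholders for this example and are not part of the commit, which instead uses the app's own model_name and the token obtained from the Gradio login flow (hf_token.token).

import os

from huggingface_hub import InferenceClient

# Placeholder model id for this sketch; app.py supplies its own model_name.
model_name = "HuggingFaceH4/zephyr-7b-beta"

# Assumes a user access token in the HF_TOKEN environment variable.
client = InferenceClient(
    token=os.environ["HF_TOKEN"], model=model_name, provider="hf-inference"
)

messages = [{"role": "user", "content": "Say hello in one sentence."}]

response = ""
for message in client.chat_completion(messages, max_tokens=1024, stream=True):
    # Each streamed chunk carries a delta; guard against empty choices and
    # empty content, exactly as the loop in app.py does.
    choices = message.choices
    if len(choices) and choices[0].delta.content:
        response += choices[0].delta.content

print(response)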