Update app.py
app.py
CHANGED
@@ -257,7 +257,11 @@ class KerasBackend(ModelBackend):
         self.display_name = display_name

         # Pre-compile predict function for faster inference
-
+        # Use input_signature to prevent retracing with different shapes
+        @tf.function(
+            input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)],
+            jit_compile=True
+        )
         def fast_predict(inputs):
             return model(inputs, training=False)

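The decorator added in this hunk pins the traced signature to a [1, None] int32 tensor, so tf.function traces fast_predict once instead of retracing for every new prompt length. A minimal sketch of the same pattern, assuming a stand-in Keras model (the toy layers below are not the Space's model):

import tensorflow as tf

# Stand-in model: an embedding followed by a projection back to a toy vocab.
toy_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=100, output_dim=8),
    tf.keras.layers.Dense(100),
])

@tf.function(
    input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)],
    jit_compile=True,  # ask for XLA compilation of the traced graph
)
def fast_predict(inputs):
    # training=False keeps dropout/batch-norm layers in inference mode
    return toy_model(inputs, training=False)

# Both calls reuse one trace: the signature fixes dtype and rank and leaves
# the sequence length dynamic (XLA may still specialize per concrete shape).
fast_predict(tf.constant([[1, 2, 3]], dtype=tf.int32))
fast_predict(tf.constant([[4, 5, 6, 7, 8]], dtype=tf.int32))
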
@@ -279,6 +283,21 @@ class KerasBackend(ModelBackend):
         self.n_heads = model.cfg.get('n_heads', 0)
         self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))

+    def predict(self, input_ids):
+        inputs = tf.constant([input_ids], dtype=tf.int32)
+        logits = self.fast_predict(inputs)
+        return logits[0, -1, :].numpy()
+
+        # Count parameters
+        total, non_zero = count_parameters(model)
+        self.total_params = total
+        self.non_zero_params = non_zero
+        self.sparsity = (1 - non_zero / total) * 100 if total > 0 else 0
+
+        # Calculate actual model config for speed estimation
+        self.n_heads = model.cfg.get('n_heads', 0)
+        self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))
+
     def predict(self, input_ids):
         inputs = np.array([input_ids], dtype=np.int32)
         logits = self.fast_predict(inputs)
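The new predict path in this hunk runs the compiled function on a batch of one and keeps only logits[0, -1, :], the next-token distribution for the last position; the surrounding added lines record parameter counts and a sparsity percentage from count_parameters. A rough sketch of that arithmetic follows; this count_parameters body is an assumption, the app defines its own helper:

import numpy as np

def count_parameters(model):
    # Assumed behaviour: return (total, non_zero) summed over all weight tensors.
    total, non_zero = 0, 0
    for w in model.weights:
        values = w.numpy()
        total += values.size
        non_zero += int(np.count_nonzero(values))
    return total, non_zero

# The sparsity formula exactly as in the diff, on made-up numbers:
total, non_zero = 1_000_000, 250_000
sparsity = (1 - non_zero / total) * 100 if total > 0 else 0
print(f"{sparsity:.1f}% of weights are exactly zero")  # 75.0%
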
@@ -687,19 +706,16 @@ if __name__ == "__main__":
        # Start assistant message
        history.append({"role": "assistant", "content": "<think>"})

-       # Stream response
-       update_counter = 0
+       # Stream response - update every token for smooth streaming
        last_tokens_per_sec = 0

        for new_chunk, in_thinking, tokens_per_sec in generate_response_stream(prompt, temperature, backend):
            history[-1]["content"] += new_chunk
-           update_counter += 1
            last_tokens_per_sec = tokens_per_sec

-           #
-
-
-           yield "", render_history(history, show_thinking), speed_text
+           # Update UI on every chunk
+           speed_text = f"⚡ {tokens_per_sec:.1f} tok/s"
+           yield "", render_history(history, show_thinking), speed_text

        # Final yield to ensure everything is rendered
        final_speed = f"✅ {last_tokens_per_sec:.1f} tok/s (avg)"
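This hunk drops the update_counter throttle and yields to the UI on every chunk, attaching a fresh tok/s figure each time. A self-contained sketch of the same per-chunk streaming idea (stream_demo and the dummy chunks are illustrative, not the app's code):

import time

def stream_demo(chunks):
    start = time.time()
    text, tokens = "", 0
    for chunk in chunks:
        text += chunk
        tokens += 1
        tokens_per_sec = tokens / max(time.time() - start, 1e-6)
        # One yield per chunk keeps the rendered chat in step with generation,
        # at the cost of more UI updates than a counter-based throttle.
        yield text, f"⚡ {tokens_per_sec:.1f} tok/s"

for text, speed in stream_demo(["Hel", "lo", ",", " world"]):
    print(speed, repr(text))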