Update app.py
app.py
CHANGED
@@ -257,7 +257,11 @@ class KerasBackend(ModelBackend):
         self.display_name = display_name

         # Pre-compile predict function for faster inference
-
+        # Use input_signature to prevent retracing with different shapes
+        @tf.function(
+            input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)],
+            jit_compile=True
+        )
         def fast_predict(inputs):
             return model(inputs, training=False)

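The decorator added in this hunk pins the traced signature to a [1, None] int32 tensor, so tf.function traces fast_predict once instead of retracing for every new prompt length. A minimal sketch of the same pattern, assuming a stand-in Keras model (the toy layers below are not the Space's model):

import tensorflow as tf

# Stand-in model: an embedding followed by a projection back to a toy vocab.
toy_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=100, output_dim=8),
    tf.keras.layers.Dense(100),
])

@tf.function(
    input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)],
    jit_compile=True,  # ask for XLA compilation of the traced graph
)
def fast_predict(inputs):
    # training=False keeps dropout/batch-norm layers in inference mode
    return toy_model(inputs, training=False)

# Both calls reuse one trace: the signature fixes dtype and rank and leaves
# the sequence length dynamic (XLA may still specialize per concrete shape).
fast_predict(tf.constant([[1, 2, 3]], dtype=tf.int32))
fast_predict(tf.constant([[4, 5, 6, 7, 8]], dtype=tf.int32))
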
@@ -279,6 +283,21 @@ class KerasBackend(ModelBackend):
         self.n_heads = model.cfg.get('n_heads', 0)
         self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))

+    def predict(self, input_ids):
+        inputs = tf.constant([input_ids], dtype=tf.int32)
+        logits = self.fast_predict(inputs)
+        return logits[0, -1, :].numpy()
+
+        # Count parameters
+        total, non_zero = count_parameters(model)
+        self.total_params = total
+        self.non_zero_params = non_zero
+        self.sparsity = (1 - non_zero / total) * 100 if total > 0 else 0
+
+        # Calculate actual model config for speed estimation
+        self.n_heads = model.cfg.get('n_heads', 0)
+        self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))
+
     def predict(self, input_ids):
         inputs = np.array([input_ids], dtype=np.int32)
         logits = self.fast_predict(inputs)
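The new predict path in this hunk runs the compiled function on a batch of one and keeps only logits[0, -1, :], the next-token distribution for the last position; the surrounding added lines record parameter counts and a sparsity percentage from count_parameters. A rough sketch of that arithmetic follows; this count_parameters body is an assumption, the app defines its own helper:

import numpy as np

def count_parameters(model):
    # Assumed behaviour: return (total, non_zero) summed over all weight tensors.
    total, non_zero = 0, 0
    for w in model.weights:
        values = w.numpy()
        total += values.size
        non_zero += int(np.count_nonzero(values))
    return total, non_zero

# The sparsity formula exactly as in the diff, on made-up numbers:
total, non_zero = 1_000_000, 250_000
sparsity = (1 - non_zero / total) * 100 if total > 0 else 0
print(f"{sparsity:.1f}% of weights are exactly zero")  # 75.0%
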
@@ -687,19 +706,16 @@ if __name__ == "__main__":
        # Start assistant message
        history.append({"role": "assistant", "content": "<think>"})

-       # Stream response
-       update_counter = 0
+       # Stream response - update every token for smooth streaming
        last_tokens_per_sec = 0

        for new_chunk, in_thinking, tokens_per_sec in generate_response_stream(prompt, temperature, backend):
            history[-1]["content"] += new_chunk
-           update_counter += 1
            last_tokens_per_sec = tokens_per_sec

-           #
-
-
-           yield "", render_history(history, show_thinking), speed_text
+           # Update UI on every chunk
+           speed_text = f"⚡ {tokens_per_sec:.1f} tok/s"
+           yield "", render_history(history, show_thinking), speed_text

        # Final yield to ensure everything is rendered
        final_speed = f"✅ {last_tokens_per_sec:.1f} tok/s (avg)"
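This hunk drops the update_counter throttle and yields to the UI on every chunk, attaching a fresh tok/s figure each time. A self-contained sketch of the same per-chunk streaming idea (stream_demo and the dummy chunks are illustrative, not the app's code):

import time

def stream_demo(chunks):
    start = time.time()
    text, tokens = "", 0
    for chunk in chunks:
        text += chunk
        tokens += 1
        tokens_per_sec = tokens / max(time.time() - start, 1e-6)
        # One yield per chunk keeps the rendered chat in step with generation,
        # at the cost of more UI updates than a counter-based throttle.
        yield text, f"⚡ {tokens_per_sec:.1f} tok/s"

for text, speed in stream_demo(["Hel", "lo", ",", " world"]):
    print(speed, repr(text))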