Keeby-smilyai committed on
Commit 87ef6d0 · verified · 1 Parent(s): cd32b87

Update app.py

Files changed (1)
  1. app.py +24 -8
app.py CHANGED
@@ -257,7 +257,11 @@ class KerasBackend(ModelBackend):
         self.display_name = display_name
 
         # Pre-compile predict function for faster inference
-        @tf.function(reduce_retracing=True, jit_compile=True)
+        # Use input_signature to prevent retracing with different shapes
+        @tf.function(
+            input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)],
+            jit_compile=True
+        )
         def fast_predict(inputs):
             return model(inputs, training=False)
 
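What the new decorator buys, for context: with `reduce_retracing=True`, `tf.function` may still retrace once per distinct input shape, whereas a fixed `input_signature` with a dynamic sequence axis pins it to a single trace. A minimal sketch of that difference, assuming the `[1, None]` int32 token-ID contract from the hunk; `toy_model` is illustrative and not from app.py:

```python
import tensorflow as tf

# Toy stand-in for the app's Keras model; only the [1, None] int32
# input contract is taken from the hunk above.
toy_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=100, output_dim=8),
    tf.keras.layers.Dense(100),
])

@tf.function(
    input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)],
    jit_compile=True,
)
def fast_predict(inputs):
    return toy_model(inputs, training=False)

# Autoregressive decoding feeds a longer sequence each step; the fixed
# signature keeps this at one Python trace rather than one per length.
for length in (4, 5, 6):
    fast_predict(tf.zeros([1, length], dtype=tf.int32))
print(fast_predict.experimental_get_tracing_count())  # -> 1
```

One caveat worth noting: `jit_compile=True` means XLA, and XLA compiles per concrete shape, so a dynamic axis can still trigger recompilation inside that single trace; whether that bites depends on how the decode loop feeds shapes.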
@@ -279,6 +283,21 @@ class KerasBackend(ModelBackend):
         self.n_heads = model.cfg.get('n_heads', 0)
         self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))
 
+        def predict(self, input_ids):
+            inputs = tf.constant([input_ids], dtype=tf.int32)
+            logits = self.fast_predict(inputs)
+            return logits[0, -1, :].numpy()
+
+        # Count parameters
+        total, non_zero = count_parameters(model)
+        self.total_params = total
+        self.non_zero_params = non_zero
+        self.sparsity = (1 - non_zero / total) * 100 if total > 0 else 0
+
+        # Calculate actual model config for speed estimation
+        self.n_heads = model.cfg.get('n_heads', 0)
+        self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))
+
     def predict(self, input_ids):
         inputs = np.array([input_ids], dtype=np.int32)
         logits = self.fast_predict(inputs)
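Two things in this hunk are worth flagging. First, the added block re-assigns `self.n_heads` and `self.ff_dim` that the surrounding context already sets, and introduces a second `predict` next to the existing method, so the commit appears to duplicate the tail of `__init__`. Second, `count_parameters(model)` is defined elsewhere in app.py and not shown in this diff; only its `(total, non_zero)` return contract is visible here. A purely hypothetical reconstruction of that contract, to make the sparsity line concrete:

```python
import numpy as np

def count_parameters(model):
    """Hypothetical sketch matching the (total, non_zero) contract used
    in the hunk; app.py's real implementation is not shown in this diff."""
    total, non_zero = 0, 0
    for w in model.weights:  # all trainable and non-trainable weights
        values = np.asarray(w)
        total += values.size
        non_zero += int(np.count_nonzero(values))
    return total, non_zero

# Sparsity exactly as computed in the hunk: percent of exactly-zero weights.
total, non_zero = 1_000_000, 750_000
sparsity = (1 - non_zero / total) * 100 if total > 0 else 0
print(f"{sparsity:.1f}% sparse")  # 25.0% sparse
```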
@@ -687,19 +706,16 @@ if __name__ == "__main__":
         # Start assistant message
         history.append({"role": "assistant", "content": "<think>"})
 
-        # Stream response with reduced UI updates
-        update_counter = 0
+        # Stream response - update every token for smooth streaming
         last_tokens_per_sec = 0
 
         for new_chunk, in_thinking, tokens_per_sec in generate_response_stream(prompt, temperature, backend):
             history[-1]["content"] += new_chunk
-            update_counter += 1
             last_tokens_per_sec = tokens_per_sec
 
-            # Only update UI every 2 chunks for faster perceived speed
-            if update_counter % 2 == 0:
-                speed_text = f" {tokens_per_sec:.1f} tok/s"
-                yield "", render_history(history, show_thinking), speed_text
+            # Update UI on every chunk
+            speed_text = f"⚡ {tokens_per_sec:.1f} tok/s"
+            yield "", render_history(history, show_thinking), speed_text
 
         # Final yield to ensure everything is rendered
         final_speed = f"✅ {last_tokens_per_sec:.1f} tok/s (avg)"
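The streaming change reads better outside diff form: the handler is a generator, each `yield` pushes a UI refresh, and dropping the old `update_counter % 2` gate trades a little render overhead for smoother perceived streaming. A self-contained sketch of that pattern, with app.py's `generate_response_stream` and `render_history` replaced by stubs and `chat_handler` an invented name:

```python
import time

def generate_response_stream(prompt, temperature, backend):
    """Stub for app.py's generator: yields (chunk, in_thinking, tok/s)."""
    start = time.time()
    for i, chunk in enumerate(["Hel", "lo", ",", " wor", "ld", "!"], start=1):
        time.sleep(0.05)  # stand-in for decode latency
        yield chunk, False, i / (time.time() - start)

def render_history(history, show_thinking):
    """Stub: app.py turns chat history into UI markup here."""
    return history

def chat_handler(prompt, temperature, backend, show_thinking=True):
    history = [{"role": "assistant", "content": ""}]
    last_tokens_per_sec = 0
    for new_chunk, in_thinking, tokens_per_sec in generate_response_stream(
        prompt, temperature, backend
    ):
        history[-1]["content"] += new_chunk
        last_tokens_per_sec = tokens_per_sec
        # New behavior: one UI update per chunk (the old code yielded
        # only when update_counter % 2 == 0).
        yield "", render_history(history, show_thinking), f"⚡ {tokens_per_sec:.1f} tok/s"
    # Mirrors the hunk's final yield so the last chunks always render.
    yield "", render_history(history, show_thinking), f"✅ {last_tokens_per_sec:.1f} tok/s (avg)"

for _, hist, speed in chat_handler("hi", temperature=0.7, backend=None):
    print(speed, "|", hist[-1]["content"])
```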