88hours committed on
Commit
f7c72f7
Β·
1 Parent(s): 4768f1e

Improved UI

Browse files
Files changed (2) hide show
  1. app.py +131 -1
  2. utility.py +1 -0
app.py CHANGED
@@ -241,6 +241,136 @@ def init_ui():
241
  test_llama.click(test_btn, None, outputs=[response])
242
  return demo
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  if __name__ == '__main__':
245
- demo = init_ui()
246
  demo.launch(share=True, debug=True)
 
241
  test_llama.click(test_btn, None, outputs=[response])
242
  return demo
243
 
244
+ def init_improved_ui():
245
+
246
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
247
+ # Header Section with Introduction
248
+ with gr.Group():
249
+ gr.Markdown("""
250
+ # 🎬 Video Analysis Assistant
251
+
252
+ ## How it Works:
253
+ 1. πŸ“₯ Provide a YouTube URL.
254
+ 2. πŸ”„ Choose a processing method:
255
+ - Download the video and its captions/subtitles from YouTube.
256
+ - Download the video and generate captions using Whisper AI.
257
+ The system will load the video in video player for preview and process the video and extract frames from it.
258
+ It will then pass the captions and images to the RAG model to store them in the database.
259
+ The RAG (Lance DB) uses a pre-trained BridgeTower model to generate embeddings that provide pairs of captions and related images.
260
+ 3. πŸ€– Analyze video content through:
261
+ - Keyword Search - Use this functionality to search for keywords in the video. Our RAG model will return the most relevant captions and images.
262
+ - AI-powered Q&A - Use this functionality to ask questions about the video content. Our system will use the Meta/LLaMA model to analyze the captions and images and provide detailed answers.
263
+ 4. πŸ“Š Results will be displayed in the response section with related images.
264
+
265
+ > **Note**: Initial processing takes several minutes. Please be patient and monitor the logs for progress updates.
266
+ """)
267
+
268
+ # Video Input Section
269
+ with gr.Group():
270
+ url_input = gr.Textbox(
271
+ label="YouTube URL",
272
+ value="https://www.youtube.com/watch?v=kOEDG3j1bjs",
273
+ visible=True,
274
+ elem_id='url-inp',
275
+ interactive=False
276
+ )
277
+ vid_table_name = gr.Textbox(label="Table Name", visible=False)
278
+ video = gr.Video(label="Video Preview")
279
+
280
+ with gr.Row():
281
+ submit_btn = gr.Button("πŸ“₯ Process with Existing Subtitles", variant="primary")
282
+ submit_btn_gen = gr.Button("🎯 Generate New Subtitles", variant="secondary")
283
+
284
+ # Analysis Tools Section
285
+ with gr.Group():
286
+ gr.Markdown("### πŸ” Analysis Tools")
287
+
288
+ with gr.Tab("Keyword Search"):
289
+ with gr.Row():
290
+ chatbox = gr.Textbox(
291
+ label="Search Keywords",
292
+ value="event horizon",
293
+ visible=False,
294
+ scale=4
295
+ )
296
+ submit_btn_whisper = gr.Button(
297
+ "πŸ”Ž Search",
298
+ elem_id='chat-submit',
299
+ visible=False,
300
+ scale=1
301
+ )
302
+
303
+ with gr.Tab("AI Q&A"):
304
+ with gr.Row():
305
+ chatbox_llm = gr.Textbox(
306
+ label="Ask AI about the video",
307
+ value="What is this video about?",
308
+ visible=False,
309
+ scale=4
310
+ )
311
+ submit_btn_chat = gr.Button(
312
+ "πŸ€– Ask",
313
+ visible=False,
314
+ scale=1
315
+ )
316
+
317
+ # Results Display Section
318
+ with gr.Group():
319
+ gr.Markdown("### πŸ“Š Results")
320
+ response = gr.Textbox(
321
+ label="AI Response",
322
+ elem_id='chat-response',
323
+ visible=False,
324
+ interactive=False
325
+ )
326
+
327
+ with gr.Row():
328
+ frame1 = gr.Image(visible=False, label="Related Frame 1", scale=2)
329
+ frame2 = gr.Image(visible=False, label="Related Frame 2", scale=2)
330
+
331
+ # Control Buttons
332
+ with gr.Row():
333
+ reset_btn = gr.Button("πŸ”„ Start Over", variant="secondary")
334
+ test_llama = gr.Button("πŸ§ͺ Say Hi to Llama", variant="secondary")
335
+
336
+ # Event Handlers
337
+ submit_btn.click(
338
+ fn=process_url_and_init,
339
+ inputs=[url_input],
340
+ outputs=[url_input, submit_btn, video, vid_table_name,
341
+ chatbox, submit_btn_whisper, frame1, frame2,
342
+ chatbox_llm, submit_btn_chat]
343
+ )
344
+
345
+ submit_btn_gen.click(
346
+ fn=lambda x: process_url_and_init(x, from_gen=True),
347
+ inputs=[url_input],
348
+ outputs=[url_input, submit_btn, video, vid_table_name,
349
+ chatbox, submit_btn_whisper, frame1, frame2,
350
+ chatbox_llm, submit_btn_chat]
351
+ )
352
+
353
+ submit_btn_whisper.click(
354
+ fn=return_top_k_most_similar_docs,
355
+ inputs=[vid_table_name, chatbox],
356
+ outputs=[response, frame1, frame2]
357
+ )
358
+
359
+ submit_btn_chat.click(
360
+ fn=lambda table_name, query: return_top_k_most_similar_docs(
361
+ vid_table_name=table_name,
362
+ query=query,
363
+ use_llm=True
364
+ ),
365
+ inputs=[vid_table_name, chatbox_llm],
366
+ outputs=[response, frame1, frame2]
367
+ )
368
+
369
+ reset_btn.click(None, js="() => { location.reload(); }")
370
+ test_llama.click(test_btn, None, outputs=[response])
371
+
372
+ return demo
373
+
374
  if __name__ == '__main__':
375
+ demo = init_improved_ui() # Updated function name here
376
  demo.launch(share=True, debug=True)
utility.py CHANGED
@@ -575,6 +575,7 @@ def lvlm_inference_with_conversation(conversation, max_tokens: int = 200, temper
575
  return response['choices'][-1]['message']['content']
576
 
577
  def get_token():
 
578
  token = os.getenv("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
579
  if token is None:
580
  raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")
 
575
  return response['choices'][-1]['message']['content']
576
 
577
  def get_token():
578
+ load_env()
579
  token = os.getenv("HUGGINGFACE_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
580
  if token is None:
581
  raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")