Changing embedding from PredictionGuard to Local
Browse files
lrn_vector_embeddings.py
CHANGED
|
@@ -102,8 +102,10 @@ def bt_with_masked_input():
|
|
| 102 |
|
| 103 |
print(results)
|
| 104 |
return results
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
print(results)
|
| 104 |
return results
|
| 105 |
+
|
| 106 |
+
if __name__ == "__main__":
    # Manual smoke test: embed each sample caption/image pair with the local
    # BridgeTower helper and print the shape of the first cross-modal embedding.
    # NOTE(review): img1/img2/img3 are presumably dicts with 'caption' and
    # 'image_path' keys defined earlier in this module - confirm.
    for img in [img1, img2, img3]:
        # Open the image in a context manager so its file handle is released
        # as soon as the embedding has been computed.
        with Image.open(img['image_path']) as image:
            embeddings = bt_embeddings_from_local(img['caption'], image)
        print(embeddings['cross_modal_embeddings'][0].shape)
|
mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc
CHANGED
|
Binary files a/mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc and b/mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc differ
|
|
|
mm_rag/embeddings/bridgetower_embeddings.py
CHANGED
|
@@ -3,9 +3,10 @@ from langchain_core.embeddings import Embeddings
|
|
| 3 |
from langchain_core.pydantic_v1 import (
|
| 4 |
BaseModel,
|
| 5 |
)
|
|
|
|
| 6 |
from utility import encode_image, bt_embedding_from_prediction_guard
|
| 7 |
from tqdm import tqdm
|
| 8 |
-
|
| 9 |
class BridgeTowerEmbeddings(BaseModel, Embeddings):
|
| 10 |
""" BridgeTower embedding model """
|
| 11 |
|
|
@@ -51,6 +52,6 @@ class BridgeTowerEmbeddings(BaseModel, Embeddings):
|
|
| 51 |
|
| 52 |
embeddings = []
|
| 53 |
for path_to_img, text in tqdm(zip(images, texts), total=len(texts)):
|
| 54 |
-
embedding =
|
| 55 |
embeddings.append(embedding)
|
| 56 |
return embeddings
|
|
|
|
| 3 |
from langchain_core.pydantic_v1 import (
|
| 4 |
BaseModel,
|
| 5 |
)
|
| 6 |
+
from lrn_vector_embeddings import bt_embeddings_from_local
|
| 7 |
from utility import encode_image, bt_embedding_from_prediction_guard
|
| 8 |
from tqdm import tqdm
|
| 9 |
+
from PIL import Image
|
| 10 |
class BridgeTowerEmbeddings(BaseModel, Embeddings):
|
| 11 |
""" BridgeTower embedding model """
|
| 12 |
|
|
|
|
| 52 |
|
| 53 |
embeddings = []
|
| 54 |
for path_to_img, text in tqdm(zip(images, texts), total=len(texts)):
|
| 55 |
+
embedding = bt_embeddings_from_local(text, Image.open(path_to_img))
|
| 56 |
embeddings.append(embedding)
|
| 57 |
return embeddings
|
requirements.txt
CHANGED
|
@@ -14,4 +14,6 @@ whisper
|
|
| 14 |
webvtt-py
|
| 15 |
tqdm
|
| 16 |
lancedb
|
| 17 |
-
|
|
|
|
|
|
|
|
|
| 14 |
webvtt-py
|
| 15 |
tqdm
|
| 16 |
lancedb
|
| 17 |
+
langchain-core
|
| 18 |
+
langchain-community
|
| 19 |
+
ollama
|
utility.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import Iterator, TextIO, List, Dict, Any, Optional, Sequence, Union
|
|
| 9 |
from enum import auto, Enum
|
| 10 |
import base64
|
| 11 |
import glob
|
|
|
|
| 12 |
from tqdm import tqdm
|
| 13 |
from pytubefix import YouTube, Stream
|
| 14 |
import webvtt
|
|
@@ -18,6 +19,8 @@ from predictionguard import PredictionGuard
|
|
| 18 |
import cv2
|
| 19 |
import json
|
| 20 |
import PIL
|
|
|
|
|
|
|
| 21 |
from PIL import Image
|
| 22 |
import dataclasses
|
| 23 |
import random
|
|
@@ -234,6 +237,7 @@ def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
|
|
| 234 |
Example usage:
|
| 235 |
from pathlib import Path
|
| 236 |
from whisper.utils import write_srt
|
|
|
|
| 237 |
result = transcribe(model, audio_path, temperature=temperature, **args)
|
| 238 |
# save SRT
|
| 239 |
audio_basename = Path(audio_path).stem
|
|
@@ -520,6 +524,29 @@ def lvlm_inference_with_conversation(conversation, max_tokens: int = 200, temper
|
|
| 520 |
)
|
| 521 |
return response['choices'][-1]['message']['content']
|
| 522 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 523 |
# function `extract_and_save_frames_and_metadata``:
|
| 524 |
# receives as input a video and its transcript
|
| 525 |
# does extracting and saving frames and their metadatas
|
|
|
|
| 9 |
from enum import auto, Enum
|
| 10 |
import base64
|
| 11 |
import glob
|
| 12 |
+
import requests
|
| 13 |
from tqdm import tqdm
|
| 14 |
from pytubefix import YouTube, Stream
|
| 15 |
import webvtt
|
|
|
|
| 19 |
import cv2
|
| 20 |
import json
|
| 21 |
import PIL
|
| 22 |
+
from ollama import chat
|
| 23 |
+
from ollama import ChatResponse
|
| 24 |
from PIL import Image
|
| 25 |
import dataclasses
|
| 26 |
import random
|
|
|
|
| 237 |
Example usage:
|
| 238 |
from pathlib import Path
|
| 239 |
from whisper.utils import write_srt
|
| 240 |
+
import requests
|
| 241 |
result = transcribe(model, audio_path, temperature=temperature, **args)
|
| 242 |
# save SRT
|
| 243 |
audio_basename = Path(audio_path).stem
|
|
|
|
| 524 |
)
|
| 525 |
return response['choices'][-1]['message']['content']
|
| 526 |
|
| 527 |
+
def lvlm_inference_with_ollama(conversation, max_tokens: int = 200, temperature: float = 0.95, top_p: float = 0.1, top_k: int = 10, model: str = "llava-1.5-7b-hf"):
    """Run a streamed chat completion through Ollama and return the full reply.

    Args:
        conversation: Message list forwarded unchanged as ``messages`` to
            ``ollama.chat``.
        max_tokens: Maximum number of tokens to generate (sent to Ollama as
            the ``num_predict`` option).
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens considered per step.
        model: Name of the Ollama model to query.
            NOTE(review): the default looks like a Hugging Face model id;
            confirm a model with this exact name exists in the local Ollama
            instance (the stock library tag is usually ``llava:7b``).

    Returns:
        The concatenation of ``chunk['message']['content']`` over every
        streamed chunk.
    """
    # ollama.chat() does not accept temperature/max_tokens/top_p/top_k as
    # keyword arguments; passing them raises TypeError. Generation settings
    # must be supplied through `options`, where the token limit is called
    # `num_predict`.
    options = {
        "temperature": temperature,
        "num_predict": max_tokens,
        "top_p": top_p,
        "top_k": top_k,
    }

    # Send the request to the local Ollama server
    stream = chat(
        model=model,
        messages=conversation,
        stream=True,
        options=options,
    )

    # Each streamed chunk carries a fragment of the assistant message.
    return "".join(chunk['message']['content'] for chunk in stream)
|
| 549 |
+
|
| 550 |
# function `extract_and_save_frames_and_metadata``:
|
| 551 |
# receives as input a video and its transcript
|
| 552 |
# does extracting and saving frames and their metadatas
|