Add extra files for storing multimodal data in RAG
mm_rag/MLM/client.py
ADDED
@@ -0,0 +1,135 @@
"""Base interface for clients making requests/calls to a visual language model provider API"""

from abc import ABC, abstractmethod
from typing import List, Optional, Dict, Union, Iterator
import requests
import json
from utility import isBase64, encode_image, encode_image_from_path_or_url, lvlm_inference

class BaseClient(ABC):
    def __init__(self,
                 hostname: str = "127.0.0.1",
                 port: int = 8090,
                 timeout: int = 60,
                 url: Optional[str] = None):
        self.connection_url = f"http://{hostname}:{port}" if url is None else url
        self.timeout = timeout
        # self.headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        self.headers = {'Content-Type': 'application/json'}

    def root(self):
        """Request the welcome message."""
        connection_route = f"{self.connection_url}/"
        return requests.get(connection_route)

    @abstractmethod
    def generate(self,
                 prompt: str,
                 image: str,
                 **kwargs
                 ) -> str:
        """Send a request to the visual language model API
        and return the generated text.

        Use this method when you want to call the visual language model API to generate text without streaming.

        Args:
            prompt: A prompt.
            image: A string that is either a path to an image or a base64 encoding of an image.
            **kwargs: Arbitrary additional keyword arguments.
                These are usually passed to the model provider API call as generation hyperparameters.

        Returns:
            Text returned from the visual language model provider API call.
        """

    def generate_stream(
        self,
        prompt: str,
        image: str,
        **kwargs
    ) -> Iterator[str]:
        """Send a request to the visual language model API
        and return an iterator over the text streamed back from the API call.

        Use this method when you want to call the visual language model API to stream generated text.

        Args:
            prompt: A prompt.
            image: A string that is either a path to an image or a base64 encoding of an image.
            **kwargs: Arbitrary additional keyword arguments.
                These are usually passed to the model provider API call as generation hyperparameters.

        Returns:
            Iterator of text streamed from the visual language model provider API call.
        """
        raise NotImplementedError()

    def generate_batch(
        self,
        prompt: List[str],
        image: List[str],
        **kwargs
    ) -> List[str]:
        """Send a request to the visual language model API for multi-batch generation
        and return a list of generated texts.

        Use this method when you want to call the visual language model API for multi-batch text generation.
        Multi-batch generation does not support streaming.

        Args:
            prompt: List of prompts.
            image: List of strings, each of which is either a path to an image or a base64 encoding of an image.
            **kwargs: Arbitrary additional keyword arguments.
                These are usually passed to the model provider API call as generation hyperparameters.

        Returns:
            List of texts returned from the visual language model provider API call.
        """
        raise NotImplementedError()

class PredictionGuardClient(BaseClient):

    generate_kwargs = ['max_tokens',
                       'temperature',
                       'top_p',
                       'top_k']

    def filter_accepted_genkwargs(self, kwargs):
        gen_args = {}
        if "generate_kwargs" in kwargs and isinstance(kwargs["generate_kwargs"], dict):
            gen_args = {k: kwargs["generate_kwargs"][k]
                        for k in self.generate_kwargs
                        if k in kwargs["generate_kwargs"]}
        return gen_args

    def generate(self,
                 prompt: str,
                 image: str,
                 **kwargs
                 ) -> str:
        """Send a request to Prediction Guard's API
        and return the text generated by the LLaVA model.

        Use this method when you want to call the LLaVA model API to generate text without streaming.

        Args:
            prompt: A prompt.
            image: A string that is either a path/URL to an image or a base64 encoding of an image.
            **kwargs: Arbitrary additional keyword arguments.
                These are usually passed to the model provider API call as generation hyperparameters.

        Returns:
            Text returned from the visual language model provider API call.
        """
        assert image is not None and len(image) != 0, "the input image cannot be empty; it must be either a base64-encoded image or a path/URL to an image"
        if isBase64(image):
            base64_image = image
        else:  # this is a path or URL to an image
            base64_image = encode_image_from_path_or_url(image)

        args = self.filter_accepted_genkwargs(kwargs)
        return lvlm_inference(prompt=prompt, image=base64_image, **args)
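
Usage sketch for the new client (illustrative only; it assumes the `utility` helpers imported above are available and a Prediction Guard LLaVA endpoint is reachable at the default host/port, and the prompt, image path, and generation settings are made-up examples):

    from mm_rag.MLM.client import PredictionGuardClient

    client = PredictionGuardClient()  # defaults to http://127.0.0.1:8090
    caption = client.generate(
        prompt="Describe what is happening in this frame.",
        image="frames/frame_0001.jpg",  # hypothetical path; a URL or base64 string also works
        generate_kwargs={"max_tokens": 128, "temperature": 0.6},
    )
    print(caption)
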
mm_rag/MLM/lvlm.py
ADDED
@@ -0,0 +1,301 @@
from .client import PredictionGuardClient
from langchain_core.language_models.llms import LLM
from langchain_core.pydantic_v1 import Extra, root_validator
from typing import Any, Optional, List, Dict, Iterator, AsyncIterator
from langchain_core.callbacks import CallbackManagerForLLMRun
from utility import get_from_dict_or_env, MultimodalModelInput

from langchain_core.runnables import RunnableConfig, ensure_config
from langchain_core.language_models.base import LanguageModelInput
from langchain_core.prompt_values import StringPromptValue
# from langchain_core.outputs import GenerationChunk, LLMResult
from langchain_core.language_models.llms import BaseLLM
from langchain_core.callbacks import (
    # CallbackManager,
    CallbackManagerForLLMRun,
)
# from langchain_core.load import dumpd
from langchain_core.runnables.config import run_in_executor

class LVLM(LLM):
    """This class extends the LLM class to implement a custom request to an LVLM provider API."""

    client: Any = None  #: :meta private:
    hostname: Optional[str] = None
    port: Optional[int] = None
    url: Optional[str] = None
    max_new_tokens: Optional[int] = 200
    temperature: Optional[float] = 0.6
    top_k: Optional[float] = 0
    stop: Optional[List[str]] = None
    ignore_eos: Optional[bool] = False
    do_sample: Optional[bool] = True
    lazy_mode: Optional[bool] = True
    hpu_graphs: Optional[bool] = True

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the access token and python package exist in the environment if needed."""
        if values['client'] is None:
            # check whether the URL of the API is provided
            url = get_from_dict_or_env(values, 'url', "VLM_URL", None)
            if url is None:
                hostname = get_from_dict_or_env(values, 'hostname', 'VLM_HOSTNAME', None)
                port = get_from_dict_or_env(values, 'port', 'VLM_PORT', None)
                if hostname is not None and port is not None:
                    values['client'] = PredictionGuardClient(hostname=hostname, port=port)
                else:
                    # use the default hostname and port to create the client
                    values['client'] = PredictionGuardClient()
            else:
                values['client'] = PredictionGuardClient(url=url)
        return values

    @property
    def _llm_type(self) -> str:
        """Return the type of LLM."""
        return "Large Vision Language Model"

    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters for calling the Prediction Guard API."""
        return {
            "max_tokens": self.max_new_tokens,
            "temperature": self.temperature,
            "top_k": self.top_k,
            "ignore_eos": self.ignore_eos,
            "do_sample": self.do_sample,
            "stop": self.stop,
        }

    def get_params(self, **kwargs):
        params = self._default_params
        params.update(kwargs)
        return params

    def _call(
        self,
        prompt: str,
        image: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Run the VLM on the given input.

        Args:
            prompt: The prompt to generate from.
            image: Either a path to an image or a base64 encoding of the image.
            stop: Stop words to use when generating. Model output is cut off at the
                first occurrence of any of the stop substrings.
                If stop tokens are not supported, consider raising NotImplementedError.
        Returns:
            The model output as a string. The actual completion DOES NOT include the prompt.
            Example: TBD
        """
        params = {}
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")
        params['generate_kwargs'] = self.get_params(**kwargs)
        response = self.client.generate(prompt=prompt, image=image, **params)
        return response

    def _stream(
        self,
        prompt: str,
        image: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[str]:
        """Stream the VLM on the given prompt and image.

        Args:
            prompt: The prompt to generate from.
            image: Either a path to an image or a base64 encoding of the image.
            stop: Stop words to use when generating. Model output is cut off at the
                first occurrence of any of the stop substrings.
                If stop tokens are not supported, consider raising NotImplementedError.
        Returns:
            An iterator of output strings. The actual completions DO NOT include the prompt.
            Example: TBD
        """
        params = {}
        params['generate_kwargs'] = self.get_params(**kwargs)
        for chunk in self.client.generate_stream(prompt=prompt, image=image, **params):
            yield chunk

    async def _astream(
        self,
        prompt: str,
        image: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> AsyncIterator[str]:
        """An async version of the _stream method that streams the VLM on the given prompt and image.

        Args:
            prompt: The prompt to generate from.
            image: Either a path to an image or a base64 encoding of the image.
            stop: Stop words to use when generating. Model output is cut off at the
                first occurrence of any of the stop substrings.
                If stop tokens are not supported, consider raising NotImplementedError.
        Returns:
            An async iterator of output strings. The actual completions DO NOT include the prompt.
            Example: TBD
        """
        iterator = await run_in_executor(
            None,
            self._stream,
            prompt,
            image,
            stop,
            run_manager.get_sync() if run_manager else None,
            **kwargs,
        )
        done = object()
        while True:
            item = await run_in_executor(
                None,
                next,
                iterator,
                done,  # type: ignore[call-arg, arg-type]
            )
            if item is done:
                break
            yield item  # type: ignore[misc]

    def invoke(
        self,
        input: MultimodalModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> str:
        config = ensure_config(config)
        if isinstance(input, dict) and 'prompt' in input.keys() and 'image' in input.keys():
            return (
                self.generate_prompt(
                    [self._convert_input(StringPromptValue(text=input['prompt']))],
                    stop=stop,
                    callbacks=config.get("callbacks"),
                    tags=config.get("tags"),
                    metadata=config.get("metadata"),
                    run_name=config.get("run_name"),
                    run_id=config.pop("run_id", None),
                    image=input['image'],
                    **kwargs,
                )
                .generations[0][0]
                .text
            )
        return (
            self.generate_prompt(
                [self._convert_input(input)],
                stop=stop,
                callbacks=config.get("callbacks"),
                tags=config.get("tags"),
                metadata=config.get("metadata"),
                run_name=config.get("run_name"),
                run_id=config.pop("run_id", None),
                **kwargs,
            )
            .generations[0][0]
            .text
        )

    async def ainvoke(
        self,
        input: MultimodalModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> str:
        config = ensure_config(config)
        if isinstance(input, dict) and 'prompt' in input.keys() and 'image' in input.keys():
            llm_result = await self.agenerate_prompt(
                [self._convert_input(StringPromptValue(text=input['prompt']))],
                stop=stop,
                callbacks=config.get("callbacks"),
                tags=config.get("tags"),
                metadata=config.get("metadata"),
                run_name=config.get("run_name"),
                run_id=config.pop("run_id", None),
                image=input['image'],
                **kwargs,
            )
        else:
            llm_result = await self.agenerate_prompt(
                [self._convert_input(input)],
                stop=stop,
                callbacks=config.get("callbacks"),
                tags=config.get("tags"),
                metadata=config.get("metadata"),
                run_name=config.get("run_name"),
                run_id=config.pop("run_id", None),
                **kwargs,
            )
        return llm_result.generations[0][0].text

    def stream(
        self,
        input: MultimodalModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Iterator[str]:
        if type(self)._stream == BaseLLM._stream:
            # model doesn't implement streaming, so use the default implementation
            yield self.invoke(input, config=config, stop=stop, **kwargs)
        else:
            if stop is not None:
                raise ValueError("stop kwargs are not permitted.")
            image = None
            prompt = None
            if isinstance(input, dict) and 'prompt' in input.keys():
                prompt = self._convert_input(input['prompt']).to_string()
            else:
                raise ValueError("prompt must be provided")
            if isinstance(input, dict) and 'image' in input.keys():
                image = input['image']

            for chunk in self._stream(
                prompt=prompt, image=image, **kwargs
            ):
                yield chunk

    async def astream(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> AsyncIterator[str]:
        if (
            type(self)._astream is BaseLLM._astream
            and type(self)._stream is BaseLLM._stream
        ):
            yield await self.ainvoke(input, config=config, stop=stop, **kwargs)
            return
        else:
            if stop is not None:
                raise ValueError("stop kwargs are not permitted.")
            image = None
            if isinstance(input, dict) and 'prompt' in input.keys() and 'image' in input.keys():
                prompt = self._convert_input(input['prompt']).to_string()
                image = input['image']
            else:
                raise ValueError("both prompt and image must be provided")

            async for chunk in self._astream(
                prompt=prompt, image=image, **kwargs
            ):
                yield chunk
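
Usage sketch for the LVLM wrapper (illustrative; it assumes a VLM endpoint is reachable, e.g. configured through the VLM_URL or VLM_HOSTNAME/VLM_PORT environment variables, and the image path is a stand-in):

    from mm_rag.MLM.lvlm import LVLM

    lvlm = LVLM(max_new_tokens=150, temperature=0.5)
    answer = lvlm.invoke({
        "prompt": "What objects are visible on the table?",
        "image": "frames/frame_0042.jpg",  # hypothetical path; a base64 string is also accepted
    })
    print(answer)

    # Streaming variant, only meaningful when the underlying client implements generate_stream:
    # for chunk in lvlm.stream({"prompt": "...", "image": "frames/frame_0042.jpg"}):
    #     print(chunk, end="")
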
mm_rag/embeddings/__pycache__/bridgetower_embeddings.cpython-311.pyc
ADDED
Binary file (3.23 kB).
mm_rag/embeddings/bridgetower_embeddings.py
ADDED
@@ -0,0 +1,56 @@
from typing import List
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import (
    BaseModel,
)
from utility import encode_image, bt_embedding_from_prediction_guard
from tqdm import tqdm

class BridgeTowerEmbeddings(BaseModel, Embeddings):
    """BridgeTower embedding model"""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using BridgeTower.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        embeddings = []
        for text in texts:
            embedding = bt_embedding_from_prediction_guard(text, "")
            embeddings.append(embedding)
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using BridgeTower.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        return self.embed_documents([text])[0]

    def embed_image_text_pairs(self, texts: List[str], images: List[str], batch_size=2) -> List[List[float]]:
        """Embed a list of image-text pairs using BridgeTower.

        Args:
            texts: The list of texts to embed.
            images: The list of paths to the images to embed.
            batch_size: The batch size to process; defaults to 2.
        Returns:
            List of embeddings, one for each image-text pair.
        """
        # the length of texts must be equal to the length of images
        assert len(texts) == len(images), "the length of captions should be equal to the length of images"

        embeddings = []
        for path_to_img, text in tqdm(zip(images, texts), total=len(texts)):
            embedding = bt_embedding_from_prediction_guard(text, encode_image(path_to_img))
            embeddings.append(embedding)
        return embeddings
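
Usage sketch for the embeddings class (illustrative; it assumes Prediction Guard credentials are configured for bt_embedding_from_prediction_guard and that the image path is a stand-in):

    from mm_rag.embeddings.bridgetower_embeddings import BridgeTowerEmbeddings

    embedder = BridgeTowerEmbeddings()
    text_vecs = embedder.embed_documents(["a dog chasing a ball", "a crowded street at night"])
    pair_vecs = embedder.embed_image_text_pairs(
        texts=["a dog chasing a ball"],
        images=["frames/dog.jpg"],  # hypothetical path
    )
    print(len(text_vecs), len(pair_vecs[0]))  # number of texts, embedding dimensionality
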
mm_rag/vectorstores/multimodal_lancedb.py
ADDED
@@ -0,0 +1,131 @@
from typing import Any, Iterable, List, Optional
from langchain_core.embeddings import Embeddings
import uuid
from langchain_community.vectorstores.lancedb import LanceDB

class MultimodalLanceDB(LanceDB):
    """`LanceDB` vector store to process multimodal data

    To use, you should have the ``lancedb`` python package installed.
    You can install it with ``pip install lancedb``.

    Args:
        connection: LanceDB connection to use. If not provided, a new connection
            will be created.
        embedding: Embedding to use for the vectorstore.
        vector_key: Key to use for the vector in the database. Defaults to ``vector``.
        id_key: Key to use for the id in the database. Defaults to ``id``.
        text_key: Key to use for the text in the database. Defaults to ``text``.
        image_path_key: Key to use for the path to an image in the database. Defaults to ``image_path``.
        table_name: Name of the table to use. Defaults to ``vectorstore``.
        api_key: API key to use for a LanceDB cloud database.
        region: Region to use for a LanceDB cloud database.
        mode: Mode to use for adding data to the table. Defaults to ``append``.

    Example:
        .. code-block:: python

            vectorstore = MultimodalLanceDB(embedding=embedding_function, uri='/lancedb')
            vectorstore.add_texts(['text1', 'text2'])
            result = vectorstore.similarity_search('text1')
    """

    def __init__(
        self,
        connection: Optional[Any] = None,
        embedding: Optional[Embeddings] = None,
        uri: Optional[str] = "/tmp/lancedb",
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        api_key: Optional[str] = None,
        region: Optional[str] = None,
        mode: Optional[str] = "append",
    ):
        super(MultimodalLanceDB, self).__init__(connection, embedding, uri, vector_key, id_key, text_key, table_name, api_key, region, mode)
        self._image_path_key = image_path_key

    def add_text_image_pairs(
        self,
        texts: Iterable[str],
        image_paths: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Turn text-image pairs into embeddings and add them to the database.

        Args:
            texts: Iterable of strings to combine with the corresponding images and add to the vectorstore.
            image_paths: Iterable of paths to images (as strings) to combine with the corresponding texts and add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of ids to associate with the texts.

        Returns:
            List of ids of the added text-image pairs.
        """
        # the length of texts must be equal to the length of images
        assert len(texts) == len(image_paths), "the length of transcripts should be equal to the length of images"

        # Embed texts and create documents
        docs = []
        ids = ids or [str(uuid.uuid4()) for _ in texts]
        embeddings = self._embedding.embed_image_text_pairs(texts=list(texts), images=list(image_paths))  # type: ignore
        for idx, text in enumerate(texts):
            embedding = embeddings[idx]
            metadata = metadatas[idx] if metadatas else {"id": ids[idx]}
            docs.append(
                {
                    self._vector_key: embedding,
                    self._id_key: ids[idx],
                    self._text_key: text,
                    self._image_path_key: image_paths[idx],
                    "metadata": metadata,
                }
            )

        if 'mode' in kwargs:
            mode = kwargs['mode']
        else:
            mode = self.mode
        if self._table_name in self._connection.table_names():
            tbl = self._connection.open_table(self._table_name)
            if self.api_key is None:
                tbl.add(docs, mode=mode)
            else:
                tbl.add(docs)
        else:
            self._connection.create_table(self._table_name, data=docs)
        return ids

    @classmethod
    def from_text_image_pairs(
        cls,
        texts: List[str],
        image_paths: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        connection: Any = None,
        vector_key: Optional[str] = "vector",
        id_key: Optional[str] = "id",
        text_key: Optional[str] = "text",
        image_path_key: Optional[str] = "image_path",
        table_name: Optional[str] = "vectorstore",
        **kwargs: Any,
    ):
        instance = MultimodalLanceDB(
            connection=connection,
            embedding=embedding,
            vector_key=vector_key,
            id_key=id_key,
            text_key=text_key,
            image_path_key=image_path_key,
            table_name=table_name,
        )
        instance.add_text_image_pairs(texts, image_paths, metadatas=metadatas, **kwargs)

        return instance