Spaces:
Running
Running
File size: 2,743 Bytes
0b9d8c7 d2a63cc 31086ae d2a63cc 31086ae 0b9d8c7 31086ae 0b9d8c7 d2a63cc 31086ae d2a63cc 31086ae d2a63cc 31086ae d2a63cc 31086ae d2a63cc 0b9d8c7 31086ae 0b9d8c7 31086ae 0b9d8c7 31086ae 0b9d8c7 31086ae 0b9d8c7 31086ae 0b9d8c7 31086ae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import os
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Union
import pandas as pd
import requests
from ray.data import Dataset
class BaseReader(ABC):
    """
    Abstract base class for reading and processing data.

    Subclasses implement :meth:`read`. The helpers defined here decide whether
    an item is kept, validate the columns of a batch, and check whether an
    image path or URL is reachable.

    :param text_column: Name of the field holding the text of ``"text"`` items.
    :param modalities: Modalities handled by the reader; ``["text"]`` when omitted.
    """

    # Item types accepted by ``_should_keep_item``. Kept as a tuple (not a set)
    # so membership tests also work for unhashable ``type`` values.
    _SUPPORTED_ITEM_TYPES = ("text", "image", "table", "equation", "protein")

    def __init__(
        self,
        text_column: str = "content",
        modalities: Union[list, None] = None,
    ):
        self.text_column = text_column
        # A ``None`` default avoids sharing one mutable list between instances.
        self.modalities = modalities if modalities is not None else ["text"]

    @abstractmethod
    def read(self, input_path: Union[str, List[str]]) -> Dataset:
        """
        Read data from the specified file path.

        :param input_path: Path to the input file or list of file paths.
        :return: Ray Dataset containing the read data.
        """

    def _should_keep_item(self, item: Dict[str, Any]) -> bool:
        """
        Determine whether to keep the given item based on the text column.

        Items of type ``"text"`` are kept only when their text column holds a
        string that is non-empty after stripping whitespace. A missing,
        ``None`` or otherwise non-string value counts as empty. Every other
        supported type is always kept.

        :param item: Dictionary representing a data entry.
        :return: True if the item should be kept, False otherwise.
        :raises AssertionError: If the item's ``type`` is not supported.
        """
        item_type = item.get("type")
        if item_type not in self._SUPPORTED_ITEM_TYPES:
            # Raised explicitly instead of via ``assert`` so the check is not
            # stripped under ``python -O``; the exception type is unchanged
            # for callers that already handle it.
            raise AssertionError(f"Unsupported item type: {item_type}")

        if item_type != "text":
            return True

        content = item.get(self.text_column, "")
        # Guard against None/NaN placeholders, which have no ``strip``.
        return isinstance(content, str) and bool(content.strip())

    def _validate_batch(self, batch: pd.DataFrame) -> pd.DataFrame:
        """
        Validate data format.

        :param batch: Batch of records to check.
        :return: The same ``batch`` object, unmodified.
        :raises ValueError: If the ``type`` column is missing, or if the batch
            contains ``"text"`` rows but lacks the configured text column.
        """
        if "type" not in batch.columns:
            raise ValueError(f"Missing 'type' column. Found: {list(batch.columns)}")

        has_text_rows = "text" in batch["type"].values
        if has_text_rows and self.text_column not in batch.columns:
            raise ValueError(
                f"Missing '{self.text_column}' column for text documents"
            )
        return batch

    @staticmethod
    def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
        """
        Check if an image exists at the given local path or URL.

        Anything that does not start with ``http://``, ``https://`` or
        ``ftp://`` is treated as a local path (an optional leading ``file://``
        is removed). Remote locations are probed with a HEAD request and count
        as existing only on a final status code of 200.

        :param path_or_url: Local file path or remote URL of the image.
            Empty or non-string values are reported as missing.
        :param timeout: Timeout for remote URL requests in seconds.
        :return: True if the image exists, False otherwise.
        """
        if not isinstance(path_or_url, str) or not path_or_url:
            return False

        if not path_or_url.startswith(("http://", "https://", "ftp://")):
            local_path = path_or_url
            # Strip the scheme only when it is a prefix, never mid-string.
            if local_path.startswith("file://"):
                local_path = local_path[len("file://"):]
            return os.path.isfile(os.path.abspath(local_path))

        # NOTE(review): ``requests`` appears to have no FTP transport, so
        # ``ftp://`` URLs likely end up in the ``except`` branch below and are
        # reported as missing -- confirm whether FTP support is needed.
        try:
            resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
        except requests.RequestException:
            return False
        return resp.status_code == 200
|