github-actions[bot] committed on
Commit
7735526
·
1 Parent(s): 816b4b1

Auto-sync from demo at Wed Nov 5 04:56:04 UTC 2025

Browse files
graphgen/models/reader/parquet_reader.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List
2
+
3
+ import pandas as pd
4
+
5
+ from graphgen.bases.base_reader import BaseReader
6
+
7
+
8
class ParquetReader(BaseReader):
    """
    Reader that loads a parquet file and exposes its rows as a list of
    plain dictionaries (``List[Dict[str, Any]]``), one dictionary per row.
    """

    def read(self, file_path: str) -> List[Dict[str, Any]]:
        """
        Load ``file_path`` with pandas and return its rows as records.

        Each row becomes a dict keyed by column name. A record whose
        ``"type"`` value equals ``"text"`` must contain the key named by
        ``self.text_column``; the first record violating this aborts the read.

        NOTE(review): ``self.text_column`` and ``self.filter`` are not defined
        in this class -- presumably inherited from ``BaseReader``; confirm there.

        :param file_path: location of the parquet file to read.
        :return: whatever ``self.filter`` returns for the loaded records.
        :raises ValueError: if a ``"text"`` record lacks ``self.text_column``.
        """
        frame = pd.read_parquet(file_path)
        records: List[Dict[str, Any]] = frame.to_dict(orient="records")

        # Find the first text record without the text column, scanning in
        # row order; records are dicts, so None safely means "none found".
        doc = next(
            (
                candidate
                for candidate in records
                if candidate.get("type") == "text"
                and self.text_column not in candidate
            ),
            None,
        )
        if doc is not None:
            raise ValueError(f"Missing '{self.text_column}' in document: {doc}")

        return self.filter(records)