from collections import Counter
from typing import Dict, List

import numpy as np
from transformers import AutoTokenizer


class DatasetAnalyzer:
    """Computes per-sample and aggregate token/word statistics for a dataset."""

    def __init__(self, model_name: str = "facebook/opt-350m"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def analyze_sample(self, sample: Dict) -> Dict:
        """Return token/word counts and field-presence flags for one sample."""
        # Tokenizes the string form of the whole dict, so dict keys and
        # punctuation contribute to token_count as well as field values.
        tokens = self.tokenizer.encode(str(sample))
        return {
            "token_count": len(tokens),
            "word_count": len(str(sample).split()),
            "has_abstract": bool(sample.get("abstract")),
            "has_content": bool(sample.get("full_text") or sample.get("excerpt")),
            "has_section": bool(sample.get("section_type")),
            "domain": sample.get("domain_tag", "unknown"),
        }

    def get_dataset_stats(self, samples: List[Dict]) -> Dict:
        """Aggregate per-sample stats into averages and label distributions."""
        stats = []
        domains = Counter()
        sections = Counter()
        for sample in samples:
            sample_stats = self.analyze_sample(sample)
            stats.append(sample_stats)
            domains[sample_stats["domain"]] += 1
            sections[sample.get("section_type", "unknown")] += 1
        return {
            "total_samples": len(samples),
            "avg_tokens": np.mean([s["token_count"] for s in stats]),
            "avg_words": np.mean([s["word_count"] for s in stats]),
            "domain_distribution": dict(domains),
            "section_distribution": dict(sections),
        }
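

# A minimal usage sketch, not part of the original file: the sample dicts
# below are hypothetical, chosen only to exercise the fields the analyzer
# reads (abstract, full_text / excerpt, section_type, domain_tag).
if __name__ == "__main__":
    analyzer = DatasetAnalyzer()
    samples = [
        {
            "abstract": "We measure tokenizer-level dataset statistics.",
            "full_text": "Token counts approximate what the model would see.",
            "section_type": "introduction",
            "domain_tag": "nlp",
        },
        {
            # No abstract or full_text: has_content falls back to "excerpt".
            "excerpt": "A short excerpt with no abstract.",
            "section_type": "methods",
        },
    ]
    # Prints total_samples, avg_tokens/avg_words, and the
    # domain/section Counter distributions as plain dicts.
    print(analyzer.get_dataset_stats(samples))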