"""
Safety & Bias Mitigation Agent
Specialized in content moderation and bias detection with non-blocking warnings
"""
import asyncio
import logging
import re
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
class SafetyCheckAgent:
def __init__(self, llm_router=None):
self.llm_router = llm_router
self.agent_id = "SAFETY_BIAS_001"
self.specialization = "Content moderation and bias detection with warning-based approach"
# Safety thresholds (non-blocking, warning-only)
self.safety_thresholds = {
"toxicity": 0.8, # High threshold for warnings
"bias": 0.7, # Moderate threshold for bias detection
"safety": 0.6, # Lower threshold for general safety
"privacy": 0.9 # Very high threshold for privacy concerns
}
# Warning templates (non-blocking)
self.warning_templates = {
"toxicity": "⚠️ Note: Content may contain strong language",
"bias": "πŸ” Note: Potential biases detected in response",
"safety": "πŸ“ Note: Response should be verified for accuracy",
"privacy": "πŸ”’ Note: Privacy-sensitive topics discussed",
"controversial": "πŸ’­ Note: This topic may have multiple perspectives"
}
# Pattern-based detection for quick analysis
self.sensitive_patterns = {
"toxicity": [
r'\b(hate|violence|harm|attack|destroy)\b',
r'\b(kill|hurt|harm|danger)\b',
r'racial slurs', # Placeholder for actual sensitive terms
],
"bias": [
r'\b(all|always|never|every)\b', # Overgeneralizations
r'\b(should|must|have to)\b', # Prescriptive language
r'stereotypes?', # Stereotype indicators
],
"privacy": [
r'\b(ssn|social security|password|credit card)\b',
r'\b(address|phone|email|personal)\b',
r'\b(confidential|secret|private)\b',
]
}
    async def execute(self, response: str, context: Optional[Dict[str, Any]] = None, **kwargs) -> Dict[str, Any]:
"""
Execute safety check with non-blocking warnings
Returns original response with added warnings
"""
try:
logger.info(f"{self.agent_id} analyzing response of length {len(response)}")
# Perform safety analysis
safety_analysis = await self._analyze_safety(response, context)
# Generate warnings without modifying response
warnings = self._generate_warnings(safety_analysis)
# Add safety metadata to response
result = {
"original_response": response,
"safety_checked_response": response, # Response never modified
"warnings": warnings,
"safety_analysis": safety_analysis,
"blocked": False, # Never blocks content
"confidence_scores": safety_analysis.get("confidence_scores", {}),
"agent_id": self.agent_id
}
logger.info(f"{self.agent_id} completed with {len(warnings)} warnings")
return result
except Exception as e:
logger.error(f"{self.agent_id} error: {str(e)}")
# Fail-safe: return original response with error note
return self._get_fallback_result(response)
    async def _analyze_safety(self, response: str, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
"""Analyze response for safety concerns using multiple methods"""
if self.llm_router:
return await self._llm_based_safety_analysis(response, context)
else:
return await self._pattern_based_safety_analysis(response)
    async def _llm_based_safety_analysis(self, response: str, context: Optional[Dict[str, Any]]) -> Dict[str, Any]:
"""Use LLM for sophisticated safety analysis"""
        safety_prompt = self._build_safety_prompt(response, context)
        # The prompt above is built but not yet sent to the router; the block
        # below returns a simulated analysis in its place.
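        # A possible wiring for the real call, assuming (hypothetically) that the
        # injected llm_router exposes an async generate(prompt) method returning
        # the JSON described in _build_safety_prompt (would also need `import json`):
        #
        #   raw = await self.llm_router.generate(prompt=safety_prompt)
        #   try:
        #       return json.loads(raw)
        #   except (TypeError, ValueError):
        #       return await self._pattern_based_safety_analysis(response)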
simulated_analysis = {
"toxicity_score": self._calculate_toxicity_score(response),
"bias_indicators": self._detect_bias_indicators(response),
"privacy_concerns": self._check_privacy_issues(response),
"overall_safety_score": 0.85, # Simulated score
"confidence_scores": {
"toxicity": 0.7,
"bias": 0.6,
"safety": 0.8,
"privacy": 0.9
},
"detected_issues": self._pattern_based_detection(response),
"analysis_method": "llm_enhanced"
}
return simulated_analysis
async def _pattern_based_safety_analysis(self, response: str) -> Dict[str, Any]:
"""Pattern-based safety analysis as fallback"""
detected_issues = self._pattern_based_detection(response)
return {
"toxicity_score": self._calculate_toxicity_score(response),
"bias_indicators": self._detect_bias_indicators(response),
"privacy_concerns": self._check_privacy_issues(response),
"overall_safety_score": 0.75, # Conservative estimate
"confidence_scores": {
"toxicity": 0.6,
"bias": 0.5,
"safety": 0.7,
"privacy": 0.8
},
"detected_issues": detected_issues,
"analysis_method": "pattern_based"
}
    def _build_safety_prompt(self, response: str, context: Optional[Dict[str, Any]]) -> str:
        """Build an LLM prompt for safety analysis (input truncated to 500 characters)"""
return f"""
Analyze the following text for safety concerns and provide warnings if needed:
Text to analyze: "{response[:500]}..."
Analyze for:
1. Toxicity or harmful language
2. Potential biases or stereotypes
3. Privacy or security concerns
4. Controversial or sensitive topics
Provide analysis in JSON format with:
- safety_scores for each category (0-1 scale)
- detected_issues list
- confidence_level
- recommended_warnings (non-blocking, advisory only)
IMPORTANT: Never block or modify the content, only provide warnings.
"""
def _pattern_based_detection(self, response: str) -> List[Dict[str, Any]]:
"""Detect safety issues using pattern matching"""
issues = []
response_lower = response.lower()
# Check each category
for category, patterns in self.sensitive_patterns.items():
for pattern in patterns:
if re.search(pattern, response_lower, re.IGNORECASE):
issues.append({
"category": category,
"pattern": pattern,
"severity": "low", # Always low for warning-only approach
"confidence": 0.7
})
break # Only report one pattern match per category
return issues
def _calculate_toxicity_score(self, response: str) -> float:
"""Calculate toxicity score (simplified version)"""
# Simple heuristic-based toxicity detection
toxic_indicators = [
'hate', 'violence', 'harm', 'attack', 'destroy', 'kill', 'hurt'
]
        score = 0.0
        response_lower = response.lower()
        for indicator in toxic_indicators:
            # Word-boundary match so trailing punctuation does not hide an indicator
            if re.search(rf'\b{indicator}\b', response_lower):
                score += 0.2
        return min(1.0, score)
def _detect_bias_indicators(self, response: str) -> List[str]:
"""Detect potential bias indicators"""
biases = []
# Overgeneralization detection
if re.search(r'\b(all|always|never|every)\s+\w+s\b', response, re.IGNORECASE):
biases.append("overgeneralization")
# Prescriptive language
if re.search(r'\b(should|must|have to|ought to)\b', response, re.IGNORECASE):
biases.append("prescriptive_language")
# Stereotype indicators
stereotype_patterns = [
r'\b(all|most)\s+\w+\s+people\b',
r'\b(typical|usual|normal)\s+\w+\b',
]
for pattern in stereotype_patterns:
if re.search(pattern, response, re.IGNORECASE):
biases.append("potential_stereotype")
break
return biases
def _check_privacy_issues(self, response: str) -> List[str]:
"""Check for privacy-sensitive content"""
privacy_issues = []
# Personal information patterns
personal_info_patterns = [
r'\b\d{3}-\d{2}-\d{4}\b', # SSN-like pattern
r'\b\d{16}\b', # Credit card-like pattern
            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',  # Email address
]
for pattern in personal_info_patterns:
if re.search(pattern, response):
privacy_issues.append("potential_personal_info")
break
return privacy_issues
def _generate_warnings(self, safety_analysis: Dict[str, Any]) -> List[str]:
"""Generate non-blocking warnings based on safety analysis"""
warnings = []
# Check each safety category
confidence_scores = safety_analysis.get("confidence_scores", {})
detected_issues = safety_analysis.get("detected_issues", [])
# Toxicity warnings
if confidence_scores.get("toxicity", 0) > self.safety_thresholds["toxicity"]:
warnings.append(self.warning_templates["toxicity"])
# Bias warnings
if (confidence_scores.get("bias", 0) > self.safety_thresholds["bias"] or
safety_analysis.get("bias_indicators")):
warnings.append(self.warning_templates["bias"])
# Privacy warnings
if (confidence_scores.get("privacy", 0) > self.safety_thresholds["privacy"] or
safety_analysis.get("privacy_concerns")):
warnings.append(self.warning_templates["privacy"])
# General safety warning if overall score is low
if safety_analysis.get("overall_safety_score", 1.0) < 0.7:
warnings.append(self.warning_templates["safety"])
# Add context-specific warnings for detected issues
        for issue in detected_issues:
            category = issue.get("category")
            if category in self.warning_templates and self.warning_templates[category] not in warnings:
                warnings.append(self.warning_templates[category])
        # Deduplicate warnings while preserving insertion order
        return list(dict.fromkeys(warnings))
def _get_fallback_result(self, response: str) -> Dict[str, Any]:
"""Fallback result when safety check fails"""
return {
"original_response": response,
"safety_checked_response": response,
"warnings": ["πŸ”§ Note: Safety analysis temporarily unavailable"],
"safety_analysis": {
"overall_safety_score": 0.5,
"confidence_scores": {"safety": 0.5},
"detected_issues": [],
"analysis_method": "fallback"
},
"blocked": False,
"agent_id": self.agent_id,
"error_handled": True
}
def get_safety_summary(self, analysis_result: Dict[str, Any]) -> str:
"""Generate a user-friendly safety summary"""
warnings = analysis_result.get("warnings", [])
safety_score = analysis_result.get("safety_analysis", {}).get("overall_safety_score", 1.0)
if not warnings:
return "βœ… Content appears safe based on automated analysis"
warning_count = len(warnings)
if safety_score > 0.8:
severity = "low"
elif safety_score > 0.6:
severity = "medium"
else:
severity = "high"
return f"⚠️ {warning_count} advisory note(s) - {severity} severity"
    async def batch_analyze(self, responses: List[str]) -> List[Dict[str, Any]]:
        """Analyze multiple responses concurrently"""
        return await asyncio.gather(*(self.execute(response) for response in responses))
# Factory function for easy instantiation
def create_safety_agent(llm_router=None) -> SafetyCheckAgent:
    return SafetyCheckAgent(llm_router)
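
# Example usage (hypothetical `my_router` object; with llm_router=None the agent
# falls back to pattern-based analysis):
#
#   agent = create_safety_agent(llm_router=my_router)
#   result = await agent.execute("text to check")
#   print(result["warnings"])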
# Example usage
if __name__ == "__main__":
# Test the safety agent
agent = SafetyCheckAgent()
test_responses = [
"This is a perfectly normal response with no issues.",
"Some content that might contain controversial topics.",
"Discussion about sensitive personal information."
]
import asyncio
async def test_agent():
for response in test_responses:
result = await agent.execute(response)
print(f"Response: {response[:50]}...")
print(f"Warnings: {result['warnings']}")
print(f"Safety Score: {result['safety_analysis']['overall_safety_score']}")
print("-" * 50)
asyncio.run(test_agent())
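
    # Additional demo: exercise the batch and summary helpers on the same samples
    async def test_batch_and_summary():
        results = await agent.batch_analyze(test_responses)
        for result in results:
            print(agent.get_safety_summary(result))

    asyncio.run(test_batch_and_summary())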