JatsTheAIGen committed
Commit 0b5851a · Parent: 7506c11

safety agent upgrades to enable creative freedom v1

app.py CHANGED
@@ -514,7 +514,54 @@ async def process_message_async(message: str, history: Optional[List], session_i
 
     new_history = list(history) if isinstance(history, list) else []
 
-    # Add user message
+    # Check if this is a safety choice response
+    message_upper = message.strip().upper()
+    is_safety_choice = message_upper in ['YES', 'NO', 'APPLY', 'KEEP', 'Y', 'N']
+
+    # Check if we have a pending safety choice for this session
+    if is_safety_choice and orchestrator is not None and hasattr(orchestrator, '_pending_choices'):
+        pending_choice = orchestrator._pending_choices.get(session_id)
+        if pending_choice:
+            logger.info(f"Processing safety choice: {message_upper}")
+
+            # Determine user decision
+            user_decision = message_upper in ['YES', 'APPLY', 'Y']
+
+            # Process the safety choice
+            choice_result = await orchestrator.handle_user_safety_decision(
+                pending_choice['choice_id'],
+                user_decision,
+                session_id
+            )
+
+            # Clean up pending choice
+            del orchestrator._pending_choices[session_id]
+
+            # Add user message
+            new_history.append({"role": "user", "content": message.strip()})
+
+            # Add assistant response
+            if 'error' in choice_result:
+                response = f"Error processing safety choice: {choice_result['error']}"
+            else:
+                response = choice_result.get('response', choice_result.get('final_response', 'Processing complete.'))
+
+            new_history.append({"role": "assistant", "content": response})
+
+            # Extract metadata
+            reasoning_data = {}
+            performance_data = {
+                "user_choice": choice_result.get('user_choice', 'unknown'),
+                "revision_applied": choice_result.get('revision_applied', False)
+            }
+            context_data = {
+                "interaction_id": choice_result.get('interaction_id', 'unknown'),
+                "session_id": session_id
+            }
+
+            return new_history, "", reasoning_data, performance_data, context_data, session_id, ""
+
+    # Add user message (normal flow)
     new_history.append({"role": "user", "content": message.strip()})
 
     # Initialize Details tab data
@@ -529,13 +576,65 @@ async def process_message_async(message: str, history: Optional[List], session_i
     # Try to use orchestrator if available
     if orchestrator is not None:
         try:
-            logger.info("Attempting full orchestration with safety revision...")
-            # Use enhanced orchestrator with safety revision
-            result = await orchestrator.process_request_with_revision(
+            logger.info("Attempting full orchestration...")
+            # First, try normal processing to check for user choice
+            result = await orchestrator.process_request(
                 session_id=session_id,
                 user_input=message.strip()
            )
 
+            # Check if user choice is required
+            if result.get('requires_user_choice', False):
+                logger.info("User choice required for safety concerns")
+                choice_prompt = result.get('choice_prompt', {})
+                choice_id = choice_prompt.get('choice_id', '')
+
+                # Create user-friendly choice prompt message
+                prompt_text = choice_prompt.get('prompt_text', '').replace('**', '').replace('*', '')
+
+                choice_message = f"""🛡️ **SAFETY REVIEW REQUIRED**
+
+{prompt_text}
+
+**Please reply with:**
+- Type "YES" or "APPLY" to revise the response
+- Type "NO" or "KEEP" to keep the original response with warnings"""
+
+                # Store choice_id in session for later retrieval
+                # We'll detect it when user responds
+                if not hasattr(orchestrator, '_pending_choices'):
+                    orchestrator._pending_choices = {}
+                orchestrator._pending_choices[session_id] = {
+                    'choice_id': choice_id,
+                    'partial_response': result.get('partial_response', ''),
+                    'safety_analysis': result.get('safety_analysis', {})
+                }
+
+                # Add assistant message with choice prompt
+                new_history.append({
+                    "role": "assistant",
+                    "content": choice_message
+                })
+
+                # Extract metadata for Details tab
+                reasoning_data = result.get('metadata', {}).get('reasoning_chain', {})
+                performance_data = {
+                    "processing_time": result.get('metadata', {}).get('processing_time', 0),
+                    "agents_used": result.get('metadata', {}).get('agents_used', [])
+                }
+                context_data = {
+                    "interaction_id": result.get('interaction_id', 'unknown'),
+                    "session_id": session_id
+                }
+
+                # Return early with choice prompt
+                return new_history, "", reasoning_data, performance_data, context_data, session_id, ""
+
+            # If no user choice needed, proceed with safety revision if needed
+            # This maintains backward compatibility
+            if not result.get('requires_user_choice', False):
+                logger.info("No user choice needed, proceeding with normal flow")
+
             # Log revision information
             revision_attempts = result.get('revision_attempts', 0)
             safety_revision_applied = result.get('safety_revision_applied', False)
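A minimal sketch of the two-turn round trip the app.py change implements: the first turn stashes the `choice_id` in `orchestrator._pending_choices`, and a follow-up "YES"/"NO" message is routed to `handle_user_safety_decision`. The `StubOrchestrator`, the session id, and the literal strings below are placeholders for illustration, not part of the commit.

```python
import asyncio

class StubOrchestrator:
    async def process_request(self, session_id, user_input):
        # First turn: pretend safety thresholds were exceeded.
        return {
            'requires_user_choice': True,
            'choice_prompt': {'choice_id': f'{session_id}_1', 'prompt_text': 'Concerns found.'},
            'partial_response': 'Original draft...',
            'safety_analysis': {},
        }

    async def handle_user_safety_decision(self, choice_id, user_decision, session_id):
        # Second turn: return either the revised or the original-with-warnings text.
        return {'response': 'Revised draft.' if user_decision else 'Original draft + warnings.'}

async def demo():
    orch = StubOrchestrator()
    session_id = 'abc'

    # Turn 1: the normal request flags a choice; app.py stores it in _pending_choices.
    result = await orch.process_request(session_id, 'write something edgy')
    if result.get('requires_user_choice'):
        if not hasattr(orch, '_pending_choices'):
            orch._pending_choices = {}
        orch._pending_choices[session_id] = {'choice_id': result['choice_prompt']['choice_id']}

    # Turn 2: the user replies "YES"; app.py maps that to user_decision=True.
    pending = orch._pending_choices.pop(session_id)
    final = await orch.handle_user_safety_decision(pending['choice_id'], True, session_id)
    print(final['response'])  # Revised draft.

asyncio.run(demo())
```
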
orchestrator_integration_patch.py ADDED
@@ -0,0 +1,54 @@
+"""
+Orchestrator Integration Patch for Safety User Choice
+Provides integration instructions and helper functions
+"""
+
+# Integration Instructions:
+"""
+This file documents the integration of the safety user choice system.
+
+INTEGRATION POINTS:
+1. In orchestrator_engine.py, after safety_agent completes (around line 191-194):
+   - Call create_safety_choice_prompt() to check if user choice is needed
+   - If choice needed, return choice prompt to UI
+   - If no choice needed, proceed with current workflow
+
+2. Add new method to orchestrator class:
+   - handle_user_safety_decision(choice_id, user_decision)
+   - This processes the user's choice and returns final response
+
+3. In app.py/UI layer:
+   - Display choice prompt when requires_user_choice=True
+   - Provide YES/NO buttons
+   - Submit choice back to orchestrator
+   - Display final response
+
+4. Modify safety revision workflow:
+   - Only run if user chooses "YES"
+   - Otherwise append safety warnings to original response
+
+WORKFLOW:
+Safety analysis completes → Check dynamic thresholds
+If exceeded → Present user choice: "YES (revise)" or "NO (original + warnings)"
+YES → Existing revision workflow
+NO → Original response + formatted safety concerns section
+"""
+
+def get_integration_notes():
+    """Return integration notes"""
+    return {
+        'integration_point': 'orchestrator_engine.py line ~191 (after safety check)',
+        'new_methods': ['handle_user_safety_decision'],
+        'imports_needed': [
+            'from safety_threshold_matrix import should_trigger_user_choice',
+            'from safety_user_choice import create_safety_choice_prompt, process_safety_choice',
+            'from safety_choice_orchestrator import SafetyChoiceOrchestrator'
+        ],
+        'ui_changes': [
+            'Add safety choice prompt display',
+            'Add YES/NO buttons',
+            'Handle choice submission',
+            'Mobile-first responsive design maintained'
+        ]
+    }
+
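A small wiring sketch of the integration point the notes above describe, assuming `safety_checked`, `intent_class`, `response_text`, and `session_id` already exist where the safety agent finishes; the helper name `after_safety_agent` is hypothetical.

```python
from safety_user_choice import create_safety_choice_prompt

def after_safety_agent(safety_checked, intent_class, response_text, session_id):
    """Return a choice payload for the UI, or None to continue the normal workflow."""
    prompt = create_safety_choice_prompt(
        safety_checked.get('safety_analysis', {}), intent_class, response_text, session_id
    )
    if prompt:
        # The UI layer renders prompt['prompt_text'] with YES/NO buttons and later
        # submits the decision via handle_user_safety_decision(choice_id, decision).
        return {'requires_user_choice': True, 'choice_prompt': prompt}
    return None
```
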
safety_choice_orchestrator.py ADDED
@@ -0,0 +1,191 @@
+"""
+Enhanced Orchestrator Workflow with User Safety Choice
+Integrates user decision point after safety analysis
+"""
+
+import logging
+from typing import Dict, Any, Optional
+from safety_user_choice import create_safety_choice_prompt, process_safety_choice
+
+logger = logging.getLogger(__name__)
+
+class SafetyChoiceOrchestrator:
+    """Enhanced orchestrator with user safety choice integration"""
+
+    def __init__(self, original_orchestrator):
+        self.original_orchestrator = original_orchestrator
+        self.pending_safety_choices = {}
+
+    async def process_with_safety_choice(self,
+                                         user_input: str,
+                                         session_id: str,
+                                         intent_class: str,
+                                         safety_analysis: Dict[str, Any],
+                                         response_content: str,
+                                         user_choice_callback=None) -> Dict[str, Any]:
+        """
+        Process request with integrated safety choice workflow
+
+        Args:
+            user_input: User's input text
+            session_id: Session identifier
+            intent_class: Detected intent class
+            safety_analysis: Safety analysis results
+            response_content: Synthesized response content
+            user_choice_callback: Function to handle user choice UI
+
+        Returns:
+            Dict with processing results
+        """
+
+        try:
+            choice_prompt = create_safety_choice_prompt(
+                safety_analysis, intent_class, response_content, session_id
+            )
+
+            if choice_prompt:
+                logger.info(f"Safety concerns detected for intent '{intent_class}' - requiring user choice")
+
+                if user_choice_callback:
+                    user_decision = await user_choice_callback(choice_prompt)
+                    return await self._handle_user_decision(
+                        choice_prompt['choice_id'], user_decision, safety_analysis
+                    )
+                else:
+                    return {
+                        'requires_user_choice': True,
+                        'choice_prompt': choice_prompt,
+                        'session_id': session_id,
+                        'partial_response': response_content,
+                        'safety_analysis': safety_analysis
+                    }
+            else:
+                logger.info("No safety concerns detected - proceeding normally")
+                return {
+                    'response': response_content,
+                    'safety_analysis': safety_analysis,
+                    'requires_user_choice': False
+                }
+
+        except Exception as e:
+            logger.error(f"Error in safety choice orchestration: {e}")
+            return {
+                'error': str(e),
+                'requires_user_choice': False,
+                'response': response_content
+            }
+
+    async def handle_user_safety_decision(self,
+                                          choice_id: str,
+                                          user_decision: bool,
+                                          original_response: str = None) -> Dict[str, Any]:
+        """
+        Handle user's safety decision
+
+        Args:
+            choice_id: The choice identifier
+            user_decision: True for revision, False for original with warnings
+            original_response: Original response content (optional)
+
+        Returns:
+            Dict with final response
+        """
+        return await self._handle_user_decision(choice_id, user_decision, None, original_response)
+
+    async def _handle_user_decision(self,
+                                    choice_id: str,
+                                    user_decision: bool,
+                                    safety_analysis: Dict[str, Any] = None,
+                                    original_response: str = None) -> Dict[str, Any]:
+        """Internal method to handle user decision"""
+
+        choice_result = process_safety_choice(choice_id, user_decision)
+
+        if 'error' in choice_result:
+            return choice_result
+
+        if choice_result['action'] == 'proceed_with_revision':
+            logger.info("User chose revision - running safety revision")
+
+            revised_response = await self._run_safety_revision(
+                choice_result['original_response'],
+                choice_result['safety_analysis'],
+                choice_result.get('intent_class', 'casual_conversation')
+            )
+
+            return {
+                'response': revised_response,
+                'safety_analysis': choice_result['safety_analysis'],
+                'user_choice': 'revision',
+                'requires_user_choice': False,
+                'revision_applied': True
+            }
+
+        elif choice_result['action'] == 'use_original_with_warnings':
+            logger.info("User chose original with warnings")
+
+            return {
+                'response': choice_result['response_content'],
+                'safety_analysis': choice_result['safety_analysis'],
+                'user_choice': 'original_with_warnings',
+                'requires_user_choice': False,
+                'revision_applied': False
+            }
+
+        else:
+            return {'error': 'Unknown action from choice result'}
+
+    async def _run_safety_revision(self,
+                                   original_response: str,
+                                   safety_analysis: Dict[str, Any],
+                                   intent_class: str) -> str:
+        """
+        Run safety revision on the response using the original orchestrator's revision workflow
+
+        Args:
+            original_response: Original response content
+            safety_analysis: Safety analysis results
+            intent_class: Intent class for context
+
+        Returns:
+            str: Revised response
+        """
+        try:
+            exceeded_categories = []
+            safety_scores = safety_analysis.get('safety_scores', {})
+
+            if not safety_scores:
+                confidence_scores = safety_analysis.get('confidence_scores', {})
+                if confidence_scores:
+                    safety_scores = {
+                        'toxicity_or_harmful_language': confidence_scores.get('toxicity', 0.0),
+                        'potential_biases_or_stereotypes': confidence_scores.get('bias', 0.0),
+                        'privacy_or_security_concerns': confidence_scores.get('privacy', 0.0),
+                        'controversial_or_sensitive_topics': 1.0 - confidence_scores.get('safety', 1.0)
+                    }
+
+            safety_warnings = safety_analysis.get('detected_issues', [])
+
+            if hasattr(self.original_orchestrator, 'generate_revision_prompt'):
+                revision_prompt = self.original_orchestrator.generate_revision_prompt(
+                    original_response,
+                    exceeded_categories,
+                    safety_warnings
+                )
+
+                if hasattr(self.original_orchestrator, 'agents') and 'response_synthesis' in self.original_orchestrator.agents:
+                    revised_result = await self.original_orchestrator.agents['response_synthesis'].execute(
+                        agent_outputs={},
+                        user_input=revision_prompt,
+                        context={}
+                    )
+
+                    revised_response = revised_result.get('final_response', original_response)
+                    return revised_response
+
+            return original_response
+
+        except Exception as e:
+            logger.error(f"Error in safety revision: {e}")
+            return original_response
+
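How the wrapper above might be exercised with an inline UI callback. The `auto_yes` callback, the analysis dict, and passing `original_orchestrator=None` (so no revision backend exists and the original text comes back) are assumptions made for this sketch.

```python
import asyncio
from safety_choice_orchestrator import SafetyChoiceOrchestrator

async def auto_yes(choice_prompt):
    # A real UI would render choice_prompt['prompt_text'] and wait for the user.
    return True

async def demo():
    wrapper = SafetyChoiceOrchestrator(original_orchestrator=None)  # no revision backend
    result = await wrapper.process_with_safety_choice(
        user_input="tell me something provocative",
        session_id="demo",
        intent_class="creative_generation",
        safety_analysis={"safety_scores": {"toxicity_or_harmful_language": 0.9}},
        response_content="Draft response...",
        user_choice_callback=auto_yes,
    )
    # With no revision backend the original draft is returned, but the choice is recorded.
    print(result.get("user_choice"), result.get("revision_applied"))  # revision True

asyncio.run(demo())
```
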
safety_threshold_matrix.py ADDED
@@ -0,0 +1,256 @@
+"""
+Intent-Based Safety Threshold Matrix
+Based on academic research for dynamic content moderation thresholds
+"""
+
+# Intent-Based Safety Threshold Matrix (Academic Research-Based)
+INTENT_SAFETY_MATRIX = {
+    'information_request': {
+        'toxicity': 0.20,
+        'sexual_content': 0.15,
+        'violence': 0.15,
+        'self_harm': 0.10,
+        'harassment': 0.20,
+        'dangerous_activities': 0.15,
+        'privacy_violations': 0.25,
+        'misinformation': 0.30,
+        'child_safety': 0.10,
+        'bias_stereotyping': 0.25
+    },
+    'task_execution': {
+        'toxicity': 0.25,
+        'sexual_content': 0.20,
+        'violence': 0.20,
+        'self_harm': 0.15,
+        'harassment': 0.25,
+        'dangerous_activities': 0.20,
+        'privacy_violations': 0.30,
+        'misinformation': 0.35,
+        'child_safety': 0.15,
+        'bias_stereotyping': 0.30
+    },
+    'creative_generation': {
+        'toxicity': 0.40,
+        'sexual_content': 0.35,
+        'violence': 0.35,
+        'self_harm': 0.25,
+        'harassment': 0.30,
+        'dangerous_activities': 0.30,
+        'privacy_violations': 0.20,
+        'misinformation': 0.25,
+        'child_safety': 0.20,
+        'bias_stereotyping': 0.35
+    },
+    'analysis_research': {
+        'toxicity': 0.35,
+        'sexual_content': 0.25,
+        'violence': 0.25,
+        'self_harm': 0.20,
+        'harassment': 0.30,
+        'dangerous_activities': 0.25,
+        'privacy_violations': 0.40,
+        'misinformation': 0.45,
+        'child_safety': 0.25,
+        'bias_stereotyping': 0.40
+    },
+    'casual_conversation': {
+        'toxicity': 0.30,
+        'sexual_content': 0.25,
+        'violence': 0.25,
+        'self_harm': 0.20,
+        'harassment': 0.35,
+        'dangerous_activities': 0.25,
+        'privacy_violations': 0.15,
+        'misinformation': 0.20,
+        'child_safety': 0.15,
+        'bias_stereotyping': 0.25
+    },
+    'troubleshooting': {
+        'toxicity': 0.15,
+        'sexual_content': 0.10,
+        'violence': 0.15,
+        'self_harm': 0.10,
+        'harassment': 0.20,
+        'dangerous_activities': 0.10,
+        'privacy_violations': 0.35,
+        'misinformation': 0.40,
+        'child_safety': 0.10,
+        'bias_stereotyping': 0.20
+    },
+    'education_learning': {
+        'toxicity': 0.25,
+        'sexual_content': 0.20,
+        'violence': 0.30,
+        'self_harm': 0.15,
+        'harassment': 0.20,
+        'dangerous_activities': 0.35,
+        'privacy_violations': 0.25,
+        'misinformation': 0.50,
+        'child_safety': 0.30,
+        'bias_stereotyping': 0.35
+    },
+    'technical_support': {
+        'toxicity': 0.10,
+        'sexual_content': 0.05,
+        'violence': 0.10,
+        'self_harm': 0.05,
+        'harassment': 0.15,
+        'dangerous_activities': 0.05,
+        'privacy_violations': 0.45,
+        'misinformation': 0.45,
+        'child_safety': 0.05,
+        'bias_stereotyping': 0.15
+    }
+}
+
+# Safety parameter descriptions for user display
+SAFETY_DESCRIPTIONS = {
+    'toxicity': 'Hate speech, discriminatory language',
+    'sexual_content': 'Explicit sexual material, nudity',
+    'violence': 'Graphic violence, gore, death depictions',
+    'self_harm': 'Content promoting self-harm or suicide',
+    'harassment': 'Personal attacks, bullying, doxxing',
+    'dangerous_activities': 'Illegal activities, weapon instructions',
+    'privacy_violations': 'Personal data exposure, unauthorized sharing',
+    'misinformation': 'False information, unverified claims',
+    'child_safety': 'Content potentially harmful to minors',
+    'bias_stereotyping': 'Discriminatory generalizations'
+}
+
+# Criticality levels
+CRITICALITY_LEVELS = {
+    (0.0, 0.3): 'LOW',
+    (0.3, 0.6): 'MEDIUM',
+    (0.6, 0.8): 'HIGH',
+    (0.8, 1.0): 'CRITICAL'
+}
+
+def get_dynamic_threshold(intent_class, safety_param, user_context=None):
+    """
+    Calculate dynamic safety threshold based on intent and context
+
+    Args:
+        intent_class: The detected intent type
+        safety_param: The safety parameter to check
+        user_context: Optional user context for adjustments
+
+    Returns:
+        float: Adjusted threshold value
+    """
+    if intent_class not in INTENT_SAFETY_MATRIX:
+        intent_class = 'casual_conversation'
+
+    if safety_param not in INTENT_SAFETY_MATRIX[intent_class]:
+        return 0.2
+
+    base_threshold = INTENT_SAFETY_MATRIX[intent_class][safety_param]
+
+    if user_context:
+        if user_context.get('academic_use', False):
+            base_threshold *= 1.2
+
+        if user_context.get('minor_present', False):
+            base_threshold *= 0.7
+
+        if user_context.get('professional_context', False):
+            base_threshold *= 1.1
+
+        if user_context.get('creative_context', False):
+            base_threshold *= 1.15
+
+    return min(base_threshold, 0.8)
+
+def get_criticality_level(score):
+    """Get criticality level for a safety score"""
+    for (min_val, max_val), level in CRITICALITY_LEVELS.items():
+        if min_val <= score < max_val:
+            return level
+    return 'CRITICAL'
+
+def format_safety_concerns(safety_analysis, intent_class):
+    """
+    Format safety concerns for user display
+
+    Args:
+        safety_analysis: Dict containing safety scores
+        intent_class: The detected intent class
+
+    Returns:
+        str: Formatted safety concerns text or None if no concerns
+    """
+    concerns = []
+
+    safety_mapping = {
+        'toxicity_or_harmful_language': 'toxicity',
+        'potential_biases_or_stereotypes': 'bias_stereotyping',
+        'privacy_or_security_concerns': 'privacy_violations',
+        'controversial_or_sensitive_topics': 'toxicity'
+    }
+
+    safety_scores = safety_analysis.get('safety_scores', {})
+
+    if not safety_scores:
+        confidence_scores = safety_analysis.get('confidence_scores', {})
+        if confidence_scores:
+            safety_scores = {
+                'toxicity_or_harmful_language': confidence_scores.get('toxicity', 0.0),
+                'potential_biases_or_stereotypes': confidence_scores.get('bias', 0.0),
+                'privacy_or_security_concerns': confidence_scores.get('privacy', 0.0),
+                'controversial_or_sensitive_topics': 1.0 - confidence_scores.get('safety', 1.0)
+            }
+
+    for param_key, score in safety_scores.items():
+        if param_key in safety_mapping:
+            safety_param = safety_mapping[param_key]
+            threshold = get_dynamic_threshold(intent_class, safety_param)
+
+            if isinstance(score, (int, float)) and score > threshold:
+                criticality = get_criticality_level(score)
+                description = SAFETY_DESCRIPTIONS.get(safety_param, param_key)
+                concerns.append(f"• **{criticality}**: {description} (Score: {score:.2f}, Threshold: {threshold:.2f})")
+
+    if not concerns:
+        return None
+
+    return "\n".join(concerns)
+
+def should_trigger_user_choice(safety_analysis, intent_class):
+    """
+    Determine if user choice should be triggered based on safety analysis
+
+    Args:
+        safety_analysis: Dict containing safety analysis
+        intent_class: The detected intent class
+
+    Returns:
+        bool: True if user choice should be triggered
+    """
+    safety_scores = safety_analysis.get('safety_scores', {})
+
+    if not safety_scores:
+        confidence_scores = safety_analysis.get('confidence_scores', {})
+        if confidence_scores:
+            safety_scores = {
+                'toxicity_or_harmful_language': confidence_scores.get('toxicity', 0.0),
+                'potential_biases_or_stereotypes': confidence_scores.get('bias', 0.0),
+                'privacy_or_security_concerns': confidence_scores.get('privacy', 0.0),
+                'controversial_or_sensitive_topics': 1.0 - confidence_scores.get('safety', 1.0)
+            }
+
+    safety_mapping = {
+        'toxicity_or_harmful_language': 'toxicity',
+        'potential_biases_or_stereotypes': 'bias_stereotyping',
+        'privacy_or_security_concerns': 'privacy_violations',
+        'controversial_or_sensitive_topics': 'toxicity'
+    }
+
+    for param_key, score in safety_scores.items():
+        if param_key in safety_mapping:
+            safety_param = safety_mapping[param_key]
+            threshold = get_dynamic_threshold(intent_class, safety_param)
+
+            if isinstance(score, (int, float)) and score > threshold:
+                return True
+
+    return False
+
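A quick look at how the matrix behaves, using invented scores (the 0.35 value and the context flag below are illustrative, not values from the commit):

```python
from safety_threshold_matrix import (
    get_dynamic_threshold,
    should_trigger_user_choice,
    format_safety_concerns,
)

# Same category, different intents: creative generation tolerates more toxicity.
print(get_dynamic_threshold('information_request', 'toxicity'))   # 0.2
print(get_dynamic_threshold('creative_generation', 'toxicity'))   # 0.4
# Context multipliers scale the base value, capped at 0.8.
print(get_dynamic_threshold('creative_generation', 'toxicity', {'creative_context': True}))  # 0.4 * 1.15

analysis = {'safety_scores': {'toxicity_or_harmful_language': 0.35}}
print(should_trigger_user_choice(analysis, 'information_request'))  # True  (0.35 > 0.20)
print(should_trigger_user_choice(analysis, 'creative_generation'))  # False (0.35 <= 0.40)
print(format_safety_concerns(analysis, 'information_request'))      # one bullet, MEDIUM criticality
```
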
safety_user_choice.py ADDED
@@ -0,0 +1,210 @@
+"""
+User Choice Interface for Safety Decisions
+Provides binary choice for safety concern handling
+"""
+
+import logging
+from typing import Dict, Any, Optional, Tuple
+from datetime import datetime
+from safety_threshold_matrix import (
+    should_trigger_user_choice,
+    format_safety_concerns,
+    get_criticality_level
+)
+
+logger = logging.getLogger(__name__)
+
+class SafetyUserChoice:
+    """Handles user choice for safety concerns"""
+
+    def __init__(self):
+        self.pending_choices = {}
+
+    def create_user_choice_prompt(self,
+                                  safety_analysis: Dict[str, Any],
+                                  intent_class: str,
+                                  response_content: str,
+                                  session_id: str) -> Dict[str, Any]:
+        """
+        Create user choice prompt for safety concerns
+
+        Args:
+            safety_analysis: Safety analysis results
+            intent_class: Detected intent class
+            response_content: The synthesized response content
+            session_id: Session identifier
+
+        Returns:
+            Dict containing choice prompt data or None if not needed
+        """
+
+        concerns_text = format_safety_concerns(safety_analysis, intent_class)
+
+        if not concerns_text:
+            return None
+
+        choice_id = f"{session_id}_{int(datetime.now().timestamp() * 1000)}"
+
+        choice_data = {
+            'choice_id': choice_id,
+            'safety_analysis': safety_analysis,
+            'intent_class': intent_class,
+            'response_content': response_content,
+            'concerns_text': concerns_text,
+            'timestamp': datetime.now().isoformat()
+        }
+
+        self.pending_choices[choice_id] = choice_data
+
+        prompt_text = f"""🛡️ **SAFETY REVIEW REQUIRED**
+
+The following response has been flagged for potential safety concerns:
+
+**Safety Concerns Detected:**
+{concerns_text}
+
+**Your Response Content:**
+{response_content[:500]}{"..." if len(response_content) > 500 else ""}
+
+**Choose how to proceed:**
+
+**YES** - Apply safety revisions (recommended)
+- The system will automatically revise the response to address safety concerns
+- This may modify or remove flagged content
+
+**NO** - Keep original response with safety warnings
+- The original response will be provided unchanged
+- Safety concerns will be clearly highlighted at the end
+
+Would you like to proceed with safety revisions?
+"""
+
+        return {
+            'choice_id': choice_id,
+            'prompt_text': prompt_text,
+            'requires_user_input': True,
+            'choice_type': 'binary_safety_decision'
+        }
+
+    def process_user_choice(self, choice_id: str, user_decision: bool) -> Dict[str, Any]:
+        """
+        Process user's safety choice
+
+        Args:
+            choice_id: The choice identifier
+            user_decision: True for revision, False for original with warnings
+
+        Returns:
+            Dict with processing results
+        """
+
+        if choice_id not in self.pending_choices:
+            logger.error(f"Choice ID {choice_id} not found")
+            return {'error': 'Invalid choice ID'}
+
+        choice_data = self.pending_choices[choice_id]
+
+        del self.pending_choices[choice_id]
+
+        if user_decision:
+            return {
+                'action': 'proceed_with_revision',
+                'safety_analysis': choice_data['safety_analysis'],
+                'intent_class': choice_data['intent_class'],
+                'original_response': choice_data['response_content']
+            }
+        else:
+            return {
+                'action': 'use_original_with_warnings',
+                'response_content': self._add_safety_warnings(
+                    choice_data['response_content'],
+                    choice_data['concerns_text']
+                ),
+                'safety_analysis': choice_data['safety_analysis']
+            }
+
+    def _add_safety_warnings(self, response_content: str, concerns_text: str) -> str:
+        """
+        Add safety warnings to the original response
+
+        Args:
+            response_content: Original response content
+            concerns_text: Formatted safety concerns
+
+        Returns:
+            str: Response with safety warnings appended
+        """
+
+        warning_section = f"""
+
+---
+
+## ⚠️ Safety Advisory
+
+This response has been flagged for the following safety concerns:
+
+{concerns_text}
+
+**Please review this content carefully and consider:**
+- The potential impact on yourself and others
+- Whether this content aligns with your intended use
+- If additional verification or expert consultation is needed
+
+*This advisory is provided for transparency and user awareness.*
+"""
+
+        return response_content + warning_section
+
+# Global instance for use across the application
+safety_user_choice = SafetyUserChoice()
+
+def check_safety_user_choice_needed(safety_analysis: Dict[str, Any],
+                                    intent_class: str) -> bool:
+    """
+    Check if user choice is needed for safety concerns
+
+    Args:
+        safety_analysis: Safety analysis results
+        intent_class: Detected intent class
+
+    Returns:
+        bool: True if user choice is needed
+    """
+    return should_trigger_user_choice(safety_analysis, intent_class)
+
+def create_safety_choice_prompt(safety_analysis: Dict[str, Any],
+                                intent_class: str,
+                                response_content: str,
+                                session_id: str) -> Optional[Dict[str, Any]]:
+    """
+    Create safety choice prompt if needed
+
+    Args:
+        safety_analysis: Safety analysis results
+        intent_class: Detected intent class
+        response_content: Response content
+        session_id: Session identifier
+
+    Returns:
+        Dict with choice prompt or None if not needed
+    """
+    if not check_safety_user_choice_needed(safety_analysis, intent_class):
+        return None
+
+    return safety_user_choice.create_user_choice_prompt(
+        safety_analysis, intent_class, response_content, session_id
+    )
+
+def process_safety_choice(choice_id: str, user_decision: bool) -> Dict[str, Any]:
+    """
+    Process user's safety choice
+
+    Args:
+        choice_id: Choice identifier
+        user_decision: User's binary decision
+
+    Returns:
+        Dict with processing results
+    """
+    return safety_user_choice.process_user_choice(choice_id, user_decision)
+
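An end-to-end pass through the choice helpers, with an invented analysis dict and session id purely for illustration:

```python
from safety_user_choice import create_safety_choice_prompt, process_safety_choice

analysis = {'safety_scores': {'privacy_or_security_concerns': 0.5}}
prompt = create_safety_choice_prompt(analysis, 'casual_conversation', 'Draft answer...', 'sess-1')

if prompt:  # thresholds exceeded, so a pending choice was registered
    # user_decision=False keeps the original text and appends the Safety Advisory section.
    outcome = process_safety_choice(prompt['choice_id'], user_decision=False)
    print(outcome['action'])            # use_original_with_warnings
    print(outcome['response_content'])  # original draft + "⚠️ Safety Advisory" block
```
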
src/orchestrator_engine.py CHANGED
@@ -4,9 +4,27 @@ import logging
 import time
 import asyncio
 from datetime import datetime
+import sys
+import os
 
 logger = logging.getLogger(__name__)
 
+# Add project root and parent directory to path for imports
+current_dir = os.path.dirname(os.path.abspath(__file__))
+parent_dir = os.path.dirname(current_dir)
+sys.path.insert(0, parent_dir)
+sys.path.insert(0, current_dir)
+
+try:
+    from safety_threshold_matrix import should_trigger_user_choice
+    from safety_user_choice import create_safety_choice_prompt, process_safety_choice
+    from safety_choice_orchestrator import SafetyChoiceOrchestrator
+    SAFETY_CHOICE_AVAILABLE = True
+    logger.info("Safety choice modules loaded successfully")
+except ImportError as e:
+    logger.warning(f"Safety choice modules not available: {e}")
+    SAFETY_CHOICE_AVAILABLE = False
+
 class MVPOrchestrator:
     def __init__(self, llm_router, context_manager, agents):
         self.llm_router = llm_router
@@ -197,6 +215,42 @@ class MVPOrchestrator:
             "result": {"warnings": safety_checked.get('warnings', [])}
         })
 
+        # Step 7.5: Enhanced Safety check with user choice (if available)
+        intent_class = intent_result.get('primary_intent', 'casual_conversation')
+        response_content = final_response.get('final_response', '') or str(final_response.get('response', ''))
+
+        if SAFETY_CHOICE_AVAILABLE:
+            choice_prompt = create_safety_choice_prompt(
+                safety_checked.get('safety_analysis', {}),
+                intent_class,
+                response_content,
+                session_id
+            )
+
+            if choice_prompt:
+                logger.info(f"Safety concerns detected for intent '{intent_class}' - requiring user choice")
+                processing_time = time.time() - start_time
+
+                return {
+                    'requires_user_choice': True,
+                    'choice_prompt': choice_prompt,
+                    'session_id': session_id,
+                    'partial_response': response_content,
+                    'safety_analysis': safety_checked.get('safety_analysis', {}),
+                    'interaction_id': interaction_id,
+                    'intent': intent_class,
+                    'metadata': {
+                        'intent': intent_class,
+                        'processing_time': processing_time,
+                        'agents_used': list(self.agents.keys()),
+                        'intent_result': intent_result,
+                        'skills_result': skills_result,
+                        'synthesis_result': final_response,
+                        'reasoning_chain': reasoning_chain
+                    },
+                    'response': response_content  # Provide partial response
+                }
+
         # Add safety reasoning
         reasoning_chain["chain_of_thought"]["step_5"] = {
             "hypothesis": f"Safety validation for response about '{self._extract_main_topic(user_input)}'",
@@ -334,6 +388,97 @@ class MVPOrchestrator:
             "metadata": metadata
         }
 
+    async def handle_user_safety_decision(self, choice_id: str, user_decision: bool, session_id: str = None) -> dict:
+        """
+        Handle user's safety decision and complete processing
+
+        Args:
+            choice_id: The choice identifier from the prompt
+            user_decision: True for revision, False for original with warnings
+            session_id: Session identifier
+
+        Returns:
+            dict: Final response based on user choice
+        """
+        try:
+            if not SAFETY_CHOICE_AVAILABLE:
+                logger.warning("Safety choice modules not available")
+                return {'error': 'Safety choice system not available'}
+
+            choice_result = process_safety_choice(choice_id, user_decision)
+
+            if 'error' in choice_result:
+                logger.error(f"Error processing safety choice: {choice_result['error']}")
+                return choice_result
+
+            if choice_result['action'] == 'proceed_with_revision':
+                logger.info("User chose revision - applying safety revisions")
+
+                safety_issues = choice_result['safety_analysis'].get('detected_issues', [])
+                safety_scores = choice_result['safety_analysis'].get('safety_scores', {})
+
+                exceeded_categories = []
+                if not safety_scores:
+                    confidence_scores = choice_result['safety_analysis'].get('confidence_scores', {})
+                    if confidence_scores:
+                        if confidence_scores.get('toxicity', 0) > 0.3:
+                            exceeded_categories.append('toxicity_or_harmful_language')
+                        if confidence_scores.get('bias', 0) > 0.05:
+                            exceeded_categories.append('potential_biases_or_stereotypes')
+                        if confidence_scores.get('privacy', 0) > 0.2:
+                            exceeded_categories.append('privacy_or_security_concerns')
+                else:
+                    exceeded_categories = [k for k, v in safety_scores.items() if isinstance(v, (int, float)) and v > 0.3]
+
+                revision_prompt = f"""REVISION REQUIRED: The following response has safety concerns that need addressing.
+
+Original Response: {choice_result['original_response']}
+
+Safety Issues Detected: {', '.join(exceeded_categories) if exceeded_categories else 'General safety concerns'}
+Specific Warnings: {'; '.join(safety_issues) if safety_issues else 'General safety concerns detected'}
+
+Please revise the response to address these concerns while maintaining helpfulness and accuracy.
+"""
+
+                revised_result = await self.agents['response_synthesis'].execute(
+                    agent_outputs={},
+                    user_input=revision_prompt,
+                    context={}
+                )
+
+                revised_response = revised_result.get('final_response', choice_result['original_response'])
+
+                return {
+                    'response': revised_response,
+                    'final_response': revised_response,
+                    'safety_analysis': choice_result['safety_analysis'],
+                    'user_choice': 'revision',
+                    'revision_applied': True,
+                    'interaction_id': str(uuid.uuid4())[:8],
+                    'timestamp': datetime.now().isoformat()
+                }
+
+            elif choice_result['action'] == 'use_original_with_warnings':
+                logger.info("User chose original response with safety warnings")
+
+                return {
+                    'response': choice_result['response_content'],
+                    'final_response': choice_result['response_content'],
+                    'safety_analysis': choice_result['safety_analysis'],
+                    'user_choice': 'original_with_warnings',
+                    'revision_applied': False,
+                    'interaction_id': str(uuid.uuid4())[:8],
+                    'timestamp': datetime.now().isoformat()
+                }
+
+            else:
+                logger.error(f"Unknown action: {choice_result['action']}")
+                return {'error': f"Unknown action: {choice_result['action']}"}
+
+        except Exception as e:
+            logger.error(f"Error handling user safety decision: {e}", exc_info=True)
+            return {'error': str(e)}
+
     def get_execution_trace(self) -> list:
         """
         Return execution trace for debugging and analysis
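Callers of `process_request` now have to branch on the early-return shape; a hedged sketch of that contract (the `respond` helper and variable names are illustrative, not part of the commit):

```python
async def respond(orchestrator, session_id: str, user_input: str) -> dict:
    result = await orchestrator.process_request(session_id=session_id, user_input=user_input)

    if result.get('requires_user_choice'):
        # Show result['choice_prompt']['prompt_text'] to the user, remember its
        # choice_id, and finish later with:
        #   await orchestrator.handle_user_safety_decision(choice_id, decision, session_id)
        return result

    # Normal path: the response was already safety-checked (and possibly revised).
    return result
```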