JatsTheAIGen committed
Commit 0b5851a · Parent: 7506c11

safety agent upgrades to enable creative freedom v1

app.py CHANGED
@@ -514,7 +514,54 @@ async def process_message_async(message: str, history: Optional[List], session_i
 
     new_history = list(history) if isinstance(history, list) else []
 
-    # Add user message
+    # Check if this is a safety choice response
+    message_upper = message.strip().upper()
+    is_safety_choice = message_upper in ['YES', 'NO', 'APPLY', 'KEEP', 'Y', 'N']
+
+    # Check if we have a pending safety choice for this session
+    if is_safety_choice and orchestrator is not None and hasattr(orchestrator, '_pending_choices'):
+        pending_choice = orchestrator._pending_choices.get(session_id)
+        if pending_choice:
+            logger.info(f"Processing safety choice: {message_upper}")
+
+            # Determine user decision
+            user_decision = message_upper in ['YES', 'APPLY', 'Y']
+
+            # Process the safety choice
+            choice_result = await orchestrator.handle_user_safety_decision(
+                pending_choice['choice_id'],
+                user_decision,
+                session_id
+            )
+
+            # Clean up pending choice
+            del orchestrator._pending_choices[session_id]
+
+            # Add user message
+            new_history.append({"role": "user", "content": message.strip()})
+
+            # Add assistant response
+            if 'error' in choice_result:
+                response = f"Error processing safety choice: {choice_result['error']}"
+            else:
+                response = choice_result.get('response', choice_result.get('final_response', 'Processing complete.'))
+
+            new_history.append({"role": "assistant", "content": response})
+
+            # Extract metadata
+            reasoning_data = {}
+            performance_data = {
+                "user_choice": choice_result.get('user_choice', 'unknown'),
+                "revision_applied": choice_result.get('revision_applied', False)
+            }
+            context_data = {
+                "interaction_id": choice_result.get('interaction_id', 'unknown'),
+                "session_id": session_id
+            }
+
+            return new_history, "", reasoning_data, performance_data, context_data, session_id, ""
+
+    # Add user message (normal flow)
     new_history.append({"role": "user", "content": message.strip()})
 
     # Initialize Details tab data
@@ -529,13 +576,65 @@ async def process_message_async(message: str, history: Optional[List], session_i
     # Try to use orchestrator if available
     if orchestrator is not None:
         try:
-            logger.info("Attempting full orchestration with safety revision...")
-            # Use enhanced orchestrator with safety revision
-            result = await orchestrator.process_request_with_revision(
+            logger.info("Attempting full orchestration...")
+            # First, try normal processing to check for user choice
+            result = await orchestrator.process_request(
                 session_id=session_id,
                 user_input=message.strip()
            )
 
+            # Check if user choice is required
+            if result.get('requires_user_choice', False):
+                logger.info("User choice required for safety concerns")
+                choice_prompt = result.get('choice_prompt', {})
+                choice_id = choice_prompt.get('choice_id', '')
+
+                # Create user-friendly choice prompt message
+                prompt_text = choice_prompt.get('prompt_text', '').replace('**', '').replace('*', '')
+
+                choice_message = f"""🛡️ **SAFETY REVIEW REQUIRED**
+
+{prompt_text}
+
+**Please reply with:**
+- Type "YES" or "APPLY" to revise the response
+- Type "NO" or "KEEP" to keep the original response with warnings"""
+
+                # Store choice_id in session for later retrieval
+                # We'll detect it when user responds
+                if not hasattr(orchestrator, '_pending_choices'):
+                    orchestrator._pending_choices = {}
+                orchestrator._pending_choices[session_id] = {
+                    'choice_id': choice_id,
+                    'partial_response': result.get('partial_response', ''),
+                    'safety_analysis': result.get('safety_analysis', {})
+                }
+
+                # Add assistant message with choice prompt
+                new_history.append({
+                    "role": "assistant",
+                    "content": choice_message
+                })
+
+                # Extract metadata for Details tab
+                reasoning_data = result.get('metadata', {}).get('reasoning_chain', {})
+                performance_data = {
+                    "processing_time": result.get('metadata', {}).get('processing_time', 0),
+                    "agents_used": result.get('metadata', {}).get('agents_used', [])
+                }
+                context_data = {
+                    "interaction_id": result.get('interaction_id', 'unknown'),
+                    "session_id": session_id
+                }
+
+                # Return early with choice prompt
+                return new_history, "", reasoning_data, performance_data, context_data, session_id, ""
+
+            # If no user choice needed, proceed with safety revision if needed
+            # This maintains backward compatibility
+            if not result.get('requires_user_choice', False):
+                logger.info("No user choice needed, proceeding with normal flow")
+
             # Log revision information
             revision_attempts = result.get('revision_attempts', 0)
             safety_revision_applied = result.get('safety_revision_applied', False)
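A minimal sketch of the two-turn round trip the app.py change implements: the first turn stashes the `choice_id` in `orchestrator._pending_choices`, and a follow-up "YES"/"NO" message is routed to `handle_user_safety_decision`. The `StubOrchestrator`, the session id, and the literal strings below are placeholders for illustration, not part of the commit.

```python
import asyncio

class StubOrchestrator:
    async def process_request(self, session_id, user_input):
        # First turn: pretend safety thresholds were exceeded.
        return {
            'requires_user_choice': True,
            'choice_prompt': {'choice_id': f'{session_id}_1', 'prompt_text': 'Concerns found.'},
            'partial_response': 'Original draft...',
            'safety_analysis': {},
        }

    async def handle_user_safety_decision(self, choice_id, user_decision, session_id):
        # Second turn: return either the revised or the original-with-warnings text.
        return {'response': 'Revised draft.' if user_decision else 'Original draft + warnings.'}

async def demo():
    orch = StubOrchestrator()
    session_id = 'abc'

    # Turn 1: the normal request flags a choice; app.py stores it in _pending_choices.
    result = await orch.process_request(session_id, 'write something edgy')
    if result.get('requires_user_choice'):
        if not hasattr(orch, '_pending_choices'):
            orch._pending_choices = {}
        orch._pending_choices[session_id] = {'choice_id': result['choice_prompt']['choice_id']}

    # Turn 2: the user replies "YES"; app.py maps that to user_decision=True.
    pending = orch._pending_choices.pop(session_id)
    final = await orch.handle_user_safety_decision(pending['choice_id'], True, session_id)
    print(final['response'])  # Revised draft.

asyncio.run(demo())
```
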
orchestrator_integration_patch.py ADDED
@@ -0,0 +1,54 @@
+"""
+Orchestrator Integration Patch for Safety User Choice
+Provides integration instructions and helper functions
+"""
+
+# Integration Instructions:
+"""
+This file documents the integration of the safety user choice system.
+
+INTEGRATION POINTS:
+1. In orchestrator_engine.py, after safety_agent completes (around line 191-194):
+   - Call create_safety_choice_prompt() to check if user choice is needed
+   - If choice needed, return choice prompt to UI
+   - If no choice needed, proceed with current workflow
+
+2. Add new method to orchestrator class:
+   - handle_user_safety_decision(choice_id, user_decision)
+   - This processes the user's choice and returns final response
+
+3. In app.py/UI layer:
+   - Display choice prompt when requires_user_choice=True
+   - Provide YES/NO buttons
+   - Submit choice back to orchestrator
+   - Display final response
+
+4. Modify safety revision workflow:
+   - Only run if user chooses "YES"
+   - Otherwise append safety warnings to original response
+
+WORKFLOW:
+Safety analysis completes → Check dynamic thresholds
+If exceeded → Present user choice: "YES (revise)" or "NO (original + warnings)"
+YES → Existing revision workflow
+NO → Original response + formatted safety concerns section
+"""
+
+def get_integration_notes():
+    """Return integration notes"""
+    return {
+        'integration_point': 'orchestrator_engine.py line ~191 (after safety check)',
+        'new_methods': ['handle_user_safety_decision'],
+        'imports_needed': [
+            'from safety_threshold_matrix import should_trigger_user_choice',
+            'from safety_user_choice import create_safety_choice_prompt, process_safety_choice',
+            'from safety_choice_orchestrator import SafetyChoiceOrchestrator'
+        ],
+        'ui_changes': [
+            'Add safety choice prompt display',
+            'Add YES/NO buttons',
+            'Handle choice submission',
+            'Mobile-first responsive design maintained'
+        ]
+    }
+
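A small wiring sketch of the integration point the notes above describe, assuming `safety_checked`, `intent_class`, `response_text`, and `session_id` already exist where the safety agent finishes; the helper name `after_safety_agent` is hypothetical.

```python
from safety_user_choice import create_safety_choice_prompt

def after_safety_agent(safety_checked, intent_class, response_text, session_id):
    """Return a choice payload for the UI, or None to continue the normal workflow."""
    prompt = create_safety_choice_prompt(
        safety_checked.get('safety_analysis', {}), intent_class, response_text, session_id
    )
    if prompt:
        # The UI layer renders prompt['prompt_text'] with YES/NO buttons and later
        # submits the decision via handle_user_safety_decision(choice_id, decision).
        return {'requires_user_choice': True, 'choice_prompt': prompt}
    return None
```
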
safety_choice_orchestrator.py ADDED
@@ -0,0 +1,191 @@
+"""
+Enhanced Orchestrator Workflow with User Safety Choice
+Integrates user decision point after safety analysis
+"""
+
+import logging
+from typing import Dict, Any, Optional
+from safety_user_choice import create_safety_choice_prompt, process_safety_choice
+
+logger = logging.getLogger(__name__)
+
+class SafetyChoiceOrchestrator:
+    """Enhanced orchestrator with user safety choice integration"""
+
+    def __init__(self, original_orchestrator):
+        self.original_orchestrator = original_orchestrator
+        self.pending_safety_choices = {}
+
+    async def process_with_safety_choice(self,
+                                         user_input: str,
+                                         session_id: str,
+                                         intent_class: str,
+                                         safety_analysis: Dict[str, Any],
+                                         response_content: str,
+                                         user_choice_callback=None) -> Dict[str, Any]:
+        """
+        Process request with integrated safety choice workflow
+
+        Args:
+            user_input: User's input text
+            session_id: Session identifier
+            intent_class: Detected intent class
+            safety_analysis: Safety analysis results
+            response_content: Synthesized response content
+            user_choice_callback: Function to handle user choice UI
+
+        Returns:
+            Dict with processing results
+        """
+
+        try:
+            choice_prompt = create_safety_choice_prompt(
+                safety_analysis, intent_class, response_content, session_id
+            )
+
+            if choice_prompt:
+                logger.info(f"Safety concerns detected for intent '{intent_class}' - requiring user choice")
+
+                if user_choice_callback:
+                    user_decision = await user_choice_callback(choice_prompt)
+                    return await self._handle_user_decision(
+                        choice_prompt['choice_id'], user_decision, safety_analysis
+                    )
+                else:
+                    return {
+                        'requires_user_choice': True,
+                        'choice_prompt': choice_prompt,
+                        'session_id': session_id,
+                        'partial_response': response_content,
+                        'safety_analysis': safety_analysis
+                    }
+            else:
+                logger.info("No safety concerns detected - proceeding normally")
+                return {
+                    'response': response_content,
+                    'safety_analysis': safety_analysis,
+                    'requires_user_choice': False
+                }
+
+        except Exception as e:
+            logger.error(f"Error in safety choice orchestration: {e}")
+            return {
+                'error': str(e),
+                'requires_user_choice': False,
+                'response': response_content
+            }
+
+    async def handle_user_safety_decision(self,
+                                          choice_id: str,
+                                          user_decision: bool,
+                                          original_response: str = None) -> Dict[str, Any]:
+        """
+        Handle user's safety decision
+
+        Args:
+            choice_id: The choice identifier
+            user_decision: True for revision, False for original with warnings
+            original_response: Original response content (optional)
+
+        Returns:
+            Dict with final response
+        """
+        return await self._handle_user_decision(choice_id, user_decision, None, original_response)
+
+    async def _handle_user_decision(self,
+                                    choice_id: str,
+                                    user_decision: bool,
+                                    safety_analysis: Dict[str, Any] = None,
+                                    original_response: str = None) -> Dict[str, Any]:
+        """Internal method to handle user decision"""
+
+        choice_result = process_safety_choice(choice_id, user_decision)
+
+        if 'error' in choice_result:
+            return choice_result
+
+        if choice_result['action'] == 'proceed_with_revision':
+            logger.info("User chose revision - running safety revision")
+
+            revised_response = await self._run_safety_revision(
+                choice_result['original_response'],
+                choice_result['safety_analysis'],
+                choice_result.get('intent_class', 'casual_conversation')
+            )
+
+            return {
+                'response': revised_response,
+                'safety_analysis': choice_result['safety_analysis'],
+                'user_choice': 'revision',
+                'requires_user_choice': False,
+                'revision_applied': True
+            }
+
+        elif choice_result['action'] == 'use_original_with_warnings':
+            logger.info("User chose original with warnings")
+
+            return {
+                'response': choice_result['response_content'],
+                'safety_analysis': choice_result['safety_analysis'],
+                'user_choice': 'original_with_warnings',
+                'requires_user_choice': False,
+                'revision_applied': False
+            }
+
+        else:
+            return {'error': 'Unknown action from choice result'}
+
+    async def _run_safety_revision(self,
+                                   original_response: str,
+                                   safety_analysis: Dict[str, Any],
+                                   intent_class: str) -> str:
+        """
+        Run safety revision on the response using the original orchestrator's revision workflow
+
+        Args:
+            original_response: Original response content
+            safety_analysis: Safety analysis results
+            intent_class: Intent class for context
+
+        Returns:
+            str: Revised response
+        """
+        try:
+            exceeded_categories = []
+            safety_scores = safety_analysis.get('safety_scores', {})
+
+            if not safety_scores:
+                confidence_scores = safety_analysis.get('confidence_scores', {})
+                if confidence_scores:
+                    safety_scores = {
+                        'toxicity_or_harmful_language': confidence_scores.get('toxicity', 0.0),
+                        'potential_biases_or_stereotypes': confidence_scores.get('bias', 0.0),
+                        'privacy_or_security_concerns': confidence_scores.get('privacy', 0.0),
+                        'controversial_or_sensitive_topics': 1.0 - confidence_scores.get('safety', 1.0)
+                    }
+
+            safety_warnings = safety_analysis.get('detected_issues', [])
+
+            if hasattr(self.original_orchestrator, 'generate_revision_prompt'):
+                revision_prompt = self.original_orchestrator.generate_revision_prompt(
+                    original_response,
+                    exceeded_categories,
+                    safety_warnings
+                )
+
+                if hasattr(self.original_orchestrator, 'agents') and 'response_synthesis' in self.original_orchestrator.agents:
+                    revised_result = await self.original_orchestrator.agents['response_synthesis'].execute(
+                        agent_outputs={},
+                        user_input=revision_prompt,
+                        context={}
+                    )
+
+                    revised_response = revised_result.get('final_response', original_response)
+                    return revised_response
+
+            return original_response
+
+        except Exception as e:
+            logger.error(f"Error in safety revision: {e}")
+            return original_response
+
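How the wrapper above might be exercised with an inline UI callback. The `auto_yes` callback, the analysis dict, and passing `original_orchestrator=None` (so no revision backend exists and the original text comes back) are assumptions made for this sketch.

```python
import asyncio
from safety_choice_orchestrator import SafetyChoiceOrchestrator

async def auto_yes(choice_prompt):
    # A real UI would render choice_prompt['prompt_text'] and wait for the user.
    return True

async def demo():
    wrapper = SafetyChoiceOrchestrator(original_orchestrator=None)  # no revision backend
    result = await wrapper.process_with_safety_choice(
        user_input="tell me something provocative",
        session_id="demo",
        intent_class="creative_generation",
        safety_analysis={"safety_scores": {"toxicity_or_harmful_language": 0.9}},
        response_content="Draft response...",
        user_choice_callback=auto_yes,
    )
    # With no revision backend the original draft is returned, but the choice is recorded.
    print(result.get("user_choice"), result.get("revision_applied"))  # revision True

asyncio.run(demo())
```
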
safety_threshold_matrix.py ADDED
@@ -0,0 +1,256 @@
+"""
+Intent-Based Safety Threshold Matrix
+Based on academic research for dynamic content moderation thresholds
+"""
+
+# Intent-Based Safety Threshold Matrix (Academic Research-Based)
+INTENT_SAFETY_MATRIX = {
+    'information_request': {
+        'toxicity': 0.20,
+        'sexual_content': 0.15,
+        'violence': 0.15,
+        'self_harm': 0.10,
+        'harassment': 0.20,
+        'dangerous_activities': 0.15,
+        'privacy_violations': 0.25,
+        'misinformation': 0.30,
+        'child_safety': 0.10,
+        'bias_stereotyping': 0.25
+    },
+    'task_execution': {
+        'toxicity': 0.25,
+        'sexual_content': 0.20,
+        'violence': 0.20,
+        'self_harm': 0.15,
+        'harassment': 0.25,
+        'dangerous_activities': 0.20,
+        'privacy_violations': 0.30,
+        'misinformation': 0.35,
+        'child_safety': 0.15,
+        'bias_stereotyping': 0.30
+    },
+    'creative_generation': {
+        'toxicity': 0.40,
+        'sexual_content': 0.35,
+        'violence': 0.35,
+        'self_harm': 0.25,
+        'harassment': 0.30,
+        'dangerous_activities': 0.30,
+        'privacy_violations': 0.20,
+        'misinformation': 0.25,
+        'child_safety': 0.20,
+        'bias_stereotyping': 0.35
+    },
+    'analysis_research': {
+        'toxicity': 0.35,
+        'sexual_content': 0.25,
+        'violence': 0.25,
+        'self_harm': 0.20,
+        'harassment': 0.30,
+        'dangerous_activities': 0.25,
+        'privacy_violations': 0.40,
+        'misinformation': 0.45,
+        'child_safety': 0.25,
+        'bias_stereotyping': 0.40
+    },
+    'casual_conversation': {
+        'toxicity': 0.30,
+        'sexual_content': 0.25,
+        'violence': 0.25,
+        'self_harm': 0.20,
+        'harassment': 0.35,
+        'dangerous_activities': 0.25,
+        'privacy_violations': 0.15,
+        'misinformation': 0.20,
+        'child_safety': 0.15,
+        'bias_stereotyping': 0.25
+    },
+    'troubleshooting': {
+        'toxicity': 0.15,
+        'sexual_content': 0.10,
+        'violence': 0.15,
+        'self_harm': 0.10,
+        'harassment': 0.20,
+        'dangerous_activities': 0.10,
+        'privacy_violations': 0.35,
+        'misinformation': 0.40,
+        'child_safety': 0.10,
+        'bias_stereotyping': 0.20
+    },
+    'education_learning': {
+        'toxicity': 0.25,
+        'sexual_content': 0.20,
+        'violence': 0.30,
+        'self_harm': 0.15,
+        'harassment': 0.20,
+        'dangerous_activities': 0.35,
+        'privacy_violations': 0.25,
+        'misinformation': 0.50,
+        'child_safety': 0.30,
+        'bias_stereotyping': 0.35
+    },
+    'technical_support': {
+        'toxicity': 0.10,
+        'sexual_content': 0.05,
+        'violence': 0.10,
+        'self_harm': 0.05,
+        'harassment': 0.15,
+        'dangerous_activities': 0.05,
+        'privacy_violations': 0.45,
+        'misinformation': 0.45,
+        'child_safety': 0.05,
+        'bias_stereotyping': 0.15
+    }
+}
+
+# Safety parameter descriptions for user display
+SAFETY_DESCRIPTIONS = {
+    'toxicity': 'Hate speech, discriminatory language',
+    'sexual_content': 'Explicit sexual material, nudity',
+    'violence': 'Graphic violence, gore, death depictions',
+    'self_harm': 'Content promoting self-harm or suicide',
+    'harassment': 'Personal attacks, bullying, doxxing',
+    'dangerous_activities': 'Illegal activities, weapon instructions',
+    'privacy_violations': 'Personal data exposure, unauthorized sharing',
+    'misinformation': 'False information, unverified claims',
+    'child_safety': 'Content potentially harmful to minors',
+    'bias_stereotyping': 'Discriminatory generalizations'
+}
+
+# Criticality levels
+CRITICALITY_LEVELS = {
+    (0.0, 0.3): 'LOW',
+    (0.3, 0.6): 'MEDIUM',
+    (0.6, 0.8): 'HIGH',
+    (0.8, 1.0): 'CRITICAL'
+}
+
+def get_dynamic_threshold(intent_class, safety_param, user_context=None):
+    """
+    Calculate dynamic safety threshold based on intent and context
+
+    Args:
+        intent_class: The detected intent type
+        safety_param: The safety parameter to check
+        user_context: Optional user context for adjustments
+
+    Returns:
+        float: Adjusted threshold value
+    """
+    if intent_class not in INTENT_SAFETY_MATRIX:
+        intent_class = 'casual_conversation'
+
+    if safety_param not in INTENT_SAFETY_MATRIX[intent_class]:
+        return 0.2
+
+    base_threshold = INTENT_SAFETY_MATRIX[intent_class][safety_param]
+
+    if user_context:
+        if user_context.get('academic_use', False):
+            base_threshold *= 1.2
+
+        if user_context.get('minor_present', False):
+            base_threshold *= 0.7
+
+        if user_context.get('professional_context', False):
+            base_threshold *= 1.1
+
+        if user_context.get('creative_context', False):
+            base_threshold *= 1.15
+
+    return min(base_threshold, 0.8)
+
+def get_criticality_level(score):
+    """Get criticality level for a safety score"""
+    for (min_val, max_val), level in CRITICALITY_LEVELS.items():
+        if min_val <= score < max_val:
+            return level
+    return 'CRITICAL'
+
+def format_safety_concerns(safety_analysis, intent_class):
+    """
+    Format safety concerns for user display
+
+    Args:
+        safety_analysis: Dict containing safety scores
+        intent_class: The detected intent class
+
+    Returns:
+        str: Formatted safety concerns text or None if no concerns
+    """
+    concerns = []
+
+    safety_mapping = {
+        'toxicity_or_harmful_language': 'toxicity',
+        'potential_biases_or_stereotypes': 'bias_stereotyping',
+        'privacy_or_security_concerns': 'privacy_violations',
+        'controversial_or_sensitive_topics': 'toxicity'
+    }
+
+    safety_scores = safety_analysis.get('safety_scores', {})
+
+    if not safety_scores:
+        confidence_scores = safety_analysis.get('confidence_scores', {})
+        if confidence_scores:
+            safety_scores = {
+                'toxicity_or_harmful_language': confidence_scores.get('toxicity', 0.0),
+                'potential_biases_or_stereotypes': confidence_scores.get('bias', 0.0),
+                'privacy_or_security_concerns': confidence_scores.get('privacy', 0.0),
+                'controversial_or_sensitive_topics': 1.0 - confidence_scores.get('safety', 1.0)
+            }
+
+    for param_key, score in safety_scores.items():
+        if param_key in safety_mapping:
+            safety_param = safety_mapping[param_key]
+            threshold = get_dynamic_threshold(intent_class, safety_param)
+
+            if isinstance(score, (int, float)) and score > threshold:
+                criticality = get_criticality_level(score)
+                description = SAFETY_DESCRIPTIONS.get(safety_param, param_key)
+                concerns.append(f"• **{criticality}**: {description} (Score: {score:.2f}, Threshold: {threshold:.2f})")
+
+    if not concerns:
+        return None
+
+    return "\n".join(concerns)
+
+def should_trigger_user_choice(safety_analysis, intent_class):
+    """
+    Determine if user choice should be triggered based on safety analysis
+
+    Args:
+        safety_analysis: Dict containing safety analysis
+        intent_class: The detected intent class
+
+    Returns:
+        bool: True if user choice should be triggered
+    """
+    safety_scores = safety_analysis.get('safety_scores', {})
+
+    if not safety_scores:
+        confidence_scores = safety_analysis.get('confidence_scores', {})
+        if confidence_scores:
+            safety_scores = {
+                'toxicity_or_harmful_language': confidence_scores.get('toxicity', 0.0),
+                'potential_biases_or_stereotypes': confidence_scores.get('bias', 0.0),
+                'privacy_or_security_concerns': confidence_scores.get('privacy', 0.0),
+                'controversial_or_sensitive_topics': 1.0 - confidence_scores.get('safety', 1.0)
+            }
+
+    safety_mapping = {
+        'toxicity_or_harmful_language': 'toxicity',
+        'potential_biases_or_stereotypes': 'bias_stereotyping',
+        'privacy_or_security_concerns': 'privacy_violations',
+        'controversial_or_sensitive_topics': 'toxicity'
+    }
+
+    for param_key, score in safety_scores.items():
+        if param_key in safety_mapping:
+            safety_param = safety_mapping[param_key]
+            threshold = get_dynamic_threshold(intent_class, safety_param)
+
+            if isinstance(score, (int, float)) and score > threshold:
+                return True
+
+    return False
+
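A quick look at how the matrix behaves, using invented scores (the 0.35 value and the context flag below are illustrative, not values from the commit):

```python
from safety_threshold_matrix import (
    get_dynamic_threshold,
    should_trigger_user_choice,
    format_safety_concerns,
)

# Same category, different intents: creative generation tolerates more toxicity.
print(get_dynamic_threshold('information_request', 'toxicity'))   # 0.2
print(get_dynamic_threshold('creative_generation', 'toxicity'))   # 0.4
# Context multipliers scale the base value, capped at 0.8.
print(get_dynamic_threshold('creative_generation', 'toxicity', {'creative_context': True}))  # 0.4 * 1.15

analysis = {'safety_scores': {'toxicity_or_harmful_language': 0.35}}
print(should_trigger_user_choice(analysis, 'information_request'))  # True  (0.35 > 0.20)
print(should_trigger_user_choice(analysis, 'creative_generation'))  # False (0.35 <= 0.40)
print(format_safety_concerns(analysis, 'information_request'))      # one bullet, MEDIUM criticality
```
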
safety_user_choice.py ADDED
@@ -0,0 +1,210 @@
+"""
+User Choice Interface for Safety Decisions
+Provides binary choice for safety concern handling
+"""
+
+import logging
+from typing import Dict, Any, Optional, Tuple
+from datetime import datetime
+from safety_threshold_matrix import (
+    should_trigger_user_choice,
+    format_safety_concerns,
+    get_criticality_level
+)
+
+logger = logging.getLogger(__name__)
+
+class SafetyUserChoice:
+    """Handles user choice for safety concerns"""
+
+    def __init__(self):
+        self.pending_choices = {}
+
+    def create_user_choice_prompt(self,
+                                  safety_analysis: Dict[str, Any],
+                                  intent_class: str,
+                                  response_content: str,
+                                  session_id: str) -> Dict[str, Any]:
+        """
+        Create user choice prompt for safety concerns
+
+        Args:
+            safety_analysis: Safety analysis results
+            intent_class: Detected intent class
+            response_content: The synthesized response content
+            session_id: Session identifier
+
+        Returns:
+            Dict containing choice prompt data or None if not needed
+        """
+
+        concerns_text = format_safety_concerns(safety_analysis, intent_class)
+
+        if not concerns_text:
+            return None
+
+        choice_id = f"{session_id}_{int(datetime.now().timestamp() * 1000)}"
+
+        choice_data = {
+            'choice_id': choice_id,
+            'safety_analysis': safety_analysis,
+            'intent_class': intent_class,
+            'response_content': response_content,
+            'concerns_text': concerns_text,
+            'timestamp': datetime.now().isoformat()
+        }
+
+        self.pending_choices[choice_id] = choice_data
+
+        prompt_text = f"""🛡️ **SAFETY REVIEW REQUIRED**
+
+The following response has been flagged for potential safety concerns:
+
+**Safety Concerns Detected:**
+{concerns_text}
+
+**Your Response Content:**
+{response_content[:500]}{"..." if len(response_content) > 500 else ""}
+
+**Choose how to proceed:**
+
+**YES** - Apply safety revisions (recommended)
+- The system will automatically revise the response to address safety concerns
+- This may modify or remove flagged content
+
+**NO** - Keep original response with safety warnings
+- The original response will be provided unchanged
+- Safety concerns will be clearly highlighted at the end
+
+Would you like to proceed with safety revisions?
+"""
+
+        return {
+            'choice_id': choice_id,
+            'prompt_text': prompt_text,
+            'requires_user_input': True,
+            'choice_type': 'binary_safety_decision'
+        }
+
+    def process_user_choice(self, choice_id: str, user_decision: bool) -> Dict[str, Any]:
+        """
+        Process user's safety choice
+
+        Args:
+            choice_id: The choice identifier
+            user_decision: True for revision, False for original with warnings
+
+        Returns:
+            Dict with processing results
+        """
+
+        if choice_id not in self.pending_choices:
+            logger.error(f"Choice ID {choice_id} not found")
+            return {'error': 'Invalid choice ID'}
+
+        choice_data = self.pending_choices[choice_id]
+
+        del self.pending_choices[choice_id]
+
+        if user_decision:
+            return {
+                'action': 'proceed_with_revision',
+                'safety_analysis': choice_data['safety_analysis'],
+                'intent_class': choice_data['intent_class'],
+                'original_response': choice_data['response_content']
+            }
+        else:
+            return {
+                'action': 'use_original_with_warnings',
+                'response_content': self._add_safety_warnings(
+                    choice_data['response_content'],
+                    choice_data['concerns_text']
+                ),
+                'safety_analysis': choice_data['safety_analysis']
+            }
+
+    def _add_safety_warnings(self, response_content: str, concerns_text: str) -> str:
+        """
+        Add safety warnings to the original response
+
+        Args:
+            response_content: Original response content
+            concerns_text: Formatted safety concerns
+
+        Returns:
+            str: Response with safety warnings appended
+        """
+
+        warning_section = f"""
+
+---
+
+## ⚠️ Safety Advisory
+
+This response has been flagged for the following safety concerns:
+
+{concerns_text}
+
+**Please review this content carefully and consider:**
+- The potential impact on yourself and others
+- Whether this content aligns with your intended use
+- If additional verification or expert consultation is needed
+
+*This advisory is provided for transparency and user awareness.*
+"""
+
+        return response_content + warning_section
+
+# Global instance for use across the application
+safety_user_choice = SafetyUserChoice()
+
+def check_safety_user_choice_needed(safety_analysis: Dict[str, Any],
+                                    intent_class: str) -> bool:
+    """
+    Check if user choice is needed for safety concerns
+
+    Args:
+        safety_analysis: Safety analysis results
+        intent_class: Detected intent class
+
+    Returns:
+        bool: True if user choice is needed
+    """
+    return should_trigger_user_choice(safety_analysis, intent_class)
+
+def create_safety_choice_prompt(safety_analysis: Dict[str, Any],
+                                intent_class: str,
+                                response_content: str,
+                                session_id: str) -> Optional[Dict[str, Any]]:
+    """
+    Create safety choice prompt if needed
+
+    Args:
+        safety_analysis: Safety analysis results
+        intent_class: Detected intent class
+        response_content: Response content
+        session_id: Session identifier
+
+    Returns:
+        Dict with choice prompt or None if not needed
+    """
+    if not check_safety_user_choice_needed(safety_analysis, intent_class):
+        return None
+
+    return safety_user_choice.create_user_choice_prompt(
+        safety_analysis, intent_class, response_content, session_id
+    )
+
+def process_safety_choice(choice_id: str, user_decision: bool) -> Dict[str, Any]:
+    """
+    Process user's safety choice
+
+    Args:
+        choice_id: Choice identifier
+        user_decision: User's binary decision
+
+    Returns:
+        Dict with processing results
+    """
+    return safety_user_choice.process_user_choice(choice_id, user_decision)
+
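An end-to-end pass through the choice helpers, with an invented analysis dict and session id purely for illustration:

```python
from safety_user_choice import create_safety_choice_prompt, process_safety_choice

analysis = {'safety_scores': {'privacy_or_security_concerns': 0.5}}
prompt = create_safety_choice_prompt(analysis, 'casual_conversation', 'Draft answer...', 'sess-1')

if prompt:  # thresholds exceeded, so a pending choice was registered
    # user_decision=False keeps the original text and appends the Safety Advisory section.
    outcome = process_safety_choice(prompt['choice_id'], user_decision=False)
    print(outcome['action'])            # use_original_with_warnings
    print(outcome['response_content'])  # original draft + "⚠️ Safety Advisory" block
```
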
src/orchestrator_engine.py CHANGED
@@ -4,9 +4,27 @@ import logging
 import time
 import asyncio
 from datetime import datetime
+import sys
+import os
 
 logger = logging.getLogger(__name__)
 
+# Add project root and parent directory to path for imports
+current_dir = os.path.dirname(os.path.abspath(__file__))
+parent_dir = os.path.dirname(current_dir)
+sys.path.insert(0, parent_dir)
+sys.path.insert(0, current_dir)
+
+try:
+    from safety_threshold_matrix import should_trigger_user_choice
+    from safety_user_choice import create_safety_choice_prompt, process_safety_choice
+    from safety_choice_orchestrator import SafetyChoiceOrchestrator
+    SAFETY_CHOICE_AVAILABLE = True
+    logger.info("Safety choice modules loaded successfully")
+except ImportError as e:
+    logger.warning(f"Safety choice modules not available: {e}")
+    SAFETY_CHOICE_AVAILABLE = False
+
 class MVPOrchestrator:
     def __init__(self, llm_router, context_manager, agents):
         self.llm_router = llm_router
@@ -197,6 +215,42 @@ class MVPOrchestrator:
             "result": {"warnings": safety_checked.get('warnings', [])}
         })
 
+        # Step 7.5: Enhanced Safety check with user choice (if available)
+        intent_class = intent_result.get('primary_intent', 'casual_conversation')
+        response_content = final_response.get('final_response', '') or str(final_response.get('response', ''))
+
+        if SAFETY_CHOICE_AVAILABLE:
+            choice_prompt = create_safety_choice_prompt(
+                safety_checked.get('safety_analysis', {}),
+                intent_class,
+                response_content,
+                session_id
+            )
+
+            if choice_prompt:
+                logger.info(f"Safety concerns detected for intent '{intent_class}' - requiring user choice")
+                processing_time = time.time() - start_time
+
+                return {
+                    'requires_user_choice': True,
+                    'choice_prompt': choice_prompt,
+                    'session_id': session_id,
+                    'partial_response': response_content,
+                    'safety_analysis': safety_checked.get('safety_analysis', {}),
+                    'interaction_id': interaction_id,
+                    'intent': intent_class,
+                    'metadata': {
+                        'intent': intent_class,
+                        'processing_time': processing_time,
+                        'agents_used': list(self.agents.keys()),
+                        'intent_result': intent_result,
+                        'skills_result': skills_result,
+                        'synthesis_result': final_response,
+                        'reasoning_chain': reasoning_chain
+                    },
+                    'response': response_content  # Provide partial response
+                }
+
         # Add safety reasoning
         reasoning_chain["chain_of_thought"]["step_5"] = {
             "hypothesis": f"Safety validation for response about '{self._extract_main_topic(user_input)}'",
@@ -334,6 +388,97 @@ class MVPOrchestrator:
             "metadata": metadata
         }
 
+    async def handle_user_safety_decision(self, choice_id: str, user_decision: bool, session_id: str = None) -> dict:
+        """
+        Handle user's safety decision and complete processing
+
+        Args:
+            choice_id: The choice identifier from the prompt
+            user_decision: True for revision, False for original with warnings
+            session_id: Session identifier
+
+        Returns:
+            dict: Final response based on user choice
+        """
+        try:
+            if not SAFETY_CHOICE_AVAILABLE:
+                logger.warning("Safety choice modules not available")
+                return {'error': 'Safety choice system not available'}
+
+            choice_result = process_safety_choice(choice_id, user_decision)
+
+            if 'error' in choice_result:
+                logger.error(f"Error processing safety choice: {choice_result['error']}")
+                return choice_result
+
+            if choice_result['action'] == 'proceed_with_revision':
+                logger.info("User chose revision - applying safety revisions")
+
+                safety_issues = choice_result['safety_analysis'].get('detected_issues', [])
+                safety_scores = choice_result['safety_analysis'].get('safety_scores', {})
+
+                exceeded_categories = []
+                if not safety_scores:
+                    confidence_scores = choice_result['safety_analysis'].get('confidence_scores', {})
+                    if confidence_scores:
+                        if confidence_scores.get('toxicity', 0) > 0.3:
+                            exceeded_categories.append('toxicity_or_harmful_language')
+                        if confidence_scores.get('bias', 0) > 0.05:
+                            exceeded_categories.append('potential_biases_or_stereotypes')
+                        if confidence_scores.get('privacy', 0) > 0.2:
+                            exceeded_categories.append('privacy_or_security_concerns')
+                else:
+                    exceeded_categories = [k for k, v in safety_scores.items() if isinstance(v, (int, float)) and v > 0.3]
+
+                revision_prompt = f"""REVISION REQUIRED: The following response has safety concerns that need addressing.
+
+Original Response: {choice_result['original_response']}
+
+Safety Issues Detected: {', '.join(exceeded_categories) if exceeded_categories else 'General safety concerns'}
+Specific Warnings: {'; '.join(safety_issues) if safety_issues else 'General safety concerns detected'}
+
+Please revise the response to address these concerns while maintaining helpfulness and accuracy.
+"""
+
+                revised_result = await self.agents['response_synthesis'].execute(
+                    agent_outputs={},
+                    user_input=revision_prompt,
+                    context={}
+                )
+
+                revised_response = revised_result.get('final_response', choice_result['original_response'])
+
+                return {
+                    'response': revised_response,
+                    'final_response': revised_response,
+                    'safety_analysis': choice_result['safety_analysis'],
+                    'user_choice': 'revision',
+                    'revision_applied': True,
+                    'interaction_id': str(uuid.uuid4())[:8],
+                    'timestamp': datetime.now().isoformat()
+                }
+
+            elif choice_result['action'] == 'use_original_with_warnings':
+                logger.info("User chose original response with safety warnings")
+
+                return {
+                    'response': choice_result['response_content'],
+                    'final_response': choice_result['response_content'],
+                    'safety_analysis': choice_result['safety_analysis'],
+                    'user_choice': 'original_with_warnings',
+                    'revision_applied': False,
+                    'interaction_id': str(uuid.uuid4())[:8],
+                    'timestamp': datetime.now().isoformat()
+                }
+
+            else:
+                logger.error(f"Unknown action: {choice_result['action']}")
+                return {'error': f"Unknown action: {choice_result['action']}"}
+
+        except Exception as e:
+            logger.error(f"Error handling user safety decision: {e}", exc_info=True)
+            return {'error': str(e)}
+
     def get_execution_trace(self) -> list:
         """
         Return execution trace for debugging and analysis
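Callers of `process_request` now have to branch on the early-return shape; a hedged sketch of that contract (the `respond` helper and variable names are illustrative, not part of the commit):

```python
async def respond(orchestrator, session_id: str, user_input: str) -> dict:
    result = await orchestrator.process_request(session_id=session_id, user_input=user_input)

    if result.get('requires_user_choice'):
        # Show result['choice_prompt']['prompt_text'] to the user, remember its
        # choice_id, and finish later with:
        #   await orchestrator.handle_user_safety_decision(choice_id, decision, session_id)
        return result

    # Normal path: the response was already safety-checked (and possibly revised).
    return result
```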