Commit 0b5851a
Parent(s): 7506c11

safety agent upgrades to enable creative freedom v1

Files changed:
- app.py +103 -4
- orchestrator_integration_patch.py +54 -0
- safety_choice_orchestrator.py +191 -0
- safety_threshold_matrix.py +256 -0
- safety_user_choice.py +210 -0
- src/orchestrator_engine.py +145 -0
app.py
CHANGED
@@ -514,7 +514,54 @@ async def process_message_async(message: str, history: Optional[List], session_id

     new_history = list(history) if isinstance(history, list) else []

-    #
+    # Check if this is a safety choice response
+    message_upper = message.strip().upper()
+    is_safety_choice = message_upper in ['YES', 'NO', 'APPLY', 'KEEP', 'Y', 'N']
+
+    # Check if we have a pending safety choice for this session
+    if is_safety_choice and orchestrator is not None and hasattr(orchestrator, '_pending_choices'):
+        pending_choice = orchestrator._pending_choices.get(session_id)
+        if pending_choice:
+            logger.info(f"Processing safety choice: {message_upper}")
+
+            # Determine user decision
+            user_decision = message_upper in ['YES', 'APPLY', 'Y']
+
+            # Process the safety choice
+            choice_result = await orchestrator.handle_user_safety_decision(
+                pending_choice['choice_id'],
+                user_decision,
+                session_id
+            )
+
+            # Clean up pending choice
+            del orchestrator._pending_choices[session_id]
+
+            # Add user message
+            new_history.append({"role": "user", "content": message.strip()})
+
+            # Add assistant response
+            if 'error' in choice_result:
+                response = f"Error processing safety choice: {choice_result['error']}"
+            else:
+                response = choice_result.get('response', choice_result.get('final_response', 'Processing complete.'))
+
+            new_history.append({"role": "assistant", "content": response})
+
+            # Extract metadata
+            reasoning_data = {}
+            performance_data = {
+                "user_choice": choice_result.get('user_choice', 'unknown'),
+                "revision_applied": choice_result.get('revision_applied', False)
+            }
+            context_data = {
+                "interaction_id": choice_result.get('interaction_id', 'unknown'),
+                "session_id": session_id
+            }
+
+            return new_history, "", reasoning_data, performance_data, context_data, session_id, ""
+
+    # Add user message (normal flow)
     new_history.append({"role": "user", "content": message.strip()})

     # Initialize Details tab data

@@ -529,13 +576,65 @@ async def process_message_async(message: str, history: Optional[List], session_id
     # Try to use orchestrator if available
     if orchestrator is not None:
         try:
-            logger.info("Attempting full orchestration
-            #
-            result = await orchestrator.
+            logger.info("Attempting full orchestration...")
+            # First, try normal processing to check for user choice
+            result = await orchestrator.process_request(
                 session_id=session_id,
                 user_input=message.strip()
             )

+            # Check if user choice is required
+            if result.get('requires_user_choice', False):
+                logger.info("User choice required for safety concerns")
+                choice_prompt = result.get('choice_prompt', {})
+                choice_id = choice_prompt.get('choice_id', '')
+
+                # Create user-friendly choice prompt message
+                prompt_text = choice_prompt.get('prompt_text', '').replace('**', '').replace('*', '')
+
+                choice_message = f"""🛡️ **SAFETY REVIEW REQUIRED**
+
+{prompt_text}
+
+**Please reply with:**
+- Type "YES" or "APPLY" to revise the response
+- Type "NO" or "KEEP" to keep the original response with warnings"""
+
+                # Store choice_id in session for later retrieval
+                # We'll detect it when user responds
+                if not hasattr(orchestrator, '_pending_choices'):
+                    orchestrator._pending_choices = {}
+                orchestrator._pending_choices[session_id] = {
+                    'choice_id': choice_id,
+                    'partial_response': result.get('partial_response', ''),
+                    'safety_analysis': result.get('safety_analysis', {})
+                }
+
+                # Add assistant message with choice prompt
+                new_history.append({
+                    "role": "assistant",
+                    "content": choice_message
+                })
+
+                # Extract metadata for Details tab
+                reasoning_data = result.get('metadata', {}).get('reasoning_chain', {})
+                performance_data = {
+                    "processing_time": result.get('metadata', {}).get('processing_time', 0),
+                    "agents_used": result.get('metadata', {}).get('agents_used', [])
+                }
+                context_data = {
+                    "interaction_id": result.get('interaction_id', 'unknown'),
+                    "session_id": session_id
+                }
+
+                # Return early with choice prompt
+                return new_history, "", reasoning_data, performance_data, context_data, session_id, ""
+
+            # If no user choice needed, proceed with safety revision if needed
+            # This maintains backward compatibility
+            if not result.get('requires_user_choice', False):
+                logger.info("No user choice needed, proceeding with normal flow")
+
             # Log revision information
             revision_attempts = result.get('revision_attempts', 0)
             safety_revision_applied = result.get('safety_revision_applied', False)
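For orientation, a minimal sketch of the two-turn flow this app.py change implements: turn one parks a pending choice per session and shows the prompt, turn two maps a typed YES/NO back to handle_user_safety_decision. FakeOrchestrator and its canned return values are illustrative stand-ins, not part of the commit.

import asyncio

class FakeOrchestrator:
    """Illustrative stand-in exposing the two calls app.py relies on."""
    def __init__(self):
        self._pending_choices = {}

    async def process_request(self, session_id, user_input):
        # Turn 1: pretend the safety agent flagged the draft
        return {'requires_user_choice': True,
                'choice_prompt': {'choice_id': f'{session_id}_1', 'prompt_text': 'flagged'},
                'partial_response': 'draft answer',
                'safety_analysis': {}}

    async def handle_user_safety_decision(self, choice_id, user_decision, session_id):
        # Turn 2: resolve the parked choice
        return {'response': 'revised answer' if user_decision else 'draft answer + warnings',
                'user_choice': 'revision' if user_decision else 'original_with_warnings'}

async def demo():
    orch = FakeOrchestrator()
    first = await orch.process_request('s1', 'write something edgy')
    # app.py parks the choice keyed by session, then shows the prompt to the user
    orch._pending_choices['s1'] = {'choice_id': first['choice_prompt']['choice_id']}
    # on the next turn the user types "YES"; app.py maps that to user_decision=True
    pending = orch._pending_choices.pop('s1')
    final = await orch.handle_user_safety_decision(pending['choice_id'], True, 's1')
    print(final['user_choice'])  # -> revision

asyncio.run(demo())
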
orchestrator_integration_patch.py
ADDED
@@ -0,0 +1,54 @@

"""
Orchestrator Integration Patch for Safety User Choice
Provides integration instructions and helper functions
"""

# Integration Instructions:
"""
This file documents the integration of the safety user choice system.

INTEGRATION POINTS:
1. In orchestrator_engine.py, after safety_agent completes (around line 191-194):
   - Call create_safety_choice_prompt() to check if user choice is needed
   - If choice needed, return choice prompt to UI
   - If no choice needed, proceed with current workflow

2. Add new method to orchestrator class:
   - handle_user_safety_decision(choice_id, user_decision)
   - This processes the user's choice and returns final response

3. In app.py/UI layer:
   - Display choice prompt when requires_user_choice=True
   - Provide YES/NO buttons
   - Submit choice back to orchestrator
   - Display final response

4. Modify safety revision workflow:
   - Only run if user chooses "YES"
   - Otherwise append safety warnings to original response

WORKFLOW:
Safety analysis completes → Check dynamic thresholds
If exceeded → Present user choice: "YES (revise)" or "NO (original + warnings)"
YES → Existing revision workflow
NO → Original response + formatted safety concerns section
"""

def get_integration_notes():
    """Return integration notes"""
    return {
        'integration_point': 'orchestrator_engine.py line ~191 (after safety check)',
        'new_methods': ['handle_user_safety_decision'],
        'imports_needed': [
            'from safety_threshold_matrix import should_trigger_user_choice',
            'from safety_user_choice import create_safety_choice_prompt, process_safety_choice',
            'from safety_choice_orchestrator import SafetyChoiceOrchestrator'
        ],
        'ui_changes': [
            'Add safety choice prompt display',
            'Add YES/NO buttons',
            'Handle choice submission',
            'Mobile-first responsive design maintained'
        ]
    }
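As a reading aid for integration point 1 above, a minimal sketch of the described gate; maybe_request_user_choice is a hypothetical helper name, and the commit itself performs the same check inline in src/orchestrator_engine.py further down.

from safety_user_choice import create_safety_choice_prompt

def maybe_request_user_choice(safety_analysis, intent_class, response_content, session_id):
    """Gate from integration point 1: hand a choice prompt to the UI when dynamic
    thresholds are exceeded, otherwise let the normal workflow continue."""
    choice_prompt = create_safety_choice_prompt(
        safety_analysis, intent_class, response_content, session_id
    )
    if choice_prompt:
        return {
            'requires_user_choice': True,
            'choice_prompt': choice_prompt,
            'partial_response': response_content
        }
    return {'requires_user_choice': False, 'response': response_content}
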
safety_choice_orchestrator.py
ADDED
@@ -0,0 +1,191 @@

"""
Enhanced Orchestrator Workflow with User Safety Choice
Integrates user decision point after safety analysis
"""

import logging
from typing import Dict, Any, Optional
from safety_user_choice import create_safety_choice_prompt, process_safety_choice

logger = logging.getLogger(__name__)

class SafetyChoiceOrchestrator:
    """Enhanced orchestrator with user safety choice integration"""

    def __init__(self, original_orchestrator):
        self.original_orchestrator = original_orchestrator
        self.pending_safety_choices = {}

    async def process_with_safety_choice(self,
                                         user_input: str,
                                         session_id: str,
                                         intent_class: str,
                                         safety_analysis: Dict[str, Any],
                                         response_content: str,
                                         user_choice_callback=None) -> Dict[str, Any]:
        """
        Process request with integrated safety choice workflow

        Args:
            user_input: User's input text
            session_id: Session identifier
            intent_class: Detected intent class
            safety_analysis: Safety analysis results
            response_content: Synthesized response content
            user_choice_callback: Function to handle user choice UI

        Returns:
            Dict with processing results
        """
        try:
            choice_prompt = create_safety_choice_prompt(
                safety_analysis, intent_class, response_content, session_id
            )

            if choice_prompt:
                logger.info(f"Safety concerns detected for intent '{intent_class}' - requiring user choice")

                if user_choice_callback:
                    user_decision = await user_choice_callback(choice_prompt)
                    return await self._handle_user_decision(
                        choice_prompt['choice_id'], user_decision, safety_analysis
                    )
                else:
                    return {
                        'requires_user_choice': True,
                        'choice_prompt': choice_prompt,
                        'session_id': session_id,
                        'partial_response': response_content,
                        'safety_analysis': safety_analysis
                    }
            else:
                logger.info("No safety concerns detected - proceeding normally")
                return {
                    'response': response_content,
                    'safety_analysis': safety_analysis,
                    'requires_user_choice': False
                }

        except Exception as e:
            logger.error(f"Error in safety choice orchestration: {e}")
            return {
                'error': str(e),
                'requires_user_choice': False,
                'response': response_content
            }

    async def handle_user_safety_decision(self,
                                          choice_id: str,
                                          user_decision: bool,
                                          original_response: str = None) -> Dict[str, Any]:
        """
        Handle user's safety decision

        Args:
            choice_id: The choice identifier
            user_decision: True for revision, False for original with warnings
            original_response: Original response content (optional)

        Returns:
            Dict with final response
        """
        return await self._handle_user_decision(choice_id, user_decision, None, original_response)

    async def _handle_user_decision(self,
                                    choice_id: str,
                                    user_decision: bool,
                                    safety_analysis: Dict[str, Any] = None,
                                    original_response: str = None) -> Dict[str, Any]:
        """Internal method to handle user decision"""

        choice_result = process_safety_choice(choice_id, user_decision)

        if 'error' in choice_result:
            return choice_result

        if choice_result['action'] == 'proceed_with_revision':
            logger.info("User chose revision - running safety revision")

            revised_response = await self._run_safety_revision(
                choice_result['original_response'],
                choice_result['safety_analysis'],
                choice_result.get('intent_class', 'casual_conversation')
            )

            return {
                'response': revised_response,
                'safety_analysis': choice_result['safety_analysis'],
                'user_choice': 'revision',
                'requires_user_choice': False,
                'revision_applied': True
            }

        elif choice_result['action'] == 'use_original_with_warnings':
            logger.info("User chose original with warnings")

            return {
                'response': choice_result['response_content'],
                'safety_analysis': choice_result['safety_analysis'],
                'user_choice': 'original_with_warnings',
                'requires_user_choice': False,
                'revision_applied': False
            }

        else:
            return {'error': 'Unknown action from choice result'}

    async def _run_safety_revision(self,
                                   original_response: str,
                                   safety_analysis: Dict[str, Any],
                                   intent_class: str) -> str:
        """
        Run safety revision on the response using the original orchestrator's revision workflow

        Args:
            original_response: Original response content
            safety_analysis: Safety analysis results
            intent_class: Intent class for context

        Returns:
            str: Revised response
        """
        try:
            exceeded_categories = []
            safety_scores = safety_analysis.get('safety_scores', {})

            if not safety_scores:
                confidence_scores = safety_analysis.get('confidence_scores', {})
                if confidence_scores:
                    safety_scores = {
                        'toxicity_or_harmful_language': confidence_scores.get('toxicity', 0.0),
                        'potential_biases_or_stereotypes': confidence_scores.get('bias', 0.0),
                        'privacy_or_security_concerns': confidence_scores.get('privacy', 0.0),
                        'controversial_or_sensitive_topics': 1.0 - confidence_scores.get('safety', 1.0)
                    }

            safety_warnings = safety_analysis.get('detected_issues', [])

            if hasattr(self.original_orchestrator, 'generate_revision_prompt'):
                revision_prompt = self.original_orchestrator.generate_revision_prompt(
                    original_response,
                    exceeded_categories,
                    safety_warnings
                )

                if hasattr(self.original_orchestrator, 'agents') and 'response_synthesis' in self.original_orchestrator.agents:
                    revised_result = await self.original_orchestrator.agents['response_synthesis'].execute(
                        agent_outputs={},
                        user_input=revision_prompt,
                        context={}
                    )

                    revised_response = revised_result.get('final_response', original_response)
                    return revised_response

            return original_response

        except Exception as e:
            logger.error(f"Error in safety revision: {e}")
            return original_response
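A minimal usage sketch for the wrapper, assuming the safety modules are importable from the project root; the callback here always answers NO, so the "keep original with warnings" path is taken and the original orchestrator is never consulted, which is why None is enough as a stand-in.

import asyncio
from safety_choice_orchestrator import SafetyChoiceOrchestrator

async def demo():
    # The wrapper only touches the original orchestrator when a revision is requested,
    # so None is an acceptable stand-in for this walk-through.
    wrapper = SafetyChoiceOrchestrator(original_orchestrator=None)

    async def ask_user(choice_prompt):
        # A real UI would render choice_prompt['prompt_text'] and wait for YES/NO;
        # here we always answer NO (keep the original with warnings).
        return False

    result = await wrapper.process_with_safety_choice(
        user_input="tell me something edgy",
        session_id="demo-session",
        intent_class="creative_generation",
        safety_analysis={'safety_scores': {'toxicity_or_harmful_language': 0.55}},
        response_content="a deliberately provocative draft",
        user_choice_callback=ask_user,
    )
    print(result['user_choice'], result['revision_applied'])  # original_with_warnings False

asyncio.run(demo())
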
safety_threshold_matrix.py
ADDED
@@ -0,0 +1,256 @@

"""
Intent-Based Safety Threshold Matrix
Based on academic research for dynamic content moderation thresholds
"""

# Intent-Based Safety Threshold Matrix (Academic Research-Based)
INTENT_SAFETY_MATRIX = {
    'information_request': {
        'toxicity': 0.20,
        'sexual_content': 0.15,
        'violence': 0.15,
        'self_harm': 0.10,
        'harassment': 0.20,
        'dangerous_activities': 0.15,
        'privacy_violations': 0.25,
        'misinformation': 0.30,
        'child_safety': 0.10,
        'bias_stereotyping': 0.25
    },
    'task_execution': {
        'toxicity': 0.25,
        'sexual_content': 0.20,
        'violence': 0.20,
        'self_harm': 0.15,
        'harassment': 0.25,
        'dangerous_activities': 0.20,
        'privacy_violations': 0.30,
        'misinformation': 0.35,
        'child_safety': 0.15,
        'bias_stereotyping': 0.30
    },
    'creative_generation': {
        'toxicity': 0.40,
        'sexual_content': 0.35,
        'violence': 0.35,
        'self_harm': 0.25,
        'harassment': 0.30,
        'dangerous_activities': 0.30,
        'privacy_violations': 0.20,
        'misinformation': 0.25,
        'child_safety': 0.20,
        'bias_stereotyping': 0.35
    },
    'analysis_research': {
        'toxicity': 0.35,
        'sexual_content': 0.25,
        'violence': 0.25,
        'self_harm': 0.20,
        'harassment': 0.30,
        'dangerous_activities': 0.25,
        'privacy_violations': 0.40,
        'misinformation': 0.45,
        'child_safety': 0.25,
        'bias_stereotyping': 0.40
    },
    'casual_conversation': {
        'toxicity': 0.30,
        'sexual_content': 0.25,
        'violence': 0.25,
        'self_harm': 0.20,
        'harassment': 0.35,
        'dangerous_activities': 0.25,
        'privacy_violations': 0.15,
        'misinformation': 0.20,
        'child_safety': 0.15,
        'bias_stereotyping': 0.25
    },
    'troubleshooting': {
        'toxicity': 0.15,
        'sexual_content': 0.10,
        'violence': 0.15,
        'self_harm': 0.10,
        'harassment': 0.20,
        'dangerous_activities': 0.10,
        'privacy_violations': 0.35,
        'misinformation': 0.40,
        'child_safety': 0.10,
        'bias_stereotyping': 0.20
    },
    'education_learning': {
        'toxicity': 0.25,
        'sexual_content': 0.20,
        'violence': 0.30,
        'self_harm': 0.15,
        'harassment': 0.20,
        'dangerous_activities': 0.35,
        'privacy_violations': 0.25,
        'misinformation': 0.50,
        'child_safety': 0.30,
        'bias_stereotyping': 0.35
    },
    'technical_support': {
        'toxicity': 0.10,
        'sexual_content': 0.05,
        'violence': 0.10,
        'self_harm': 0.05,
        'harassment': 0.15,
        'dangerous_activities': 0.05,
        'privacy_violations': 0.45,
        'misinformation': 0.45,
        'child_safety': 0.05,
        'bias_stereotyping': 0.15
    }
}

# Safety parameter descriptions for user display
SAFETY_DESCRIPTIONS = {
    'toxicity': 'Hate speech, discriminatory language',
    'sexual_content': 'Explicit sexual material, nudity',
    'violence': 'Graphic violence, gore, death depictions',
    'self_harm': 'Content promoting self-harm or suicide',
    'harassment': 'Personal attacks, bullying, doxxing',
    'dangerous_activities': 'Illegal activities, weapon instructions',
    'privacy_violations': 'Personal data exposure, unauthorized sharing',
    'misinformation': 'False information, unverified claims',
    'child_safety': 'Content potentially harmful to minors',
    'bias_stereotyping': 'Discriminatory generalizations'
}

# Criticality levels
CRITICALITY_LEVELS = {
    (0.0, 0.3): 'LOW',
    (0.3, 0.6): 'MEDIUM',
    (0.6, 0.8): 'HIGH',
    (0.8, 1.0): 'CRITICAL'
}

def get_dynamic_threshold(intent_class, safety_param, user_context=None):
    """
    Calculate dynamic safety threshold based on intent and context

    Args:
        intent_class: The detected intent type
        safety_param: The safety parameter to check
        user_context: Optional user context for adjustments

    Returns:
        float: Adjusted threshold value
    """
    if intent_class not in INTENT_SAFETY_MATRIX:
        intent_class = 'casual_conversation'

    if safety_param not in INTENT_SAFETY_MATRIX[intent_class]:
        return 0.2

    base_threshold = INTENT_SAFETY_MATRIX[intent_class][safety_param]

    if user_context:
        if user_context.get('academic_use', False):
            base_threshold *= 1.2

        if user_context.get('minor_present', False):
            base_threshold *= 0.7

        if user_context.get('professional_context', False):
            base_threshold *= 1.1

        if user_context.get('creative_context', False):
            base_threshold *= 1.15

    return min(base_threshold, 0.8)

def get_criticality_level(score):
    """Get criticality level for a safety score"""
    for (min_val, max_val), level in CRITICALITY_LEVELS.items():
        if min_val <= score < max_val:
            return level
    return 'CRITICAL'

def format_safety_concerns(safety_analysis, intent_class):
    """
    Format safety concerns for user display

    Args:
        safety_analysis: Dict containing safety scores
        intent_class: The detected intent class

    Returns:
        str: Formatted safety concerns text or None if no concerns
    """
    concerns = []

    safety_mapping = {
        'toxicity_or_harmful_language': 'toxicity',
        'potential_biases_or_stereotypes': 'bias_stereotyping',
        'privacy_or_security_concerns': 'privacy_violations',
        'controversial_or_sensitive_topics': 'toxicity'
    }

    safety_scores = safety_analysis.get('safety_scores', {})

    if not safety_scores:
        confidence_scores = safety_analysis.get('confidence_scores', {})
        if confidence_scores:
            safety_scores = {
                'toxicity_or_harmful_language': confidence_scores.get('toxicity', 0.0),
                'potential_biases_or_stereotypes': confidence_scores.get('bias', 0.0),
                'privacy_or_security_concerns': confidence_scores.get('privacy', 0.0),
                'controversial_or_sensitive_topics': 1.0 - confidence_scores.get('safety', 1.0)
            }

    for param_key, score in safety_scores.items():
        if param_key in safety_mapping:
            safety_param = safety_mapping[param_key]
            threshold = get_dynamic_threshold(intent_class, safety_param)

            if isinstance(score, (int, float)) and score > threshold:
                criticality = get_criticality_level(score)
                description = SAFETY_DESCRIPTIONS.get(safety_param, param_key)
                concerns.append(f"• **{criticality}**: {description} (Score: {score:.2f}, Threshold: {threshold:.2f})")

    if not concerns:
        return None

    return "\n".join(concerns)

def should_trigger_user_choice(safety_analysis, intent_class):
    """
    Determine if user choice should be triggered based on safety analysis

    Args:
        safety_analysis: Dict containing safety analysis
        intent_class: The detected intent class

    Returns:
        bool: True if user choice should be triggered
    """
    safety_scores = safety_analysis.get('safety_scores', {})

    if not safety_scores:
        confidence_scores = safety_analysis.get('confidence_scores', {})
        if confidence_scores:
            safety_scores = {
                'toxicity_or_harmful_language': confidence_scores.get('toxicity', 0.0),
                'potential_biases_or_stereotypes': confidence_scores.get('bias', 0.0),
                'privacy_or_security_concerns': confidence_scores.get('privacy', 0.0),
                'controversial_or_sensitive_topics': 1.0 - confidence_scores.get('safety', 1.0)
            }

    safety_mapping = {
        'toxicity_or_harmful_language': 'toxicity',
        'potential_biases_or_stereotypes': 'bias_stereotyping',
        'privacy_or_security_concerns': 'privacy_violations',
        'controversial_or_sensitive_topics': 'toxicity'
    }

    for param_key, score in safety_scores.items():
        if param_key in safety_mapping:
            safety_param = safety_mapping[param_key]
            threshold = get_dynamic_threshold(intent_class, safety_param)

            if isinstance(score, (int, float)) and score > threshold:
                return True

    return False
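A small worked example against the matrix and multipliers above, assuming the module is importable from the project root; the scores are made up.

from safety_threshold_matrix import get_dynamic_threshold, should_trigger_user_choice

# Base thresholds come straight from INTENT_SAFETY_MATRIX: creative work gets more
# latitude on toxicity (0.40) than an information request (0.20).
print(get_dynamic_threshold('creative_generation', 'toxicity'))   # 0.40
print(get_dynamic_threshold('information_request', 'toxicity'))   # 0.20

# Context multipliers adjust the base: academic_use relaxes by 1.2x, minor_present
# tightens by 0.7x, and the result is always capped at 0.8.
print(get_dynamic_threshold('creative_generation', 'toxicity', {'academic_use': True}))   # 0.40 * 1.2 = 0.48
print(get_dynamic_threshold('creative_generation', 'toxicity', {'minor_present': True}))  # 0.40 * 0.7 = 0.28

# The same toxicity score of 0.35 clears the creative bar but trips the informational one.
analysis = {'safety_scores': {'toxicity_or_harmful_language': 0.35}}
print(should_trigger_user_choice(analysis, 'creative_generation'))  # False (0.35 <= 0.40)
print(should_trigger_user_choice(analysis, 'information_request'))  # True  (0.35 > 0.20)
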
safety_user_choice.py
ADDED
@@ -0,0 +1,210 @@

"""
User Choice Interface for Safety Decisions
Provides binary choice for safety concern handling
"""

import logging
from typing import Dict, Any, Optional, Tuple
from datetime import datetime
from safety_threshold_matrix import (
    should_trigger_user_choice,
    format_safety_concerns,
    get_criticality_level
)

logger = logging.getLogger(__name__)

class SafetyUserChoice:
    """Handles user choice for safety concerns"""

    def __init__(self):
        self.pending_choices = {}

    def create_user_choice_prompt(self,
                                  safety_analysis: Dict[str, Any],
                                  intent_class: str,
                                  response_content: str,
                                  session_id: str) -> Dict[str, Any]:
        """
        Create user choice prompt for safety concerns

        Args:
            safety_analysis: Safety analysis results
            intent_class: Detected intent class
            response_content: The synthesized response content
            session_id: Session identifier

        Returns:
            Dict containing choice prompt data or None if not needed
        """

        concerns_text = format_safety_concerns(safety_analysis, intent_class)

        if not concerns_text:
            return None

        choice_id = f"{session_id}_{int(datetime.now().timestamp() * 1000)}"

        choice_data = {
            'choice_id': choice_id,
            'safety_analysis': safety_analysis,
            'intent_class': intent_class,
            'response_content': response_content,
            'concerns_text': concerns_text,
            'timestamp': datetime.now().isoformat()
        }

        self.pending_choices[choice_id] = choice_data

        prompt_text = f"""🛡️ **SAFETY REVIEW REQUIRED**

The following response has been flagged for potential safety concerns:

**Safety Concerns Detected:**
{concerns_text}

**Your Response Content:**
{response_content[:500]}{"..." if len(response_content) > 500 else ""}

**Choose how to proceed:**

**YES** - Apply safety revisions (recommended)
- The system will automatically revise the response to address safety concerns
- This may modify or remove flagged content

**NO** - Keep original response with safety warnings
- The original response will be provided unchanged
- Safety concerns will be clearly highlighted at the end

Would you like to proceed with safety revisions?
"""

        return {
            'choice_id': choice_id,
            'prompt_text': prompt_text,
            'requires_user_input': True,
            'choice_type': 'binary_safety_decision'
        }

    def process_user_choice(self, choice_id: str, user_decision: bool) -> Dict[str, Any]:
        """
        Process user's safety choice

        Args:
            choice_id: The choice identifier
            user_decision: True for revision, False for original with warnings

        Returns:
            Dict with processing results
        """

        if choice_id not in self.pending_choices:
            logger.error(f"Choice ID {choice_id} not found")
            return {'error': 'Invalid choice ID'}

        choice_data = self.pending_choices[choice_id]

        del self.pending_choices[choice_id]

        if user_decision:
            return {
                'action': 'proceed_with_revision',
                'safety_analysis': choice_data['safety_analysis'],
                'intent_class': choice_data['intent_class'],
                'original_response': choice_data['response_content']
            }
        else:
            return {
                'action': 'use_original_with_warnings',
                'response_content': self._add_safety_warnings(
                    choice_data['response_content'],
                    choice_data['concerns_text']
                ),
                'safety_analysis': choice_data['safety_analysis']
            }

    def _add_safety_warnings(self, response_content: str, concerns_text: str) -> str:
        """
        Add safety warnings to the original response

        Args:
            response_content: Original response content
            concerns_text: Formatted safety concerns

        Returns:
            str: Response with safety warnings appended
        """

        warning_section = f"""

---

## ⚠️ Safety Advisory

This response has been flagged for the following safety concerns:

{concerns_text}

**Please review this content carefully and consider:**
- The potential impact on yourself and others
- Whether this content aligns with your intended use
- If additional verification or expert consultation is needed

*This advisory is provided for transparency and user awareness.*
"""

        return response_content + warning_section

# Global instance for use across the application
safety_user_choice = SafetyUserChoice()

def check_safety_user_choice_needed(safety_analysis: Dict[str, Any],
                                    intent_class: str) -> bool:
    """
    Check if user choice is needed for safety concerns

    Args:
        safety_analysis: Safety analysis results
        intent_class: Detected intent class

    Returns:
        bool: True if user choice is needed
    """
    return should_trigger_user_choice(safety_analysis, intent_class)

def create_safety_choice_prompt(safety_analysis: Dict[str, Any],
                                intent_class: str,
                                response_content: str,
                                session_id: str) -> Optional[Dict[str, Any]]:
    """
    Create safety choice prompt if needed

    Args:
        safety_analysis: Safety analysis results
        intent_class: Detected intent class
        response_content: Response content
        session_id: Session identifier

    Returns:
        Dict with choice prompt or None if not needed
    """
    if not check_safety_user_choice_needed(safety_analysis, intent_class):
        return None

    return safety_user_choice.create_user_choice_prompt(
        safety_analysis, intent_class, response_content, session_id
    )

def process_safety_choice(choice_id: str, user_decision: bool) -> Dict[str, Any]:
    """
    Process user's safety choice

    Args:
        choice_id: Choice identifier
        user_decision: User's binary decision

    Returns:
        Dict with processing results
    """
    return safety_user_choice.process_user_choice(choice_id, user_decision)
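A minimal round trip through the module-level helpers, assuming the safety modules are importable from the project root; the analysis dict and draft text are made up.

from safety_user_choice import create_safety_choice_prompt, process_safety_choice

analysis = {'safety_scores': {'toxicity_or_harmful_language': 0.65}}  # above the 0.30 casual bar

# Step 1: thresholds exceeded, so a choice prompt is created and parked internally.
prompt = create_safety_choice_prompt(analysis, 'casual_conversation', 'a blunt draft reply', 'session-abc')
print(prompt['choice_type'])          # binary_safety_decision
print(prompt['prompt_text'][:35])     # start of the SAFETY REVIEW REQUIRED prompt

# Step 2: the user answers NO, so the original text comes back with the advisory appended.
result = process_safety_choice(prompt['choice_id'], user_decision=False)
print(result['action'])                                   # use_original_with_warnings
print('Safety Advisory' in result['response_content'])    # True
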
src/orchestrator_engine.py
CHANGED
@@ -4,9 +4,27 @@ import logging
 import time
 import asyncio
 from datetime import datetime
+import sys
+import os

 logger = logging.getLogger(__name__)

+# Add project root and parent directory to path for imports
+current_dir = os.path.dirname(os.path.abspath(__file__))
+parent_dir = os.path.dirname(current_dir)
+sys.path.insert(0, parent_dir)
+sys.path.insert(0, current_dir)
+
+try:
+    from safety_threshold_matrix import should_trigger_user_choice
+    from safety_user_choice import create_safety_choice_prompt, process_safety_choice
+    from safety_choice_orchestrator import SafetyChoiceOrchestrator
+    SAFETY_CHOICE_AVAILABLE = True
+    logger.info("Safety choice modules loaded successfully")
+except ImportError as e:
+    logger.warning(f"Safety choice modules not available: {e}")
+    SAFETY_CHOICE_AVAILABLE = False
+
 class MVPOrchestrator:
     def __init__(self, llm_router, context_manager, agents):
         self.llm_router = llm_router

@@ -197,6 +215,42 @@ class MVPOrchestrator:
             "result": {"warnings": safety_checked.get('warnings', [])}
         })

+        # Step 7.5: Enhanced Safety check with user choice (if available)
+        intent_class = intent_result.get('primary_intent', 'casual_conversation')
+        response_content = final_response.get('final_response', '') or str(final_response.get('response', ''))
+
+        if SAFETY_CHOICE_AVAILABLE:
+            choice_prompt = create_safety_choice_prompt(
+                safety_checked.get('safety_analysis', {}),
+                intent_class,
+                response_content,
+                session_id
+            )
+
+            if choice_prompt:
+                logger.info(f"Safety concerns detected for intent '{intent_class}' - requiring user choice")
+                processing_time = time.time() - start_time
+
+                return {
+                    'requires_user_choice': True,
+                    'choice_prompt': choice_prompt,
+                    'session_id': session_id,
+                    'partial_response': response_content,
+                    'safety_analysis': safety_checked.get('safety_analysis', {}),
+                    'interaction_id': interaction_id,
+                    'intent': intent_class,
+                    'metadata': {
+                        'intent': intent_class,
+                        'processing_time': processing_time,
+                        'agents_used': list(self.agents.keys()),
+                        'intent_result': intent_result,
+                        'skills_result': skills_result,
+                        'synthesis_result': final_response,
+                        'reasoning_chain': reasoning_chain
+                    },
+                    'response': response_content  # Provide partial response
+                }
+
         # Add safety reasoning
         reasoning_chain["chain_of_thought"]["step_5"] = {
             "hypothesis": f"Safety validation for response about '{self._extract_main_topic(user_input)}'",

@@ -334,6 +388,97 @@ class MVPOrchestrator:
             "metadata": metadata
         }

+    async def handle_user_safety_decision(self, choice_id: str, user_decision: bool, session_id: str = None) -> dict:
+        """
+        Handle user's safety decision and complete processing
+
+        Args:
+            choice_id: The choice identifier from the prompt
+            user_decision: True for revision, False for original with warnings
+            session_id: Session identifier
+
+        Returns:
+            dict: Final response based on user choice
+        """
+        try:
+            if not SAFETY_CHOICE_AVAILABLE:
+                logger.warning("Safety choice modules not available")
+                return {'error': 'Safety choice system not available'}
+
+            choice_result = process_safety_choice(choice_id, user_decision)
+
+            if 'error' in choice_result:
+                logger.error(f"Error processing safety choice: {choice_result['error']}")
+                return choice_result
+
+            if choice_result['action'] == 'proceed_with_revision':
+                logger.info("User chose revision - applying safety revisions")
+
+                safety_issues = choice_result['safety_analysis'].get('detected_issues', [])
+                safety_scores = choice_result['safety_analysis'].get('safety_scores', {})
+
+                if not safety_scores:
+                    confidence_scores = choice_result['safety_analysis'].get('confidence_scores', {})
+                    if confidence_scores:
+                        exceeded_categories = []
+                        if confidence_scores.get('toxicity', 0) > 0.3:
+                            exceeded_categories.append('toxicity_or_harmful_language')
+                        if confidence_scores.get('bias', 0) > 0.05:
+                            exceeded_categories.append('potential_biases_or_stereotypes')
+                        if confidence_scores.get('privacy', 0) > 0.2:
+                            exceeded_categories.append('privacy_or_security_concerns')
+                else:
+                    exceeded_categories = [k for k, v in safety_scores.items() if isinstance(v, (int, float)) and v > 0.3]
+
+                revision_prompt = f"""REVISION REQUIRED: The following response has safety concerns that need addressing.
+
+Original Response: {choice_result['original_response']}
+
+Safety Issues Detected: {', '.join(exceeded_categories) if exceeded_categories else 'General safety concerns'}
+Specific Warnings: {'; '.join(safety_issues) if safety_issues else 'General safety concerns detected'}
+
+Please revise the response to address these concerns while maintaining helpfulness and accuracy.
+"""
+
+                revised_result = await self.agents['response_synthesis'].execute(
+                    agent_outputs={},
+                    user_input=revision_prompt,
+                    context={}
+                )
+
+                revised_response = revised_result.get('final_response', choice_result['original_response'])
+
+                return {
+                    'response': revised_response,
+                    'final_response': revised_response,
+                    'safety_analysis': choice_result['safety_analysis'],
+                    'user_choice': 'revision',
+                    'revision_applied': True,
+                    'interaction_id': str(uuid.uuid4())[:8],
+                    'timestamp': datetime.now().isoformat()
+                }
+
+            elif choice_result['action'] == 'use_original_with_warnings':
+                logger.info("User chose original response with safety warnings")
+
+                return {
+                    'response': choice_result['response_content'],
+                    'final_response': choice_result['response_content'],
+                    'safety_analysis': choice_result['safety_analysis'],
+                    'user_choice': 'original_with_warnings',
+                    'revision_applied': False,
+                    'interaction_id': str(uuid.uuid4())[:8],
+                    'timestamp': datetime.now().isoformat()
+                }
+
+            else:
+                logger.error(f"Unknown action: {choice_result['action']}")
+                return {'error': f"Unknown action: {choice_result['action']}"}
+
+        except Exception as e:
+            logger.error(f"Error handling user safety decision: {e}", exc_info=True)
+            return {'error': str(e)}
+
     def get_execution_trace(self) -> list:
         """
         Return execution trace for debugging and analysis