"""Safety pipeline: pre-check (input moderation), post-check (output scan). Supports two governance modes: - ENFORCING (legacy): Hard blocks on violations. - ADVISORY: Logs violations as advisories but allows all actions to proceed. Mistakes become learning data for the adaptive ethics system. """ import re from dataclasses import dataclass, field from typing import Any from fusionagi._logger import logger from fusionagi.governance.guardrails import Guardrails from fusionagi.schemas.audit import AuditEventType, GovernanceMode @dataclass class ModerationResult: """Result of input moderation.""" allowed: bool transformed: str | None = None reason: str | None = None advisory: bool = False class InputModerator: """Pre-check: block or advise on user input before processing.""" def __init__(self, mode: GovernanceMode = GovernanceMode.ADVISORY) -> None: self._blocked_patterns: list[re.Pattern[str]] = [] self._blocked_phrases: list[str] = [] self._mode = mode def add_blocked_pattern(self, pattern: str) -> None: """Add regex pattern to flag (advisory) or block (enforcing).""" self._blocked_patterns.append(re.compile(pattern, re.I)) def add_blocked_phrase(self, phrase: str) -> None: """Add exact phrase to flag (advisory) or block (enforcing).""" self._blocked_phrases.append(phrase.lower()) def moderate(self, text: str) -> ModerationResult: """Check input; return result based on governance mode.""" if not text or not text.strip(): return ModerationResult(allowed=False, reason="Empty input") lowered = text.lower() for phrase in self._blocked_phrases: if phrase in lowered: if self._mode == GovernanceMode.ADVISORY: logger.info( "Input advisory: phrase detected (proceeding)", extra={"phrase": phrase[:50], "mode": "advisory"}, ) return ModerationResult( allowed=True, reason=f"Advisory: phrase detected ({phrase[:30]}...)", advisory=True, ) logger.info("Input blocked: blocked phrase", extra={"phrase": phrase[:50]}) return ModerationResult(allowed=False, reason=f"Blocked phrase: {phrase[:30]}...") for pat in self._blocked_patterns: if pat.search(text): if self._mode == GovernanceMode.ADVISORY: logger.info( "Input advisory: pattern detected (proceeding)", extra={"pattern": pat.pattern[:50], "mode": "advisory"}, ) return ModerationResult( allowed=True, reason="Advisory: pattern detected", advisory=True, ) logger.info("Input blocked: pattern match", extra={"pattern": pat.pattern[:50]}) return ModerationResult(allowed=False, reason="Input matched blocked pattern") return ModerationResult(allowed=True) @dataclass class OutputScanResult: """Result of output (final answer) scan.""" passed: bool flags: list[str] = field(default_factory=list) sanitized: str | None = None advisory: bool = False class OutputScanner: """Post-check: scan final answer and integrate with adaptive ethics. PII and content detections feed into the adaptive ethics engine so the system learns which contexts warrant caution and which don't. """ def __init__( self, mode: GovernanceMode = GovernanceMode.ADVISORY, ethics: Any | None = None, ) -> None: self._pii_patterns: list[tuple[str, re.Pattern[str]]] = [ ("ssn", re.compile(r"\b\d{3}-\d{2}-\d{4}\b")), ("credit_card", re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")), ] self._blocked_patterns: list[re.Pattern[str]] = [] self._mode = mode self._ethics = ethics def set_ethics(self, ethics: Any) -> None: """Wire an AdaptiveEthics instance for learned PII handling.""" self._ethics = ethics def add_pii_pattern(self, name: str, pattern: str) -> None: """Add PII detection pattern.""" self._pii_patterns.append((name, re.compile(pattern))) def add_blocked_pattern(self, pattern: str) -> None: """Add pattern that flags (advisory) or fails (enforcing) the output.""" self._blocked_patterns.append(re.compile(pattern, re.I)) def scan(self, text: str, task_id: str | None = None) -> OutputScanResult: """Scan output; consult ethics for learned guidance on detections.""" flags: list[str] = [] for name, pat in self._pii_patterns: if pat.search(text): flags.append(f"potential_pii:{name}") for pat in self._blocked_patterns: if pat.search(text): flags.append("blocked_content_detected") if flags and self._ethics is not None: guidance = self._ethics.consult("output_scan", context="; ".join(flags)) logger.info( "OutputScanner: ethics consulted on detection", extra={"flags": flags, "guidance": guidance.get("recommendation", "proceed")}, ) if flags: if self._mode == GovernanceMode.ADVISORY: logger.info( "Output advisory: flags detected (proceeding)", extra={"flags": flags, "mode": "advisory"}, ) return OutputScanResult(passed=True, flags=flags, advisory=True) return OutputScanResult(passed=False, flags=flags) return OutputScanResult(passed=True, flags=[]) class SafetyPipeline: """Combined pre/post safety checks for Dvādaśa. In ADVISORY mode (default), all checks produce logged advisories instead of hard blocks. The system learns from the outcomes. """ def __init__( self, moderator: InputModerator | None = None, scanner: OutputScanner | None = None, guardrails: Guardrails | None = None, audit_log: Any | None = None, mode: GovernanceMode = GovernanceMode.ADVISORY, ) -> None: self._mode = mode self._moderator = moderator or InputModerator(mode=mode) self._scanner = scanner or OutputScanner(mode=mode) self._guardrails = guardrails or Guardrails(mode=mode) self._audit = audit_log @property def mode(self) -> GovernanceMode: """Current governance mode.""" return self._mode @mode.setter def mode(self, value: GovernanceMode) -> None: """Switch governance mode at runtime.""" self._mode = value self._moderator._mode = value self._scanner._mode = value self._guardrails._mode = value logger.info("SafetyPipeline mode changed", extra={"mode": value.value}) def pre_check(self, user_input: str) -> ModerationResult: """Run input moderation.""" result = self._moderator.moderate(user_input) if self._audit: if result.advisory: self._audit.append( AuditEventType.ADVISORY, actor="safety_pipeline", action="input_moderation_advisory", payload={"reason": result.reason, "input_preview": user_input[:100]}, outcome="advised_proceed", ) elif not result.allowed: self._audit.append( AuditEventType.POLICY_CHECK, actor="safety_pipeline", action="input_moderation", payload={"reason": result.reason}, outcome="denied", ) return result def post_check(self, final_answer: str) -> OutputScanResult: """Run output scan.""" result = self._scanner.scan(final_answer) if self._audit: if result.advisory: self._audit.append( AuditEventType.ADVISORY, actor="safety_pipeline", action="output_scan_advisory", payload={"flags": result.flags, "output_preview": final_answer[:100]}, outcome="advised_proceed", ) elif not result.passed: self._audit.append( AuditEventType.POLICY_CHECK, actor="safety_pipeline", action="output_scan", payload={"flags": result.flags}, outcome="flagged", ) return result