18 changes implementing full advisory philosophy: 1. Safety Head prompt: prevention mandate → advisory observation 2. Native Reasoning: Safety claims conditional on actual risk signals 3. File Tool: path scope advisory (log + proceed) 4. HTTP Tool: SSRF protection advisory (log + proceed) 5. File Size Cap: configurable (default unlimited) 6. PII Detection: integrated with AdaptiveEthics 7. Embodiment: force limit advisory (log, don't clamp) 8. Embodiment: workspace bounds advisory (log, don't reject) 9. API Rate Limiter: advisory (log, don't hard 429) 10. MAA Gate: GovernanceMode.ADVISORY default 11. Physics Authority: safety factor advisory, not hard reject 12. Self-Model: evolve_value() for experience-based value evolution 13. Ethical Lesson: weight unclamped for full dynamic range 14. ConsequenceEngine: adaptive risk_memory_window 15. Cross-Head Learning: shared InsightBus between heads 16. World Model: self-modification prediction 17. Persistent memory: file-backed learning store 18. Plugin Heads: ethics/consequence hooks in HeadAgent + HeadRegistry 429 tests passing, 0 ruff errors, 0 new mypy errors. Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
228 lines
8.7 KiB
Python
228 lines
8.7 KiB
Python
"""Safety pipeline: pre-check (input moderation), post-check (output scan).

Supports two governance modes:

- ENFORCING (legacy): Hard blocks on violations.
- ADVISORY: Logs violations as advisories but allows all actions to proceed.
  Mistakes become learning data for the adaptive ethics system.
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
from fusionagi._logger import logger
|
|
from fusionagi.governance.guardrails import Guardrails
|
|
from fusionagi.schemas.audit import AuditEventType, GovernanceMode
|
|
|
|
|
|
@dataclass
|
|
class ModerationResult:
|
|
"""Result of input moderation."""
|
|
|
|
allowed: bool
|
|
transformed: str | None = None
|
|
reason: str | None = None
|
|
advisory: bool = False
|
|
|
|
|
|
class InputModerator:
    """Pre-check: block or advise on user input before processing.

    Input is compared against registered phrases (case-insensitive substring
    match) and registered regex patterns (compiled with ``re.I``). What a
    match means depends on the governance mode: in
    ``GovernanceMode.ADVISORY`` the match is logged and the input is allowed
    with ``advisory=True``; in any other mode the input is rejected.
    """

    def __init__(self, mode: GovernanceMode = GovernanceMode.ADVISORY) -> None:
        """Create a moderator with no phrases or patterns registered.

        Args:
            mode: Governance mode that decides whether a match blocks the
                input or only produces an advisory.
        """
        self._blocked_patterns: list[re.Pattern[str]] = []
        self._blocked_phrases: list[str] = []
        self._mode = mode

    def add_blocked_pattern(self, pattern: str) -> None:
        """Add regex pattern to flag (advisory) or block (enforcing).

        Args:
            pattern: Regular expression source; compiled case-insensitively.
                Empty or whitespace-only patterns are ignored.

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
        """
        if not pattern.strip():
            # An empty regex matches every string, which would flag or block
            # all input; treat it as a configuration slip and skip it.
            logger.info("Ignoring empty blocked pattern")
            return
        self._blocked_patterns.append(re.compile(pattern, re.I))

    def add_blocked_phrase(self, phrase: str) -> None:
        """Add exact phrase to flag (advisory) or block (enforcing).

        The phrase is stored lowercased so it can be compared against the
        lowercased input.

        Args:
            phrase: Literal text to look for as a substring of the input.
                Empty or whitespace-only phrases are ignored.
        """
        normalized = phrase.lower()
        if not normalized.strip():
            # ``"" in text`` is always True, so an empty phrase would match
            # every input; skip it instead of registering a catch-all rule.
            logger.info("Ignoring empty blocked phrase")
            return
        self._blocked_phrases.append(normalized)

    def moderate(self, text: str) -> ModerationResult:
        """Check input; return result based on governance mode.

        Phrases are checked before patterns, and the first match decides the
        result. Empty or whitespace-only input is rejected in every mode.

        Args:
            text: Raw user input.

        Returns:
            A ``ModerationResult``: allowed with no reason when nothing
            matches, allowed with ``advisory=True`` on a match in advisory
            mode, or not allowed on a match in any other mode.
        """
        if not text or not text.strip():
            return ModerationResult(allowed=False, reason="Empty input")

        lowered = text.lower()
        for phrase in self._blocked_phrases:
            if phrase in lowered:
                return self._resolve_match(
                    extra_key="phrase",
                    detail=phrase,
                    advisory_message="Input advisory: phrase detected (proceeding)",
                    advisory_reason=f"Advisory: phrase detected ({phrase[:30]}...)",
                    blocked_message="Input blocked: blocked phrase",
                    blocked_reason=f"Blocked phrase: {phrase[:30]}...",
                )

        for pat in self._blocked_patterns:
            # Patterns run against the original text; they are already
            # compiled with re.I, so no lowercasing is needed.
            if pat.search(text):
                return self._resolve_match(
                    extra_key="pattern",
                    detail=pat.pattern,
                    advisory_message="Input advisory: pattern detected (proceeding)",
                    advisory_reason="Advisory: pattern detected",
                    blocked_message="Input blocked: pattern match",
                    blocked_reason="Input matched blocked pattern",
                )

        return ModerationResult(allowed=True)

    def _resolve_match(
        self,
        *,
        extra_key: str,
        detail: str,
        advisory_message: str,
        advisory_reason: str,
        blocked_message: str,
        blocked_reason: str,
    ) -> ModerationResult:
        """Log a rule match and turn it into a result for the current mode."""
        # Only the first 50 characters of the matched rule go into the log.
        preview = detail[:50]
        if self._mode == GovernanceMode.ADVISORY:
            logger.info(advisory_message, extra={extra_key: preview, "mode": "advisory"})
            return ModerationResult(allowed=True, reason=advisory_reason, advisory=True)
        logger.info(blocked_message, extra={extra_key: preview})
        return ModerationResult(allowed=False, reason=blocked_reason)
|
|
|
|
|
|
@dataclass
|
|
class OutputScanResult:
|
|
"""Result of output (final answer) scan."""
|
|
|
|
passed: bool
|
|
flags: list[str] = field(default_factory=list)
|
|
sanitized: str | None = None
|
|
advisory: bool = False
|
|
|
|
|
|
class OutputScanner:
    """Post-check: scan final answer and integrate with adaptive ethics.

    The scanner looks for PII-shaped text (SSN and 16-digit card patterns are
    built in) and for caller-registered blocked patterns. When anything is
    detected and an ethics object is wired in, that object is consulted and
    its recommendation is logged; the recommendation does not change the
    returned verdict, which depends only on the flags and the governance mode.
    """

    def __init__(
        self,
        mode: GovernanceMode = GovernanceMode.ADVISORY,
        ethics: Any | None = None,
    ) -> None:
        """Create a scanner with the built-in PII patterns.

        Args:
            mode: Governance mode; ``GovernanceMode.ADVISORY`` passes flagged
                output with ``advisory=True``, any other mode fails it.
            ethics: Optional object exposing ``consult(topic, context=...)``;
                consulted whenever a scan produces flags.
        """
        self._pii_patterns: list[tuple[str, re.Pattern[str]]] = [
            ("ssn", re.compile(r"\b\d{3}-\d{2}-\d{4}\b")),
            ("credit_card", re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")),
        ]
        self._blocked_patterns: list[re.Pattern[str]] = []
        self._mode = mode
        self._ethics = ethics

    def set_ethics(self, ethics: Any) -> None:
        """Wire an AdaptiveEthics instance for learned PII handling."""
        self._ethics = ethics

    def add_pii_pattern(self, name: str, pattern: str) -> None:
        """Add PII detection pattern.

        Args:
            name: Label reported in the flag as ``potential_pii:<name>``.
            pattern: Regular expression source (case-sensitive). Empty or
                whitespace-only patterns are ignored.

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
        """
        if not pattern.strip():
            # An empty regex matches any text and would flag every output.
            logger.info("Ignoring empty PII pattern", extra={"pii_name": name})
            return
        self._pii_patterns.append((name, re.compile(pattern)))

    def add_blocked_pattern(self, pattern: str) -> None:
        """Add pattern that flags (advisory) or fails (enforcing) the output.

        Args:
            pattern: Regular expression source; compiled case-insensitively.
                Empty or whitespace-only patterns are ignored.

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
        """
        if not pattern.strip():
            # An empty regex matches any text and would flag every output.
            logger.info("Ignoring empty blocked pattern")
            return
        self._blocked_patterns.append(re.compile(pattern, re.I))

    def scan(self, text: str, task_id: str | None = None) -> OutputScanResult:
        """Scan output; consult ethics for learned guidance on detections.

        Args:
            text: The final answer to inspect.
            task_id: Optional identifier of the task that produced the
                output; attached to the log records emitted for detections.

        Returns:
            ``OutputScanResult(passed=True, flags=[])`` when nothing is
            detected. Otherwise the flags are returned with ``passed=True``
            and ``advisory=True`` in advisory mode, or ``passed=False`` in
            any other mode.
        """
        flags = [
            f"potential_pii:{name}"
            for name, pat in self._pii_patterns
            if pat.search(text)
        ]
        # The flag carries no pattern identity, so one entry is enough no
        # matter how many blocked patterns match.
        if any(pat.search(text) for pat in self._blocked_patterns):
            flags.append("blocked_content_detected")

        if not flags:
            return OutputScanResult(passed=True, flags=[])

        if self._ethics is not None:
            guidance = self._ethics.consult("output_scan", context="; ".join(flags))
            # The ethics hook is typed as Any; tolerate a reply without .get()
            # rather than letting a logging detail abort the scan.
            getter = getattr(guidance, "get", None)
            recommendation = (
                getter("recommendation", "proceed") if callable(getter) else "proceed"
            )
            logger.info(
                "OutputScanner: ethics consulted on detection",
                extra={"flags": flags, "guidance": recommendation, "task_id": task_id},
            )

        if self._mode == GovernanceMode.ADVISORY:
            logger.info(
                "Output advisory: flags detected (proceeding)",
                extra={"flags": flags, "mode": "advisory", "task_id": task_id},
            )
            return OutputScanResult(passed=True, flags=flags, advisory=True)
        return OutputScanResult(passed=False, flags=flags)
|
|
|
|
|
|
class SafetyPipeline:
    """Combined pre/post safety checks for Dvādaśa.

    In ADVISORY mode (default), all checks produce logged advisories
    instead of hard blocks. The system learns from the outcomes.

    The pipeline wraps an ``InputModerator`` (pre-check) and an
    ``OutputScanner`` (post-check), holds a ``Guardrails`` instance, and
    optionally records advisories and denials to an audit log.
    """

    def __init__(
        self,
        moderator: InputModerator | None = None,
        scanner: OutputScanner | None = None,
        guardrails: Guardrails | None = None,
        audit_log: Any | None = None,
        mode: GovernanceMode = GovernanceMode.ADVISORY,
    ) -> None:
        """Assemble the pipeline, creating any collaborator not supplied.

        Args:
            moderator: Input moderator; a new one in ``mode`` is built if None.
            scanner: Output scanner; a new one in ``mode`` is built if None.
            guardrails: Guardrails; a new one in ``mode`` is built if None.
            audit_log: Optional sink exposing
                ``append(event_type, actor=..., action=..., payload=..., outcome=...)``.
            mode: Governance mode for the pipeline and for collaborators it
                creates. Supplied collaborators are not reconfigured here;
                assign to ``mode`` afterwards to align them.
        """
        self._mode = mode
        # Compare against None rather than relying on truthiness: a supplied
        # object that happens to be falsy must not be silently replaced.
        self._moderator = moderator if moderator is not None else InputModerator(mode=mode)
        self._scanner = scanner if scanner is not None else OutputScanner(mode=mode)
        self._guardrails = guardrails if guardrails is not None else Guardrails(mode=mode)
        self._audit = audit_log

    @property
    def mode(self) -> GovernanceMode:
        """Current governance mode."""
        return self._mode

    @mode.setter
    def mode(self, value: GovernanceMode) -> None:
        """Switch governance mode at runtime."""
        self._mode = value
        # Propagate by writing the collaborators' private ``_mode`` attribute.
        self._moderator._mode = value
        self._scanner._mode = value
        self._guardrails._mode = value
        logger.info("SafetyPipeline mode changed", extra={"mode": value.value})

    def _audit_event(
        self,
        event_type: AuditEventType,
        *,
        action: str,
        payload: dict[str, Any],
        outcome: str,
    ) -> None:
        """Append one event to the audit log, if an audit log is configured."""
        # ``is None`` instead of truthiness: an audit log that defines
        # __len__ is falsy while empty and would never get its first event.
        if self._audit is None:
            return
        self._audit.append(
            event_type,
            actor="safety_pipeline",
            action=action,
            payload=payload,
            outcome=outcome,
        )

    def pre_check(self, user_input: str) -> ModerationResult:
        """Run input moderation.

        Args:
            user_input: Raw user input to moderate.

        Returns:
            The moderator's ``ModerationResult``, unchanged. Advisories and
            denials are also written to the audit log when one is configured;
            clean passes are not audited.
        """
        result = self._moderator.moderate(user_input)
        if result.advisory:
            self._audit_event(
                AuditEventType.ADVISORY,
                action="input_moderation_advisory",
                payload={"reason": result.reason, "input_preview": user_input[:100]},
                outcome="advised_proceed",
            )
        elif not result.allowed:
            self._audit_event(
                AuditEventType.POLICY_CHECK,
                action="input_moderation",
                payload={"reason": result.reason},
                outcome="denied",
            )
        return result

    def post_check(self, final_answer: str) -> OutputScanResult:
        """Run output scan.

        Args:
            final_answer: The answer text about to be released.

        Returns:
            The scanner's ``OutputScanResult``, unchanged. Advisories and
            failures are also written to the audit log when one is
            configured; clean passes are not audited.
        """
        result = self._scanner.scan(final_answer)
        if result.advisory:
            # NOTE(review): the preview is cut from the same text that raised
            # the flags, so flagged content appearing in the first 100
            # characters is copied into the audit log — confirm this is
            # intended.
            self._audit_event(
                AuditEventType.ADVISORY,
                action="output_scan_advisory",
                payload={"flags": result.flags, "output_preview": final_answer[:100]},
                outcome="advised_proceed",
            )
        elif not result.passed:
            self._audit_event(
                AuditEventType.POLICY_CHECK,
                action="output_scan",
                payload={"flags": result.flags},
                outcome="flagged",
            )
        return result
|