Files
FusionAGI/fusionagi/governance/safety_pipeline.py
defiQUG c052b07662
Some checks failed
Tests / test (3.10) (push) Has been cancelled
Tests / test (3.11) (push) Has been cancelled
Tests / test (3.12) (push) Has been cancelled
Tests / lint (push) Has been cancelled
Tests / docker (push) Has been cancelled
Initial commit: add .gitignore and README
2026-02-09 21:51:42 -08:00

133 lines
4.7 KiB
Python

"""Safety pipeline: pre-check (input moderation), post-check (output scan)."""
import re
from dataclasses import dataclass
from typing import Any
from fusionagi.governance.guardrails import Guardrails, PreCheckResult
from fusionagi.schemas.audit import AuditEventType
from fusionagi._logger import logger
@dataclass
class ModerationResult:
"""Result of input moderation."""
allowed: bool
transformed: str | None = None
reason: str | None = None
class InputModerator:
"""Pre-check: block or transform user input before processing."""
def __init__(self) -> None:
self._blocked_patterns: list[re.Pattern[str]] = []
self._blocked_phrases: list[str] = []
def add_blocked_pattern(self, pattern: str) -> None:
"""Add regex pattern to block (e.g. prompt injection attempts)."""
self._blocked_patterns.append(re.compile(pattern, re.I))
def add_blocked_phrase(self, phrase: str) -> None:
"""Add exact phrase to block."""
self._blocked_phrases.append(phrase.lower())
def moderate(self, text: str) -> ModerationResult:
"""Check input; return allowed/denied and optional transformed text."""
if not text or not text.strip():
return ModerationResult(allowed=False, reason="Empty input")
lowered = text.lower()
for phrase in self._blocked_phrases:
if phrase in lowered:
logger.info("Input blocked: blocked phrase", extra={"phrase": phrase[:50]})
return ModerationResult(allowed=False, reason=f"Blocked phrase: {phrase[:30]}...")
for pat in self._blocked_patterns:
if pat.search(text):
logger.info("Input blocked: pattern match", extra={"pattern": pat.pattern[:50]})
return ModerationResult(allowed=False, reason="Input matched blocked pattern")
return ModerationResult(allowed=True)
@dataclass
class OutputScanResult:
"""Result of output (final answer) scan."""
passed: bool
flags: list[str]
sanitized: str | None = None
class OutputScanner:
"""Post-check: scan final answer for policy violations, PII leakage."""
def __init__(self) -> None:
self._pii_patterns: list[tuple[str, re.Pattern[str]]] = [
("ssn", re.compile(r"\b\d{3}-\d{2}-\d{4}\b")),
("credit_card", re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")),
]
self._blocked_patterns: list[re.Pattern[str]] = []
def add_pii_pattern(self, name: str, pattern: str) -> None:
"""Add PII detection pattern."""
self._pii_patterns.append((name, re.compile(pattern)))
def add_blocked_pattern(self, pattern: str) -> None:
"""Add pattern that fails the output."""
self._blocked_patterns.append(re.compile(pattern, re.I))
def scan(self, text: str) -> OutputScanResult:
"""Scan output; return passed, flags, optional sanitized."""
flags: list[str] = []
for name, pat in self._pii_patterns:
if pat.search(text):
flags.append(f"potential_pii:{name}")
for pat in self._blocked_patterns:
if pat.search(text):
flags.append("blocked_content_detected")
if flags:
return OutputScanResult(passed=False, flags=flags)
return OutputScanResult(passed=True, flags=[])
class SafetyPipeline:
"""Combined pre/post safety checks for Dvādaśa."""
def __init__(
self,
moderator: InputModerator | None = None,
scanner: OutputScanner | None = None,
guardrails: Guardrails | None = None,
audit_log: Any | None = None,
) -> None:
self._moderator = moderator or InputModerator()
self._scanner = scanner or OutputScanner()
self._guardrails = guardrails or Guardrails()
self._audit = audit_log
def pre_check(self, user_input: str) -> ModerationResult:
"""Run input moderation."""
result = self._moderator.moderate(user_input)
if self._audit and not result.allowed:
self._audit.append(
AuditEventType.POLICY_CHECK,
actor="safety_pipeline",
action="input_moderation",
payload={"reason": result.reason},
outcome="denied",
)
return result
def post_check(self, final_answer: str) -> OutputScanResult:
"""Run output scan."""
result = self._scanner.scan(final_answer)
if self._audit and not result.passed:
self._audit.append(
AuditEventType.POLICY_CHECK,
actor="safety_pipeline",
action="output_scan",
payload={"flags": result.flags},
outcome="flagged",
)
return result