Some checks failed
- Integrate GPU scoring inline into reasoning/multi_path.py (auto-uses GPU when available) - Integrate GPU deduplication into multi_agent/consensus_engine.py - Add semantic_search() method to memory/semantic_graph.py with GPU acceleration - Integrate GPU training into self_improvement/training.py AutoTrainer - Fix all 758 ruff lint issues (whitespace, import sorting, unused imports, ambiguous vars, undefined names) - Fix all 40 mypy type errors across the codebase (no-any-return, union-attr, arg-type, etc.) - Fix deprecated ruff config keys (select/ignore -> [tool.ruff.lint]) - Add .dockerignore to exclude .venv/, tests/, docs/ from Docker builds - Add type hints and docstrings to verification/outcome.py - Fix E402 import ordering in witness_agent.py - Fix F821 undefined names in vector_pgvector.py and native.py - Fix E741 ambiguous variable names in reflective.py and recommender.py All 276 tests pass. 0 ruff errors. 0 mypy errors. Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
133 lines
4.7 KiB
Python
133 lines
4.7 KiB
Python
"""Safety pipeline: pre-check (input moderation), post-check (output scan)."""
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import Any
|
|
|
|
from fusionagi._logger import logger
|
|
from fusionagi.governance.guardrails import Guardrails
|
|
from fusionagi.schemas.audit import AuditEventType
|
|
|
|
|
|
@dataclass
|
|
class ModerationResult:
|
|
"""Result of input moderation."""
|
|
|
|
allowed: bool
|
|
transformed: str | None = None
|
|
reason: str | None = None
|
|
|
|
|
|
class InputModerator:
    """Pre-check: block user input before processing.

    Input is rejected when it is empty, contains a registered phrase
    (case-insensitive substring match), or matches a registered regex
    (case-insensitive search). This class never fills in
    ``ModerationResult.transformed``.
    """

    def __init__(self) -> None:
        """Create a moderator with no blocked patterns or phrases."""
        self._blocked_patterns: list[re.Pattern[str]] = []
        self._blocked_phrases: list[str] = []

    def add_blocked_pattern(self, pattern: str) -> None:
        """Add regex pattern to block (e.g. prompt injection attempts).

        The pattern is compiled with ``re.I`` and later applied with
        ``search``, so it may match anywhere in the input.

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
        """
        self._blocked_patterns.append(re.compile(pattern, re.I))

    def add_blocked_phrase(self, phrase: str) -> None:
        """Add a phrase to block (case-insensitive substring match).

        Blank phrases (empty or whitespace-only) are ignored: the empty
        string is a substring of every string, so registering one would
        reject all input.
        """
        if not phrase.strip():
            logger.warning("Ignoring blank blocked phrase")
            return
        # Stored lower-cased; moderate() lower-cases the input to compare.
        self._blocked_phrases.append(phrase.lower())

    def moderate(self, text: str) -> ModerationResult:
        """Check input and report whether it is allowed.

        Args:
            text: Raw user input.

        Returns:
            A ``ModerationResult`` with ``allowed=False`` and a ``reason``
            when the input is empty/whitespace-only, contains a blocked
            phrase, or matches a blocked pattern; otherwise
            ``allowed=True``. Phrases are checked before patterns and the
            first hit wins.
        """
        if not text or not text.strip():
            return ModerationResult(allowed=False, reason="Empty input")

        lowered = text.lower()
        for phrase in self._blocked_phrases:
            if phrase in lowered:
                logger.info("Input blocked: blocked phrase", extra={"phrase": phrase[:50]})
                return ModerationResult(allowed=False, reason=f"Blocked phrase: {phrase[:30]}...")

        for pat in self._blocked_patterns:
            if pat.search(text):
                logger.info("Input blocked: pattern match", extra={"pattern": pat.pattern[:50]})
                return ModerationResult(allowed=False, reason="Input matched blocked pattern")

        return ModerationResult(allowed=True)
|
|
|
|
|
|
@dataclass
|
|
class OutputScanResult:
|
|
"""Result of output (final answer) scan."""
|
|
|
|
passed: bool
|
|
flags: list[str]
|
|
sanitized: str | None = None
|
|
|
|
|
|
class OutputScanner:
    """Post-check: scan final answer for policy violations, PII leakage.

    Ships with two built-in PII detectors (``ssn`` and ``credit_card``);
    more detectors and blocked-content patterns can be registered at
    runtime.
    """

    def __init__(self) -> None:
        """Create a scanner with the built-in PII patterns and no blocked patterns."""
        self._pii_patterns: list[tuple[str, re.Pattern[str]]] = [
            ("ssn", re.compile(r"\b\d{3}-\d{2}-\d{4}\b")),
            ("credit_card", re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")),
        ]
        self._blocked_patterns: list[re.Pattern[str]] = []

    def add_pii_pattern(self, name: str, pattern: str) -> None:
        """Add PII detection pattern.

        Args:
            name: Label reported as ``potential_pii:<name>`` on a match.
            pattern: Regular expression, compiled case-sensitively.

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
        """
        self._pii_patterns.append((name, re.compile(pattern)))

    def add_blocked_pattern(self, pattern: str) -> None:
        """Add pattern that fails the output (compiled with ``re.I``).

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
        """
        self._blocked_patterns.append(re.compile(pattern, re.I))

    def scan(self, text: str) -> OutputScanResult:
        """Scan output and report any flags raised.

        Args:
            text: The final answer to inspect.

        Returns:
            An ``OutputScanResult`` whose ``flags`` holds one
            ``potential_pii:<name>`` entry per matching PII name, plus a
            single ``blocked_content_detected`` entry if any blocked
            pattern matches. ``passed`` is True only when ``flags`` is
            empty. ``sanitized`` is left at its default (None).
        """
        flags: list[str] = []

        for name, pat in self._pii_patterns:
            flag = f"potential_pii:{name}"
            # Several patterns may share a name; report each name once.
            if flag not in flags and pat.search(text):
                flags.append(flag)

        # The flag carries no per-pattern detail, so one entry is enough
        # no matter how many blocked patterns match.
        if any(pat.search(text) for pat in self._blocked_patterns):
            flags.append("blocked_content_detected")

        return OutputScanResult(passed=not flags, flags=flags)
|
|
|
|
|
|
class SafetyPipeline:
    """Combined pre/post safety checks for Dvādaśa.

    Wraps an ``InputModerator`` (pre-check) and an ``OutputScanner``
    (post-check), and records failed checks to an optional audit log.
    """

    def __init__(
        self,
        moderator: InputModerator | None = None,
        scanner: OutputScanner | None = None,
        guardrails: Guardrails | None = None,
        audit_log: Any | None = None,
    ) -> None:
        """Wire up the pipeline, building defaults for omitted collaborators.

        Args:
            moderator: Input moderator; a fresh ``InputModerator`` if None.
            scanner: Output scanner; a fresh ``OutputScanner`` if None.
            guardrails: Guardrails instance; a fresh ``Guardrails`` if None.
                Stored but not used by ``pre_check``/``post_check``.
            audit_log: Optional sink exposing
                ``append(event_type, actor=..., action=..., payload=...,
                outcome=...)``. No auditing happens when None.
        """
        # Compare against None explicitly: a collaborator that happens to be
        # falsy (e.g. an empty container-like object) must not be replaced.
        self._moderator = moderator if moderator is not None else InputModerator()
        self._scanner = scanner if scanner is not None else OutputScanner()
        self._guardrails = guardrails if guardrails is not None else Guardrails()
        self._audit = audit_log

    def pre_check(self, user_input: str) -> ModerationResult:
        """Run input moderation and audit a denial.

        Args:
            user_input: Raw user input to moderate.

        Returns:
            The moderator's ``ModerationResult``, unchanged.
        """
        result = self._moderator.moderate(user_input)
        # `is not None` rather than truthiness: an audit log that is still
        # empty may evaluate as falsy and would then never get its first entry.
        if self._audit is not None and not result.allowed:
            self._audit.append(
                AuditEventType.POLICY_CHECK,
                actor="safety_pipeline",
                action="input_moderation",
                payload={"reason": result.reason},
                outcome="denied",
            )
        return result

    def post_check(self, final_answer: str) -> OutputScanResult:
        """Run output scan and audit a flagged answer.

        Args:
            final_answer: The text about to be returned to the user.

        Returns:
            The scanner's ``OutputScanResult``, unchanged.
        """
        result = self._scanner.scan(final_answer)
        # Same explicit None check as in pre_check.
        if self._audit is not None and not result.passed:
            self._audit.append(
                AuditEventType.POLICY_CHECK,
                actor="safety_pipeline",
                action="output_scan",
                payload={"flags": result.flags},
                outcome="flagged",
            )
        return result
|