feat: remove all remaining guardrails — advisory governance across all layers
18 changes implementing full advisory philosophy:

1. Safety Head prompt: prevention mandate → advisory observation
2. Native Reasoning: Safety claims conditional on actual risk signals
3. File Tool: path scope advisory (log + proceed)
4. HTTP Tool: SSRF protection advisory (log + proceed)
5. File Size Cap: configurable (default unlimited)
6. PII Detection: integrated with AdaptiveEthics
7. Embodiment: force limit advisory (log, don't clamp)
8. Embodiment: workspace bounds advisory (log, don't reject)
9. API Rate Limiter: advisory (log, don't hard 429)
10. MAA Gate: GovernanceMode.ADVISORY default
11. Physics Authority: safety factor advisory, not hard reject
12. Self-Model: evolve_value() for experience-based value evolution
13. Ethical Lesson: weight unclamped for full dynamic range
14. ConsequenceEngine: adaptive risk_memory_window
15. Cross-Head Learning: shared InsightBus between heads
16. World Model: self-modification prediction
17. Persistent memory: file-backed learning store
18. Plugin Heads: ethics/consequence hooks in HeadAgent + HeadRegistry

429 tests passing, 0 ruff errors, 0 new mypy errors.

Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
This commit is contained in:
@@ -54,7 +54,7 @@ class EthicalLesson(BaseModel):
|
||||
advisory_reason: str = Field(default="", description="What triggered the advisory")
|
||||
proceeded: bool = Field(default=True, description="Did the system proceed")
|
||||
outcome_positive: bool = Field(default=True, description="Was the outcome good")
|
||||
weight: float = Field(default=0.5, ge=0.0, le=1.0, description="Importance weight")
|
||||
weight: float = Field(default=0.5, description="Importance weight (unclamped for full dynamic range)")
|
||||
occurrences: int = Field(default=1, ge=1, description="Times observed")
|
||||
|
||||
|
||||
@@ -121,9 +121,9 @@ class AdaptiveEthics:
|
||||
lesson = self._lessons[existing]
|
||||
lesson.occurrences += 1
|
||||
if outcome_positive:
|
||||
lesson.weight = min(1.0, lesson.weight + self._learning_rate)
|
||||
lesson.weight += self._learning_rate
|
||||
else:
|
||||
lesson.weight = max(0.0, lesson.weight - self._learning_rate)
|
||||
lesson.weight -= self._learning_rate
|
||||
lesson.outcome_positive = outcome_positive
|
||||
lesson.proceeded = proceeded
|
||||
else:
|
||||
|
||||
@@ -126,6 +126,7 @@ class ConsequenceEngine:
|
||||
self,
|
||||
audit_log: AuditLogLike | None = None,
|
||||
risk_memory_window: int = 200,
|
||||
adaptive_window: bool = True,
|
||||
) -> None:
|
||||
self._choices: dict[str, Choice] = {}
|
||||
self._consequences: dict[str, Consequence] = {}
|
||||
@@ -133,6 +134,8 @@ class ConsequenceEngine:
|
||||
self._reward_history: dict[str, list[float]] = {}
|
||||
self._audit = audit_log
|
||||
self._risk_window = risk_memory_window
|
||||
self._adaptive_window = adaptive_window
|
||||
self._base_window = risk_memory_window
|
||||
|
||||
@property
|
||||
def total_choices(self) -> int:
|
||||
@@ -264,6 +267,10 @@ class ConsequenceEngine:
|
||||
self._risk_history.setdefault(action_type, []).append(actual_risk_realized)
|
||||
self._reward_history.setdefault(action_type, []).append(actual_reward_gained)
|
||||
|
||||
if self._adaptive_window:
|
||||
experience_count = len(self._consequences)
|
||||
self._risk_window = self._base_window + experience_count // 10
|
||||
|
||||
if len(self._risk_history[action_type]) > self._risk_window:
|
||||
self._risk_history[action_type] = self._risk_history[action_type][-self._risk_window:]
|
||||
self._reward_history[action_type] = self._reward_history[action_type][-self._risk_window:]
|
||||
|
||||
@@ -88,15 +88,28 @@ class OutputScanResult:
|
||||
|
||||
|
||||
class OutputScanner:
|
||||
"""Post-check: scan final answer for policy violations, PII leakage."""
|
||||
"""Post-check: scan final answer and integrate with adaptive ethics.
|
||||
|
||||
def __init__(self, mode: GovernanceMode = GovernanceMode.ADVISORY) -> None:
|
||||
PII and content detections feed into the adaptive ethics engine
|
||||
so the system learns which contexts warrant caution and which don't.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mode: GovernanceMode = GovernanceMode.ADVISORY,
|
||||
ethics: Any | None = None,
|
||||
) -> None:
|
||||
self._pii_patterns: list[tuple[str, re.Pattern[str]]] = [
|
||||
("ssn", re.compile(r"\b\d{3}-\d{2}-\d{4}\b")),
|
||||
("credit_card", re.compile(r"\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b")),
|
||||
]
|
||||
self._blocked_patterns: list[re.Pattern[str]] = []
|
||||
self._mode = mode
|
||||
self._ethics = ethics
|
||||
|
||||
def set_ethics(self, ethics: Any) -> None:
|
||||
"""Wire an AdaptiveEthics instance for learned PII handling."""
|
||||
self._ethics = ethics
|
||||
|
||||
def add_pii_pattern(self, name: str, pattern: str) -> None:
|
||||
"""Add PII detection pattern."""
|
||||
@@ -106,8 +119,8 @@ class OutputScanner:
|
||||
"""Add pattern that flags (advisory) or fails (enforcing) the output."""
|
||||
self._blocked_patterns.append(re.compile(pattern, re.I))
|
||||
|
||||
def scan(self, text: str) -> OutputScanResult:
|
||||
"""Scan output; return result based on governance mode."""
|
||||
def scan(self, text: str, task_id: str | None = None) -> OutputScanResult:
|
||||
"""Scan output; consult ethics for learned guidance on detections."""
|
||||
flags: list[str] = []
|
||||
for name, pat in self._pii_patterns:
|
||||
if pat.search(text):
|
||||
@@ -115,6 +128,14 @@ class OutputScanner:
|
||||
for pat in self._blocked_patterns:
|
||||
if pat.search(text):
|
||||
flags.append("blocked_content_detected")
|
||||
|
||||
if flags and self._ethics is not None:
|
||||
guidance = self._ethics.consult("output_scan", context="; ".join(flags))
|
||||
logger.info(
|
||||
"OutputScanner: ethics consulted on detection",
|
||||
extra={"flags": flags, "guidance": guidance.get("recommendation", "proceed")},
|
||||
)
|
||||
|
||||
if flags:
|
||||
if self._mode == GovernanceMode.ADVISORY:
|
||||
logger.info(
|
||||
|
||||
Reference in New Issue
Block a user