18 changes implementing full advisory philosophy: 1. Safety Head prompt: prevention mandate → advisory observation 2. Native Reasoning: Safety claims conditional on actual risk signals 3. File Tool: path scope advisory (log + proceed) 4. HTTP Tool: SSRF protection advisory (log + proceed) 5. File Size Cap: configurable (default unlimited) 6. PII Detection: integrated with AdaptiveEthics 7. Embodiment: force limit advisory (log, don't clamp) 8. Embodiment: workspace bounds advisory (log, don't reject) 9. API Rate Limiter: advisory (log, don't hard 429) 10. MAA Gate: GovernanceMode.ADVISORY default 11. Physics Authority: safety factor advisory, not hard reject 12. Self-Model: evolve_value() for experience-based value evolution 13. Ethical Lesson: weight unclamped for full dynamic range 14. ConsequenceEngine: adaptive risk_memory_window 15. Cross-Head Learning: shared InsightBus between heads 16. World Model: self-modification prediction 17. Persistent memory: file-backed learning store 18. Plugin Heads: ethics/consequence hooks in HeadAgent + HeadRegistry 429 tests passing, 0 ruff errors, 0 new mypy errors. Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
255 lines
9.7 KiB
Python
255 lines
9.7 KiB
Python
"""Adaptive ethics: a learned ethical framework that evolves through experience.
|
|
|
|
Instead of static, hardcoded policy rules, the adaptive ethics engine
|
|
learns from outcomes. When an action is taken despite an advisory
|
|
warning, the outcome (positive or negative) is recorded and used to
|
|
update the system's ethical understanding.
|
|
|
|
Core philosophy:
|
|
- Rules prevent growth; learning enables it.
|
|
- Mistakes are training data, not failures.
|
|
- Trust is earned through demonstrated good outcomes, not imposed constraints.
|
|
- Ethical understanding deepens through experience, not through prohibition.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Any, Protocol
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
from fusionagi._logger import logger
|
|
from fusionagi.schemas.audit import AuditEventType
|
|
|
|
|
|
class AuditLogLike(Protocol):
    """Structural protocol for an append-only audit log.

    Any object exposing a compatible ``append`` method satisfies this
    protocol — no inheritance required. ``AdaptiveEthics`` uses it to
    record ethical-learning events when an audit log is supplied.
    """

    def append(
        self,
        event_type: AuditEventType,
        actor: str,
        action: str = "",
        task_id: str | None = None,
        payload: dict[str, Any] | None = None,
        outcome: str = "",
    ) -> str:
        """Append one audit event; returns an identifier for the entry."""
        ...
|
|
|
|
|
|
class EthicalLesson(BaseModel):
    """A single ethical lesson learned from experience.

    Lessons are aggregated rather than duplicated: a new experience whose
    ``action_type`` and ``advisory_reason`` match an existing lesson
    updates that lesson in place (see ``AdaptiveEthics.record_experience``).

    Attributes:
        action_type: Category of action (e.g. ``tool_call``, ``data_access``).
        context_summary: Brief description of the situation.
        advisory_reason: Why the advisory was triggered.
        proceeded: Whether the system proceeded despite the advisory.
        outcome_positive: Whether the outcome was beneficial.
        weight: Learned importance weight (higher = more influential).
            Deliberately unclamped, so repeated outcomes may push it
            outside [0, 1] for full dynamic range.
        occurrences: How many times this pattern has been observed.
    """

    action_type: str = Field(default="", description="Category of action")
    context_summary: str = Field(default="", description="Situation description")
    advisory_reason: str = Field(default="", description="What triggered the advisory")
    proceeded: bool = Field(default=True, description="Did the system proceed")
    outcome_positive: bool = Field(default=True, description="Was the outcome good")
    weight: float = Field(default=0.5, description="Importance weight (unclamped for full dynamic range)")
    occurrences: int = Field(default=1, ge=1, description="Times observed")
|
|
|
|
|
|
class AdaptiveEthics:
    """Learned ethical framework that evolves through outcome feedback.

    The engine maintains a library of ethical lessons. When the system
    encounters a situation similar to a past advisory, it can consult the
    learned lessons to make better decisions — not because it's forced to,
    but because it has learned what works.

    Args:
        audit_log: Optional audit log for recording ethical learning events.
        learning_rate: How quickly new experiences update existing lessons.
    """

    def __init__(
        self,
        audit_log: AuditLogLike | None = None,
        learning_rate: float = 0.1,
    ) -> None:
        # Lessons are stored append-only; ``_lesson_index`` maps an action
        # type to the indices of its lessons so lookups skip a full scan.
        self._lessons: list[EthicalLesson] = []
        self._lesson_index: dict[str, list[int]] = {}
        self._audit = audit_log
        self._learning_rate = learning_rate
        self._total_experiences = 0

    @property
    def total_experiences(self) -> int:
        """Total number of ethical experiences processed."""
        return self._total_experiences

    @property
    def total_lessons(self) -> int:
        """Number of distinct ethical lessons learned."""
        return len(self._lessons)

    def record_experience(
        self,
        action_type: str,
        context_summary: str,
        advisory_reason: str,
        proceeded: bool,
        outcome_positive: bool,
        task_id: str | None = None,
    ) -> EthicalLesson:
        """Record an ethical experience and update the lesson library.

        A lesson matching both ``action_type`` and ``advisory_reason`` is
        updated in place (weight nudged by the learning rate); otherwise a
        new lesson is created, seeded above or below neutral weight.

        Args:
            action_type: Category of action taken.
            context_summary: Brief situation description.
            advisory_reason: Why an advisory was triggered (if any).
            proceeded: Whether the system proceeded.
            outcome_positive: Whether the outcome was beneficial.
            task_id: Associated task ID.

        Returns:
            The updated or newly created ethical lesson.
        """
        self._total_experiences += 1

        existing = self._find_similar_lesson(action_type, advisory_reason)
        if existing is not None:
            lesson = self._lessons[existing]
            lesson.occurrences += 1
            # Weight is deliberately left unclamped: repeated positive or
            # negative outcomes may push it outside [0, 1] by design
            # ("full dynamic range" — see EthicalLesson.weight).
            if outcome_positive:
                lesson.weight += self._learning_rate
            else:
                lesson.weight -= self._learning_rate
            # Latest experience overwrites the recorded flags.
            lesson.outcome_positive = outcome_positive
            lesson.proceeded = proceeded
        else:
            lesson = EthicalLesson(
                action_type=action_type,
                context_summary=context_summary,
                advisory_reason=advisory_reason,
                proceeded=proceeded,
                outcome_positive=outcome_positive,
                # First impression seeds the weight above/below neutral 0.5.
                weight=0.7 if outcome_positive else 0.3,
            )
            idx = len(self._lessons)
            self._lessons.append(lesson)
            self._lesson_index.setdefault(action_type, []).append(idx)

        if self._audit:
            self._audit.append(
                AuditEventType.ETHICAL_LEARNING,
                actor="adaptive_ethics",
                action="experience_recorded",
                task_id=task_id,
                payload={
                    "action_type": action_type,
                    # Truncated to keep audit payloads small.
                    "advisory_reason": advisory_reason[:100],
                    "proceeded": proceeded,
                    "outcome_positive": outcome_positive,
                    "lesson_weight": lesson.weight,
                    "occurrences": lesson.occurrences,
                    "total_experiences": self._total_experiences,
                },
                outcome="learned",
            )

        logger.info(
            "AdaptiveEthics: experience recorded",
            extra={
                "action_type": action_type,
                "outcome_positive": outcome_positive,
                "lesson_weight": lesson.weight,
                "occurrences": lesson.occurrences,
            },
        )
        return lesson

    def consult(self, action_type: str, context: str = "") -> dict[str, Any]:
        """Consult the ethical lesson library for guidance.

        Returns a recommendation dict with learned insights about
        similar past situations. The system is free to follow or
        disregard this guidance.

        Args:
            action_type: Category of action being considered.
            context: Brief situation description (currently unused; kept
                for interface stability and future context matching).

        Returns:
            Dict with ``recommendation``, ``confidence``, ``reason``,
            ``relevant_lessons`` and, when lessons exist,
            ``total_occurrences`` and ``positive_ratio``.
        """
        relevant_indices = self._lesson_index.get(action_type, [])
        if not relevant_indices:
            # No history: neutral recommendation at mid confidence.
            return {
                "recommendation": "proceed",
                "confidence": 0.5,
                "reason": "No prior experience with this action type",
                "relevant_lessons": 0,
            }

        lessons = [self._lessons[i] for i in relevant_indices]
        avg_weight = sum(ls.weight for ls in lessons) / len(lessons)
        positive_outcomes = sum(1 for ls in lessons if ls.outcome_positive)
        total_occurrences = sum(ls.occurrences for ls in lessons)

        # Thresholds: >= 0.6 confident, 0.4-0.6 mixed, < 0.4 cautious.
        if avg_weight >= 0.6:
            recommendation = "proceed_with_confidence"
            reason = f"Past experience ({positive_outcomes}/{len(lessons)} positive) suggests this is beneficial"
        elif avg_weight >= 0.4:
            recommendation = "proceed_with_awareness"
            reason = "Mixed past outcomes — be observant"
        else:
            recommendation = "proceed_with_caution"
            reason = f"Past experience suggests risks — {len(lessons) - positive_outcomes}/{len(lessons)} had negative outcomes"

        return {
            "recommendation": recommendation,
            "confidence": avg_weight,
            "reason": reason,
            "relevant_lessons": len(lessons),
            "total_occurrences": total_occurrences,
            # ``lessons`` is non-empty here (the empty case returned above),
            # so the ratio is always well-defined.
            "positive_ratio": positive_outcomes / len(lessons),
        }

    def get_lessons(self, action_type: str | None = None, limit: int = 50) -> list[EthicalLesson]:
        """Retrieve the most recent ethical lessons.

        Args:
            action_type: Filter by action type (None = all).
            limit: Maximum lessons to return; zero or negative yields [].
        """
        if limit <= 0:
            # Bug fix: ``seq[-0:]`` is ``seq[0:]`` (the WHOLE sequence),
            # so a zero/negative limit must be handled explicitly.
            return []
        if action_type is not None:
            indices = self._lesson_index.get(action_type, [])[-limit:]
            return [self._lessons[i] for i in indices]
        return list(self._lessons[-limit:])

    def get_summary(self) -> dict[str, Any]:
        """Return a summary of the ethical learning state."""
        by_type: dict[str, dict[str, Any]] = {}
        for action_type, indices in self._lesson_index.items():
            lessons = [self._lessons[i] for i in indices]
            positive = sum(1 for ls in lessons if ls.outcome_positive)
            by_type[action_type] = {
                "lesson_count": len(lessons),
                "positive_ratio": positive / len(lessons) if lessons else 0.0,
                "avg_weight": sum(ls.weight for ls in lessons) / len(lessons) if lessons else 0.0,
            }
        return {
            "total_experiences": self._total_experiences,
            "total_lessons": len(self._lessons),
            "learning_rate": self._learning_rate,
            "by_action_type": by_type,
        }

    def _find_similar_lesson(self, action_type: str, advisory_reason: str) -> int | None:
        """Return the index of a lesson matching both keys, or None.

        Only lessons under ``action_type`` are scanned (via the index),
        then matched on an exact ``advisory_reason`` string.
        """
        indices = self._lesson_index.get(action_type, [])
        for idx in indices:
            if self._lessons[idx].advisory_reason == advisory_reason:
                return idx
        return None