FusionAGI/tests/test_metacognition.py
Devin AI 9a8affae9a
feat: consequence engine, causal world model, metacognition, interpretability, claim verification
Choice → Consequence → Learning:
- ConsequenceEngine tracks every decision point with alternatives,
  risk/reward estimates, and actual outcomes
- Consequences feed into AdaptiveEthics for experience-based learning
- FusionAGILoop now wires ethics + consequences into task lifecycle
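
A minimal usage sketch of the intended decision lifecycle (the method and
parameter names below are illustrative, not the final API):

    engine = ConsequenceEngine()
    decision_id = engine.record_decision(
        chosen="retry_with_backoff",
        alternatives=["fail_fast", "escalate"],
        risk_estimate=0.2,
        reward_estimate=0.7,
    )
    # once the task resolves, close the loop so AdaptiveEthics
    # can learn from the realized outcome
    engine.record_outcome(decision_id, success=True)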

Causal World Model:
- CausalWorldModel learns state-transition patterns from execution history
- Predicts outcomes based on observed action→effect patterns
- Uncertainty estimates decrease as more evidence accumulates
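
Illustrative shape of the learning loop (method names here are a sketch of
the behavior described above, not the exact signatures):

    model = CausalWorldModel()
    for _ in range(10):
        model.observe(state="queue_full", action="add_worker", effect="queue_drained")
    prediction = model.predict(state="queue_full", action="add_worker")
    # after ten consistent observations, the predicted effect should be
    # "queue_drained" with a correspondingly low uncertainty estimate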

Metacognition:
- assess_head_outputs() evaluates reasoning quality from head outputs
- Detects knowledge gaps, measures head agreement, identifies uncertainty
- Actively recommends whether to seek more information
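
How an orchestrator might consume the assessment (the control flow is a
sketch; the fields are the ones exercised by the tests in this file):

    assessment = assess_head_outputs(outputs, user_prompt=prompt)
    if assessment.should_seek_more:
        # e.g. dispatch another research pass aimed at the reported gaps
        gap_domains = [gap.domain for gap in assessment.knowledge_gaps]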

Interpretability:
- ReasoningTracer captures full prompt→answer reasoning traces
- Each step records stage, component, input/output, timing
- explain() generates human-readable reasoning explanations

Claim Verification:
- ClaimVerifier cross-checks claims for evidence, consistency, grounding
- Flags high-confidence claims lacking evidence support
- Detects contradictions between claims from different heads
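
Inspecting a verification report (the fields are those exercised by the
tests in this file; the printout is illustrative):

    verifier = ClaimVerifier()
    report = verifier.verify_outputs(outputs)
    if report.flagged_count:
        for result in report.results:
            for issue in result.issues:
                print(issue)  # e.g. high confidence with no supporting evidence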

325 tests passing, 0 ruff errors, 0 mypy errors.

Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
2026-04-28 06:25:35 +00:00

"""Tests for metacognition and reasoning interpretability."""
from fusionagi.reasoning.interpretability import ReasoningTracer
from fusionagi.reasoning.metacognition import (
assess_head_outputs,
)
from fusionagi.schemas.grounding import Citation
from fusionagi.schemas.head import HeadClaim, HeadId, HeadOutput
from fusionagi.verification import ClaimVerifier
_SAMPLE_CITATION = Citation(source_id="src_1", excerpt="supporting evidence")
def _make_head_output(
head_id: HeadId,
claims: list[tuple[str, float]] | None = None,
) -> HeadOutput:
"""Helper to create a head output with claims."""
head_claims = []
for text, conf in (claims or [("Test claim", 0.7)]):
head_claims.append(HeadClaim(
claim_text=text,
confidence=conf,
evidence=[_SAMPLE_CITATION] if conf > 0.5 else [],
))
return HeadOutput(
head_id=head_id,
summary=f"Output from {head_id.value}",
claims=head_claims,
risks=[],
)
class TestMetacognition:
    """Test metacognitive self-assessment."""

    def test_empty_outputs(self) -> None:
        assessment = assess_head_outputs([])
        assert assessment.overall_confidence == 0.0
        assert assessment.should_seek_more is True

    def test_high_confidence_outputs(self) -> None:
        outputs = [
            _make_head_output(HeadId.LOGIC, [("Logic is sound", 0.9)]),
            _make_head_output(HeadId.RESEARCH, [("Data supports this", 0.85)]),
        ]
        assessment = assess_head_outputs(outputs)
        assert assessment.overall_confidence > 0.3
        assert isinstance(assessment.knowledge_gaps, list)

    def test_low_confidence_triggers_seek_more(self) -> None:
        outputs = [
            _make_head_output(HeadId.LOGIC, [("Uncertain claim", 0.1)]),
        ]
        assessment = assess_head_outputs(outputs)
        assert assessment.should_seek_more is True
        assert len(assessment.uncertainty_sources) > 0

    def test_knowledge_gap_detection(self) -> None:
        outputs = [
            _make_head_output(HeadId.LOGIC, [("Low conf claim", 0.1)]),
        ]
        assessment = assess_head_outputs(outputs)
        gap_domains = [g.domain for g in assessment.knowledge_gaps]
        assert "logic" in gap_domains

    def test_domain_gap_detection(self) -> None:
        outputs = [_make_head_output(HeadId.LOGIC)]
        assessment = assess_head_outputs(outputs, user_prompt="legal compliance required")
        gap_domains = [g.domain for g in assessment.knowledge_gaps]
        assert "legal" in gap_domains
class TestReasoningTracer:
    """Test interpretability tracing."""

    def test_trace_lifecycle(self) -> None:
        tracer = ReasoningTracer()
        tracer.start_trace("t1", "task1", "What is 2+2?")
        tracer.add_step("t1", "decomposition", "decomposer", "prompt", "2 units")
        tracer.add_step("t1", "head_dispatch", "orchestrator", "5 heads", "5 outputs")
        tracer.finalize_trace("t1", "4", 0.95)
        result = tracer.get_trace("t1")
        assert result is not None
        assert len(result.steps) == 2
        assert result.final_answer == "4"
        assert result.overall_confidence == 0.95

    def test_explain(self) -> None:
        tracer = ReasoningTracer()
        tracer.start_trace("t1", "task1", "question")
        tracer.add_step("t1", "stage1", "comp1", "in", "out")
        tracer.finalize_trace("t1", "answer", 0.8)
        explanation = tracer.explain("t1")
        assert "stage1" in explanation
        assert "answer" in explanation

    def test_trace_not_found(self) -> None:
        tracer = ReasoningTracer()
        assert tracer.get_trace("nonexistent") is None
        assert "not found" in tracer.explain("nonexistent")

    def test_recent_traces(self) -> None:
        tracer = ReasoningTracer()
        for i in range(5):
            tracer.start_trace(f"t{i}", f"task{i}", f"prompt{i}")
        assert len(tracer.get_recent_traces(limit=3)) == 3
        assert tracer.total_traces == 5
class TestClaimVerifier:
    """Test formal claim verification."""

    def test_verify_no_outputs(self) -> None:
        verifier = ClaimVerifier()
        report = verifier.verify_outputs([])
        assert report.total_claims == 0

    def test_verify_well_supported_claims(self) -> None:
        outputs = [
            _make_head_output(HeadId.LOGIC, [("Well supported", 0.7)]),
            _make_head_output(HeadId.RESEARCH, [("Also supported", 0.7)]),
        ]
        verifier = ClaimVerifier()
        report = verifier.verify_outputs(outputs)
        assert report.total_claims == 2
        assert report.overall_integrity > 0.0

    def test_high_conf_no_evidence_flagged(self) -> None:
        claim = HeadClaim(claim_text="Bold claim", confidence=0.95, evidence=[])
        output = HeadOutput(
            head_id=HeadId.LOGIC,
            summary="Bold output",
            claims=[claim],
            risks=[],
        )
        verifier = ClaimVerifier()
        report = verifier.verify_outputs([output])
        assert report.flagged_count >= 1
        assert any("evidence" in issue.lower() for r in report.results for issue in r.issues)