Items completed: 1. Merged PR #2 (starlette/httpx deps) 2. Fixed async race condition in multimodal_ui.py 3. Wired TTSAdapter (ElevenLabs, Azure) in API routes 4. Moved super_big_brain.py from core/ to reasoning/ (backward compat shim) 5. Added API authentication middleware (Bearer token via FUSIONAGI_API_KEY) 6. Added async adapter interface (acomplete/acomplete_structured) 7. Migrated FastAPI on_event to lifespan (fixes 20 deprecation warnings) 8. Liquid Neural Networks (continuous-time adaptive weights) 9. Quantum-AI Hybrid compute backend (simulator + optimization) 10. Embodied Intelligence / Robotics bridge (actuator + sensor protocols) 11. Consciousness Engineering (formal self-model with introspection) 12. ASI Scoring Rubric (C/A/L/N/R self-assessment harness) 13. GPU integration tests for TensorFlow backend 14. Multi-stage production Dockerfile 15. Gitea CI/CD pipeline (lint, test matrix, Docker build) 16. API rate limiting middleware (per-IP sliding window) 17. OpenAPI docs cleanup (auth + rate limiting descriptions) 18. Benchmarking suite (decomposition, multi-path, recomposition, e2e) 19. Plugin system (head registry for custom heads) 427 tests passing, 0 ruff errors, 0 mypy errors. Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
344 lines
11 KiB
Python
344 lines
11 KiB
Python
"""ASI Scoring Rubric — C/A/L/N/R self-assessment evaluation harness.
|
|
|
|
Implements the 5-dimension capability scoring framework:
|
|
- Cognitive Capability (C) — raw intelligence across domains
|
|
- Agency / Autonomy (A) — ability to execute multi-step goals
|
|
- Learning & Adaptation (L) — ability to improve over time
|
|
- Creativity / Novelty (N) — original insight generation
|
|
- Reliability / Robustness (R) — consistency, safety, correctness
|
|
|
|
Tier mapping:
|
|
0-40 Narrow AI
|
|
40-60 Advanced AI
|
|
60-75 Agentic AI
|
|
75-90 AGI-like
|
|
90+ ASI (theoretical)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from enum import Enum
|
|
from typing import Any
|
|
|
|
from fusionagi._logger import logger
|
|
|
|
|
|
class CapabilityTier(str, Enum):
    """Named capability band that a composite rubric score falls into.

    Members mix in ``str``, so each one compares equal to its
    human-readable label (for example ``CapabilityTier.ASI == "ASI"``).
    """

    # Members are listed from the lowest band to the highest.
    NARROW_AI = "Narrow AI"      # composite below 40
    ADVANCED_AI = "Advanced AI"  # composite of at least 40
    AGENTIC_AI = "Agentic AI"    # composite of at least 60
    AGI_LIKE = "AGI-like"        # composite of at least 75
    ASI = "ASI"                  # composite of at least 90
|
|
|
|
|
|
@dataclass
class DimensionScore:
    """Outcome of scoring one rubric dimension.

    Holds the dimension's display name and short code, its weight in the
    composite, the computed score, the raw sub-metric values, and
    free-form evidence strings.
    """

    name: str
    abbreviation: str
    weight: float
    score: float = 0.0
    # Fresh containers per instance, so instances never share state.
    sub_scores: dict[str, float] = field(default_factory=dict)
    evidence: list[str] = field(default_factory=list)

    @property
    def weighted_score(self) -> float:
        """Contribution of this dimension to the composite (weight times score)."""
        contribution = self.weight * self.score
        return contribution
|
|
|
|
|
|
@dataclass
class RubricConfig:
    """Relative weight of each dimension in the composite score.

    The five weights are expected to add up to 1.0; ``validate`` reports
    whether they do.
    """

    cognitive_weight: float = 0.30
    agency_weight: float = 0.20
    learning_weight: float = 0.15
    creativity_weight: float = 0.15
    reliability_weight: float = 0.20

    def validate(self) -> bool:
        """Report whether the five weights add up to 1.0.

        Returns:
            True when the sum differs from 1.0 by less than 0.01.
        """
        weights = (
            self.cognitive_weight,
            self.agency_weight,
            self.learning_weight,
            self.creativity_weight,
            self.reliability_weight,
        )
        # Accumulate left to right, starting from the first weight.
        total = weights[0]
        for extra in weights[1:]:
            total = total + extra
        drift = total - 1.0
        return -0.01 < drift < 0.01
|
|
|
|
|
|
@dataclass
class RubricResult:
    """Everything produced by one rubric evaluation.

    Bundles the per-dimension scores, the composite score, the tier the
    composite maps to, the configuration that was used, and optional
    caller-supplied metadata.
    """

    dimensions: dict[str, DimensionScore]
    composite_score: float
    tier: CapabilityTier
    config: RubricConfig
    metadata: dict[str, Any] = field(default_factory=dict)

    def radar_chart_data(self) -> dict[str, float]:
        """Map each dimension's abbreviation to its score, for radar plots.

        Returns:
            Dict keyed by dimension abbreviation, in dimension order.
        """
        chart: dict[str, float] = {}
        for dimension in self.dimensions.values():
            chart[dimension.abbreviation] = dimension.score
        return chart

    def summary(self) -> str:
        """Render the result as multi-line text.

        Returns:
            A header line with the composite score and tier label,
            followed by one line per dimension.
        """
        header = f"Composite Score: {self.composite_score:.1f} — {self.tier.value}"
        rows = [
            f" {dim.abbreviation} ({dim.name}): {dim.score:.1f}"
            for dim in self.dimensions.values()
        ]
        return "\n".join([header, *rows])
|
|
|
|
|
|
def _classify_tier(score: float) -> CapabilityTier:
    """Pick the capability tier for a composite score.

    Args:
        score: Composite rubric score.

    Returns:
        The tier with the highest floor that ``score`` reaches, or
        ``CapabilityTier.NARROW_AI`` when it reaches none of them.
    """
    # Floors are inclusive and checked from the highest band downwards.
    floors = (
        (90, CapabilityTier.ASI),
        (75, CapabilityTier.AGI_LIKE),
        (60, CapabilityTier.AGENTIC_AI),
        (40, CapabilityTier.ADVANCED_AI),
    )
    for floor, tier in floors:
        if score >= floor:
            return tier
    return CapabilityTier.NARROW_AI
|
|
|
|
|
|
class ASIRubric:
    """Self-assessment evaluation harness for FusionAGI.

    Scores five dimensions (C/A/L/N/R) from caller-supplied sub-metrics,
    combines them into a weighted composite, maps the composite to a
    capability tier, and keeps every result so a trend can be reported.
    """

    # One entry per dimension: (result key, display name, abbreviation,
    # RubricConfig weight attribute, default sub-metric weights).
    # The order here fixes the order of ``RubricResult.dimensions``.
    _DIMENSION_SPECS = (
        (
            "cognitive", "Cognitive Capability", "C", "cognitive_weight",
            {
                "general_knowledge": 0.25,
                "scientific_reasoning": 0.25,
                "hard_reasoning": 0.25,
                "math_frontier": 0.25,
            },
        ),
        (
            "agency", "Agency / Autonomy", "A", "agency_weight",
            {
                "task_completion": 0.30,
                "planning_depth": 0.25,
                "tool_use": 0.25,
                "self_correction": 0.20,
            },
        ),
        (
            "learning", "Learning & Adaptation", "L", "learning_weight",
            {
                "few_shot_gain": 0.40,
                "memory_retention": 0.30,
                "iterative_improvement": 0.30,
            },
        ),
        (
            "creativity", "Creativity / Novelty", "N", "creativity_weight",
            {
                "originality": 0.40,
                "cross_domain_synthesis": 0.30,
                "research_capability": 0.30,
            },
        ),
        (
            "reliability", "Reliability / Robustness", "R", "reliability_weight",
            {
                "consistency": 0.25,
                "adversarial_resistance": 0.25,
                "calibration": 0.25,
                "hallucination_rate": 0.25,
            },
        ),
    )

    # Self-model capability domain -> dimension key it feeds into.
    # Domains that are not listed here are ignored.
    _DOMAIN_TO_DIMENSION = {
        "reasoning": "cognitive",
        "logic": "cognitive",
        "math": "cognitive",
        "planning": "agency",
        "execution": "agency",
        "tool_use": "agency",
        "adaptation": "learning",
        "learning": "learning",
        "memory": "learning",
        "creativity": "creativity",
        "synthesis": "creativity",
        "novelty": "creativity",
        "consistency": "reliability",
        "safety": "reliability",
        "accuracy": "reliability",
    }

    def __init__(self, config: RubricConfig | None = None) -> None:
        """Create a rubric with the given (or default) dimension weights.

        Args:
            config: Dimension weights; a default ``RubricConfig`` is used
                when omitted.

        Raises:
            ValueError: If ``config.validate()`` reports that the weights
                do not sum to 1.0.
        """
        self._config = config if config is not None else RubricConfig()
        if not self._config.validate():
            raise ValueError("Rubric weights must sum to 1.0")
        self._history: list[RubricResult] = []

    def evaluate(
        self,
        cognitive_scores: dict[str, float] | None = None,
        agency_scores: dict[str, float] | None = None,
        learning_scores: dict[str, float] | None = None,
        creativity_scores: dict[str, float] | None = None,
        reliability_scores: dict[str, float] | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> RubricResult:
        """Run a full evaluation.

        Each dimension accepts a dict of sub-metric names to scores (0-100).
        The dimension score is the weighted average of its sub-metrics; a
        dimension with no sub-metrics scores 0. The result is also appended
        to the history reported by ``trend``.

        Args:
            cognitive_scores: Sub-metrics for Cognitive Capability.
            agency_scores: Sub-metrics for Agency / Autonomy.
            learning_scores: Sub-metrics for Learning & Adaptation.
            creativity_scores: Sub-metrics for Creativity / Novelty.
            reliability_scores: Sub-metrics for Reliability / Robustness.
            metadata: Additional context; copied into the result.

        Returns:
            Complete evaluation result.
        """
        cfg = self._config
        provided = {
            "cognitive": cognitive_scores,
            "agency": agency_scores,
            "learning": learning_scores,
            "creativity": creativity_scores,
            "reliability": reliability_scores,
        }

        dimensions: dict[str, DimensionScore] = {}
        for key, name, abbreviation, weight_attr, sub_weights in self._DIMENSION_SPECS:
            dimensions[key] = self._score_dimension(
                name,
                abbreviation,
                getattr(cfg, weight_attr),
                provided[key] or {},
                sub_weights,
            )

        composite = sum(d.weighted_score for d in dimensions.values())
        tier = _classify_tier(composite)

        result = RubricResult(
            dimensions=dimensions,
            composite_score=composite,
            tier=tier,
            config=cfg,
            # Copy so later caller-side mutation cannot alter stored history.
            metadata=dict(metadata) if metadata else {},
        )
        self._history.append(result)

        logger.info(
            "ASI rubric evaluation complete",
            extra={"composite": composite, "tier": tier.value},
        )

        return result

    def evaluate_from_self_model(self, self_model_snapshot: dict[str, Any]) -> RubricResult:
        """Evaluate using data from the SelfModel introspection.

        Reads ``capabilities`` (domain -> info dict with ``success_rate``)
        and ``emotional_state`` (with ``confidence``) from the snapshot.
        Both rates default to 0.5 when missing and are scaled by 100.
        Recognised domains are routed to their dimension; unrecognised
        ones are ignored. ``confidence`` becomes the reliability
        "calibration" sub-metric.

        Args:
            self_model_snapshot: Output from SelfModel.introspect().

        Returns:
            Evaluation result, tagged with ``{"source": "self_model"}``.
        """
        # Tolerate keys that are present but set to None.
        capabilities = self_model_snapshot.get("capabilities") or {}
        emotional = self_model_snapshot.get("emotional_state") or {}

        buckets: dict[str, dict[str, float]] = {
            spec[0]: {} for spec in self._DIMENSION_SPECS
        }
        for domain, cap_info in capabilities.items():
            key = self._DOMAIN_TO_DIMENSION.get(domain)
            if key is None:
                continue
            buckets[key][domain] = cap_info.get("success_rate", 0.5) * 100

        confidence = emotional.get("confidence", 0.5) * 100
        buckets["reliability"].setdefault("calibration", confidence)

        return self.evaluate(
            cognitive_scores=buckets["cognitive"],
            agency_scores=buckets["agency"],
            learning_scores=buckets["learning"],
            creativity_scores=buckets["creativity"],
            reliability_scores=buckets["reliability"],
            metadata={"source": "self_model"},
        )

    def trend(self) -> list[dict[str, Any]]:
        """Return historical evaluation trend.

        Returns:
            One dict per past evaluation, oldest first, holding the
            composite score, tier label and radar-chart data.
        """
        return [
            {
                "composite": r.composite_score,
                "tier": r.tier.value,
                "radar": r.radar_chart_data(),
            }
            for r in self._history
        ]

    def _score_dimension(
        self,
        name: str,
        abbreviation: str,
        weight: float,
        scores: dict[str, float],
        sub_weights: dict[str, float],
    ) -> DimensionScore:
        """Compute a dimension score from sub-metrics.

        Args:
            name: Dimension name.
            abbreviation: Short code.
            weight: Dimension weight in composite.
            scores: Provided sub-metric scores.
            sub_weights: Default sub-metric weights.

        Returns:
            Computed DimensionScore. With no scores the dimension scores
            0.0 and its evidence is ``["No data provided"]``.
        """
        if not scores:
            return DimensionScore(
                name=name,
                abbreviation=abbreviation,
                weight=weight,
                score=0.0,
                sub_scores={},
                evidence=["No data provided"],
            )

        return DimensionScore(
            name=name,
            abbreviation=abbreviation,
            weight=weight,
            score=self._aggregate_sub_scores(scores, sub_weights),
            sub_scores=dict(scores),
            evidence=[f"{k}: {v:.1f}" for k, v in scores.items()],
        )

    @staticmethod
    def _aggregate_sub_scores(
        scores: dict[str, float],
        sub_weights: dict[str, float],
    ) -> float:
        """Weighted average of sub-metric scores, clamped to 0-100.

        Sub-metrics named in ``sub_weights`` use their default weight.
        Sub-metrics that are not named there split the weight left over
        (1.0 minus the weight of the recognised ones present) equally.
        So when nothing is recognised the result is a plain average, and
        when the recognised ones already carry all the weight the extras
        do not affect the score.

        Args:
            scores: Provided sub-metric scores.
            sub_weights: Default sub-metric weights.

        Returns:
            The clamped weighted average, or 0.0 when no weight applies.
        """
        total_w = 0.0
        total_score = 0.0
        for sub_name, sub_weight in sub_weights.items():
            if sub_name in scores:
                total_score += sub_weight * scores[sub_name]
                total_w += sub_weight

        extras = [sub_name for sub_name in scores if sub_name not in sub_weights]
        if extras:
            # Computed once so every extra gets the same share regardless
            # of iteration order.
            share = max(0.0, 1.0 - total_w) / len(extras)
            for sub_name in extras:
                total_score += share * scores[sub_name]
                total_w += share

        if total_w <= 0:
            return 0.0
        return max(0.0, min(100.0, total_score / total_w))
|
|
|
|
|
|
# Names exported by ``from <module> import *`` (alphabetical).
__all__ = [
    "ASIRubric", "CapabilityTier", "DimensionScore",
    "RubricConfig", "RubricResult",
]
|