feat: GPU/TensorCore integration — TensorFlow backend, GPU-accelerated reasoning, training, and memory

- New fusionagi/gpu/ module with TensorBackend protocol abstraction
  - TensorFlowBackend: GPU-accelerated ops with TensorCore mixed-precision
  - NumPyBackend: CPU fallback (always available, no extra deps)
  - Auto-selects best available backend at runtime
- GPU-accelerated operations:
  - Cosine similarity matrix (batched, XLA-compiled)
  - Multi-head attention for consensus scoring
  - Batch hypothesis scoring on GPU
  - Semantic similarity search (pairwise, nearest-neighbor, deduplication)
- New TensorFlowAdapter (fusionagi/adapters/):
  - LLMAdapter for local TF/Keras model inference
  - TensorCore mixed-precision support
  - GPU-accelerated embedding synthesis fallback
- Reasoning pipeline integration:
  - gpu_scoring.py: drop-in GPU replacement for multi_path scoring
  - Super Big Brain: use_gpu config flag, GPU scoring when available
- Memory integration:
  - gpu_search.py: GPU-accelerated semantic search for SemanticGraphMemory
- Self-improvement integration:
  - gpu_training.py: gradient-based heuristic weight optimization
  - Reflective memory training loop with loss tracking
- Dependencies: gpu extra (tensorflow>=2.16, numpy>=1.26)
- 64 new tests (276 total), all passing
- Architecture spec: docs/gpu_tensorcore_integration.md

Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
2026-04-28 05:05:50 +00:00
parent c052b07662
commit fa71f973a6
22 changed files with 2448 additions and 3 deletions
--- a/fusionagi/adapters/tensorflow_adapter.py
+++ b/fusionagi/adapters/tensorflow_adapter.py
@@ -0,0 +1,234 @@
+"""TensorFlow adapter: local model inference via TF/Keras with TensorCore.
+
+Requires: pip install fusionagi[gpu]
+
+Provides LLMAdapter-compatible interface for locally-hosted TensorFlow/Keras
+models. Supports TensorCore mixed-precision, XLA compilation, and GPU memory
+management.
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from fusionagi._logger import logger
+from fusionagi.adapters.base import LLMAdapter
+
+try:
+    import numpy as np
+    import tensorflow as tf
+except ImportError as e:
+    raise ImportError(
+        "TensorFlow is required for TensorFlowAdapter. "
+        "Install with: pip install fusionagi[gpu]"
+    ) from e
+
+
+class TensorFlowAdapter(LLMAdapter):
+    """LLM adapter for local TensorFlow/Keras model inference.
+
+    Loads a saved Keras model or TF SavedModel and runs inference with
+    TensorCore acceleration when available.
+
+    Args:
+        model_path: Path to a saved Keras model (.keras) or SavedModel directory.
+        model: Optional pre-built model object; when given, model_path is not
+            loaded.
+        tokenizer: Optional tokenizer callable (text -> token IDs).
+        max_length: Maximum sequence length for generation.
+        temperature: Sampling temperature.
+        mixed_precision: Enable FP16 mixed-precision for TensorCore.
+    """
+
+    def __init__(
+        self,
+        model_path: str | None = None,
+        model: Any | None = None,
+        tokenizer: Any | None = None,
+        max_length: int = 512,
+        temperature: float = 0.7,
+        mixed_precision: bool = False,
+    ) -> None:
+        """Store generation settings and attach or load a model.
+
+        Args:
+            model_path: Path handed to ``_load_model`` when no ``model`` is given.
+            model: Pre-built model object; when not None it is used as-is and
+                ``model_path`` is not loaded.
+            tokenizer: Optional tokenizer callable (text -> token IDs).
+            max_length: Default maximum sequence length for generation.
+            temperature: Default sampling temperature.
+            mixed_precision: If True, try to set the global Keras policy to
+                ``mixed_float16``; a failure is logged and otherwise ignored.
+        """
+        self._model: Any = None
+        self._tokenizer = tokenizer
+        self._max_length = max_length
+        self._temperature = temperature
+        self._model_path = model_path
+
+        if mixed_precision:
+            try:
+                tf.keras.mixed_precision.set_global_policy("mixed_float16")
+            except Exception as exc:
+                # Best-effort: keep the adapter usable, but record why the
+                # policy could not be applied instead of discarding the error.
+                logger.warning(
+                    "TensorFlowAdapter: mixed-precision not available",
+                    extra={"error": str(exc)},
+                )
+            else:
+                logger.info("TensorFlowAdapter: TensorCore mixed-precision enabled")
+
+        if model is not None:
+            # An explicitly supplied model wins over model_path.
+            self._model = model
+            logger.info("TensorFlowAdapter initialized with provided model")
+        elif model_path:
+            self._load_model(model_path)
+        else:
+            logger.info(
+                "TensorFlowAdapter initialized without model "
+                "(will use embedding-based synthesis)"
+            )
+
+    def _load_model(self, path: str) -> None:
+        """Load a model from disk, trying SavedModel first and Keras second.
+
+        Never raises for a failed load: when both loaders fail,
+        ``self._model`` is left untouched and one warning is logged that
+        carries both loader errors, so the reason for falling back to
+        embedding synthesis can be diagnosed from the logs.
+
+        Args:
+            path: Filesystem path passed to ``tf.saved_model.load`` and then to
+                ``tf.keras.models.load_model``.
+        """
+        try:
+            self._model = tf.saved_model.load(path)
+        except Exception as exc:
+            # Expected for non-SavedModel paths; keep the reason for the log.
+            saved_model_error = str(exc)
+        else:
+            logger.info("TensorFlowAdapter: loaded SavedModel", extra={"path": path})
+            return
+
+        try:
+            self._model = tf.keras.models.load_model(path)
+        except Exception as exc:
+            keras_error = str(exc)
+        else:
+            logger.info("TensorFlowAdapter: loaded Keras model", extra={"path": path})
+            return
+
+        logger.warning(
+            "TensorFlowAdapter: no model loaded; "
+            "falling back to embedding synthesis",
+            extra={
+                "path": path,
+                "saved_model_error": saved_model_error,
+                "keras_error": keras_error,
+            },
+        )
+
+    def complete(
+        self,
+        messages: list[dict[str, str]],
+        **kwargs: Any,
+    ) -> str:
+        """Produce a completion for ``messages``.
+
+        Model inference is used only when both a model and a tokenizer are
+        present; in every other case the embedding-based synthesis fallback
+        handles the request.
+
+        Args:
+            messages: List of message dicts with 'role' and 'content'.
+            **kwargs: Forwarded to model inference (temperature, max_length);
+                not used by the synthesis fallback.
+
+        Returns:
+            Generated response text.
+        """
+        can_infer = self._model is not None and self._tokenizer is not None
+        if not can_infer:
+            return self._embedding_synthesis(messages)
+        return self._model_inference(messages, **kwargs)
+
+    def complete_structured(
+        self,
+        messages: list[dict[str, str]],
+        schema: dict[str, Any] | None = None,
+        **kwargs: Any,
+    ) -> Any:
+        """Return the completion parsed as JSON, or None if it is not JSON.
+
+        The raw text from ``complete`` is passed to ``json.loads``. ``schema``
+        is accepted for interface compatibility but is not consulted here.
+
+        Args:
+            messages: List of message dicts with 'role' and 'content'.
+            schema: Unused by this implementation.
+            **kwargs: Forwarded to ``complete``.
+
+        Returns:
+            The decoded JSON value, or None when decoding fails.
+        """
+        raw_text = self.complete(messages, **kwargs)
+        try:
+            parsed = json.loads(raw_text)
+        except (json.JSONDecodeError, TypeError):
+            parsed = None
+        return parsed
+
+    def _model_inference(
+        self,
+        messages: list[dict[str, str]],
+        **kwargs: Any,
+    ) -> str:
+        """Run the prompt through the loaded model and return its output as text.
+
+        The messages are flattened to a prompt, tokenized, and handed to the
+        first interface the model offers: ``generate``, then ``predict``, then
+        a plain call. A flat list of numbers is decoded with
+        ``tokenizer.decode`` when the tokenizer has one; any other output is
+        returned as ``str(output)``.
+
+        Args:
+            messages: List of message dicts with 'role' and 'content'.
+            **kwargs: Optional ``temperature`` and ``max_length`` overrides for
+                the values given at construction.
+
+        Returns:
+            The decoded or stringified model output, or the embedding-synthesis
+            fallback when there is no tokenizer, the model exposes no usable
+            interface, or inference raises.
+        """
+        tokenizer = self._tokenizer
+        if tokenizer is None:
+            # Explicit guard instead of ``assert``: asserts are stripped under
+            # ``python -O``, which would surface as an opaque TypeError below.
+            return self._embedding_synthesis(messages)
+
+        prompt = self._messages_to_prompt(messages)
+        temperature = kwargs.get("temperature", self._temperature)
+        max_length = kwargs.get("max_length", self._max_length)
+
+        tokens = tokenizer(prompt)
+        if isinstance(tokens, (list, np.ndarray)):
+            # Truncate to max_length and wrap in a list to add a batch axis.
+            input_tensor = tf.constant([tokens[:max_length]], dtype=tf.int32)
+        else:
+            # Any other tokenizer result is passed to the model unchanged.
+            input_tensor = tokens
+
+        try:
+            if hasattr(self._model, "generate"):
+                output = self._model.generate(
+                    input_tensor,
+                    max_length=max_length,
+                    temperature=temperature,
+                )
+            elif hasattr(self._model, "predict"):
+                output = self._model.predict(input_tensor)
+            elif callable(self._model):
+                output = self._model(input_tensor)
+            else:
+                logger.warning("TensorFlowAdapter: model has no callable interface")
+                return self._embedding_synthesis(messages)
+
+            # Normalize tensors / arrays down to plain Python lists.
+            if isinstance(output, tf.Tensor):
+                output = output.numpy()
+            if hasattr(output, "tolist"):
+                output = output.tolist()
+            if isinstance(output, list) and output:
+                if isinstance(output[0], list):
+                    # Batched output: keep only the first row. An empty row
+                    # raises IndexError on the next check and is handled by
+                    # the except clause below.
+                    output = output[0]
+                if isinstance(output[0], (int, float)):
+                    # Check for ``decode`` directly; relying on the tokenizer's
+                    # truthiness could skip decoding for objects that define
+                    # ``__len__`` or ``__bool__``.
+                    if hasattr(tokenizer, "decode"):
+                        return str(tokenizer.decode(output))
+            return str(output)
+        except Exception as e:
+            logger.warning(
+                "TensorFlowAdapter: model inference failed, using synthesis",
+                extra={"error": str(e)},
+            )
+            return self._embedding_synthesis(messages)
+
+    def _embedding_synthesis(
+        self,
+        messages: list[dict[str, str]],
+        *,
+        max_parts: int = 5,
+        max_chars: int = 300,
+    ) -> str:
+        """Fallback: build an extractive summary from the message contents.
+
+        Each non-blank string ``content`` is embedded with the active backend
+        and scored by cosine similarity against the mean embedding. The
+        best-scoring parts are returned, each truncated, separated by a
+        blank line.
+
+        Args:
+            messages: Message dicts; only non-blank string 'content' values
+                are used.
+            max_parts: Maximum number of parts to include.
+            max_chars: Maximum number of characters kept from each part.
+
+        Returns:
+            The selected parts in descending similarity order, or an empty
+            string when no message has usable content.
+        """
+        content_parts: list[str] = [
+            text.strip()
+            for text in (msg.get("content", "") for msg in messages)
+            if isinstance(text, str) and text.strip()
+        ]
+        if not content_parts:
+            return ""
+
+        # Imported lazily so the empty-input path needs no backend.
+        from fusionagi.gpu.backend import get_backend
+
+        be = get_backend()
+        embeddings = be.embed_texts(content_parts)
+        mean_emb = np.mean(be.to_numpy(embeddings), axis=0, keepdims=True)
+        sims = be.to_numpy(
+            be.cosine_similarity_matrix(be.from_numpy(mean_emb), embeddings)
+        )[0]
+
+        # Sort descending by negating rather than reversing an ascending sort:
+        # argsort places NaN last, so reversing would rank NaN similarities
+        # first. The stable kind keeps equal scores in message order.
+        ranked_indices = np.argsort(np.negative(sims), kind="stable")
+        return "\n\n".join(
+            content_parts[int(idx)][:max_chars] for idx in ranked_indices[:max_parts]
+        )
+
+    @staticmethod
+    def _messages_to_prompt(messages: list[dict[str, str]]) -> str:
+        """Flatten chat messages into a single prompt string.
+
+        Every message becomes a ``<|role|>`` header line followed by its
+        content; the blocks are joined with newlines. A missing role defaults
+        to ``user`` and missing content to an empty string.
+        """
+        return "\n".join(
+            f"<|{msg.get('role', 'user')}|>\n{msg.get('content', '')}"
+            for msg in messages
+        )
+
+    def device_summary(self) -> dict[str, Any]:
+        """Describe the adapter's model state and the TensorFlow runtime.
+
+        Returns:
+            A dict with the adapter name, the configured model path, whether a
+            model and a tokenizer are set, the number of physical GPU devices
+            TensorFlow lists, and the TensorFlow version string.
+        """
+        gpu_devices = tf.config.list_physical_devices("GPU")
+        has_model = self._model is not None
+        has_tokenizer = self._tokenizer is not None
+        return dict(
+            adapter="tensorflow",
+            model_path=self._model_path,
+            has_model=has_model,
+            has_tokenizer=has_tokenizer,
+            gpu_count=len(gpu_devices),
+            tf_version=tf.__version__,
+        )