feat: GPU/TensorCore integration — TensorFlow backend, GPU-accelerated reasoning, training, and memory
Some checks failed
Some checks failed
- New fusionagi/gpu/ module with TensorBackend protocol abstraction - TensorFlowBackend: GPU-accelerated ops with TensorCore mixed-precision - NumPyBackend: CPU fallback (always available, no extra deps) - Auto-selects best available backend at runtime - GPU-accelerated operations: - Cosine similarity matrix (batched, XLA-compiled) - Multi-head attention for consensus scoring - Batch hypothesis scoring on GPU - Semantic similarity search (pairwise, nearest-neighbor, deduplication) - New TensorFlowAdapter (fusionagi/adapters/): - LLMAdapter for local TF/Keras model inference - TensorCore mixed-precision support - GPU-accelerated embedding synthesis fallback - Reasoning pipeline integration: - gpu_scoring.py: drop-in GPU replacement for multi_path scoring - Super Big Brain: use_gpu config flag, GPU scoring when available - Memory integration: - gpu_search.py: GPU-accelerated semantic search for SemanticGraphMemory - Self-improvement integration: - gpu_training.py: gradient-based heuristic weight optimization - Reflective memory training loop with loss tracking - Dependencies: gpu extra (tensorflow>=2.16, numpy>=1.26) - 64 new tests (276 total), all passing - Architecture spec: docs/gpu_tensorcore_integration.md Co-Authored-By: Nakamoto, S <defi@defi-oracle.io>
This commit is contained in:
234
fusionagi/adapters/tensorflow_adapter.py
Normal file
234
fusionagi/adapters/tensorflow_adapter.py
Normal file
@@ -0,0 +1,234 @@
|
||||
"""TensorFlow adapter: local model inference via TF/Keras with TensorCore.
|
||||
|
||||
Requires: pip install fusionagi[gpu]
|
||||
|
||||
Provides LLMAdapter-compatible interface for locally-hosted TensorFlow/Keras
|
||||
models. Supports TensorCore mixed-precision, XLA compilation, and GPU memory
|
||||
management.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from fusionagi._logger import logger
|
||||
from fusionagi.adapters.base import LLMAdapter
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"TensorFlow is required for TensorFlowAdapter. "
|
||||
"Install with: pip install fusionagi[gpu]"
|
||||
) from e
|
||||
|
||||
|
||||
class TensorFlowAdapter(LLMAdapter):
|
||||
"""LLM adapter for local TensorFlow/Keras model inference.
|
||||
|
||||
Loads a saved Keras model or TF SavedModel and runs inference with
|
||||
TensorCore acceleration when available.
|
||||
|
||||
Args:
|
||||
model_path: Path to a saved Keras model (.keras) or SavedModel directory.
|
||||
tokenizer: Optional tokenizer callable (text -> token IDs).
|
||||
max_length: Maximum sequence length for generation.
|
||||
temperature: Sampling temperature.
|
||||
mixed_precision: Enable FP16 mixed-precision for TensorCore.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_path: str | None = None,
|
||||
model: Any | None = None,
|
||||
tokenizer: Any | None = None,
|
||||
max_length: int = 512,
|
||||
temperature: float = 0.7,
|
||||
mixed_precision: bool = False,
|
||||
) -> None:
|
||||
self._model: Any = None
|
||||
self._tokenizer = tokenizer
|
||||
self._max_length = max_length
|
||||
self._temperature = temperature
|
||||
self._model_path = model_path
|
||||
|
||||
if mixed_precision:
|
||||
try:
|
||||
tf.keras.mixed_precision.set_global_policy("mixed_float16")
|
||||
logger.info("TensorFlowAdapter: TensorCore mixed-precision enabled")
|
||||
except Exception:
|
||||
logger.warning("TensorFlowAdapter: mixed-precision not available")
|
||||
|
||||
if model is not None:
|
||||
self._model = model
|
||||
logger.info("TensorFlowAdapter initialized with provided model")
|
||||
elif model_path:
|
||||
self._load_model(model_path)
|
||||
else:
|
||||
logger.info(
|
||||
"TensorFlowAdapter initialized without model "
|
||||
"(will use embedding-based synthesis)"
|
||||
)
|
||||
|
||||
def _load_model(self, path: str) -> None:
|
||||
"""Load a TF SavedModel or Keras model from disk."""
|
||||
try:
|
||||
self._model = tf.saved_model.load(path)
|
||||
logger.info("TensorFlowAdapter: loaded SavedModel", extra={"path": path})
|
||||
except Exception:
|
||||
try:
|
||||
self._model = tf.keras.models.load_model(path)
|
||||
logger.info("TensorFlowAdapter: loaded Keras model", extra={"path": path})
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"TensorFlowAdapter: no model loaded; "
|
||||
"falling back to embedding synthesis",
|
||||
extra={"path": path},
|
||||
)
|
||||
|
||||
def complete(
|
||||
self,
|
||||
messages: list[dict[str, str]],
|
||||
**kwargs: Any,
|
||||
) -> str:
|
||||
"""Generate completion using the loaded TF model.
|
||||
|
||||
If no model is loaded, falls back to embedding-based synthesis
|
||||
that uses GPU-accelerated similarity scoring.
|
||||
|
||||
Args:
|
||||
messages: List of message dicts with 'role' and 'content'.
|
||||
**kwargs: Additional parameters (temperature, max_length).
|
||||
|
||||
Returns:
|
||||
Generated response text.
|
||||
"""
|
||||
if self._model is not None and self._tokenizer is not None:
|
||||
return self._model_inference(messages, **kwargs)
|
||||
return self._embedding_synthesis(messages)
|
||||
|
||||
def complete_structured(
|
||||
self,
|
||||
messages: list[dict[str, str]],
|
||||
schema: dict[str, Any] | None = None,
|
||||
**kwargs: Any,
|
||||
) -> Any:
|
||||
"""Attempt structured JSON output from the model.
|
||||
|
||||
Falls back to parsing the raw completion if the model doesn't
|
||||
natively support structured output.
|
||||
"""
|
||||
raw = self.complete(messages, **kwargs)
|
||||
try:
|
||||
return json.loads(raw)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return None
|
||||
|
||||
def _model_inference(
|
||||
self,
|
||||
messages: list[dict[str, str]],
|
||||
**kwargs: Any,
|
||||
) -> str:
|
||||
"""Run inference through the loaded TF/Keras model."""
|
||||
prompt = self._messages_to_prompt(messages)
|
||||
temperature = kwargs.get("temperature", self._temperature)
|
||||
max_length = kwargs.get("max_length", self._max_length)
|
||||
|
||||
tokenizer = self._tokenizer
|
||||
assert tokenizer is not None
|
||||
tokens = tokenizer(prompt)
|
||||
if isinstance(tokens, (list, np.ndarray)):
|
||||
input_tensor = tf.constant([tokens[:max_length]], dtype=tf.int32)
|
||||
else:
|
||||
input_tensor = tokens
|
||||
|
||||
try:
|
||||
if hasattr(self._model, "generate"):
|
||||
output = self._model.generate(
|
||||
input_tensor,
|
||||
max_length=max_length,
|
||||
temperature=temperature,
|
||||
)
|
||||
elif hasattr(self._model, "predict"):
|
||||
output = self._model.predict(input_tensor)
|
||||
elif callable(self._model):
|
||||
output = self._model(input_tensor)
|
||||
else:
|
||||
logger.warning("TensorFlowAdapter: model has no callable interface")
|
||||
return self._embedding_synthesis(messages)
|
||||
|
||||
if isinstance(output, tf.Tensor):
|
||||
output = output.numpy()
|
||||
if hasattr(output, "tolist"):
|
||||
output = output.tolist()
|
||||
if isinstance(output, list) and output:
|
||||
if isinstance(output[0], list):
|
||||
output = output[0]
|
||||
if isinstance(output[0], (int, float)):
|
||||
if tokenizer and hasattr(tokenizer, "decode"):
|
||||
return str(tokenizer.decode(output))
|
||||
return str(output) # type: ignore[no-any-return]
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"TensorFlowAdapter: model inference failed, using synthesis",
|
||||
extra={"error": str(e)},
|
||||
)
|
||||
return self._embedding_synthesis(messages)
|
||||
|
||||
def _embedding_synthesis(self, messages: list[dict[str, str]]) -> str:
|
||||
"""Fallback: synthesize response using GPU-accelerated embeddings.
|
||||
|
||||
Embeds message content and produces a summary based on
|
||||
semantic similarity between parts.
|
||||
"""
|
||||
content_parts: list[str] = []
|
||||
for msg in messages:
|
||||
content = msg.get("content", "")
|
||||
if isinstance(content, str) and content.strip():
|
||||
content_parts.append(content.strip())
|
||||
|
||||
if not content_parts:
|
||||
return ""
|
||||
|
||||
from fusionagi.gpu.backend import get_backend
|
||||
|
||||
be = get_backend()
|
||||
embeddings = be.embed_texts(content_parts)
|
||||
emb_np = be.to_numpy(embeddings)
|
||||
|
||||
mean_emb = np.mean(emb_np, axis=0, keepdims=True)
|
||||
sims = be.to_numpy(
|
||||
be.cosine_similarity_matrix(be.from_numpy(mean_emb), embeddings)
|
||||
)[0]
|
||||
|
||||
ranked_indices = np.argsort(sims)[::-1]
|
||||
summary_parts: list[str] = []
|
||||
for idx in ranked_indices[:5]:
|
||||
part = content_parts[idx]
|
||||
summary_parts.append(part[:300])
|
||||
|
||||
return "\n\n".join(summary_parts)
|
||||
|
||||
@staticmethod
|
||||
def _messages_to_prompt(messages: list[dict[str, str]]) -> str:
|
||||
"""Convert message list to a flat prompt string."""
|
||||
parts: list[str] = []
|
||||
for msg in messages:
|
||||
role = msg.get("role", "user")
|
||||
content = msg.get("content", "")
|
||||
parts.append(f"<|{role}|>\n{content}")
|
||||
return "\n".join(parts)
|
||||
|
||||
def device_summary(self) -> dict[str, Any]:
|
||||
"""Return device and model information."""
|
||||
gpus = tf.config.list_physical_devices("GPU")
|
||||
return {
|
||||
"adapter": "tensorflow",
|
||||
"model_path": self._model_path,
|
||||
"has_model": self._model is not None,
|
||||
"has_tokenizer": self._tokenizer is not None,
|
||||
"gpu_count": len(gpus),
|
||||
"tf_version": tf.__version__,
|
||||
}
|
||||
Reference in New Issue
Block a user