"""GPU-accelerated semantic similarity for reasoning and consensus.

Provides high-level similarity operations built on the TensorBackend:
- Pairwise text similarity
- Claim deduplication with GPU cosine similarity
- Nearest-neighbor lookup for memory retrieval
"""

from __future__ import annotations

from typing import Any

from fusionagi._logger import logger
from fusionagi.gpu.backend import TensorBackend, get_backend


def pairwise_text_similarity(
    texts_a: list[str],
    texts_b: list[str],
    backend: TensorBackend | None = None,
) -> Any:
    """Compute pairwise cosine similarity between two sets of texts.

    Both text sets are embedded with the backend's ``embed_texts`` and compared
    with its ``cosine_similarity_matrix``; the result is converted with
    ``to_numpy`` before being returned.

    Args:
        texts_a: First set of texts (M items).
        texts_b: Second set of texts (N items).
        backend: TensorBackend to use. If None, auto-selects via
            ``get_backend()``.

    Returns:
        Similarity matrix of shape (M, N) as a NumPy array.
    """
    be = backend or get_backend()
    emb_a = be.embed_texts(texts_a)
    emb_b = be.embed_texts(texts_b)
    sim = be.cosine_similarity_matrix(emb_a, emb_b)
    return be.to_numpy(sim)


def deduplicate_claims(
    claims: list[str],
    threshold: float = 0.85,
    backend: TensorBackend | None = None,
) -> list[list[int]]:
    """Group semantically similar claims using GPU-accelerated similarity.

    Grouping is greedy and seed-based: claims are visited in order, each claim
    that is not yet grouped starts a new group, and every later ungrouped claim
    whose similarity to that seed is at least ``threshold`` joins it. Members
    are compared against the seed only, so grouping is not transitive.

    Args:
        claims: List of claim texts.
        threshold: Similarity threshold for grouping (inclusive).
        backend: TensorBackend to use. If None, auto-selects via
            ``get_backend()``.

    Returns:
        List of groups, where each group is a list of claim indices. Each
        index appears in exactly one group, and a group's first index is its
        seed. An empty input yields an empty list.
    """
    if not claims:
        return []
    if len(claims) == 1:
        # Nothing to compare against; skip the embedding work entirely.
        return [[0]]

    be = backend or get_backend()
    embeddings = be.embed_texts(claims)
    sim_matrix = be.to_numpy(be.cosine_similarity_matrix(embeddings, embeddings))

    n_claims = len(claims)
    used: set[int] = set()
    groups: list[list[int]] = []

    for i in range(n_claims):
        if i in used:
            continue
        group = [i]
        used.add(i)
        # Only later claims can join: earlier ones are already grouped.
        for j in range(i + 1, n_claims):
            if j not in used and sim_matrix[i, j] >= threshold:
                group.append(j)
                used.add(j)
        groups.append(group)

    logger.debug(
        "Claim deduplication complete",
        extra={"total_claims": len(claims), "groups": len(groups)},
    )
    return groups


def nearest_neighbors(
    query_texts: list[str],
    corpus_texts: list[str],
    top_k: int = 5,
    backend: TensorBackend | None = None,
) -> list[list[tuple[int, float]]]:
    """Find top-k nearest neighbors from corpus for each query.

    Args:
        query_texts: Query texts to search for.
        corpus_texts: Corpus texts to search within.
        top_k: Number of nearest neighbors per query. Values larger than the
            corpus size are clamped to it; zero or negative values yield no
            neighbors.
        backend: TensorBackend to use. If None, auto-selects via
            ``get_backend()``.

    Returns:
        For each query, a list of (corpus_index, similarity_score) tuples
        ordered from most to least similar. Each inner list is empty when the
        corpus is empty or ``top_k`` is not positive.
    """
    if not query_texts or not corpus_texts:
        return [[] for _ in query_texts]

    # k is the same for every query, so compute it once. A non-positive k must
    # short-circuit: slicing with [-0:] would return the whole row, and a
    # negative k would turn [-k:] into a prefix-dropping slice.
    k = min(top_k, len(corpus_texts))
    if k <= 0:
        return [[] for _ in query_texts]

    be = backend or get_backend()

    # Imported here, as in the original, so NumPy is only loaded when needed.
    import numpy as np

    q_emb = be.embed_texts(query_texts)
    c_emb = be.embed_texts(corpus_texts)
    sim = be.to_numpy(be.cosine_similarity_matrix(q_emb, c_emb))

    results: list[list[tuple[int, float]]] = []
    for i in range(len(query_texts)):
        row = sim[i]
        # argsort is ascending: take the last k entries, then reverse so the
        # best match comes first.
        top_indices = np.argsort(row)[-k:][::-1]
        results.append([(int(idx), float(row[idx])) for idx in top_indices])

    return results