Skip to content

Vector Search Examples

This page covers examples for implementing AI-powered semantic search using vector embeddings in ArcadeDB.

Vector Search Examples

Example 03 - Vector Search: Product Discovery

Learn the fundamentals of vector search:

  • Creating vector indexes
  • Generating embeddings
  • Performing similarity searches
  • Understanding JVector parameters

Movie Recommendations

Example 06 - Vector Search: Movie Recommendations

Build a recommendation system:

  • Movie embeddings from titles/genres
  • Semantic similarity search
  • Personalized recommendations
  • Real-world MovieLens data

Create Vector Index

import arcadedb_embedded as arcadedb

with arcadedb.create_database("./vector_demo") as db:
    # Define the schema: a Product vertex type plus its three properties.
    # Schema operations commit automatically — no explicit transaction needed.
    db.schema.create_vertex_type("Product")
    for prop_name, prop_type in (
        ("name", "STRING"),
        ("description", "STRING"),
        ("embedding", "ARRAY_OF_FLOATS"),
    ):
        db.schema.create_property("Product", prop_name, prop_type)

    # Vector index on the embedding property; 384 dimensions to mirror Example 03.
    db.create_vector_index(
        "Product",
        "embedding",
        dimensions=384,
        distance_function="cosine",
    )

Insert Vectors

import numpy as np
import arcadedb_embedded as arcadedb

with arcadedb.open_database("./vector_demo") as db:

    def get_embedding(text: str) -> list:
        """Placeholder embedding — in production use OpenAI, Sentence Transformers, etc."""
        return np.random.rand(384).tolist()

    # Sample catalog: (name, description) pairs.
    products = [
        ("Laptop", "High-performance computing device"),
        ("Mouse", "Wireless ergonomic mouse"),
        ("Keyboard", "Mechanical keyboard with RGB"),
    ]

    # One transaction covers the whole batch of inserts.
    with db.transaction():
        for name, description in products:
            vertex = db.new_vertex("Product")
            vertex.set("name", name)
            vertex.set("description", description)
            # ArcadeDB stores vector properties as a Java float[].
            vertex.set(
                "embedding",
                arcadedb.to_java_float_array(get_embedding(f"{name}: {description}")),
            )
            vertex.save()

Search Similar Items

import arcadedb_embedded as arcadedb

with arcadedb.open_database("./vector_demo") as db:

    def get_embedding(text: str) -> list:
        """Stand-in embedding; swap in a real embedding service in production."""
        import numpy as np
        return np.random.rand(384).tolist()

    # Embed the free-text query the same way the stored products were embedded.
    query_embedding = get_embedding("computer accessories")

    # Reads run outside a transaction; the vector is passed as a SQL parameter.
    results = db.query(
        "sql",
        """
        SELECT name, description,
               vectorL2Distance(embedding, ?) as distance
        FROM Product
        ORDER BY distance ASC
        LIMIT 5
        """,
        query_embedding,
    )

    for record in results:
        print(f"{record.get('name')}: {record.get('distance'):.4f}")

Pythonic nearest-neighbor (preferred for code):

import arcadedb_embedded as arcadedb
import numpy as np

with arcadedb.open_database("./vector_demo") as db:
    # Fetch the index built earlier and query it directly, no SQL required.
    vector_index = db.schema.get_vector_index("Product", "embedding")
    probe = np.random.rand(384).tolist()

    # find_nearest yields (vertex, distance) pairs for the k best matches.
    for vertex, distance in vector_index.find_nearest(probe, k=5):
        print(f"{vertex.get('name')}: {distance:.4f}")

Vector Functions

ArcadeDB provides several vector functions:

Distance Metrics

import arcadedb_embedded as arcadedb

with arcadedb.open_database("./vector_demo") as db:
    query_vector = [0.5] * 384  # Example embedding

    # Run one SELECT per available distance/similarity function; each takes
    # the query vector as a positional SQL parameter.
    for metric_sql in (
        # Cosine-based score (see ArcadeDB docs for range/orientation)
        """
        SELECT vectorCosineSimilarity(embedding, ?) as score
        FROM Product
        """,
        # Euclidean (L2) distance
        """
        SELECT vectorL2Distance(embedding, ?) as score
        FROM Product
        """,
        # Dot product
        """
        SELECT vectorDotProduct(embedding, ?) as score
        FROM Product
        """,
    ):
        results = db.query("sql", metric_sql, query_vector)

Nearest Neighbors

import arcadedb_embedded as arcadedb

with arcadedb.open_database("./vector_demo") as db:
    query_vector = [0.5] * 384  # Example embedding

    # k-nearest neighbours via ORDER BY distance; read-only, so no transaction.
    knn_sql = """
        SELECT name,
               vectorL2Distance(embedding, ?) as distance
        FROM Product
        ORDER BY distance ASC
        LIMIT 10
        """
    results = db.query("sql", knn_sql, query_vector)

JVector Index Configuration

Tune vector index performance with JVector parameters:

import arcadedb_embedded as arcadedb

with arcadedb.create_database("./vector_demo") as db:
    # Minimal schema: just the vertex type and the vector property.
    db.schema.create_vertex_type("Product")
    db.schema.create_property("Product", "embedding", "ARRAY_OF_FLOATS")

    # Index build parameters (schema operations are auto-transactional):
    #   dimensions      — must match the embedding model (384, as in Example 03)
    #   max_connections — connections per node (default: 16)
    #   beam_width      — search beam width (default: 100)
    db.create_vector_index(
        "Product",
        "embedding",
        dimensions=384,
        max_connections=16,
        beam_width=100,
    )

Index Configuration Parameters:

  • max_connections: 8-32 (higher = better accuracy, more memory/slower build)
  • beam_width: 64-200 (higher = better search accuracy, slower queries)
    • 64: Fast search, lower accuracy
    • 100: Balanced (default)
    • 200: High accuracy, slower search
  • overquery_factor: Search-time tuning (default: 4)
    • Multiplies k internally; e.g. a factor of 16 searches ~160 candidates for k=10
    • Smaller values (4-8) = faster, lower recall
    • Larger values (32-64) = slower, better recall

Embedding Providers

OpenAI Embeddings

from openai import OpenAI

client = OpenAI(api_key="your-key")

def get_embedding(text: str) -> list:
    """Embed *text* via OpenAI's API; text-embedding-3-small returns 1536 floats."""
    return client.embeddings.create(
        model="text-embedding-3-small",  # 1536 dimensions
        input=text
    ).data[0].embedding

Sentence Transformers

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # 384 dimensions

def get_embedding(text: str) -> list:
    """Embed *text* locally with Sentence Transformers; returns a list of floats."""
    vector = model.encode(text)
    return vector.tolist()

Hugging Face

from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def get_embedding(text: str) -> list:
    """Mean-pool the model's last hidden state into one sentence vector."""
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    # Average over the token axis (dim=1), then take the single batch entry.
    return hidden.mean(dim=1)[0].tolist()

Performance Optimization

Batch Embeddings

Generate embeddings in batches for better performance:

import arcadedb_embedded as arcadedb

with arcadedb.open_database("./vector_demo") as db:
    from sentence_transformers import SentenceTransformer

    # Encoding every text in one call amortizes model overhead across the batch.
    model = SentenceTransformer('all-MiniLM-L6-v2')
    texts = ["product 1", "product 2", "product 3"]
    vectors = model.encode(texts, batch_size=32)

    # A single transaction covers the whole batch of inserts.
    with db.transaction():
        for text, vector in zip(texts, vectors):
            item = db.new_vertex("Product")
            item.set("description", text)
            item.set("embedding", arcadedb.to_java_float_array(vector))
            item.save()

Complete Examples

See the full implementations in Example 03 (Vector Search: Product Discovery) and Example 06 (Vector Search: Movie Recommendations).

Additional Resources

Source Code

View the complete vector search example source code: