Elsai VectorDB v2.0.0#

The Elsai VectorDB package provides interfaces to work with vector databases like ChromaDB, Pinecone, and Weaviate, enabling efficient storage and retrieval of document embeddings with enhanced retriever integration for v2.0.0.

Prerequisites#

  • Python >= 3.9

  • .env file with appropriate API keys and configuration variables

Installation#

To install the elsai-vectordb package:

pip install --extra-index-url https://elsai-core-package.optisolbusiness.com/root/elsai-vectordb/ elsai-vectordb==2.2.0

Version 2.2.0 or later is required to use every component documented on this page; the per-component notes below indicate the minimum version for individual features.

Components#

1. ChromaVectorDb#

ChromaVectorDb is a wrapper around ChromaDB to manage local document embeddings with persistent storage.

from elsai_vectordb.chromadb import ChromaVectorDb

chroma_client = ChromaVectorDb(persist_directory="your_persist_directory") # Or set in environment variable CHROMA_PERSIST_DIRECTORY

chroma_client.create_if_not_exists(collection_name="your_collection_name")

document = {
    "id": "001",
    "embeddings": [0.1, 0.2, 0.7],  # Example embedding vector
    "page_content": "This is a sample document.",
    "metadatas": {"source": "example_source", "file_id": "doc1"}
}

chroma_client.add_document(document=document, collection_name="your_collection_name")

.. note::
   The filter-based ``retrieve_document`` functionality (using the ``where`` parameter) is available only in version 2.1.0 or later.

# Simple filter example
documents = chroma_client.retrieve_document(
    collection_name="your_collection_name",
    embeddings=[0.1, 0.2, 0.7],
    where={"file_id": {"$eq": "doc1"}},
    k=5
)

# List filter example
documents = chroma_client.retrieve_document(
    collection_name="your_collection_name",
    embeddings=[0.1, 0.2, 0.7],
    where={"file_id": {"$in": ["doc1", "doc2"]}},
    k=5
)

# Complex filter example
documents = chroma_client.retrieve_document(
    collection_name="your_collection_name",
    embeddings=[0.1, 0.2, 0.7],
    where={"$and": [{"user_id": {"$eq": "123"}}, {"file_id": {"$in": ["doc1"]}}]},
    k=5
)

# Without filter (retrieve all matching documents)
documents = chroma_client.retrieve_document(
    collection_name="your_collection_name",
    embeddings=[0.1, 0.2, 0.7],
    k=5
)

collection = chroma_client.get_collection(collection_name="your_collection_name")

chunks = chroma_client.fetch_chunks(collection_name="your_collection_name", files_id=["doc1"])

.. note::
   The following functions (``update_document``, ``delete_document``, ``list_collections``, and ``delete_collection``) are available only in version 1.1.0 or later.

# Update a document in the collection
updated_document = {
    "id": "001",
    "embeddings": [0.2, 0.3, 0.8],  # Updated embedding vector
    "page_content": "This is an updated document.",
    "metadatas": {"source": "updated_source", "file_id": "doc1"}
}
chroma_client.update_document(document=updated_document, collection_name="your_collection_name")

# Delete documents by ID
chroma_client.delete_document(ids=["001", "002"], collection_name="your_collection_name")

# Delete documents using where filter
chroma_client.delete_document(where={"file_id": "doc1"}, collection_name="your_collection_name")

# List all collections
collections = chroma_client.list_collections()

chroma_client.delete_collection(collection_name="your_collection_name")

# Use ChromaDB as a retriever for RAG workflows
retriever = chroma_client.as_retriever(
    collection_name="your_collection_name",
    embedding_model="your_embedding_model_instance"
)

Required Environment Variables:

  • CHROMA_PERSIST_DIRECTORY – Path to the directory where ChromaDB will persist data locally

2. PineconeVectorDb#

PineconeVectorDb integrates with Pinecone to manage vector search using cloud-hosted infrastructure.

from elsai_vectordb.pinecone import PineconeVectorDb

pinecone_client = PineconeVectorDb(
    index_name="testingindex",
    pinecone_api_key="pinecone_api_key",  # Or set in environment variable PINECONE_API_KEY
    dimension=1536  # Example dimension size
)

pinecone_client.add_document(
    document={
        "id": "001",
        "embeddings": [0.1, 0.2, 0.7],  # Replace with a 1536-dimension vector
        "page_content": "This is a sample document.",
        "metadatas": {"source": "example_source", "file_id": "doc1"}
    },
    namespace="namespacename"
)

.. note::
   The filter-based ``retrieve_document`` functionality (using the ``filter`` parameter) is available only in version 2.1.0 or later.

# Simple filter example
results = pinecone_client.retrieve_document(
    namespace="namespacename",
    question_embedding=[0.1, 0.2, 0.7],
    filter={"user_id": "123"},
    k=5
)

# List filter example
results = pinecone_client.retrieve_document(
    namespace="namespacename",
    question_embedding=[0.1, 0.2, 0.7],
    filter={"file_id": {"$in": ["doc1", "doc2"]}},
    k=5
)

# Complex filter example
results = pinecone_client.retrieve_document(
    namespace="namespacename",
    question_embedding=[0.1, 0.2, 0.7],
    filter={"$and": [{"user_id": "123"}, {"file_id": {"$in": ["doc1"]}}]},
    k=5
)

# Without filter (retrieve all matching documents)
results = pinecone_client.retrieve_document(
    namespace="namespacename",
    question_embedding=[0.1, 0.2, 0.7],
    k=5
)

.. note::
   The following functions (``update_document``, ``delete_document``, ``list_namespaces``, and ``delete_namespace``) are available only in version 1.1.0 or later.

# Update a document in the Pinecone index
updated_document = {
    "id": "001",
    "embeddings": [0.2, 0.3, 0.8],  # Updated embedding vector (1536 dimensions)
    "page_content": "This is an updated document.",
    "metadatas": {"source": "updated_source", "file_id": "doc1"}
}
pinecone_client.update_document(document=updated_document, namespace="namespacename")

# Delete documents by ID
pinecone_client.delete_document(ids=["001", "002"], namespace="namespacename")

# Delete documents using metadata filter
pinecone_client.delete_document(filter={"file_id": "doc1"}, namespace="namespacename")

# List all namespaces
namespaces = pinecone_client.list_namespaces()

# Delete a namespace
pinecone_client.delete_namespace(namespace="namespacename")

# Use Pinecone as a retriever for RAG workflows
retriever = pinecone_client.as_retriever(
    namespace="namespacename",
    embedding_model="your_embedding_model_instance"
)

Required Environment Variables:

  • PINECONE_API_KEY – API key to authenticate with Pinecone vector DB

3. WeaviateVectorDb#

WeaviateVectorDb handles operations for managing and querying vectors in Weaviate with flexible filters and native retriever support.

Key methods:

  • add_context: Adds context with optional custom vectors; creates the collection if it does not exist.

  • get_context / get_context_by_vector: Semantic search via text or pre-computed embeddings.

  • get_last_n_chats_by_filter / delete_chats_by_filter: Retrieve or delete chats using any filter dictionary or Filter object (supports timestamp bounds).

  • get_object_by_uuid / update_object_by_uuid / delete_object_by_uuid: CRUD helpers for individual objects.

  • delete_collection / close / as_retriever: Manage lifecycle and create a NativeWeaviateRetriever for RAG.

Initialize client

from datetime import datetime, timezone
from elsai_vectordb.weaviate import WeaviateVectorDb
# from elsai_vectordb.weaviate import NativeWeaviateRetriever
# from elsai_embeddings.azure_embeddings import AzureOpenAIEmbeddingModel

default_schema = {
    "content": "TEXT",
    "question": "TEXT",
    "executed_sql_query": "TEXT",
    "generated_sql_query": "TEXT",
    "user_id": "TEXT",
    "createdAt": "DATE",
}

temp_db = WeaviateVectorDb(
    connection_type="local",  # or "cloud" with cluster_url/auth_credentials
    host="localhost",
    port=8080,
    collection_name="test_collection_1",
    schema=default_schema,
)

Add contexts

data1 = {
    "content": "Python is a high-level programming language known for its simplicity and readability.",
    "question": "What is Python?",
    "user_id": "user123",
    "createdAt": datetime.now(timezone.utc).isoformat(),
    "executed_sql_query": "SELECT * FROM languages WHERE name='Python'",
}

data2 = {
    "content": "Machine learning is a subset of artificial intelligence that enables systems to learn from data.",
    "question": "What is machine learning?",
    "user_id": "user123",
    "createdAt": datetime.now(timezone.utc).isoformat(),
    "executed_sql_query": "SELECT * FROM ai_topics WHERE topic='ML'",
}

vector1 = [0.1] * 1536
vector2 = [0.2] * 1536

# Provide vectors when DEFAULT_VECTORIZER_MODULE is "none"
temp_db.add_context(data=data1, vector=vector1)
temp_db.add_context(data=data2, vector=vector2)

Semantic retrieval (by vector)

semantic_results = temp_db.get_context_by_vector(query_vector=vector2, limit=5)
print("Semantic results:", semantic_results)

Filtering (simple and complex)

# Simple filter
filter_dict = {"property": "user_id", "operator": "equal", "value": "user123"}
last_n_chats = temp_db.get_last_n_chats_by_filter(filter=filter_dict, limit=5)
print("Last N chats by filter:", last_n_chats)

# Complex filter with timestamp + fuzzy match
complex_filter = {
    "operator": "and",
    "filters": [
        {"property": "createdAt", "operator": "greater_or_equal", "value": datetime(2024, 1, 1, 0, 0, 0, tzinfo=timezone.utc).isoformat()},
        {"property": "question", "operator": "like", "value": "*Python*"},
    ],
}
last_n_chats_complex = temp_db.get_last_n_chats_by_filter(filter=complex_filter, limit=5)
print("Last N chats by complex filter:", last_n_chats_complex)

Object-level operations

object_by_uuid = temp_db.get_object_by_uuid(uuid="6628a50a-b784-425c-9db8-bb03cdbd45d4", include_vector=False)
print("Object by UUID:", object_by_uuid)

update_data = {
    "content": "Python is a high-level programming language known for its simplicity",
    "question": "What is Python?",
    "user_id": "user123",
    "createdAt": datetime.now(timezone.utc).isoformat(),
    "executed_sql_query": "SELECT * FROM languages WHERE name='Python'",
}

temp_db.update_object_by_uuid(uuid="de644fe1-9d7c-40d8-8a73-05ee1845a4cd", data=update_data)
temp_db.delete_object_by_uuid(uuid="6628a50a-b784-425c-9db8-bb03cdbd45d4")

Delete collection (destructive)

temp_db.delete_collection()

Close the connection

temp_db.close()

4. FaissVectorDb#

FaissVectorDb provides an interface for managing local vector embeddings using FAISS. It supports various index types and similarity metrics for optimized search operations.

.. note::
   The FAISS component is available only in version 2.2.0 or later.

from elsai_vectordb.FAISS.faiss_vectordb import FaissVectorDb

db = FaissVectorDb(
    persist_directory="./my_faiss_store",
    dimension=768,
    index_type="flat",
    metric="cosine"
)

# Create a collection
db.create_if_not_exists("my_collection")

# Adding a single document
document = {
    "id": "doc_0",
    "embeddings": [0.1, 0.2, 0.7],  # Example embedding vector
    "page_content": "This is a basic hello world document.",
    "metadatas": {"file_id": "f1", "source": "electronic"}
}
db.add_document(document, "my_collection")

# Adding documents in batch
batch_documents = [
    {
        "id": "doc_1",
        "embeddings": [0.1, 0.2, 0.7],
        "page_content": "Detailed content for chunk 1.",
        "metadatas": {"file_id": "file_no_1", "category": "health"}
    },
]
db.add_documents(batch_documents, "my_collection")

# Standard Retrieval
query_vector = [0.1, 0.2, 0.7]  # Example query embedding (replace with a vector matching the configured dimension)
results = db.retrieve_document(
    collection_name="my_collection",
    embeddings=query_vector,
    k=5
)

# Filtered Retrieval (by file_id)
filtered_results = db.retrieve_document(
    collection_name="my_collection",
    embeddings=query_vector,
    files_id=["file_no_1"],
    k=3
)

# Fetch Chunks
chunks = db.fetch_chunks("my_collection", files_id=["file_no_1"])

# Update a Document
updated_document = {
    "id": "doc_1",
    "embeddings": [0.1, 0.2, 0.7],
    "page_content": "This is UPDATED content.",
    "metadatas": {"file_id": "file_no_5", "category": "test"}
}
db.update_document("doc_1", updated_document, "my_collection")

# Delete a Document
db.delete_document("doc_0", "my_collection")

# List Collections
collections = db.list_collections()

# LangChain Integration (pass your embedding model instance)
retriever = db.as_retriever(collection_name="my_collection", embedding_model="your_embedding_model_instance")
lc_docs = retriever.invoke("search query")

# Delete Collection
db.delete_collection("my_collection")

Initialization Parameters:

  • persist_directory – Path to the directory where FAISS index and metadata will be stored.

  • dimension – Dimension of the embedding vectors.

  • index_type – Type of FAISS index (e.g., “flat”, “ivf”).

  • metric – Similarity metric (e.g., “cosine”, “l2”).

  • nlist – (Optional) Number of clusters for IVF index.

  • nprobe – (Optional) Number of clusters to search during retrieval for IVF index.