-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpostgres_ops.py
200 lines (178 loc) · 6.88 KB
/
postgres_ops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
"""
This module provides a class for interacting with a PostgreSQL database.
It includes methods for storing, retrieving, and searching documents and their embeddings.
"""
import json
from typing import Any, Dict, List

import psycopg2
from pgvector.psycopg2 import register_vector
from psycopg2.extras import execute_values
class PostgresOperations:
    """Persistence layer for documents and their pgvector embeddings.

    Maintains two tables:
      * ``documents``  — full text plus the chunk list stored as JSONB.
      * ``embeddings`` — one vector per chunk, FK back to ``documents``.

    Similarity search uses pgvector's ``<->`` (Euclidean distance) operator.
    """

    # Must match the dimensionality of the embedding model in use
    # (1536 is the OpenAI text-embedding-ada-002 / 3-small size).
    EMBEDDING_DIM = 1536

    def __init__(self, host='postgres', port=5432, dbname='ragdb', user='raguser', password='ragpass'):
        """
        Open a connection and ensure the schema exists.

        Args:
            host (str): The database host.
            port (int): The database port.
            dbname (str): The name of the database.
            user (str): The database user.
            password (str): The database password.
        """
        self.conn = psycopg2.connect(
            dbname=dbname,
            user=user,
            password=password,
            host=host,
            port=port
        )
        self.create_tables()

    @staticmethod
    def _decode_chunks(chunks: Any) -> List[str]:
        """Normalize a JSONB ``chunks`` value to a Python list.

        psycopg2 auto-decodes JSONB columns into Python objects, so the
        value is usually already a list; fall back to ``json.loads`` only
        when a raw string is returned (e.g. with a text cast or a custom
        type adapter).
        """
        return chunks if isinstance(chunks, list) else json.loads(chunks)

    def create_tables(self):
        """Create the vector extension and the necessary tables if they don't exist."""
        with self.conn.cursor() as cur:
            # The extension must exist before the vector type can be used
            # or registered with the driver.
            cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
            register_vector(cur)
            # Create documents table
            cur.execute("""
                CREATE TABLE IF NOT EXISTS documents (
                    id SERIAL PRIMARY KEY,
                    filename TEXT UNIQUE,
                    content TEXT,
                    chunks JSONB
                )
            """)
            # Create embeddings table
            cur.execute(f"""
                CREATE TABLE IF NOT EXISTS embeddings (
                    id SERIAL PRIMARY KEY,
                    document_id INTEGER REFERENCES documents(id),
                    chunk_index INTEGER,
                    embedding vector({self.EMBEDDING_DIM})
                )
            """)
        self.conn.commit()

    def store_document(self, filename: str, content: str, chunks: List[str], embeddings: List[List[float]]):
        """
        Store a document and its embeddings in the database atomically.

        Rolls back on any failure so the connection is not left in an
        aborted-transaction state (which would break subsequent queries).

        Args:
            filename (str): The name of the document file. Must be unique;
                a duplicate raises ``psycopg2.IntegrityError``.
            content (str): The full content of the document.
            chunks (List[str]): The document split into chunks.
            embeddings (List[List[float]]): The embeddings for each chunk,
                parallel to ``chunks``.
        """
        try:
            with self.conn.cursor() as cur:
                register_vector(cur)
                # Store document and chunks
                cur.execute(
                    "INSERT INTO documents (filename, content, chunks) VALUES (%s, %s, %s) RETURNING id",
                    (filename, content, json.dumps(chunks))
                )
                document_id = cur.fetchone()[0]
                # Store embeddings in one round trip
                execute_values(cur,
                    "INSERT INTO embeddings (document_id, chunk_index, embedding) VALUES %s",
                    [(document_id, i, embedding) for i, embedding in enumerate(embeddings)],
                    template="(%s, %s, %s::vector)"
                )
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise

    def get_document(self, filename: str) -> Dict[str, Any]:
        """
        Retrieve a document and its embeddings from the database.

        Args:
            filename (str): The name of the document file.

        Returns:
            Dict[str, Any]: Keys ``filename``, ``content``, ``chunks``,
            ``embeddings``; or ``None`` if not found (or if the document
            has no embedding rows, since an inner join is used).
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT d.content, d.chunks, array_agg(e.embedding ORDER BY e.chunk_index) as embeddings
                FROM documents d
                JOIN embeddings e ON d.id = e.document_id
                WHERE d.filename = %s
                GROUP BY d.id
            """, (filename,))
            result = cur.fetchone()
            if result:
                content, chunks, embeddings = result
                return {
                    "filename": filename,
                    "content": content,
                    "chunks": self._decode_chunks(chunks),
                    "embeddings": embeddings
                }
            return None

    def get_all_documents(self) -> List[Dict[str, Any]]:
        """
        Retrieve all documents and their embeddings from the database.

        Returns:
            List[Dict[str, Any]]: One dict per document with keys
            ``filename``, ``content``, ``chunks``, ``embeddings``.
            Documents without embedding rows are omitted (inner join).
        """
        with self.conn.cursor() as cur:
            cur.execute("""
                SELECT d.filename, d.content, d.chunks, array_agg(e.embedding ORDER BY e.chunk_index) as embeddings
                FROM documents d
                JOIN embeddings e ON d.id = e.document_id
                GROUP BY d.id
            """)
            results = cur.fetchall()
            return [
                {
                    "filename": filename,
                    "content": content,
                    "chunks": self._decode_chunks(chunks),
                    "embeddings": embeddings
                }
                for filename, content, chunks, embeddings in results
            ]

    def search_similar_chunks(self, query_embedding: List[float], top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Search for chunks similar to the query embedding.

        Args:
            query_embedding (List[float]): The embedding of the query;
                must have ``EMBEDDING_DIM`` components.
            top_k (int): The number of results to return.

        Returns:
            List[Dict[str, Any]]: The ``top_k`` nearest chunks, each with
            ``filename``, ``chunk`` and ``distance`` (L2, smaller = closer).
        """
        with self.conn.cursor() as cur:
            register_vector(cur)
            # jsonb ``->`` with an integer key indexes into the chunks array.
            cur.execute("""
                SELECT d.filename, d.chunks->e.chunk_index as chunk, e.embedding <-> %s::vector as distance
                FROM embeddings e
                JOIN documents d ON e.document_id = d.id
                ORDER BY distance
                LIMIT %s
            """, (query_embedding, top_k))
            results = cur.fetchall()
            return [
                {
                    "filename": filename,
                    "chunk": chunk,
                    "distance": distance
                }
                for filename, chunk, distance in results
            ]

    def clear_db(self):
        """Clear all data from the database."""
        with self.conn.cursor() as cur:
            cur.execute("TRUNCATE TABLE embeddings")
            # CASCADE also truncates tables with FK references to documents.
            cur.execute("TRUNCATE TABLE documents CASCADE")
        self.conn.commit()
        print("Database cleared.")

    def print_db_contents(self) -> List[Dict[str, Any]]:
        """
        Retrieve a summary of all documents in the database.

        Returns:
            List[Dict[str, Any]]: Per-document ``filename``, a content
            preview capped at 1000 chars, and chunk/embedding counts.
        """
        docs = self.get_all_documents()
        return [
            {
                "filename": doc["filename"],
                "content_preview": doc["content"][:1000] + "..." if len(doc["content"]) > 1000 else doc["content"],
                "chunks_count": len(doc["chunks"]),
                "embeddings_count": len(doc["embeddings"])
            }
            for doc in docs
        ]

    def __del__(self):
        """Close the database connection when the object is destroyed.

        Uses ``getattr`` because ``self.conn`` never exists if
        ``psycopg2.connect`` raised inside ``__init__``; also skips
        already-closed connections.
        """
        conn = getattr(self, "conn", None)
        if conn is not None and not conn.closed:
            conn.close()