Source code for openchatbi.catalog.schema_retrival

"""Schema and column retrieval functionality for finding relevant database structures."""

import os
import re

import Levenshtein

from openchatbi import config
from openchatbi.catalog.retrival_helper import build_column_tables_mapping, build_columns_retriever
from openchatbi.text_segmenter import _segmenter
from openchatbi.utils import log

# Skip build during documentation build
if not os.environ.get("SPHINX_BUILD"):
    try:
        _catalog_store = config.get().catalog_store
    except ValueError:
        _catalog_store = None
else:
    _catalog_store = None

if _catalog_store:
    bm25, vector_db, columns, col_dict = build_columns_retriever(_catalog_store, config.get().vector_db_path)
    column_tables_mapping = build_column_tables_mapping(_catalog_store)
else:
    bm25, vector_db, columns, col_dict = None, None, [], {}
    column_tables_mapping = {}


[docs] def column_retrieval(query, db, k=10, threshold=0.5, filter=None): """Retrieves relevant columns based on a similarity search. Args: query (str): The query string to search for. db: The vector database to search in. k (int, optional): The number of top results to return. Defaults to 10. threshold (float, optional): The similarity threshold for filtering results. Defaults to 0.5. filter (dict, optional): A filter to apply to the search. Defaults to None. Returns: list: List of relevant column names. """ log(f"Get the top relevant columns for query: {query}") similar_column_key_scores = db.similarity_search_with_score(query, k=k, filter=filter) # log(f"similar_column_key_scores: {similar_column_key_scores}") column_names = [key.metadata["column_name"] for (key, score) in similar_column_key_scores if score < threshold] log(f"Filtered relevant columns: {column_names}") return column_names
[docs] def merge_list(list1, list2): return list(set(list1 + list2))
[docs] def edit_distance_score(key1, key2): """Calculate normalized edit distance score between two strings. Returns: float: Score between 0 (identical) and 1 (completely different). """ dist = Levenshtein.distance(key1, key2) max_len = max(len(key1), len(key2)) return dist / max_len if max_len > 0 else 1
[docs] def get_relevant_columns(keywords_list, dimensions, metrics): """Get the most relevant columns for given keywords, dimensions, and metrics. Uses multiple retrieval methods (BM25, edit distance, vector similarity) to find the best matching columns. Args: keywords_list (list): General keywords to search for. dimensions (list): Dimension-specific keywords. metrics (list): Metric-specific keywords. Returns: list: Relevant column names. """ # 1. BM25 search for general keywords total_results = bm25_search(keywords_list, top_k=len(keywords_list) * 4) # 2. Edit distance search for exact matches keyword_len = len(keywords_list + dimensions + metrics) ed_results = edit_distance_search(keywords_list + dimensions + metrics, top_k=keyword_len, threshold=0.3) total_results = merge_list(total_results, ed_results) # 3. Vector similarity search for dimensions if dimensions: d_results = column_retrieval(" ".join(dimensions), vector_db, k=10, filter={"category": "dimension"}) total_results = merge_list(total_results, d_results) # 4. Vector similarity search for metrics if metrics: m_results = column_retrieval(" ".join(metrics), vector_db, k=10, threshold=0.55, filter={"category": "metric"}) total_results = merge_list(total_results, m_results) log(f"Relevant columns: {total_results}") return total_results