Spaces:
Sleeping
Sleeping
from typing import List, Dict, Any, Optional | |
import logging | |
from elasticsearch import exceptions | |
from elastic.es_client import get_es_client | |
logger = logging.getLogger(__name__) | |
es_client = get_es_client() | |
def search_certification_chunks( | |
index_name: str, | |
text_query: str, | |
vector_query: List[float], | |
certification_name: str, | |
es_client=es_client, | |
vector_field: str = "embedding", | |
text_field: str = "chunk", | |
size: int = 5, | |
min_score: float = 0.1, # Lowered threshold | |
boost_text: float = 1.0, | |
boost_vector: float = 1.0, | |
) -> List[Dict[str, Any]]: | |
# First verify the certification value exists | |
cert_check = es_client.search( | |
index=index_name, | |
body={ | |
"query": {"term": {"certification": certification_name}}, | |
"size": 1, | |
}, | |
) | |
if not cert_check["hits"]["hits"]: | |
logger.error(f"No documents found with certification: {certification_name}") | |
return [] | |
# Then proceed with hybrid search | |
query_body = { | |
"size": size, | |
"query": { | |
"bool": { | |
"should": [ | |
{"match": {"chunk": text_query}}, | |
{ | |
"script_score": { | |
"query": {"match_all": {}}, | |
"script": { | |
"source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0", | |
"params": {"query_vector": vector_query}, | |
}, | |
} | |
}, | |
] | |
} | |
}, | |
} | |
logger.debug(f"Elasticsearch query body: {query_body}") | |
logger.info(f"Executing search on index '{index_name}'") | |
response = es_client.search(index=index_name, body=query_body, routing=cert_check["hits"]["hits"][0]["_id"]) | |
hits = response.get("hits", {}).get("hits", []) | |
logger.info(f"Found {len(hits)} matching documents") | |
# Process results with correct field names | |
results = [ | |
{ | |
"id": hit["_id"], | |
"score": hit["_score"], | |
"text": hit["_source"]["chunk"], | |
"source_file": hit["_source"]["source_file"], | |
} | |
for hit in hits | |
] | |
if results: | |
logger.debug(f"Top result score: {results[0]['score']}") | |
logger.debug(f"Top result source: {results[0]['source_file']}") | |
else: | |
logger.warning("No results returned from Elasticsearch") | |
return results | |