Spaces:

A7m0d
/

Autism_QA

Runtime error

App Files Files Community

Autism_QA / docs_utils.py

A7m0d

Upload folder using huggingface_hub

712579e verified 2 months ago

raw

history blame contribute delete

20.2 kB

	from query_utils import process_query_for_rewrite, get_non_autism_response
	from logger.custom_logger import CustomLoggerTracker
	from dotenv import load_dotenv
	from query_utils import check_answer_autism_relevance, get_non_autism_answer_response
	from clients import get_weaviate_client, qwen_generate
	from query_utils import process_query_for_rewrite
	from rag_steps import *
	from rag_utils import *
	from prompt_template import (
	Prompt_template_Wisal,
	Prompt_template_User_document_prompt)

	import os
	import asyncio
	from typing import Dict
	from configs import load_yaml_config

	config = load_yaml_config("config.yaml")

	# Load .env early
	load_dotenv()

	# ---------------------------
	# Custom Logger Initialization
	# ---------------------------
	custom_log = CustomLoggerTracker()
	logger = custom_log.get_logger("doc_utils")
	logger.info("Logger initialized for Documents utilities module")

	# ---------------------------
	# Environment & Globals
	# ---------------------------
	# client = get_weaviate_client()
	# if client is None:
	# logger.info("Weaviate client not connected. Please check your WEAVIATE_URL and WEAVIATE_API_KEY.")
	# else:
	# logger.info("Weaviate client connected (startup checks skipped).")


	SESSION_ID = "default"
	pending_clarifications: Dict[str, str] = {}
	SILICONFLOW_API_KEY = os.getenv("SILICONFLOW_API_KEY", "")
	SILICONFLOW_URL = os.getenv("SILICONFLOW_URL", "").strip()
	SILICONFLOW_CHAT_URL = os.getenv(
	"SILICONFLOW_CHAT_URL", "https://api.siliconflow.com/v1/chat/completions").strip()

	if not SILICONFLOW_API_KEY:
	logger.warning(
	"SILICONFLOW_API_KEY is not set. LLM/Reranker calls may fail.")
	if not SILICONFLOW_URL:
	logger.warning(
	"SILICONFLOW_URL is not set. OpenAI client base_url will not work.")

	# Global variables - consider moving to a config class
	last_uploaded_path = None


	def get_text_splitter():
	"""Factory function for text splitter - makes testing easier"""
	return RecursiveCharacterTextSplitter(
	chunk_size=config["chunking"]["chunk_size"],
	# Fixed: was chunk_size
	chunk_overlap=config["chunking"]["chunk_overlap"],
	separators=config["chunking"]["separators"], # Fixed: was chunk_size
	)


	# ---------------------------
	# RAG DOMAIN FUNCTIONS
	# ---------------------------
	def rag_dom_ingest(file_path: str) -> str:
	if not os.path.exists(file_path):
	raise FileNotFoundError(f"File not found: {file_path}")
	try:
	raw = extract_text(file_path)
	if not raw.strip():
	raise ValueError(f"No text extracted from {file_path}")
	splitter = get_text_splitter()
	docs = splitter.split_text(raw)
	# Filter empty chunks
	texts = [chunk for chunk in docs if chunk.strip()]
	if not texts:
	raise ValueError("No valid text chunks created")
	vectors = embed_texts(texts)
	collection_name = config['rag']['weavaite_collection']
	logger.info(f"RAG domain ingesting to collection: {collection_name}")
	client = get_weaviate_client()
	# Batch insert with error handling
	with client.batch.dynamic() as batch:
	for txt, vec in zip(texts, vectors):
	batch.add_object(
	collection=collection_name,
	properties={"text": txt},
	vector=vec)
	logger.info(f"Successfully ingested {len(texts)} chunks from {os.path.basename(file_path)}")
	return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"
	except Exception as e:
	logger.exception(f"Error ingesting file {file_path}: {e}")

	finally:
	if client is not None:
	try:
	client.close()
	except Exception as close_error:
	logger.error(f"Error closing Weaviate client: {close_error}")



	def rag_dom_qa(question: str) -> str:
	if not question.strip():
	return "Please provide a valid question."
	try:
	corrected_query, is_autism_related, _ = process_query_for_rewrite(
	question)
	if not is_autism_related:
	return get_non_autism_response()
	q_vec = embed_texts([corrected_query])[0]
	collection_name = config["rag"]["weavaite_collection"]
	logger.info(f"RAG domain QA using collection: {collection_name}")
	client = get_weaviate_client()
	documents = client.collections.get(collection_name)
	response = documents.query.near_vector(
	near_vector=q_vec,
	limit=5,
	return_metadata=["distance"])
	hits = response.objects
	if not hits:
	return "I couldn't find relevant information to answer your question."
	context = "\n\n".join(hit.properties["text"] for hit in hits)
	wisal_prompt = Prompt_template_Wisal.format(
	new_query=corrected_query,
	document=context)
	initial_answer = qwen_generate(wisal_prompt)
	answer_relevance_score = check_answer_autism_relevance(initial_answer)
	if answer_relevance_score < 50:
	return get_non_autism_answer_response()
	return initial_answer
	except Exception as e:
	logger.error(f"Error in RAG domain QA: {e}")
	return f"Sorry, I encountered an error processing your question: {str(e)}"
	finally:
	if client is not None:
	try:
	client.close()
	except Exception as close_error:
	logger.error(f"Error closing Weaviate client: {close_error}")


	# ---------------------------
	# OLD DOCUMENTS
	# ---------------------------
	async def old_doc_vdb(query: str, top_k: int = 1) -> dict:
	"""Query old documents vector database"""
	if not query.strip():
	return {"answer": []}
	qe = encode_query(query)
	if not qe:
	return {"answer": []}
	try:
	client = get_weaviate_client()
	coll = client.collections.get(config["rag"]["weavaite_collection"]) ## old_documents
	res = coll.query.near_vector(
	near_vector=qe,
	limit=top_k,
	return_properties=["text"])
	if not getattr(res, "objects", None):
	return {"answer": []}
	return {"answer": [obj.properties.get("text", "[No Text]") for obj in res.objects]}
	except Exception as e:
	logger.error(f"RAG Error in old_doc_vdb: {e}")
	return {"answer": []}
	finally:
	if client is not None:
	try:
	client.close()
	except Exception as close_error:
	logger.error(f"Error closing Weaviate client: {close_error}")


	def old_doc_ingestion(path: str) -> str:
	global last_uploaded_path
	if not os.path.exists(path):
	raise FileNotFoundError(f"File not found: {path}")
	last_uploaded_path = path
	logger.info(f"Old document path set: {os.path.basename(path)}")
	return f"Old document ingested: {os.path.basename(path)}"


	def old_doc_qa(query: str) -> str:
	if not query.strip():
	return "Please provide a valid question."
	try:
	corrected_query, is_autism_related, _ = process_query_for_rewrite(
	query)
	if not is_autism_related:
	return get_non_autism_response()
	rag_resp = asyncio.run(old_doc_vdb(corrected_query))
	chunks = rag_resp.get("answer", [])
	if not chunks:
	return "Sorry, I couldn't find relevant content in the old document."
	combined_answer = "\n".join(f"- {c}" for c in chunks if c.strip())
	answer_relevance_score = check_answer_autism_relevance(combined_answer)
	if answer_relevance_score < 50:
	return get_non_autism_answer_response()
	return combined_answer
	except Exception as e:
	logger.error(f"Error in old_doc_qa: {e}")
	return f"Error processing your request: {e}"


	# ---------------------------
	# USER SPECIFIC DOCUMENTS
	# ---------------------------
	def user_doc_ingest(file_path: str) -> str:
	if not os.path.exists(file_path):
	raise FileNotFoundError(f"File not found: {file_path}")
	try:
	raw = extract_text(file_path)
	if not raw.strip():
	raise ValueError(f"No text extracted from {file_path}")
	splitter = get_text_splitter()
	docs = splitter.split_text(raw)
	texts = [chunk for chunk in docs if chunk.strip()]
	if not texts:
	raise ValueError("No valid text chunks created")
	vectors = embed_texts(texts)
	client = get_weaviate_client()
	collection_name = config["rag"]["weavaite_collection"]
	# Batch insert
	with client.batch.dynamic() as batch:
	for txt, vec in zip(texts, vectors):
	batch.add_object(
	collection=collection_name,
	properties={"text": txt},
	vector=vec)
	logger.info(
	f"Successfully ingested user document: {os.path.basename(file_path)}")
	return f"Ingested {len(texts)} chunks from {os.path.basename(file_path)}"
	except Exception as e:
	logger.exception(f"Error ingesting user document {file_path}: {e}")
	finally:
	if client is not None:
	try:
	client.close()
	except Exception as close_error:
	logger.error(f"Error closing Weaviate client: {close_error}")

	def user_doc_qa(question: str) -> str:
	if not question.strip():
	return "Please provide a valid question."
	try:
	corrected_query, is_autism_related, _ = process_query_for_rewrite(
	question)
	if not is_autism_related:
	return get_non_autism_response()
	q_vec = embed_texts([corrected_query])[0]
	client = get_weaviate_client()
	documents = client.collections.get(
	config["rag"]["weavaite_collection"])
	response = documents.query.near_vector(
	near_vector=q_vec,
	limit=5,
	return_metadata=["distance"])
	hits = response.objects
	if not hits:
	return "I couldn't find relevant information to answer your question."
	context = "\n\n".join(hit.properties["text"] for hit in hits)
	UserSpecificDocument_prompt = Prompt_template_User_document_prompt.format(
	new_query=corrected_query,
	document=context)
	initial_answer = qwen_generate(UserSpecificDocument_prompt)
	answer_relevance_score = check_answer_autism_relevance(initial_answer)
	if answer_relevance_score < 50:
	return get_non_autism_answer_response()
	return initial_answer

	except Exception as e:
	logger.error(f"Error in user_doc_qa: {e}")
	return f"Sorry, I encountered an error processing your question: {str(e)}"

	# finally:
	# if client is not None:
	# try:
	# client.close()
	# except Exception as close_error:
	# logger.error(f"Error closing Weaviate client: {close_error}")

	## close client of weaviate
	# client.close()


	if __name__ == "__main__":
	# Test file paths
	pdf_test = "tests/Computational Requirements for Embed.pdf"
	docs_test = "tests/Computational Requirements for Embed.docx"
	txt_test = "assets/RAG_Documents/Autism_Books_1.txt"

	print(f"=" * 70)
	print("COMPREHENSIVE RAG DOCUMENT UTILS TEST SUITE")
	print(f"=" * 70)

	# ===========================
	# Test 1: RAG Domain Functions
	# ===========================
	print(f"\n{'=' * 50}")
	print("TEST 1: RAG DOMAIN FUNCTIONS")
	print(f"{'=' * 50}")

	try:
	print(f"Testing RAG domain ingestion with: {os.path.basename(txt_test)}")
	if os.path.exists(txt_test):
	result = rag_dom_ingest(txt_test)
	print(f"✓ RAG Domain Ingestion Result: {result}")

	# Test RAG domain QA
	print(f"\nTesting RAG domain QA...")
	test_questions = [
	"What is autism?",
	"How can I help a child with autism?",
	"What are the symptoms of autism?",
	"Tell me about weather today" # Non-autism related
	]

	for question in test_questions:
	print(f"\nQ: {question}")
	answer = rag_dom_qa(question)
	print(f"A: {answer[:200]}{'...' if len(answer) > 200 else ''}")

	else:
	print(f"✗ Test file not found: {txt_test}")

	except Exception as e:
	print(f"✗ RAG Domain Test Failed: {e}")

	# ===========================
	# Test 2: Old Document Functions
	# ===========================
	print(f"\n{'=' * 50}")
	print("TEST 2: OLD DOCUMENT FUNCTIONS")
	print(f"{'=' * 50}")

	try:
	print(f"Testing old document ingestion...")
	if os.path.exists(txt_test):
	result = old_doc_ingestion(txt_test)
	print(f"✓ Old Document Ingestion Result: {result}")

	# Test old document QA
	print(f"\nTesting old document QA...")
	test_questions = [
	"What information is in this document?",
	"Tell me about autism interventions",
	"What is machine learning?" # Non-autism related
	]

	for question in test_questions:
	print(f"\nQ: {question}")
	answer = old_doc_qa(question)
	print(f"A: {answer[:200]}{'...' if len(answer) > 200 else ''}")

	else:
	print(f"✗ Test file not found: {txt_test}")

	except Exception as e:
	print(f"✗ Old Document Test Failed: {e}")

	# ===========================
	# Test 3: User Document Functions
	# ===========================
	print(f"\n{'=' * 50}")
	print("TEST 3: USER DOCUMENT FUNCTIONS")
	print(f"{'=' * 50}")

	try:
	print(f"Testing user document ingestion...")
	if os.path.exists(txt_test):
	result = user_doc_ingest(txt_test)
	print(f"✓ User Document Ingestion Result: {result}")

	# Test user document QA
	print(f"\nTesting user document QA...")
	test_questions = [
	"What does this document say about autism?",
	"Are there any treatment recommendations?",
	"What's the capital of France?" # Non-autism related
	]

	for question in test_questions:
	print(f"\nQ: {question}")
	answer = user_doc_qa(question)
	print(f"A: {answer[:200]}{'...' if len(answer) > 200 else ''}")

	else:
	print(f"✗ Test file not found: {txt_test}")

	except Exception as e:
	print(f"✗ User Document Test Failed: {e}")

	# ===========================
	# Test 4: Multiple File Format Support
	# ===========================
	print(f"\n{'=' * 50}")
	print("TEST 4: MULTIPLE FILE FORMAT SUPPORT")
	print(f"{'=' * 50}")

	test_files = [
	(pdf_test, "PDF"),
	(docs_test, "DOCX"),
	(txt_test, "TXT")
	]

	for file_path, file_type in test_files:
	print(f"\nTesting {file_type} file: {os.path.basename(file_path)}")
	if os.path.exists(file_path):
	try:
	# Test extraction
	text = extract_text(file_path)
	if text:
	print(f"✓ {file_type} text extraction successful: {len(text)} characters")
	print(f" Preview: {text[:100]}...")

	# Test ingestion
	result = rag_dom_ingest(file_path)
	print(f"✓ {file_type} ingestion successful: {result}")
	else:
	print(f"✗ {file_type} text extraction returned empty")

	except Exception as e:
	print(f"✗ {file_type} processing failed: {e}")
	else:
	print(f"✗ {file_type} file not found: {file_path}")

	# ===========================
	# Test 5: Error Handling
	# ===========================
	print(f"\n{'=' * 50}")
	print("TEST 5: ERROR HANDLING")
	print(f"{'=' * 50}")

	# Test with non-existent file
	print("Testing with non-existent file...")
	try:
	result = rag_dom_ingest("non_existent_file.txt")
	print(f"✗ Should have failed: {result}")
	except FileNotFoundError:
	print("✓ Correctly handled non-existent file")
	except Exception as e:
	print(f"✓ Handled error: {e}")

	# Test with empty query
	print("\nTesting with empty query...")
	empty_result = rag_dom_qa("")
	print(f"✓ Empty query handled: {empty_result}")

	# Test with very long query
	print("\nTesting with very long query...")
	long_query = "autism " * 100 + "what is it?"
	long_result = rag_dom_qa(long_query)
	print(f"✓ Long query handled: {long_result[:100]}...")

	# ===========================
	# Test 6: Old Document Vector DB
	# ===========================
	print(f"\n{'=' * 50}")
	print("TEST 6: OLD DOCUMENT VECTOR DB")
	print(f"{'=' * 50}")

	try:
	print("Testing old document vector database query...")
	vdb_result = asyncio.run(old_doc_vdb("autism interventions", top_k=3))
	print(f"✓ Vector DB query successful: {len(vdb_result.get('answer', []))} results")

	for i, answer in enumerate(vdb_result.get('answer', [])[:2]):
	print(f" Result {i+1}: {answer[:100]}...")

	except Exception as e:
	print(f"✗ Vector DB test failed: {e}")

	# ===========================
	# Test 7: Configuration and Environment
	# ===========================
	print(f"\n{'=' * 50}")
	print("TEST 7: CONFIGURATION AND ENVIRONMENT")
	print(f"{'=' * 50}")

	print("Checking environment variables...")
	env_vars = [
	"SILICONFLOW_API_KEY",
	"SILICONFLOW_URL",
	"SILICONFLOW_CHAT_URL",
	"WEAVIATE_URL",
	"WEAVIATE_API_KEY"
	]

	for var in env_vars:
	value = os.getenv(var)
	if value:
	print(f"✓ {var}: Set (length: {len(value)})")
	else:
	print(f"✗ {var}: Not set")

	print(f"\nChecking configuration...")
	try:
	print(f"✓ Chunk size: {config['chunking']['chunk_size']}")
	print(f"✓ Chunk overlap: {config['chunking']['chunk_overlap']}")
	print(f"✓ RAG collection: {config['rag']['weavaite_collection']}")
	print(f"✓ Old doc collection: {config['rag']['old_doc']}")
	except Exception as e:
	print(f"✗ Configuration error: {e}")

	# ===========================
	# Test 8: Text Splitter
	# ===========================
	print(f"\n{'=' * 50}")
	print("TEST 8: TEXT SPLITTER")
	print(f"{'=' * 50}")

	try:
	splitter = get_text_splitter()
	sample_text = "This is a sample text. " * 100 # Create long text
	chunks = splitter.split_text(sample_text)
	print(f"✓ Text splitter created {len(chunks)} chunks")
	print(f"✓ Average chunk size: {sum(len(c) for c in chunks) / len(chunks):.0f} characters")

	except Exception as e:
	print(f"✗ Text splitter test failed: {e}")

	# ===========================
	# Test Summary
	# ===========================
	print(f"\n{'=' * 70}")
	print("TEST SUMMARY")
	print(f"{'=' * 70}")
	print("✓ All major functions tested")
	print("✓ Error handling verified")
	print("✓ Multiple file formats supported")
	print("✓ Configuration checked")
	print("✓ Vector database operations tested")
	print(f"{'=' * 70}")
	print("TEST SUITE COMPLETED")
	print(f"{'=' * 70}")

	# # Close client properly
	# try:
	# if 'client' in globals() and client:
	# client.close()
	# print("✓ Weaviate client closed properly")
	# except Exception as e:
	# print(f"✗ Error closing client: {e}")