import spacy
import numpy as np

def get_nlp_model():
    """
    Load and return the spaCy NLP model. Downloads the model if not already installed.

    Returns:
        nlp: The loaded spaCy NLP model.
    """
    if not spacy.util.is_package("en_core_web_md"):
        print("Downloading en_core_web_md model...")
        spacy.cli.download("en_core_web_md")
        print("Model downloaded successfully!")
    nlp = spacy.load("en_core_web_md")
    return nlp
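
# A minimal memoization sketch (this helper is an editorial assumption, not part
# of the original module): spacy.load() is expensive, so callers that fetch the
# model repeatedly within a single process can cache the loaded model once.
from functools import lru_cache


@lru_cache(maxsize=1)
def get_nlp_model_cached():
    """Memoized wrapper around get_nlp_model() for repeated callers."""
    return get_nlp_model()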

def recursive_split_documents(contents, max_chunk_size=1000, overlap=100):
    """
    Split documents into smaller chunks using a recursive character text splitter.

    Args:
        contents (list): List of content dictionaries with 'page_content', 'title', and 'link'.
        max_chunk_size (int): Maximum size of each chunk.
        overlap (int): Overlap between chunks.

    Returns:
        list: List of chunks with text and metadata.
    """
    from langchain_core.documents.base import Document
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    documents = []
    for content in contents:
        try:
            page_content = content['page_content']
            if page_content:
                metadata = {'title': content['title'], 'source': content['link']}
                doc = Document(page_content=page_content, metadata=metadata)
                documents.append(doc)
        except Exception as e:
            print(f"Error processing content for {content['link']}: {e}")

    # Initialize the recursive text splitter and split the documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=max_chunk_size, chunk_overlap=overlap)
    split_documents = text_splitter.split_documents(documents)

    # Convert the split documents into plain dicts of text plus metadata
    chunks = []
    for doc in split_documents:
        chunk = {
            'text': doc.page_content,
            'metadata': {
                'title': doc.metadata.get('title', ''),
                'source': doc.metadata.get('source', '')
            }
        }
        chunks.append(chunk)
    return chunks
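
# Usage sketch (illustrative values only; the input shape follows the docstring
# above, not data from the original module):
#
#   contents = [{'page_content': long_text, 'title': 'Doc', 'link': 'https://example.com'}]
#   chunks = recursive_split_documents(contents, max_chunk_size=500, overlap=50)
#   chunks[0]
#   # {'text': '...', 'metadata': {'title': 'Doc', 'source': 'https://example.com'}}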

def semantic_search(query, chunks, nlp, similarity_threshold=0.5, top_n=10):
    """
    Perform semantic search to find relevant chunks based on similarity to the query.

    Args:
        query (str): The search query.
        chunks (list): List of text chunks with vectors.
        nlp: The spaCy NLP model.
        similarity_threshold (float): Minimum similarity score to consider a chunk relevant.
        top_n (int): Number of top relevant chunks to return.

    Returns:
        list: List of (chunk, similarity) tuples, best match first.
    """
    if not chunks:
        return []

    # Precompute the query vector and its norm
    query_vector = nlp(query).vector
    query_norm = np.linalg.norm(query_vector) + 1e-8  # Epsilon avoids division by zero

    # Check if chunks have precomputed vectors; if not, compute them in batches
    if 'vector' not in chunks[0]:
        texts = [chunk['text'] for chunk in chunks]
        batch_size = 1000  # Adjust based on available memory
        # Disable every pipe except tok2vec so nlp.pipe() does only the work
        # needed to produce document vectors
        with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'tok2vec']):
            # Consume the lazy nlp.pipe() generator inside the with-block so the
            # other pipes stay disabled while the texts are actually processed
            for chunk, doc in zip(chunks, nlp.pipe(texts, batch_size=batch_size)):
                chunk['vector'] = doc.vector

    # Stack chunk vectors and compute cosine similarity against the query
    chunk_vectors = np.array([chunk['vector'] for chunk in chunks])
    chunk_norms = np.linalg.norm(chunk_vectors, axis=1) + 1e-8  # Epsilon avoids division by zero
    similarities = np.dot(chunk_vectors, query_vector) / (chunk_norms * query_norm)

    # Keep chunks above the threshold, best match first
    relevant_chunks = [
        (chunk, sim) for chunk, sim in zip(chunks, similarities) if sim > similarity_threshold
    ]
    relevant_chunks.sort(key=lambda x: x[1], reverse=True)
    return relevant_chunks[:top_n]
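
# A small companion sketch (the helper name is an editorial assumption, not part
# of the original module): precompute chunk vectors once up front so repeated
# semantic_search() calls over the same chunks skip the nlp.pipe() pass.
def precompute_chunk_vectors(chunks, nlp, batch_size=1000):
    """Attach a 'vector' entry to each chunk in place and return the list."""
    texts = [chunk['text'] for chunk in chunks]
    for chunk, doc in zip(chunks, nlp.pipe(texts, batch_size=batch_size)):
        chunk['vector'] = doc.vector
    return chunks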

def query_rag(chat_llm, query, relevant_results, callbacks=None):
    """
    Generate a response using retrieval-augmented generation (RAG) based on relevant results.

    Args:
        chat_llm: The chat language model to use.
        query (str): The user's query.
        relevant_results (list): List of relevant chunks and their similarity scores.
        callbacks (list, optional): LangChain callbacks to pass through to the model.

    Returns:
        str: The generated response.
    """
    prompt = build_rag_prompt(query, relevant_results)
    response = chat_llm.invoke(prompt, config={"callbacks": callbacks or []}).content
    return response

def build_rag_prompt(query, relevant_results):
    """
    Build the RAG prompt by filling the template with the query and the
    JSON-formatted relevant chunks.
    """
    import web_rag as wr

    formatted_chunks = format_docs(relevant_results)
    prompt = wr.get_rag_prompt_template().format(query=query, context=formatted_chunks)
    return prompt

def format_docs(relevant_results):
    """
    Convert relevant search results into a JSON-formatted string.

    Args:
        relevant_results (list): List of (chunk, similarity) tuples with metadata.

    Returns:
        str: JSON-formatted string of document chunks.
    """
    import json

    formatted_chunks = []
    for chunk, _ in relevant_results:  # Unpack the tuple; the similarity score is unused here
        formatted_chunk = {
            "content": chunk['text'],
            "link": chunk['metadata'].get('source', ''),
            "title": chunk['metadata'].get('title', ''),
        }
        formatted_chunks.append(formatted_chunk)
    return json.dumps(formatted_chunks, indent=2)
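
# A minimal end-to-end sketch (editorial assumption: run as a script with spaCy
# and LangChain installed; the sample text and query are illustrative only).
if __name__ == "__main__":
    sample_contents = [{
        'page_content': "Retrieval-augmented generation (RAG) combines a retriever "
                        "with a language model so answers can cite source documents.",
        'title': "RAG overview",
        'link': "https://example.com/rag",
    }]
    nlp = get_nlp_model()
    chunks = recursive_split_documents(sample_contents, max_chunk_size=200, overlap=20)
    results = semantic_search("What does RAG combine?", chunks, nlp, similarity_threshold=0.3)
    for chunk, score in results:
        print(f"{score:.3f}  {chunk['metadata']['source']}  {chunk['text'][:60]}")
    # query_rag() is omitted here because it needs a live chat model plus the
    # local web_rag module for its prompt template.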