import os
import warnings
import configparser

import numpy as np
import openai
import pandas as pd
import tiktoken

# Mute the pandas PerformanceWarning
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

dir_path = os.path.abspath(os.getcwd())
config_dir = dir_path + "/src"

COMPLETIONS_MODEL = "gpt-3.5-turbo"
EMBEDDING_MODEL = "text-embedding-ada-002"

config = configparser.ConfigParser()
config.read(os.path.join(config_dir, 'gpt_local_config.cfg'))
# openai.api_key = config.get('token', 'GPT_TOKEN')
openai.api_key = os.environ.get("GPT_TOKEN")

SEPARATOR = "\n* "
ENCODING = "cl100k_base"  # tiktoken encoding for gpt-3.5-turbo
MAX_SECTION_LEN = 4000

encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

# The embedding functions were inspired by the example
# "Question answering using embeddings-based search"
# in the OpenAI Cookbook repo (https://github.com/openai/openai-cookbook),
# which hosts a great number of example applications
# using OpenAI APIs. The content is fast evolving, and the
# current example is far different than what I saw before.
# It is a great resource to learn from and get inspired by!


def get_embedding(text: str, model: str = EMBEDDING_MODEL) -> list[float]:
    """Embed a single piece of text with the OpenAI Embeddings API."""
    result = openai.Embedding.create(model=model, input=text)
    return result["data"][0]["embedding"]


def compute_doc_embeddings(
    df: pd.DataFrame
) -> dict[tuple[str, str], list[float]]:
    """
    Create an embedding for each row in the dataframe using the
    OpenAI Embeddings API.

    Return a dictionary that maps the index of each row to its
    embedding vector.
    """
    return {idx: get_embedding(r.content) for idx, r in df.iterrows()}


def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Read the document embeddings and their keys from a CSV.

    fname is the path to a CSV with exactly these named columns:
    "title", "heading", "0", "1", ... up to the length of the
    embedding vectors.
    """
    df = pd.read_csv(fname, header=0)
    max_dim = max(
        int(c) for c in df.columns if c != "title" and c != "heading"
    )
    return {
        (r.title, r.heading): [r[str(i)] for i in range(max_dim + 1)]
        for _, r in df.iterrows()
    }


def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Return the similarity between two vectors.

    Because OpenAI embeddings are normalized to length 1, the cosine
    similarity is the same as the dot product.
    """
    return np.dot(np.array(x), np.array(y))


def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[str, str], np.ndarray]
) -> list[tuple[float, tuple[str, str]]]:
    """
    Find the query embedding for the supplied query, and compare it
    against all of the pre-calculated document embeddings to find the
    most relevant sections.

    Return the list of document sections, sorted by relevance in
    descending order.
    """
    query_embedding = get_embedding(query)
    document_similarities = sorted(
        [
            (vector_similarity(query_embedding, doc_embedding), doc_index)
            for doc_index, doc_embedding in contexts.items()
        ],
        reverse=True
    )
    return document_similarities
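

# Illustrative sketch (not part of the original pipeline): the ranking above
# needs an API call for the query embedding, but its core is just a dot
# product over unit vectors. The section keys and vectors below are made up.
def _demo_similarity_ranking() -> None:
    contexts = {
        ("Home", "Intro"): np.array([1.0, 0.0]),
        ("Home", "Usage"): np.array([0.6, 0.8]),
    }
    query_embedding = [0.6, 0.8]
    # For unit-length vectors the dot product equals cosine similarity,
    # so ("Home", "Usage") ranks first with a similarity of 1.0.
    ranked = sorted(
        [
            (vector_similarity(query_embedding, vec), key)
            for key, vec in contexts.items()
        ],
        reverse=True
    )
    print(ranked)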


def construct_prompt(
    question: str,
    context_embeddings: dict,
    df: pd.DataFrame,
    show_section: bool = False
) -> tuple[str, list[str]]:
    """
    Fetch the most relevant document sections for the question and
    assemble them, together with the question, into a prompt for the
    completions model.
    """
    most_relevant_doc_secs = order_document_sections_by_query_similarity(
        question, context_embeddings
    )

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    for _, section_index in most_relevant_doc_secs:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index]
        chosen_sections_len += document_section.tokens.values[0] + \
            separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break
        chosen_sections.append(
            SEPARATOR +
            document_section.content.values[0].replace("\n", " ")
        )
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    if show_section:
        print(f"Selected {len(chosen_sections)} document sections:")
        print("\n".join(chosen_sections_indexes))

    chosen_sections_str = "".join(chosen_sections)
    header = "Answer the question strictly using the provided context," + \
        " and if the answer is not contained within the text below," + \
        " say 'Sorry, your inquiry is not in the Wiki. For further" + \
        " assistance, please contact caNanoLab-Support@ISB-CGC.org' " + \
        "\n\nContext:\n"
    prompt = header + chosen_sections_str + "\n\n Q: " + question + "\n A:"
    return prompt, chosen_sections_indexes
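

# Illustrative helper (an assumption, not in the original module): the token
# budgeting above relies on a pre-computed "tokens" column in df; one way to
# build it is with the same tiktoken encoding used for separator_len, e.g.
# df["tokens"] = df["content"].apply(_count_tokens).
def _count_tokens(text: str) -> int:
    return len(encoding.encode(text))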
".join(chosen_sections_indexes) return msg, chosen_sections_indexes