# caNanoWiki/src/embedding_qa.py
import os
import openai
import tiktoken
import warnings
import numpy as np
import pandas as pd
import configparser
# Silence all warnings (including the pandas PerformanceWarning
# this filter was originally added for)
warnings.filterwarnings("ignore", category=Warning)
dir_path = os.path.abspath(os.getcwd())
config_dir = os.path.join(dir_path, "src")
COMPLETIONS_MODEL = "gpt-3.5-turbo"
EMBEDDING_MODEL = "text-embedding-ada-002"
config = configparser.ConfigParser()
config.read(os.path.join(config_dir, 'gpt_local_config.cfg'))
# openai.api_key = config.get('token', 'GPT_TOKEN')
openai.api_key = os.environ.get("GPT_TOKEN")
SEPARATOR = "\n* "
ENCODING = "gpt2" # encoding for text-davinci-003
MAX_SECTION_LEN = 4000
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))
# The embedding functions below were inspired by the example
# "Question answering using embeddings-based search"
# in the OpenAI Cookbook repo (https://github.com/openai/openai-cookbook),
# which hosts a great number of example applications built on
# OpenAI APIs. The content evolves quickly, and the current
# version of that example is quite different from the one this
# code started from. It is a great resource to learn from and
# get inspired by!
def get_embedding(
text: str,
model: str = EMBEDDING_MODEL
) -> list[float]:
result = openai.Embedding.create(
model=model,
input=text
)
return result["data"][0]["embedding"]
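# A minimal usage sketch (the query string is made up);
# text-embedding-ada-002 returns 1536-dimensional vectors:
#
#   vec = get_embedding("How do I submit a sample to caNanoLab?")
#   assert len(vec) == 1536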
def compute_doc_embeddings(
df: pd.DataFrame
) -> dict[tuple[str, str], list[float]]:
"""
Create an embedding for each row in the dataframe
using the OpenAI Embeddings API.
    Return a dictionary that maps each row's index to its
    embedding vector.
"""
return {
idx: get_embedding(r.content) for idx, r in df.iterrows()
}
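# Sketch of the expected input, inferred from how the functions below
# index the embeddings: a dataframe with a "content" column, keyed by
# a (title, heading) MultiIndex (the example row is made up):
#
#   df = pd.DataFrame(
#       {"content": ["caNanoLab is a data sharing portal for ..."]},
#       index=pd.MultiIndex.from_tuples([("Home", "Overview")]),
#   )
#   doc_embeddings = compute_doc_embeddings(df)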
def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
"""
Read the document embeddings and their keys from a CSV.
fname is the path to a CSV with exactly these named columns:
"title", "heading", "0", "1", ...
up to the length of the embedding vectors.
"""
df = pd.read_csv(fname, header=0)
max_dim = max([
int(c) for c in df.columns if c != "title" and c != "heading"
])
return {
(r.title, r.heading): [
r[str(i)] for i in range(max_dim + 1)
] for _, r in df.iterrows()
}
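# Illustrative CSV layout (values are made up; the real file has one
# numbered column per embedding dimension):
#
#   title,heading,0,1,...,1535
#   Home,Overview,0.0123,-0.0456,...,0.0078
#
#   doc_embeddings = load_embeddings("doc_embeddings.csv")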
def vector_similarity(x: list[float], y: list[float]) -> float:
"""
Returns the similarity between two vectors.
Because OpenAI Embeddings are normalized to length 1,
the cosine similarity is the same as the dot product.
"""
return np.dot(np.array(x), np.array(y))
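# Quick sanity check with hand-picked unit vectors:
#
#   vector_similarity([1.0, 0.0], [1.0, 0.0])  # -> 1.0 (identical)
#   vector_similarity([1.0, 0.0], [0.0, 1.0])  # -> 0.0 (orthogonal)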
def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[str, str], np.ndarray]
) -> list[tuple[float, tuple[str, str]]]:
"""
Find the query embedding for the supplied query,
and compare it against all of the pre-calculated document embeddings
to find the most relevant sections.
Return the list of document sections,
sorted by relevance in descending order.
"""
query_embedding = get_embedding(query)
document_similarities = sorted([
(vector_similarity(
query_embedding,
doc_embedding
), doc_index) for doc_index, doc_embedding in contexts.items()
], reverse=True)
return document_similarities
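# Usage sketch (query text is made up):
#
#   ranked = order_document_sections_by_query_similarity(
#       "How do I register an account?", doc_embeddings
#   )
#   # ranked[0] is (similarity, (title, heading)) for the best match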
def construct_prompt(
    question: str,
    context_embeddings: dict,
    df: pd.DataFrame,
    show_section: bool = False
) -> tuple[str, list[str]]:
    """
    Fetch the document sections most relevant to the question and
    assemble them into a prompt, keeping the total length within
    MAX_SECTION_LEN tokens.
    Return the prompt and the list of chosen section indexes.
    """
most_relevant_doc_secs = order_document_sections_by_query_similarity(
question,
context_embeddings
)
chosen_sections = []
chosen_sections_len = 0
chosen_sections_indexes = []
for _, section_index in most_relevant_doc_secs:
# Add contexts until we run out of space.
document_section = df.loc[section_index]
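        # Assumes df provides precomputed "tokens" and "content"
        # values for every section (built during preprocessing).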
chosen_sections_len += document_section.tokens.values[0] + \
separator_len
if chosen_sections_len > MAX_SECTION_LEN:
break
chosen_sections.append(
SEPARATOR +
document_section.content.values[0].replace("\n", " ")
)
chosen_sections_indexes.append(str(section_index))
# Useful diagnostic information
if show_section:
print(f"Selected {len(chosen_sections)} document sections:")
print("\n".join(chosen_sections_indexes))
    chosen_sections_str = "".join(chosen_sections)
header = "Answer the question strictly using the provided context," + \
" and if the answer is not contained within the text below," + \
" say 'Sorry, your inquiry is not in the Wiki. For further" + \
" assistance, please contact caNanoLab-Support@ISB-CGC.org' " + \
"\n\nContext:\n"
prompt = header + chosen_sections_str + "\n\n Q: " + question + "\n A:"
return prompt, chosen_sections_indexes
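# Usage sketch, assuming df and doc_embeddings were built as above:
#
#   prompt, sources = construct_prompt(
#       "How do I register an account?", doc_embeddings, df
#   )
#   # prompt ends with "\n\n Q: <question>\n A:" for the chat model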
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], np.ndarray],
    show_prompt: bool = False,
    show_source: bool = False
) -> tuple[str, str]:
    """
    Answer the query with a chat completion grounded in the most
    relevant wiki sections. Return the answer text and the chosen
    section indexes joined with "<br>".
    """
    prompt, chosen_sections_indexes = construct_prompt(
        query,
        document_embeddings,
        df,
        show_section=show_source
    )
if show_prompt:
print(prompt)
    response = openai.ChatCompletion.create(
        model=COMPLETIONS_MODEL,
messages=[{
"role": "user",
"content": prompt
}],
temperature=0,
max_tokens=500
# top_p=1,
# frequency_penalty=0,
# presence_penalty=0
)
    msg = response["choices"][0]["message"]["content"]
chosen_sections_indexes = "<br>".join(chosen_sections_indexes)
return msg, chosen_sections_indexes
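# A minimal end-to-end sketch, guarded so it only runs when this file
# is executed directly. The CSV paths and the index/column layout are
# hypothetical; they depend on how the wiki content was preprocessed
# (the dataframe must provide "content" and "tokens" per section).
if __name__ == "__main__":
    wiki_df = pd.read_csv(
        "data/wiki_sections.csv", index_col=["title", "heading"]
    )
    doc_embeddings = load_embeddings("data/doc_embeddings.csv")
    answer, sources = answer_query_with_context(
        "How do I submit a sample to caNanoLab?", wiki_df, doc_embeddings
    )
    print(answer)
    print(sources)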