import os
import openai
import tiktoken
import warnings
import numpy as np
import pandas as pd
import configparser

# Silence all warnings (including pandas' PerformanceWarning)
warnings.filterwarnings("ignore", category=Warning)
dir_path = os.path.abspath(os.getcwd())
config_dir = os.path.join(dir_path, "src")
COMPLETIONS_MODEL = "gpt-3.5-turbo"
EMBEDDING_MODEL = "text-embedding-ada-002"
config = configparser.ConfigParser()
config.read(os.path.join(config_dir, 'gpt_local_config.cfg'))
# openai.api_key = config.get('token', 'GPT_TOKEN')
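# Read the API key from the GPT_TOKEN environment variable instead
# (the config-file lookup above is kept for reference).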
openai.api_key = os.environ.get("GPT_TOKEN")
SEPARATOR = "\n* "
ENCODING = "gpt2"  # GPT-2 tokenizer; used here only to measure the separator's token length
MAX_SECTION_LEN = 4000
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))


# The embedding functions below were inspired by the example
# "Question answering using embeddings-based search"
# in the OpenAI Cookbook repo (https://github.com/openai/openai-cookbook),
# which hosts a great number of example applications
# built on the OpenAI APIs. The content evolves quickly, and the current
# version of that example is quite different from the one I first saw.
# It is a great resource to learn from and get inspired by!
def get_embedding(
text: str,
model: str = EMBEDDING_MODEL
) -> list[float]:
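    """
    Call the OpenAI Embeddings API and return the embedding vector
    for the supplied text.
    """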
result = openai.Embedding.create(
model=model,
input=text
)
return result["data"][0]["embedding"]


def compute_doc_embeddings(
df: pd.DataFrame
) -> dict[tuple[str, str], list[float]]:
"""
Create an embedding for each row in the dataframe
using the OpenAI Embeddings API.
    Return a dictionary mapping each row's index to its
    embedding vector.
"""
return {
idx: get_embedding(r.content) for idx, r in df.iterrows()
}


def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
"""
Read the document embeddings and their keys from a CSV.
fname is the path to a CSV with exactly these named columns:
"title", "heading", "0", "1", ...
up to the length of the embedding vectors.
"""
df = pd.read_csv(fname, header=0)
max_dim = max([
int(c) for c in df.columns if c != "title" and c != "heading"
])
return {
(r.title, r.heading): [
r[str(i)] for i in range(max_dim + 1)
] for _, r in df.iterrows()
}


def vector_similarity(x: list[float], y: list[float]) -> float:
"""
Returns the similarity between two vectors.
Because OpenAI Embeddings are normalized to length 1,
the cosine similarity is the same as the dot product.
"""
return np.dot(np.array(x), np.array(y))


def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[str, str], np.ndarray]
) -> list[tuple[float, tuple[str, str]]]:
"""
Find the query embedding for the supplied query,
and compare it against all of the pre-calculated document embeddings
to find the most relevant sections.
Return the list of document sections,
sorted by relevance in descending order.
"""
query_embedding = get_embedding(query)
document_similarities = sorted([
(vector_similarity(
query_embedding,
doc_embedding
), doc_index) for doc_index, doc_embedding in contexts.items()
], reverse=True)
return document_similarities


def construct_prompt(
    question: str,
    context_embeddings: dict,
    df: pd.DataFrame,
    show_section: bool = False
) -> tuple[str, list[str]]:
    """
    Fetch the document sections most relevant to the question and join
    them into a prompt, keeping within the MAX_SECTION_LEN token budget.
    Return the prompt together with the indexes of the chosen sections.
    """
most_relevant_doc_secs = order_document_sections_by_query_similarity(
question,
context_embeddings
)
chosen_sections = []
chosen_sections_len = 0
chosen_sections_indexes = []
for _, section_index in most_relevant_doc_secs:
# Add contexts until we run out of space.
document_section = df.loc[section_index]
chosen_sections_len += document_section.tokens.values[0] + \
separator_len
if chosen_sections_len > MAX_SECTION_LEN:
break
chosen_sections.append(
SEPARATOR +
document_section.content.values[0].replace("\n", " ")
)
chosen_sections_indexes.append(str(section_index))
# Useful diagnostic information
if show_section:
print(f"Selected {len(chosen_sections)} document sections:")
print("\n".join(chosen_sections_indexes))
    chosen_sections_str = "".join(chosen_sections)
header = "Answer the question strictly using the provided context," + \
" and if the answer is not contained within the text below," + \
" say 'Sorry, your inquiry is not in the Wiki. For further" + \
" assistance, please contact caNanoLab-Support@ISB-CGC.org' " + \
"\n\nContext:\n"
prompt = header + chosen_sections_str + "\n\n Q: " + question + "\n A:"
return prompt, chosen_sections_indexes


def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], np.ndarray],
    show_prompt: bool = False,
    show_source: bool = False
) -> tuple[str, str]:
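    """
    Build a context-restricted prompt for the query and answer it with
    the ChatCompletion endpoint. Return the answer text together with
    the indexes of the document sections used, joined with "<br>".
    """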
prompt, chosen_sections_indexes = construct_prompt(
query,
document_embeddings,
df
)
if show_prompt:
print(prompt)
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[{
"role": "user",
"content": prompt
}],
temperature=0,
max_tokens=500
# top_p=1,
# frequency_penalty=0,
# presence_penalty=0
)
msg = response.choices[0]['message']['content']
chosen_sections_indexes = "<br>".join(chosen_sections_indexes)
return msg, chosen_sections_indexes
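

# Minimal usage sketch (illustrative only): the CSV file names and the
# question below are hypothetical, and the DataFrame is assumed to be
# indexed by (title, heading) with "content" and "tokens" columns,
# mirroring the structure the functions above expect.
if __name__ == "__main__":
    sections_df = pd.read_csv(
        "wiki_sections.csv",            # hypothetical sections export
        index_col=["title", "heading"]
    )
    doc_embeddings = load_embeddings("wiki_embeddings.csv")  # hypothetical path
    answer, sources = answer_query_with_context(
        "How do I submit a sample to caNanoLab?",  # example question
        sections_df,
        doc_embeddings,
        show_prompt=True
    )
    print(answer)
    print(sources)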