Commit c575b59 (parent 8b6196b), committed by 8bitnand: "returning the query itsef, IDK why"
model.py CHANGED
@@ -37,12 +37,11 @@ class RAGModel:
 
         context = "_ " + "\n-".join(c for c in topk_items)
 
-        base_prompt = f"""
-        Give time for yourself to read the context and then answer the query.
+        base_prompt = f"""Give time for yourself to read the context and then answer the query.
         Do not return thinking process, just return the answer.
         If you do not find the answer, or if the query is offesnsive or in any other way harmfull just return "I'm not aware of it"
         Now use the following context items to answer the user query.
-        {context}.
+        context: {context}.
         user query : {query}
         """
 
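For reference, here is what that prompt assembly produces as a standalone function. This is a sketch, not repo code: the class plumbing is stripped, the prompt typos are corrected, and the sample arguments are made up. Note the join quirk the commit keeps: the first context chunk is prefixed with "_ " and each following chunk with "-".

# Hypothetical standalone version of create_prompt, for illustration only.
def create_prompt(query: str, topk_items: list[str]) -> str:
    context = "_ " + "\n-".join(c for c in topk_items)
    return f"""Give time for yourself to read the context and then answer the query.
Do not return thinking process, just return the answer.
If you do not find the answer, or if the query is offensive or in any other way harmful just return "I'm not aware of it"
Now use the following context items to answer the user query.
context: {context}.
user query : {query}
"""

print(create_prompt("what is computer vision", ["chunk one", "chunk two"]))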
@@ -56,7 +55,6 @@ class RAGModel:
     def answer_query(self, query: str, topk_items: list[str]):
 
         prompt = self.create_prompt(query, topk_items)
-        print(prompt)
         input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
         output = self.model.generate(**input_ids, max_new_tokens=512)
         text = self.tokenizer.decode(output[0])
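The decode on the last line is likely why the commit message says the model is "returning the query itsef": for decoder-only models, generate returns the prompt tokens followed by the newly generated tokens, so decoding output[0] echoes the entire prompt, user query included. A sketch of the usual fix, slicing the prompt off before decoding (same attribute names as answer_query above; not part of this commit):

# Decode only the newly generated tokens, skipping the echoed prompt.
input_ids = self.tokenizer(prompt, return_tensors="pt").to(self.device)
output = self.model.generate(**input_ids, max_new_tokens=512)
prompt_len = input_ids["input_ids"].shape[1]
text = self.tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)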
@@ -67,12 +65,13 @@ class RAGModel:
 if __name__ == "__main__":
 
     configs = load_configs(config_file="rag.configs.yml")
-    query = "what is
-
-
-
-
-
-
-
-
+    query = "what is computer vision"
+    g = GoogleSearch(query)
+    data = g.all_page_data
+    d = Document(data, 512)
+    doc_chunks = d.doc()
+    s = SemanticSearch(doc_chunks, "all-mpnet-base-v2", "mps")
+    topk, u = s.semantic_search(query=query, k=32)
+    r = RAGModel(configs)
+    output = r.answer_query(query=query, topk_items=topk)
+    print(output)
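Document and load_configs are defined outside this diff, so their contracts are guesses. As a rough sketch of a chunker with the shape Document(data, 512) implies, assuming all_page_data is a list of page strings and 512 is a character budget per chunk:

# Hypothetical stand-in for the repo's Document class; the real
# implementation lives elsewhere and may differ.
class Document:
    def __init__(self, data: list[str], chunk_size: int) -> None:
        self.data = data
        self.chunk_size = chunk_size

    def doc(self) -> list[str]:
        # Split each page's text into fixed-size character windows.
        chunks: list[str] = []
        for page in self.data:
            for i in range(0, len(page), self.chunk_size):
                chunks.append(page[i : i + self.chunk_size])
        return chunks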
search.py CHANGED
@@ -34,12 +34,11 @@ class GoogleSearch:
             for link in sublist
             if len(link) > 0
         ]
-
+
         return links
 
     def read_url_page(self, url: str) -> str:
 
-        print(url)
         response = requests.get(url, headers=self.headers)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, "html.parser")
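The hunk ends right after the soup is built; the rest of read_url_page is outside the diff. A self-contained sketch of the same fetch-and-parse pattern, with get_text as an assumed final step and a timeout added for safety:

import requests
from bs4 import BeautifulSoup

def read_url_page(url: str, headers: dict | None = None) -> str:
    # Fetch the page, raise on HTTP errors, then strip the markup to text.
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text(separator=" ", strip=True)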
@@ -136,7 +135,7 @@ class SemanticSearch:
         )
 
     def semantic_search(self, query: str, k: int = 10):
-        print("
+        print("Searching Top k in document...")
         query_embeding = self.get_embeding(query)
         doc_embeding = self.get_embeding(self.doc_chunks)
         scores = util.dot_score(a=query_embeding, b=doc_embeding)[0]
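The scoring line is where this hunk stops; the top-k selection and return fall outside the diff. A minimal sketch of the full dot-score retrieval pattern the method starts here, using sentence-transformers and torch; the (chunks, scores) return shape is an assumption based on topk, u = s.semantic_search(...) in model.py:

import torch
from sentence_transformers import SentenceTransformer, util

def semantic_search(query: str, doc_chunks: list[str], k: int = 10) -> tuple[list[str], torch.Tensor]:
    st = SentenceTransformer("all-mpnet-base-v2")
    # Embed the query and every chunk, then score by dot product.
    query_embedding = st.encode(query, convert_to_tensor=True)
    doc_embedding = st.encode(doc_chunks, convert_to_tensor=True)
    scores = util.dot_score(a=query_embedding, b=doc_embedding)[0]
    top = torch.topk(scores, k=min(k, len(doc_chunks)))
    # Return the top-k chunk texts alongside their scores.
    return [doc_chunks[int(i)] for i in top.indices], top.values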
@@ -147,20 +146,3 @@ class SemanticSearch:
     def get_embeding(self, text: Union[list[str], str]):
         en = self.st.encode(text)
         return en
-
-
-if __name__ == "__main__":
-
-    query = "what is LLM"
-    g = GoogleSearch(query)
-    data = g.all_page_data
-    # d = Document(data, 333)
-    # doc_chunks = d.doc()
-    # s = SemanticSearch(doc_chunks, "all-mpnet-base-v2", "mps")
-    # topk, u = s.semantic_search(query, k=64)
-    # print(len(topk))
-    # print(topk, u)
-
-    # g = GoogleSearch("what is LLM")
-    # d = Document(g.all_page_data)
-    # print(len(d.doc()[0]))