Kushwanth Chowday Kandala
committed
insert uploaded document to pinecone
app.py CHANGED
```diff
@@ -5,7 +5,7 @@ import numpy as np
 import pandas as pd
 from io import StringIO
 import PyPDF2
-from tqdm import tqdm
+from tqdm.auto import tqdm
 import math
 from transformers import pipeline
 # import json
@@ -150,26 +150,12 @@ def chat_actions():
 if "chat_history" not in st.session_state:
     st.session_state["chat_history"] = []
 
-
 st.chat_input("show me the contents of ML paper published on xxx with article no. xx?", on_submit=chat_actions, key="chat_input")
 
 for i in st.session_state["chat_history"]:
     with st.chat_message(name=i["role"]):
         st.write(i["content"])
 
-### Creating a Index(Pinecone Vector Database)
-# %%writefile .env
-# PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
-# PINECONE_ENV=os.getenv("PINECONE_ENV")
-# PINECONE_ENVIRONMENT=os.getenv("PINECONE_ENVIRONMENT")
-
-# import os
-# import pinecone
-
-# from pinecone import Index, GRPCIndex
-# pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
-# st.text(pinecone)
-
 def print_out(pages):
     for i in range(len(pages)):
         text = pages[i].extract_text().strip()
@@ -184,9 +170,11 @@ def combine_text(pages):
     p = math.pow(1024, 2)
     mbsize = round(len(bytesize) / p, 2)
     st.write(f"There are {len(concatenates_text)} characters in the pdf with {mbsize}MB size")
+    return concatenates_text
 
 def create_embeddings():
     # Get the uploaded file
+    inputtext = ""
     with st.sidebar:
         uploaded_files = st.session_state["uploaded_files"]
         for uploaded_file in uploaded_files:
@@ -194,14 +182,37 @@ def create_embeddings():
             reader = PyPDF2.PdfReader(uploaded_file)
             pages = reader.pages
             print_out(pages)
-            combine_text(pages)
+            inputtext = combine_text(pages)
+
+    # connect to pinecone index
+    pinecone = connect_pinecone()
+    index = get_pinecone_semantic_index(pinecone)
+
+    # The maximum metadata size per vector is 40KB
+    batch_size = 10000
+    for i in tqdm(range(0, len(inputtext), batch_size)):
+        # find end of batch
+        end = min(i + batch_size, len(inputtext))
+        # create ids batch
+        ids = [str(i) for i in range(i, end)]
+        # create metadata batch
+        metadata = [{"text": text} for text in inputtext[i:end]]
+        # create embeddings
+        xc = model.encode(inputtext[i:end])
+        # create records list for upsert
+        records = zip(ids, xc, metadata)
+        # upsert records
+        index.upsert(vectors=records)
+
+    with st.sidebar:
+        st.write("created vector embeddings!")
+        # check no of records in the index
+        st.write(f"{index.describe_index_stats()}")
 
-    st.write("created_embeddings")
 
     # Display the contents of the file
     # st.write(file_contents)
 
-
 with st.sidebar:
     st.markdown("""
     ***:red[Follow this steps]***
@@ -234,5 +245,4 @@ with st.sidebar:
 # pages = reader.pages
 # print_out(pages)
 # combine_text(pages)
-# promt_engineer(text)
-
+# promt_engineer(text)
```
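The new code path calls `connect_pinecone()` and `get_pinecone_semantic_index(pinecone)`, which live elsewhere in app.py and are not part of this diff. As rough orientation only, here is a minimal sketch of what that wiring could look like with the pre-v3 `pinecone-client` API (the same `pinecone.init(...)` style as the commented-out block this commit deletes); the index name and environment variable names are assumptions, not taken from the repository.

```python
# Sketch only: connect_pinecone()/get_pinecone_semantic_index() are not shown in this
# commit, so the index name and env var names below are assumptions.
import os
import pinecone  # pre-v3 pinecone-client, matching the pinecone.init(...) style above

INDEX_NAME = "semantic-search"  # hypothetical index name


def connect_pinecone():
    # Read credentials from the environment (e.g. loaded from a .env file)
    pinecone.init(
        api_key=os.getenv("PINECONE_API_KEY"),
        environment=os.getenv("PINECONE_ENV"),
    )
    return pinecone


def get_pinecone_semantic_index(pc):
    # Assumes an index with the embedding model's dimension already exists
    return pc.Index(INDEX_NAME)
```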
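In the committed loop, `ids` and `metadata` are built per character offset of the slice while `model.encode(inputtext[i:end])` is called on the whole slice at once. The sketch below shows one way to keep ids, embeddings, and metadata aligned one record per chunk. It assumes a SentenceTransformers-style `model` (the `model.encode` call in the diff) and the `index` handle from the previous sketch; the 1000-character chunk size and `batch_size=100` are arbitrary choices that stay comfortably under Pinecone's 40KB metadata limit per vector.

```python
from typing import List


def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
    # Split the concatenated PDF text into fixed-size character chunks
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]


def upsert_document(index, model, inputtext: str, batch_size: int = 100) -> None:
    chunks = chunk_text(inputtext)
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        # One id, one embedding, and one metadata dict per chunk keeps zip() aligned
        ids = [str(start + j) for j in range(len(batch))]
        vectors = model.encode(batch).tolist()           # shape: (len(batch), dim)
        metadata = [{"text": chunk} for chunk in batch]  # small snippets, far below 40KB
        index.upsert(vectors=list(zip(ids, vectors, metadata)))
```

Upserting one record per chunk also makes later retrieval more useful, since each match carries a readable text snippet in its metadata.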