Spaces:
Sleeping
Sleeping
Functioning pages added
Browse files- app.py +43 -38
- pages/01_data_collection.py +97 -0
- pages/02_data_organization.py +75 -0
- pages/03_model_selection.py +50 -0
- pages/04_encoding_storage.py +122 -0
- pages/05_testing_qa.py +111 -0
app.py
CHANGED
|
@@ -1,39 +1,44 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
#
|
| 10 |
-
if '
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
#
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
st.
|
| 19 |
-
st.
|
| 20 |
-
|
| 21 |
-
st.
|
| 22 |
-
st.
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
st.set_page_config(page_title='Knowledge Navigator', layout='wide')
|
| 5 |
+
|
| 6 |
+
def main():
    """Render the landing page: navigation buttons and session-state status."""
    st.title('Knowledge Navigator')

    # (button label, target page) pairs, rendered top to bottom in this order.
    navigation = (
        ('Go to Data Collection', 'pages/01_data_collection.py'),
        ('Go to Data Organization with Data', 'pages/02_data_organization.py'),
        ('Proceed to Model Selection', 'pages/03_model_selection.py'),
        ('Proceed to encoding vector storage', 'pages/04_encoding_storage.py'),
        ('Proceed to Q&A Testing', 'pages/05_testing_qa.py'),
    )
    for label, target_page in navigation:
        if st.button(label):
            st.switch_page(target_page)

    # Report whether the scanned-URL dataframe has been stored by page 01.
    has_data = 'data' in st.session_state
    if has_data:
        st.write("Data Available")
        st.write("Data (URL dataframe) is defined.")
    if not has_data:
        st.write("Data (URL dataframe) is not defined.")

    # Report whether documents have been fetched by page 02.
    docs_status = "is defined." if 'docs' in st.session_state else "is not defined."
    st.write("Docs (fetched and stored data collection) " + docs_status)
|
| 39 |
+
|
| 40 |
+
# Render the navigation menu
|
| 41 |
+
# menu()
|
| 42 |
+
|
| 43 |
+
if __name__ == '__main__':
|
| 44 |
+
main()
|
pages/01_data_collection.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import requests
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
from urllib.parse import urljoin, urlparse
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
|
| 9 |
+
def find_linked_urls_and_title(url, timeout=10):
    """Fetch *url* and return the hrefs it links to plus its page title.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` can block the Streamlit script indefinitely
            when a host never answers.

    Returns:
        A ``(urls, page_title)`` tuple. ``urls`` is the set of raw ``href``
        values found on ``<a>`` tags (not yet made absolute). ``page_title``
        is the text of the ``<title>`` tag, or ``'No Title Found'``.
        On a non-200 response or any exception the result is
        ``(set(), 'No Title Found')`` and a message is written to the page.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            st.write(f"Failed to retrieve {url}")
            return set(), 'No Title Found'

        soup = BeautifulSoup(response.text, 'html.parser')
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        urls = {href for href in hrefs if href is not None}
        title_tag = soup.find('title')
        page_title = title_tag.text if title_tag else 'No Title Found'
        return urls, page_title
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort the whole scan.
        st.write(f"An error occurred with {url}: {e}")
        return set(), 'No Title Found'
|
| 25 |
+
|
| 26 |
+
def convert_to_absolute_urls(base_url, links):
    """Resolve every link against *base_url* and return the set of results.

    ``urljoin`` already returns absolute URLs unchanged, so no prefix test is
    needed. A ``startswith('http')`` check would wrongly treat relative paths
    such as ``httpdocs/index.html`` as absolute and leave them unresolved.

    Args:
        base_url: URL of the page the links were found on.
        links: Iterable of raw ``href`` strings (relative or absolute).

    Returns:
        A set of URLs resolved against ``base_url``.
    """
    return {urljoin(base_url, link) for link in links}
|
| 28 |
+
|
| 29 |
+
def categorize_links(base_url, links):
    """Split *links* into those on the same host as *base_url* and the rest.

    Args:
        base_url: URL whose network location defines "internal".
        links: Iterable of URLs to classify.

    Returns:
        ``(internal_links, external_links)`` as two sets. A link is internal
        when its ``netloc`` equals the ``netloc`` of ``base_url``.
    """
    home_netloc = urlparse(base_url).netloc
    candidates = set(links)
    same_host = {candidate for candidate in candidates if urlparse(candidate).netloc == home_netloc}
    return same_host, candidates - same_host
|
| 37 |
+
|
| 38 |
+
def display_editable_table(df):
    """Show *df* in an editable grid and return the edited dataframe.

    ``num_rows="dynamic"`` lets the user add and delete rows. The widget is
    registered under the session-state key ``data_editor_key``.
    """
    return st.data_editor(data=df, key="data_editor_key", num_rows="dynamic")
|
| 41 |
+
|
| 42 |
+
def prepare_dataframe(df):
    """Ensure *df* has an ``Ignore`` column and return the same dataframe.

    An existing ``Ignore`` column is left untouched. Otherwise the column is
    added in place with every value set to ``False``.
    """
    if "Ignore" in df.columns:
        return df
    df["Ignore"] = False
    return df
|
| 46 |
+
|
| 47 |
+
def store_data(df):
    """Publish *df* under the session-state key ``data``."""
    st.session_state.data = df
|
| 49 |
+
|
| 50 |
+
def main():
    """Render the data-collection page.

    The user enters one URL per line; "Scan URLs" collects the links found on
    each page into ``st.session_state['scanned_urls']``. The table is then
    shown in an editor so rows can be added, removed or marked ``Ignore``.
    The current table is always mirrored to ``st.session_state['data']``,
    which is the key this page's "Proceed" target checks for.
    """
    st.title("Data Source Configuration")

    columns = ['URL', 'Type', 'Page Name', 'Scanned DateTime', 'Ignore']

    # Initialize 'scanned_urls' with all columns, including 'Ignore'.
    if 'scanned_urls' not in st.session_state:
        st.session_state['scanned_urls'] = pd.DataFrame(columns=columns)

    st.subheader("Scan Websites for URLs")
    url_input = st.text_area("Enter URLs to scan, separated by new lines:", "https://fubarlabs.org")
    url_list = [line.strip() for line in url_input.splitlines() if line.strip()]
    scan_button_clicked = st.button("Scan URLs")

    if scan_button_clicked:
        for source_url in url_list:
            unique_urls, page_title = find_linked_urls_and_title(source_url)
            scan_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            absolute_urls = convert_to_absolute_urls(source_url, unique_urls)
            internal_links, external_links = categorize_links(source_url, absolute_urls)

            # 'Page Name' is the title of the page that was scanned, not of
            # the linked page.
            rows = [(link, 'Internal', page_title, scan_datetime, False) for link in internal_links]
            rows += [(link, 'External', page_title, scan_datetime, False) for link in external_links]
            new_entries = pd.DataFrame(rows, columns=columns)

            # De-duplicate on URL only. Comparing whole rows would also
            # compare the timestamp, so re-scanning a site would duplicate
            # every entry. Keeping the first occurrence preserves rows (and
            # Ignore flags) from earlier scans.
            st.session_state['scanned_urls'] = (
                pd.concat([st.session_state['scanned_urls'], new_entries])
                .drop_duplicates(subset=['URL'])
                .reset_index(drop=True)
            )
        store_data(st.session_state['scanned_urls'])

    if not st.session_state['scanned_urls'].empty:
        # Prepare the dataframe; guarantees the 'Ignore' column exists.
        prepared_df = prepare_dataframe(st.session_state['scanned_urls'])

        # Display the editable table with an "Ignore" column.
        edited_df = display_editable_table(prepared_df)

        if edited_df is not None:
            st.session_state['scanned_urls'] = edited_df
            # Publish the edits too; otherwise Ignore ticks and added or
            # removed rows never reach the 'data' key.
            store_data(edited_df)

        # Show the raw edit record kept by the data editor widget.
        if "data_editor_key" in st.session_state:
            edits = st.session_state["data_editor_key"]
            st.write("Edits made to the table:")
            st.write(edits)

    if st.button('Proceed to Data Organization'):
        st.switch_page('pages/02_data_organization.py')
|
| 95 |
+
|
| 96 |
+
if __name__ == "__main__":
|
| 97 |
+
main()
|
pages/02_data_organization.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 02_data_organization.py
|
| 2 |
+
import streamlit as st
|
| 3 |
+
from langchain_community.document_loaders import AsyncHtmlLoader
|
| 4 |
+
from langchain.schema import Document
|
| 5 |
+
import json
|
| 6 |
+
from typing import Iterable
|
| 7 |
+
import asyncio
|
| 8 |
+
from urllib.parse import urlparse
|
| 9 |
+
|
| 10 |
+
# Async fetch function
|
| 11 |
+
async def fetch_documents(urls):
    """Load the pages at *urls* with ``AsyncHtmlLoader`` and return the documents."""
    return await AsyncHtmlLoader(urls).aload()
|
| 15 |
+
|
| 16 |
+
def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
    """Write each document in *array* to *file_path* as one JSON object per line.

    A document exposing ``to_dict()`` is serialized through it; any other
    object is serialized from its ``__dict__``.
    """
    def _payload(doc):
        return doc.to_dict() if hasattr(doc, 'to_dict') else doc.__dict__

    with open(file_path, 'w') as sink:
        for doc in array:
            sink.write(json.dumps(_payload(doc)) + '\n')
|
| 23 |
+
|
| 24 |
+
def load_docs_from_jsonl(file_path) -> Iterable[Document]:
    """Read a JSONL file and return a list with one ``Document`` per line.

    Each line is parsed as JSON and its keys are passed to ``Document`` as
    keyword arguments.
    """
    with open(file_path, 'r') as source:
        return [Document(**json.loads(raw_line)) for raw_line in source]
|
| 32 |
+
|
| 33 |
+
def is_valid_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    A ``ValueError`` raised by ``urlparse`` (for example an unbalanced IPv6
    bracket) is reported as an invalid URL.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    return bool(parts.scheme) and bool(parts.netloc)
|
| 39 |
+
|
| 40 |
+
def fetch_clean_organize_page():
    """Render the page that fetches the selected URLs and exports them as JSONL.

    Reads the URL dataframe from ``st.session_state['data']``, fetches every
    URL that is not ignored and is syntactically valid, stores the result in
    ``st.session_state['docs']`` and offers to save and download the
    documents as ``documents.jsonl``.
    """
    st.title("Fetch, Clean, and Organize Documents")

    # Check if 'data' exists in the session state.
    if 'data' not in st.session_state:
        st.warning("No data found. Please go back to the previous page and scan URLs first.")
        return

    data = st.session_state['data']
    st.write("URLs to fetch and clean:")
    st.write(data)

    # Keep rows that are not explicitly ignored and whose URL is a valid
    # string. ``eq(True)`` treats a missing flag as "not ignored", where
    # ``== False`` would silently drop such rows. The ``isinstance`` guard
    # keeps non-string cells away from ``is_valid_url``.
    not_ignored = ~data['Ignore'].eq(True)
    has_valid_url = data['URL'].apply(lambda cell: isinstance(cell, str) and is_valid_url(cell))
    valid_urls = data[not_ignored & has_valid_url]['URL'].tolist()

    if st.button("Fetch Documents"):
        if valid_urls:
            docs = asyncio.run(fetch_documents(valid_urls))
            st.session_state['docs'] = docs
            st.write(f"Fetched {len(st.session_state['docs'])} documents.")
        else:
            st.warning("No valid URLs to fetch.")

    if 'docs' in st.session_state:
        if st.button("Save Documents as JSON"):
            save_docs_to_jsonl(st.session_state['docs'], "documents.jsonl")
            st.success("Documents saved as JSON.")

            # Offer the download only right after a successful save, so the
            # file is guaranteed to exist. Read it into memory so no open
            # file handle is handed to the widget.
            with open("documents.jsonl", "rb") as file:
                jsonl_bytes = file.read()
            st.download_button(
                label="Download JSON",
                data=jsonl_bytes,
                file_name="documents.jsonl",
                mime="application/octet-stream",
            )
|
| 73 |
+
|
| 74 |
+
# Assuming this function is called in your app
|
| 75 |
+
fetch_clean_organize_page()
|
pages/03_model_selection.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
|
| 3 |
+
st.title('Model Selection')

# Introduction
st.write("Select the embedding model and the large language model (LLM) for processing.")

# Embedding Model Selection
embedding_models = ["thenlper/gte-small", "sentence-transformers/all-MiniLM-L6-v2", "other"]
selected_embedding_model = st.selectbox("Select Embedding Model", options=embedding_models)

# LLM Model Selection
llm_models = ["mistralai/Mistral-7B-Instruct-v0.2", "gpt-3.5-turbo", "other"]
selected_llm_model = st.selectbox("Select LLM Model", options=llm_models)

# Display selections (for demonstration)
st.write("Selected Embedding Model:", selected_embedding_model)
st.write("Selected LLM Model:", selected_llm_model)

# Configuration options for the selected models
st.header("Model Configuration")

# Embedding Model Configuration (example)
if selected_embedding_model == "thenlper/gte-small":
    # Placeholder for model-specific configuration options
    st.write("No additional configuration required for this model.")
else:
    # Configuration for other models
    st.write("Configuration options for other models will appear here.")

# LLM Model Configuration (example)
# The settings live in a dict that exists on every code path. Only the
# Mistral branch has sliders, so saving with another model must not depend
# on names that were never assigned.
llm_model_config = {}
if selected_llm_model == "mistralai/Mistral-7B-Instruct-v0.2":
    llm_model_config["max_tokens"] = st.slider("Max Tokens", min_value=100, max_value=1000, value=250)
    llm_model_config["temperature"] = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
else:
    # Configuration for other models
    st.write("Configuration options for other models will appear here.")

# Save model selections and configurations
if st.button("Save Model Configuration"):
    st.session_state['selected_embedding_model'] = selected_embedding_model
    st.session_state['selected_llm_model'] = selected_llm_model

    # Empty when the selected LLM exposes no options on this page.
    st.session_state['llm_model_config'] = llm_model_config

    st.success("Model configurations saved.")

if st.button('Proceed to encoding vector storage'):
    st.switch_page('pages/04_encoding_storage.py')
|
pages/04_encoding_storage.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from langchain_community.vectorstores import FAISS
|
| 3 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 4 |
+
from langchain_community.llms import HuggingFaceEndpoint
|
| 5 |
+
from langchain.schema import Document
|
| 6 |
+
import json
|
| 7 |
+
from typing import Iterable
|
| 8 |
+
import os
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
import zipfile
|
| 11 |
+
import tempfile
|
| 12 |
+
|
| 13 |
+
def save_docs_to_jsonl(array:Iterable[Document], file_path:str)->None:
    """Write every document's ``json()`` output to *file_path*, one per line."""
    with open(file_path, 'w') as sink:
        sink.writelines(doc.json() + '\n' for doc in array)
|
| 17 |
+
|
| 18 |
+
def load_docs_from_jsonl(file)->Iterable[Document]:
    """Build a list of ``Document`` objects from an uploaded JSONL file.

    Args:
        file: Iterable yielding the file's lines as UTF-8 encoded ``bytes``
            (as the Streamlit uploader used by this page does).

    Returns:
        A list with one ``Document`` per non-blank line, created by passing
        the parsed JSON object's keys as keyword arguments.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    array = []
    for line in file:
        text = line.decode('utf-8').strip()
        if not text:
            # Tolerate blank or trailing empty lines in hand-edited files
            # instead of rejecting the whole upload.
            continue
        array.append(Document(**json.loads(text)))
    return array
|
| 25 |
+
|
| 26 |
+
import io  # imported here so this page section is self-contained

st.title('Encoding and Storage')

# Create output directory
start_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
OUTPUT_DIR = "./out"

# Check if the directory exists, and if not, create it
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
    st.write(f"Directory '{OUTPUT_DIR}' was created.")
else:
    st.write(f"Directory '{OUTPUT_DIR}' already exists.")

# Allow the user to upload the JSONL file if the documents are missing
if 'docs' not in st.session_state:
    st.write("Document collection not found in session state.")
    uploaded_file = st.file_uploader("Upload JSONL file", type=["jsonl"])
    if uploaded_file is not None:
        try:
            docs = load_docs_from_jsonl(uploaded_file)
            st.session_state['docs'] = docs
            st.write(f"Loaded {len(docs)} documents from the uploaded file.")
        except Exception as e:
            st.error(f"Error loading JSONL file: {str(e)}")
else:
    docs = st.session_state['docs']
    st.write(f"Loaded {len(docs)} documents from the session state.")

# Show the embedding model
EMBEDDING_MODEL_NAME = st.session_state.get('selected_embedding_model', "thenlper/gte-small")
st.write(f"Selected Embedding Model: {EMBEDDING_MODEL_NAME}")

# Allow the user to select the device (GPU or CPU)
device_form = st.form(key='device_form')
device = device_form.radio("Select Device", ("CUDA", "CPU"))
submit_device = device_form.form_submit_button(label='Submit Device')

if submit_device:
    # Read the documents from session state rather than a module-level name
    # that only exists on some code paths. Check them first so the embedding
    # model is not loaded when there is nothing to encode.
    docs_to_encode = st.session_state.get('docs')
    if docs_to_encode:
        # Set up the embedding model
        embedding_model = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL_NAME,
            multi_process=True,
            model_kwargs={"device": device.lower()},
            encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
        )

        # Show the configuration
        st.write("Embedding Model Configuration:")
        st.write(embedding_model)

        # Start the encoding. The FAISS call is a single blocking step, so
        # the bar goes from empty to full once it has returned.
        progress_bar = st.progress(0)
        collection_vectorstore = FAISS.from_documents(docs_to_encode, embedding=embedding_model)
        st.session_state['collection_vectorstore'] = collection_vectorstore
        progress_bar.progress(1.0)

        st.write("Encoding completed.")
    else:
        st.write("No documents found in the session state.")

# Allow saving and downloading the configuration
if st.button("Save and Download Configuration"):
    if 'collection_vectorstore' in st.session_state:
        collection_vectorstore = st.session_state['collection_vectorstore']
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        zip_filename = f"docs_vectors_{timestamp}.zip"

        # Build the archive in memory so no zip file is left behind in the
        # working directory after each download.
        zip_buffer = io.BytesIO()
        with tempfile.TemporaryDirectory() as temp_dir:
            collection_vectorstore.save_local(os.path.join(temp_dir, "docs_vectors"))

            with zipfile.ZipFile(zip_buffer, "w") as zip_file:
                for root, _, files in os.walk(temp_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        # Store paths relative to temp_dir so the archive
                        # keeps the top-level "docs_vectors" folder.
                        zip_file.write(file_path, os.path.relpath(file_path, temp_dir))

        st.download_button(
            label="Download Configuration",
            data=zip_buffer.getvalue(),
            file_name=zip_filename,
            mime="application/zip",
        )

        st.success("Configuration saved and downloaded.")
    else:
        st.warning("No vector store found. Please make sure the encoding is completed.")

if st.button('Proceed to Q&A Testing'):
    st.switch_page('pages/05_testing_qa.py')
|
pages/05_testing_qa.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
from langchain_community.vectorstores import FAISS
|
| 3 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 4 |
+
from langchain_community.llms import HuggingFaceEndpoint
|
| 5 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 6 |
+
from langchain_core.runnables import RunnablePassthrough
|
| 7 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 8 |
+
import tempfile
|
| 9 |
+
import zipfile
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
st.title('Testing and QA')

# Dynamically load the selected models from the session state
EMBEDDING_MODEL_NAME = st.session_state.get('selected_embedding_model', "thenlper/gte-small")
LLM_MODEL_NAME = st.session_state.get('selected_llm_model', "mistralai/Mistral-7B-Instruct-v0.2")

# Build the embedding model once per session, and rebuild it when the
# selected model name changes so a stale model is never used to load a
# vector store that was encoded with a different one.
if (
    'embedding_model' not in st.session_state
    or st.session_state.get('embedding_model_name') != EMBEDDING_MODEL_NAME
):
    st.session_state['embedding_model'] = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )
    st.session_state['embedding_model_name'] = EMBEDDING_MODEL_NAME
    st.info("embedding_model has been initialized.")  # Debug message for initialization
else:
    st.info("embedding_model was already initialized.")  # Debug message if already initialized

# embedding_model is guaranteed to exist at this point
embedding_model = st.session_state['embedding_model']
st.write("Accessing embedding_model...")  # Debug message for accessing

# Form for LLM settings, allowing dynamic model selection
with st.form("llm_settings_form"):
    st.subheader("LLM Settings")
    repo_id = st.text_input("Repo ID", value=LLM_MODEL_NAME, key="repo_id")
    max_new_tokens = st.number_input("Max New Tokens", value=250, key="max_new_tokens")
    top_k = st.number_input("Top K", value=3, key="top_k")
    top_p = st.number_input("Top P", value=0.95, key="top_p")
    typical_p = st.number_input("Typical P", value=0.95, key="typical_p")
    temperature = st.number_input("Temperature", value=0.01, key="temperature")
    repetition_penalty = st.number_input("Repetition Penalty", value=1.035, key="repetition_penalty")

    submitted = st.form_submit_button("Update LLM Settings")

if submitted:
    st.session_state['llm'] = HuggingFaceEndpoint(
        repo_id=repo_id,
        max_new_tokens=max_new_tokens,
        top_k=top_k,
        top_p=top_p,
        typical_p=typical_p,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
    )
    st.success("LLM settings updated.")

# Vector store upload and setup
if 'collection_vectorstore' not in st.session_state:
    uploaded_file = st.file_uploader("Upload Vector Store ZIP", type=["zip"])
    if uploaded_file is not None:
        with tempfile.TemporaryDirectory() as temp_dir:
            with zipfile.ZipFile(uploaded_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)
            docs_vectors_path = os.path.join(temp_dir, "docs_vectors")
            # SECURITY NOTE(review): allow_dangerous_deserialization=True
            # unpickles data from a user-supplied archive. Only upload vector
            # stores from trusted sources; confirm this is acceptable for
            # the deployment.
            st.session_state['collection_vectorstore'] = FAISS.load_local(
                docs_vectors_path,
                embeddings=embedding_model,
                allow_dangerous_deserialization=True,
            )
        st.success("Vector store uploaded and loaded successfully.")

# Check if LLM and vector store are ready
if 'llm' in st.session_state and 'collection_vectorstore' in st.session_state:
    # Derive the retriever from whichever vector store is present. The store
    # may come from the upload above or from session state set elsewhere, so
    # the retriever cannot be created only on the upload path.
    if 'retriever' not in st.session_state:
        st.info("Retriever has been created.")  # Debug message to confirm the retriever's creation
    st.session_state['retriever'] = st.session_state['collection_vectorstore'].as_retriever()

    # Ensure there's a default prompt template
    if 'prompt_template' not in st.session_state:
        st.session_state['prompt_template'] = "You are a knowledgeable assistant answering the following question based on the provided documents: {context} Question: {question}"

    # Display the current template for editing
    current_template = st.text_area("Edit Prompt Template", value=st.session_state['prompt_template'], key="current_prompt_template")

    # Persist the edited text on request. The button acts on the text area
    # above; a text area created only inside the click handler would always
    # be empty on the run in which the click is processed.
    if st.button("Update Prompt Template"):
        st.session_state['prompt_template'] = current_template
        st.success("Prompt template updated.")

    # Question input and processing
    question = st.text_input("Enter your question", key="question_input")

    if question:
        llm = st.session_state['llm']
        prompt = ChatPromptTemplate.from_template(current_template)
        retriever = st.session_state['retriever']
        chain = (
            {"context": retriever, "question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

        if st.button("Ask"):
            result = chain.invoke(question)
            st.subheader("Answer:")
            st.write(result)
else:
    st.warning("Please configure and submit the LLM settings and ensure the vector store is loaded to ask questions.")
|