import asyncio
import json
from typing import Iterable

import streamlit as st
from langchain.schema import Document
from langchain_community.document_loaders import AsyncHtmlLoader


async def fetch_documents(urls):
    """Fetch the HTML behind *urls* and return the loaded documents.

    ``AsyncHtmlLoader.load()`` is a regular blocking method that returns a
    list, so it cannot be awaited directly. It is run in the default executor
    so that awaiting this coroutine does not block the running event loop.

    Args:
        urls: The URLs handed to ``AsyncHtmlLoader``.

    Returns:
        Whatever ``AsyncHtmlLoader(urls).load()`` returns (a list of
        ``Document`` objects per the LangChain documentation).
    """
    loader = AsyncHtmlLoader(urls)
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, loader.load)


def _doc_to_dict(doc) -> dict:
    """Return a JSON-serializable mapping for *doc*.

    Objects that expose a callable ``to_dict`` keep using it; anything else is
    serialized from its ``page_content`` and ``metadata`` attributes, which
    matches the keyword arguments ``load_docs_from_jsonl`` passes back to
    ``Document``.
    """
    to_dict = getattr(doc, "to_dict", None)
    if callable(to_dict):
        return to_dict()
    return {"page_content": doc.page_content, "metadata": doc.metadata}


def save_docs_to_jsonl(array: Iterable[Document], file_path: str) -> None:
    """Write every document in *array* to *file_path*, one JSON object per line.

    The target file is overwritten. ``json.dumps`` raises ``TypeError`` if a
    document's metadata contains values that are not JSON-serializable.
    """
    with open(file_path, 'w', encoding='utf-8') as jsonl_file:
        for doc in array:
            jsonl_file.write(json.dumps(_doc_to_dict(doc)) + '\n')


def load_docs_from_jsonl(file_path) -> Iterable[Document]:
    """Read a JSONL file and rebuild a list of ``Document`` objects.

    Each non-blank line is parsed as a JSON object whose keys are passed to
    ``Document`` as keyword arguments. Blank lines (e.g. a trailing newline
    added by an editor) are skipped instead of failing in ``json.loads``.
    """
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        return [
            Document(**json.loads(line))
            for line in jsonl_file
            if line.strip()
        ]


def fetch_clean_organize_page():
    """Render the Streamlit page that fetches, saves and offers documents.

    Reads the URL list from ``st.session_state['selected_urls']`` (created as
    an empty list when missing), stores fetched documents under
    ``st.session_state['docs']``, and on request writes them to
    ``documents.jsonl`` and shows a download button for that file.
    """
    st.title("Fetch, Clean, and Organize Documents")

    # Make sure the key exists so the page works before any URL was selected.
    if 'selected_urls' not in st.session_state:
        st.session_state['selected_urls'] = []

    urls = st.session_state['selected_urls']

    if st.button("Fetch Documents"):
        # fetch_documents is a coroutine function: it has to be driven to
        # completion, otherwise a coroutine object (not the documents) would
        # be stored and len() below would fail.
        # NOTE(review): asyncio.run assumes no event loop is already running
        # in the thread executing this script - confirm for your deployment.
        st.session_state['docs'] = asyncio.run(fetch_documents(urls))

    if 'docs' in st.session_state:
        st.write(f"Fetched {len(st.session_state['docs'])} documents.")

        if st.button("Save Documents as JSON"):
            save_docs_to_jsonl(st.session_state['docs'], "documents.jsonl")
            st.success("Documents saved as JSON.")

            # Offer the file that was just written for download.
            with open("documents.jsonl", "rb") as file:
                st.download_button(
                    label="Download JSON",
                    data=file,
                    file_name="documents.jsonl",
                    mime="application/octet-stream",
                )


# The page is rendered whenever this script is executed by the app.
fetch_clean_organize_page()