# mpi_data_store / pages/file_web_source_collection.py
# Initial web collection and scan
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def find_linked_urls(url):
    """Fetch a page and return the set of raw href values found in its <a> tags."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            urls = {link.get('href') for link in links if link.get('href') is not None}
            return urls
        else:
            st.write(f"Failed to retrieve {url}")
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
    # Fall through on failure so callers always receive a set.
    return set()
def convert_to_absolute_urls(base_url, links):
    """Resolve relative hrefs against base_url and return a de-duplicated set."""
    absolute_urls = []
    for link in links:
        if not link.startswith('http'):
            link = urljoin(base_url, link)
        absolute_urls.append(link)
    return set(absolute_urls)
def categorize_links(base_url, links):
    """Split links into internal and external sets by comparing network locations."""
    internal_links, external_links = set(), set()
    for link in links:
        if urlparse(link).netloc == urlparse(base_url).netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links
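
# The three helpers above are composed by main() below: fetch raw hrefs, resolve
# them to absolute URLs, then split them by host. Illustrative usage (the URL is
# a placeholder, not part of the original page):
#
#     raw = find_linked_urls("https://example.com")
#     absolute = convert_to_absolute_urls("https://example.com", raw)
#     internal, external = categorize_links("https://example.com", absolute)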
def main():
    st.title("Data Source Configuration")
    st.subheader("Scan Websites for URLs")

    url_input = st.text_area("Enter URLs to scan, separated by new lines:")
    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]  # Split and clean the input

    if st.button("Scan URLs"):
        # Collect, resolve, and categorize the links found on each page.
        all_links = {}
        for url in url_list:
            unique_urls = find_linked_urls(url)
            absolute_urls = convert_to_absolute_urls(url, unique_urls)
            internal_links, external_links = categorize_links(url, absolute_urls)
            all_links[url] = {"internal": internal_links, "external": external_links}

        # Let the user pick which internal links to keep (see the session_state
        # sketch below main() for persisting these selections across reruns).
        selected_urls = []
        for base_url, links in all_links.items():
            st.write(f"Base URL: {base_url}")
            include_all_internal = st.checkbox(f"Include all internal links from {base_url}", key=f"all_{base_url}")
            if include_all_internal:
                selected_urls.extend(links["internal"])
            else:
                selected_internal = [link for link in links["internal"] if st.checkbox(link, key=link)]
                selected_urls.extend(selected_internal)

            # Display external links for informational purposes
            if links["external"]:
                st.write("External links:")
                for link in links["external"]:
                    st.write(link)

        # Convert selected URLs to a DataFrame and display
        if selected_urls:
            df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs'])
            st.write(df_selected_urls)

            # Save the selected URLs to a CSV file
            if st.button("Save Selected URLs to CSV"):
                df_selected_urls.to_csv('selected_urls.csv', index=False)
                st.success("Saved selected URLs to selected_urls.csv")
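
# Note: st.button() returns True only on the script run in which it was clicked,
# and Streamlit reruns the whole script on every widget interaction. Because the
# per-link checkboxes and the "Save Selected URLs to CSV" button above are nested
# inside `if st.button("Scan URLs"):`, their state is rebuilt from scratch on the
# next rerun. The helper below is a minimal sketch (an assumption, not part of
# the original page) of caching scan results in st.session_state so the selection
# UI could be re-rendered on every rerun.
def scan_and_cache(url_list):
    """Hypothetical helper: scan url_list once and keep the results in session state."""
    if st.button("Scan URLs", key="scan_urls"):
        cached = {}
        for url in url_list:
            unique_urls = find_linked_urls(url)
            absolute_urls = convert_to_absolute_urls(url, unique_urls)
            internal_links, external_links = categorize_links(url, absolute_urls)
            cached[url] = {"internal": internal_links, "external": external_links}
        st.session_state["all_links"] = cached
    # On later reruns the cached results remain available for the selection UI.
    return st.session_state.get("all_links", {})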
if __name__ == "__main__":
    main()