# mpi_data_store / pages/file_web_source_collection.py
# Initial web collection and scan
import streamlit as st
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
def find_linked_urls(url):
    """Fetch a page and return the set of raw href values found in its <a> tags."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all('a')
            urls = {link.get('href') for link in links if link.get('href') is not None}
            return urls
        else:
            st.write(f"Failed to retrieve {url}")
    except Exception as e:
        st.write(f"An error occurred with {url}: {e}")
    # Fall through on failure so callers always receive a set.
    return set()
def convert_to_absolute_urls(base_url, links):
    """Resolve relative hrefs against base_url and return a de-duplicated set."""
    absolute_urls = []
    for link in links:
        if not link.startswith('http'):
            link = urljoin(base_url, link)
        absolute_urls.append(link)
    return set(absolute_urls)
def categorize_links(base_url, links):
    """Split links into internal and external sets by comparing network locations."""
    internal_links, external_links = set(), set()
    for link in links:
        if urlparse(link).netloc == urlparse(base_url).netloc:
            internal_links.add(link)
        else:
            external_links.add(link)
    return internal_links, external_links
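
# The three helpers above are composed by main() below: fetch raw hrefs, resolve
# them to absolute URLs, then split them by host. Illustrative usage (the URL is
# a placeholder, not part of the original page):
#
#     raw = find_linked_urls("https://example.com")
#     absolute = convert_to_absolute_urls("https://example.com", raw)
#     internal, external = categorize_links("https://example.com", absolute)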
def main():
    st.title("Data Source Configuration")
    st.subheader("Scan Websites for URLs")

    url_input = st.text_area("Enter URLs to scan, separated by new lines:")
    url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()]  # Split and clean the input

    if st.button("Scan URLs"):
        # Collect, resolve, and categorize the links found on each page.
        all_links = {}
        for url in url_list:
            unique_urls = find_linked_urls(url)
            absolute_urls = convert_to_absolute_urls(url, unique_urls)
            internal_links, external_links = categorize_links(url, absolute_urls)
            all_links[url] = {"internal": internal_links, "external": external_links}

        # Let the user pick which internal links to keep (see the session_state
        # sketch below main() for persisting these selections across reruns).
        selected_urls = []
        for base_url, links in all_links.items():
            st.write(f"Base URL: {base_url}")
            include_all_internal = st.checkbox(f"Include all internal links from {base_url}", key=f"all_{base_url}")
            if include_all_internal:
                selected_urls.extend(links["internal"])
            else:
                selected_internal = [link for link in links["internal"] if st.checkbox(link, key=link)]
                selected_urls.extend(selected_internal)

            # Display external links for informational purposes
            if links["external"]:
                st.write("External links:")
                for link in links["external"]:
                    st.write(link)

        # Convert selected URLs to a DataFrame and display
        if selected_urls:
            df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs'])
            st.write(df_selected_urls)

            # Save the selected URLs to a CSV file
            if st.button("Save Selected URLs to CSV"):
                df_selected_urls.to_csv('selected_urls.csv', index=False)
                st.success("Saved selected URLs to selected_urls.csv")
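
# Note: st.button() returns True only on the script run in which it was clicked,
# and Streamlit reruns the whole script on every widget interaction. Because the
# per-link checkboxes and the "Save Selected URLs to CSV" button above are nested
# inside `if st.button("Scan URLs"):`, their state is rebuilt from scratch on the
# next rerun. The helper below is a minimal sketch (an assumption, not part of
# the original page) of caching scan results in st.session_state so the selection
# UI could be re-rendered on every rerun.
def scan_and_cache(url_list):
    """Hypothetical helper: scan url_list once and keep the results in session state."""
    if st.button("Scan URLs", key="scan_urls"):
        cached = {}
        for url in url_list:
            unique_urls = find_linked_urls(url)
            absolute_urls = convert_to_absolute_urls(url, unique_urls)
            internal_links, external_links = categorize_links(url, absolute_urls)
            cached[url] = {"internal": internal_links, "external": external_links}
        st.session_state["all_links"] = cached
    # On later reruns the cached results remain available for the selection UI.
    return st.session_state.get("all_links", {})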
if __name__ == "__main__":
    main()