Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| def find_linked_urls(url): | |
| try: | |
| response = requests.get(url) | |
| if response.status_code == 200: | |
| soup = BeautifulSoup(response.text, 'html.parser') | |
| links = soup.find_all('a') | |
| urls = {link.get('href') for link in links if link.get('href') is not None} | |
| return urls | |
| else: | |
| st.write(f"Failed to retrieve {url}") | |
| except Exception as e: | |
| st.write(f"An error occurred with {url}: {e}") | |
| return set() | |
| def convert_to_absolute_urls(base_url, links): | |
| absolute_urls = [] | |
| for link in links: | |
| if not link.startswith('http'): | |
| link = urljoin(base_url, link) | |
| absolute_urls.append(link) | |
| return set(absolute_urls) | |
| def categorize_links(base_url, links): | |
| internal_links, external_links = set(), set() | |
| for link in links: | |
| if urlparse(link).netloc == urlparse(base_url).netloc: | |
| internal_links.add(link) | |
| else: | |
| external_links.add(link) | |
| return internal_links, external_links | |
| def main(): | |
| st.title("Data Source Configuration") | |
| st.subheader("Scan Websites for URLs") | |
| url_input = st.text_area("Enter URLs to scan, separated by new lines:") | |
| url_list = [url.strip() for url in url_input.strip().split('\n') if url.strip()] # Splitting and cleaning input | |
| if st.button("Scan URLs"): | |
| all_links = {} | |
| for url in url_list: | |
| unique_urls = find_linked_urls(url) | |
| absolute_urls = convert_to_absolute_urls(url, unique_urls) | |
| internal_links, external_links = categorize_links(url, absolute_urls) | |
| all_links[url] = {"internal": internal_links, "external": external_links} | |
| selected_urls = [] | |
| for base_url, links in all_links.items(): | |
| st.write(f"Base URL: {base_url}") | |
| include_all_internal = st.checkbox(f"Include all internal links from {base_url}", key=f"all_{base_url}") | |
| if include_all_internal: | |
| selected_urls.extend(links["internal"]) | |
| else: | |
| selected_internal = [link for link in links["internal"] if st.checkbox(link, key=link)] | |
| selected_urls.extend(selected_internal) | |
| # Displaying external links for informational purposes | |
| if links["external"]: | |
| st.write("External links:") | |
| for link in links["external"]: | |
| st.write(link) | |
| # Convert selected URLs to a DataFrame and display | |
| if selected_urls: | |
| df_selected_urls = pd.DataFrame(selected_urls, columns=['Selected URLs']) | |
| st.write(df_selected_urls) | |
| # Saving the DataFrame as CSV | |
| if st.button("Save Selected URLs to CSV"): | |
| df_selected_urls.to_csv('selected_urls.csv', index=False) | |
| st.success("Saved selected URLs to selected_urls.csv") | |
| if __name__ == "__main__": | |
| main() | |