BrightData_SerpAPI_LinkedIn_Profile_Scraping

Paused

App Files Community

ElegantSolutions committed on Jan 30

Commit

0297e38

verified ·

1 Parent(s): d2a6463

Updating app.py

Browse files

Files changed (1) hide show

app.py +49 -97

app.py CHANGED Viewed

@@ -1,153 +1,105 @@
-import streamlit as st  # Import Streamlit for creating a web app interface
-import pandas as pd  # Import pandas for data manipulation
-from serpapi import GoogleSearch  # Import SerpAPI to perform Google searches
-import re  # Import regex module for pattern matching
-import tempfile  # Import tempfile for creating temporary files
-import shutil  # Import shutil for file operations
-import os  # Import os for handling file paths
-from difflib import SequenceMatcher  # Import SequenceMatcher to calculate string similarity
-# Function to construct a Google search query from applicant data
 def construct_query(row):
     """Constructs the Google search query using applicant data."""
-    query = str(row['Applicant Name'])  # Start with the applicant's name
-    print(f"Constructing query for Applicant Name: {row['Applicant Name']}")
-    # Additional fields to include in the search query if available
     optional_fields = ['Job Title', 'State', 'City', 'Skills']
     for field in optional_fields:
-        if field in row and pd.notna(row[field]):  # Check if the field exists and is not NaN
             value = row[field]
-            if isinstance(value, str) and value.strip():  # Ensure the value is a non-empty string
-                query += f" {value.strip()}"  # Add the value to the query
-            elif not isinstance(value, str):  # Handle non-string values
                 query += f" {str(value).strip()}"
-    query += " linkedin"  # Append "linkedin" to focus search on LinkedIn profiles
-    print(f"Constructed query: {query}")
     return query
-# Function to extract the name from a LinkedIn profile URL
 def get_name_from_url(link):
     """Extracts the name part from a LinkedIn profile URL."""
-    print(f"Extracting name from LinkedIn URL: {link}")
-    match = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)  # Regex to find profile name
     if match:
-        name = match.group(1).replace('-', ' ')  # Replace dashes with spaces for readability
-        print(f"Extracted name: {name}")
-        return name
-    print("No name extracted from URL.")
     return None
-# Function to calculate similarity between two names
 def calculate_similarity(name1, name2):
     """Calculates similarity between two names."""
-    similarity = SequenceMatcher(None, name1.lower().strip(), name2.lower().strip()).ratio()
-    print(f"Calculated similarity between '{name1}' and '{name2}': {similarity}")
-    return similarity
-# Function to fetch LinkedIn links using SerpAPI
 def fetch_linkedin_links(query, api_key, applicant_name):
-    """Fetches LinkedIn profile links and validates them against the applicant's name."""
-    linkedin_regex = r'https://(www|[a-z]{2})\.linkedin\.com/.*'  # Regex for LinkedIn links
     try:
-        print(f"Fetching LinkedIn links for query: {query}")
-        search = GoogleSearch({
-            "q": query,  # The search query
-            "num": 5,  # Number of search results
-            "api_key": api_key  # API key for SerpAPI
-        })
-        # Execute the search and get results
-        results = search.get_dict()
-        organic_results = results.get("organic_results", [])  # Extract organic search results
-        print(f"Raw search results: {organic_results}")
-        # Iterate through results to find LinkedIn links
         for result in organic_results:
-            link = result.get("link")  # Get the URL of the search result
-            print(f"Checking link: {link}")
-            if re.match(linkedin_regex, link):  # Check if the link matches LinkedIn regex
-                profile_name = get_name_from_url(link)  # Extract the name from the URL
                 if profile_name:
-                    similarity = calculate_similarity(applicant_name, profile_name)  # Validate name similarity
-                    if similarity >= 0.5:  # Accept link if similarity is above the threshold
-                        print(f"Valid LinkedIn link found: {link} (Similarity: {similarity})")
                         return link
-                    else:
-                        print(f"Rejected link: {link} (Similarity: {similarity})")
-            else:
-                print(f"Link does not match LinkedIn regex: {link}")
-        print("No valid LinkedIn link found.")
         return None
     except Exception as e:
-        print(f"Error fetching link for query '{query}': {e}")
         st.error(f"Error fetching link for query '{query}': {e}")
         return None
-# Function to process the uploaded Excel file
 def process_file(file, api_key):
     """Processes the uploaded Excel file to fetch LinkedIn profile links."""
     try:
-        print("Reading uploaded Excel file...")
-        df = pd.read_excel(file)  # Read the Excel file into a pandas DataFrame
-        print(f"Initial DataFrame:\n{df.head()}")
-        # Filter out rows with empty or missing applicant names
         df = df[df['Applicant Name'].notna()]
         df = df[df['Applicant Name'].str.strip() != '']
-        print(f"Filtered DataFrame:\n{df.head()}")
-        # Generate search queries for each applicant
         df['Search Query'] = df.apply(construct_query, axis=1)
-        print(f"DataFrame with Search Queries:\n{df[['Applicant Name', 'Search Query']].head()}")
-        # Fetch LinkedIn links for each applicant
         df['LinkedIn Link'] = df.apply(
             lambda row: fetch_linkedin_links(row['Search Query'], api_key, row['Applicant Name']),
             axis=1
         )
-        print(f"DataFrame with LinkedIn Links:\n{df.head()}")
-        # Save the updated DataFrame to a temporary file
-        temp_dir = tempfile.mkdtemp()  # Create a temporary directory
         output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
-        df.to_csv(output_file, index=False)  # Save as CSV
-        print(f"CSV file created at: {output_file}")
         return output_file
     except Exception as e:
-        print(f"Error processing file: {e}")
         st.error(f"Error processing file: {e}")
         return None
-# Streamlit UI setup
-st.title("LinkedIn Profile Link Scraper")  # App title
-st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")  # Description
-# Input for SerpAPI Key
-api_key = st.text_input("Enter your SerpAPI Key", type="password")  # Input for SerpAPI key
-# File uploader widget
-uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])  # File uploader for Excel files
-# Process the file if both file and API key are provided
 if uploaded_file and api_key:
     st.write("Processing file...")
-    output_file = process_file(uploaded_file, api_key)  # Process the uploaded file
     if output_file:
-        with open(output_file, "rb") as f:  # Open the CSV for download
             st.download_button(
                 label="Download Updated CSV",
                 data=f,
                 file_name="updated_with_linkedin_links.csv",
                 mime="text/csv"
             )
-        print("File ready for download.")
-        # Clean up the temporary directory after download
         shutil.rmtree(os.path.dirname(output_file))
-        print("Temporary files cleaned up.")
 elif not api_key:
-    st.warning("Please enter your SerpAPI key to proceed.")  # Warning for missing API key

+import streamlit as st
+import pandas as pd
+import requests
+import re
+import tempfile
+import shutil
+import os
+from difflib import SequenceMatcher
 def construct_query(row):
     """Build the Google search query for one applicant record.

     Args:
         row: Mapping-like record (a DataFrame row in this app) holding an
             'Applicant Name' entry and, optionally, 'Job Title', 'State',
             'City' and 'Skills'.

     Returns:
         The stripped applicant name, then every optional field that is
         present, not NaN and not blank, then the keyword "linkedin", all
         separated by single spaces.
     """
     # Strip the name so padding from spreadsheet cells does not leak into the query.
     parts = [str(row['Applicant Name']).strip()]
     for field in ('Job Title', 'State', 'City', 'Skills'):
         if field not in row:
             continue
         value = row[field]
         if not pd.notna(value):
             continue
         # Non-string cells (numbers, dates) are included via their str() form.
         text = str(value).strip()
         if text:
             parts.append(text)
     # The trailing keyword biases the search towards LinkedIn profile pages.
     parts.append("linkedin")
     return " ".join(parts)
 def get_name_from_url(link):
     """Extract the profile slug of a LinkedIn profile URL as a spaced name.

     Args:
         link: URL to inspect. May be None or a non-string when a search
             result carries no usable link.

     Returns:
         The slug that follows "linkedin.com/in/" with dashes replaced by
         spaces, or None when link is not a string or has no such slug.
     """
     # Guard first: re.search raises TypeError on None or other non-string input.
     if not isinstance(link, str):
         return None
     match = re.search(r'linkedin\.com/in/([a-zA-Z0-9-]+)', link)
     if match is None:
         return None
     return match.group(1).replace('-', ' ')
 def calculate_similarity(name1, name2):
     """Return the similarity ratio of two names, ignoring case and padding.

     Args:
         name1: First name. Non-string values are compared via str().
         name2: Second name. Non-string values are compared via str().

     Returns:
         The difflib.SequenceMatcher ratio (0.0 to 1.0) of the lower-cased,
         stripped names, or 0.0 when either name is None.
     """
     # A missing name cannot match anything; avoid AttributeError on None.
     if name1 is None or name2 is None:
         return 0.0
     left = str(name1).lower().strip()
     right = str(name2).lower().strip()
     return SequenceMatcher(None, left, right).ratio()
 def fetch_linkedin_links(query, api_key, applicant_name):
     """Fetch the first LinkedIn profile link matching the applicant via the BrightData SERP API.

     The entries under "organic_results" are checked in order. A result is
     accepted when its link is a LinkedIn URL, a profile name can be
     extracted from it, and that name has a similarity of at least 0.5
     with applicant_name.

     Args:
         query: Search query string sent as the "q" parameter.
         api_key: Key sent as the "api_key" parameter.
         applicant_name: Name the profile slug is compared against.

     Returns:
         The accepted link, or None when no result qualifies or the request
         fails (the failure is reported through st.error).
     """
     # Compiled once per call instead of once per result.
     linkedin_pattern = re.compile(r'https://(www|[a-z]{2})\.linkedin\.com/.*')
     try:
         # NOTE(review): endpoint and parameter names are kept as they were;
         # confirm them against the BrightData documentation.
         response = requests.get(
             "https://serpapi.brightdata.com/google/search",
             params={
                 "q": query,
                 "num": 5,
                 "api_key": api_key
             },
             # Without a timeout a stalled connection would block the app forever.
             timeout=30
         )
         response.raise_for_status()
         # "or []" also covers a payload where the key is present but null.
         organic_results = response.json().get("organic_results") or []
         for result in organic_results:
             link = result.get("link")
             # Skip results without a usable link instead of letting re.match
             # raise and discard the remaining results.
             if not isinstance(link, str) or not linkedin_pattern.match(link):
                 continue
             profile_name = get_name_from_url(link)
             if not profile_name:
                 continue
             if calculate_similarity(applicant_name, profile_name) >= 0.5:
                 return link
         return None
     except Exception as e:
         # Best-effort lookup: report the problem in the UI and carry on with other rows.
         st.error(f"Error fetching link for query '{query}': {e}")
         return None
 def process_file(file, api_key):
     """Process the uploaded Excel file and write a CSV enriched with LinkedIn links.

     Rows whose 'Applicant Name' is missing or blank are dropped. Each
     remaining row gets a 'Search Query' column (from construct_query) and a
     'LinkedIn Link' column (from fetch_linkedin_links).

     Args:
         file: Excel source accepted by pandas.read_excel.
         api_key: Key forwarded to fetch_linkedin_links.

     Returns:
         Path of "updated_with_linkedin_links.csv" inside a fresh temporary
         directory (the caller removes that directory), or None when
         processing fails (the failure is reported through st.error).
     """
     temp_dir = None
     try:
         df = pd.read_excel(file)
         if 'Applicant Name' not in df.columns:
             raise ValueError("the file has no 'Applicant Name' column")
         names = df['Applicant Name']
         # astype(str) keeps the .str accessor usable when the column holds
         # numbers or mixed values; notna() still removes the empty cells.
         keep = names.notna() & (names.astype(str).str.strip() != '')
         # Copy so the new columns are added to an independent frame, not a slice.
         df = df[keep].copy()
         # Plain lists instead of DataFrame.apply(axis=1), which does not
         # return a Series for a frame without rows.
         queries = [construct_query(row) for _, row in df.iterrows()]
         df['Search Query'] = queries
         df['LinkedIn Link'] = [
             fetch_linkedin_links(query, api_key, name)
             for query, name in zip(queries, df['Applicant Name'])
         ]
         temp_dir = tempfile.mkdtemp()
         output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
         df.to_csv(output_file, index=False)
         return output_file
     except Exception as e:
         # Do not leave the temporary directory behind when writing failed.
         if temp_dir is not None:
             shutil.rmtree(temp_dir, ignore_errors=True)
         st.error(f"Error processing file: {e}")
         return None
# Streamlit UI: ask for the API key and the Excel file, then offer the
# enriched CSV for download.
st.title("LinkedIn Profile Link Scraper")
st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")
api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])
if uploaded_file and api_key:
    st.write("Processing file...")
    output_file = process_file(uploaded_file, api_key)
    if output_file:
        # Read the CSV into memory first so the temporary directory is
        # removed even if reading or rendering the button fails.
        try:
            with open(output_file, "rb") as f:
                csv_bytes = f.read()
        finally:
            shutil.rmtree(os.path.dirname(output_file), ignore_errors=True)
        st.download_button(
            label="Download Updated CSV",
            data=csv_bytes,
            file_name="updated_with_linkedin_links.csv",
            mime="text/csv"
        )
elif not api_key:
    st.warning("Please enter your BrightData SERP API key to proceed.")