BrightData_SerpAPI_LinkedIn_Profile_Scraping

Paused

App Files Files Community

ElegantSolutions committed on Jun 11

Commit

a9ddd8f

verified ·

1 Parent(s): a9239bd

Create app.py

Browse files

Files changed (1) hide show

app.py +152 -0

app.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import streamlit as st
+import pandas as pd
+import requests
+import re
+import tempfile
+import shutil
+import os
+from difflib import SequenceMatcher
+import json
def construct_query(row):
    """Build the Google search string for one applicant row.

    The query always begins with the applicant's name and always ends with
    the literal word "linkedin". Each optional column that is present in
    ``row`` and holds a usable value is inserted in between, separated by
    single spaces.

    Args:
        row: A mapping-like record (e.g. a pandas Series) that contains at
            least the key ``'Applicant Name'``.

    Returns:
        The assembled search query as a string.
    """
    pieces = [str(row['Applicant Name'])]
    for column in ('Job Title', 'State', 'City', 'Skills'):
        # Skip columns that are absent from the row or hold NaN/None.
        if column not in row or pd.isna(row[column]):
            continue
        cell = row[column]
        if isinstance(cell, str):
            cleaned = cell.strip()
            # Whitespace-only strings contribute nothing to the query.
            if cleaned:
                pieces.append(cleaned)
        else:
            # Non-string values are stringified and always included.
            pieces.append(str(cell).strip())
    pieces.append("linkedin")
    return " ".join(pieces)
def get_name_from_url(link):
    """Extract a human-readable name from a LinkedIn profile URL.

    The profile slug is the path segment that follows ``linkedin.com/in/``.
    The slug may contain word characters (including underscores and
    non-ASCII letters), hyphens and percent-encoded bytes; percent escapes
    are decoded so that a slug such as ``jos%C3%A9-garc%C3%ADa`` is not cut
    off at the first ``%``. Hyphens are turned into spaces.

    Args:
        link: URL (or any text containing one) to inspect. Empty or ``None``
            input is treated as "no match".

    Returns:
        The decoded slug with hyphens replaced by spaces, or ``None`` when
        ``link`` does not contain a ``linkedin.com/in/<slug>`` path.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import unquote

    if not link:
        return None
    match = re.search(r'linkedin\.com/in/((?:[\w-]|%[0-9A-Fa-f]{2})+)', link)
    if match is None:
        return None
    return unquote(match.group(1)).replace('-', ' ')
def calculate_similarity(name1, name2):
    """Return how alike two names are as a ratio between 0.0 and 1.0.

    Both names are lower-cased and stripped of surrounding whitespace before
    being compared with ``difflib.SequenceMatcher``.
    """
    left = name1.lower().strip()
    right = name2.lower().strip()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
+# def fetch_linkedin_links(query, api_key, applicant_name):
+#     """Fetches LinkedIn profile links using BrightData SERP API."""
+#     linkedin_regex = r'https://(www|[a-z]{2})\.linkedin\.com/.*'
+#     try:
+#         response = requests.get(
+#             "https://serpapi.brightdata.com/google/search",
+#             params={
+#                 "q": query,
+#                 "num": 5,
+#                 "api_key": api_key
+#             }
+#         )
+#         response.raise_for_status()
+#         results = response.json()
+#         organic_results = results.get("organic_results", [])
+#         for result in organic_results:
+#             link = result.get("link")
+#             if re.match(linkedin_regex, link):
+#                 profile_name = get_name_from_url(link)
+#                 if profile_name:
+#                     similarity = calculate_similarity(applicant_name, profile_name)
+#                     if similarity >= 0.5:
+#                         return link
+#         return None
+#     except Exception as e:
+#         st.error(f"Error fetching link for query '{query}': {e}")
+#         return None
def fetch_linkedin_links(query, api_key, applicant_name):
    """Fetch the best-matching LinkedIn profile link for one applicant.

    Sends ``query`` as a Google search through the BrightData request
    endpoint, scans the raw response text for LinkedIn URLs and returns the
    first one whose profile slug resembles ``applicant_name``.

    Args:
        query: Free-text Google search query; it is URL-encoded here.
        api_key: BrightData API token, sent as a Bearer token.
        applicant_name: Name the profile slug is compared against.

    Returns:
        The first LinkedIn URL whose slug has a similarity ratio of at least
        0.5 with ``applicant_name``, or ``None`` when nothing matches or the
        request fails. Failures are reported through ``st.error`` rather
        than raised, so one bad row does not abort a whole batch.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote_plus

    # The group is non-capturing so that re.findall returns whole URLs
    # instead of only the sub-domain. The tail stops at whitespace, quotes,
    # angle brackets, backslashes and '&', so a URL embedded in HTML or in
    # an escaped JSON string is not matched greedily to the end of the line.
    linkedin_regex = r'''https://(?:www|[a-z]{2})\.linkedin\.com/[^\s"'<>\\&]+'''
    try:
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        payload = {
            "zone": "serp_api2",  # Or your configured BrightData zone name
            # Encode the query so spaces, '&', '#', etc. cannot corrupt the URL.
            "url": f"https://www.google.com/search?q={quote_plus(query)}",
            "format": "json"  # Or "raw" if you want HTML
        }
        response = requests.post(
            "https://api.brightdata.com/request",
            headers=headers,
            data=json.dumps(payload),
            timeout=30,  # never hang the app on a stalled connection
        )
        response.raise_for_status()
        # Links are taken from the raw response text, so this works whether
        # the body is a JSON envelope or plain HTML.
        for link in re.findall(linkedin_regex, response.text):
            profile_name = get_name_from_url(link)
            if not profile_name:
                continue
            if calculate_similarity(applicant_name, profile_name) >= 0.5:
                return link
        return None
    except Exception as e:
        # Deliberately broad: report the failure and continue with next row.
        st.error(f"Error fetching link for query '{query}': {e}")
        return None
def process_file(file, api_key):
    """Process the uploaded Excel file and write a CSV with LinkedIn links.

    Rows without a usable ``'Applicant Name'`` are dropped. For each
    remaining row a search query is built with ``construct_query`` and a
    profile link is looked up with ``fetch_linkedin_links``. The result is
    written to ``updated_with_linkedin_links.csv`` in a new temporary
    directory.

    Args:
        file: Path or file-like object accepted by ``pandas.read_excel``.
        api_key: BrightData API token forwarded to ``fetch_linkedin_links``.

    Returns:
        Path of the written CSV file, or ``None`` if processing failed (the
        failure is reported through ``st.error``). The caller is responsible
        for deleting the temporary directory that contains the file.
    """
    temp_dir = None
    try:
        df = pd.read_excel(file)
        if 'Applicant Name' not in df.columns:
            # Give a readable message instead of a bare KeyError text.
            st.error("Error processing file: missing required column 'Applicant Name'")
            return None

        names = df['Applicant Name']
        # astype(str) lets the blank check work on non-string columns too.
        has_name = names.notna() & (names.astype(str).str.strip() != '')
        # Copy so the column assignments below do not write into a slice.
        df = df[has_name].copy()

        # Plain comprehensions (not df.apply) so that an empty sheet yields
        # empty columns instead of an assignment error.
        df['Search Query'] = [construct_query(row) for _, row in df.iterrows()]
        df['LinkedIn Link'] = [
            fetch_linkedin_links(search_query, api_key, str(name))
            for search_query, name in zip(df['Search Query'], df['Applicant Name'])
        ]

        temp_dir = tempfile.mkdtemp()
        output_file = os.path.join(temp_dir, "updated_with_linkedin_links.csv")
        df.to_csv(output_file, index=False)
        return output_file
    except Exception as e:
        # Do not leak the temporary directory when writing the CSV fails.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        st.error(f"Error processing file: {e}")
        return None
# Streamlit UI: collect the API key and the spreadsheet, then offer the
# resulting CSV for download.
st.title("LinkedIn Profile Link Scraper")
st.markdown("Upload an Excel file with applicant details, and get a CSV with LinkedIn profile links.")
api_key = st.text_input("Enter your BrightData SERP API Key", type="password")
uploaded_file = st.file_uploader("Upload Excel File", type=["xlsx"])
if not api_key:
    # Nothing can be fetched without credentials.
    st.warning("Please enter your BrightData SERP API key to proceed.")
elif uploaded_file:
    st.write("Processing file...")
    output_file = process_file(uploaded_file, api_key)
    if output_file:
        with open(output_file, "rb") as csv_handle:
            st.download_button(
                label="Download Updated CSV",
                data=csv_handle,
                file_name="updated_with_linkedin_links.csv",
                mime="text/csv"
            )
        # Remove the temporary directory that process_file created.
        shutil.rmtree(os.path.dirname(output_file))