BrightData_SerpAPI_LinkedIn_Profile_Scraping

Paused

App Files Files Community

ElegantSolutions committed on Jun 11

Commit

f953f49

verified ·

1 Parent(s): a9ddd8f

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -20

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ import shutil
 import os
 from difflib import SequenceMatcher
 import json
 def construct_query(row):
     """Constructs the Google search query using applicant data."""
@@ -73,41 +74,36 @@ def fetch_linkedin_links(query, api_key, applicant_name):
             "Content-Type": "application/json",
             "Authorization": f"Bearer {api_key}"
         }
         payload = {
-            "zone": "serp_api2",  # Or your configured BrightData zone name
-            "url": f"https://www.google.com/search?q={query}",
-            "format": "json"  # Or "raw" if you want HTML
         }
         response = requests.post("https://api.brightdata.com/request", headers=headers, data=json.dumps(payload))
         response.raise_for_status()
-        data = response.json()
-        # If "json" format is used and BrightData parses the page:
-        if "results" in data:
-            results = data["results"]
-        else:
-            # Fallback: Parse raw HTML if format is "raw"
-            results = []
-        # Search for LinkedIn links in response content (raw or parsed)
-        links = re.findall(linkedin_regex, response.text)
-        for link in links:
             profile_name = get_name_from_url(link)
             if profile_name:
                 similarity = calculate_similarity(applicant_name, profile_name)
                 if similarity >= 0.5:
                     return link
         return None
     except Exception as e:
         st.error(f"Error fetching link for query '{query}': {e}")
         return None
 def process_file(file, api_key):
     """Processes the uploaded Excel file to fetch LinkedIn profile links."""

 import os
 from difflib import SequenceMatcher
 import json
+from urllib.parse import quote_plus
 def construct_query(row):
     """Constructs the Google search query using applicant data."""
             "Content-Type": "application/json",
             "Authorization": f"Bearer {api_key}"
         }
+        encoded_query = quote_plus(query)
+        search_url = f"https://www.google.com/search?q={encoded_query}"
         payload = {
+            "zone": "serp_api2",  # Must match your BrightData dashboard zone
+            "url": search_url,
+            "format": "raw"  # Use "raw" if "json" gives problems
         }
         response = requests.post("https://api.brightdata.com/request", headers=headers, data=json.dumps(payload))
         response.raise_for_status()
+        html = response.text
+        linkedin_links = re.findall(r'https://(www|[a-z]{2})\.linkedin\.com/in/[a-zA-Z0-9\-]+', html)
+        linkedin_links = list(set(["https://" + link for link in linkedin_links]))  # De-duplicate
+        for link in linkedin_links:
             profile_name = get_name_from_url(link)
             if profile_name:
                 similarity = calculate_similarity(applicant_name, profile_name)
                 if similarity >= 0.5:
                     return link
         return None
     except Exception as e:
         st.error(f"Error fetching link for query '{query}': {e}")
         return None
 def process_file(file, api_key):
     """Processes the uploaded Excel file to fetch LinkedIn profile links."""