Spaces:

pquiggles
/

vsp-demo

Runtime error

App Files Files Community

navkast committed on Sep 10, 2024

Commit

6abd06b

unverified ·

1 Parent(s): dce8143

Add per-minute rate limiter for linkedin (#6)

Browse files

Files changed (1) hide show

src/vsp/app/scrapers/linkedin_downloader.py +50 -2

src/vsp/app/scrapers/linkedin_downloader.py CHANGED Viewed

@@ -7,6 +7,7 @@ of requests in case of failures.
 Classes:
     LinkedInFetchFailedError: Custom exception for LinkedIn fetch failures.
     LinkedinDownloader: Main class for downloading LinkedIn profile data.
 Usage:
@@ -15,6 +16,8 @@ Usage:
 """
 import asyncio
 from typing import Final
 import aiohttp
@@ -30,6 +33,47 @@ class LinkedInFetchFailedError(Exception):
     """Custom exception raised when fetching LinkedIn profile data fails."""
 class LinkedinDownloader:
     """
     A class for asynchronously downloading LinkedIn profile data.
@@ -42,20 +86,23 @@ class LinkedinDownloader:
         _X_RAPIDAPI_HOST (Final[str]): The RapidAPI host for LinkedIn API.
         _api_key (str): The RapidAPI key for authentication.
         _semaphore (asyncio.Semaphore): Semaphore for limiting concurrent requests.
     """
     _URL: Final[str] = "https://linkedin-api8.p.rapidapi.com/"
     _X_RAPIDAPI_HOST: Final[str] = "linkedin-api8.p.rapidapi.com"
-    def __init__(self, max_concurrency: int = 10):
         """
         Initialize the LinkedinDownloader.
         Args:
-            max_concurrency (int): Maximum number of concurrent API calls. Defaults to 10.
         """
         self._api_key = self._fetch_api_key()
         self._semaphore = asyncio.Semaphore(max_concurrency)
     @staticmethod
     def _fetch_api_key() -> str:
@@ -116,6 +163,7 @@ class LinkedinDownloader:
             LinkedInFetchFailedError: If the API call fails after all retry attempts.
         """
         async with self._semaphore:
             headers, querystring = self._compose_request(linkedin_url)
             logger.info("Fetching LinkedIn profile", url=linkedin_url)
             async with aiohttp.ClientSession() as session:

 Classes:
     LinkedInFetchFailedError: Custom exception for LinkedIn fetch failures.
+    RateLimiter: Token bucket algorithm implementation for rate limiting.
     LinkedinDownloader: Main class for downloading LinkedIn profile data.
 Usage:
 """
 import asyncio
+import math
+import time
 from typing import Final
 import aiohttp
     """Custom exception raised when fetching LinkedIn profile data fails."""
class RateLimiter:
    """
    Implements a token bucket algorithm for rate limiting.

    The bucket holds at most ``rate`` tokens and is refilled continuously at
    ``rate / per`` tokens per second. Each call to :meth:`acquire` consumes one
    token, waiting for the bucket to refill when it is empty. The bucket starts
    full, so up to ``rate`` calls may proceed immediately.

    Attributes:
        rate (int): Bucket capacity and number of tokens added per period.
        per (float): Length of the refill period in seconds.
        allowance (float): Tokens currently available (may be fractional).
        last_check (float): ``time.monotonic()`` reading of the last refill.
    """

    def __init__(self, rate: int, per: float = 60.0):
        """
        Initialize the RateLimiter.

        Args:
            rate (int): The number of tokens (requests) allowed per time period.
            per (float): The time period in seconds. Defaults to 60.0 (1 minute).

        Raises:
            ValueError: If ``rate`` or ``per`` is not positive.
        """
        # Fail at construction instead of with a ZeroDivisionError on first wait.
        if rate <= 0:
            raise ValueError("rate must be positive")
        if per <= 0:
            raise ValueError("per must be positive")
        self.rate = rate
        self.per = per
        self.allowance: float = float(rate)
        self.last_check = time.monotonic()
        # Serializes callers so that tasks finding the bucket empty queue up
        # one behind another instead of all sleeping the same interval and
        # waking together.
        self._lock = asyncio.Lock()

    def _refill(self) -> None:
        """Credit the tokens earned since the last refill, capped at ``rate``."""
        now = time.monotonic()
        elapsed = now - self.last_check
        self.last_check = now
        # Keep the fractional part: truncating here while still advancing
        # ``last_check`` would throw away the time between closely spaced calls.
        earned = elapsed * (self.rate / self.per)
        self.allowance = min(float(self.rate), self.allowance + earned)

    async def acquire(self) -> None:
        """
        Acquire a token from the bucket, waiting if necessary.

        If no whole token is available, sleep for the time needed to earn the
        missing fraction, then re-check. Callers are served one at a time, in
        the order they obtain the internal lock.
        """
        async with self._lock:
            while True:
                # Refilling after every sleep advances ``last_check`` past the
                # time spent waiting, so that time is credited exactly once.
                self._refill()
                if self.allowance >= 1:
                    self.allowance -= 1
                    return
                deficit = 1 - self.allowance
                await asyncio.sleep(deficit * self.per / self.rate)
 class LinkedinDownloader:
     """
     A class for asynchronously downloading LinkedIn profile data.
         _X_RAPIDAPI_HOST (Final[str]): The RapidAPI host for LinkedIn API.
         _api_key (str): The RapidAPI key for authentication.
         _semaphore (asyncio.Semaphore): Semaphore for limiting concurrent requests.
+        _rate_limiter (RateLimiter): Rate limiter for controlling requests per minute.
     """
     _URL: Final[str] = "https://linkedin-api8.p.rapidapi.com/"
     _X_RAPIDAPI_HOST: Final[str] = "linkedin-api8.p.rapidapi.com"
+    def __init__(self, max_concurrency: int = 2, max_per_minute: int = 10):
         """
         Initialize the LinkedinDownloader.
         Args:
+            max_concurrency (int): Maximum number of concurrent API calls. Defaults to 2.
+            max_per_minute (int): Maximum number of requests per minute. Defaults to 10.
         """
         self._api_key = self._fetch_api_key()
         self._semaphore = asyncio.Semaphore(max_concurrency)
+        self._rate_limiter = RateLimiter(max_per_minute)
     @staticmethod
     def _fetch_api_key() -> str:
             LinkedInFetchFailedError: If the API call fails after all retry attempts.
         """
         async with self._semaphore:
+            await self._rate_limiter.acquire()  # Acquire a token from the rate limiter
             headers, querystring = self._compose_request(linkedin_url)
             logger.info("Fetching LinkedIn profile", url=linkedin_url)
             async with aiohttp.ClientSession() as session: