import time
from collections import namedtuple
from pathlib import Path
from typing import List

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from faker import Faker

fake = Faker()

MAX_RETRIES = 10
SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL = 0.1
FILING_DETAILS_FILENAME_STEM = "filing-details"
SEC_EDGAR_SEARCH_API_ENDPOINT = "https://efts.sec.gov/LATEST/search-index"
SEC_EDGAR_ARCHIVES_BASE_URL = "https://www.sec.gov/Archives/edgar/data"
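# Retry transient failures and EDGAR rate-limit responses (403) with
# urllib3's exponential backoff: each retry waits progressively longer,
# scaled by backoff_factor.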
retries = Retry(
    total=MAX_RETRIES,
    backoff_factor=SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL,
    status_forcelist=[403, 500, 502, 503, 504],
)
FilingMetadata = namedtuple(
    "FilingMetadata",
    [
        "accession_number",
        "full_submission_url",
        "filing_details_url",
        "filing_details_filename",
    ],
)


class EdgarSearchApiError(Exception):
    pass
def form_request_payload(
    ticker_or_cik: str,
    filing_types: List[str],
    start_date: str,
    end_date: str,
    start_index: int,
    query: str,
) -> dict:
    payload = {
        "dateRange": "custom",
        "startdt": start_date,
        "enddt": end_date,
        "entityName": ticker_or_cik,
        "forms": filing_types,
        "from": start_index,
        "q": query,
    }
    return payload
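# A sketch of the payload this builds; the ticker, form type, and dates are
# illustrative values, not defaults of this module:
#
#     form_request_payload("AAPL", ["8-K"], "2020-01-01", "2020-12-31", 0, "")
#     # -> {"dateRange": "custom", "startdt": "2020-01-01",
#     #     "enddt": "2020-12-31", "entityName": "AAPL", "forms": ["8-K"],
#     #     "from": 0, "q": ""}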
def build_filing_metadata_from_hit(hit: dict) -> FilingMetadata:
    accession_number, filing_details_filename = hit["_id"].split(":", 1)
    # The company CIK should be last in the CIK list. The list may also
    # include the CIKs of executives carrying out insider transactions,
    # as in Form 4 filings.
    cik = hit["_source"]["ciks"][-1]
    accession_number_no_dashes = accession_number.replace("-", "", 2)
    submission_base_url = (
        f"{SEC_EDGAR_ARCHIVES_BASE_URL}/{cik}/{accession_number_no_dashes}"
    )
    full_submission_url = f"{submission_base_url}/{accession_number}.txt"
    # Use the XSL stylesheet path if the human-readable version is wanted.
    # XSL is required to download the human-readable, styled version of XML
    # documents such as Form 4, e.g.:
    #   SEC_EDGAR_ARCHIVES_BASE_URL + /320193/000032019320000066/wf-form4_159839550969947.xml
    #   SEC_EDGAR_ARCHIVES_BASE_URL +
    #       /320193/000032019320000066/xslF345X03/wf-form4_159839550969947.xml
    # xsl = hit["_source"]["xsl"]
    # if xsl is not None:
    #     filing_details_url = f"{submission_base_url}/{xsl}/{filing_details_filename}"
    # else:
    #     filing_details_url = f"{submission_base_url}/{filing_details_filename}"
    filing_details_url = f"{submission_base_url}/{filing_details_filename}"
    # Normalize the extension so ".htm" becomes ".html". A bare
    # str.replace("htm", "html") would mangle names already ending in
    # ".html" (yielding ".htmll"), so match the suffix exactly instead.
    filing_details_filename_extension = Path(filing_details_filename).suffix
    if filing_details_filename_extension == ".htm":
        filing_details_filename_extension = ".html"
    filing_details_filename = (
        f"{FILING_DETAILS_FILENAME_STEM}{filing_details_filename_extension}"
    )
    return FilingMetadata(
        accession_number=accession_number,
        full_submission_url=full_submission_url,
        filing_details_url=filing_details_url,
        filing_details_filename=filing_details_filename,
    )
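# A sketch of the URL construction, using a made-up Form 4 hit (the accession
# number, CIK, and filename are illustrative only):
#
#     hit = {
#         "_id": "0000320193-20-000066:wf-form4_159839550969947.xml",
#         "_source": {"ciks": ["320193"]},
#     }
#     metadata = build_filing_metadata_from_hit(hit)
#     # metadata.full_submission_url ->
#     #   ".../Archives/edgar/data/320193/000032019320000066/0000320193-20-000066.txt"
#     # metadata.filing_details_filename -> "filing-details.xml"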
def generate_random_user_agent() -> str:
    # SEC EDGAR's fair-access guidelines expect requests to declare a user
    # agent identifying the requester; this generates a random one per request.
    return f"{fake.first_name()} {fake.last_name()} {fake.email()}"
def get_filing_urls_to_download(
    filing_type: str,
    ticker_or_cik: str,
    num_filings_to_download: int,
    after_date: str,
    before_date: str,
    include_amends: bool,
    query: str = "",
) -> List[FilingMetadata]:
    """Get the URLs of the filings to download from the SEC EDGAR search API.

    Returns:
        List[FilingMetadata]: Filing metadata from SEC.
    """
    filings_to_fetch: List[FilingMetadata] = []
    start_index = 0
    client = requests.Session()
    # Mount the retry-enabled adapter for both protocols.
    client.mount("http://", HTTPAdapter(max_retries=retries))
    client.mount("https://", HTTPAdapter(max_retries=retries))
    try:
        while len(filings_to_fetch) < num_filings_to_download:
            payload = form_request_payload(
                ticker_or_cik,
                [filing_type],
                after_date,
                before_date,
                start_index,
                query,
            )
            headers = {
                "User-Agent": generate_random_user_agent(),
                "Accept-Encoding": "gzip, deflate",
                "Host": "efts.sec.gov",
            }
            resp = client.post(
                SEC_EDGAR_SEARCH_API_ENDPOINT, json=payload, headers=headers
            )
            resp.raise_for_status()
            search_query_results = resp.json()
            if "error" in search_query_results:
                try:
                    root_cause = search_query_results["error"]["root_cause"]
                    if not root_cause:  # pragma: no cover
                        raise ValueError
                    error_reason = root_cause[0]["reason"]
                    raise EdgarSearchApiError(
                        f"Edgar Search API encountered an error: {error_reason}. "
                        f"Request payload:\n{payload}"
                    )
                except (ValueError, KeyError):  # pragma: no cover
                    raise EdgarSearchApiError(
                        "Edgar Search API encountered an unknown error. "
                        f"Request payload:\n{payload}"
                    ) from None
            query_hits = search_query_results["hits"]["hits"]
            # No more results to process
            if not query_hits:
                break
            for hit in query_hits:
                hit_filing_type = hit["_source"]["file_type"]
                is_amend = hit_filing_type.endswith("/A")
                if not include_amends and is_amend:
                    continue
                # Amendments do not count toward the number of filings
                # requested, so bump the target by one for each amend kept.
                if is_amend:
                    num_filings_to_download += 1
                # Work around a bug where incorrect filings are sometimes
                # included. For example, AAPL 8-K searches include N-Q entries.
                if not is_amend and hit_filing_type != filing_type:
                    continue
                metadata = build_filing_metadata_from_hit(hit)
                filings_to_fetch.append(metadata)
                if len(filings_to_fetch) == num_filings_to_download:
                    return filings_to_fetch
            # Edgar queries 100 entries at a time, but it is best to set this
            # from the response payload in case it changes in the future.
            query_size = search_query_results["query"]["size"]
            start_index += query_size
            # Prevent rate limiting
            time.sleep(SEC_EDGAR_RATE_LIMIT_SLEEP_INTERVAL)
    finally:
        client.close()
    return filings_to_fetch
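

# A minimal usage sketch. The ticker, form type, dates, and count below are
# illustrative placeholders, not values supplied by this module.
if __name__ == "__main__":
    filings = get_filing_urls_to_download(
        filing_type="8-K",
        ticker_or_cik="AAPL",
        num_filings_to_download=5,
        after_date="2020-01-01",
        before_date="2020-12-31",
        include_amends=False,
    )
    for filing in filings:
        print(filing.accession_number, filing.filing_details_url)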