import subprocess
import sys
from time import sleep

import spacy
import trafilatura
from trafilatura.meta import reset_caches
from trafilatura.settings import DEFAULT_CONFIG
import lxml.etree
from lxml.etree import tostring

try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Model not installed yet: download it with the current interpreter, then retry.
    print("Downloading spaCy model 'en_core_web_lg' ...")
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_lg"], check=True)
    nlp = spacy.load("en_core_web_lg")

DEFAULT_CONFIG.MAX_FILE_SIZE = 50000
MIN_CHAR = 50
MAX_CHAR = 5000
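# MAX_FILE_SIZE is meant to cap the size of pages trafilatura downloads,
# MIN_CHAR drops very short lines, and MAX_CHAR limits how much of a long
# line is fed to spaCy for sentence splitting.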


def get_page(url):
    page = None
    for _ in range(3):  # retry up to three times
        try:
            page = trafilatura.fetch_url(url, config=DEFAULT_CONFIG)
            assert page is not None
            print("Fetched " + url, file=sys.stderr)
            break
        except Exception:
            sleep(3)  # brief back-off before the next attempt
    return page
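# Note: get_page returns the raw HTML string, or None if all three attempts
# fail, so callers must handle a None result.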


def url2lines(url):
    page = get_page(url)
    if page is None:
        return []
    return html2lines(page)


def line_correction(lines, max_size=100):
    out_lines = []
    for line in lines:
        if len(line) < MIN_CHAR:
            continue

        if len(line) > max_size:
            # Split the line into sentences; for performance, only the first
            # MAX_CHAR (5,000) characters of each line are processed.
            doc = nlp(line[:MAX_CHAR])
            stack = ""
            for sent in doc.sents:
                if len(stack) > 0:
                    stack += " "
                stack += str(sent).strip()
                if len(stack) > max_size:
                    out_lines.append(stack)
                    stack = ""

            # Ensure every line in out_lines satisfies the MIN_CHAR restriction.
            if len(stack) > MIN_CHAR:
                out_lines.append(stack)
        else:
            out_lines.append(line)

    return out_lines
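# Illustrative (hypothetical) example: a 300-character paragraph is
# sentence-split and re-packed into chunks of roughly max_size characters,
# while any line shorter than MIN_CHAR is dropped entirely.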


def html2lines(page):
    out_lines = []
    if page is None or len(page.strip()) == 0:
        return out_lines

    text = trafilatura.extract(page, config=DEFAULT_CONFIG)
    reset_caches()

    if text is None:
        return out_lines

    # Return the whole extracted page split into lines; line_correction
    # reformats them later.
    return text.split("\n")


def html2metadata(url):
    page = get_page(url)
    if page is None:  # fetch failed after retries
        return {}
    metadata = trafilatura.extract_metadata(page)
    return metadata.as_dict() if metadata is not None else {}
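# The returned dict typically contains fields such as 'title', 'author',
# 'date' and 'url', depending on what trafilatura can recover from the page.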


if __name__ == "__main__":
    url = "https://www.bbc.co.uk/news/61407508"
    metadata = html2metadata(url)
    text = " ".join(url2lines(url))
    print(metadata)
    print(text)