Spaces:

sigridveronica
/

ai-news-analyzer

Runtime error

ai-news-analyzer / external /FinNLP /finnlp /data_sources /news /investorplace_streaming.py

Sigrid De los Santos

Remove remaining binary file for Hugging Face

9df4cc0 4 months ago

2.49 kB

	import warnings
	warnings.filterwarnings("ignore")
	import requests
	from lxml import etree
	from tqdm import tqdm
	import pandas as pd
	import json
	import time
	from finnlp.data_sources.news._base import News_Downloader

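	# TODO:
	# 1. Contents -- presumably: also fetch each article's full text; so far only
	#    the search-result metadata (title, time, author, summary) is collected.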

	class InvestorPlace_Streaming(News_Downloader):
	    """Collect InvestorPlace search-result teasers into ``self.dataframe``.

	    Each call to :meth:`download_streaming_search` appends one row per
	    result block found on the fetched pages, with the columns ``title``,
	    ``time``, ``author`` and ``summary``.
	    """

	    def __init__(self, args=None):
	        """Set up the downloader with an empty result frame.

	        Args:
	            args: Options forwarded to ``News_Downloader.__init__``. When
	                omitted, a fresh empty dict is forwarded so that no dict
	                object is shared between instances.
	        """
	        super().__init__({} if args is None else args)
	        self.dataframe = pd.DataFrame()

	    @staticmethod
	    def _first_text(node, path, strip_layout=True):
	        """Return the first text hit of XPath ``path`` under ``node``, or ''.

	        Args:
	            node: Parsed element exposing ``.xpath``.
	            path: XPath expression selecting text nodes.
	            strip_layout: When true, remove newline and tab characters
	                from the returned text.
	        """
	        hits = node.xpath(path)
	        if not hits:
	            # A teaser lacking this field yields an empty cell, not an error.
	            return ''
	        text = hits[0]
	        if strip_layout:
	            text = text.replace('\n', '').replace('\t', '')
	        return text

	    def download_streaming_search(self, keyword = "apple", rounds = 3, delay = 0.5, timeout = 30):
	        """Fetch search-result pages for ``keyword`` and append them to ``self.dataframe``.

	        Pages ``0 .. rounds - 1`` are requested in order (sent as the ``pg``
	        query parameter). The loop stops early at the first response whose
	        status code is not 200. Progress is printed to stdout.

	        Args:
	            keyword: Search term sent as the ``q`` query parameter.
	            rounds: Maximum number of result pages to request.
	            delay: Seconds to sleep after each processed page.
	            timeout: Timeout in seconds handed to ``requests.get`` so that
	                a stalled connection cannot block the loop indefinitely.

	        Returns:
	            None. Results accumulate in ``self.dataframe``.
	        """
	        url = 'https://investorplace.com/search/'
	        headers = {
	            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
	        }
	        columns = ['title', 'time', 'author', 'summary']

	        print("Downloading ...", end = ' ')
	        for page in range(rounds):
	            params = {
	                'q': keyword,
	                "pg": page,
	            }
	            res = requests.get(url = url, params=params, headers=headers, timeout=timeout)
	            if res.status_code != 200:
	                break

	            tree = etree.HTML(res.text)
	            groups = tree.xpath("/html/body/main/section/div/div/div/div[2]/div[1]/div[1]/div")
	            # Every direct <div> child of a group is treated as one result block.
	            cards = [card for group in groups for card in group.xpath("./div")]

	            # Build one complete row per block so the four columns always
	            # have the same length; nothing is appended outside this
	            # per-block step, so no row can end up with only a title.
	            rows = [
	                (
	                    # The title is kept verbatim (no newline/tab stripping).
	                    self._first_text(card, './h2/a//text()', strip_layout=False),
	                    self._first_text(card, 'div/time//text()'),
	                    self._first_text(card, 'div/span/a/text()'),
	                    self._first_text(card, 'p/text()'),
	                )
	                for card in cards
	            ]

	            # A page without result blocks yields an empty four-column frame.
	            tmp = pd.DataFrame(rows, columns=columns)
	            self.dataframe = pd.concat([self.dataframe, tmp])

	            print(page, end = ' ')

	            time.sleep(delay)