Spaces:
Runtime error
Runtime error
| import requests | |
| from lxml import etree | |
| from tqdm import tqdm | |
| import pandas as pd | |
| from finnlp.data_sources.news._base import News_Downloader | |
class Eastmoney_Streaming(News_Downloader):
    """Collect post listings for one stock from guba.eastmoney.com.

    Rows scraped from each list page are appended to ``self.dataframe``
    using the column names in ``_COLUMNS``.
    """

    # Column order matches the order in which fields are read from each row.
    _COLUMNS = [
        "read amount",
        "comments",
        "title",
        "content link",
        "author",
        "create time",
    ]
    # Unbounded downloads stop once more than this many requests have failed.
    _MAX_ERRORS = 10

    def __init__(self, args=None):
        """Initialise the downloader with an empty result frame.

        Args:
            args: Options forwarded unchanged to ``News_Downloader``; a
                fresh empty dict is used when omitted.
        """
        # A literal ``{}`` default would be one dict shared by every instance.
        super().__init__({} if args is None else args)
        self.dataframe = pd.DataFrame()

    def download_streaming_stock(self, stock="600519", rounds=3):
        """Download list pages for ``stock`` into ``self.dataframe``.

        Args:
            stock: Stock code inserted into the list-page URL.
            rounds: When positive, pages ``0 .. rounds - 1`` are requested.
                Otherwise pages are requested from 1 upwards until a page
                yields no rows or more than ``_MAX_ERRORS`` requests fail.

        Returns:
            None. Results accumulate in ``self.dataframe``, whose index is
            reset to a plain 0..n-1 range at the end.
        """
        print("Geting pages: ", end="")
        pages_gathered = 0
        if rounds > 0:
            # NOTE(review): bounded mode starts at page 0 while unbounded
            # mode starts at page 1; kept as found - confirm which index
            # the site treats as the first page.
            for page_no in range(rounds):
                status = self._gather_pages(stock, page_no)
                if status == "break":
                    break
                if status != "Error":
                    pages_gathered += 1
        else:
            page_no = 1
            error_count = 0
            while True:
                status = self._gather_pages(stock, page_no)
                if status == "break":
                    break
                if status == "Error":
                    error_count += 1
                    if error_count > self._MAX_ERRORS:
                        print("Connection Error")
                        # Without this the loop would never terminate once
                        # the site stops answering.
                        break
                else:
                    pages_gathered += 1
                page_no += 1
        # Report pages that actually contributed rows, not the loop index.
        print(f"Get total {pages_gathered} pages.")
        self.dataframe = self.dataframe.reset_index(drop=True)

    @staticmethod
    def _first(nodes):
        """Return the first XPath match, or ``None`` when there is none."""
        return nodes[0] if nodes else None

    def _gather_pages(self, stock, page):
        """Fetch one list page and append its rows to ``self.dataframe``.

        Args:
            stock: Stock code inserted into the list-page URL.
            page: Page number inserted into the list-page URL.

        Returns:
            ``"Error"`` when no HTTP 200 response was obtained, ``"break"``
            when the page contained no rows, otherwise ``None``.
        """
        print(page, end=" ")
        url = f"https://guba.eastmoney.com/list,{stock},1,f_{page}.html"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        }

        response = self._request_get(url, headers=headers)
        # ``_request_get`` is defined outside this class; a missing response
        # is treated like a non-200 one rather than raising AttributeError.
        if response is None or response.status_code != 200:
            return "Error"

        # Gather the content of the page.
        tree = etree.HTML(response.text)
        if tree is None:
            # Nothing parseable came back, so there are no rows to read.
            return "break"
        trs = tree.xpath('//*[@id="mainlist"]/div/ul/li[1]/table/tbody/tr')

        # A row with an empty cell yields None for that field instead of
        # aborting the whole download with IndexError.
        rows = [
            [
                self._first(tr.xpath("./td[1]//text()")),
                self._first(tr.xpath("./td[2]//text()")),
                self._first(tr.xpath("./td[3]/div/a//text()")),
                self._first(tr.xpath("./td[3]/div/a/@href")),
                self._first(tr.xpath("./td[4]//text()")),
                self._first(tr.xpath("./td[5]//text()")),
            ]
            for tr in trs
        ]
        if not rows:
            return "break"

        # One concat per page instead of one per row.
        page_frame = pd.DataFrame(rows, columns=self._COLUMNS)
        self.dataframe = pd.concat([self.dataframe, page_frame])
        return None