Removed print statements and comments
scrape_sources.py (+6 -9)
@@ -26,13 +26,12 @@ class NPRLite(Source):
     # and identified entities for each article.
     # Chosen articles will have their data stored in a Summary object.
     def retrieve_cluster_data(self, limit=None) -> List[namedtuple]:
-        print("retrieving NPR article stub")
+        #print("retrieving NPR article stub")
         """Creates article stubs for articles listed on text.npr.org"""
         # Scrape NPR for headlines and links
         soup = Soup(get(self.source_url))
         # extract each headline
         npr_hed = [i.text for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
-        #npr_hed = [i for i in npr_hed if 'Opinion:' not in i]
         # links scraped are just the extension to the site's base link.
         npr_links = [i.attrs['href'] for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
         # limit amount of data being returned for clustering
@@ -42,13 +41,13 @@ class NPRLite(Source):
         # Create stubs with heds and links
         # Test: do the headlines and links zipped together line up correctly?
         article_tuples = [stub(i[0], i[1], [], self) for i in zip(npr_links, npr_hed)]
-        print(f"Number of npr articles: {len(npr_hed)}")
+        #print(f"Number of npr articles: {len(npr_hed)}")
         return article_tuples, len(npr_hed)
 
     # Returns None if article is only 1 line.
     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
         """Retrieves article data from text.npr.org: subhead if exists, date, author(s), and whole text"""
-        st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
+        #st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
         container = Soup(get(self.source_url[:-5] + indata.link))
         text_container = container.find('div', {'class': "paragraphs-container"}).find('p')
         if isinstance(text_container, Soup):
@@ -82,7 +81,6 @@ class CNNText(Source):
     # Chosen articles will have their data stored in a Summary object.
     def retrieve_cluster_data(self, limit=None) -> List[namedtuple]:
         """Creates a stub for each article listed on lite.cnn.com"""
-        print("retrieving CNN article stub")
         soup = Soup(get(self.source_url))
         # Scrape CNN for headlines and links
         cnn_heds = [i.text for i in soup.find('div', {'class': 'afe4286c'}).find('a')]
@@ -91,16 +89,15 @@ class CNNText(Source):
         if limit is not None:
             cnn_heds = cnn_heds[:limit]
             cnn_links = cnn_links[:limit]
-        #cnn = [i for i in cnn_heds if 'Analysis:' not in i and 'Opinion:' not in i]
         # Take this next line out of this function and place it where this data is used.
         article_tuples = [stub(i[0], i[1], [], self) for i in zip(cnn_links, cnn_heds) if 'Opinion' not in i[1] and 'Analysis' not in i[1]]
-
+
         return article_tuples, len(cnn_heds)
 
     # Returns None if article is only 1 line.
     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
         """Retrieves article data from lite.cnn.com: subhead if exists, date, author(s), and whole text"""
-        print(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
+        #print(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
         st.write(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
         repeat = 0
         good = False
@@ -114,7 +111,7 @@ class CNNText(Source):
             repeat += 1
         if good:
             story_container = container.find('div', {'class': 'afe4286c'})
-            print(story_container)
+            #print(story_container)
             author = story_container.find('p',{'id':'byline'}).text
             story_date = story_container.find('p',{'id':'published datetime'}).text[9:]
             #if isinstance(story_container, Soup):
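A note on the API in play: Soup(get(...)), find(tag, {'class': ...}), and .attrs['href'] match the gazpacho scraping library (an assumption; the import block sits outside these hunks). Below is a minimal, self-contained sketch of the headline-scraping pattern used in retrieve_cluster_data; the selector names are copied from the diff and may drift as the live site changes.

# Minimal sketch of the headline scrape in retrieve_cluster_data.
# Assumptions: gazpacho supplies Soup/get (pip install gazpacho); the
# 'topic-container' selector comes from the diff and may change.
from gazpacho import Soup, get

def npr_stub_pairs(source_url="https://text.npr.org", limit=None):
    soup = Soup(get(source_url))
    container = soup.find('div', {'class': 'topic-container'}, mode="first")
    anchors = container.find('a', mode="all")  # mode="all" always yields a list
    pairs = [(a.attrs['href'], a.text) for a in anchors]  # (relative link, headline)
    return pairs if limit is None else pairs[:limit]

if __name__ == "__main__":
    for link, hed in npr_stub_pairs(limit=5):
        print(link, '|', hed)

gazpacho's default "auto" mode returns a bare Soup when exactly one node matches, so the list comprehensions in the diff can misbehave on a single-headline page; mode="all" sidesteps that.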
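The repeat and good flags in CNNText.retrieve_article point to a fetch-until-parse retry guard whose loop falls outside these hunks. The following is a guess at its shape, not the project's actual code; the retry cap and function name are assumptions.

# Hypothetical reconstruction of the retry guard around the CNN fetch.
# Only the repeat/good flags and the 'afe4286c' selector appear in the
# diff; the loop bound and structure here are assumptions.
from gazpacho import Soup, get

def fetch_story_container(url, max_repeats=3):
    repeat, good = 0, False
    container = None
    while not good and repeat < max_repeats:
        container = Soup(get(url))  # re-fetch until the expected div parses
        good = container.find('div', {'class': 'afe4286c'}) is not None
        repeat += 1
    return container if good else None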
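Finally, since the commit's intent is to silence debug output, a logging-based setup may age better than commented-out print calls: output can be toggled per environment instead of re-edited. A sketch using only the standard library; the logger name is illustrative.

# Sketch: route the removed debug prints through stdlib logging so
# output can be switched off without editing the scraper.
import logging

logger = logging.getLogger("scrape_sources")  # illustrative name

def report_cluster(npr_hed):
    # stands in for the commented-out print()/st.write() calls
    logger.debug("retrieving NPR article stub")
    logger.debug("Number of npr articles: %d", len(npr_hed))

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)  # raise to WARNING to mute
    report_cluster(["hed one", "hed two"])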