Removed print statements and comments
scrape_sources.py (+6 -9)
@@ -26,13 +26,12 @@ class NPRLite(Source):
     # and identified entities for each article.
     # Chosen articles will have their data stored in a Summary object.
     def retrieve_cluster_data(self, limit=None) -> List[namedtuple]:
-        print("retrieving NPR article stub")
+        #print("retrieving NPR article stub")
         """Creates article stubs for articles listed on text.npr.org"""
         # Scrape NPR for headlines and links
         soup = Soup(get(self.source_url))
         # extract each headline
         npr_hed = [i.text for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
-        #npr_hed = [i for i in npr_hed if 'Opinion:' not in i]
         # links scraped are just the extension to the site's base link.
         npr_links = [i.attrs['href'] for i in soup.find('div', {'class': 'topic-container'}).find('ul').find('a')]
         # limit amount of data being returned for clustering
@@ -42,13 +41,13 @@ class NPRLite(Source):
         # Create stubs with heds and links
         # Test: do the headlines and links zipped together line up correctly?
         article_tuples = [stub(i[0], i[1], [], self) for i in zip(npr_links, npr_hed)]
-        print(f"Number of npr articles: {len(npr_hed)}")
+        #print(f"Number of npr articles: {len(npr_hed)}")
         return article_tuples, len(npr_hed)
 
     # Returns None if article is only 1 line.
     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
         """Retrieves article data from text.npr.org: subhead if exists, date, author(s), and whole text"""
-        st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
+        #st.write(f"""Retrieving article from:\n\t{self.source_url[:-5] + indata.link}\n""")
         container = Soup(get(self.source_url[:-5] + indata.link))
         text_container = container.find('div', {'class': "paragraphs-container"}).find('p')
         if isinstance(text_container, Soup):
@@ -82,7 +81,6 @@ class CNNText(Source):
     # Chosen articles will have their data stored in a Summary object.
     def retrieve_cluster_data(self, limit=None) -> List[namedtuple]:
         """Creates a stub for each article listed on lite.cnn.com"""
-        print("retrieving CNN article stub")
         soup = Soup(get(self.source_url))
         # Scrape CNN for headlines and links
         cnn_heds = [i.text for i in soup.find('div', {'class': 'afe4286c'}).find('a')]
@@ -91,16 +89,15 @@ class CNNText(Source):
         if limit is not None:
             cnn_heds = cnn_heds[:limit]
             cnn_links = cnn_links[:limit]
-        #cnn = [i for i in cnn_heds if 'Analysis:' not in i and 'Opinion:' not in i]
         # Take this next line out of this function and place it where this data is used.
         article_tuples = [stub(i[0], i[1], [], self) for i in zip(cnn_links, cnn_heds) if 'Opinion' not in i[1] and 'Analysis' not in i[1]]
-
+
         return article_tuples, len(cnn_heds)
 
     # Returns None if article is only 1 line.
     def retrieve_article(self, indata: stub) -> Tuple[str, List[Tuple[str, Any]]]:
         """Retrieves article data from lite.cnn.com: subhead if exists, date, author(s), and whole text"""
-        print(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
+        #print(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
         st.write(f"""Retrieving article from:\n\t{self.source_url + indata.link}\n""")
         repeat = 0
         good = False
@@ -114,7 +111,7 @@ class CNNText(Source):
             repeat += 1
         if good:
             story_container = container.find('div', {'class': 'afe4286c'})
-            print(story_container)
+            #print(story_container)
             author = story_container.find('p',{'id':'byline'}).text
             story_date = story_container.find('p',{'id':'published datetime'}).text[9:]
             #if isinstance(story_container, Soup):
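A note on the API in play: Soup(get(...)), find(tag, {'class': ...}), and .attrs['href'] match the gazpacho scraping library (an assumption; the import block sits outside these hunks). Below is a minimal, self-contained sketch of the headline-scraping pattern used in retrieve_cluster_data; the selector names are copied from the diff and may drift as the live site changes.

# Minimal sketch of the headline scrape in retrieve_cluster_data.
# Assumptions: gazpacho supplies Soup/get (pip install gazpacho); the
# 'topic-container' selector comes from the diff and may change.
from gazpacho import Soup, get

def npr_stub_pairs(source_url="https://text.npr.org", limit=None):
    soup = Soup(get(source_url))
    container = soup.find('div', {'class': 'topic-container'}, mode="first")
    anchors = container.find('a', mode="all")  # mode="all" always yields a list
    pairs = [(a.attrs['href'], a.text) for a in anchors]  # (relative link, headline)
    return pairs if limit is None else pairs[:limit]

if __name__ == "__main__":
    for link, hed in npr_stub_pairs(limit=5):
        print(link, '|', hed)

gazpacho's default "auto" mode returns a bare Soup when exactly one node matches, so the list comprehensions in the diff can misbehave on a single-headline page; mode="all" sidesteps that.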
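The repeat and good flags in CNNText.retrieve_article point to a fetch-until-parse retry guard whose loop falls outside these hunks. The following is a guess at its shape, not the project's actual code; the retry cap and function name are assumptions.

# Hypothetical reconstruction of the retry guard around the CNN fetch.
# Only the repeat/good flags and the 'afe4286c' selector appear in the
# diff; the loop bound and structure here are assumptions.
from gazpacho import Soup, get

def fetch_story_container(url, max_repeats=3):
    repeat, good = 0, False
    container = None
    while not good and repeat < max_repeats:
        container = Soup(get(url))  # re-fetch until the expected div parses
        good = container.find('div', {'class': 'afe4286c'}) is not None
        repeat += 1
    return container if good else None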
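Finally, since the commit's intent is to silence debug output, a logging-based setup may age better than commented-out print calls: output can be toggled per environment instead of re-edited. A sketch using only the standard library; the logger name is illustrative.

# Sketch: route the removed debug prints through stdlib logging so
# output can be switched off without editing the scraper.
import logging

logger = logging.getLogger("scrape_sources")  # illustrative name

def report_cluster(npr_hed):
    # stands in for the commented-out print()/st.write() calls
    logger.debug("retrieving NPR article stub")
    logger.debug("Number of npr articles: %d", len(npr_hed))

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)  # raise to WARNING to mute
    report_cluster(["hed one", "hed two"])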