Spaces:
Sleeping
Sleeping
| import unittest | |
| from bs4 import BeautifulSoup | |
| from mediaunmasked.scrapers.article_scraper import ArticleScraper | |
class TestArticleScraper(unittest.TestCase):
    """Formatting tests for ``ArticleScraper``.

    Each test parses an inline HTML fixture with BeautifulSoup, passes it to
    one of the scraper's private methods, and compares the result with the
    expected text after whitespace normalization.
    """

    # Print the complete diff when a multi-line comparison fails instead of
    # the truncated default.
    maxDiff = None

    def setUp(self):
        """Create a fresh scraper for every test."""
        self.scraper = ArticleScraper()

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Return *text* with every line stripped and empty lines removed."""
        stripped_lines = (line.strip() for line in text.split('\n'))
        return '\n'.join(line for line in stripped_lines if line)

    def _assert_same_content(self, actual: str, expected: str) -> None:
        """Assert that *actual* equals *expected* once both are normalized.

        Normalization keeps the comparison independent of the indentation
        and blank lines used in the fixtures below.
        """
        self.assertEqual(
            self._normalize_whitespace(actual),
            self._normalize_whitespace(expected),
        )

    def test_process_element_formatting(self):
        """Test that _process_element preserves various HTML formatting."""
        # Complex nested HTML with multiple formatting elements.
        html = """
        <div>
            <h1>Main Title</h1>
            <p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
            <p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
            <ul>
                <li>First <strong>important</strong> item</li>
                <li>Second item with <em>emphasis</em></li>
            </ul>
            <ol>
                <li>Numbered item <a href="test.com">with link</a></li>
                <li>Another numbered item</li>
            </ol>
            <div>
                Nested <br/>content with<br />line breaks
            </div>
        </div>
        """
        soup = BeautifulSoup(html, 'html.parser')
        formatted_content = self.scraper._process_element(soup.div)

        expected_output = """
        ## Main Title
        This is a **bold** and _italic_ text.
        This is a [link](https://example.com) in a paragraph.
        • First **important** item
        • Second item with _emphasis_
        1. Numbered item [with link](test.com)
        2. Another numbered item
        Nested
        content with
        line breaks""".strip()

        self._assert_same_content(formatted_content, expected_output)

    def test_extract_snopes_article(self):
        """Test extraction of a Snopes-style article with formatting."""
        html = """
        <html>
            <body>
                <header>
                    <h1>Fact Check: Test Claim</h1>
                </header>
                <article>
                    <h2>The Claim</h2>
                    <p>This is the <strong>main claim</strong> being tested.</p>
                    <h2>The Facts</h2>
                    <ul>
                        <li>First important fact with <em>emphasis</em></li>
                        <li>Second fact with a <a href="source.com">source</a></li>
                    </ul>
                    <p>Additional <strong>important</strong> context.</p>
                </article>
            </body>
        </html>
        """
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'snopes.com')

        expected_content = """
        ## The Claim
        This is the **main claim** being tested.
        ## The Facts
        • First important fact with _emphasis_
        • Second fact with a [source](source.com)
        Additional **important** context.""".strip()

        self.assertEqual(result['headline'], 'Fact Check: Test Claim')
        self._assert_same_content(result['content'], expected_content)

    def test_extract_politifact_article(self):
        """Test extraction of a PolitiFact-style article with formatting."""
        html = """
        <html>
            <body>
                <h1 class="article__title">Test Political Claim</h1>
                <article class="article">
                    <div class="article__text">
                        <p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
                        <h3>Our Analysis</h3>
                        <ul>
                            <li>Evidence point 1</li>
                            <li>Evidence point 2 with <a href="proof.com">proof</a></li>
                        </ul>
                        <p>Final assessment with <strong>key points</strong>.</p>
                    </div>
                </article>
            </body>
        </html>
        """
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'politifact.com')

        expected_content = """
        Here's a claim with **bold text** and _italics_.
        ### Our Analysis
        • Evidence point 1
        • Evidence point 2 with [proof](proof.com)
        Final assessment with **key points**.""".strip()

        self.assertEqual(result['headline'], 'Test Political Claim')
        self._assert_same_content(result['content'], expected_content)

    def test_extract_generic_article(self):
        """Test extraction of a generic article with formatting."""
        html = """
        <html>
            <body>
                <h1>Generic Article Title</h1>
                <main>
                    <p>Opening paragraph with <strong>bold</strong> text.</p>
                    <div class="content">
                        <h2>Section Title</h2>
                        <p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
                        <ul>
                            <li>Point <strong>one</strong></li>
                            <li>Point <em>two</em></li>
                        </ul>
                    </div>
                </main>
            </body>
        </html>
        """
        soup = BeautifulSoup(html, 'html.parser')
        result = self.scraper._extract_article(soup, 'generic.com')

        expected_content = """
        Opening paragraph with **bold** text.
        ## Section Title
        Content with _italic_ text and [reference](ref.com).
        • Point **one**
        • Point _two_""".strip()

        self.assertEqual(result['headline'], 'Generic Article Title')
        self._assert_same_content(result['content'], expected_content)
# Run the tests in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()