Spaces:

wozwize
/

media-unmasked-api

Sleeping

App Files Files Community

media-unmasked-api / mediaunmasked /tests /test_article_scraper.py

wozwize

updating response structure to maintain formatting of article

a9d5552 9 months ago

raw

history blame contribute delete

6.52 kB

	import unittest
	from bs4 import BeautifulSoup
	from mediaunmasked.scrapers.article_scraper import ArticleScraper

	class TestArticleScraper(unittest.TestCase):
	    """Formatting tests for ``ArticleScraper``'s HTML-to-text extraction.

	    Each test feeds a small HTML fixture through the scraper and compares
	    the result with the expected markdown-like text. Comparisons ignore
	    blank lines and leading/trailing whitespace on every line, so only the
	    textual content and the order of the lines are checked.
	    """

	    # Show the complete diff when a multi-line comparison fails instead of
	    # unittest's truncated default.
	    maxDiff = None

	    def setUp(self):
	        """Create a fresh scraper instance for every test."""
	        self.scraper = ArticleScraper()

	    @staticmethod
	    def _normalize_whitespace(text):
	        """Return *text* with blank lines removed and each line stripped."""
	        return '\n'.join(line.strip() for line in text.split('\n') if line.strip())

	    def _assert_same_text(self, actual, expected):
	        """Assert that two texts match once their whitespace is normalized."""
	        self.assertEqual(
	            self._normalize_whitespace(actual),
	            self._normalize_whitespace(expected),
	        )

	    def test_process_element_formatting(self):
	        """Test that _process_element preserves various HTML formatting."""
	        # Complex nested HTML mixing headings, inline emphasis, links,
	        # both list types and explicit line breaks.
	        html = """
	<div>
	<h1>Main Title</h1>
	<p>This is a <strong>bold</strong> and <em>italic</em> text.</p>
	<p>This is a <a href="https://example.com">link</a> in a paragraph.</p>
	<ul>
	<li>First <strong>important</strong> item</li>
	<li>Second item with <em>emphasis</em></li>
	</ul>
	<ol>
	<li>Numbered item <a href="test.com">with link</a></li>
	<li>Another numbered item</li>
	</ol>
	<div>
	Nested <br/>content with<br />line breaks
	</div>
	</div>
	"""
	        soup = BeautifulSoup(html, 'html.parser')
	        formatted_content = self.scraper._process_element(soup.div)

	        expected_output = """
	## Main Title

	This is a bold and _italic_ text.

	This is a [link](https://example.com) in a paragraph.

	• First important item
	• Second item with _emphasis_

	1. Numbered item [with link](test.com)
	2. Another numbered item

	Nested
	content with
	line breaks"""

	        self._assert_same_text(formatted_content, expected_output)

	    def test_extract_snopes_article(self):
	        """Test extraction of a Snopes-style article with formatting."""
	        html = """
	<html>
	<body>
	<header>
	<h1>Fact Check: Test Claim</h1>
	</header>
	<article>
	<h2>The Claim</h2>
	<p>This is the <strong>main claim</strong> being tested.</p>
	<h2>The Facts</h2>
	<ul>
	<li>First important fact with <em>emphasis</em></li>
	<li>Second fact with a <a href="source.com">source</a></li>
	</ul>
	<p>Additional <strong>important</strong> context.</p>
	</article>
	</body>
	</html>
	"""
	        soup = BeautifulSoup(html, 'html.parser')
	        result = self.scraper._extract_article(soup, 'snopes.com')

	        expected_content = """
	## The Claim

	This is the main claim being tested.

	## The Facts

	• First important fact with _emphasis_
	• Second fact with a [source](source.com)

	Additional important context."""

	        self.assertEqual(result['headline'], 'Fact Check: Test Claim')
	        self._assert_same_text(result['content'], expected_content)

	    def test_extract_politifact_article(self):
	        """Test extraction of a PolitiFact-style article with formatting."""
	        html = """
	<html>
	<body>
	<h1 class="article__title">Test Political Claim</h1>
	<article class="article">
	<div class="article__text">
	<p>Here's a claim with <strong>bold text</strong> and <em>italics</em>.</p>
	<h3>Our Analysis</h3>
	<ul>
	<li>Evidence point 1</li>
	<li>Evidence point 2 with <a href="proof.com">proof</a></li>
	</ul>
	<p>Final assessment with <strong>key points</strong>.</p>
	</div>
	</article>
	</body>
	</html>
	"""
	        soup = BeautifulSoup(html, 'html.parser')
	        result = self.scraper._extract_article(soup, 'politifact.com')

	        expected_content = """
	Here's a claim with bold text and _italics_.

	### Our Analysis

	• Evidence point 1
	• Evidence point 2 with [proof](proof.com)

	Final assessment with key points."""

	        self.assertEqual(result['headline'], 'Test Political Claim')
	        self._assert_same_text(result['content'], expected_content)

	    def test_extract_generic_article(self):
	        """Test extraction of a generic article with formatting."""
	        html = """
	<html>
	<body>
	<h1>Generic Article Title</h1>
	<main>
	<p>Opening paragraph with <strong>bold</strong> text.</p>
	<div class="content">
	<h2>Section Title</h2>
	<p>Content with <em>italic</em> text and <a href="ref.com">reference</a>.</p>
	<ul>
	<li>Point <strong>one</strong></li>
	<li>Point <em>two</em></li>
	</ul>
	</div>
	</main>
	</body>
	</html>
	"""
	        soup = BeautifulSoup(html, 'html.parser')
	        result = self.scraper._extract_article(soup, 'generic.com')

	        expected_content = """
	Opening paragraph with bold text.

	## Section Title

	Content with _italic_ text and [reference](ref.com).

	• Point one
	• Point _two_"""

	        self.assertEqual(result['headline'], 'Generic Article Title')
	        self._assert_same_text(result['content'], expected_content)

	# Script entry point: hand control to unittest's command-line runner when
	# this module is executed directly rather than imported.
	if __name__ == "__main__":
	    unittest.main()