import os

# Cache locations must be set before nltk and transformers are imported,
# since both libraries resolve their data/cache paths at import time.
nltk_data_dir = "/tmp/nltk_data"
hf_cache_dir = "/tmp/huggingface"
os.makedirs(nltk_data_dir, exist_ok=True)
os.makedirs(hf_cache_dir, exist_ok=True)
os.environ["NLTK_DATA"] = nltk_data_dir
os.environ["HF_HOME"] = hf_cache_dir

import string
from collections import Counter

import requests
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)
# 1. Fetch news articles via NewsAPI
def get_news(query, api_key, num_articles=5):
    # Pass the query through `params` so requests URL-encodes it correctly.
    url = 'https://newsapi.org/v2/everything'
    params = {'q': query, 'apiKey': api_key, 'language': 'en', 'pageSize': num_articles}
    response = requests.get(url, params=params, timeout=10)
    if response.status_code == 200:
        return response.json().get('articles', [])
    return []
# 2. Sentiment analysis with Hugging Face
tone_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", revision="714eb0f")

def analyze_sentiment(text):
    # truncation=True keeps long inputs within the model's 512-token limit
    return tone_analyzer(text, truncation=True)[0]
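# For reference, the sentiment pipeline returns one dict per input, e.g.
# analyze_sentiment("Markets rally on strong earnings") yields something like
# {'label': 'POSITIVE', 'score': 0.99}; only 'label' is used downstream.
# (The exact score is illustrative, not a guaranteed value.)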
# 3. Topic classification
category_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/tweet-topic-21-multi")
category_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/tweet-topic-21-multi")
# Read the label names from the model config so they always match the output
# head, instead of maintaining a hard-coded list that can drift out of sync
# with the checkpoint.
labels = [category_model.config.id2label[i] for i in range(category_model.config.num_labels)]

def classify_category(text):
    inputs = category_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = category_model(**inputs)
    # The model is multi-label; argmax keeps the single highest-scoring topic.
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return labels[predicted_class]
# 4. Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def split_text(text, max_tokens=512):
    # Rough whitespace split; 512 words per chunk stays safely under
    # BART's 1024-token input limit.
    words = text.split()
    return [' '.join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]

def summarize_text(text):
    chunks = split_text(text)
    summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False, truncation=True)[0]['summary_text']
                 for chunk in chunks]
    return ' '.join(summaries)
# 5. Find trending words
def extract_trending_words(texts):
    text = ' '.join(texts).lower()
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))  # build the set once, not per word
    words = [w for w in words if w not in stop_words and w not in string.punctuation and len(w) > 1]
    return Counter(words).most_common(10)
# 6. Main news-analysis pipeline
def analyze_news(query, api_key, num_articles=5):
    articles = get_news(query, api_key, num_articles)
    if not articles:
        return []
    news_results = []
    for article in articles:
        title = article.get('title') or 'No Title'
        description = article.get('description') or ''
        url = article.get('url', '#')
        text = f"{title} {description}"
        news_results.append({
            "title": title,
            "url": url,
            "sentiment": analyze_sentiment(text)['label'],
            "category": classify_category(text),
            "summary": summarize_text(text),
        })
    return news_results
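# Minimal usage sketch. Assumes a valid NewsAPI key is available; the
# NEWSAPI_KEY environment variable, the "YOUR_API_KEY" placeholder, and the
# query string are illustrative, not part of the app above. It also wires in
# extract_trending_words, which the script defines but never calls.
if __name__ == "__main__":
    api_key = os.environ.get("NEWSAPI_KEY", "YOUR_API_KEY")
    results = analyze_news("artificial intelligence", api_key, num_articles=5)
    for item in results:
        print(f"[{item['sentiment']}/{item['category']}] {item['title']}")
        print(f"  {item['summary']}")
    print("Trending words:", extract_trending_words([r['summary'] for r in results]))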