import os

# Cache locations must be set before nltk and transformers are imported,
# since both libraries resolve their data/cache paths at import time.
nltk_data_dir = "/tmp/nltk_data"
hf_cache_dir = "/tmp/huggingface"
os.makedirs(nltk_data_dir, exist_ok=True)
os.makedirs(hf_cache_dir, exist_ok=True)
os.environ["NLTK_DATA"] = nltk_data_dir
os.environ["HF_HOME"] = hf_cache_dir

import string
from collections import Counter

import requests
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)
# 1. Fetch news articles via NewsAPI
def get_news(query, api_key, num_articles=5):
    # Pass the query through `params` so requests URL-encodes it correctly.
    url = 'https://newsapi.org/v2/everything'
    params = {'q': query, 'apiKey': api_key, 'language': 'en', 'pageSize': num_articles}
    response = requests.get(url, params=params, timeout=10)
    if response.status_code == 200:
        return response.json().get('articles', [])
    return []
# 2. Sentiment analysis with Hugging Face
tone_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", revision="714eb0f")

def analyze_sentiment(text):
    # truncation=True keeps long inputs within the model's 512-token limit
    return tone_analyzer(text, truncation=True)[0]
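# For reference, the sentiment pipeline returns one dict per input, e.g.
# analyze_sentiment("Markets rally on strong earnings") yields something like
# {'label': 'POSITIVE', 'score': 0.99}; only 'label' is used downstream.
# (The exact score is illustrative, not a guaranteed value.)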
# 3. Topic classification
category_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/tweet-topic-21-multi")
category_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/tweet-topic-21-multi")
# Read the label names from the model config so they always match the output
# head, instead of maintaining a hard-coded list that can drift out of sync
# with the checkpoint.
labels = [category_model.config.id2label[i] for i in range(category_model.config.num_labels)]

def classify_category(text):
    inputs = category_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = category_model(**inputs)
    # The model is multi-label; argmax keeps the single highest-scoring topic.
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return labels[predicted_class]
# 4. Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def split_text(text, max_tokens=512):
    # Rough whitespace split; 512 words per chunk stays safely under
    # BART's 1024-token input limit.
    words = text.split()
    return [' '.join(words[i:i + max_tokens]) for i in range(0, len(words), max_tokens)]

def summarize_text(text):
    chunks = split_text(text)
    summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False, truncation=True)[0]['summary_text']
                 for chunk in chunks]
    return ' '.join(summaries)
# 5. Find trending words
def extract_trending_words(texts):
    text = ' '.join(texts).lower()
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))  # build the set once, not per word
    words = [w for w in words if w not in stop_words and w not in string.punctuation and len(w) > 1]
    return Counter(words).most_common(10)
# 6. Main news-analysis pipeline
def analyze_news(query, api_key, num_articles=5):
    articles = get_news(query, api_key, num_articles)
    if not articles:
        return []
    news_results = []
    for article in articles:
        title = article.get('title') or 'No Title'
        description = article.get('description') or ''
        url = article.get('url', '#')
        text = f"{title} {description}"
        news_results.append({
            "title": title,
            "url": url,
            "sentiment": analyze_sentiment(text)['label'],
            "category": classify_category(text),
            "summary": summarize_text(text),
        })
    return news_results
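# Minimal usage sketch. Assumes a valid NewsAPI key is available; the
# NEWSAPI_KEY environment variable, the "YOUR_API_KEY" placeholder, and the
# query string are illustrative, not part of the app above. It also wires in
# extract_trending_words, which the script defines but never calls.
if __name__ == "__main__":
    api_key = os.environ.get("NEWSAPI_KEY", "YOUR_API_KEY")
    results = analyze_news("artificial intelligence", api_key, num_articles=5)
    for item in results:
        print(f"[{item['sentiment']}/{item['category']}] {item['title']}")
        print(f"  {item['summary']}")
    print("Trending words:", extract_trending_words([r['summary'] for r in results]))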