import gradio as gr
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from spaces import GPU  # Required for ZeroGPU Spaces
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import nltk

# Download the NLTK resources used below if not already available
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)  # sent_tokenize needs this on NLTK >= 3.9

stop_words = set(stopwords.words("english"))

# Models offered in the dropdown (not used by the extractive-only path below)
model_choices = {
    "DistilBART CNN (sshleifer/distilbart-cnn-12-6)": "sshleifer/distilbart-cnn-12-6",
    "T5 Small (t5-small)": "t5-small",
    "T5 Base (t5-base)": "t5-base",
    "Pegasus XSum (google/pegasus-xsum)": "google/pegasus-xsum",
    "BART CNN (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
}

model_cache = {}
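
# NOTE: model_cache is never populated in this extractive-only app. If the
# abstractive path were wired up, a lazy loader along these lines could fill it
# (a minimal sketch using the standard transformers API; not called anywhere below):
def load_model(model_name):
    """Load and cache a tokenizer/model pair on first request."""
    if model_name not in model_cache:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        model_cache[model_name] = (tokenizer, model)
    return model_cache[model_name]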

# Clean text: remove special characters, stop words, SKU codes, and short words
def clean_text(input_text):
    # Step 1: Remove special symbols and non-Latin characters
    cleaned_text = re.sub(r"[^A-Za-z0-9\s]", " ", input_text)
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Collapse runs of whitespace

    # Step 2: Drop stopwords and words too short to be meaningful
    words = cleaned_text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words and len(word) > 2]

    # Step 3: Rebuild the text from the remaining words
    filtered_text = " ".join(filtered_words)

    # Step 4: Remove product/SKU codes (e.g., ST1642, AB1234)
    filtered_text = re.sub(r"\b[A-Za-z]{2,}[0-9]{3,}\b", "", filtered_text)
    filtered_text = re.sub(r"\s+", " ", filtered_text)  # Collapse gaps left by removed codes

    # Step 5: Strip leading/trailing spaces
    return filtered_text.strip()
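
# Example (illustrative, assuming NLTK's English stopword list):
#   clean_text("Limited offer: T-Shirt ST1642, 100% cotton!")
#   -> "Limited offer Shirt 100 cotton"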

# Extractive summarization: select sentences directly from the input text
def extractive_summary(input_text, num_sentences=2):
    sentences = sent_tokenize(input_text)  # Split into sentences
    # Keep only sentences long enough to be meaningful (more than two words)
    filtered_sentences = [sentence for sentence in sentences if len(sentence.split()) > 2]
    return " ".join(filtered_sentences[:num_sentences])  # First `num_sentences` sentences
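
# Example (illustrative):
#   extractive_summary("First sentence here. Second one follows. Third is dropped.")
#   -> "First sentence here. Second one follows."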

# Main function triggered by Gradio
# For extractive summarization, the selected model is never loaded, so
# `model_label` is accepted but unused here.
@GPU  # 👈 Required for ZeroGPU to trigger GPU spin-up
def summarize_text(input_text, model_label, char_limit):
    if not input_text.strip():
        return "Please enter some text."

    # Select sentences *before* cleaning: clean_text() strips the punctuation
    # that sent_tokenize relies on to find sentence boundaries.
    summary = extractive_summary(input_text)
    summary = clean_text(summary)

    # Truncate the summary to the character limit
    return summary[:char_limit].strip()

# Gradio UI
iface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(lines=6, label="Enter text to summarize"),
        gr.Dropdown(
            choices=list(model_choices.keys()),
            label="Choose summarization model",
            value="DistilBART CNN (sshleifer/distilbart-cnn-12-6)",
        ),
        gr.Slider(minimum=30, maximum=200, value=65, step=1, label="Max Character Limit"),
    ],
    outputs=gr.Textbox(lines=3, label="Summary (truncated to character limit)"),
    title="🔥 Fast Summarizer (Extractive Only)",
    description="Summarizes input by selecting key sentences from the input text, without generating new tokens.",
)

iface.launch()