Spaces:

Xindus
/

xindus_summarizer

Sleeping

File size: 3,299 Bytes

import gradio as gr
import re
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from spaces import GPU  # Required for ZeroGPU Spaces
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

# Download NLTK stopwords if not already available
nltk.download("stopwords")
nltk.download('punkt')
stop_words = set(stopwords.words("english"))

# Model list
model_choices = {
    "DistilBART CNN (sshleifer/distilbart-cnn-12-6)": "sshleifer/distilbart-cnn-12-6",
    "T5 Small (t5-small)": "t5-small",
    "T5 Base (t5-base)": "t5-base",
    "Pegasus XSum (google/pegasus-xsum)": "google/pegasus-xsum",
    "BART CNN (facebook/bart-large-cnn)": "facebook/bart-large-cnn",
}

model_cache = {}

# Clean text: remove special characters, stop words, SKU codes, and short words
def clean_text(input_text):
    # Step 1: Remove any non-English characters (like special symbols, non-latin characters)
    cleaned_text = re.sub(r"[^A-Za-z0-9\s]", " ", input_text)
    cleaned_text = re.sub(r"\s+", " ", cleaned_text)  # Replace multiple spaces with a single space

    # Step 2: Tokenize the text and remove stopwords and words that are too short to be meaningful
    words = cleaned_text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words and len(word) > 2]

    # Step 3: Rebuild the text from the remaining words
    filtered_text = " ".join(filtered_words)

    # Step 4: Remove any product codes or sequences (e.g., ST1642, AB1234)
    filtered_text = re.sub(r"\b[A-Za-z]{2,}[0-9]{3,}\b", "", filtered_text)  # SKU/product code pattern

    # Step 5: Strip leading/trailing spaces
    filtered_text = filtered_text.strip()

    return filtered_text

# Extractive Summarization: Select sentences directly from the input text
def extractive_summary(input_text, num_sentences=2):
    sentences = sent_tokenize(input_text)  # Tokenize into sentences
    filtered_sentences = [sentence for sentence in sentences if len(sentence.split()) > 2]  # Filter out very short sentences
    return " ".join(filtered_sentences[:num_sentences])  # Return first `num_sentences` sentences

# Main function triggered by Gradio
@GPU  # 👈 Required for ZeroGPU to trigger GPU spin-up
def summarize_text(input_text, model_label, char_limit):
    if not input_text.strip():
        return "Please enter some text."

    input_text = clean_text(input_text)

    # For extractive summarization, we don't use the models that generate new tokens.
    summary = extractive_summary(input_text)

    # Truncate summary based on the character limit
    return summary[:char_limit].strip()

# Gradio UI
iface = gr.Interface(
    fn=summarize_text,
    inputs=[
        gr.Textbox(lines=6, label="Enter text to summarize"),
        gr.Dropdown(choices=list(model_choices.keys()), label="Choose summarization model", value="DistilBART CNN (sshleifer/distilbart-cnn-12-6)"),
        gr.Slider(minimum=30, maximum=200, value=65, step=1, label="Max Character Limit")
    ],
    outputs=gr.Textbox(lines=3, label="Summary (truncated to character limit)"),
    title="🔥 Fast Summarizer (Extractive Only)",
    description="Summarizes input by selecting key sentences from the input text, without generating new tokens."
)

iface.launch()