sayanAIAI committed
Commit bb6c458 · verified · 1 Parent(s): 743f7ef

Update main.py

Files changed (1)
    main.py +170 -125
main.py CHANGED
@@ -6,47 +6,44 @@ import json
 import re
 import logging
 from collections import Counter

 from flask import Flask, request, jsonify, render_template
 import torch
-from transformers import (
-    AutoTokenizer,
-    AutoModelForSeq2SeqLM,
-    pipeline
-)

 # -------------------------
-# Basic app + logging
 # -------------------------
 app = Flask(__name__)
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("summarizer")

 # -------------------------
-# Device selection (GPU if available)
 # -------------------------
 USE_GPU = torch.cuda.is_available()
 DEVICE = 0 if USE_GPU else -1
 logger.info("CUDA available: %s. Using device: %s", USE_GPU, DEVICE)

 # -------------------------
-# Models (quality-first)
 # -------------------------
-# Primary summarizer (higher-quality model)
-SUMMARIZER_MODEL = "facebook/bart-large-cnn"  # quality-focused
-summ_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL)
-summ_model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_MODEL)
-summarizer = pipeline("summarization", model=summ_model, tokenizer=summ_tokenizer, device=DEVICE)
-
-# Parameter-generator (small instruction model to "think" and choose settings)
-# We keep this compact but capable. If you later want stronger reasoning, swap to flan-t5-base.
-PARAM_MODEL = "google/flan-t5-small"
 param_tokenizer = AutoTokenizer.from_pretrained(PARAM_MODEL)
 param_model = AutoModelForSeq2SeqLM.from_pretrained(PARAM_MODEL)
 param_generator = pipeline("text2text-generation", model=param_model, tokenizer=param_tokenizer, device=DEVICE)

 # -------------------------
-# Presets & utilities
 # -------------------------
 LENGTH_PRESETS = {
     "short": {"min_length": 20, "max_length": 60},
@@ -54,7 +51,6 @@ LENGTH_PRESETS = {
     "long": {"min_length": 130, "max_length": 300},
 }

-# Simple sentence splitter and extractive prefilter (helps focus abstractive model)
 _STOPWORDS = {
     "the","and","is","in","to","of","a","that","it","on","for","as","are","with","was","be","by","this","an","or","from","at","which","we","has","have"
 }
@@ -64,22 +60,18 @@ def tokenize_sentences(text):
     return [s.strip() for s in sents if s.strip()]

 def extractive_prefilter(text, top_k=12):
-    """
-    Rank sentences by (non-stopword) word-frequency and return top_k sentences
-    in original order joined. Useful for very long inputs.
-    """
     sents = tokenize_sentences(text)
     if len(sents) <= top_k:
         return text
     words = re.findall(r"\w+", text.lower())
     freqs = Counter(w for w in words if w not in _STOPWORDS)
-    scores = []
     for i, s in enumerate(sents):
         ws = re.findall(r"\w+", s.lower())
         score = sum(freqs.get(w, 0) for w in ws)
-        scores.append((score, i, s))
-    scores.sort(reverse=True)
-    chosen = [s for _, _, s in sorted(scores[:top_k], key=lambda t: t[1])]
     return " ".join(chosen)

 def chunk_text_by_chars(text, max_chars=1500, overlap=200):
@@ -95,16 +87,47 @@ def chunk_text_by_chars(text, max_chars=1500, overlap=200):
95
  end = start + nl
96
  chunk = text[start:end]
97
  parts.append(chunk.strip())
98
- start = max(end - overlap, end) # move forward with overlap
99
  return parts
100
 
101
- def apply_tone_instruction(text, tone, target_sentences=None):
 
 
 
 
 
 
 
102
  """
103
- Build a clear instruction prompt for the summarizer based on tone/length.
 
104
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  tone = (tone or "neutral").lower()
106
  if tone == "bullet":
107
- instr = "Produce concise bullet points. Each bullet should be short (<=20 words) and focused. No extra commentary."
108
  elif tone == "short":
109
  ts = target_sentences or 1
110
  instr = f"Summarize the content in {ts} sentence{'s' if ts>1 else ''}. Be highly abstractive and avoid copying sentences verbatim."
@@ -113,47 +136,32 @@ def apply_tone_instruction(text, tone, target_sentences=None):
     elif tone == "casual":
         instr = "Summarize in a casual, conversational tone in 1-3 sentences. Use plain, friendly language."
     elif tone == "long":
-        instr = "Provide a clear, structured summary in 4-8 sentences, covering key points and relevant context."
     else:
         instr = "Summarize the content in 2-3 sentences. Be clear and concise."
-
-    instr += " Do not repeat the same information. Prefer rephrasing over copying."
-
     return f"{instr}\n\nText:\n{text}"

-# helper: extract first integer
-def _first_int_from_text(s, fallback=None):
-    m = re.search(r"\d{1,4}", s)
-    return int(m.group()) if m else fallback
-
-# -------------------------
-# Parameter generator (AI "thinking" module)
-# -------------------------
 def generate_summarization_config(text):
     """
-    Use the instruction model to recommend: length (short|medium|long), min_words, max_words, tone.
-    Falls back to heuristics on failure.
     """
     prompt = (
-        "You are an assistant that recommends optimal summarization settings.\n"
         "Given the text, respond ONLY with single-line JSON EXACTLY like:\n"
         '{"length":"short|medium|long","min_words":MIN,"max_words":MAX,"tone":"neutral|formal|casual|bullet"}\n\n'
         "Text:\n'''"
-        + text[:4000] +
-        "'''"
     )
-
     try:
-        # keep generation short and deterministic; use max_new_tokens (avoid max_length)
-        gen = param_generator(
             prompt,
             max_new_tokens=64,
             num_beams=1,
             do_sample=False,
             early_stopping=True
-        )
-        out = gen[0].get("generated_text", "").strip()
-        # attempt JSON parse
         cfg = None
         try:
             cfg = json.loads(out)
@@ -163,56 +171,44 @@ def generate_summarization_config(text):
                 raw = j.group().replace("'", '"')
                 cfg = json.loads(raw)
         if not cfg:
-            raise ValueError("Param-generator output not parseable")
-
-        length = cfg.get("length", "").lower()
-        tone = cfg.get("tone", "").lower()
         min_w = cfg.get("min_words")
         max_w = cfg.get("max_words")
-
-        if length not in ("short", "medium", "long"):
             words = len(text.split())
             length = "short" if words < 150 else ("medium" if words < 800 else "long")
-        if tone not in ("neutral", "formal", "casual", "bullet"):
             tone = "neutral"
-
-        if not isinstance(min_w, int):
             min_w = _first_int_from_text(out, fallback=None)
-        if not isinstance(max_w, int):
             max_w = _first_int_from_text(out[::-1], fallback=None)
-
-        defaults = {"short": (15, 50), "medium": (50, 130), "long": (130, 300)}
-        dmin, dmax = defaults.get(length, (50, 130))
-        min_len = int(min_w) if isinstance(min_w, int) else dmin
-        max_len = int(max_w) if isinstance(max_w, int) else dmax
-
         min_len = max(5, min(min_len, 2000))
-        max_len = max(min_len + 5, min(max_len, 4000))
-
-        logger.info("Param-generator chose: length=%s tone=%s min=%s max=%s", length, tone, min_len, max_len)
-        return {"length": length, "min_length": min_len, "max_length": max_len, "tone": tone}
     except Exception as e:
-        logger.exception("Param-generator failed; falling back to heuristic: %s", str(e))
         words = len(text.split())
         length = "short" if words < 150 else ("medium" if words < 800 else "long")
-        fallback = {"short": (15, 50), "medium": (50, 130), "long": (130, 300)}
-        mn, mx = fallback[length]
-        return {"length": length, "min_length": mn, "max_length": mx, "tone": "neutral"}

 # -------------------------
-# Two-stage summarization helpers
 # -------------------------
-def refine_and_combine(summaries_list, tone, final_target_sentences=None):
-    """
-    Combine chunk summaries and perform a refinement pass to produce a cohesive final summary.
-    """
     combined = "\n\n".join(summaries_list)
     if len(combined.split()) > 2000:
         combined = extractive_prefilter(combined, top_k=20)
-
     prompt = apply_tone_instruction(combined, tone, target_sentences=final_target_sentences)
-
-    # heuristics for min/max
     tgt_sent = final_target_sentences or 3
     gen_kwargs = {
         "min_length": max(20, int(tgt_sent * 8)),
@@ -222,87 +218,135 @@ def refine_and_combine(summaries_list, tone, final_target_sentences=None):
         "no_repeat_ngram_size": 3,
         "do_sample": False,
     }
-
     try:
-        out = summarizer(prompt, **gen_kwargs)[0]["summary_text"].strip()
         return out
     except Exception as e:
-        logger.exception("Refine step failed: %s", e)
         return " ".join(summaries_list[:3])

 # -------------------------
 # Routes
 # -------------------------
 @app.route("/")
 def home():
-    # Ensure you have templates/index.html in place
     return render_template("index.html")

 @app.route("/summarize", methods=["POST"])
 def summarize_route():
     t0 = time.time()
-    data = request.get_json(force=True)
-    text = (data.get("text") or "")[:60000]  # cap input to a reasonable size
-    requested_length = (data.get("length") or "medium").lower()
-    requested_tone = (data.get("tone") or "neutral").lower()

     if not text or len(text.split()) < 5:
-        return jsonify({"error": "Input too short."}), 400

     # 1) Decide settings (AI or explicit)
-    if requested_length in ("auto", "ai") or requested_tone in ("auto", "ai"):
         cfg = generate_summarization_config(text)
-        length_choice = cfg.get("length", "medium")
-        tone_choice = cfg.get("tone", "neutral")
         preset_min = cfg.get("min_length")
         preset_max = cfg.get("max_length")
     else:
         length_choice = requested_length if requested_length in ("short","medium","long") else "medium"
-        tone_choice = requested_tone if requested_tone in ("neutral","formal","casual","bullet","short","long") else "neutral"
         preset_min = LENGTH_PRESETS.get(length_choice, LENGTH_PRESETS["medium"])["min_length"]
         preset_max = LENGTH_PRESETS.get(length_choice, LENGTH_PRESETS["medium"])["max_length"]

-    # Map chosen length to target final sentences
-    sentence_map = {"short": 1, "medium": 3, "long": 6}
-    final_target_sentences = sentence_map.get(length_choice, 3)
-
-    # 2) Prefilter if extremely long
     words_len = len(text.split())
-    if words_len > 3500:
         text_for_chunks = extractive_prefilter(text, top_k=40)
     else:
         text_for_chunks = text

-    # 3) Chunking
-    chunks = chunk_text_by_chars(text_for_chunks, max_chars=1400, overlap=200)
-    chunk_summaries = []

-    # 4) Summarize each chunk
     for chunk in chunks:
         chunk_target = 1 if length_choice == "short" else 2
         chunk_tone = tone_choice if tone_choice in ("formal","casual","bullet") else "neutral"
         prompt = apply_tone_instruction(chunk, chunk_tone, target_sentences=chunk_target)
-
-        gen_kwargs = {
-            "min_length": 12 if chunk_target == 1 else 24,
-            "max_length": 60 if chunk_target == 1 else 120,
-            "num_beams": 5,
-            "early_stopping": True,
-            "no_repeat_ngram_size": 3,
-            "do_sample": False,
-        }
-
         try:
-            out = summarizer(prompt, **gen_kwargs)[0]["summary_text"].strip()
         except Exception as e:
             logger.exception("Chunk summarization failed, using extractive fallback: %s", e)
             out = extractive_prefilter(chunk, top_k=3)
         chunk_summaries.append(out)

-    # 5) Combine & refine
-    final = refine_and_combine(chunk_summaries, tone_choice, final_target_sentences=final_target_sentences)

-    # 6) Post-process for bullet tone
     if tone_choice == "bullet":
         parts = re.split(r'[\n\r]+|(?:\.\s+)|(?:;\s+)', final)
         bullets = [f"- {p.strip().rstrip('.')}" for p in parts if p.strip()]
@@ -312,17 +356,18 @@ def summarize_route():
     meta = {
         "length_choice": length_choice,
         "tone": tone_choice,
         "chunks": len(chunks),
         "input_words": words_len,
         "time_seconds": round(elapsed, 2),
         "device": ("gpu" if USE_GPU else "cpu")
     }
-
     return jsonify({"summary": final, "meta": meta})

 # -------------------------
 # Run
 # -------------------------
 if __name__ == "__main__":
-    # In production use Gunicorn; debug True here only for local testing
     app.run(host="0.0.0.0", port=7860, debug=False)

 import re
 import logging
 from collections import Counter
+from typing import Optional

 from flask import Flask, request, jsonify, render_template
 import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

 # -------------------------
+# App + logging
 # -------------------------
 app = Flask(__name__)
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("summarizer")

 # -------------------------
+# Device selection
 # -------------------------
 USE_GPU = torch.cuda.is_available()
 DEVICE = 0 if USE_GPU else -1
 logger.info("CUDA available: %s. Using device: %s", USE_GPU, DEVICE)

 # -------------------------
+# Model names (summarizers are loaded lazily)
 # -------------------------
+PEGASUS_MODEL = "google/pegasus-large"
+LED_MODEL = "allenai/led-large-16384"
+PARAM_MODEL = "google/flan-t5-small"  # instruction model for parameter generation
+
+# cache for lazy-loaded pipelines
+_SUMMARIZER_CACHE = {}
+
+# load the small param-generator right away (it is cheap enough to keep resident)
+logger.info("Loading parameter generator model: %s", PARAM_MODEL)
 param_tokenizer = AutoTokenizer.from_pretrained(PARAM_MODEL)
 param_model = AutoModelForSeq2SeqLM.from_pretrained(PARAM_MODEL)
 param_generator = pipeline("text2text-generation", model=param_model, tokenizer=param_tokenizer, device=DEVICE)

 # -------------------------
+# Presets & utils
 # -------------------------
 LENGTH_PRESETS = {
     "short": {"min_length": 20, "max_length": 60},
 
     "long": {"min_length": 130, "max_length": 300},
 }

 _STOPWORDS = {
     "the","and","is","in","to","of","a","that","it","on","for","as","are","with","was","be","by","this","an","or","from","at","which","we","has","have"
 }

     return [s.strip() for s in sents if s.strip()]

 def extractive_prefilter(text, top_k=12):
     sents = tokenize_sentences(text)
     if len(sents) <= top_k:
         return text
     words = re.findall(r"\w+", text.lower())
     freqs = Counter(w for w in words if w not in _STOPWORDS)
+    scored = []
     for i, s in enumerate(sents):
         ws = re.findall(r"\w+", s.lower())
         score = sum(freqs.get(w, 0) for w in ws)
+        scored.append((score, i, s))
+    scored.sort(reverse=True)
+    chosen = [s for _, _, s in sorted(scored[:top_k], key=lambda t: t[1])]
     return " ".join(chosen)

 def chunk_text_by_chars(text, max_chars=1500, overlap=200):
         end = start + nl
         chunk = text[start:end]
         parts.append(chunk.strip())
+        start = max(end - overlap, start + 1)  # step back by `overlap` chars, but always advance
     return parts
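
To illustrate the intended overlap (values here are made up; exact chunk boundaries also depend on the newline-snapping logic elided from this hunk):

    parts = chunk_text_by_chars("x" * 3000, max_chars=1500, overlap=200)
    # the first chunk covers roughly [0, 1500); the next starts near 1300,
    # so consecutive chunks share about 200 characters of context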

+def _first_int_from_text(s, fallback=None):
+    m = re.search(r"\d{1,4}", s)
+    return int(m.group()) if m else fallback
+
+# -------------------------
+# Lazy summarizer loader
+# -------------------------
+def get_summarizer(model_key: str):
     """
+    Returns a pipeline summarizer for 'pegasus' or 'led', loading it lazily.
+    model_key: "pegasus" or "led"
     """
+    model_key = model_key.lower()
+    if model_key in _SUMMARIZER_CACHE:
+        return _SUMMARIZER_CACHE[model_key]
+
+    if model_key == "pegasus":
+        model_name = PEGASUS_MODEL
+    elif model_key == "led":
+        model_name = LED_MODEL
+    else:
+        raise ValueError("Unknown model_key: " + str(model_key))
+
+    logger.info("Loading summarizer model '%s' (%s) on device %s ...", model_key, model_name, DEVICE)
+    tok = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+    pipe = pipeline("summarization", model=model, tokenizer=tok, device=DEVICE)
+    _SUMMARIZER_CACHE[model_key] = pipe
+    logger.info("Loaded summarizer '%s' successfully.", model_key)
+    return pipe
+
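A minimal sketch of the cache behavior, assuming the loader above (REPL-style calls, illustrative only):

    led = get_summarizer("led")        # first call builds the LED pipeline
    led_again = get_summarizer("led")  # second call is served from _SUMMARIZER_CACHE
    assert led is led_again            # same pipeline object, no reload
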
+# -------------------------
+# Prompt and decision logic
+# -------------------------
+def apply_tone_instruction(text, tone, target_sentences=None):
     tone = (tone or "neutral").lower()
     if tone == "bullet":
+        instr = "Produce concise bullet points. Each bullet short (<=20 words). No extra commentary."
     elif tone == "short":
         ts = target_sentences or 1
         instr = f"Summarize the content in {ts} sentence{'s' if ts>1 else ''}. Be highly abstractive and avoid copying sentences verbatim."
     elif tone == "casual":
         instr = "Summarize in a casual, conversational tone in 1-3 sentences. Use plain, friendly language."
     elif tone == "long":
+        instr = "Provide a clear, structured summary in 4-8 sentences covering key points and context."
     else:
         instr = "Summarize the content in 2-3 sentences. Be clear and concise."
+    instr += " Do not repeat information; prefer rephrasing."
     return f"{instr}\n\nText:\n{text}"
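
For reference, a hypothetical call and the prompt it builds under the rules above:

    prompt = apply_tone_instruction("Quarterly revenue rose 12%.", "bullet")
    # -> "Produce concise bullet points. Each bullet short (<=20 words). No extra commentary."
    #    " Do not repeat information; prefer rephrasing.\n\nText:\nQuarterly revenue rose 12%."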

 def generate_summarization_config(text):
     """
+    Ask the small instruction model for settings; fall back to a heuristic.
+    Returns a dict with keys: length, min_length, max_length, tone.
     """
     prompt = (
+        "You are an assistant that recommends summarization settings.\n"
         "Given the text, respond ONLY with single-line JSON EXACTLY like:\n"
         '{"length":"short|medium|long","min_words":MIN,"max_words":MAX,"tone":"neutral|formal|casual|bullet"}\n\n'
         "Text:\n'''"
+        + text[:4000] + "'''"
     )
     try:
+        out = param_generator(
             prompt,
             max_new_tokens=64,
             num_beams=1,
             do_sample=False,
             early_stopping=True
+        )[0].get("generated_text", "").strip()
         cfg = None
         try:
             cfg = json.loads(out)

                 raw = j.group().replace("'", '"')
                 cfg = json.loads(raw)
         if not cfg:
+            raise ValueError("Unparseable param-generator output")
+        length = cfg.get("length", "").lower()
+        tone = cfg.get("tone", "").lower()
         min_w = cfg.get("min_words")
         max_w = cfg.get("max_words")
+        if length not in ("short", "medium", "long"):
             words = len(text.split())
             length = "short" if words < 150 else ("medium" if words < 800 else "long")
+        if tone not in ("neutral", "formal", "casual", "bullet"):
             tone = "neutral"
+        if not isinstance(min_w, int):
             min_w = _first_int_from_text(out, fallback=None)
+        if not isinstance(max_w, int):
             max_w = _first_int_from_text(out[::-1], fallback=None)
+        defaults = {"short": (15, 50), "medium": (50, 130), "long": (130, 300)}
+        dmin, dmax = defaults.get(length, (50, 130))
+        min_len = int(min_w) if isinstance(min_w, int) else dmin
+        max_len = int(max_w) if isinstance(max_w, int) else dmax
         min_len = max(5, min(min_len, 2000))
+        max_len = max(min_len + 5, min(max_len, 4000))
+        logger.info("Param-generator chose length=%s tone=%s min=%s max=%s", length, tone, min_len, max_len)
+        return {"length": length, "min_length": min_len, "max_length": max_len, "tone": tone}
     except Exception as e:
+        logger.exception("Param-generator failed: %s", e)
         words = len(text.split())
         length = "short" if words < 150 else ("medium" if words < 800 else "long")
+        fallback = {"short": (15, 50), "medium": (50, 130), "long": (130, 300)}
+        mn, mx = fallback[length]
+        return {"length": length, "min_length": mn, "max_length": mx, "tone": "neutral"}
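
For reference, a well-behaved response from the param model is the single JSON line the prompt demands; the values below are illustrative, showing what the function would return after validation and clamping:

    out = '{"length":"medium","min_words":50,"max_words":120,"tone":"neutral"}'
    cfg = json.loads(out)
    # -> {"length": "medium", "min_length": 50, "max_length": 120, "tone": "neutral"}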
 
 # -------------------------
+# Two-stage summarization (chunk -> chunk summaries -> refine)
 # -------------------------
+def refine_and_combine(summaries_list, tone, final_target_sentences=None, summarizer_pipe=None):
     combined = "\n\n".join(summaries_list)
     if len(combined.split()) > 2000:
         combined = extractive_prefilter(combined, top_k=20)
     prompt = apply_tone_instruction(combined, tone, target_sentences=final_target_sentences)
     tgt_sent = final_target_sentences or 3
     gen_kwargs = {
         "min_length": max(20, int(tgt_sent * 8)),

         "no_repeat_ngram_size": 3,
         "do_sample": False,
     }
     try:
+        if summarizer_pipe is None:
+            # fall back to Pegasus by default (if no pipe was provided)
+            summarizer_pipe = get_summarizer("pegasus")
+        out = summarizer_pipe(prompt, **gen_kwargs)[0]["summary_text"].strip()
         return out
     except Exception as e:
+        logger.exception("Refine failed: %s", e)
         return " ".join(summaries_list[:3])
 
+# -------------------------
+# Model-specific generation helper
+# -------------------------
+def summarize_with_model(pipe, text_prompt, short_target=False):
+    """
+    Use the model pipeline with conservative, model-appropriate generation settings.
+    short_target: if True, use shorter min/max suitable for concise outputs.
+    """
+    # heuristic: detect LED via the checkpoint name on the pipeline's model config
+    model_name = getattr(pipe.model.config, "name_or_path", "") or ""
+    is_led = "led" in model_name or "longformer" in model_name
+    if short_target:
+        min_l = 12
+        max_l = 60
+    else:
+        min_l = 24
+        max_l = 140 if not is_led else 400  # LED can handle longer outputs
+    gen_kwargs = {
+        "min_length": min_l,
+        "max_length": max_l,
+        "num_beams": 5 if not is_led else 4,
+        "early_stopping": True,
+        "no_repeat_ngram_size": 3,
+        "do_sample": False,
+    }
+    return pipe(text_prompt, **gen_kwargs)[0]["summary_text"].strip()
+
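Illustrative usage of the helper with the two configured checkpoints (a sketch; `article` is a hypothetical input string):

    pegasus = get_summarizer("pegasus")
    out = summarize_with_model(pegasus, "Summarize the text.\n\nText:\n" + article)
    # pegasus: min_length=24, max_length=140, num_beams=5
    # led:     min_length=24, max_length=400, num_beams=4 (longer outputs allowed)
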
 # -------------------------
 # Routes
 # -------------------------
 @app.route("/")
 def home():
     return render_template("index.html")

 @app.route("/summarize", methods=["POST"])
 def summarize_route():
     t0 = time.time()
+    data = request.get_json(force=True) or {}
+    text = (data.get("text") or "")[:90000]
+    user_model_pref = (data.get("model") or "auto").lower()    # 'pegasus' | 'led' | 'auto'
+    requested_length = (data.get("length") or "auto").lower()  # short|medium|long|auto
+    requested_tone = (data.get("tone") or "auto").lower()      # neutral|formal|casual|bullet|auto

     if not text or len(text.split()) < 5:
+        return jsonify({"error": "Input too short."}), 400

     # 1) Decide settings (AI or explicit)
+    if requested_length in ("auto", "ai") or requested_tone in ("auto", "ai"):
         cfg = generate_summarization_config(text)
+        length_choice = cfg.get("length", "medium")
+        tone_choice = cfg.get("tone", "neutral")
         preset_min = cfg.get("min_length")
         preset_max = cfg.get("max_length")
     else:
         length_choice = requested_length if requested_length in ("short","medium","long") else "medium"
+        tone_choice = requested_tone if requested_tone in ("neutral","formal","casual","bullet") else "neutral"
         preset_min = LENGTH_PRESETS.get(length_choice, LENGTH_PRESETS["medium"])["min_length"]
         preset_max = LENGTH_PRESETS.get(length_choice, LENGTH_PRESETS["medium"])["max_length"]

+    # 2) Model selection (user preference or auto)
+    # auto rule: use LED if the user asked for it, the chosen length is long, or the input is very long
     words_len = len(text.split())
+    prefer_led = False
+    if user_model_pref == "led":
+        prefer_led = True
+    elif user_model_pref == "pegasus":
+        prefer_led = False
+    else:  # auto
+        if length_choice == "long" or words_len > 3000:
+            prefer_led = True
+        else:
+            prefer_led = False
+
+    model_key = "led" if prefer_led else "pegasus"
+    # get the pipeline (lazy load)
+    try:
+        summarizer_pipe = get_summarizer(model_key)
+    except Exception as e:
+        logger.exception("Failed to load summarizer '%s': %s", model_key, e)
+        # fall back to Pegasus if LED fails
+        summarizer_pipe = get_summarizer("pegasus")
+        model_key = "pegasus"
+
+    # 3) Prefilter very long inputs (if not using LED)
+    if not prefer_led and words_len > 2500:
         text_for_chunks = extractive_prefilter(text, top_k=40)
     else:
         text_for_chunks = text

+    # 4) Chunking: choose chunk size depending on the model
+    if model_key == "led":
+        chunk_max_chars = 8000  # LED can handle larger chunks
+        chunk_overlap = 400
+    else:
+        chunk_max_chars = 1400
+        chunk_overlap = 200
+    chunks = chunk_text_by_chars(text_for_chunks, max_chars=chunk_max_chars, overlap=chunk_overlap)

+    # 5) Summarize each chunk
+    chunk_summaries = []
     for chunk in chunks:
         chunk_target = 1 if length_choice == "short" else 2
         chunk_tone = tone_choice if tone_choice in ("formal","casual","bullet") else "neutral"
         prompt = apply_tone_instruction(chunk, chunk_tone, target_sentences=chunk_target)
         try:
+            # short_target=True for tiny chunk summaries
+            out = summarize_with_model(summarizer_pipe, prompt, short_target=(chunk_target == 1))
         except Exception as e:
             logger.exception("Chunk summarization failed, using extractive fallback: %s", e)
             out = extractive_prefilter(chunk, top_k=3)
         chunk_summaries.append(out)

+    # 6) Combine + refine using the same model for consistency
+    refine_model_key = model_key if model_key == "led" else "pegasus"
+    refine_pipe = get_summarizer(refine_model_key)
+    final_target_sentences = {"short": 1, "medium": 3, "long": 6}.get(length_choice, 3)
+    final = refine_and_combine(chunk_summaries, tone_choice, final_target_sentences, summarizer_pipe=refine_pipe)

+    # 7) Post-process bullet tone
     if tone_choice == "bullet":
         parts = re.split(r'[\n\r]+|(?:\.\s+)|(?:;\s+)', final)
         bullets = [f"- {p.strip().rstrip('.')}" for p in parts if p.strip()]

     meta = {
         "length_choice": length_choice,
         "tone": tone_choice,
+        "model_used": model_key,
+        "refine_model": refine_model_key,
         "chunks": len(chunks),
         "input_words": words_len,
         "time_seconds": round(elapsed, 2),
         "device": ("gpu" if USE_GPU else "cpu")
     }
     return jsonify({"summary": final, "meta": meta})

 # -------------------------
 # Run
 # -------------------------
 if __name__ == "__main__":
+    # debug=False for production; use Gunicorn in deployment
     app.run(host="0.0.0.0", port=7860, debug=False)
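
A client-side sketch against the running app (assumes the server is up on localhost:7860 as configured above; `long_article` is a hypothetical input string):

    import requests

    resp = requests.post(
        "http://localhost:7860/summarize",
        json={"text": long_article, "length": "auto", "tone": "auto", "model": "auto"},
    )
    payload = resp.json()
    print(payload["summary"])
    print(payload["meta"])  # includes model_used, refine_model, chunks, time_seconds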