import os
import fitz  # PyMuPDF
import fasttext
import requests
import json
import torch
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from IndicTransToolkit.processor import IndicProcessor
import google.generativeai as genai
import gradio as gr

# === 1. CONFIGURATION & SECRETS ===
# --- Load the Gemini API Key from Hugging Face Secrets ---
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# --- Model IDs (Using the CPU-friendly TrOCR model) ---
TRANSLATION_MODEL_REPO_ID = "ai4bharat/indictrans2-indic-en-1B"
OCR_MODEL_ID = "microsoft/trocr-base-printed"

# --- Language Settings ---
# FLORES-200 code for Malayalam, matching the labels returned by the fastText detector
LANGUAGE_TO_TRANSLATE = "mal_Mlym"

# --- Hardware Settings ---
DEVICE = "cpu"  # Forcing CPU for compatibility with free tier

# === 2. LOAD MODELS & CONFIGURE API ===
# --- Configure Gemini API ---
if not GEMINI_API_KEY:
    print("🔴 ERROR: Gemini API key is not set in the Space Secrets.")
else:
    genai.configure(api_key=GEMINI_API_KEY)

# --- Load Translation Model ---
print(f"Loading tokenizer & model: {TRANSLATION_MODEL_REPO_ID} ...")
translation_tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL_REPO_ID, trust_remote_code=True)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(
    TRANSLATION_MODEL_REPO_ID,
    trust_remote_code=True,
    torch_dtype=torch.float32  # Use float32 for CPU
).to(DEVICE)
print("✅ Translation model loaded.")

ip = IndicProcessor(inference=True)

# --- Load Language Detection Model ---
print("Loading fastText language detector...")
ft_model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin")
lang_detect_model = fasttext.load_model(ft_model_path)
print("✅ fastText loaded.")

# --- Load Standard OCR Model ---
print(f"Loading Standard OCR model: {OCR_MODEL_ID}...")
ocr_pipeline = pipeline("image-to-text", model=OCR_MODEL_ID, device=-1)  # device=-1 ensures CPU
print("✅ Standard OCR model loaded.")

# === 3. HELPER FUNCTIONS ===

# --- Phase 1: Text Extraction ---
def classify_image_with_gemini(image: Image.Image):
    """Uses Gemini to classify an image as a 'document' or 'diagram'."""
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = "Is this image primarily a text document or an engineering/technical diagram? Answer with only 'document' or 'diagram'."
    response = model.generate_content([prompt, image])
    classification = response.text.strip().lower()
    print(f"✅ Image classified as: {classification}")
    return "diagram" if "diagram" in classification else "document"

def summarize_diagram_with_gemini(image: Image.Image):
    """Uses Gemini to generate a summary of an engineering diagram."""
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = "You are an engineering assistant for Kochi Metro Rail Limited (KMRL). Describe the contents of this technical diagram or engineering drawing in a concise summary. Identify key components and their apparent purpose."
    response = model.generate_content([prompt, image])
    print("✅ Diagram summary successful.")
    return response.text.strip()

def extract_text_from_image(path):
    """
    Classifies an image and routes it for either OCR (if a text doc) or summarization (if a diagram).
    """
    print("\n--- Starting Image Processing ---")
    try:
        image = Image.open(path).convert("RGB")
        # Step 1: Classify the image using Gemini
        image_type = classify_image_with_gemini(image)
        # Step 2: Route to the correct function
        if image_type == "diagram":
            print("-> Image is a diagram. Summarizing with Gemini...")
            return summarize_diagram_with_gemini(image)
        else:
            print("-> Image is a document. Extracting text with TrOCR...")
            out = ocr_pipeline(image)
            return out[0]["generated_text"] if out else ""
    except Exception as e:
        print(f"❌ An error occurred during image processing: {e}")
        return "Error during image processing."

def extract_text_from_pdf(path):
    """Extracts plain text from every page of a PDF."""
    doc = fitz.open(path)
    return "".join(page.get_text("text") + "\n" for page in doc)

def read_text_from_txt(path):
    """Reads a UTF-8 encoded text file."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

# --- Phase 2: Translation ---
def detect_language(text_snippet):
    """Returns the FLORES-200 code predicted by fastText (e.g. 'mal_Mlym'), or None."""
    s = text_snippet.replace("\n", " ").strip()
    if not s:
        return None
    preds = lang_detect_model.predict(s, k=1)
    # fastText labels look like '__label__mal_Mlym'; keep only the code after the prefix
    return preds[0][0].split("__")[-1] if preds and preds[0] else None

def translate_chunk(chunk):
    """Translates a single Malayalam line/chunk to English with IndicTrans2."""
    batch = ip.preprocess_batch([chunk], src_lang=LANGUAGE_TO_TRANSLATE, tgt_lang="eng_Latn")
    inputs = translation_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        generated_tokens = translation_model.generate(**inputs, num_beams=5, max_length=512, early_stopping=True)
    decoded = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return ip.postprocess_batch(decoded, lang="eng_Latn")[0]

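# The Gradio flow below calls translate_chunk() once per line, i.e. one generate()
# call per Malayalam line, which is slow on CPU. The helper below is an optional
# sketch (not wired into the app) showing how several detected-Malayalam lines
# could be translated in a single IndicTrans2 batch instead; it assumes the same
# tokenizer, model, and IndicProcessor loaded above.
def translate_lines_batched(lines_to_translate):
    """Optional sketch: translate a list of Malayalam lines in one batched generate() call."""
    if not lines_to_translate:
        return []
    batch = ip.preprocess_batch(lines_to_translate, src_lang=LANGUAGE_TO_TRANSLATE, tgt_lang="eng_Latn")
    inputs = translation_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        generated_tokens = translation_model.generate(**inputs, num_beams=5, max_length=512, early_stopping=True)
    decoded = translation_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    return ip.postprocess_batch(decoded, lang="eng_Latn")
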
# --- Phase 3: Gemini Analysis ---
def generate_structured_json(text_to_analyze):
    """Generates the detailed JSON analysis."""
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = f"You are an AI assistant for KMRL. Analyze this document and extract key info as JSON: {text_to_analyze}"
    json_schema = {
        "type": "OBJECT",
        "properties": {
            "summary": {"type": "STRING"},
            "actions_required": {"type": "ARRAY", "items": {"type": "OBJECT", "properties": {"action": {"type": "STRING"}, "priority": {"type": "STRING", "enum": ["High", "Medium", "Low"]}, "deadline": {"type": "STRING"}, "notes": {"type": "STRING"}}, "required": ["action", "priority", "deadline", "notes"]}},
            "departments_to_notify": {"type": "ARRAY", "items": {"type": "STRING"}},
            "cross_document_flags": {"type": "ARRAY", "items": {"type": "OBJECT", "properties": {"related_document_type": {"type": "STRING"}, "related_issue": {"type": "STRING"}}, "required": ["related_document_type", "related_issue"]}},
        },
        "required": ["summary", "actions_required", "departments_to_notify", "cross_document_flags"],
    }
    generation_config = genai.types.GenerationConfig(response_mime_type="application/json", response_schema=json_schema)
    response = model.generate_content(prompt, generation_config=generation_config)
    return json.loads(response.text)

def check_relevance_with_gemini(summary_text):
    """Checks if the summary is relevant to KMRL."""
    model = genai.GenerativeModel('gemini-2.5-flash')
    prompt = f'Is this summary related to transportation, infrastructure, railways, or metro systems? Answer only "Yes" or "No".\n\nSummary: {summary_text}'
    response = model.generate_content(prompt)
    return "yes" in response.text.strip().lower()

# === 4. MAIN PROCESSING FUNCTION FOR GRADIO ===
def process_and_analyze_document(input_file):
    if not GEMINI_API_KEY:
        raise gr.Error("Gemini API key is not configured. The administrator must set it in the Space Secrets.")
    if input_file is None:
        raise gr.Error("No file uploaded. Please upload a document.")
    try:
        # gr.File may pass a filepath string (newer Gradio) or a tempfile-like object with .name
        input_file_path = input_file.name if hasattr(input_file, "name") else input_file
        ext = os.path.splitext(input_file_path)[1].lower()
        # --- Phase 1: Get Original Text ---
        if ext == ".pdf":
            original_text = extract_text_from_pdf(input_file_path)
        elif ext == ".txt":
            original_text = read_text_from_txt(input_file_path)
        elif ext in [".png", ".jpg", ".jpeg"]:
            original_text = extract_text_from_image(input_file_path)
        else:
            raise gr.Error("Unsupported file type.")

        if not original_text or not original_text.strip():
            raise gr.Error("No text could be extracted from the document.")
        # --- Phase 2: Translate if Necessary ---
        lines = original_text.split("\n")
        translated_lines = []
        for ln in lines:
            if not ln.strip():
                continue
            lang = detect_language(ln)
            if lang == LANGUAGE_TO_TRANSLATE:
                translated_lines.append(translate_chunk(ln))
            else:
                translated_lines.append(ln)
        final_text = "\n".join(translated_lines)
        # --- Phase 3: Analyze with Gemini ---
        summary_data = generate_structured_json(final_text)
        if not summary_data or "summary" not in summary_data:
            raise gr.Error("Failed to generate a valid analysis from the document.")
        is_relevant = check_relevance_with_gemini(summary_data["summary"])
        if is_relevant:
            return summary_data
        else:
            return {"status": "Not Applicable", "reason": "The document was determined to be not relevant to KMRL."}
    except gr.Error:
        # Re-raise user-facing errors unchanged instead of wrapping them below
        raise
    except Exception as e:
        raise gr.Error(f"An unexpected error occurred: {str(e)}")

iface = gr.Interface(
    fn=process_and_analyze_document,
    inputs=gr.File(label="Upload Document (.pdf, .txt, .png, .jpg, .jpeg)"),
    outputs=gr.JSON(label="Analysis Result"),
    title="KMRL Document Analysis Pipeline",
    description="Upload a document (Malayalam or English). The system will detect and translate Malayalam text to English, then send the full text to Gemini for structured analysis.",
    allow_flagging="never",
    examples=[
        ["Malayalam-en.txt"]  # If you upload this file to your Space
    ]
)

if __name__ == "__main__":
    iface.launch()
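
# Optional local smoke test (kept commented out so the Space only launches the UI).
# The sample filename below is just an example taken from the `examples` list above;
# replace it with any real test document before uncommenting.
#
#   result = process_and_analyze_document(open("Malayalam-en.txt", "rb"))
#   print(json.dumps(result, indent=2, ensure_ascii=False))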