File size: 4,179 Bytes
2c200f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import PyPDF2
from PIL import Image
import base64
import io
import streamlit as st

# pdf2image is an optional dependency: remember whether the import worked so
# code below can check PDF2IMAGE_AVAILABLE before calling convert_from_path.
try:
    from pdf2image import convert_from_path
except ImportError:
    PDF2IMAGE_AVAILABLE = False
    st.warning("⚠️ pdf2image not available. PDF to image conversion will be limited.")
else:
    PDF2IMAGE_AVAILABLE = True

class DocumentProcessor:
    """Turn uploaded PDFs and images into text, page images and base64 data.

    Failures are reported through Streamlit (``st.error`` / ``st.warning``)
    and signalled to the caller with ``None`` results instead of exceptions.
    """

    # Image modes that can be written as PNG without conversion. Any other
    # mode (for example a CMYK JPEG upload) is converted to RGB first, because
    # saving it as PNG directly would raise.
    _PNG_SAFE_MODES = frozenset(
        {"1", "L", "LA", "I", "I;16", "I;16B", "P", "RGB", "RGBA"}
    )

    def __init__(self):
        """Create a processor; no state is kept between calls."""

    @staticmethod
    def _join_page_text(pages):
        """Concatenate the text of *pages*, ending every page with a newline.

        A page whose ``extract_text()`` yields ``None`` or an empty string
        contributes just the newline, so one text-less page cannot discard
        the text of the whole document.
        """
        return "".join((page.extract_text() or "") + "\n" for page in pages)

    def extract_text_from_pdf(self, pdf_file):
        """Extract the text content of every page of a PDF.

        Args:
            pdf_file: Anything ``PyPDF2.PdfReader`` accepts.

        Returns:
            The page texts joined together (one trailing newline per page),
            or ``None`` after showing ``st.error`` if reading fails.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            return self._join_page_text(pdf_reader.pages)
        except Exception as e:
            st.error(f"Error extracting text from PDF: {str(e)}")
            return None

    def convert_pdf_to_images(self, pdf_file):
        """Convert PDF pages to images at 200 DPI.

        Args:
            pdf_file: Value handed to ``pdf2image.convert_from_path``.

        Returns:
            The result of ``convert_from_path``, or ``None`` when pdf2image
            is unavailable or the conversion raises (a Streamlit message is
            shown in both cases).
        """
        if not PDF2IMAGE_AVAILABLE:
            st.warning("PDF to image conversion not available. Install poppler-utils and pdf2image.")
            return None

        try:
            return convert_from_path(pdf_file, dpi=200)
        except Exception as e:
            st.error(f"Error converting PDF to images: {str(e)}")
            return None

    def image_to_base64(self, image):
        """Convert an image to a base64 string for the API.

        Args:
            image: Either a path (``str``), whose raw bytes are encoded
                unchanged, or an image object with a PIL-style
                ``save(fp, format=...)`` method, which is encoded as PNG.

        Returns:
            The base64 text, or ``None`` after showing ``st.error`` if
            reading or encoding fails.
        """
        try:
            if isinstance(image, str):
                with open(image, "rb") as img_file:
                    return base64.b64encode(img_file.read()).decode('utf-8')

            # Modes outside the PNG-writable set are converted to RGB so the
            # save below does not fail; objects without a ``mode`` attribute
            # are passed through untouched.
            mode = getattr(image, "mode", None)
            if mode is not None and mode not in self._PNG_SAFE_MODES:
                image = image.convert("RGB")

            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
        except Exception as e:
            st.error(f"Error converting image to base64: {str(e)}")
            return None

    def process_uploaded_file(self, uploaded_file):
        """Process an uploaded file (PDF or image).

        Args:
            uploaded_file: Upload object exposing ``type`` (a MIME string)
                and, for PDFs, ``getbuffer()``; ``None`` is accepted.

        Returns:
            A ``(text_content, images, image_base64)`` tuple:

            * PDF: extracted text, the page images (``None`` when pdf2image
              is unavailable or conversion failed) and the first page as
              base64.
            * JPEG/PNG: ``None`` for text, a one-element image list and its
              base64 encoding.
            * ``None`` input, unsupported type, or an unreadable image:
              ``(None, None, None)``.
        """
        if uploaded_file is None:
            return None, None, None

        file_type = uploaded_file.type

        if file_type == "application/pdf":
            text_content = self.extract_text_from_pdf(uploaded_file)

            # Page images are optional extras for visual analysis.
            images = None
            image_base64 = None

            if PDF2IMAGE_AVAILABLE:
                import os
                import tempfile

                temp_pdf_path = None
                try:
                    try:
                        # delete=False so the file can be reopened by path
                        # after this handle is closed; the path is recorded
                        # before writing so a failed write is still cleaned
                        # up by the finally block.
                        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
                            temp_pdf_path = temp_pdf.name
                            temp_pdf.write(uploaded_file.getbuffer())

                        images = self.convert_pdf_to_images(temp_pdf_path)

                        # Only the first page is encoded for LLM analysis.
                        if images:
                            image_base64 = self.image_to_base64(images[0])
                    finally:
                        if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
                            os.unlink(temp_pdf_path)
                except Exception as e:
                    st.warning(f"PDF to image conversion failed: {str(e)}. Using text analysis only.")

            return text_content, images, image_base64

        if file_type in ("image/jpeg", "image/png", "image/jpg"):
            try:
                image = Image.open(uploaded_file)
                image_base64 = self.image_to_base64(image)
                return None, [image], image_base64
            except Exception as e:
                st.error(f"Error processing image file: {str(e)}")
                return None, None, None

        st.error("Unsupported file type. Please upload PDF or image files.")
        return None, None, None