File size: 5,561 Bytes
472d519
 
 
 
 
 
 
 
 
 
 
 
e35df36
472d519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import os
import tempfile

import gradio as gr
import spaces
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText

# Initialize model and processors
# Nanonets OCR2-3B: an image-text-to-text model used here for document OCR.
model_path = "nanonets/Nanonets-OCR2-3B"

# torch_dtype="auto" uses the dtype recorded in the checkpoint config;
# device_map="auto" lets accelerate place weights on the available device(s).
model = AutoModelForImageTextToText.from_pretrained(
    model_path, 
    torch_dtype="auto", 
    device_map="auto"
)
model.eval()  # inference only — disables dropout/batchnorm training behavior

# Tokenizer handles text; the processor combines text + image preprocessing
# and provides the chat template used in ocr_page_with_nanonets below.
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)


@spaces.GPU(duration=120)
def ocr_page_with_nanonets(image_path: str, max_new_tokens: int = 4096) -> str:
    """
    Extract text from a document image using the Nanonets OCR model.

    Args:
        image_path (str): Path to the image file on disk.
        max_new_tokens (int): Maximum number of tokens to generate.

    Returns:
        str: Extracted text with formatting — tables as HTML, equations as
        LaTeX, plus <img>/<watermark>/<page_number> markup per the prompt.
    """
    # NOTE: checkbox glyphs restored to ☐ (U+2610) and ☑ (U+2611); the
    # previous text contained a mojibake rendering of these characters.
    prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""

    # Image.open is lazy and keeps the file handle open; use a context
    # manager so repeated requests do not leak file descriptors.
    with Image.open(image_path) as image:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": [
                {"type": "image", "image": f"file://{image_path}"},
                {"type": "text", "text": prompt},
            ]},
        ]

        # Render the chat template, then tokenize text + pixel data together.
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
        inputs = inputs.to(model.device)

        # Greedy decoding (do_sample=False) for deterministic OCR output.
        output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Strip the prompt tokens from each generated sequence. Distinct loop
    # names here — the original comprehension shadowed `output_ids` with its
    # own loop variable, which worked but obscured what was being indexed.
    trimmed_ids = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
    ]

    output_text = processor.batch_decode(trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]


def process_image(image, max_tokens):
    """
    Process the uploaded image and extract text.

    Args:
        image: PIL Image or file path from Gradio (None when nothing was
            uploaded).
        max_tokens (int): Maximum tokens for generation. Gradio sliders may
            deliver a float, so the value is coerced to int before use.

    Returns:
        str: Extracted text from the document, or a human-readable error
        message (failures are shown in the output box rather than raised).
    """
    if image is None:
        return "Please upload an image first."

    temp_path = None
    # The OCR function takes a file path, so persist PIL images to disk.
    if isinstance(image, Image.Image):
        # Unique temp file + PNG format: the previous fixed
        # "/tmp/ocr_temp_image.jpg" path raced between concurrent requests,
        # and saving as JPEG crashes on RGBA/P-mode uploads (e.g. transparent
        # PNGs) with "cannot write mode RGBA as JPEG". PNG accepts all modes.
        fd, temp_path = tempfile.mkstemp(suffix=".png")
        os.close(fd)  # we only need the path; PIL reopens the file itself
        image.save(temp_path)
        image_path = temp_path
    else:
        image_path = image

    try:
        # Best-effort: report failures as text in the UI instead of raising.
        return ocr_page_with_nanonets(image_path, max_new_tokens=int(max_tokens))
    except Exception as e:
        return f"Error processing image: {str(e)}"
    finally:
        # Remove the temp copy, but only if we created one.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass


# Build the Gradio interface.
# Two-column layout: uploader + settings on the left, OCR output on the
# right. Statement order inside the context managers determines component
# placement, so the structure below is load-bearing.
with gr.Blocks(title="Document OCR with Nanonets") as demo:
    # Header banner (raw HTML for centered styling and the attribution link).
    gr.HTML(
        """
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>๐Ÿ“„ Document OCR</h1>
            <p>Extract text, tables, equations, and more from your documents using Nanonets OCR</p>
            <p><a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #0066cc; text-decoration: none;">Built with anycoder</a></p>
        </div>
        """
    )
    
    with gr.Row():
        # Left column: input image, generation settings, and the run button.
        with gr.Column():
            gr.Markdown("### Upload Document")
            # type="pil" means process_image receives a PIL.Image, not a path.
            image_input = gr.Image(
                label="Upload Image",
                type="pil",
                sources=["upload", "webcam", "clipboard"]
            )
            
            gr.Markdown("### Settings")
            # Caps max_new_tokens for generation; higher = longer documents.
            max_tokens_slider = gr.Slider(
                label="Maximum Tokens",
                minimum=1000,
                maximum=15000,
                value=4096,
                step=500,
                info="Higher values allow for longer outputs"
            )
            
            process_button = gr.Button(
                "๐Ÿš€ Extract Text",
                variant="primary",
                size="lg"
            )
        
        # Right column: read-only, copyable OCR result.
        with gr.Column():
            gr.Markdown("### Extracted Text")
            output_text = gr.Textbox(
                label="OCR Output",
                lines=20,
                max_lines=30,
                interactive=False,
                show_copy_button=True
            )
    
    # Footer: feature list and usage instructions.
    gr.Markdown("---")
    gr.Markdown(
        """
        ### Features
        - โœจ Extracts text naturally from documents
        - ๐Ÿ“Š Converts tables to HTML format
        - ๐Ÿงฎ Detects and converts equations to LaTeX
        - ๐Ÿ–ผ๏ธ Identifies and describes images
        - ๐Ÿ“Œ Marks watermarks and page numbers
        - โ˜‘๏ธ Recognizes checkboxes
        
        ### How to use
        1. Upload an image of a document (JPG, PNG, etc.)
        2. Adjust the maximum tokens if needed (for longer documents)
        3. Click "Extract Text" to process
        4. Copy the extracted text using the copy button
        """
    )
    
    # Wire the button: (image, token cap) -> process_image -> output textbox.
    process_button.click(
        fn=process_image,
        inputs=[image_input, max_tokens_slider],
        outputs=output_text,
        show_progress=True
    )

# Launch the app only when run as a script (not when imported, e.g. by the
# Spaces runtime that may import `demo` directly).
if __name__ == "__main__":
    demo.launch()