Commit 
							
							·
						
						f2a2588
	
1
								Parent(s):
							
							b05b1be
								
Completed MCP v1
Browse files- .gitignore +2 -0
- .gradio/certificate.pem +31 -0
- app.py +82 -0
- web2json/__pycache__/ai_extractor.cpython-311.pyc +0 -0
- web2json/__pycache__/pipeline.cpython-311.pyc +0 -0
- web2json/__pycache__/postprocessor.cpython-311.pyc +0 -0
- web2json/__pycache__/preprocessor.cpython-311.pyc +0 -0
- web2json/ai_extractor.py +126 -0
- web2json/pipeline.py +43 -0
- web2json/postprocessor.py +27 -0
- web2json/preprocessor.py +138 -0
    	
        .gitignore
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            .env
         | 
| 2 | 
            +
            test.ipynb
         | 
    	
        .gradio/certificate.pem
    ADDED
    
    | @@ -0,0 +1,31 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            -----BEGIN CERTIFICATE-----
         | 
| 2 | 
            +
            MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
         | 
| 3 | 
            +
            TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
         | 
| 4 | 
            +
            cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
         | 
| 5 | 
            +
            WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
         | 
| 6 | 
            +
            ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
         | 
| 7 | 
            +
            MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
         | 
| 8 | 
            +
            h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
         | 
| 9 | 
            +
            0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
         | 
| 10 | 
            +
            A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
         | 
| 11 | 
            +
            T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
         | 
| 12 | 
            +
            B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
         | 
| 13 | 
            +
            B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
         | 
| 14 | 
            +
            KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
         | 
| 15 | 
            +
            OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
         | 
| 16 | 
            +
            jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
         | 
| 17 | 
            +
            qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
         | 
| 18 | 
            +
            rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
         | 
| 19 | 
            +
            HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
         | 
| 20 | 
            +
            hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
         | 
| 21 | 
            +
            ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
         | 
| 22 | 
            +
            3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
         | 
| 23 | 
            +
            NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
         | 
| 24 | 
            +
            ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
         | 
| 25 | 
            +
            TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
         | 
| 26 | 
            +
            jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
         | 
| 27 | 
            +
            oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
         | 
| 28 | 
            +
            4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
         | 
| 29 | 
            +
            mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
         | 
| 30 | 
            +
            emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
         | 
| 31 | 
            +
            -----END CERTIFICATE-----
         | 
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,82 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import json
         | 
| 2 | 
            +
            import pandas as pd
         | 
| 3 | 
            +
            import gradio as gr
         | 
| 4 | 
            +
            from typing import Dict, Any
         | 
| 5 | 
            +
            from web2json.preprocessor import BasicPreprocessor
         | 
| 6 | 
            +
            from web2json.ai_extractor import AIExtractor, GeminiLLMClient
         | 
| 7 | 
            +
            from web2json.postprocessor import PostProcessor
         | 
| 8 | 
            +
            from web2json.pipeline import Pipeline
         | 
| 9 | 
            +
            from pydantic import BaseModel, Field
         | 
| 10 | 
            +
            import os
         | 
| 11 | 
            +
            import dotenv
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            dotenv.load_dotenv()
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            # Define schemas
         | 
class Article(BaseModel):
    """Extraction schema for a news/blog article page."""
    title: str = Field(..., description="The title of the article.")
    author: str = Field(..., description="The author of the article.")
    content: str = Field(..., description="The main content of the article.")
| 20 | 
            +
             | 
class Product(BaseModel):
    """Extraction schema for a product page."""
    name: str = Field(..., description="The name of the product.")
    description: str = Field(..., description="A detailed description of the product.")
    price: float = Field(..., description="The price of the product.")
| 25 | 
            +
             | 
class JobPosting(BaseModel):
    """Extraction schema for a job-posting page."""
    title: str = Field(..., description="The title of the job position.")
    company: str = Field(..., description="The name of the company offering the job.")
    location: str = Field(..., description="The location of the job.")
    description: str = Field(..., description="A detailed description of the job responsibilities.")
| 31 | 
            +
             | 
# Maps the UI dropdown label to the Pydantic model used for extraction.
# Keys are user-facing strings shown in the Gradio dropdown.
SCHEMA_OPTIONS = {
    "Article": Article,
    "Product": Product,
    "Job Posting": JobPosting,
}
| 37 | 
            +
             | 
| 38 | 
            +
            # Core processing function
         | 
| 39 | 
            +
             | 
def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
    """
    Convert a webpage (URL or raw HTML/text) into structured JSON.

    Args:
        content: A URL or the raw HTML/text to extract from.
        is_url: True if ``content`` is a URL to fetch, False for raw text.
        schema_name: Key into SCHEMA_OPTIONS selecting the output schema.

    Returns:
        The extracted JSON-compatible dict, or ``{"error": ...}`` on any failure
        (Gradio renders either shape in the JSON output component).
    """
    # Guard: nothing to process — avoid constructing the LLM pipeline for empty input.
    if not content or not content.strip():
        return {"error": "No content provided."}

    if schema_name not in SCHEMA_OPTIONS:
        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}

    schema = SCHEMA_OPTIONS[schema_name]
    prompt_template = "extract the following information: {content} based on schema: {schema}"

    # Build the pipeline per request; components are cheap to construct and this
    # keeps each Gradio call independent.
    preprocessor = BasicPreprocessor(config={'keep_tags': False})
    try:
        # Raises ValueError when no API key is available in config or environment.
        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        return pipeline.run(content, is_url, schema)
    except Exception as e:
        # Surface the failure to the UI rather than crashing the app.
        return {"error": f"Processing error: {str(e)}"}
| 65 | 
            +
             | 
# Build Gradio Interface
demo = gr.Interface(
    fn=webpage_to_json,
    inputs=[
        # Free-form content: either a URL or pasted HTML/text.
        gr.Textbox(label="Content (URL or Raw Text)", lines=10,
                   placeholder="Enter URL or paste raw HTML/text here."),
        # When checked, the content box is treated as a URL to fetch.
        gr.Checkbox(label="Content is URL?", value=False),
        # Schema selector; choices come from SCHEMA_OPTIONS keys.
        gr.Dropdown(choices=list(SCHEMA_OPTIONS.keys()),
                    label="Select Schema", value="Article")
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas."
)

if __name__ == "__main__":
    # mcp_server=True also exposes the app as an MCP server endpoint.
    demo.launch(mcp_server=True)
    	
        web2json/__pycache__/ai_extractor.cpython-311.pyc
    ADDED
    
    | Binary file (6.46 kB). View file | 
|  | 
    	
        web2json/__pycache__/pipeline.cpython-311.pyc
    ADDED
    
    | Binary file (2.49 kB). View file | 
|  | 
    	
        web2json/__pycache__/postprocessor.cpython-311.pyc
    ADDED
    
    | Binary file (1.65 kB). View file | 
|  | 
    	
        web2json/__pycache__/preprocessor.cpython-311.pyc
    ADDED
    
    | Binary file (5.68 kB). View file | 
|  | 
    	
        web2json/ai_extractor.py
    ADDED
    
    | @@ -0,0 +1,126 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            from abc import ABC, abstractmethod
         | 
| 3 | 
            +
            from google import genai
         | 
| 4 | 
            +
            from google.genai import types
         | 
| 5 | 
            +
            from pydantic import BaseModel
         | 
| 6 | 
            +
             | 
class LLMClient(ABC):
    """
    Abstract interface for LLM API wrappers.

    Subclasses implement `call_api` for a concrete provider; shared
    configuration handling lives here.
    """

    def __init__(self, config: dict = None):
        """
        Store the client configuration.

        Args:
            config (dict): Configuration settings for the LLM client.
                A missing/empty config is normalized to an empty dict.
        """
        self.config = config if config else {}

    @abstractmethod
    def call_api(self, prompt: str) -> str:
        """
        Send `prompt` to the underlying LLM and return its text response.

        Args:
            prompt (str): The prompt or input text for the LLM.

        Returns:
            str: The response from the LLM.
        """
        pass
| 32 | 
            +
             | 
| 33 | 
            +
             | 
class GeminiLLMClient(LLMClient):
    """
    Concrete implementation of LLMClient for the Gemini API.
    """

    def __init__(self, config: dict):
        """
        Initializes the GeminiLLMClient with an API key, model name, and optional generation settings.

        Args:
            config (dict): Configuration containing:
                - 'api_key': (optional) API key for Gemini (falls back to GEMINI_API_KEY env var)
                - 'model_name': (optional) the model to use (default 'gemini-2.0-flash')
                - 'generation_config': (optional) dict of GenerateContentConfig parameters

        Raises:
            ValueError: If no API key is available from config or environment.
        """
        # Fix: honor the base-class contract so self.config is populated.
        super().__init__(config)
        api_key = config.get("api_key") or os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError(
                "API key for Gemini must be provided in config['api_key'] or GEMINI_API_KEY env var."
            )
        self.client = genai.Client(api_key=api_key)
        self.model_name = config.get("model_name", "gemini-2.0-flash")
        # Allow custom generation settings; unset fields are passed as None so
        # the API applies its own defaults.
        gen_conf = config.get("generation_config", {})
        self.generate_config = types.GenerateContentConfig(
            response_mime_type=gen_conf.get("response_mime_type", "text/plain"),
            temperature=gen_conf.get("temperature"),
            max_output_tokens=gen_conf.get("max_output_tokens"),
            top_p=gen_conf.get("top_p"),
            top_k=gen_conf.get("top_k"),
        )

    def call_api(self, prompt: str) -> str:
        """
        Call the Gemini API with the given prompt (non-streaming).

        Args:
            prompt (str): The input text for the API.

        Returns:
            str: The generated text from the Gemini API (empty string if the
            response carried no text).
        """
        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=prompt)],
            )
        ]

        # Non-streaming call returns a full response object.
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=contents,
            config=self.generate_config,
        )

        # Fix: response.text can be None (e.g. no candidates); normalize to ""
        # to honor the declared str return type.
        return response.text or ""
| 95 | 
            +
             | 
class AIExtractor:
    """Drives an LLM client to pull schema-shaped data out of raw content."""

    def __init__(self, llm_client: LLMClient, prompt_template: str):
        """
        Args:
            llm_client (LLMClient): Any implementation of the LLMClient interface.
            prompt_template (str): Template with {content} and {schema} placeholders,
                e.g. "Extract the following information: {content} based on schema: {schema}"
        """
        self.llm_client = llm_client
        self.prompt_template = prompt_template

    def extract(self, content: str, schema: BaseModel) -> str:
        """
        Ask the LLM to produce structured information for `content`.

        Args:
            content (str): The raw content to extract information from.
            schema (BaseModel): Pydantic model defining the expected output structure.

        Returns:
            str: The LLM response (expected to contain a JSON object as text).
        """
        filled_prompt = self.prompt_template.format(
            content=content,
            schema=schema.model_json_schema(),
        )
        return self.llm_client.call_api(filled_prompt)
    	
        web2json/pipeline.py
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from web2json.ai_extractor import *
         | 
| 2 | 
            +
            from web2json.postprocessor import *
         | 
| 3 | 
            +
            from web2json.preprocessor import *
         | 
| 4 | 
            +
            from pydantic import BaseModel
         | 
| 5 | 
            +
             | 
class Pipeline:
    """Chains preprocessing, AI extraction, and postprocessing into one run."""

    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the expected output.

        Returns:
            dict: The final structured data after processing.
        """
        # Stage 1: normalize/fetch the input.
        cleaned = self.preprocessor.preprocess(content, is_url)
        # Stage 2: let the LLM extract schema-shaped text.
        raw_extraction = self.ai_extractor.extract(cleaned, schema)
        # Stage 3: parse the LLM output into a dict.
        return self.postprocessor.process(raw_extraction)
    	
        web2json/postprocessor.py
    ADDED
    
    | @@ -0,0 +1,27 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from json_repair import repair_json
         | 
| 2 | 
            +
            import json
         | 
| 3 | 
            +
             | 
class PostProcessor:
    """Parses an LLM response string into a JSON dict."""

    def process(self, response: str) -> dict:
        """
        Extract and parse the JSON object embedded in an LLM response.

        Handles common output variations: a fenced ```json block, a JSON
        object surrounded by prose, or bare JSON.

        Args:
            response (str): Raw text returned by the LLM.

        Returns:
            dict: The parsed JSON payload, or an empty dict on failure.
        """
        # Locate the most likely JSON substring.
        json_string = response
        if "```json" in response:
            json_string = response.split("```json")[1].split("```")[0]
        elif "{" in response and "}" in response:
            # Grab everything from the first '{' to the last '}'.
            start_index = response.find("{")
            end_index = response.rfind("}") + 1
            json_string = response[start_index:end_index]

        # Strict parse first: valid JSON needs no repair pass.
        try:
            return json.loads(json_string)
        except Exception:
            pass

        # Fall back to json_repair only for malformed output (lazy import so
        # the common path has no third-party dependency).
        try:
            from json_repair import repair_json
            return json.loads(repair_json(json_string))
        except Exception as e:
            print(f"Error parsing JSON: {e}")
            print(f"Generated text: {response}")
            return {}
    	
        web2json/preprocessor.py
    ADDED
    
    | @@ -0,0 +1,138 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import re
         | 
| 2 | 
            +
            import requests
         | 
| 3 | 
            +
            from bs4 import BeautifulSoup , Comment
         | 
| 4 | 
            +
            from abc import ABC, abstractmethod
         | 
| 5 | 
            +
            from typing import Any, Dict, Optional
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            class Preprocessor(ABC):
         | 
| 9 | 
            +
                """
         | 
| 10 | 
            +
                Abstract base class for preprocessors.
         | 
| 11 | 
            +
                Defines the interface for transforming raw inputs into structured data.
         | 
| 12 | 
            +
                """
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
         | 
| 15 | 
            +
                    """
         | 
| 16 | 
            +
                    Initialize the preprocessor with optional configuration.
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                    Args:
         | 
| 19 | 
            +
                        config: A dictionary of configuration settings.
         | 
| 20 | 
            +
                        - keep_tags (bool): If True, keeps HTML tags in the output; otherwise, cleans them.
         | 
| 21 | 
            +
                    """
         | 
| 22 | 
            +
                    self.config = config if config is not None else {'keep_tags': False}
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                def _fetch_content(self, url: str) -> str:
         | 
| 25 | 
            +
                    """
         | 
| 26 | 
            +
                    Fetches and parses the text content from a URL.
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                    Args:
         | 
| 29 | 
            +
                        url: The URL to fetch content from.
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                    Returns:
         | 
| 32 | 
            +
                        The clean, extracted text content from the page.
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                    Raises:
         | 
| 35 | 
            +
                        ValueError: If the URL cannot be fetched or processed.
         | 
| 36 | 
            +
                    """
         | 
| 37 | 
            +
                    try:
         | 
| 38 | 
            +
                        # Set a User-Agent header to mimic a browser, which can help avoid
         | 
| 39 | 
            +
                        # being blocked by some websites.
         | 
| 40 | 
            +
                        # Inside _fetch_content method
         | 
| 41 | 
            +
                        headers = {
         | 
| 42 | 
            +
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
         | 
| 43 | 
            +
                            'Accept-Language': 'en-US,en;q=0.9',
         | 
| 44 | 
            +
                            'Accept-Encoding': 'gzip, deflate, br',
         | 
| 45 | 
            +
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
         | 
| 46 | 
            +
                            'Connection': 'keep-alive',
         | 
| 47 | 
            +
                        }
         | 
| 48 | 
            +
                        
         | 
| 49 | 
            +
                        # Make the HTTP GET request with a timeout.
         | 
| 50 | 
            +
                        response = requests.get(url, headers=headers, timeout=15)
         | 
| 51 | 
            +
                        
         | 
| 52 | 
            +
                        
         | 
| 53 | 
            +
                        return response.text
         | 
| 54 | 
            +
                        
         | 
| 55 | 
            +
                    except requests.exceptions.RequestException as e:
         | 
| 56 | 
            +
                        # Catch any network-related errors (DNS, connection, timeout, etc.)
         | 
| 57 | 
            +
                        # and re-raise them as a more user-friendly ValueError.
         | 
| 58 | 
            +
                        raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}")
         | 
| 59 | 
            +
                    
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                @abstractmethod
         | 
| 62 | 
            +
                def preprocess(self, content: str, is_url: bool) -> str:
         | 
| 63 | 
            +
                    """
         | 
| 64 | 
            +
                    Take raw content (HTML, text, etc.) and apply preprocessing steps.
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                    Args:
         | 
| 67 | 
            +
                        content: The raw data to preprocess.
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                    Returns:
         | 
| 70 | 
            +
                        A dictionary containing structured, cleaned data ready for downstream tasks.
         | 
| 71 | 
            +
                    """
         | 
| 72 | 
            +
                    pass
         | 
| 73 | 
            +
             | 
class BasicPreprocessor(Preprocessor):
    """
    Basic HTML preprocessor.

    Strips <script>/<style> elements and HTML comments, then returns either
    the cleaned markup (when config['keep_tags'] is True) or the visible
    text with normalized whitespace.
    """
    # TODO: Might need to think of how to improve this later
    def _clean_html(self, html_content: str) -> str:
        """
        Clean the given HTML content by:
        - Removing <script> and <style> tags and their content.
        - Removing HTML comments.
        - Returning the visible text with normalized whitespace when
          keep_tags is False; otherwise returning the cleaned markup.

        Args:
            html_content (str): The HTML content to clean.

        Returns:
            str: The cleaned markup, or the visible text from the HTML.
        """
        # Parse the HTML content
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style elements (tag and contents).
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        if self.config.get('keep_tags', False):
            # NOTE: this is the *cleaned* markup (scripts/styles/comments
            # already removed above), not the original raw HTML.
            return str(soup)

        # Extract visible text and collapse runs of whitespace to one space.
        text = soup.get_text(separator=" ", strip=True)
        clean_text = re.sub(r'\s+', ' ', text)

        return clean_text

    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Take raw content (an HTML string or a URL) and clean it.

        Args:
            content: The raw HTML string, or a URL when is_url is True.
            is_url: Whether `content` is a URL that must be fetched first.

        Returns:
            str: The cleaned text (or markup), stripped of leading and
            trailing whitespace.
        """
        html_content = content
        if is_url:
            # Fetch content from the URL before cleaning.
            html_content = self._fetch_content(content)

        # Clean the HTML content
        cleaned_content = self._clean_html(html_content)

        return cleaned_content.strip()