Commit 
							
							·
						
						f2a2588
	
1
								Parent(s):
							
							b05b1be
								
Completed MCP v1
Browse files- .gitignore +2 -0
- .gradio/certificate.pem +31 -0
- app.py +82 -0
- web2json/__pycache__/ai_extractor.cpython-311.pyc +0 -0
- web2json/__pycache__/pipeline.cpython-311.pyc +0 -0
- web2json/__pycache__/postprocessor.cpython-311.pyc +0 -0
- web2json/__pycache__/preprocessor.cpython-311.pyc +0 -0
- web2json/ai_extractor.py +126 -0
- web2json/pipeline.py +43 -0
- web2json/postprocessor.py +27 -0
- web2json/preprocessor.py +138 -0
    	
        .gitignore
    ADDED
    
    | @@ -0,0 +1,2 @@ | |
|  | |
|  | 
|  | |
| 1 | 
            +
            .env
         | 
| 2 | 
            +
            test.ipynb
         | 
    	
        .gradio/certificate.pem
    ADDED
    
    | @@ -0,0 +1,31 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            -----BEGIN CERTIFICATE-----
         | 
| 2 | 
            +
            MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
         | 
| 3 | 
            +
            TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
         | 
| 4 | 
            +
            cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
         | 
| 5 | 
            +
            WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
         | 
| 6 | 
            +
            ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
         | 
| 7 | 
            +
            MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
         | 
| 8 | 
            +
            h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
         | 
| 9 | 
            +
            0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
         | 
| 10 | 
            +
            A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
         | 
| 11 | 
            +
            T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
         | 
| 12 | 
            +
            B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
         | 
| 13 | 
            +
            B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
         | 
| 14 | 
            +
            KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
         | 
| 15 | 
            +
            OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
         | 
| 16 | 
            +
            jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
         | 
| 17 | 
            +
            qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
         | 
| 18 | 
            +
            rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
         | 
| 19 | 
            +
            HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
         | 
| 20 | 
            +
            hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
         | 
| 21 | 
            +
            ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
         | 
| 22 | 
            +
            3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
         | 
| 23 | 
            +
            NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
         | 
| 24 | 
            +
            ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
         | 
| 25 | 
            +
            TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
         | 
| 26 | 
            +
            jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
         | 
| 27 | 
            +
            oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
         | 
| 28 | 
            +
            4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
         | 
| 29 | 
            +
            mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
         | 
| 30 | 
            +
            emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
         | 
| 31 | 
            +
            -----END CERTIFICATE-----
         | 
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,82 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import json
         | 
| 2 | 
            +
            import pandas as pd
         | 
| 3 | 
            +
            import gradio as gr
         | 
| 4 | 
            +
            from typing import Dict, Any
         | 
| 5 | 
            +
            from web2json.preprocessor import BasicPreprocessor
         | 
| 6 | 
            +
            from web2json.ai_extractor import AIExtractor, GeminiLLMClient
         | 
| 7 | 
            +
            from web2json.postprocessor import PostProcessor
         | 
| 8 | 
            +
            from web2json.pipeline import Pipeline
         | 
| 9 | 
            +
            from pydantic import BaseModel, Field
         | 
| 10 | 
            +
            import os
         | 
| 11 | 
            +
            import dotenv
         | 
| 12 | 
            +
             | 
| 13 | 
            +
            dotenv.load_dotenv()
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            # Define schemas
         | 
class Article(BaseModel):
    """Extraction schema for a news/blog article page."""
    title: str = Field(..., description="The title of the article.")
    author: str = Field(..., description="The author of the article.")
    content: str = Field(..., description="The main content of the article.")
| 20 | 
            +
             | 
class Product(BaseModel):
    """Extraction schema for a product page."""
    name: str = Field(..., description="The name of the product.")
    description: str = Field(..., description="A detailed description of the product.")
    price: float = Field(..., description="The price of the product.")
| 25 | 
            +
             | 
class JobPosting(BaseModel):
    """Extraction schema for a job-posting page."""
    title: str = Field(..., description="The title of the job position.")
    company: str = Field(..., description="The name of the company offering the job.")
    location: str = Field(..., description="The location of the job.")
    description: str = Field(..., description="A detailed description of the job responsibilities.")
| 31 | 
            +
             | 
# Maps the UI dropdown label to the Pydantic model used for extraction.
# Keys are user-facing strings shown in the Gradio dropdown.
SCHEMA_OPTIONS = {
    "Article": Article,
    "Product": Product,
    "Job Posting": JobPosting,
}
| 37 | 
            +
             | 
| 38 | 
            +
            # Core processing function
         | 
| 39 | 
            +
             | 
def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
    """
    Convert a webpage (URL or raw HTML/text) into structured JSON.

    Args:
        content: A URL or the raw HTML/text to extract from.
        is_url: True if ``content`` is a URL to fetch, False for raw text.
        schema_name: Key into SCHEMA_OPTIONS selecting the output schema.

    Returns:
        The extracted JSON-compatible dict, or ``{"error": ...}`` on any failure
        (Gradio renders either shape in the JSON output component).
    """
    # Guard: nothing to process — avoid constructing the LLM pipeline for empty input.
    if not content or not content.strip():
        return {"error": "No content provided."}

    if schema_name not in SCHEMA_OPTIONS:
        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}

    schema = SCHEMA_OPTIONS[schema_name]
    prompt_template = "extract the following information: {content} based on schema: {schema}"

    # Build the pipeline per request; components are cheap to construct and this
    # keeps each Gradio call independent.
    preprocessor = BasicPreprocessor(config={'keep_tags': False})
    try:
        # Raises ValueError when no API key is available in config or environment.
        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        return pipeline.run(content, is_url, schema)
    except Exception as e:
        # Surface the failure to the UI rather than crashing the app.
        return {"error": f"Processing error: {str(e)}"}
| 65 | 
            +
             | 
# Build Gradio Interface
demo = gr.Interface(
    fn=webpage_to_json,
    inputs=[
        # Free-form content: either a URL or pasted HTML/text.
        gr.Textbox(label="Content (URL or Raw Text)", lines=10,
                   placeholder="Enter URL or paste raw HTML/text here."),
        # When checked, the content box is treated as a URL to fetch.
        gr.Checkbox(label="Content is URL?", value=False),
        # Schema selector; choices come from SCHEMA_OPTIONS keys.
        gr.Dropdown(choices=list(SCHEMA_OPTIONS.keys()),
                    label="Select Schema", value="Article")
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas."
)

if __name__ == "__main__":
    # mcp_server=True also exposes the app as an MCP server endpoint.
    demo.launch(mcp_server=True)
    	
        web2json/__pycache__/ai_extractor.cpython-311.pyc
    ADDED
    
    | Binary file (6.46 kB). View file | 
|  | 
    	
        web2json/__pycache__/pipeline.cpython-311.pyc
    ADDED
    
    | Binary file (2.49 kB). View file | 
|  | 
    	
        web2json/__pycache__/postprocessor.cpython-311.pyc
    ADDED
    
    | Binary file (1.65 kB). View file | 
|  | 
    	
        web2json/__pycache__/preprocessor.cpython-311.pyc
    ADDED
    
    | Binary file (5.68 kB). View file | 
|  | 
    	
        web2json/ai_extractor.py
    ADDED
    
    | @@ -0,0 +1,126 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
            +
            from abc import ABC, abstractmethod
         | 
| 3 | 
            +
            from google import genai
         | 
| 4 | 
            +
            from google.genai import types
         | 
| 5 | 
            +
            from pydantic import BaseModel
         | 
| 6 | 
            +
             | 
class LLMClient(ABC):
    """
    Abstract interface for LLM API wrappers.

    Subclasses implement `call_api` for a concrete provider; shared
    configuration handling lives here.
    """

    def __init__(self, config: dict = None):
        """
        Store the client configuration.

        Args:
            config (dict): Configuration settings for the LLM client.
                A missing/empty config is normalized to an empty dict.
        """
        self.config = config if config else {}

    @abstractmethod
    def call_api(self, prompt: str) -> str:
        """
        Send `prompt` to the underlying LLM and return its text response.

        Args:
            prompt (str): The prompt or input text for the LLM.

        Returns:
            str: The response from the LLM.
        """
        pass
| 32 | 
            +
             | 
| 33 | 
            +
             | 
class GeminiLLMClient(LLMClient):
    """
    Concrete implementation of LLMClient for the Gemini API.
    """

    def __init__(self, config: dict):
        """
        Initializes the GeminiLLMClient with an API key, model name, and optional generation settings.

        Args:
            config (dict): Configuration containing:
                - 'api_key': (optional) API key for Gemini (falls back to GEMINI_API_KEY env var)
                - 'model_name': (optional) the model to use (default 'gemini-2.0-flash')
                - 'generation_config': (optional) dict of GenerateContentConfig parameters

        Raises:
            ValueError: If no API key is available from config or environment.
        """
        # Fix: honor the base-class contract so self.config is populated.
        super().__init__(config)
        api_key = config.get("api_key") or os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError(
                "API key for Gemini must be provided in config['api_key'] or GEMINI_API_KEY env var."
            )
        self.client = genai.Client(api_key=api_key)
        self.model_name = config.get("model_name", "gemini-2.0-flash")
        # Allow custom generation settings; unset fields are passed as None so
        # the API applies its own defaults.
        gen_conf = config.get("generation_config", {})
        self.generate_config = types.GenerateContentConfig(
            response_mime_type=gen_conf.get("response_mime_type", "text/plain"),
            temperature=gen_conf.get("temperature"),
            max_output_tokens=gen_conf.get("max_output_tokens"),
            top_p=gen_conf.get("top_p"),
            top_k=gen_conf.get("top_k"),
        )

    def call_api(self, prompt: str) -> str:
        """
        Call the Gemini API with the given prompt (non-streaming).

        Args:
            prompt (str): The input text for the API.

        Returns:
            str: The generated text from the Gemini API (empty string if the
            response carried no text).
        """
        contents = [
            types.Content(
                role="user",
                parts=[types.Part.from_text(text=prompt)],
            )
        ]

        # Non-streaming call returns a full response object.
        response = self.client.models.generate_content(
            model=self.model_name,
            contents=contents,
            config=self.generate_config,
        )

        # Fix: response.text can be None (e.g. no candidates); normalize to ""
        # to honor the declared str return type.
        return response.text or ""
| 95 | 
            +
             | 
class AIExtractor:
    """Drives an LLM client to pull schema-shaped data out of raw content."""

    def __init__(self, llm_client: LLMClient, prompt_template: str):
        """
        Args:
            llm_client (LLMClient): Any implementation of the LLMClient interface.
            prompt_template (str): Template with {content} and {schema} placeholders,
                e.g. "Extract the following information: {content} based on schema: {schema}"
        """
        self.llm_client = llm_client
        self.prompt_template = prompt_template

    def extract(self, content: str, schema: BaseModel) -> str:
        """
        Ask the LLM to produce structured information for `content`.

        Args:
            content (str): The raw content to extract information from.
            schema (BaseModel): Pydantic model defining the expected output structure.

        Returns:
            str: The LLM response (expected to contain a JSON object as text).
        """
        filled_prompt = self.prompt_template.format(
            content=content,
            schema=schema.model_json_schema(),
        )
        return self.llm_client.call_api(filled_prompt)
    	
        web2json/pipeline.py
    ADDED
    
    | @@ -0,0 +1,43 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from web2json.ai_extractor import *
         | 
| 2 | 
            +
            from web2json.postprocessor import *
         | 
| 3 | 
            +
            from web2json.preprocessor import *
         | 
| 4 | 
            +
            from pydantic import BaseModel
         | 
| 5 | 
            +
             | 
class Pipeline:
    """Chains preprocessing, AI extraction, and postprocessing into one run."""

    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the expected output.

        Returns:
            dict: The final structured data after processing.
        """
        # Stage 1: normalize/fetch the input.
        cleaned = self.preprocessor.preprocess(content, is_url)
        # Stage 2: let the LLM extract schema-shaped text.
        raw_extraction = self.ai_extractor.extract(cleaned, schema)
        # Stage 3: parse the LLM output into a dict.
        return self.postprocessor.process(raw_extraction)
    	
        web2json/postprocessor.py
    ADDED
    
    | @@ -0,0 +1,27 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            from json_repair import repair_json
         | 
| 2 | 
            +
            import json
         | 
| 3 | 
            +
             | 
class PostProcessor:
    """Parses an LLM response string into a JSON dict."""

    def process(self, response: str) -> dict:
        """
        Extract and parse the JSON object embedded in an LLM response.

        Handles common output variations: a fenced ```json block, a JSON
        object surrounded by prose, or bare JSON.

        Args:
            response (str): Raw text returned by the LLM.

        Returns:
            dict: The parsed JSON payload, or an empty dict on failure.
        """
        # Locate the most likely JSON substring.
        json_string = response
        if "```json" in response:
            json_string = response.split("```json")[1].split("```")[0]
        elif "{" in response and "}" in response:
            # Grab everything from the first '{' to the last '}'.
            start_index = response.find("{")
            end_index = response.rfind("}") + 1
            json_string = response[start_index:end_index]

        # Strict parse first: valid JSON needs no repair pass.
        try:
            return json.loads(json_string)
        except Exception:
            pass

        # Fall back to json_repair only for malformed output (lazy import so
        # the common path has no third-party dependency).
        try:
            from json_repair import repair_json
            return json.loads(repair_json(json_string))
        except Exception as e:
            print(f"Error parsing JSON: {e}")
            print(f"Generated text: {response}")
            return {}
    	
        web2json/preprocessor.py
    ADDED
    
    | @@ -0,0 +1,138 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import re
         | 
| 2 | 
            +
            import requests
         | 
| 3 | 
            +
            from bs4 import BeautifulSoup , Comment
         | 
| 4 | 
            +
            from abc import ABC, abstractmethod
         | 
| 5 | 
            +
            from typing import Any, Dict, Optional
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            class Preprocessor(ABC):
         | 
| 9 | 
            +
                """
         | 
| 10 | 
            +
                Abstract base class for preprocessors.
         | 
| 11 | 
            +
                Defines the interface for transforming raw inputs into structured data.
         | 
| 12 | 
            +
                """
         | 
| 13 | 
            +
             | 
| 14 | 
            +
                def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
         | 
| 15 | 
            +
                    """
         | 
| 16 | 
            +
                    Initialize the preprocessor with optional configuration.
         | 
| 17 | 
            +
             | 
| 18 | 
            +
                    Args:
         | 
| 19 | 
            +
                        config: A dictionary of configuration settings.
         | 
| 20 | 
            +
                        - keep_tags (bool): If True, keeps HTML tags in the output; otherwise, cleans them.
         | 
| 21 | 
            +
                    """
         | 
| 22 | 
            +
                    self.config = config if config is not None else {'keep_tags': False}
         | 
| 23 | 
            +
             | 
| 24 | 
            +
                def _fetch_content(self, url: str) -> str:
         | 
| 25 | 
            +
                    """
         | 
| 26 | 
            +
                    Fetches and parses the text content from a URL.
         | 
| 27 | 
            +
             | 
| 28 | 
            +
                    Args:
         | 
| 29 | 
            +
                        url: The URL to fetch content from.
         | 
| 30 | 
            +
             | 
| 31 | 
            +
                    Returns:
         | 
| 32 | 
            +
                        The clean, extracted text content from the page.
         | 
| 33 | 
            +
             | 
| 34 | 
            +
                    Raises:
         | 
| 35 | 
            +
                        ValueError: If the URL cannot be fetched or processed.
         | 
| 36 | 
            +
                    """
         | 
| 37 | 
            +
                    try:
         | 
| 38 | 
            +
                        # Set a User-Agent header to mimic a browser, which can help avoid
         | 
| 39 | 
            +
                        # being blocked by some websites.
         | 
| 40 | 
            +
                        # Inside _fetch_content method
         | 
| 41 | 
            +
                        headers = {
         | 
| 42 | 
            +
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
         | 
| 43 | 
            +
                            'Accept-Language': 'en-US,en;q=0.9',
         | 
| 44 | 
            +
                            'Accept-Encoding': 'gzip, deflate, br',
         | 
| 45 | 
            +
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
         | 
| 46 | 
            +
                            'Connection': 'keep-alive',
         | 
| 47 | 
            +
                        }
         | 
| 48 | 
            +
                        
         | 
| 49 | 
            +
                        # Make the HTTP GET request with a timeout.
         | 
| 50 | 
            +
                        response = requests.get(url, headers=headers, timeout=15)
         | 
| 51 | 
            +
                        
         | 
| 52 | 
            +
                        
         | 
| 53 | 
            +
                        return response.text
         | 
| 54 | 
            +
                        
         | 
| 55 | 
            +
                    except requests.exceptions.RequestException as e:
         | 
| 56 | 
            +
                        # Catch any network-related errors (DNS, connection, timeout, etc.)
         | 
| 57 | 
            +
                        # and re-raise them as a more user-friendly ValueError.
         | 
| 58 | 
            +
                        raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}")
         | 
| 59 | 
            +
                    
         | 
| 60 | 
            +
             | 
| 61 | 
            +
                @abstractmethod
         | 
| 62 | 
            +
                def preprocess(self, content: str, is_url: bool) -> str:
         | 
| 63 | 
            +
                    """
         | 
| 64 | 
            +
                    Take raw content (HTML, text, etc.) and apply preprocessing steps.
         | 
| 65 | 
            +
             | 
| 66 | 
            +
                    Args:
         | 
| 67 | 
            +
                        content: The raw data to preprocess.
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                    Returns:
         | 
| 70 | 
            +
                        A dictionary containing structured, cleaned data ready for downstream tasks.
         | 
| 71 | 
            +
                    """
         | 
| 72 | 
            +
                    pass
         | 
| 73 | 
            +
             | 
class BasicPreprocessor(Preprocessor):
    """
    Basic HTML preprocessor.

    Strips <script>/<style> elements and HTML comments, then returns either
    the cleaned markup (when config['keep_tags'] is True) or the visible
    text with normalized whitespace.
    """
    # TODO: Might need to think of how to improve this later
    def _clean_html(self, html_content: str) -> str:
        """
        Clean the given HTML content by:
        - Removing <script> and <style> tags and their content.
        - Removing HTML comments.
        - Returning the visible text with normalized whitespace when
          keep_tags is False; otherwise returning the cleaned markup.

        Args:
            html_content (str): The HTML content to clean.

        Returns:
            str: The cleaned markup, or the visible text from the HTML.
        """
        # Parse the HTML content
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove script and style elements (tag and contents).
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        if self.config.get('keep_tags', False):
            # NOTE: this is the *cleaned* markup (scripts/styles/comments
            # already removed above), not the original raw HTML.
            return str(soup)

        # Extract visible text and collapse runs of whitespace to one space.
        text = soup.get_text(separator=" ", strip=True)
        clean_text = re.sub(r'\s+', ' ', text)

        return clean_text

    def preprocess(self, content: str, is_url: bool) -> str:
        """
        Take raw content (an HTML string or a URL) and clean it.

        Args:
            content: The raw HTML string, or a URL when is_url is True.
            is_url: Whether `content` is a URL that must be fetched first.

        Returns:
            str: The cleaned text (or markup), stripped of leading and
            trailing whitespace.
        """
        html_content = content
        if is_url:
            # Fetch content from the URL before cleaning.
            html_content = self._fetch_content(content)

        # Clean the HTML content
        cleaned_content = self._clean_html(html_content)

        return cleaned_content.strip()