Spaces:

GIZ
/

eudr_chabo_ingestor

Running on CPU Upgrade

File size: 8,457 Bytes

# EUDR INGESTOR 

import gradio as gr
import os
import logging
from datetime import datetime
from pathlib import Path
from gradio_client import Client, handle_file
import pandas as pd

# Local imports
from .utils import getconfig

# Load project settings from params.cfg through the local getconfig helper.
# NOTE(review): the returned object is used like a configparser instance
# (config.get(section, option, fallback=...)) — confirm against utils.getconfig.
config = getconfig("params.cfg")

# Configure logging: INFO level, with timestamp and level name on every record.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# The Hugging Face token is passed to the gradio Client in process_geojson_whisp,
# so importing this module fails fast when the variable is unset or empty.
hf_token = os.getenv('HF_TOKEN')
if not hf_token:
    raise ValueError("HF_TOKEN environment variable not found")


# WHISP API configuration: read from the [whisp] section of the config, using the
# hosted Space URL given as fallback when no value is supplied.
WHISP_API_URL = config.get('whisp', 'WHISP_API_URL', fallback="https://giz-chatfed-whisp.hf.space/")

def get_value(df, colname):
    """Look up a single entry in a WhispAPI-style Column/Value dataframe.

    Returns the first "Value" whose "Column" cell equals ``colname``. When the
    frame lacks either of those two columns, or no row matches, the
    placeholder string "No disponible" is returned instead.
    """
    placeholder = "No disponible"

    # Guard clause: without both columns there is nothing to look up.
    if not all(name in df.columns for name in ("Column", "Value")):
        return placeholder

    hits = df.loc[df["Column"] == colname, "Value"]
    return placeholder if hits.empty else hits.values[0]

# Three-letter country codes mapped to the Spanish names shown in the report.
# Codes not listed here are displayed unchanged.
_COUNTRY_NAMES = {
    'HND': 'Honduras', 'GTM': 'Guatemala', 'ECU': 'Ecuador',
    'COL': 'Colombia', 'PER': 'Peru', 'BRA': 'Brasil',
    'BOL': 'Bolivia', 'CRI': 'Costa Rica', 'PAN': 'Panamá',
    'NIC': 'Nicaragua',
}

# Lower-cased strings that mean "no data". "no disponible" is the placeholder
# get_value returns for absent rows; "not available" was the only marker the
# formatters recognised before, so both are accepted.
_MISSING_MARKERS = frozenset({"", "not available", "no disponible"})

# Known risk levels (lower-cased, stripped) and their report wording.
_RISK_LABELS = {
    "low": "*riesgo bajo*",
    "medium": "*riesgo medio*",
    "high": "*riesgo alto*",
    "very high": "*riesgo muy alto*",
    "more_info_needed": "*Se necesita más información.*",
}


def _is_missing(value):
    """Return True when ``value`` is None, NaN, blank, or a "no data" placeholder."""
    if value is None:
        return True
    if isinstance(value, str):
        return value.strip().lower() in _MISSING_MARKERS
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values (lists, arrays) have no single NaN answer.
        return False


def _format_area(area_raw):
    """Render an area in hectares, with more decimals for smaller plots."""
    if _is_missing(area_raw):
        return "No disponible"
    try:
        area_num = float(area_raw)
    except (TypeError, ValueError):
        # Not numeric: show whatever the API sent rather than hiding it.
        return str(area_raw)
    if pd.isna(area_num):
        # Strings such as "nan" parse as floats but still carry no data.
        return "No disponible"
    if area_num < 1:
        return f"{area_num:.3f} hectáreas"
    if area_num < 100:
        return f"{area_num:.2f} hectáreas"
    return f"{area_num:,.1f} hectáreas"


def _format_risk(risk_val):
    """Translate a risk level into the Spanish wording used in the report."""
    # Falsy non-string values (0, False) were always rendered as unavailable;
    # that behaviour is kept alongside the explicit missing-value check.
    if _is_missing(risk_val) or not risk_val:
        return "**No disponible**"
    if isinstance(risk_val, str):
        label = _RISK_LABELS.get(risk_val.lower().strip())
        if label is not None:
            return label
        # Unknown level: surface it verbatim instead of guessing.
        return f"ℹ️ **{risk_val.title()}**"
    return str(risk_val)


def _format_deforestation(def_val):
    """Render a deforestation figure in hectares; zero means none detected."""
    # A numeric 0 is a real measurement, so only genuine "no data" values
    # short-circuit here (a plain truthiness test would swallow 0).
    if _is_missing(def_val):
        return "*No disponible*"
    try:
        def_num = float(def_val)
    except (TypeError, ValueError):
        return f"ℹ️ **{def_val}**"
    if pd.isna(def_num):
        return "*No disponible*"
    if def_num == 0:
        return "* No se detectó deforestación.*"
    if def_num < 0.1:
        return f"*{def_num:.3f} hectáreas*"
    return f"*{def_num:.2f} hectáreas*"


def format_whisp_statistics(df):
    """Format WhispAPI statistics into readable text for RAG context.

    Args:
        df: Column/Value dataframe as consumed by ``get_value``. The rows read
            are "Country", "Admin_Level_1", "Area", "risk_pcrop",
            "risk_acrop", "risk_timber", "TMF_def_before_2020" and
            "TMF_def_after_2020".

    Returns:
        A Markdown report in Spanish covering location, risk levels and
        deforestation figures, stamped with the current local time. If
        anything unexpected fails while building it, the string
        "Error en el análisis geográfico: <reason>" is returned instead of
        raising.
    """
    try:
        country_raw = get_value(df, "Country")
        country = _COUNTRY_NAMES.get(country_raw, country_raw)
        admin_level = get_value(df, "Admin_Level_1")
        area_text = _format_area(get_value(df, "Area"))

        # Risk assessments
        risk_pcrop = get_value(df, "risk_pcrop")
        risk_acrop = get_value(df, "risk_acrop")
        risk_timber = get_value(df, "risk_timber")

        # Each deforestation line must use its own column: "antes de 2020"
        # reads TMF_def_before_2020, "después de 2020" reads TMF_def_after_2020.
        def_before_2020_raw = get_value(df, "TMF_def_before_2020")
        def_after_2020_raw = get_value(df, "TMF_def_after_2020")

        # Format for RAG context
        context = f"""

**Respuesta generada mediante inteligencia artificíal:** \n\n
        
**Resultados del análisis geográfico**  \n\n
La siguiente información ha sido generada por la [WhispAPI creada por Forest Data Partnership (FDaP)](https://openforis.org/solutions/whisp/).

📍 **Detalles de la ubicación:**

- País: *{country}*
- Región administrativa: *{admin_level}*
- Área total: *{area_text}*

⚠️ **Evaluación del riesgo de deforestación:**
Los niveles de riesgo se basan en patrones históricos, factores ambientales y datos sobre el uso del suelo.

- Cultivos permanentes (Café, cacao, aceite de palma): {_format_risk(risk_pcrop)}
- Cultivos anuales (Soja, maíz, arroz): {_format_risk(risk_acrop)}
- Extracción de madera: {_format_risk(risk_timber)}

🌳 **Datos de deforestación:**

- Deforestación antes de 2020: {_format_deforestation(def_before_2020_raw)}
- Deforestación después de 2020: {_format_deforestation(def_after_2020_raw)}

Fuente: Forest Data Partnership (FDaP) WhispAPI
Fecha de análisis: {datetime.now().isoformat()}"""

        return context

    except Exception as e:
        # Best-effort boundary: callers receive a readable message rather
        # than an exception when the report cannot be built.
        return f"Error en el análisis geográfico: {str(e)}"

def process_geojson_whisp(file_content: bytes, filename: str) -> tuple[str, dict]:
    """Process GeoJSON file through WHISP API and return formatted context.

    Args:
        file_content: Raw bytes of the GeoJSON document.
        filename: Name of the uploaded file. Not used by the processing
            itself; kept so existing callers keep working.

    Returns:
        A ``(formatted_context, metadata)`` tuple: the text produced by
        ``format_whisp_statistics`` and a dict holding the analysis type,
        country, admin level, area and the three risk levels read from the
        API response.

    Raises:
        Exception: wrapping any failure while writing the temporary file,
            calling the API or parsing its response. The original error is
            attached as ``__cause__``.
    """
    import tempfile

    tmp_file_path = None
    try:
        # The payload is handed to the client as a file path, so it is
        # written to disk first. delete=False keeps the file after the
        # handle is closed; it is removed in the finally clause below.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.geojson') as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_file_path = tmp_file.name
            tmp_file.write(file_content)

        # Call WHISP API with authentication
        client = Client(WHISP_API_URL, hf_token=hf_token)
        result = client.predict(
            file=handle_file(tmp_file_path),
            api_name="/get_statistics"
        )

        # Convert result to DataFrame
        df = pd.DataFrame(result['data'], columns=result['headers'])

        # Format for RAG context
        formatted_context = format_whisp_statistics(df)

        metadata = {
            "analysis_type": "whisp_geojson",
            "country": get_value(df, "Country"),
            "admin_level": get_value(df, "Admin_Level_1"),
            "area": get_value(df, "Area"),
            "risk_levels": {
                "pcrop": get_value(df, "risk_pcrop"),
                "acrop": get_value(df, "risk_acrop"),
                "timber": get_value(df, "risk_timber")
            }
        }

        return formatted_context, metadata

    except Exception as e:
        logger.error(f"WHISP API error: {str(e)}")
        raise Exception(f"Failed to process GeoJSON through WHISP API: {str(e)}") from e

    finally:
        # Clean up temporary file. A cleanup failure is logged only: it must
        # neither discard a successful result nor mask the real error.
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except OSError as cleanup_error:
                logger.warning(
                    "Could not remove temporary file %s: %s",
                    tmp_file_path,
                    cleanup_error,
                )

def ingest(file):
    """Main ingestion function - processes GeoJSON file and returns WHISP analysis context.

    Args:
        file: The upload to process. Either a path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        The formatted WHISP analysis text, or "No file uploaded" when
        ``file`` is None. Always a single string, matching the single
        Textbox output this function is wired to.

    Raises:
        Exception: "Processing failed: <reason>" for an unsupported
            extension, an unreadable file, or a WHISP processing error. The
            original error is attached as ``__cause__``.
    """
    if file is None:
        return "No file uploaded"

    try:
        # Check for path-likes first: a pathlib.Path also has ``.name``, but
        # there it is only the basename, not the full path.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name

        filename = os.path.basename(file_path)

        # Check file extension before reading, so unsupported uploads are
        # rejected without loading their contents.
        file_extension = os.path.splitext(filename)[1].lower()
        if file_extension not in ('.geojson', '.json'):
            raise ValueError(f"Unsupported file type: {file_extension}. Only GeoJSON files are supported.")

        with open(file_path, 'rb') as f:
            file_content = f.read()

        # Process through WHISP API; only the text context is returned here.
        context, _metadata = process_geojson_whisp(file_content, filename)

        logger.info(f"Successfully processed GeoJSON {filename} through WHISP API")

        return context

    except Exception as e:
        logger.error(f"GeoJSON processing failed: {str(e)}")
        raise Exception(f"Processing failed: {str(e)}") from e
if __name__ == "__main__":
    # Build the two components up front so the Interface call below is plain wiring.
    upload_input = gr.File(label="GeoJSON Upload", file_types=[".geojson", ".json"])
    context_output = gr.Textbox(
        label="WHISP Analysis Context",
        lines=15,
        show_copy_button=True,
    )

    ui = gr.Interface(
        fn=ingest,
        inputs=upload_input,
        outputs=context_output,
        title="EUDR Ingestion Module - WHISP API",
        description="Processes GeoJSON files through WHISP API and returns geographic analysis context for RAG pipelines.",
        api_name="ingest",
    )

    # Bind to all interfaces on port 7860 and show errors in the UI.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    ui.launch(**launch_options)