Spaces:

fahmiaziz
/

api-rerank-model

Sleeping

App Files Files Community

fahmiaziz98 committed on Sep 28

Commit

073edba

1 Parent(s): 76d149a

restapi

Browse files

Files changed (2) hide show

app.py +514 -5
requirements.txt +6 -2

app.py CHANGED Viewed

@@ -1,7 +1,516 @@
-from fastapi import FastAPI
-app = FastAPI()
-@app.get("/")
-def greet_json():
-    return {"Hello": "World!"}

+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel, Field
+from typing import List, Optional, Dict, Any
+from loguru import logger
+import time
+import torch
+from contextlib import asynccontextmanager
+from sentence_transformers import CrossEncoder
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# -------------------------
+# Request/Response Models
+# -------------------------
+class RerankRequest(BaseModel):
+    """
+    Request model for document reranking.
+    Attributes:
+        query: The search query
+        documents: List of documents to rerank
+        model_id: Identifier of the reranking model to use
+        instruction: Optional instruction for instruction-based models
+        top_k: Maximum number of documents to return (optional)
+    """
+    # Required. Whitespace-only values pass this schema but are rejected by the /rerank handler.
+    query: str = Field(..., description="Search query text")
+    # Required, at least one entry. Blank entries are filtered out later by the /rerank handler.
+    # NOTE(review): `min_items` is presumably the Pydantic v1 spelling (v2 uses `min_length`) - confirm
+    # against the installed Pydantic version.
+    documents: List[str] = Field(..., min_items=1, description="List of documents to rerank")
+    # Looked up via ModelManager.get_model(); an unknown or unloaded id raises ValueError there.
+    model_id: str = Field(..., description="Model identifier for reranking")
+    # Forwarded unchanged to the model's rerank(); SentenceTransformersReranker.rerank does not read it.
+    instruction: Optional[str] = Field(None, description="Optional instruction for reranking task")
+    # No lower bound is declared here, so zero and negative integers pass schema validation.
+    top_k: Optional[int] = Field(None, description="Maximum number of results to return")
+class RerankResult(BaseModel):
+    """
+    Single reranking result.
+    Attributes:
+        text: The document text
+        score: Relevance score from the reranking model
+        index: Original index of the document in input list
+    """
+    # Document text as scored; the /rerank handler strips surrounding whitespace before scoring.
+    text: str
+    # Value returned by the model's rerank(); the handler sorts results by it in descending order.
+    score: float
+    # Position of the document in the request's `documents` list (before blank entries are dropped).
+    index: int
+class RerankResponse(BaseModel):
+    """
+    Response model for document reranking.
+    Attributes:
+        results: List of reranked documents with scores
+        query: The original search query
+        model_id: Identifier of the model used
+        processing_time: Time taken to process the request
+        total_documents: Total number of input documents
+        returned_documents: Number of documents returned
+    """
+    # Sorted by score, highest first, and cut to `top_k` entries when the request sets it.
+    results: List[RerankResult]
+    # The request query with surrounding whitespace stripped.
+    query: str
+    # Echo of the request's model_id.
+    model_id: str
+    # Elapsed seconds covering model lookup, scoring, sorting, and truncation in the handler.
+    processing_time: float
+    # Length of the request's `documents` list, including entries that were blank and skipped.
+    total_documents: int
+    # Length of `results`.
+    returned_documents: int
+# -------------------------
+# Model Management
+# -------------------------
+class RerankerModel:
+    """Shared state and method contract for every reranker backend."""
+    def __init__(self, model_id: str, model_name: str, model_type: str):
+        # Identity of this backend exactly as supplied by the caller.
+        self.model_id, self.model_name, self.model_type = model_id, model_name, model_type
+        # Nothing is loaded at construction time; load() is expected to fill these in.
+        self.model = self.tokenizer = None
+        self.loaded = False
+    def load(self):
+        """Load the underlying model; concrete subclasses must override this."""
+        raise NotImplementedError()
+    def rerank(self, query: str, documents: List[str], instruction: Optional[str] = None) -> List[float]:
+        """Score `documents` against `query`; concrete subclasses must override this."""
+        raise NotImplementedError()
+class SentenceTransformersReranker(RerankerModel):
+    """Reranker backed by a sentence-transformers CrossEncoder."""
+    def load(self):
+        """Build the CrossEncoder for `model_name` and flag this instance as loaded.
+        Any failure is logged and re-raised to the caller.
+        """
+        try:
+            logger.info(f"Loading SentenceTransformers model: {self.model_name}")
+            encoder = CrossEncoder(
+                self.model_name,
+                model_kwargs={"torch_dtype": "auto"},
+                trust_remote_code=True
+            )
+            self.model = encoder
+            self.loaded = True
+            logger.success(f"Successfully loaded {self.model_id}")
+        except Exception as exc:
+            logger.error(f"Failed to load {self.model_id}: {exc}")
+            raise
+    def rerank(self, query: str, documents: List[str], instruction: Optional[str] = None) -> List[float]:
+        """Return one score per document, aligned with the order of `documents`.
+        `instruction` is accepted for interface parity and is not used here.
+        Raises RuntimeError when called before load(); other failures are logged and re-raised.
+        """
+        if not self.loaded:
+            raise RuntimeError(f"Model {self.model_id} not loaded")
+        try:
+            ranked_entries = self.model.rank(query, documents, convert_to_tensor=True)
+            # Each entry carries a 'corpus_id' and a 'score'; write the score back at that
+            # position so the output lines up with the input list. Unfilled slots stay 0.0.
+            aligned_scores = [0.0] * len(documents)
+            for entry in ranked_entries:
+                position = entry['corpus_id']
+                aligned_scores[position] = float(entry['score'])
+            return aligned_scores
+        except Exception as exc:
+            logger.error(f"Reranking failed with {self.model_id}: {exc}")
+            raise
+class QwenReranker(RerankerModel):
+    """Reranker using Qwen3-Reranker model."""
+    # Scores each (query, document) pair by prompting a causal LM for a "yes"/"no" answer and
+    # returning the probability mass on "yes" relative to "no" at the final position.
+    def load(self):
+        """Load Qwen reranker model."""
+        try:
+            logger.info(f"Loading Qwen model: {self.model_name}")
+            # Left padding keeps the last position of every padded row on a real token, which is
+            # the position _compute_logits reads (logits[:, -1, :]).
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.model_name,
+                padding_side='left'
+            )
+            self.model = AutoModelForCausalLM.from_pretrained(
+                self.model_name
+            ).eval()
+            # Set up Qwen-specific tokens
+            # Vocabulary ids of the two answer tokens whose logits are compared.
+            self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
+            self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
+            # Total token budget per sequence, including the prefix and suffix added below.
+            self.max_length = 8192
+            # Set up prompt templates
+            self.prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
+            self.suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            # Tokenized once here and spliced around every input in _process_inputs.
+            self.prefix_tokens = self.tokenizer.encode(self.prefix, add_special_tokens=False)
+            self.suffix_tokens = self.tokenizer.encode(self.suffix, add_special_tokens=False)
+            self.loaded = True
+            logger.success(f"Successfully loaded {self.model_id}")
+        except Exception as e:
+            # Log for visibility, then let the caller decide how to handle the failure.
+            logger.error(f"Failed to load {self.model_id}: {e}")
+            raise
+    def _format_instruction(self, instruction: str, query: str, doc: str) -> str:
+        """Format instruction for Qwen model."""
+        # `instruction` may be None despite the `str` annotation; a default task text is used then.
+        if instruction is None:
+            instruction = 'Given a web search query, retrieve relevant passages that answer the query'
+        return "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(
+            instruction=instruction, query=query, doc=doc
+        )
+    def _process_inputs(self, pairs: List[str]):
+        """Process input pairs for Qwen model."""
+        # Step 1: tokenize without padding, truncating so prefix + body + suffix fits max_length.
+        inputs = self.tokenizer(
+            pairs,
+            padding=False,
+            truncation='longest_first',
+            return_attention_mask=False,
+            max_length=self.max_length - len(self.prefix_tokens) - len(self.suffix_tokens)
+        )
+        # Step 2: wrap every token sequence in the fixed chat-template prefix and suffix.
+        for i, ele in enumerate(inputs['input_ids']):
+            inputs['input_ids'][i] = self.prefix_tokens + ele + self.suffix_tokens
+        # Step 3: pad the whole batch together and convert to PyTorch tensors.
+        inputs = self.tokenizer.pad(
+            inputs,
+            padding=True,
+            return_tensors="pt",
+            max_length=self.max_length
+        )
+        # Step 4: move every tensor onto the device the model lives on.
+        for key in inputs:
+            inputs[key] = inputs[key].to(self.model.device)
+        return inputs
+    @torch.no_grad()
+    def _compute_logits(self, inputs):
+        """Compute relevance scores from model logits."""
+        # Logits at the last sequence position of each row.
+        batch_scores = self.model(**inputs).logits[:, -1, :]
+        true_vector = batch_scores[:, self.token_true_id]
+        false_vector = batch_scores[:, self.token_false_id]
+        # Column 0 = "no", column 1 = "yes"; normalise over just these two tokens.
+        batch_scores = torch.stack([false_vector, true_vector], dim=1)
+        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
+        # exp(log_softmax) gives the two-way probability of "yes" as a plain Python list.
+        scores = batch_scores[:, 1].exp().tolist()
+        return scores
+    def rerank(self, query: str, documents: List[str], instruction: Optional[str] = None) -> List[float]:
+        """Rerank documents using Qwen model."""
+        # Raises RuntimeError when called before load(); other failures are logged and re-raised.
+        if not self.loaded:
+            raise RuntimeError(f"Model {self.model_id} not loaded")
+        try:
+            # Format instruction pairs
+            pairs = [
+                self._format_instruction(instruction, query, doc)
+                for doc in documents
+            ]
+            # Process inputs
+            # All documents are tokenized and scored together as a single batch.
+            inputs = self._process_inputs(pairs)
+            # Compute scores
+            scores = self._compute_logits(inputs)
+            return scores
+        except Exception as e:
+            logger.error(f"Reranking failed with {self.model_id}: {e}")
+            raise
+class ModelManager:
+    """Registry that constructs, loads, and hands out the configured rerankers."""
+    def __init__(self):
+        # Filled by preload_all_models(): public model id -> loaded wrapper instance.
+        self.models: Dict[str, RerankerModel] = {}
+        # Static catalogue: public model id -> checkpoint name, backend type, description.
+        self.model_configs = {
+            "jina-reranker-v2": {
+                "model_name": "jinaai/jina-reranker-v2-base-multilingual",
+                "model_type": "sentence_transformers",
+                "description": "Multilingual reranker from Jina AI"
+            },
+            "bge-reranker-v2": {
+                "model_name": "BAAI/bge-reranker-v2-m3",
+                "model_type": "sentence_transformers",
+                "description": "BGE multilingual reranker"
+            },
+            "qwen3-reranker": {
+                "model_name": "Qwen/Qwen3-Reranker-0.6B",
+                "model_type": "qwen",
+                "description": "Qwen3 instruction-based reranker"
+            }
+        }
+    async def preload_all_models(self):
+        """Construct and load every configured model.
+        A failure for one model is logged and does not stop the remaining models from loading.
+        """
+        logger.info(f"Starting preload of {len(self.model_configs)} reranking models...")
+        # Backend type string -> wrapper class that implements it.
+        backend_classes = {
+            "sentence_transformers": SentenceTransformersReranker,
+            "qwen": QwenReranker,
+        }
+        for model_id, config in self.model_configs.items():
+            try:
+                logger.info(f"Loading {model_id}...")
+                backend_cls = backend_classes.get(config["model_type"])
+                if backend_cls is None:
+                    logger.error(f"Unknown model type: {config['model_type']}")
+                    continue
+                reranker = backend_cls(
+                    model_id=model_id,
+                    model_name=config["model_name"],
+                    model_type=config["model_type"]
+                )
+                reranker.load()
+                # Registered only after load() returned without raising.
+                self.models[model_id] = reranker
+                logger.success(f"Successfully preloaded {model_id}")
+            except Exception as exc:
+                logger.error(f"Failed to preload {model_id}: {exc}")
+        loaded_count = sum(1 for entry in self.models.values() if entry.loaded)
+        logger.success(f"Preloaded {loaded_count}/{len(self.model_configs)} models successfully")
+    def get_model(self, model_id: str) -> RerankerModel:
+        """Return the loaded model registered under `model_id`.
+        Raises ValueError when the id is not registered or its model is not loaded.
+        """
+        if model_id not in self.models:
+            raise ValueError(f"Model {model_id} not found")
+        candidate = self.models[model_id]
+        if candidate.loaded:
+            return candidate
+        raise ValueError(f"Model {model_id} not loaded")
+    def list_models(self) -> List[Dict[str, Any]]:
+        """Describe every configured model, including whether it is currently loaded."""
+        def describe(model_id, config):
+            # A configured model that never made it into self.models reports loaded=False.
+            registered = self.models.get(model_id)
+            return {
+                "id": model_id,
+                "name": config["model_name"],
+                "type": config["model_type"],
+                "description": config["description"],
+                "loaded": registered.loaded if registered else False
+            }
+        return [describe(model_id, config) for model_id, config in self.model_configs.items()]
+# -------------------------
+# Application Setup
+# -------------------------
+# Module-level handle shared by the endpoints; stays None until lifespan() has run its startup phase.
+model_manager = None
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Application lifespan manager with model preloading."""
+    # Rebinds the module-level `model_manager` so the endpoint functions can reach it.
+    global model_manager
+    # Startup
+    logger.info("Starting reranking API...")
+    try:
+        model_manager = ModelManager()
+        # Per-model load failures are logged inside preload_all_models() and do not raise here.
+        await model_manager.preload_all_models()
+        logger.success("Reranking API startup complete!")
+    except Exception as e:
+        # Anything that does escape is logged and re-raised to the caller.
+        logger.error(f"Failed to initialize models: {e}")
+        raise
+    # Code before this point runs at startup; code after it runs at shutdown.
+    yield
+    # Shutdown
+    logger.info("Shutting down reranking API...")
+# Application object; `lifespan` (above) is attached so models are preloaded before requests are served.
+app = FastAPI(
+    title="Multi-Model Reranking API",
+    description="""
+High-performance API for document reranking using multiple state-of-the-art models.
+✅ **Supported Models:**
+- **Jina Reranker V2**: Multilingual reranker optimized for search
+- **BGE Reranker V2**: High-performance multilingual reranking
+- **Qwen3 Reranker**: Instruction-based reranking with reasoning
+🚀 **Features:**
+- Multiple reranking models preloaded at startup
+- Batch document reranking with relevance scoring
+- Optional instruction-based reranking (Qwen3)
+- Comprehensive performance metrics
+- Zero cold start delay
+📊 **Input/Output:**
+- Input: Query + documents + optional instruction
+- Output: Ranked documents with relevance scores
+    """,
+    version="1.0.0",
+    lifespan=lifespan
+)
+# -------------------------
+# API Endpoints
+# -------------------------
+@app.post("/rerank", response_model=RerankResponse, tags=["Reranking"])
+async def rerank_documents(request: RerankRequest):
+    """
+    Rerank documents based on relevance to query.
+    This endpoint takes a query and list of documents, then returns them
+    ranked by relevance using the specified reranking model.
+    Args:
+        request: RerankRequest containing query, documents, and model info
+    Returns:
+        RerankResponse with ranked documents, scores, and metadata
+    Raises:
+        HTTPException 400: empty query, no non-blank documents, negative top_k,
+            or an unknown / unloaded model_id
+        HTTPException 500: any other failure while scoring
+    Example:
+        ```json
+        {
+            "query": "machine learning algorithms",
+            "documents": [
+                "Deep learning uses neural networks",
+                "Weather forecast for tomorrow",
+                "Supervised learning with labeled data"
+            ],
+            "model_id": "jina-reranker-v2"
+        }
+        ```
+    """
+    # Input checks stay outside the try block below: HTTPException is an Exception subclass,
+    # so raising it inside would be re-wrapped as a 500 by the generic handler.
+    query = request.query.strip()
+    if not query:
+        raise HTTPException(400, "Query cannot be empty")
+    if not request.documents:
+        raise HTTPException(400, "Documents list cannot be empty")
+    # A negative top_k used as a slice bound would silently drop the lowest-ranked tail
+    # instead of limiting the result count, so reject it explicitly.
+    if request.top_k is not None and request.top_k < 0:
+        raise HTTPException(400, "top_k must be a non-negative integer")
+    # Filter out empty documents, remembering each survivor's position in the request
+    valid_docs = [(i, doc.strip()) for i, doc in enumerate(request.documents) if doc.strip()]
+    if not valid_docs:
+        raise HTTPException(400, "No valid documents found after filtering empty strings")
+    try:
+        # perf_counter is monotonic, so the measured duration is immune to wall-clock adjustments
+        start_time = time.perf_counter()
+        # Get model (raises ValueError for unknown or unloaded ids)
+        model = model_manager.get_model(request.model_id)
+        # Extract valid documents and their indices
+        original_indices, documents = zip(*valid_docs)
+        # Perform reranking
+        # NOTE(review): this is a synchronous call inside an async handler - confirm that
+        # blocking the event loop for the duration of inference is acceptable.
+        scores = model.rerank(
+            query=query,
+            documents=list(documents),
+            instruction=request.instruction
+        )
+        # Pair every score with its document and original request index
+        results = [
+            RerankResult(text=doc, score=score, index=orig_idx)
+            for orig_idx, doc, score in zip(original_indices, documents, scores)
+        ]
+        # Sort by score (descending)
+        results.sort(key=lambda x: x.score, reverse=True)
+        # Apply top_k limit if specified; None and 0 both mean "no limit"
+        if request.top_k:
+            results = results[:request.top_k]
+        processing_time = time.perf_counter() - start_time
+        logger.info(
+            f"Reranked {len(documents)} documents in {processing_time:.3f}s "
+            f"using {request.model_id}"
+        )
+        return RerankResponse(
+            results=results,
+            query=query,
+            model_id=request.model_id,
+            processing_time=processing_time,
+            total_documents=len(request.documents),
+            returned_documents=len(results)
+        )
+    except ValueError as e:
+        raise HTTPException(400, str(e))
+    except Exception as e:
+        logger.error(f"Reranking failed: {e}")
+        raise HTTPException(500, f"Reranking failed: {str(e)}")
+@app.get("/models", tags=["Models"])
+async def list_models():
+    """
+    List all available reranking models.
+    Returns information about all configured models including their
+    loading status and capabilities.
+    Returns:
+        List of model information dictionaries
+    """
+    # Thin wrapper over ModelManager.list_models(); each entry has the keys
+    # "id", "name", "type", "description", and "loaded".
+    try:
+        return model_manager.list_models()
+    except Exception as e:
+        # Any failure (including model_manager still being None) is reported as a 500.
+        logger.error(f"Failed to list models: {e}")
+        raise HTTPException(500, str(e))
+@app.get("/health", tags=["Monitoring"])
+async def health_check():
+    """
+    Check API health and model status.
+    Returns comprehensive health information including model loading
+    status and system metrics.
+    Returns:
+        Health status dictionary
+    """
+    try:
+        all_models = model_manager.list_models()
+        # Ids of the models that report loaded=True, in configuration order.
+        ready_ids = [info['id'] for info in all_models if info['loaded']]
+        payload = {
+            "status": "ok",
+            "total_models": len(all_models),
+            "loaded_models": len(ready_ids),
+            "available_models": ready_ids,
+            "models_info": all_models
+        }
+        return payload
+    except Exception as exc:
+        # Failures are reported in the response body rather than raised.
+        logger.error(f"Health check failed: {exc}")
+        failure = {
+            "status": "error",
+            "error": str(exc)
+        }
+        return failure

requirements.txt CHANGED Viewed

@@ -1,2 +1,6 @@
-fastapi
-uvicorn[standard]

+fastapi==0.116.2
+uvicorn[standard]==0.35.0
+torch==2.8.0
+sentence-transformers==5.1.1
+loguru==0.7.3
+einops==0.8.1