Spaces:

OrganizedProgrammers
/

DocFinder

Running

App Files Files Community

om4r932 committed on Jul 18

Commit

405abe1

1 Parent(s): a6af380

Add documentation + fix bugs

Browse files

Files changed (4) hide show

app.py +92 -10
classes.py +9 -0
documentation.md +48 -0
schemas.py +152 -25

app.py CHANGED Viewed

@@ -58,7 +58,7 @@ def get_tdoc_url(doc_id):
     for tdoc in tdoc_locations:
         if tdoc["doc_id"] == doc_id:
             return tdoc["url"]
-    return "Document not indexed (Re-index TDocs)"
 def get_spec_url(document):
     series = document.split(".")[0].zfill(2)
@@ -74,7 +74,33 @@ def get_document(spec_id: str, spec_title: str, source: str):
             text.extend([section['section'], section['content']])
     return text
-app = FastAPI(title="Document Finder Back-End", docs_url="/", description="Backend for DocFinder - Searching technical documents & specifications from 3GPP & ETSI")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -92,13 +118,35 @@ valid_3gpp_spec_format = re.compile(r'^\d{2}\.\d{3}(?:-\d+)?')
 valid_etsi_doc_format = re.compile(r'^(?:SET|SCP|SETTEC|SETREQ|SCPTEC|SCPREQ)\(\d+\)\d+(?:r\d+)?', flags=re.IGNORECASE)
 valid_etsi_spec_format = re.compile(r'^\d{3} \d{3}(?:-\d+)?')
-@app.post("/find", response_model=DocResponse)
 def find_document(request: DocRequest):
     start_time = time.time()
     document = request.doc_id
-    source = request.source
-    spec_metadatas = spec_metadatas_3gpp if source == "3GPP" else spec_metadatas_etsi if source == "ETSI" else spec_metadatas_3gpp + spec_metadatas_etsi
-    is_3gpp = valid_3gpp_doc_format.match(document) or valid_3gpp_spec_format.match(document)
     url = get_tdoc_url(document) if valid_3gpp_doc_format.match(document) else \
         get_spec_url(document) if valid_3gpp_spec_format.match(document) else \
@@ -108,9 +156,10 @@ def find_document(request: DocRequest):
         raise HTTPException(status_code=404, detail=url)
     version = None
-    if is_3gpp:
         version = url.split("/")[-1].replace(".zip", "").split("-")[-1]
     scope = None
     for spec in spec_metadatas:
         if spec['id'] == document:
             scope = spec['scope']
@@ -124,7 +173,23 @@ def find_document(request: DocRequest):
         scope=scope
     )
-@app.post("/batch", response_model=BatchDocResponse)
 def find_document_batch(request: BatchDocRequest):
     start_time = time.time()
     documents = request.doc_ids
@@ -148,7 +213,17 @@ def find_document_batch(request: BatchDocRequest):
         search_time=time.time()-start_time
     )
-@app.post('/search-spec', response_model=KeywordResponse)
 def search_specifications(request: KeywordRequest):
     start_time = time.time()
     boolSensitiveCase = request.case_sensitive
@@ -215,7 +290,14 @@ def search_specifications(request: KeywordRequest):
     else:
         raise HTTPException(status_code=404, detail="Specifications not found")
-@app.post("/search-spec/experimental", response_model=KeywordResponse)
 def bm25_search_specification(request: BM25KeywordRequest):
     start_time = time.time()
     source = request.source

     for tdoc in tdoc_locations:
         if tdoc["doc_id"] == doc_id:
             return tdoc["url"]
+    return "Document not indexed (re-indexing documents ?)"
 def get_spec_url(document):
     series = document.split(".")[0].zfill(2)
             text.extend([section['section'], section['content']])
     return text
+tags_metadata = [
+    {
+        "name": "Document Retrieval",
+        "description": """
+        Direct document lookup operations for retrieving specific documents by their unique identifiers.
+        These endpoints provide fast access to document URLs, versions, and metadata without requiring keyword searches.
+        Perfect for when you know the exact document ID you're looking for.
+        """,
+    },
+    {
+        "name": "Content Search",
+        "description": """
+        Advanced search operations for finding documents based on keywords and content matching.
+        Includes both quick metadata-based searches and deep content analysis with flexible filtering options.
+        Supports different search modes and logical operators for precise results.
+        """,
+    },
+]
+app = FastAPI(
+    title="3GPP & ETSI Document Finder API",
+    description=open('documentation.md').read(),
+    openapi_tags=tags_metadata
+)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
 valid_etsi_doc_format = re.compile(r'^(?:SET|SCP|SETTEC|SETREQ|SCPTEC|SCPREQ)\(\d+\)\d+(?:r\d+)?', flags=re.IGNORECASE)
 valid_etsi_spec_format = re.compile(r'^\d{3} \d{3}(?:-\d+)?')
+@app.post("/find/single", response_model=DocResponse, tags=["Document Retrieval"], summary="Retrieve a single document by ID", responses={
+             200: {
+                 "description": "Document found successfully",
+                 "content": {
+                     "application/json": {
+                         "example": {
+                             "doc_id": "23.401",
+                             "url": "https://www.3gpp.org/ftp/Specs/archive/23_series/23.401/23401-h20.zip",
+                             "version": "h20",
+                             "scope": "General Packet Radio Service (GPRS) enhancements for Evolved Universal Terrestrial Radio Access Network (E-UTRAN) access",
+                             "search_time": 0.0234
+                         }
+                     }
+                 }
+             },
+             404: {
+                 "description": "Document not found or not indexed",
+                 "content": {
+                     "application/json": {
+                         "example": {
+                             "detail": "Specification 99.999 not found"
+                         }
+                     }
+                 }
+             }
+         })
 def find_document(request: DocRequest):
     start_time = time.time()
     document = request.doc_id
     url = get_tdoc_url(document) if valid_3gpp_doc_format.match(document) else \
         get_spec_url(document) if valid_3gpp_spec_format.match(document) else \
         raise HTTPException(status_code=404, detail=url)
     version = None
+    if valid_3gpp_spec_format.match(document):
         version = url.split("/")[-1].replace(".zip", "").split("-")[-1]
     scope = None
+    spec_metadatas = spec_metadatas_3gpp if valid_3gpp_spec_format.match(document) else spec_metadatas_etsi
     for spec in spec_metadatas:
         if spec['id'] == document:
             scope = spec['scope']
         scope=scope
     )
+@app.post("/find/batch", response_model=BatchDocResponse, summary="Retrieve multiple documents by IDs", tags=["Document Retrieval"], responses={
+             200: {
+                 "description": "Batch processing completed",
+                 "content": {
+                     "application/json": {
+                         "example": {
+                             "results": {
+                                 "23.401": "https://www.3gpp.org/ftp/Specs/archive/23_series/23.401/23401-h20.zip",
+                                 "S1-123456": "https://www.3gpp.org/ftp/tsg_sa/WG1_Serv/TSGSI_123/Docs/S1-123456.zip"
+                             },
+                             "missing": ["99.999", "INVALID-DOC"],
+                             "search_time": 0.156
+                         }
+                     }
+                 }
+             }
+         })
 def find_document_batch(request: BatchDocRequest):
     start_time = time.time()
     documents = request.doc_ids
         search_time=time.time()-start_time
     )
+@app.post('/search', response_model=KeywordResponse, tags=["Content Search"], summary="Search specifications by keywords", responses={
+             200: {
+                 "description": "Search completed successfully"
+             },
+             400: {
+                 "description": "You must enter keywords in deep search mode"
+             },
+             404: {
+                 "description": "No specifications found matching the criteria"
+             }
+         })
 def search_specifications(request: KeywordRequest):
     start_time = time.time()
     boolSensitiveCase = request.case_sensitive
     else:
         raise HTTPException(status_code=404, detail="Specifications not found")
+@app.post("/search/bm25", response_model=KeywordResponse, tags=["Content Search"], summary="Advanced BM25 search with relevance scoring", responses={
+             200: {
+                 "description": "BM25 search completed successfully"
+             },
+             404: {
+                 "description": "No specifications found above the relevance threshold"
+             }
+         })
 def bm25_search_specification(request: BM25KeywordRequest):
     start_time = time.time()
     source = request.source

classes.py CHANGED Viewed

@@ -59,6 +59,7 @@ class ETSIDocFinder:
 class ETSISpecFinder:
     def __init__(self):
         self.main_url = "https://www.etsi.org/deliver/etsi_ts"
         self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}
     def get_spec_path(self, doc_id: str):
@@ -89,12 +90,20 @@ class ETSISpecFinder:
         original = doc_id
         url = f"{self.main_url}/{self.get_spec_path(original)}/"
         print(url)
         releases = self.get_docs_from_url(url)
         files = self.get_docs_from_url(url + releases[-1])
         for f in files:
             if f.endswith(".pdf"):
                 return url + releases[-1] + "/" + f
         return f"Specification {doc_id} not found"

 class ETSISpecFinder:
     def __init__(self):
         self.main_url = "https://www.etsi.org/deliver/etsi_ts"
+        self.second_url = "https://www.etsi.org/deliver/etsi_tr"
         self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"}
     def get_spec_path(self, doc_id: str):
         original = doc_id
         url = f"{self.main_url}/{self.get_spec_path(original)}/"
+        url2 = f"{self.second_url}/{self.get_spec_path(original)}/"
         print(url)
+        print(url2)
         releases = self.get_docs_from_url(url)
         files = self.get_docs_from_url(url + releases[-1])
         for f in files:
             if f.endswith(".pdf"):
                 return url + releases[-1] + "/" + f
+        releases = self.get_docs_from_url(url2)
+        files = self.get_docs_from_url(url + releases[-1])
+        for f in files:
+            if f.endswith('.pdf'):
+                return url + releases[-1] + "/" + f
         return f"Specification {doc_id} not found"

documentation.md ADDED Viewed

	@@ -0,0 +1,48 @@

+# 📋 Document Finder Backend API
+A comprehensive REST API for searching and retrieving technical documents and specifications from the **3GPP** and **ETSI** organizations.
+### 🚀 Key Features
+* **Document Retrieval**: Get direct download URLs and metadata for specific documents
+* **Batch Processing**: Handle multiple document requests simultaneously
+* **Advanced Search**: Multiple search modes with keyword matching
+* **BM25 Scoring**: State-of-the-art relevance ranking using the BM25 algorithm
+* **Cross-Organization**: Search across both 3GPP and ETSI document repositories
+### 📚 Supported Document Types
+#### 3GPP Documents
+* **TDocs (Technical Documents)**:
+  - Format: `S1-123456`, `C4-234567`, `R2-345678`
+  - Working group documents from SA, CT, RAN groups
+* **Technical Specifications**:
+  - Format: `23.401`, `38.331-16`
+  - Official published specifications
+#### ETSI Documents
+* **TDocs (Technical Documents)**:
+  - Format: `SET(25)000001`, `SCPTEC(19)000011`
+  - Committee working documents
+* **Technical Specifications**:
+  - Format: `131 102`, `188 008-2`
+  - Published ETSI standards
+### 🔍 Search Capabilities
+* **Quick Search**: Lightning-fast metadata-only search
+* **Deep Search**: Comprehensive content-based search within document sections
+* **BM25 Search**: Advanced relevance scoring with normalization
+* **Flexible Filtering**: By source organization, document type, and specification category
+### 🛡️ Data Sources
+This API indexes and searches through:
+- 3GPP specification metadata and content
+- ETSI specification metadata and content
+- 3GPP TDoc location mappings
+- Pre-built BM25 search indices
+### 🔧 Technical Stack
+Built with FastAPI, featuring automatic OpenAPI documentation, request validation, and comprehensive error handling.

schemas.py CHANGED Viewed

@@ -1,38 +1,165 @@
-from pydantic import BaseModel
 from typing import *
 class DocRequest(BaseModel):
-    doc_id: str
 class DocResponse(BaseModel):
-    doc_id: str
-    url: str
-    version: Optional[str] = None
-    scope: Optional[str] = None
-    search_time: float
 class BatchDocRequest(BaseModel):
-    doc_ids: List[str]
 class BatchDocResponse(BaseModel):
-    results: Dict[str, str]
-    missing: List[str]
-    search_time: float
-class BM25KeywordRequest(BaseModel):
-    keywords: Optional[str] = ""
-    source: Optional[Literal["3GPP", "ETSI", "all"]] = "all"
-    threshold: Optional[int] = 60
-    spec_type: Optional[Literal["TS", "TR"]] = None
 class KeywordRequest(BaseModel):
-    keywords: Optional[str] = ""
-    search_mode: Literal["quick", "deep"]
-    case_sensitive: Optional[bool] = False
-    source: Optional[Literal["3GPP", "ETSI", "all"]] = "all"
-    spec_type: Optional[Literal["TS", "TR"]] = None
-    mode: Optional[Literal["and", "or"]] = "and"
 class KeywordResponse(BaseModel):
-    results: List[Dict[str, Any]]
-    search_time: float

+from pydantic import BaseModel, Field
 from typing import *
 class DocRequest(BaseModel):
+    """
+    Request model for single document retrieval.
+    Used to specify which document or specification to retrieve by its unique identifier.
+    """
+    doc_id: str = Field(
+        ...,
+        title="Document Identifier",
+        description="Unique identifier for the document or specification.",
+    )
 class DocResponse(BaseModel):
+    """
+    Response model for single document retrieval.
+    Contains all available metadata and access information for the requested document.
+    """
+    doc_id: str = Field(
+        ...,
+        title="Document Identifier",
+        description="Echoed document identifier from the request"
+    )
+    url: str = Field(
+        ...,
+        title="Document URL",
+        description="Direct download URL"
+    )
+    version: Optional[str] = Field(
+        None,
+        title="Document Version",
+        description="Extracted version information (e.g., 'h20', 'v17.9.0') when available"
+    )
+    scope: Optional[str] = Field(
+        None,
+        title="Document Scope",
+        description="Brief description of the document's scope and purpose from metadata"
+    )
+    search_time: float = Field(
+        ...,
+        title="Search Duration",
+        description="Time spent processing the request in seconds"
+    )
 class BatchDocRequest(BaseModel):
+    """
+    Request model for batch document retrieval.
+    Allows retrieval of multiple documents in a single API call for efficiency.
+    """
+    doc_ids: List[str] = Field(
+        ...,
+        title="Document Identifier List",
+        description="List of document identifiers to retrieve."
+    )
 class BatchDocResponse(BaseModel):
+    """
+    Response model for batch document retrieval.
+    Provides organized results separating found documents from missing ones.
+    """
+    results: Dict[str, str] = Field(
+        ...,
+        title="Found Documents",
+        description="Dictionary mapping document IDs to their corresponding URLs"
+    )
+    missing: List[str] = Field(
+        ...,
+        title="Missing Documents",
+        description="List of document IDs that could not be found or are not indexed"
+    )
+    search_time: float = Field(
+        ...,
+        title="Total Search Duration",
+        description="Total time spent processing the batch request in seconds"
+    )
 class KeywordRequest(BaseModel):
+    """
+    Request model for keyword-based specification search.
+    Provides flexible search options with multiple modes and filtering capabilities.
+    """
+    keywords: Optional[str] = Field(
+        "",
+        title="Search Keywords",
+        description="Comma-separated keywords for searching specifications.",
+        examples=["5G NR,authentication", "handover,mobility", "security,encryption"]
+    )
+    search_mode: Literal["quick", "deep"] = Field(
+        ...,
+        title="Search Mode",
+        description="Search mode: 'quick' searches metadata only, 'deep' searches metadata and document content"
+    )
+    case_sensitive: Optional[bool] = Field(
+        False,
+        title="Case Sensitive Search",
+        description="Enable case-sensitive keyword matching"
+    )
+    source: Optional[Literal["3GPP", "ETSI", "all"]] = Field(
+        "all",
+        title="Document Source",
+        description="Limit search to specific organization or search all repositories"
+    )
+    spec_type: Optional[Literal["TS", "TR"]] = Field(
+        None,
+        title="Specification Type",
+        description="Filter by specification type: 'TS' (Technical Specification) or 'TR' (Technical Report)"
+    )
+    mode: Optional[Literal["and", "or"]] = Field(
+        "and",
+        title="Search Logic",
+        description="Logical operator: 'and' requires all keywords to match, 'or' matches any keyword"
+    )
+class BM25KeywordRequest(BaseModel):
+    """
+    Request model for BM25 advanced search.
+    Provides parameters for relevance-based search using BM25 scoring algorithm.
+    """
+    keywords: Optional[str] = Field(
+        "",
+        title="Search Query",
+        description="Natural language search query for BM25 processing",
+        examples=["5G authentication procedures", "handover mobility management", "security key derivation"]
+    )
+    source: Optional[Literal["3GPP", "ETSI", "all"]] = Field(
+        "all",
+        title="Document Source",
+        description="Limit search to specific organization repositories"
+    )
+    threshold: Optional[int] = Field(
+        60,
+        title="Relevance Threshold",
+        description="Minimum normalized BM25 relevance score (0-100) for results inclusion",
+        ge=0,
+        le=100
+    )
+    spec_type: Optional[Literal["TS", "TR"]] = Field(
+        None,
+        title="Specification Type",
+        description="Filter results by specification type"
+    )
 class KeywordResponse(BaseModel):
+    """
+    Response model for keyword and BM25 search results.
+    Contains ranked search results with metadata and timing information.
+    """
+    results: List[Dict[str, Any]] = Field(
+        ...,
+        title="Search Results",
+        description="List of matching specifications with complete metadata. In deep search mode, includes 'contains' field with matching content sections."
+    )
+    search_time: float = Field(
+        ...,
+        title="Search Duration",
+        description="Time spent processing the search request in seconds"
+    )