Spaces:

HebaElshimy
/

systematic-reviews

Sleeping

App Files Files Community

HebaElshimy committed on May 25

Commit

abcb2f1

verified ·

1 Parent(s): 95daf43

Upload 2 files

Browse files

Files changed (2) hide show

app.py +212 -45
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -2,19 +2,29 @@ import gradio as gr
 import pandas as pd
 import requests
 import json
-from transformers import pipeline
 import time
 from typing import List, Dict, Tuple
 import re
-# Initialize the classification pipeline using a free, open-source model
-# Using a biomedical text classification model that works well for research papers
 classifier = pipeline(
-    "text-classification",
-    model="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
-    return_all_scores=True
 )
 def parse_csv_file(file) -> pd.DataFrame:
     """Parse uploaded CSV file and return DataFrame"""
     try:
@@ -79,53 +89,210 @@ Reasoning: [Provide specific reasons based on the criteria]
 """
     return prompt
-def classify_single_study(title: str, abstract: str, criteria: str) -> Dict:
-    """Classify a single study using the criteria"""
-    # Create the classification text
-    study_text = f"Title: {title}. Abstract: {abstract}"
-    # Simple keyword-based classification as backup
-    # This is a simplified approach - in practice you'd want more sophisticated NLP
-    include_keywords = []
-    exclude_keywords = []
-    # Parse criteria to extract keywords (simplified)
-    criteria_lines = criteria.lower().split('\n')
-    for line in criteria_lines:
         if 'include' in line and ':' in line:
-            keywords = line.split(':')[1].strip()
-            include_keywords.extend([kw.strip() for kw in keywords.split(',') if kw.strip()])
         elif 'exclude' in line and ':' in line:
-            keywords = line.split(':')[1].strip()
-            exclude_keywords.extend([kw.strip() for kw in keywords.split(',') if kw.strip()])
-    # Score based on keyword presence
-    study_text_lower = study_text.lower()
-    include_score = sum(1 for kw in include_keywords if kw in study_text_lower)
-    exclude_score = sum(1 for kw in exclude_keywords if kw in study_text_lower)
-    # Simple decision logic
-    if exclude_score > 0:
-        decision = "EXCLUDE"
-        confidence = min(80 + exclude_score * 5, 95)
-        reasoning = f"Found exclusion criteria: {', '.join([kw for kw in exclude_keywords if kw in study_text_lower])}"
-    elif include_score >= 1:
-        decision = "INCLUDE"
-        confidence = min(70 + include_score * 5, 90)
-        reasoning = f"Matches inclusion criteria: {', '.join([kw for kw in include_keywords if kw in study_text_lower])}"
-    else:
-        decision = "UNCLEAR"
-        confidence = 50
-        reasoning = "Insufficient information to make clear determination"
-    return {
-        'decision': decision,
-        'confidence': confidence,
-        'reasoning': reasoning
-    }
 def process_studies(file, title_col, abstract_col, criteria, sample_size):
     """Main processing function"""

 import pandas as pd
 import requests
 import json
+from transformers import pipeline, AutoTokenizer, AutoModel
+import torch
 import time
 from typing import List, Dict, Tuple
 import re
+from sentence_transformers import SentenceTransformer
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+# Load both models once at import time; the classification helpers below read
+# them as module-level globals.
+print("Loading models...")
+# Sentence-embedding model used by classify_with_semantic_similarity
+sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
+# Zero-shot NLI pipeline used by classify_with_zero_shot
 classifier = pipeline(
+    "zero-shot-classification",
+    model="facebook/bart-large-mnli"
 )
+print("Models loaded successfully!")
 def parse_csv_file(file) -> pd.DataFrame:
     """Parse uploaded CSV file and return DataFrame"""
     try:
 """
     return prompt
def parse_criteria(criteria_text: str) -> Dict[str, List[str]]:
    """Parse free-text inclusion/exclusion criteria into lists of terms.

    The text is lower-cased and read line by line:

    * A line with a colon whose part *before* the first colon mentions
      inclusion ("include", "inclusion", ...) or exclusion ("exclude",
      "exclusion", ...) opens that section. Anything after the first colon
      is split on commas into terms.
    * Inside a section, a bullet line (``-``, ``*`` or ``•``) contributes
      one term, kept whole.
    * Any other line inside a section is treated as a continuation and is
      split on commas.

    Args:
        criteria_text: Raw criteria as typed by the user. ``None`` or an
            empty string yields two empty lists.

    Returns:
        ``{'include': [...], 'exclude': [...]}``. Terms of two characters
        or fewer are dropped.
    """

    def split_terms(text: str) -> List[str]:
        # Comma-separated terms, ignoring empty pieces.
        return [piece.strip() for piece in text.split(',') if piece.strip()]

    def section_of(header: str):
        # Decide by the keyword that appears first in the header, so that
        # "exclude studies that include ..." is an exclusion header.
        hits = [
            (header.find(stem), name)
            for name, stem in (('include', 'inclu'), ('exclude', 'exclu'))
            if stem in header
        ]
        return min(hits)[1] if hits else None

    bullet_chars = '-*•'
    terms: Dict[str, List[str]] = {'include': [], 'exclude': []}
    current_section = None

    for raw_line in (criteria_text or '').lower().splitlines():
        line = raw_line.strip()
        if not line:
            continue

        # Split on the FIRST colon only, so terms such as "ratio 2:1"
        # survive intact.
        header, colon, rest = line.partition(':')
        section = section_of(header) if colon else None

        if section:
            current_section = section
            terms[section].extend(split_terms(rest))
        elif current_section is None:
            # Text before any section header has nowhere to go.
            continue
        elif line[0] in bullet_chars:
            term = line.lstrip(bullet_chars).strip()
            if term:
                terms[current_section].append(term)
        elif not line.startswith(('include', 'exclude')):
            # Continuation line of the current section.
            terms[current_section].extend(split_terms(line))

    # Filter very short terms; they are too ambiguous to match on.
    return {
        name: [term for term in found if len(term) > 2]
        for name, found in terms.items()
    }
def classify_with_semantic_similarity(title: str, abstract: str, criteria: Dict) -> Dict:
    """Classify a study by embedding similarity to the parsed criteria.

    The title and abstract are embedded with the module-level
    ``sentence_model`` and compared (cosine similarity) against every
    inclusion and exclusion term. The best score on each side drives the
    decision; exclusion is checked first and therefore wins ties.

    Args:
        title: Study title.
        abstract: Study abstract.
        criteria: Mapping with optional ``'include'`` and ``'exclude'``
            lists of terms, as produced by ``parse_criteria``.

    Returns:
        Dict with ``'decision'`` (``'INCLUDE'``, ``'EXCLUDE'`` or
        ``'UNCLEAR'``), integer ``'confidence'`` and a ``'reasoning'``
        string. Failures inside the model calls are reported as an
        ``'UNCLEAR'`` result rather than raised, so one bad row does not
        abort a batch.
    """
    study_text = f"{title} {abstract}".strip()
    if len(study_text) < 10:
        return {
            'decision': 'UNCLEAR',
            'confidence': 30,
            'reasoning': 'Insufficient text for analysis'
        }

    # Tolerate a missing key or a None value instead of raising KeyError.
    include_terms = criteria.get('include') or []
    exclude_terms = criteria.get('exclude') or []

    def best_match(study_embedding, terms):
        # Highest cosine similarity among `terms` and the term that earned it.
        similarities = cosine_similarity(study_embedding, sentence_model.encode(terms))[0]
        best_idx = int(np.argmax(similarities))
        return float(similarities[best_idx]), terms[best_idx]

    try:
        max_include_score, include_term = 0.0, None
        max_exclude_score, exclude_term = 0.0, None

        # Only embed the study when there is something to compare it with.
        if include_terms or exclude_terms:
            study_embedding = sentence_model.encode([study_text])
            if include_terms:
                max_include_score, include_term = best_match(study_embedding, include_terms)
            if exclude_terms:
                max_exclude_score, exclude_term = best_match(study_embedding, exclude_terms)

        if max_exclude_score > 0.4:  # Strong exclusion match
            decision = 'EXCLUDE'
            confidence = min(int(max_exclude_score * 100), 95)
            reasoning = f"Strong match with exclusion criteria. Similar to: '{exclude_term}'"
        elif max_include_score > 0.4:  # Strong inclusion match
            decision = 'INCLUDE'
            confidence = min(int(max_include_score * 100), 90)
            reasoning = f"Strong match with inclusion criteria. Similar to: '{include_term}'"
        elif max_include_score > 0.25:  # Moderate inclusion match
            decision = 'INCLUDE'
            # NOTE: for scores in (0.25, 0.4] this is at most 32, so the 75
            # cap never binds; kept for parity with the other branches.
            confidence = min(int(max_include_score * 80), 75)
            # Always name the matched criterion, including for scores in
            # (0.25, 0.3] where the reason used to be left empty.
            reasoning = f"Moderate match with inclusion criteria. Similar to: '{include_term}'"
        else:
            decision = 'UNCLEAR'
            confidence = 40
            reasoning = (
                f"No strong matches found. Best include: {max_include_score:.2f}, "
                f"Best exclude: {max_exclude_score:.2f}"
            )

        return {
            'decision': decision,
            'confidence': confidence,
            'reasoning': reasoning
        }
    except Exception as e:
        # Best-effort boundary: surface the failure in the result row.
        return {
            'decision': 'UNCLEAR',
            'confidence': 30,
            'reasoning': f'Error in semantic analysis: {str(e)}'
        }
def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Dict:
    """Classify a study with the zero-shot pipeline as a secondary method.

    The module-level ``classifier`` is asked to choose between an
    "included" and an "excluded" label, with the raw criteria text woven
    into the hypothesis template.

    Args:
        title: Study title.
        abstract: Study abstract.
        criteria_text: The user's criteria, inserted verbatim into the
            hypothesis.

    Returns:
        Dict with ``'decision'`` (``'INCLUDE'`` or ``'EXCLUDE'``), integer
        ``'confidence'`` (0-100) and ``'reasoning'``; or ``None`` when the
        study text is shorter than 10 characters or the pipeline call
        fails. Callers must handle ``None``.
    """
    study_text = f"{title} {abstract}".strip()
    if len(study_text) < 10:
        return None

    candidate_labels = [
        "should be included in systematic review",
        "should be excluded from systematic review",
    ]

    # The template's "{}" placeholder is filled in with each label, so any
    # literal braces in the user's criteria must be doubled; otherwise
    # formatting fails and the method silently yields no result.
    safe_criteria = str(criteria_text).replace('{', '{{').replace('}', '}}')
    hypothesis_template = "This study {}, based on the criteria: " + safe_criteria

    try:
        result = classifier(study_text, candidate_labels, hypothesis_template=hypothesis_template)
        top_label = result['labels'][0]
        top_score = result['scores'][0]
    except Exception:
        # Deliberately best-effort: this is a backup signal, so a failure
        # here means "no opinion", not an error for the whole batch.
        return None

    decision = 'INCLUDE' if 'included' in top_label else 'EXCLUDE'
    confidence = int(top_score * 100)
    return {
        'decision': decision,
        'confidence': confidence,
        'reasoning': f"Zero-shot classification: {top_label} (confidence: {confidence}%)"
    }
def classify_single_study(title: str, abstract: str, criteria_text: str) -> Dict:
    """Classify one study by combining semantic similarity and zero-shot.

    Precedence:

    1. No usable criteria -> ``UNCLEAR`` (confidence 20).
    2. Semantic-similarity confidence above 60 -> that result as-is. The
       zero-shot model is not run in this case, since its output would be
       discarded.
    3. Zero-shot confidence above 70 -> the zero-shot result.
    4. Semantic confidence above 40 -> the semantic decision, with the
       zero-shot reasoning appended when available.
    5. Otherwise -> ``UNCLEAR`` (confidence 35).

    Args:
        title: Study title.
        abstract: Study abstract.
        criteria_text: Raw inclusion/exclusion criteria text.

    Returns:
        Dict with ``'decision'``, ``'confidence'`` and ``'reasoning'``.
    """
    parsed_criteria = parse_criteria(criteria_text)
    if not (parsed_criteria['include'] or parsed_criteria['exclude']):
        return {
            'decision': 'UNCLEAR',
            'confidence': 20,
            'reasoning': 'No clear inclusion/exclusion criteria provided'
        }

    # Method 1: semantic similarity (takes priority when decisive).
    semantic_result = classify_with_semantic_similarity(title, abstract, parsed_criteria)
    if semantic_result['confidence'] > 60:
        return semantic_result

    # Method 2: zero-shot classification, consulted only when method 1 was
    # not decisive, which avoids an unnecessary model inference per study.
    zero_shot_result = classify_with_zero_shot(title, abstract, criteria_text)
    if zero_shot_result and zero_shot_result['confidence'] > 70:
        return zero_shot_result

    if semantic_result['confidence'] > 40:
        # Keep the semantic decision but show the second opinion, if any.
        combined_reasoning = semantic_result['reasoning']
        if zero_shot_result:
            combined_reasoning += f" | {zero_shot_result['reasoning']}"
        return {
            'decision': semantic_result['decision'],
            'confidence': semantic_result['confidence'],
            'reasoning': combined_reasoning
        }

    return {
        'decision': 'UNCLEAR',
        'confidence': 35,
        'reasoning': 'Low confidence from all classification methods'
    }
 def process_studies(file, title_col, abstract_col, criteria, sample_size):
     """Main processing function"""

requirements.txt CHANGED Viewed

@@ -4,3 +4,5 @@ transformers==4.36.2
 torch==2.1.2
 requests==2.31.0
 numpy==1.24.3

 torch==2.1.2
 requests==2.31.0
 numpy==1.24.3
+sentence-transformers==2.2.2
+scikit-learn==1.3.2