HebaElshimy committed
Commit abcb2f1 · verified · 1 Parent(s): 95daf43

Upload 2 files

Files changed (2):
  1. app.py +212 -45
  2. requirements.txt +2 -0
app.py CHANGED
@@ -2,19 +2,29 @@ import gradio as gr
 import pandas as pd
 import requests
 import json
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer, AutoModel
+import torch
 import time
 from typing import List, Dict, Tuple
 import re
+from sentence_transformers import SentenceTransformer
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity

-# Initialize the classification pipeline using a free, open-source model
-# Using a biomedical text classification model that works well for research papers
+# Initialize multiple models for different approaches
+print("Loading models...")
+
+# For semantic similarity matching
+sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
+
+# For zero-shot classification
 classifier = pipeline(
-    "text-classification",
-    model="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
-    return_all_scores=True
+    "zero-shot-classification",
+    model="facebook/bart-large-mnli"
 )

+print("Models loaded successfully!")
+
 def parse_csv_file(file) -> pd.DataFrame:
     """Parse uploaded CSV file and return DataFrame"""
     try:
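Note: this hunk swaps the PubMedBERT text-classification pipeline for two general-purpose models. A minimal sketch of how the two replacements behave (not part of the commit; the sample strings are invented for illustration):

from sentence_transformers import SentenceTransformer
from transformers import pipeline

sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# all-MiniLM-L6-v2 maps each input string to a 384-dimensional vector
embedding = sentence_model.encode(["randomized controlled trial of aspirin"])
print(embedding.shape)  # (1, 384)

# the zero-shot pipeline ranks candidate labels against the text via NLI
result = classifier(
    "A randomized controlled trial of aspirin in adults",
    ["should be included in systematic review",
     "should be excluded from systematic review"],
)
print(result["labels"][0], round(result["scores"][0], 2))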
@@ -79,53 +89,210 @@ Reasoning: [Provide specific reasons based on the criteria]
 """
     return prompt

-def classify_single_study(title: str, abstract: str, criteria: str) -> Dict:
-    """Classify a single study using the criteria"""
-
-    # Create the classification text
-    study_text = f"Title: {title}. Abstract: {abstract}"
-
-    # Simple keyword-based classification as backup
-    # This is a simplified approach - in practice you'd want more sophisticated NLP
-
-    include_keywords = []
-    exclude_keywords = []
-
-    # Parse criteria to extract keywords (simplified)
-    criteria_lines = criteria.lower().split('\n')
-    for line in criteria_lines:
+def parse_criteria(criteria_text: str) -> Dict[str, List[str]]:
+    """Parse inclusion/exclusion criteria into structured format"""
+    include_terms = []
+    exclude_terms = []
+
+    lines = criteria_text.lower().split('\n')
+    current_section = None
+
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+
         if 'include' in line and ':' in line:
-            keywords = line.split(':')[1].strip()
-            include_keywords.extend([kw.strip() for kw in keywords.split(',') if kw.strip()])
+            current_section = 'include'
+            # Extract terms after the colon
+            terms = line.split(':')[1].strip()
+            if terms:
+                include_terms.extend([t.strip() for t in terms.split(',') if t.strip()])
         elif 'exclude' in line and ':' in line:
-            keywords = line.split(':')[1].strip()
-            exclude_keywords.extend([kw.strip() for kw in keywords.split(',') if kw.strip()])
-
-    # Score based on keyword presence
-    study_text_lower = study_text.lower()
-
-    include_score = sum(1 for kw in include_keywords if kw in study_text_lower)
-    exclude_score = sum(1 for kw in exclude_keywords if kw in study_text_lower)
-
-    # Simple decision logic
-    if exclude_score > 0:
-        decision = "EXCLUDE"
-        confidence = min(80 + exclude_score * 5, 95)
-        reasoning = f"Found exclusion criteria: {', '.join([kw for kw in exclude_keywords if kw in study_text_lower])}"
-    elif include_score >= 1:
-        decision = "INCLUDE"
-        confidence = min(70 + include_score * 5, 90)
-        reasoning = f"Matches inclusion criteria: {', '.join([kw for kw in include_keywords if kw in study_text_lower])}"
-    else:
-        decision = "UNCLEAR"
-        confidence = 50
-        reasoning = "Insufficient information to make clear determination"
-
-    return {
-        'decision': decision,
-        'confidence': confidence,
-        'reasoning': reasoning
-    }
+            current_section = 'exclude'
+            terms = line.split(':')[1].strip()
+            if terms:
+                exclude_terms.extend([t.strip() for t in terms.split(',') if t.strip()])
+        elif current_section and line.startswith('-'):
+            # Handle bullet points
+            term = line[1:].strip()
+            if term:
+                if current_section == 'include':
+                    include_terms.append(term)
+                else:
+                    exclude_terms.append(term)
+        elif current_section and not line.startswith(('include', 'exclude')):
+            # Handle continuation lines
+            if line:
+                if current_section == 'include':
+                    include_terms.extend([t.strip() for t in line.split(',') if t.strip()])
+                else:
+                    exclude_terms.extend([t.strip() for t in line.split(',') if t.strip()])
+
+    return {
+        'include': [term for term in include_terms if len(term) > 2],  # Filter very short terms
+        'exclude': [term for term in exclude_terms if len(term) > 2]
+    }
+
+def classify_with_semantic_similarity(title: str, abstract: str, criteria: Dict) -> Dict:
+    """Use semantic similarity to classify studies"""
+
+    # Combine title and abstract
+    study_text = f"{title} {abstract}".strip()
+
+    if not study_text or len(study_text) < 10:
+        return {
+            'decision': 'UNCLEAR',
+            'confidence': 30,
+            'reasoning': 'Insufficient text for analysis'
+        }
+
+    try:
+        # Get embeddings for the study
+        study_embedding = sentence_model.encode([study_text])
+
+        include_scores = []
+        exclude_scores = []
+
+        # Calculate similarity with inclusion criteria
+        if criteria['include']:
+            include_embeddings = sentence_model.encode(criteria['include'])
+            include_similarities = cosine_similarity(study_embedding, include_embeddings)[0]
+            include_scores = include_similarities.tolist()
+
+        # Calculate similarity with exclusion criteria
+        if criteria['exclude']:
+            exclude_embeddings = sentence_model.encode(criteria['exclude'])
+            exclude_similarities = cosine_similarity(study_embedding, exclude_embeddings)[0]
+            exclude_scores = exclude_similarities.tolist()
+
+        # Decision logic
+        max_include_score = max(include_scores) if include_scores else 0
+        max_exclude_score = max(exclude_scores) if exclude_scores else 0
+
+        # Find which criteria matched best
+        include_reasons = []
+        exclude_reasons = []
+
+        if include_scores:
+            best_include_idx = np.argmax(include_scores)
+            if include_scores[best_include_idx] > 0.3:  # Threshold for meaningful similarity
+                include_reasons.append(f"Similar to: '{criteria['include'][best_include_idx]}'")
+
+        if exclude_scores:
+            best_exclude_idx = np.argmax(exclude_scores)
+            if exclude_scores[best_exclude_idx] > 0.3:
+                exclude_reasons.append(f"Similar to: '{criteria['exclude'][best_exclude_idx]}'")
+
+        # Make decision
+        if max_exclude_score > 0.4:  # Strong exclusion match
+            decision = 'EXCLUDE'
+            confidence = min(int(max_exclude_score * 100), 95)
+            reasoning = f"Strong match with exclusion criteria. {'; '.join(exclude_reasons)}"
+        elif max_include_score > 0.4:  # Strong inclusion match
+            decision = 'INCLUDE'
+            confidence = min(int(max_include_score * 100), 90)
+            reasoning = f"Strong match with inclusion criteria. {'; '.join(include_reasons)}"
+        elif max_include_score > 0.25:  # Moderate inclusion match
+            decision = 'INCLUDE'
+            confidence = min(int(max_include_score * 80), 75)
+            reasoning = f"Moderate match with inclusion criteria. {'; '.join(include_reasons)}"
+        else:
+            decision = 'UNCLEAR'
+            confidence = 40
+            reasoning = f"No strong matches found. Best include: {max_include_score:.2f}, Best exclude: {max_exclude_score:.2f}"
+
+        return {
+            'decision': decision,
+            'confidence': confidence,
+            'reasoning': reasoning
+        }
+
+    except Exception as e:
+        return {
+            'decision': 'UNCLEAR',
+            'confidence': 30,
+            'reasoning': f'Error in semantic analysis: {str(e)}'
+        }
+
+def classify_with_zero_shot(title: str, abstract: str, criteria_text: str) -> Dict:
+    """Use zero-shot classification as a secondary method"""
+
+    study_text = f"{title} {abstract}".strip()
+
+    if not study_text or len(study_text) < 10:
+        return None
+
+    try:
+        # Create labels from criteria
+        candidate_labels = ["should be included in systematic review", "should be excluded from systematic review"]
+
+        # Use the criteria as hypothesis
+        hypothesis_template = f"This study {{}}, based on the criteria: {criteria_text}"
+
+        result = classifier(study_text, candidate_labels, hypothesis_template=hypothesis_template)
+
+        top_label = result['labels'][0]
+        top_score = result['scores'][0]
+
+        if 'included' in top_label:
+            decision = 'INCLUDE'
+        else:
+            decision = 'EXCLUDE'
+
+        confidence = int(top_score * 100)
+        reasoning = f"Zero-shot classification: {top_label} (confidence: {confidence}%)"
+
+        return {
+            'decision': decision,
+            'confidence': confidence,
+            'reasoning': reasoning
+        }
+
+    except Exception as e:
+        return None
+
+def classify_single_study(title: str, abstract: str, criteria_text: str) -> Dict:
+    """Enhanced classification using multiple approaches"""
+
+    # Parse criteria
+    parsed_criteria = parse_criteria(criteria_text)
+
+    if not parsed_criteria['include'] and not parsed_criteria['exclude']:
+        return {
+            'decision': 'UNCLEAR',
+            'confidence': 20,
+            'reasoning': 'No clear inclusion/exclusion criteria provided'
+        }
+
+    # Method 1: Semantic similarity
+    semantic_result = classify_with_semantic_similarity(title, abstract, parsed_criteria)
+
+    # Method 2: Zero-shot classification (as backup)
+    zero_shot_result = classify_with_zero_shot(title, abstract, criteria_text)
+
+    # Combine results (prioritize semantic similarity)
+    if semantic_result['confidence'] > 60:
+        return semantic_result
+    elif zero_shot_result and zero_shot_result['confidence'] > 70:
+        return zero_shot_result
+    elif semantic_result['confidence'] > 40:
+        # Add zero-shot info if available
+        combined_reasoning = semantic_result['reasoning']
+        if zero_shot_result:
+            combined_reasoning += f" | {zero_shot_result['reasoning']}"
+
+        return {
+            'decision': semantic_result['decision'],
+            'confidence': semantic_result['confidence'],
+            'reasoning': combined_reasoning
+        }
+    else:
+        return {
+            'decision': 'UNCLEAR',
+            'confidence': 35,
+            'reasoning': 'Low confidence from all classification methods'
+        }

 def process_studies(file, title_col, abstract_col, criteria, sample_size):
     """Main processing function"""
 
requirements.txt CHANGED
@@ -4,3 +4,5 @@ transformers==4.36.2
 torch==2.1.2
 requests==2.31.0
 numpy==1.24.3
+sentence-transformers==2.2.2
+scikit-learn==1.3.2
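To mirror the updated environment locally (a sketch; the pins are the ones in the diff, and compatibility with the existing transformers==4.36.2 pin is assumed here rather than verified):

pip install sentence-transformers==2.2.2 scikit-learn==1.3.2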