init commit
- .gitignore +1 -1
- .gradio/certificate.pem +31 -0
- README.md +62 -10
- app.py +295 -256
- dataset_card_template.py +0 -40
- env.example +3 -0
- packages.txt +0 -1
- pitfalls.json +92 -0
- requirements.in +2 -4

.gitignore
CHANGED
@@ -1,2 +1,2 @@
 __pycache__/
-
+.env

.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

README.md
CHANGED
@@ -1,18 +1,70 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Research Paper Pitfall Checker
+emoji: 🍎🍊
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 sdk_version: 4.44.0
 app_file: app.py
 pinned: false
-hf_oauth: true
-hf_oauth_scopes:
-  - read-repos
-  - write-repos
-  - manage-repos
 license: agpl-3.0
 ---
 
-
+# Research Paper Pitfall Checker
+
+A Gradio application that analyzes research papers to identify potential evaluation pitfalls using AI-powered analysis.
+
+## Features
+
+- **PDF Text Extraction**: Extracts text content from uploaded research papers
+- **AI-Powered Analysis**: Uses the OpenRouter API with a Grok model for intelligent pitfall detection (two-stage flow sketched below)
+- **Comprehensive Pitfall Detection**: Identifies common evaluation pitfalls, including:
+  - 🔒 The Lock-In Effect
+  - 🍎🍊 Apples-to-Oranges Comparisons
+  - 💧 Contamination Leak
+  - 🤖❌ Unvalidated Automation
+  - 🌧️ Vague Scales
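+
+Under the hood, the app runs a two-stage pipeline: one model call to clean and structure the extracted PDF text, then a second call to check it against the pitfall definitions. A condensed sketch of the flow (the function names come from `app.py` in this commit; the `_Upload` stub is a hypothetical stand-in for Gradio's uploaded-file object, which exposes a `.name` path):
+
+```python
+# Assumes this runs next to app.py with OPENROUTER_API_KEY set.
+from app import (
+    analyze_paper_for_pitfalls,  # stage 2: LLM call that checks for pitfalls
+    extract_text_from_pdf,       # PyMuPDF-based text extraction
+    format_paper_text,           # stage 1: LLM call that cleans/structures the text
+    load_pitfalls,
+)
+
+
+class _Upload:  # minimal stand-in for Gradio's uploaded-file object
+    name = "paper.pdf"
+
+
+text = extract_text_from_pdf(_Upload())
+formatted = format_paper_text(text)
+if formatted["success"]:
+    result = analyze_paper_for_pitfalls(formatted["formatted_text"], load_pitfalls())
+    print(result["analysis"])
+```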
+
+## How to Use
+
+1. **Get an API Key**: Sign up at [OpenRouter.ai](https://openrouter.ai) to get your API key
+2. **Set Environment Variable**: Set your API key as an environment variable
+3. **Upload a PDF**: Upload your research paper PDF file
+4. **Analyze**: Click "Analyze Paper for Pitfalls" to get a detailed analysis
+5. **Review Results**: Review the analysis report for potential issues and improvement suggestions
+
+## Setup
+
+1. Install dependencies:
+
+   ```bash
+   pip install -r requirements.in
+   ```
+
+2. Set your OpenRouter API key as an environment variable:
+
+   ```bash
+   export OPENROUTER_API_KEY="your-api-key-here"
+   ```
+
+   Or create a `.env` file (copy from `env.example`):
+
+   ```
+   OPENROUTER_API_KEY=your-api-key-here
+   ```
+
+3. Run the application:
+
+   ```bash
+   python app.py
+   ```
+
+## Configuration
+
+The pitfalls are defined in `pitfalls.json`. You can modify this file to add or remove specific pitfalls to check for.
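+
+Each entry is a JSON object with a name, emoji, category, description, and metadata about who it affects. As an illustrative sketch (the entry below is hypothetical; see `pitfalls.json` in this commit for the real definitions), a new pitfall would look like:
+
+```json
+{
+    "name": "Cherry-Picked Baselines",
+    "emoji": "🍒",
+    "category": "General",
+    "description": "Only favorable baselines are reported, inflating the apparent improvement.",
+    "subjective_objective": "Objective",
+    "actors_most_affected": ["Academic researcher"],
+    "evaluation_use": "Compare models",
+    "modalities": ["General"],
+    "sources": []
+}
+```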
+
+## API Requirements
+
+- OpenRouter API key (get one from [OpenRouter.ai](https://openrouter.ai))
+- Uses the `x-ai/grok-4-fast:free` model via OpenRouter (see `app.py`)
+- API key must be set as the `OPENROUTER_API_KEY` environment variable
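+
+The app reaches OpenRouter through the OpenAI Python client by pointing it at OpenRouter's OpenAI-compatible endpoint. A minimal sketch of the call pattern used in `app.py` (assuming `OPENROUTER_API_KEY` is set in your environment):
+
+```python
+import os
+
+from openai import OpenAI
+
+# OpenRouter speaks the OpenAI API, so the stock client works once
+# base_url points at it.
+client = OpenAI(
+    base_url="https://openrouter.ai/api/v1",
+    api_key=os.getenv("OPENROUTER_API_KEY"),
+)
+completion = client.chat.completions.create(
+    model="x-ai/grok-4-fast:free",
+    messages=[{"role": "user", "content": "Say hello."}],
+)
+print(completion.choices[0].message.content)
+```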
+
+## License
+
+AGPL-3.0

app.py
CHANGED
@@ -1,303 +1,342 @@
+import json
 import os
-import random
-import shutil
 import tempfile
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from datetime import datetime
+from typing import List, Dict, Any
 
 import fitz  # PyMuPDF
 import gradio as gr
-from PIL import Image
+from openai import OpenAI
 
-from dataset_card_template import DATASET_CARD_TEMPLATE
+# Load API key from environment variable
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 
 
-    pdf_path = pdf_file.name
-    doc = fitz.open(pdf_path)
-    total_pages = len(doc)
-
-    for page_num in selected_pages:
-        page = doc[page_num]
-        pix = page.get_pixmap()  # Remove the Matrix scaling
-        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        image_path = os.path.join(
-            temp_dir, f"{os.path.basename(pdf_path)}_page_{page_num+1}.jpg"
-        )
-        image.save(image_path, "JPEG", quality=85, optimize=True)
-        images.append(image_path)
+def load_pitfalls() -> List[Dict[str, Any]]:
+    """Load pitfalls from the JSON file."""
+    try:
+        with open("pitfalls.json", "r") as f:
+            data = json.load(f)
+        return data.get("pitfalls", [])
+    except FileNotFoundError:
+        gr.Warning("pitfalls.json file not found!")
+        return []
+    except json.JSONDecodeError:
+        gr.Warning("Invalid JSON in pitfalls.json file!")
+        return []
+
+
+def extract_text_from_pdf(pdf_file) -> str:
+    """Extract text content from a PDF file."""
+    try:
+        pdf_path = pdf_file.name
+        doc = fitz.open(pdf_path)
+        text_content = ""
+
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+            text_content += f"\n--- Page {page_num + 1} ---\n"
+            text_content += page.get_text()
+
+        doc.close()
+        return text_content
+    except Exception as e:
+        raise gr.Error(f"Error extracting text from {pdf_file.name}: {str(e)}")
+
+
+def format_paper_text(paper_text: str) -> Dict[str, Any]:
+    """First stage: Format the paper text to make it more readable and suitable for analysis."""
+
+    # Check if API key is available
+    if not OPENROUTER_API_KEY:
+        return {
+            "formatted_text": None,
+            "success": False,
+            "error": "OpenRouter API key not found. Please set the OPENROUTER_API_KEY environment variable.",
+        }
+
+    # Initialize OpenAI client with OpenRouter
+    client = OpenAI(
+        base_url="https://openrouter.ai/api/v1",
+        api_key=OPENROUTER_API_KEY,
+    )
+
+    format_prompt = f"""You are an expert academic text processor. Your task is to clean and format the following research paper text to make it more readable and suitable for detailed analysis.
+
+Please:
+1. Remove excessive whitespace and formatting artifacts
+2. Organize the text into clear sections (Abstract, Introduction, Methods, Results, Discussion, Conclusion, References, Appendix)
+3. Preserve all important content including figures, tables, and equations
+4. Make the text flow better while maintaining academic integrity
+5. Ensure all evaluation-related content is clearly identifiable
+6. Keep the text under 8000 characters while preserving key information
+
+Original paper text:
+{paper_text}
+
+Please provide the cleaned and formatted text:"""
+
+    try:
+        completion = client.chat.completions.create(
+            extra_headers={
+                "HTTP-Referer": "https://github.com/paper-eval-checker",
+                "X-Title": "Paper Evaluation Pitfall Checker",
+            },
+            model="x-ai/grok-4-fast:free",
+            messages=[{"role": "user", "content": format_prompt}],
+            temperature=0.1,  # Very low temperature for consistent formatting
+            max_tokens=3000,
+        )
+
+        return {
+            "formatted_text": completion.choices[0].message.content,
+            "success": True,
+            "error": None,
+        }
     except Exception as e:
-        return
-    all_images = []
-    skipped_pdfs = []
+        return {"formatted_text": None, "success": False, "error": str(e)}
+
+
+def analyze_paper_for_pitfalls(
+    formatted_text: str, pitfalls: List[Dict[str, Any]]
+) -> Dict[str, Any]:
+    """Second stage: Use OpenRouter API with Grok model to analyze the formatted paper for potential pitfalls."""
+
+    # Check if API key is available
+    if not OPENROUTER_API_KEY:
+        return {
+            "analysis": None,
+            "success": False,
+            "error": "OpenRouter API key not found. Please set the OPENROUTER_API_KEY environment variable.",
+        }
+
+    # Initialize OpenAI client with OpenRouter
+    client = OpenAI(
+        base_url="https://openrouter.ai/api/v1",
+        api_key=OPENROUTER_API_KEY,
+    )
+
+    # Create the prompt for pitfall analysis
+    pitfalls_description = "\n\n".join(
+        [
+            f"**{pitfall['name']}** {pitfall['emoji']}\n"
+            f"Category: {pitfall['category']}\n"
+            f"Description: {pitfall['description']}\n"
+            f"Subjective/Objective: {pitfall['subjective_objective']}\n"
+            f"Actors Most Affected: {', '.join(pitfall['actors_most_affected'])}\n"
+            f"Evaluation Use: {pitfall['evaluation_use']}\n"
+            f"Modalities: {', '.join(pitfall['modalities'])}"
+            for pitfall in pitfalls
+        ]
+    )
+
+    analysis_prompt = f"""You are an expert research paper reviewer specializing in identifying evaluation pitfalls in academic papers.
+
+Your task is to analyze the provided formatted research paper text and identify any potential pitfalls from the following list:
+
+{pitfalls_description}
+
+Please analyze the paper carefully and provide:
+1. A list of potential pitfalls found (if any)
+2. For each pitfall found, provide:
+   - The pitfall name
+   - Specific evidence from the paper that suggests this pitfall
+   - The section/page where this evidence appears
+   - A confidence level (High/Medium/Low) for your assessment
+   - Suggestions for improvement
+3. Be concise, and use markdown formatting.
+4. If you find evidence of a pitfall, make sure to look at ALL of the paper to see if it is mitigated elsewhere -- make sure to check the appendix of the paper as well.
+
+The output format:
+
+# Overall
+<img src="https://img.shields.io/badge/severity-high-red" alt="Severity: High"> (for low use green, for medium use yellow, for high use red)
+<img src="https://img.shields.io/badge/evaluation-objective-blue" alt="Objective evaluation"> (either write 'subjective', 'objective', or include two images in case both are present in the paper)
+[One sentence summary of evaluation use]
+
+# Pitfall
+
+## Evidence
+"specific evidence from the paper"
+
+If no pitfalls are found, please state that clearly.
+
+Formatted paper text to analyze:
+{formatted_text}
+
+Please provide your analysis in a structured format."""
+
+    try:
+        completion = client.chat.completions.create(
+            extra_headers={
+                "HTTP-Referer": "https://github.com/paper-eval-checker",
+                "X-Title": "Paper Evaluation Pitfall Checker",
+            },
+            model="x-ai/grok-4-fast:free",
+            messages=[{"role": "user", "content": analysis_prompt}],
+            temperature=0.3,  # Lower temperature for more consistent analysis
+            max_tokens=2000,
+        )
+
+        return {
+            "analysis": completion.choices[0].message.content,
+            "success": True,
+            "error": None,
         }
-        if error:
-            skipped_pdfs.append(error)
-            gr.Info(error)
-        else:
-            all_images.extend(images)
-
-        processed_pages += pages_processed
-        progress((processed_pages / total_pages), desc=f"Processing {pdf.name}")
-
-    message = f"Saved {len(all_images)} images to temporary directory"
-    if skipped_pdfs:
-        message += f"\nSkipped {len(skipped_pdfs)} PDFs due to errors: {', '.join(skipped_pdfs)}"
-    return all_images, message
-
-
-def get_size_category(num_images):
-    if num_images < 1000:
-        return "n<1K"
-    elif num_images < 10000:
-        return "1K<n<10K"
-    elif num_images < 100000:
-        return "10K<n<100K"
-    elif num_images < 1000000:
-        return "100K<n<1M"
-    else:
-        return "n>1M"
-
-
-def process_pdfs(
-    pdf_files,
-    sample_percentage,
-    hf_repo,
-    create_zip,
-    private_repo,
-    oauth_token: gr.OAuthToken | None,
-    progress=gr.Progress(),
-):
-    if not pdf_files:
-        return (
-            None,
-            None,
-            gr.Markdown(
-                "⚠️ No PDF files uploaded. Please upload at least one PDF file."
-            ),
+    except Exception as e:
+        return {"analysis": None, "success": False, "error": str(e)}
+
+
+def process_paper(pdf_file, progress=gr.Progress()):
+    """Main function to process a research paper for pitfall detection using two-stage approach."""
+
+    if not pdf_file:
+        return gr.Markdown(
+            "⚠️ No PDF file uploaded. Please upload a research paper PDF."
         )
 
-    # gr.Markdown(
-    #     "⚠️ Not logged in to Hugging Face. Please log in to upload to a Hugging Face dataset."
-    # ),
-    # )
+    if not OPENROUTER_API_KEY:
+        return gr.Markdown(
+            "⚠️ OpenRouter API key not found. Please set the OPENROUTER_API_KEY environment variable."
+        )
 
     try:
+        # Step 1: Load pitfalls
+        progress(0.1, desc="Loading pitfalls definitions...")
+        pitfalls = load_pitfalls()
+
+        if not pitfalls:
+            return gr.Markdown(
+                "⚠️ No pitfalls definitions found. Please check pitfalls.json file."
+            )
+
+        # Step 2: Extract text from PDF
+        progress(0.2, desc="Extracting text from PDF...")
+        paper_text = extract_text_from_pdf(pdf_file)
+
+        if not paper_text.strip():
+            return gr.Markdown(
+                "⚠️ No text content found in the PDF. Please check if the PDF contains readable text."
+            )
+
+        # Step 3: Format paper text (First AI call)
+        progress(0.3, desc="Formatting paper text for analysis...")
+        format_result = format_paper_text(paper_text)
+
+        if not format_result["success"]:
+            return gr.Markdown(
+                f"❌ Error during text formatting: {format_result['error']}"
+            )
+
+        # Step 4: Analyze for pitfalls (Second AI call)
+        progress(0.7, desc="Analyzing paper for potential pitfalls...")
+        analysis_result = analyze_paper_for_pitfalls(
+            format_result["formatted_text"], pitfalls
+        )
+
+        if not analysis_result["success"]:
+            return gr.Markdown(f"❌ Error during analysis: {analysis_result['error']}")
+
+        # Step 5: Format final results
+        progress(0.9, desc="Preparing final report...")
+        analysis_text = analysis_result["analysis"]
+
+        # Create a formatted markdown report
+        report = f"""# Research Paper Pitfall Analysis Report
+
+## Analysis Results
+
+{analysis_text}
+
+---
+*Analysis completed using OpenRouter API with Grok model (two-stage processing)*
+"""
+
+        progress(1.0, desc="Analysis complete!")
+        return gr.Markdown(report)
+
     except Exception as e:
-        shutil.rmtree(temp_dir)
-        return None, None, f"An error occurred: {str(e)}"
+        return gr.Markdown(f"❌ An error occurred: {str(e)}")
 
 
 # Define the Gradio interface
-with gr.Blocks() as demo:
+with gr.Blocks(title="Research Paper Pitfall Checker") as demo:
     gr.HTML(
-        """<h1 style='text-align: center;'
-    <center><i>
+        """<h1 style='text-align: center;'>🔍 Research Paper Pitfall Checker</h1>
+    <center><i>Identify potential evaluation pitfalls in academic research papers</i></center>"""
     )
+
     gr.HTML(
         """
-        <div style="
-            <li>(Optionally) sample a specific number of pages from each PDF</li>
-            <li>(Optionally) Create a downloadable ZIP file of the converted images</li>
-            <li>(Optionally) Upload the images to a Hugging Face dataset repository</li>
+        <div style="max-width: 800px; margin: 0 auto; padding: 20px;">
+        <h3>How it works:</h3>
+        <ol>
+            <li><strong>Upload a PDF</strong> of your research paper</li>
+            <li><strong>Click "Analyze Paper"</strong> to scan for potential pitfalls</li>
+            <li><strong>Review the analysis</strong> to identify areas for improvement</li>
         </ol>
+
+        <h3>Supported Pitfalls:</h3>
+        <ul>
+            <li>🔒 The Lock-In Effect</li>
+            <li>🍎🍊 Apples-to-Oranges Comparisons</li>
+            <li>💧 Contamination Leak</li>
+            <li>🤖❌ Unvalidated Automation</li>
+            <li>🌧️ Vague Scales</li>
+        </ul>
         </div>
         """
     )
 
     with gr.Row():
+        with gr.Column(scale=3):
+            pdf_file = gr.File(label="Upload Research Paper (PDF)", file_types=[".pdf"])
+
+        with gr.Column(scale=1):
+            analyze_button = gr.Button(
+                "🔍 Analyze Paper for Pitfalls",
+                variant="primary",
+                size="lg",
+                elem_id="analyze-btn",
+            )
 
     with gr.Row():
-            value=100,
-            step=1,
-            label="Percentage of pages to sample per PDF",
-            info="0% for no sampling (all pages), 100% for all pages",
+        results = gr.Markdown(
+            value="Upload a PDF to get started with pitfall analysis.",
+            elem_id="results",
         )
-        create_zip = gr.Checkbox(label="Create ZIP file of images?", value=False)
-
-    with gr.Accordion("Hugging Face Upload Options", open=True):
-        gr.LoginButton(size="sm")
-        with gr.Row():
-            hf_repo = gr.Textbox(
-                label="Hugging Face Repo",
-                placeholder="username/repo-name",
-                info="Enter the Hugging Face repository name in the format 'username/repo-name'",
-            )
-        private_repo = gr.Checkbox(label="Make repository private?", value=False)
-
-    with gr.Accordion("View converted images", open=False):
-        output_gallery = gr.Gallery(label="Converted Images")
 
+    # Add loading animation CSS
+    demo.css = """
+    #analyze-btn {
+        background: linear-gradient(45deg, #ff6b6b, #4ecdc4, #45b7d1, #96ceb4, #feca57);
+        background-size: 400% 400%;
+        animation: gradient 3s ease infinite;
+        border: none;
+        color: white;
+        font-weight: bold;
+    }
+
+    @keyframes gradient {
+        0% { background-position: 0% 50%; }
+        50% { background-position: 100% 50%; }
+        100% { background-position: 0% 50%; }
+    }
+
+    #results {
+        min-height: 200px;
+        padding: 20px;
+        border: 1px solid #e0e0e0;
+        border-radius: 8px;
+        background-color: #fafafa;
+    }
+    """
+
+    # Connect the button to the processing function
+    analyze_button.click(
+        fn=process_paper,
+        inputs=[pdf_file],
+        outputs=[results],
     )
+
+if __name__ == "__main__":
+    demo.launch(share=True, server_name="localhost", server_port=9090)

dataset_card_template.py
DELETED
@@ -1,40 +0,0 @@
-DATASET_CARD_TEMPLATE = """
-# Dataset Card for {hf_repo}
-
-## Dataset Description
-
-This dataset contains images converted from PDFs using the PDFs to Page Images Converter Space.
-
-- **Number of images:** {num_images}
-- **Number of PDFs processed:** {num_pdfs}
-- **Sample size per PDF:** {sample_size}
-- **Created on:** {creation_date}
-
-## Dataset Creation
-
-### Source Data
-
-The images in this dataset were generated from user-uploaded PDF files.
-
-### Processing Steps
-
-1. PDF files were uploaded to the PDFs to Page Images Converter.
-2. Each PDF was processed, converting selected pages to images.
-3. The resulting images were saved and uploaded to this dataset.
-
-## Dataset Structure
-
-The dataset consists of JPEG images, each representing a single page from the source PDFs.
-
-### Data Fields
-
-- `images/`: A folder containing all the converted images.
-
-### Data Splits
-
-This dataset does not have specific splits.
-
-## Additional Information
-
-- **Contributions:** Thanks to the PDFs to Page Images Converter for creating this dataset.
-"""

env.example
ADDED
@@ -0,0 +1,3 @@
+# OpenRouter API Key
+# Get your API key from https://openrouter.ai
+OPENROUTER_API_KEY=your-api-key-here

packages.txt
DELETED
@@ -1 +0,0 @@
-poppler-utils

pitfalls.json
ADDED
@@ -0,0 +1,92 @@
+{
+    "pitfalls": [
+        {
+            "name": "The Lock-In Effect",
+            "emoji": "🔒",
+            "category": "General",
+            "description": "Practices known to be problematic remain widespread simply because they are already widespread, making it difficult for new, better methods to be adopted.",
+            "subjective_objective": "Both",
+            "actors_most_affected": [
+                "Academic researcher",
+                "Model creator"
+            ],
+            "evaluation_use": "Compare models",
+            "modalities": [
+                "General"
+            ],
+            "sources": []
+        },
+        {
+            "name": "Apples-to-Oranges",
+            "emoji": "🍎🍊",
+            "category": "General",
+            "description": "Models or data are compared on an unequal footing, such as evaluating models using a different number of examples or under different conditions.",
+            "subjective_objective": "Both",
+            "actors_most_affected": [
+                "Academic researcher",
+                "Model deployer"
+            ],
+            "evaluation_use": "Compare models",
+            "modalities": [
+                "General",
+                "NLP",
+                "Speech"
+            ],
+            "sources": []
+        },
+        {
+            "name": "Contamination Leak",
+            "emoji": "💧",
+            "category": "General",
+            "description": "The model has already been exposed to the evaluation data during its training phase, which invalidates the results. This is a widespread and subtle problem.",
+            "subjective_objective": "Both",
+            "actors_most_affected": [
+                "Academic researcher",
+                "Model creator"
+            ],
+            "evaluation_use": "Compare models, assess system reliability",
+            "modalities": [
+                "General"
+            ],
+            "sources": []
+        },
+        {
+            "name": "Unvalidated Automation",
+            "emoji": "🤖❌",
+            "category": "NLP",
+            "description": "Using an LLM-as-a-judge to evaluate outputs without first validating the judge LLM's performance against human experts or established criteria. While LLMs can scale evaluation, they are not yet reliable enough to be the sole evaluators.",
+            "subjective_objective": "Both",
+            "actors_most_affected": [
+                "Academic researcher",
+                "Model creator",
+                "Model deployer"
+            ],
+            "evaluation_use": "Assess system reliability",
+            "modalities": [
+                "Text",
+                "General"
+            ],
+            "sources": [
+                "The LLM Evaluation guidebook"
+            ]
+        },
+        {
+            "name": "Vague Scales",
+            "emoji": "🌧️",
+            "category": "TTS",
+            "description": "Papers on synthetic speech fail to report crucial details, such as whether they are evaluating 'quality' or 'naturalness,' or do not disclose the labels used in their Mean Opinion Score (MOS) scale.",
+            "subjective_objective": "Subjective",
+            "actors_most_affected": [
+                "Academic researcher"
+            ],
+            "evaluation_use": "Compare models, assess system reliability",
+            "modalities": [
+                "Speech"
+            ],
+            "sources": [
+                "Good practices for evaluation of synthesized speech",
+                "Hot topics in speech synthesis evaluation"
+            ]
+        }
+    ]
+}

requirements.in
CHANGED
@@ -1,6 +1,4 @@
 
-
-
-huggingface_hub[hf_transfer]
-pdf2image
+gradio==4.44.0
+openai
 PyMuPDF