Update app.py
Browse files
app.py
CHANGED
|
@@ -7,23 +7,47 @@ import re
|
|
| 7 |
# Global datasets - load lazily
|
| 8 |
math_samples = None
|
| 9 |
|
| 10 |
-
def
|
| 11 |
-
"""Load
|
| 12 |
global math_samples
|
| 13 |
if math_samples is not None:
|
| 14 |
return math_samples
|
| 15 |
|
| 16 |
-
|
| 17 |
try:
|
|
|
|
| 18 |
gsm8k = load_dataset("openai/gsm8k", "main", streaming=True)
|
| 19 |
-
samples = []
|
| 20 |
for i, item in enumerate(gsm8k["train"]):
|
| 21 |
samples.append(item["question"])
|
| 22 |
if i >= 50:
|
| 23 |
break
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
math_samples = samples
|
| 26 |
return samples
|
|
|
|
| 27 |
except Exception as e:
|
| 28 |
print(f"⚠️ Dataset error: {e}, using fallback")
|
| 29 |
math_samples = [
|
|
@@ -31,7 +55,9 @@ def load_datasets_lazy():
|
|
| 31 |
"A triangle has sides of length 5, 12, and 13. What is its area?",
|
| 32 |
"If log₂(x) + log₂(x+6) = 4, find the value of x.",
|
| 33 |
"Find the limit: lim(x->0) (sin(x)/x)",
|
| 34 |
-
"Solve the system: x + 2y = 7, 3x - y = 4"
|
|
|
|
|
|
|
| 35 |
]
|
| 36 |
return math_samples
|
| 37 |
|
|
|
|
| 7 |
# Global datasets - load lazily
|
| 8 |
math_samples = None
|
| 9 |
|
| 10 |
+
def load_sample_problems():
|
| 11 |
+
"""Load sample problems from ALL datasets"""
|
| 12 |
global math_samples
|
| 13 |
if math_samples is not None:
|
| 14 |
return math_samples
|
| 15 |
|
| 16 |
+
samples = []
|
| 17 |
try:
|
| 18 |
+
# GSM8K (math problems)
|
| 19 |
gsm8k = load_dataset("openai/gsm8k", "main", streaming=True)
|
|
|
|
| 20 |
for i, item in enumerate(gsm8k["train"]):
|
| 21 |
samples.append(item["question"])
|
| 22 |
if i >= 50:
|
| 23 |
break
|
| 24 |
+
|
| 25 |
+
# Fineweb-edu (educational text - extract math-like questions)
|
| 26 |
+
fw = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train", streaming=True)
|
| 27 |
+
fw_count = 0
|
| 28 |
+
for item in fw:
|
| 29 |
+
# Filter for math-related content (simple keyword match)
|
| 30 |
+
if any(word in item['text'].lower() for word in ['math', 'calculate', 'solve', 'derivative', 'integral', 'triangle', 'equation']):
|
| 31 |
+
samples.append(item['text'][:200] + " (Solve this math problem.)") # Truncate for brevity
|
| 32 |
+
fw_count += 1
|
| 33 |
+
if fw_count >= 20:
|
| 34 |
+
break
|
| 35 |
+
|
| 36 |
+
# Ultrachat_200k (chat-like math queries)
|
| 37 |
+
ds = load_dataset("HuggingFaceH4/ultrachat_200k", streaming=True)
|
| 38 |
+
ds_count = 0
|
| 39 |
+
for item in ds:
|
| 40 |
+
if 'math' in item['messages'][0]['content'].lower() or 'calculate' in item['messages'][0]['content'].lower():
|
| 41 |
+
user_msg = item['messages'][0]['content']
|
| 42 |
+
samples.append(user_msg)
|
| 43 |
+
ds_count += 1
|
| 44 |
+
if ds_count >= 20:
|
| 45 |
+
break
|
| 46 |
+
|
| 47 |
+
print(f"✅ Loaded {len(samples)} samples: GSM8K ({50}), Fineweb-edu ({fw_count}), Ultrachat ({ds_count})")
|
| 48 |
math_samples = samples
|
| 49 |
return samples
|
| 50 |
+
|
| 51 |
except Exception as e:
|
| 52 |
print(f"⚠️ Dataset error: {e}, using fallback")
|
| 53 |
math_samples = [
|
|
|
|
| 55 |
"A triangle has sides of length 5, 12, and 13. What is its area?",
|
| 56 |
"If log₂(x) + log₂(x+6) = 4, find the value of x.",
|
| 57 |
"Find the limit: lim(x->0) (sin(x)/x)",
|
| 58 |
+
"Solve the system: x + 2y = 7, 3x - y = 4",
|
| 59 |
+
"Calculate the integral of sin(x) from 0 to pi.",
|
| 60 |
+
"What is the probability of rolling a 6 on a die 3 times in a row?"
|
| 61 |
]
|
| 62 |
return math_samples
|
| 63 |
|