Khoi1234210 committed on
Commit
85ea4ec
·
verified ·
1 Parent(s): aaf8367

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -6
app.py CHANGED
@@ -7,23 +7,47 @@ import re
7
  # Global datasets - load lazily
8
  math_samples = None
9
 
10
- def load_datasets_lazy():
11
- """Load datasets only when needed - prevents startup crashes"""
12
  global math_samples
13
  if math_samples is not None:
14
  return math_samples
15
 
16
- print("🔄 Loading datasets...")
17
  try:
 
18
  gsm8k = load_dataset("openai/gsm8k", "main", streaming=True)
19
- samples = []
20
  for i, item in enumerate(gsm8k["train"]):
21
  samples.append(item["question"])
22
  if i >= 50:
23
  break
24
- print(f"✅ Loaded {len(samples)} GSM8K samples")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  math_samples = samples
26
  return samples
 
27
  except Exception as e:
28
  print(f"⚠️ Dataset error: {e}, using fallback")
29
  math_samples = [
@@ -31,7 +55,9 @@ def load_datasets_lazy():
31
  "A triangle has sides of length 5, 12, and 13. What is its area?",
32
  "If log₂(x) + log₂(x+6) = 4, find the value of x.",
33
  "Find the limit: lim(x->0) (sin(x)/x)",
34
- "Solve the system: x + 2y = 7, 3x - y = 4"
 
 
35
  ]
36
  return math_samples
37
 
 
7
  # Global datasets - load lazily
8
  math_samples = None
9
 
10
+ def load_sample_problems():
11
+ """Load sample problems from ALL datasets"""
12
  global math_samples
13
  if math_samples is not None:
14
  return math_samples
15
 
16
+ samples = []
17
  try:
18
+ # GSM8K (math problems)
19
  gsm8k = load_dataset("openai/gsm8k", "main", streaming=True)
 
20
  for i, item in enumerate(gsm8k["train"]):
21
  samples.append(item["question"])
22
  if i >= 50:
23
  break
24
+
25
+ # Fineweb-edu (educational text - extract math-like questions)
26
+ fw = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train", streaming=True)
27
+ fw_count = 0
28
+ for item in fw:
29
+ # Filter for math-related content (simple keyword match)
30
+ if any(word in item['text'].lower() for word in ['math', 'calculate', 'solve', 'derivative', 'integral', 'triangle', 'equation']):
31
+ samples.append(item['text'][:200] + " (Solve this math problem.)") # Truncate for brevity
32
+ fw_count += 1
33
+ if fw_count >= 20:
34
+ break
35
+
36
+ # Ultrachat_200k (chat-like math queries)
37
+ ds = load_dataset("HuggingFaceH4/ultrachat_200k", streaming=True)
38
+ ds_count = 0
39
+ for item in ds:
40
+ if 'math' in item['messages'][0]['content'].lower() or 'calculate' in item['messages'][0]['content'].lower():
41
+ user_msg = item['messages'][0]['content']
42
+ samples.append(user_msg)
43
+ ds_count += 1
44
+ if ds_count >= 20:
45
+ break
46
+
47
+ print(f"✅ Loaded {len(samples)} samples: GSM8K ({50}), Fineweb-edu ({fw_count}), Ultrachat ({ds_count})")
48
  math_samples = samples
49
  return samples
50
+
51
  except Exception as e:
52
  print(f"⚠️ Dataset error: {e}, using fallback")
53
  math_samples = [
 
55
  "A triangle has sides of length 5, 12, and 13. What is its area?",
56
  "If log₂(x) + log₂(x+6) = 4, find the value of x.",
57
  "Find the limit: lim(x->0) (sin(x)/x)",
58
+ "Solve the system: x + 2y = 7, 3x - y = 4",
59
+ "Calculate the integral of sin(x) from 0 to pi.",
60
+ "What is the probability of rolling a 6 on a die 3 times in a row?"
61
  ]
62
  return math_samples
63