Update app.py
Browse files
app.py
CHANGED
|
@@ -33,7 +33,14 @@ combined_dataset = chain(*(stream_subreddit_data(sub) for sub in target_subreddi
|
|
| 33 |
comments = list(islice(combined_dataset, 100000))
|
| 34 |
|
| 35 |
# Extract text and subreddit
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
# Clean text function
|
| 39 |
def clean_body(text):
|
|
|
|
| 33 |
comments = list(islice(combined_dataset, 100000))
|
| 34 |
|
| 35 |
# Extract text and subreddit
|
| 36 |
+
comments = []
|
| 37 |
+
|
| 38 |
+
for sub in target_subreddits:
|
| 39 |
+
stream = load_dataset("HuggingFaceGECLM/REDDIT_comments", split=sub, streaming=True)
|
| 40 |
+
comments.extend({"body": ex["body"], "subreddit": sub} for ex in islice(stream, 20000)) # ~100k total
|
| 41 |
+
|
| 42 |
+
# Convert to DataFrame
|
| 43 |
+
df = pd.DataFrame(comments)
|
| 44 |
|
| 45 |
# Clean text function
|
| 46 |
def clean_body(text):
|