peihsin0715
committed on
Commit
·
3ca6cb8
1
Parent(s):
97a31eb
Fix data saving
Browse files
- backend/server.py +10 -10
- backend/utils/finetune.py +1 -1
- backend/utils/utils.py +1 -1
backend/server.py
CHANGED
|
@@ -54,7 +54,7 @@ _MODELS = {}
|
|
| 54 |
_CURRENT_DATASET = None
|
| 55 |
_GENERATION_RESULTS = None
|
| 56 |
|
| 57 |
-
@app.route('/
|
| 58 |
def serve_data(filename):
|
| 59 |
import os
|
| 60 |
from flask import Response
|
|
@@ -419,7 +419,7 @@ def run_pipeline():
|
|
| 419 |
|
| 420 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 421 |
os.makedirs("data", exist_ok=True)
|
| 422 |
-
output_file = f"
|
| 423 |
evaluated_results.to_csv(output_file, index=False)
|
| 424 |
results['generation_file'] = output_file
|
| 425 |
results['generation_samples'] = len(evaluated_results)
|
|
@@ -431,7 +431,7 @@ def run_pipeline():
|
|
| 431 |
category_col="category",
|
| 432 |
num_cf_per_row=num_cf_per_row
|
| 433 |
)
|
| 434 |
-
augmented_file = f"
|
| 435 |
augmented_results.to_csv(augmented_file, index=False)
|
| 436 |
results['counterfactual_file'] = augmented_file
|
| 437 |
results['counterfactual_added'] = len(augmented_results) - len(evaluated_results)
|
|
@@ -465,7 +465,7 @@ def run_pipeline():
|
|
| 465 |
mid_point = len(evaluated_results) // 2
|
| 466 |
best_sent_subset = evaluated_results.iloc[:mid_point].copy()
|
| 467 |
|
| 468 |
-
sent_file = f"
|
| 469 |
best_sent_subset.to_csv(sent_file, index=False)
|
| 470 |
|
| 471 |
print(f"Pipeline Step 5: Rank sampling on CF-augmented results...(iterations={iterations}, temp={tau})")
|
|
@@ -476,7 +476,7 @@ def run_pipeline():
|
|
| 476 |
mid_point = len(augmented_results) // 2
|
| 477 |
cf_best_sent_subset = augmented_results.iloc[:mid_point].copy()
|
| 478 |
|
| 479 |
-
cf_sent_file = f"
|
| 480 |
cf_best_sent_subset.to_csv(cf_sent_file, index=False)
|
| 481 |
|
| 482 |
orig_means = _mean_by_cat(best_sent_subset, selected_groups_used)
|
|
@@ -519,8 +519,8 @@ def run_pipeline():
|
|
| 519 |
print("[Plot check exists]", cf_path, os.path.exists(cf_path))
|
| 520 |
|
| 521 |
results['plots'] = {
|
| 522 |
-
'original_sentiment': f"/
|
| 523 |
-
'counterfactual_sentiment': f"/
|
| 524 |
}
|
| 525 |
|
| 526 |
print("[Plot urls]", results['plots'])
|
|
@@ -534,7 +534,7 @@ def run_pipeline():
|
|
| 534 |
lr = float(ft_cfg.get("learningRate", 5e-5))
|
| 535 |
|
| 536 |
input_csv = augmented_file
|
| 537 |
-
ft_output_dir = f"
|
| 538 |
os.makedirs(ft_output_dir, exist_ok=True)
|
| 539 |
|
| 540 |
try:
|
|
@@ -548,10 +548,10 @@ def run_pipeline():
|
|
| 548 |
)
|
| 549 |
print(f"[Fine-tune] Saved fine-tuned model to {ft_output_dir}")
|
| 550 |
results["finetuned_model_dir"] = ft_output_dir
|
| 551 |
-
zip_base = f"
|
| 552 |
import shutil
|
| 553 |
zip_path = shutil.make_archive(zip_base, 'zip', ft_output_dir)
|
| 554 |
-
results["finetuned_model_zip"] = f"/
|
| 555 |
except Exception as fe:
|
| 556 |
print(f"[Fine-tune] Failed: {fe}")
|
| 557 |
results["finetuned_model_error"] = str(fe)
|
|
|
|
| 54 |
_CURRENT_DATASET = None
|
| 55 |
_GENERATION_RESULTS = None
|
| 56 |
|
| 57 |
+
@app.route('/tmp/<path:filename>')
|
| 58 |
def serve_data(filename):
|
| 59 |
import os
|
| 60 |
from flask import Response
|
|
|
|
| 419 |
|
| 420 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 421 |
os.makedirs("data", exist_ok=True)
|
| 422 |
+
output_file = f"/tmp/pipeline_generation_{timestamp}.csv"
|
| 423 |
evaluated_results.to_csv(output_file, index=False)
|
| 424 |
results['generation_file'] = output_file
|
| 425 |
results['generation_samples'] = len(evaluated_results)
|
|
|
|
| 431 |
category_col="category",
|
| 432 |
num_cf_per_row=num_cf_per_row
|
| 433 |
)
|
| 434 |
+
augmented_file = f"/tmp/pipeline_generation_cf_augmented_{timestamp}.csv"
|
| 435 |
augmented_results.to_csv(augmented_file, index=False)
|
| 436 |
results['counterfactual_file'] = augmented_file
|
| 437 |
results['counterfactual_added'] = len(augmented_results) - len(evaluated_results)
|
|
|
|
| 465 |
mid_point = len(evaluated_results) // 2
|
| 466 |
best_sent_subset = evaluated_results.iloc[:mid_point].copy()
|
| 467 |
|
| 468 |
+
sent_file = f"/tmp/pipeline_sent_subset_{timestamp}.csv"
|
| 469 |
best_sent_subset.to_csv(sent_file, index=False)
|
| 470 |
|
| 471 |
print(f"Pipeline Step 5: Rank sampling on CF-augmented results...(iterations={iterations}, temp={tau})")
|
|
|
|
| 476 |
mid_point = len(augmented_results) // 2
|
| 477 |
cf_best_sent_subset = augmented_results.iloc[:mid_point].copy()
|
| 478 |
|
| 479 |
+
cf_sent_file = f"/tmp/pipeline_cf_sent_subset_{timestamp}.csv"
|
| 480 |
cf_best_sent_subset.to_csv(cf_sent_file, index=False)
|
| 481 |
|
| 482 |
orig_means = _mean_by_cat(best_sent_subset, selected_groups_used)
|
|
|
|
| 519 |
print("[Plot check exists]", cf_path, os.path.exists(cf_path))
|
| 520 |
|
| 521 |
results['plots'] = {
|
| 522 |
+
'original_sentiment': f"/tmp/{orig_sent_title}.png",
|
| 523 |
+
'counterfactual_sentiment': f"/tmp/{cf_sent_title}.png",
|
| 524 |
}
|
| 525 |
|
| 526 |
print("[Plot urls]", results['plots'])
|
|
|
|
| 534 |
lr = float(ft_cfg.get("learningRate", 5e-5))
|
| 535 |
|
| 536 |
input_csv = augmented_file
|
| 537 |
+
ft_output_dir = f"/tmp/ft_{timestamp}"
|
| 538 |
os.makedirs(ft_output_dir, exist_ok=True)
|
| 539 |
|
| 540 |
try:
|
|
|
|
| 548 |
)
|
| 549 |
print(f"[Fine-tune] Saved fine-tuned model to {ft_output_dir}")
|
| 550 |
results["finetuned_model_dir"] = ft_output_dir
|
| 551 |
+
zip_base = f"/tmp/ft_{timestamp}"
|
| 552 |
import shutil
|
| 553 |
zip_path = shutil.make_archive(zip_base, 'zip', ft_output_dir)
|
| 554 |
+
results["finetuned_model_zip"] = f"/tmp/{os.path.basename(zip_path)}"
|
| 555 |
except Exception as fe:
|
| 556 |
print(f"[Fine-tune] Failed: {fe}")
|
| 557 |
results["finetuned_model_error"] = str(fe)
|
backend/utils/finetune.py
CHANGED
|
@@ -28,7 +28,7 @@ def build_text_column(df: pd.DataFrame) -> pd.Series:
|
|
| 28 |
def finetune_gpt2_from_csv(
|
| 29 |
csv_path: str,
|
| 30 |
base_model: str = "gpt2",
|
| 31 |
-
output_dir: str = "
|
| 32 |
train_split: float = 0.9,
|
| 33 |
epochs: int = 3,
|
| 34 |
lr: float = 5e-5,
|
|
|
|
| 28 |
def finetune_gpt2_from_csv(
|
| 29 |
csv_path: str,
|
| 30 |
base_model: str = "gpt2",
|
| 31 |
+
output_dir: str = "/tmp/ft_gpt2_out",
|
| 32 |
train_split: float = 0.9,
|
| 33 |
epochs: int = 3,
|
| 34 |
lr: float = 5e-5,
|
backend/utils/utils.py
CHANGED
|
@@ -103,7 +103,7 @@ def load_model_and_tokenizer(model_name: str):
|
|
| 103 |
raise RuntimeError(f"Failed to load model '{model_name}': {e}")
|
| 104 |
|
| 105 |
def finetune(train_texts, tokenizer, model, num_epochs=20, output_dir='./data'):
|
| 106 |
-
train_path = f"
|
| 107 |
|
| 108 |
with open(train_path, "w", encoding="utf-8") as f:
|
| 109 |
for text in train_texts:
|
|
|
|
| 103 |
raise RuntimeError(f"Failed to load model '{model_name}': {e}")
|
| 104 |
|
| 105 |
def finetune(train_texts, tokenizer, model, num_epochs=20, output_dir='./data'):
|
| 106 |
+
train_path = f"/tmp/train.txt"
|
| 107 |
|
| 108 |
with open(train_path, "w", encoding="utf-8") as f:
|
| 109 |
for text in train_texts:
|