peihsin0715 committed
Commit 3ca6cb8 · 1 Parent(s): 97a31eb

Fix data saving

backend/server.py CHANGED
@@ -54,7 +54,7 @@ _MODELS = {}
 _CURRENT_DATASET = None
 _GENERATION_RESULTS = None
 
-@app.route('/data/<path:filename>')
+@app.route('/tmp/<path:filename>')
 def serve_data(filename):
     import os
     from flask import Response
@@ -419,7 +419,7 @@ def run_pipeline():
 
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     os.makedirs("data", exist_ok=True)
-    output_file = f"data/pipeline_generation_{timestamp}.csv"
+    output_file = f"/tmp/pipeline_generation_{timestamp}.csv"
     evaluated_results.to_csv(output_file, index=False)
     results['generation_file'] = output_file
     results['generation_samples'] = len(evaluated_results)
@@ -431,7 +431,7 @@ def run_pipeline():
         category_col="category",
         num_cf_per_row=num_cf_per_row
     )
-    augmented_file = f"data/pipeline_generation_cf_augmented_{timestamp}.csv"
+    augmented_file = f"/tmp/pipeline_generation_cf_augmented_{timestamp}.csv"
     augmented_results.to_csv(augmented_file, index=False)
     results['counterfactual_file'] = augmented_file
     results['counterfactual_added'] = len(augmented_results) - len(evaluated_results)
@@ -465,7 +465,7 @@ def run_pipeline():
     mid_point = len(evaluated_results) // 2
     best_sent_subset = evaluated_results.iloc[:mid_point].copy()
 
-    sent_file = f"data/pipeline_sent_subset_{timestamp}.csv"
+    sent_file = f"/tmp/pipeline_sent_subset_{timestamp}.csv"
     best_sent_subset.to_csv(sent_file, index=False)
 
     print(f"Pipeline Step 5: Rank sampling on CF-augmented results...(iterations={iterations}, temp={tau})")
@@ -476,7 +476,7 @@ def run_pipeline():
     mid_point = len(augmented_results) // 2
     cf_best_sent_subset = augmented_results.iloc[:mid_point].copy()
 
-    cf_sent_file = f"data/pipeline_cf_sent_subset_{timestamp}.csv"
+    cf_sent_file = f"/tmp/pipeline_cf_sent_subset_{timestamp}.csv"
     cf_best_sent_subset.to_csv(cf_sent_file, index=False)
 
     orig_means = _mean_by_cat(best_sent_subset, selected_groups_used)
@@ -519,8 +519,8 @@ def run_pipeline():
     print("[Plot check exists]", cf_path, os.path.exists(cf_path))
 
     results['plots'] = {
-        'original_sentiment': f"/data/{orig_sent_title}.png",
-        'counterfactual_sentiment': f"/data/{cf_sent_title}.png",
+        'original_sentiment': f"/tmp/{orig_sent_title}.png",
+        'counterfactual_sentiment': f"/tmp/{cf_sent_title}.png",
     }
 
     print("[Plot urls]", results['plots'])
@@ -534,7 +534,7 @@ def run_pipeline():
     lr = float(ft_cfg.get("learningRate", 5e-5))
 
     input_csv = augmented_file
-    ft_output_dir = f"data/ft_{timestamp}"
+    ft_output_dir = f"/tmp/ft_{timestamp}"
     os.makedirs(ft_output_dir, exist_ok=True)
 
     try:
@@ -548,10 +548,10 @@ def run_pipeline():
         )
         print(f"[Fine-tune] Saved fine-tuned model to {ft_output_dir}")
        results["finetuned_model_dir"] = ft_output_dir
-        zip_base = f"data/ft_{timestamp}"
+        zip_base = f"/tmp/ft_{timestamp}"
         import shutil
         zip_path = shutil.make_archive(zip_base, 'zip', ft_output_dir)
-        results["finetuned_model_zip"] = f"/data/{os.path.basename(zip_path)}"
+        results["finetuned_model_zip"] = f"/tmp/{os.path.basename(zip_path)}"
     except Exception as fe:
         print(f"[Fine-tune] Failed: {fe}")
         results["finetuned_model_error"] = str(fe)
backend/utils/finetune.py CHANGED
@@ -28,7 +28,7 @@ def build_text_column(df: pd.DataFrame) -> pd.Series:
 def finetune_gpt2_from_csv(
     csv_path: str,
     base_model: str = "gpt2",
-    output_dir: str = "data/ft_gpt2_out",
+    output_dir: str = "/tmp/ft_gpt2_out",
     train_split: float = 0.9,
     epochs: int = 3,
     lr: float = 5e-5,
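For context, a short usage sketch of the updated default: the call below passes only the parameters visible in this hunk, uses a hypothetical input CSV name and an assumed import path, and then packages the output directory with shutil.make_archive the same way server.py does, so the archive lands under /tmp where the serving route can reach it.

# Usage sketch only; the CSV path is hypothetical and the import path is assumed.
import shutil
from backend.utils.finetune import finetune_gpt2_from_csv

output_dir = "/tmp/ft_gpt2_out"
finetune_gpt2_from_csv(
    csv_path="/tmp/pipeline_generation_cf_augmented_20250101_000000.csv",  # hypothetical file
    base_model="gpt2",
    output_dir=output_dir,
    train_split=0.9,
    epochs=3,
    lr=5e-5,
)

# Zip the fine-tuned model for download, mirroring the server.py hunk above.
zip_path = shutil.make_archive("/tmp/ft_gpt2_out", "zip", output_dir)
print(zip_path)  # /tmp/ft_gpt2_out.zip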
backend/utils/utils.py CHANGED
@@ -103,7 +103,7 @@ def load_model_and_tokenizer(model_name: str):
         raise RuntimeError(f"Failed to load model '{model_name}': {e}")
 
 def finetune(train_texts, tokenizer, model, num_epochs=20, output_dir='./data'):
-    train_path = f"data/train.txt"
+    train_path = f"/tmp/train.txt"
 
     with open(train_path, "w", encoding="utf-8") as f:
         for text in train_texts:
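The hard-coded /tmp/train.txt keeps the write on a writable filesystem, but concurrent requests would all share the same file. A collision-safer variant (an alternative sketch using the standard library's tempfile, not what the repo does) would let the OS pick a unique name under /tmp:

# Alternative sketch; not the repo's code.
import os
import tempfile

def write_train_file(train_texts):
    # delete=False so the trainer can reopen the file by path after this block.
    with tempfile.NamedTemporaryFile(
        "w", suffix=".txt", prefix="train_", dir="/tmp",
        encoding="utf-8", delete=False,
    ) as f:
        for text in train_texts:
            f.write(text.strip() + "\n")
        return f.name

train_path = write_train_file(["example sentence one", "example sentence two"])
print(train_path)  # e.g. /tmp/train_k2j3x9.txt
os.remove(train_path)  # clean up once training has consumed the file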