wuhp committed
Commit ae4cf01 · verified · 1 parent: f735495

Update app.py

Files changed (1)
  1. app.py +111 -136
app.py CHANGED
@@ -12,6 +12,8 @@ import requests
 import json
 from PIL import Image
 import pandas as pd
+import matplotlib
+matplotlib.use("Agg")  # headless (HF Spaces)
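+# Note: choose the backend before importing pyplot so Agg is in effect from the start.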
 import matplotlib.pyplot as plt
 from threading import Thread
 from queue import Queue
@@ -22,27 +24,21 @@ import sys
 import time
 import glob
 
-# --- Configuration ---
+# --- Logging ---
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-# Defaults for RT-DETRv2 (Supervisely ecosystem) integration
+# --- RT-DETRv2 backend defaults (Supervisely ecosystem) ---
 RTDETRV2_REPO_URL = "https://github.com/supervisely-ecosystem/RT-DETRv2"
 DEFAULT_REPO_DIR = os.path.join("third_party", "rtdetrv2")
 
-# You can still offer "model size" choices to hint the user which config to use,
-# but the actual command is controlled by the template.
 RTDETRV2_MODELS = [
-    "rtdetrv2-l-640",  # label only; adapt your command template to use real config/weights
+    "rtdetrv2-l-640",  # labels only; match your config via the command template
     "rtdetrv2-x-640"
 ]
 DEFAULT_MODEL = RTDETRV2_MODELS[0]
 
-# ------------------------------
-# Utilities
-# ------------------------------
-
+# --- Utilities ---
 def handle_remove_readonly(func, path, exc_info):
-    """Error handler for shutil.rmtree."""
     try:
         os.chmod(path, stat.S_IWRITE)
     except Exception:
@@ -67,14 +63,6 @@ _ROBO_URL_RX = re.compile(
 )
 
 def parse_roboflow_url(s: str):
-    """
-    Support:
-    - https://universe.roboflow.com/<workspace>/<project>[/vN]
-    - https://app.roboflow.com/<workspace>/<project>[/vN]
-    - https://roboflow.com/<workspace>/<project>[/vN]
-    - raw: <workspace>/<project>[/vN]
-    Returns: (workspace, project, version_or_None)
-    """
     s = s.strip()
     m = _ROBO_URL_RX.match(s)
     if m:
@@ -110,7 +98,6 @@ def parse_roboflow_url(s: str):
     return None, None, None
 
 def get_latest_version(api_key, workspace, project):
-    """Gets the latest version number of a Roboflow project."""
     try:
         rf = Roboflow(api_key=api_key)
         proj = rf.workspace(workspace).project(project)
@@ -120,39 +107,24 @@ def get_latest_version(api_key, workspace, project):
         logging.error(f"Could not get latest version for {workspace}/{project}: {e}")
         return None
 
-# --- Normalize class names from data.yaml ---
 def _extract_class_names(data_yaml):
-    """
-    Return list[str] of class names in index order.
-    Supports:
-    - list
-    - dict with numeric keys {0:'cat',1:'dog'}
-    - fallback to ['class_0', ...]
-    """
     names = data_yaml.get('names', None)
-
     if isinstance(names, dict):
         def _k(x):
-            try:
-                return int(x)
-            except Exception:
-                return str(x)
+            try: return int(x)
+            except Exception: return str(x)
         ordered = sorted(names.keys(), key=_k)
         names_list = [names[k] for k in ordered]
     elif isinstance(names, list):
         names_list = names
     else:
         nc = data_yaml.get('nc', 0)
-        try:
-            nc = int(nc)
-        except Exception:
-            nc = 0
+        try: nc = int(nc)
+        except Exception: nc = 0
         names_list = [f"class_{i}" for i in range(nc)]
-
     return [str(x) for x in names_list]
 
 def download_dataset(api_key, workspace, project, version):
-    """Download Roboflow dataset in 'yolov8' layout (works fine for RT-DETR variants)."""
     try:
         rf = Roboflow(api_key=api_key)
         proj = rf.workspace(workspace).project(project)
@@ -180,25 +152,17 @@ def download_dataset(api_key, workspace, project, version):
         return None, [], [], None
 
 def label_path_for(img_path: str) -> str:
-    """Convert .../split/images/file.jpg -> .../split/labels/file.txt."""
     split_dir = os.path.dirname(os.path.dirname(img_path))  # .../split
     base = os.path.splitext(os.path.basename(img_path))[0] + '.txt'
     return os.path.join(split_dir, 'labels', base)
 
 def gather_class_counts(dataset_info, class_mapping):
-    """
-    Count per final class how many images contain that class at least once (counted once per image).
-    class_mapping: original_name -> final_name (or None if removed).
-    """
     if not dataset_info:
         return {}
-
     final_names = set(v for v in class_mapping.values() if v is not None)
     counts = {name: 0 for name in final_names}
-
     for loc, names, splits, _ in dataset_info:
         id_to_name = {idx: class_mapping.get(n, None) for idx, n in enumerate(names)}
-
         for split in splits:
             labels_dir = os.path.join(loc, split, 'labels')
             if not os.path.exists(labels_dir):
@@ -221,11 +185,9 @@ def gather_class_counts(dataset_info, class_mapping):
                 continue
             for m in found:
                 counts[m] += 1
-
     return counts
 
 def finalize_merged_dataset(dataset_info, class_mapping, class_limits, progress=gr.Progress()):
-    """Merge datasets following mapping and per-class image limits."""
     merged_dir = 'rolo_merged_dataset'
     if os.path.exists(merged_dir):
         shutil.rmtree(merged_dir, onerror=handle_remove_readonly)
@@ -238,7 +200,6 @@ def finalize_merged_dataset(dataset_info, class_mapping, class_limits, progress=
     active_classes = sorted(set([cls for cls, limit in class_limits.items() if limit > 0]))
     final_class_map = {name: i for i, name in enumerate(active_classes)}
 
-    # Collect candidates
     all_images = []
     for loc, _, splits, _ in dataset_info:
         for split in splits:
@@ -327,18 +288,81 @@ def finalize_merged_dataset(dataset_info, class_mapping, class_limits, progress=
 
     return f"Dataset finalized with {len(selected_images)} images.", os.path.abspath(merged_dir)
 
-# ------------------------------
-# RT-DETRv2 backend helpers
-# ------------------------------
+# --- Repo + deps helpers (auto-install for HF Spaces) ---
+
+def run_pip_install(args, desc="pip install"):
+    logging.info(f"{desc}: {args}")
+    cmd = [sys.executable, "-m", "pip", "install"] + args
+    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+    logging.info(proc.stdout)
+    if proc.returncode != 0:
+        raise RuntimeError(f"{desc} failed with code {proc.returncode}")
 
 def ensure_repo(repo_dir: str, repo_url: str = RTDETRV2_REPO_URL):
-    """Clone the repo into repo_dir if not present."""
     if os.path.isdir(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
         return
     os.makedirs(os.path.dirname(repo_dir), exist_ok=True)
     logging.info(f"Cloning RT-DETRv2 repo into {repo_dir} ...")
-    cmd = ["git", "clone", "--depth", "1", repo_url, repo_dir]
-    subprocess.run(cmd, check=True)
+    subprocess.run(["git", "clone", "--depth", "1", repo_url, repo_dir], check=True)
+
+def ensure_python_deps(repo_dir: str):
+    """
+    Auto-install dependencies (idempotent).
+    - Tries to install pinned basics that are often needed.
+    - If repo has requirements*.txt, install them.
+    - Creates a .deps_installed marker to skip on next run.
+    """
+    marker = os.path.join(repo_dir, ".deps_installed")
+    if os.path.exists(marker):
+        logging.info("Dependencies already installed; skipping.")
+        return
+
+    # 1) Common essentials for vision training environments on HF Spaces
+    basics = [
+        "numpy<2",  # safer with many libs
+        "pillow",
+        "tqdm",
+        "pyyaml",
+        "matplotlib",
+        "pandas",
+        "scipy",
+        "opencv-python-headless",
+        "packaging",
+        "requests",
+        "pycocotools-windows; platform_system=='Windows'",
+        "pycocotools; platform_system!='Windows'",
+    ]
+    try:
+        run_pip_install(basics, desc="Installing common basics")
+    except Exception as e:
+        logging.warning(f"Basic installs had issues: {e}")
+
+    # 2) Repo requirements
+    req_files = []
+    for name in ["requirements.txt", "requirements-dev.txt", "requirements.in"]:
+        p = os.path.join(repo_dir, name)
+        if os.path.isfile(p):
+            req_files.append(p)
+
+    for rf in req_files:
+        try:
+            run_pip_install(["-r", rf], desc=f"Installing repo requirements from {rf}")
+        except Exception as e:
+            logging.warning(f"Installing {rf} failed: {e}")
+
+    # 3) Optional: torch if not present (CPU-only by default on Spaces)
+    try:
+        import torch  # noqa: F401
+    except Exception:
+        # Try a CPU-friendly torch; change version/cuda wheels if needed
+        try:
+            run_pip_install(["torch", "torchvision", "torchaudio", "--index-url", "https://download.pytorch.org/whl/cpu"], desc="Installing PyTorch (CPU)")
+        except Exception as e:
+            logging.warning(f"PyTorch installation failed/skipped: {e}")
+
+    # Mark done
+    with open(marker, "w") as f:
+        f.write("ok\n")
 
 def make_train_command(template: str, data_yaml: str, epochs: int, batch: int, imgsz: int,
                        lr: float, optimizer: str, run_name: str, output_dir: str) -> str:
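+    # Hypothetical example template, assuming the template string is .format()-ed with
+    # the keyword arguments above (adapt to the repo's real entry point and flags):
+    #   python train.py --data {data_yaml} --epochs {epochs} --batch {batch} --imgsz {imgsz} \
+    #       --lr {lr} --optimizer {optimizer} --name {run_name} --output {output_dir}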
@@ -354,7 +378,6 @@ def make_train_command(template: str, data_yaml: str, epochs: int, batch: int, i
     )
 
 _METRIC_PATTERNS = [
-    # add more patterns if your repo prints differently
     (re.compile(r"mAP@0\.5[:/]?0\.95[^0-9]*([0-9]*\.?[0-9]+)"), "mAP50_95"),
    (re.compile(r"mAP50[^0-9]*([0-9]*\.?[0-9]+)"), "mAP50"),
    (re.compile(r"\bval[_/ ]?loss[^0-9\-]*([0-9]*\.?[0-9]+)"), "val_loss"),
@@ -375,10 +398,6 @@ def parse_metrics_from_line(line: str):
     return result
 
 def guess_final_weights(output_dir: str):
-    """
-    Try to locate a 'best' checkpoint in output_dir.
-    Supports .pt/.pth/.pdparams etc. Return first match or None.
-    """
     patterns = [
         os.path.join(output_dir, "**", "best.*"),
         os.path.join(output_dir, "**", "best_model.*"),
@@ -390,12 +409,8 @@ def guess_final_weights(output_dir: str):
         return hits[0]
     return None
 
-# ------------------------------
-# Gradio UI Event Handlers
-# ------------------------------
-
+# --- Gradio handlers ---
 def load_datasets_handler(api_key, url_file, progress=gr.Progress()):
-    """Handles the 'Load Datasets' button click."""
     api_key = api_key or os.getenv("ROBOFLOW_API_KEY", "")
     if not api_key:
         raise gr.Error("Roboflow API Key is required (or set ROBOFLOW_API_KEY).")
@@ -407,14 +422,12 @@ def load_datasets_handler(api_key, url_file, progress=gr.Progress()):
 
     dataset_info = []
     failures = []
-
     for i, raw in enumerate(urls):
         progress((i + 1) / max(1, len(urls)), desc=f"Parsing {i+1}/{len(urls)}")
         ws, proj, ver = parse_roboflow_url(raw)
         if not (ws and proj):
             failures.append((raw, "ParseError: could not resolve workspace/project"))
             continue
-
         if ver is None:
             ver = get_latest_version(api_key, ws, proj)
             if ver is None:
@@ -431,26 +444,21 @@ def load_datasets_handler(api_key, url_file, progress=gr.Progress()):
         msg = "No datasets were loaded successfully.\n" + "\n".join([f"- {u}: {why}" for u, why in failures[:10]])
         raise gr.Error(msg)
 
-    # ensure names are strings before sorting
     all_names = sorted({str(n) for _, names, _, _ in dataset_info for n in names})
     class_map = {name: name for name in all_names}
-
     initial_counts = gather_class_counts(dataset_info, class_map)
     df_data = [[name, name, initial_counts.get(name, 0), False] for name in all_names]
     status_text = "Datasets loaded successfully."
     if failures:
         status_text += f" ({len(dataset_info)} OK, {len(failures)} failed; see console logs)."
 
-    # FIX: gr.update(...) (not gr.DataFrame.update)
     return status_text, dataset_info, gr.update(
         value=pd.DataFrame(df_data, columns=["Original Name", "Rename To", "Max Images", "Remove"])
     )
 
 def update_class_counts_handler(class_df, dataset_info):
-    """Live preview of merged class counts given the current mapping/removals."""
     if class_df is None or not dataset_info:
         return None
-
     class_df = pd.DataFrame(class_df)
     mapping = {}
     for _, row in class_df.iterrows():
@@ -462,10 +470,8 @@ def update_class_counts_handler(class_df, dataset_info):
 
     final_names = sorted(set(v for v in mapping.values() if v))
     counts = {k: 0 for k in final_names}
-
     for loc, names, splits, _ in dataset_info:
         id_to_final = {idx: mapping.get(n, None) for idx, n in enumerate(names)}
-
         for split in splits:
             labels_dir = os.path.join(loc, split, 'labels')
             if not os.path.exists(labels_dir):
@@ -493,7 +499,6 @@ def update_class_counts_handler(class_df, dataset_info):
     return summary_df
 
 def finalize_handler(dataset_info, class_df, progress=gr.Progress()):
-    """Create the merged dataset directory with relabeled .txts and data.yaml."""
     if not dataset_info:
         raise gr.Error("Load datasets first in Tab 1.")
     if class_df is None:
@@ -515,20 +520,19 @@ def finalize_handler(dataset_info, class_df, progress=gr.Progress()):
 
 def training_handler_rtdetrv2(dataset_path, repo_dir, model_choice, run_name, epochs, batch, imgsz, lr, opt,
                               cmd_template, progress=gr.Progress()):
-    """
-    Train using RT-DETRv2 repo via a configurable command template.
-    We stream logs, parse simple metrics when patterns match, and try to locate a best checkpoint on completion.
-    """
     if not dataset_path:
         raise gr.Error("Finalize a dataset in Tab 2 before training.")
 
-    # Make sure repo exists
+    # Clone + deps (idempotent)
     try:
         ensure_repo(repo_dir)
+        ensure_python_deps(repo_dir)
     except subprocess.CalledProcessError as e:
-        raise gr.Error(f"Failed to clone RT-DETRv2 repo: {e}")
+        raise gr.Error(f"Failed to clone repo: {e}")
+    except Exception as e:
+        raise gr.Error(f"Dependency setup failed: {e}")
 
-    # Prepare output directory
+    # Output dir
     output_dir = os.path.join("runs", "train", str(run_name))
     os.makedirs(output_dir, exist_ok=True)
 
@@ -536,7 +540,7 @@ def training_handler_rtdetrv2(dataset_path, repo_dir, model_choice, run_name, ep
     if not os.path.isfile(data_yaml):
         raise gr.Error(f"'data.yaml' was not found in: {dataset_path}")
 
-    # Build the command
+    # Build command from template
     cmd = make_train_command(
         template=cmd_template,
         data_yaml=data_yaml,
@@ -549,54 +553,36 @@ def training_handler_rtdetrv2(dataset_path, repo_dir, model_choice, run_name, ep
         output_dir=output_dir
     )
 
-    # Launch training subprocess in repo_dir
     logging.info(f"Running training command in {repo_dir}: {cmd}")
     proc = subprocess.Popen(
-        cmd,
-        cwd=repo_dir,
-        shell=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        bufsize=1,
-        universal_newlines=True,
-        env={**os.environ}  # inherit env (CUDA, etc.)
+        cmd, cwd=repo_dir, shell=True,
+        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        bufsize=1, universal_newlines=True, env={**os.environ}
     )
 
-    # Live metrics
     history = {k: [] for k in ['epoch', 'train_loss', 'val_loss', 'mAP50', 'mAP50_95']}
-    last_epoch = 0
-
-    # Stream logs and parse
     for line in iter(proc.stdout.readline, ''):
         line = line.rstrip()
-        # Update progress indeterminately (we don't know total epochs from logs generically)
-        if "epoch" in line.lower():
-            progress(0.0, desc=line[-120:])  # show last part of the line
-        else:
-            progress(0.0, desc=line[-120:])
-
+        progress(0.0, desc=line[-120:])
         metrics = parse_metrics_from_line(line)
         if metrics:
             for k, v in metrics.items():
                 history[k].append(v)
-        # Plot when we detect an epoch number or mAP/loss update
-        # Plot Loss
+
+        # plot loss
         fig_loss = plt.figure()
         ax_loss = fig_loss.add_subplot(111)
         ax_loss.plot(history['epoch'], history['train_loss'], "o-", label='Train Loss')
         ax_loss.plot(history['epoch'], history['val_loss'], "o-", label='Val Loss')
-        ax_loss.legend()
-        ax_loss.set_title("Loss")
+        ax_loss.legend(); ax_loss.set_title("Loss")
 
-        # Plot mAP
+        # plot mAP
         fig_map = plt.figure()
         ax_map = fig_map.add_subplot(111)
         ax_map.plot(history['epoch'], history['mAP50'], "o-", label='mAP@0.5')
         ax_map.plot(history['epoch'], history['mAP50_95'], "o-", label='mAP@0.5:0.95')
-        ax_map.legend()
-        ax_map.set_title("mAP")
+        ax_map.legend(); ax_map.set_title("mAP")
 
-        # Emit an update to the UI (status text is the last log line)
         yield line[-200:], fig_loss, fig_map, None
 
     proc.stdout.close()
@@ -604,16 +590,14 @@ def training_handler_rtdetrv2(dataset_path, repo_dir, model_choice, run_name, ep
     if ret != 0:
         raise gr.Error(f"Training process exited with code {ret}. Check console/logs for details.")
 
-    # Try to locate a best checkpoint
     final_ckpt = guess_final_weights(output_dir)
     if final_ckpt and os.path.isfile(final_ckpt):
         yield "Training complete!", None, None, gr.update(value=final_ckpt, visible=True)
     else:
-        # Still complete, but we couldn't find a checkpoint automatically
-        yield "Training finished. Could not auto-detect 'best' checkpoint; please check the output directory.", None, None, gr.update(visible=False)
+        yield ("Training finished. Could not auto-detect a 'best' checkpoint; "
+               "please check the output directory."), None, None, gr.update(visible=False)
 
 def upload_handler(model_file, hf_token, hf_repo, gh_token, gh_repo, progress=gr.Progress()):
-    """Handles model upload to Hugging Face and GitHub."""
     if not model_file:
         raise gr.Error("No trained model file available to upload. Train a model first.")
 
@@ -640,7 +624,6 @@ def upload_handler(model_file, hf_token, hf_repo, gh_token, gh_repo, progress=gr
     try:
         if '/' not in gh_repo:
             raise ValueError("GitHub repo must be in the form 'username/repo'.")
-
         username, repo_name = gh_repo.split('/')
         api_url = f"https://api.github.com/repos/{username}/{repo_name}/contents/{os.path.basename(model_file.name)}"
         headers = {"Authorization": f"token {gh_token}"}
@@ -652,11 +635,9 @@ def upload_handler(model_file, hf_token, hf_repo, gh_token, gh_repo, progress=gr
         sha = get_resp.json().get('sha') if get_resp.ok else None
 
         data = {"message": "Upload trained model from Rolo app", "content": content}
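+        # GitHub's contents API requires the existing file's sha to overwrite a path.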
-        if sha:
-            data["sha"] = sha
+        if sha: data["sha"] = sha
 
         put_resp = requests.put(api_url, headers=headers, json=data, timeout=60)
-
         if put_resp.ok:
             gh_status = f"Success! Model at: {put_resp.json()['content']['html_url']}"
         else:
@@ -668,27 +649,24 @@ def upload_handler(model_file, hf_token, hf_repo, gh_token, gh_repo, progress=gr
     progress(1)
     return hf_status, gh_status
 
-# ------------------------------
-# Gradio UI
-# ------------------------------
+# --- Gradio UI ---
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky")) as app:
-    gr.Markdown("# Rolo: RT-DETRv2 Training Dashboard (Supervisely Ecosystem Backend)")
+    gr.Markdown("# Rolo: RT-DETRv2 Training Dashboard (Auto-setup for Hugging Face)")
 
-    # State variables
     dataset_info_state = gr.State([])
     final_dataset_path_state = gr.State(None)
 
     with gr.Tabs():
         with gr.TabItem("1. Prepare Datasets"):
-            gr.Markdown("### Load Roboflow Datasets\nProvide your Roboflow API key and upload a `.txt` file containing one Roboflow dataset URL or `workspace/project[/vN]` per line.")
+            gr.Markdown("Upload a `.txt` with Roboflow URLs or `workspace/project[/vN]` lines.")
             with gr.Row():
-                rf_api_key = gr.Textbox(label="Roboflow API Key (or set ROBOFLOW_API_KEY env)", type="password", scale=2)
+                rf_api_key = gr.Textbox(label="Roboflow API Key (or set ROBOFLOW_API_KEY)", type="password", scale=2)
                 rf_url_file = gr.File(label="Upload Roboflow URLs (.txt)", file_types=[".txt"], scale=1)
             load_btn = gr.Button("Load Datasets", variant="primary")
             dataset_status = gr.Textbox(label="Status", interactive=False)
 
         with gr.TabItem("2. Manage & Merge"):
-            gr.Markdown("### Configure Classes and Finalize Dataset\nRename classes to merge them, set image limits, or remove them. Click **Update Counts** to preview, then **Finalize** to create the dataset.")
+            gr.Markdown("Rename classes, set image limits, or remove them. Preview, then finalize.")
             with gr.Row():
                 class_df = gr.DataFrame(
                     headers=["Original Name", "Rename To", "Max Images", "Remove"],
@@ -702,18 +680,16 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky")) as app:
                     interactive=False
                 )
                 update_counts_btn = gr.Button("Update Counts")
-
             finalize_btn = gr.Button("Finalize Merged Dataset", variant="primary")
             finalize_status = gr.Textbox(label="Status", interactive=False)
 
         with gr.TabItem("3. Configure & Train"):
-            gr.Markdown("### Set Hyperparameters and Train with RT-DETRv2")
+            gr.Markdown("Set hyperparameters and the training command template.")
             with gr.Row():
                 with gr.Column(scale=1):
                     model_choice_dd = gr.Dropdown(
-                        label="Model Choice (label only adjust your command template to use the right config)",
-                        choices=RTDETRV2_MODELS,
-                        value=DEFAULT_MODEL
+                        label="Model Choice (label only; use your config in the template)",
+                        choices=RTDETRV2_MODELS, value=DEFAULT_MODEL
                     )
                     run_name_tb = gr.Textbox(label="Run Name", value="rtdetrv2_run_1")
                     epochs_sl = gr.Slider(1, 500, 100, step=1, label="Epochs")
@@ -721,7 +697,6 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky")) as app:
                     imgsz_num = gr.Number(label="Image Size", value=640)
                     lr_num = gr.Number(label="Learning Rate", value=0.001)
                     opt_dd = gr.Dropdown(["Adam", "AdamW", "SGD"], value="AdamW", label="Optimizer")
-
                     repo_dir_tb = gr.Textbox(label="RT-DETRv2 repo directory", value=DEFAULT_REPO_DIR)
                     cmd_template_tb = gr.Textbox(
                         label="Train command template",
@@ -745,7 +720,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky")) as app:
             final_model_file = gr.File(label="Download Trained Model (best.*)", interactive=False, visible=False)
 
         with gr.TabItem("4. Upload Model"):
-            gr.Markdown("### Upload Your Trained Model\nAfter training, you can upload the best checkpoint to Hugging Face and/or GitHub.")
+            gr.Markdown("Upload your best checkpoint to Hugging Face or GitHub.")
             with gr.Row():
                 with gr.Column():
                     gr.Markdown("#### Hugging Face")
@@ -760,7 +735,6 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky")) as app:
             hf_status = gr.Textbox(label="Hugging Face Status", interactive=False)
             gh_status = gr.Textbox(label="GitHub Status", interactive=False)
 
-    # Wire UI handlers
     load_btn.click(
         fn=load_datasets_handler,
         inputs=[rf_api_key, rf_url_file],
@@ -780,7 +754,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky")) as app:
         fn=training_handler_rtdetrv2,
         inputs=[
             final_dataset_path_state,   # dataset_path
-            repo_dir_tb,                # repo_dir
+            repo_dir_tb,                # repo_dir (auto clone + pip install)
             model_choice_dd,            # model_choice (label only)
             run_name_tb,
             epochs_sl,
@@ -799,5 +773,6 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="sky")) as app:
     )
 
 if __name__ == "__main__":
-    # If Ultralytics warnings annoy you, set: export YOLO_CONFIG_DIR=/tmp/Ultralytics
+    # Hugging Face Spaces: set server name/port via env if needed.
+    # Example: app.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", 7860)), debug=True)
     app.launch(debug=True)