Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	rename files
Browse files
- Dockerfile +2 -2
- requirements.txt +1 -0
- run.py → run_job.py +3 -1
- app.py → start_app.py +56 -12
    	
        Dockerfile
    CHANGED
    
    | @@ -20,8 +20,8 @@ RUN pip install --no-cache-dir --upgrade pip | |
| 20 | 
             
            COPY --chown=user run.py app.py requirements.txt README.md $HOME/app/
         | 
| 21 |  | 
| 22 | 
             
            # Install dependencies
         | 
| 23 | 
            -
            RUN pip install "gradio[oauth]" | 
| 24 | 
             
            RUN pip install -r requirements.txt
         | 
| 25 |  | 
| 26 | 
             
            # Run app
         | 
| 27 | 
            -
            ENTRYPOINT python  | 
|  | |
| 20 | 
             
            COPY --chown=user run.py app.py requirements.txt README.md $HOME/app/
         | 
| 21 |  | 
| 22 | 
             
            # Install dependencies
         | 
| 23 | 
            +
            RUN pip install "gradio[oauth]"
         | 
| 24 | 
             
            RUN pip install -r requirements.txt
         | 
| 25 |  | 
| 26 | 
             
            # Run app
         | 
| 27 | 
            +
            ENTRYPOINT python start_app.py
         | 
    	
        requirements.txt
    CHANGED
    
    | @@ -1,3 +1,4 @@ | |
|  | |
| 1 | 
             
            duckdb
         | 
| 2 | 
             
            huggingface_hub
         | 
| 3 | 
             
            tabulate
         | 
|  | |
| 1 | 
            +
            fire
         | 
| 2 | 
             
            duckdb
         | 
| 3 | 
             
            huggingface_hub
         | 
| 4 | 
             
            tabulate
         | 
    	
        run.py → run_job.py
    RENAMED
    
    | @@ -50,11 +50,13 @@ def sql(src: str, dst: str, query: str, config: str = "default", split: str = "t | |
| 50 | 
             
                    src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
         | 
| 51 | 
             
                    if not src_kwargs:
         | 
| 52 | 
             
                        raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
         | 
|  | |
| 53 | 
             
                    con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
         | 
| 54 | 
             
                    if dry_run:
         | 
| 55 | 
            -
                        print(f"Sample data from '{src}' that would be written to '{dst}':\n")
         | 
| 56 | 
             
                    else:
         | 
| 57 | 
             
                        con.sql("PRAGMA enable_progress_bar;")
         | 
|  | |
| 58 | 
             
                    result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
         | 
| 59 | 
             
                    if dry_run:
         | 
| 60 | 
             
                        print(result.df().to_markdown())
         | 
|  | |
| 50 | 
             
                    src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
         | 
| 51 | 
             
                    if not src_kwargs:
         | 
| 52 | 
             
                        raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
         | 
| 53 | 
            +
             | 
| 54 | 
             
                    con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
         | 
| 55 | 
             
                    if dry_run:
         | 
| 56 | 
            +
                        print(f"Sample data from '{src}' that would be written to dataset '{dst}':\n")
         | 
| 57 | 
             
                    else:
         | 
| 58 | 
             
                        con.sql("PRAGMA enable_progress_bar;")
         | 
| 59 | 
            +
             | 
| 60 | 
             
                    result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
         | 
| 61 | 
             
                    if dry_run:
         | 
| 62 | 
             
                        print(result.df().to_markdown())
         | 
    	
        app.py → start_app.py
    RENAMED
    
    | @@ -1,18 +1,20 @@ | |
|  | |
| 1 | 
             
            import re
         | 
| 2 | 
             
            import subprocess
         | 
| 3 | 
             
            import yaml
         | 
| 4 |  | 
| 5 | 
             
            import gradio as gr
         | 
| 6 | 
             
            import requests
         | 
| 7 | 
            -
            from huggingface_hub import HfApi
         | 
| 8 |  | 
| 9 |  | 
| 10 | 
            -
            CMD = ["python" ," | 
| 11 |  | 
| 12 | 
             
            with open("README.md") as f:
         | 
| 13 | 
             
                METADATA = yaml.safe_load(f.read().split("---\n")[1])
         | 
| 14 | 
             
            TITLE = METADATA["title"]
         | 
| 15 | 
             
            EMOJI = METADATA["emoji"]
         | 
|  | |
| 16 |  | 
| 17 | 
             
            try:
         | 
| 18 | 
             
                process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         | 
| @@ -22,26 +24,68 @@ except Exception: | |
| 22 |  | 
| 23 | 
             
            DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
         | 
| 24 |  | 
| 25 | 
            -
            def  | 
| 26 | 
             
                if (percent_match := re.search("\\d+(?:\\.\\d+)?%", line)) and any(c in line.split("%")[1][:10] for c in "|ββ"):
         | 
| 27 | 
             
                    [pbars.pop(desc) for desc, percent in pbars.items() if percent == 1.]
         | 
| 28 | 
             
                    percent = float(percent_match.group(0)[:-1]) / 100
         | 
| 29 | 
             
                    desc = line[:percent_match.start()].strip() or "Progress"
         | 
| 30 | 
             
                    pbars[desc] = percent
         | 
|  | |
|  | |
|  | |
| 31 |  | 
| 32 | 
             
            def dry_run(src, config, split, dst, query):
         | 
| 33 | 
             
                if not all([src, config, split, dst, query]):
         | 
| 34 | 
             
                    raise gr.Error("Please fill source, destination and query.")
         | 
| 35 | 
            -
                 | 
| 36 | 
            -
                 | 
|  | |
|  | |
|  | |
| 37 | 
             
                for line in iter(process.stdout.readline, b""):
         | 
| 38 | 
             
                    logs += line.decode()
         | 
| 39 | 
            -
                    yield {output_markdown: logs | 
| 40 |  | 
| 41 | 
            -
            def run(src, config, split, dst, query):
         | 
| 42 | 
             
                if not all([src, config, split, dst, query]):
         | 
| 43 | 
             
                    raise gr.Error("Please fill source, destination and query.")
         | 
| 44 | 
            -
                 | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 45 |  | 
| 46 | 
             
            READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
         | 
| 47 | 
             
            NUM_TRENDING_DATASETS = 10
         | 
| @@ -51,17 +95,17 @@ with gr.Blocks() as demo: | |
| 51 | 
             
                    with gr.Column(scale=10):
         | 
| 52 | 
             
                        gr.Markdown(f"# {TITLE} {EMOJI}")
         | 
| 53 | 
             
                    with gr.Column():
         | 
| 54 | 
            -
                        gr.LoginButton( | 
| 55 | 
             
                with gr.Row():
         | 
| 56 | 
            -
                    with gr.Column():
         | 
| 57 | 
             
                        with gr.Row():
         | 
| 58 | 
             
                            loading_codes_json = gr.JSON([], visible=False)
         | 
| 59 | 
             
                            dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
         | 
| 60 | 
             
                            subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
         | 
| 61 | 
             
                            split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
         | 
| 62 | 
            -
                    with gr.Column( | 
| 63 | 
             
                        gr.HTML("<div style='font-size: 4em;'>β</div>")
         | 
| 64 | 
            -
                    with gr.Column():
         | 
| 65 | 
             
                        dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
         | 
| 66 | 
             
                query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
         | 
| 67 | 
             
                with gr.Row():
         | 
|  | |
| 1 | 
            +
            import os
         | 
| 2 | 
             
            import re
         | 
| 3 | 
             
            import subprocess
         | 
| 4 | 
             
            import yaml
         | 
| 5 |  | 
| 6 | 
             
            import gradio as gr
         | 
| 7 | 
             
            import requests
         | 
| 8 | 
            +
            from huggingface_hub import HfApi, get_token
         | 
| 9 |  | 
| 10 |  | 
| 11 | 
            +
            CMD = ["python" ,"run_job.py"]
         | 
| 12 |  | 
| 13 | 
             
            with open("README.md") as f:
         | 
| 14 | 
             
                METADATA = yaml.safe_load(f.read().split("---\n")[1])
         | 
| 15 | 
             
            TITLE = METADATA["title"]
         | 
| 16 | 
             
            EMOJI = METADATA["emoji"]
         | 
| 17 | 
            +
            spaceId = os.environ.get("SPACE_ID") or "lhoestq/run-duckdb"
         | 
| 18 |  | 
| 19 | 
             
            try:
         | 
| 20 | 
             
                process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         | 
|  | |
| 24 |  | 
| 25 | 
             
            DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
         | 
| 26 |  | 
| 27 | 
            +
            def parse_log(line: str, pbars: dict[str, float]):
         | 
| 28 | 
             
                if (percent_match := re.search("\\d+(?:\\.\\d+)?%", line)) and any(c in line.split("%")[1][:10] for c in "|ββ"):
         | 
| 29 | 
             
                    [pbars.pop(desc) for desc, percent in pbars.items() if percent == 1.]
         | 
| 30 | 
             
                    percent = float(percent_match.group(0)[:-1]) / 100
         | 
| 31 | 
             
                    desc = line[:percent_match.start()].strip() or "Progress"
         | 
| 32 | 
             
                    pbars[desc] = percent
         | 
| 33 | 
            +
                    yield ""
         | 
| 34 | 
            +
                else:
         | 
| 35 | 
            +
                    yield line
         | 
| 36 |  | 
| 37 | 
             
            def dry_run(src, config, split, dst, query):
         | 
| 38 | 
             
                if not all([src, config, split, dst, query]):
         | 
| 39 | 
             
                    raise gr.Error("Please fill source, destination and query.")
         | 
| 40 | 
            +
                args = ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query, DRY_RUN]
         | 
| 41 | 
            +
                cmd = CMD + args
         | 
| 42 | 
            +
                logs = "Job:\n\n```bash\n" + " ".join('"' + arg.replace('"', '\"""') + '"' if " " in arg else arg for arg in cmd) + "\n```\nOutput:\n\n"
         | 
| 43 | 
            +
                yield {output_markdown: logs, progress_labels: gr.Label(visible=False)}
         | 
| 44 | 
            +
                process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
         | 
| 45 | 
             
                for line in iter(process.stdout.readline, b""):
         | 
| 46 | 
             
                    logs += line.decode()
         | 
| 47 | 
            +
                    yield {output_markdown: logs}
         | 
| 48 |  | 
| 49 | 
            +
            def run(src, config, split, dst, query, oauth_token: gr.OAuthToken | None, profile: gr.OAuthProfile | None):
         | 
| 50 | 
             
                if not all([src, config, split, dst, query]):
         | 
| 51 | 
             
                    raise gr.Error("Please fill source, destination and query.")
         | 
| 52 | 
            +
                if oauth_token and profile:
         | 
| 53 | 
            +
                    token = oauth_token.token
         | 
| 54 | 
            +
                    username = profile.username
         | 
| 55 | 
            +
                elif (token := get_token()):
         | 
| 56 | 
            +
                    username = HfApi().whoami(token=token)["name"]
         | 
| 57 | 
            +
                else:
         | 
| 58 | 
            +
                    raise gr.Error("Please log in to run the job.")
         | 
| 59 | 
            +
                args = ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query]
         | 
| 60 | 
            +
                cmd = CMD + args
         | 
| 61 | 
            +
                logs = "Job:\n\n```bash\n" + " ".join('"' + arg.replace('"', '\"""') + '"' if " " in arg else arg for arg in cmd) + "\n```\nOutput:\n\n"
         | 
| 62 | 
            +
                pbars = {}
         | 
| 63 | 
            +
                yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
         | 
| 64 | 
            +
                resp = requests.post(
         | 
| 65 | 
            +
                    f"https://huggingface.co/api/jobs/{username}",
         | 
| 66 | 
            +
                    json={
         | 
| 67 | 
            +
                        "spaceId": spaceId,
         | 
| 68 | 
            +
                        "arguments": args,
         | 
| 69 | 
            +
                        "command":  CMD,
         | 
| 70 | 
            +
                        "environment": {},
         | 
| 71 | 
            +
                        "flavor": "cpu-basic" 
         | 
| 72 | 
            +
                    },
         | 
| 73 | 
            +
                    headers={"Authorization": f"Bearer {token}"}
         | 
| 74 | 
            +
                )
         | 
| 75 | 
            +
                if resp.status_code != 200:
         | 
| 76 | 
            +
                    logs += resp.text
         | 
| 77 | 
            +
                    pbars = {"Finished with an error ❌": 1.0}
         | 
| 78 | 
            +
                else:
         | 
| 79 | 
            +
                    job_id = resp.json()["metadata"]["job_id"]
         | 
| 80 | 
            +
                    resp = requests.get(
         | 
| 81 | 
            +
                        f"https://huggingface.co/api/jobs/{username}/{job_id}/logs-stream",
         | 
| 82 | 
            +
                        headers={"Authorization": f"Bearer {token}"}
         | 
| 83 | 
            +
                    )
         | 
| 84 | 
            +
                    for line in iter(resp.raw.readline, b""):
         | 
| 85 | 
            +
                        logs += parse_log(line.decode(), pbars=pbars)
         | 
| 86 | 
            +
                        yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
         | 
| 87 | 
            +
                    pbars = {"Finished" + (" ✅" if process.returncode == 0 else " with an error ❌"): 1.0}
         | 
| 88 | 
            +
                yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
         | 
| 89 |  | 
| 90 | 
             
            READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
         | 
| 91 | 
             
            NUM_TRENDING_DATASETS = 10
         | 
|  | |
| 95 | 
             
                    with gr.Column(scale=10):
         | 
| 96 | 
             
                        gr.Markdown(f"# {TITLE} {EMOJI}")
         | 
| 97 | 
             
                    with gr.Column():
         | 
| 98 | 
            +
                        gr.LoginButton()
         | 
| 99 | 
             
                with gr.Row():
         | 
| 100 | 
            +
                    with gr.Column(scale=10):
         | 
| 101 | 
             
                        with gr.Row():
         | 
| 102 | 
             
                            loading_codes_json = gr.JSON([], visible=False)
         | 
| 103 | 
             
                            dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
         | 
| 104 | 
             
                            subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
         | 
| 105 | 
             
                            split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
         | 
| 106 | 
            +
                    with gr.Column(min_width=60):
         | 
| 107 | 
             
                        gr.HTML("<div style='font-size: 4em;'>β</div>")
         | 
| 108 | 
            +
                    with gr.Column(scale=10):
         | 
| 109 | 
             
                        dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
         | 
| 110 | 
             
                query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
         | 
| 111 | 
             
                with gr.Row():
         | 
