Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| from gradio_client import Client | |
| from gradio_client.exceptions import AppError | |
| import frontmatter | |
| import os | |
| import spaces | |
| import torch | |
| import logging | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| from transformers.utils import logging as transformers_logging | |
# Configure logging once, at import time: DEBUG verbosity and a
# "<time> - <logger name> - <level> - <message>" layout for every record.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.DEBUG)

# Module-level logger used by the rest of this file.
logger = logging.getLogger(__name__)
| import huggingface_hub | |
| import prep_decompiled | |
# Model configuration constants.
# MAX_CONTEXT_LENGTH is the total token budget (prompt plus generated tokens);
# MAX_NEW_TOKENS is the part reserved for generation. Prompts are truncated to
# MAX_CONTEXT_LENGTH - MAX_NEW_TOKENS tokens before being sent to a model.
MAX_CONTEXT_LENGTH = 8192
MAX_NEW_TOKENS = 1024

# Authenticate against the Hugging Face Hub. A missing HF_TOKEN environment
# variable raises KeyError here, at import time.
hf_key = os.environ["HF_TOKEN"]
huggingface_hub.login(token=hf_key)

# One tokenizer is used for both the variable decoder and the field decoder.
tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-3b")

logger.info("Loading vardecoder model...")
vardecoder_model = AutoModelForCausalLM.from_pretrained(
    "ejschwartz/resym-vardecoder",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
logger.info("Loaded vardecoder model successfully.")

# The field decoder is intentionally not loaded right now; code that uses it
# must treat ``fielddecoder_model is None`` as "disabled".
logger.info("Fielddecoder model is disabled; skipping load.")
fielddecoder_model = None
# To re-enable the field decoder, restore the following:
# fielddecoder_model = AutoModelForCausalLM.from_pretrained(
#     "ejschwartz/resym-fielddecoder",
#     torch_dtype=torch.bfloat16,
# )
# logger.info("Successfully loaded fielddecoder model")
def make_gradio_client():
    """Return a new gradio ``Client`` for the ReSym field-helper Space.

    A fresh client is constructed on every call; nothing is cached.
    """
    return Client("https://ejschwartz-resym-field-helper.hf.space/")
# Load the example inputs offered in the UI: one example per line of
# examples.txt, with backslash escape sequences (such as a literal "\n")
# expanded into the characters they denote. The context manager closes the
# file as soon as it has been read instead of leaking the handle.
# NOTE(review): the UTF-8 encode + "unicode_escape" decode round trip only
# preserves ASCII text; non-ASCII characters in examples.txt would be mangled.
with open("examples.txt", "r", encoding="utf-8") as _examples_file:
    examples = [
        line.encode().decode("unicode_escape") for line in _examples_file
    ]
| # Example prompt | |
| # "input": "```\n_BOOL8 __fastcall sub_409B9A(_QWORD *a1, _QWORD *a2)\n{\nreturn *a1 < *a2 || *a1 == *a2 && a1[1] < a2[1];\n}\n```\nWhat are the variable name and type for the following memory accesses:a1, a1[1], a2, a2[1]?\n", | |
| # "output": "a1: a, os_reltime* -> sec, os_time_t\na1[1]: a, os_reltime* -> usec, os_time_t\na2: b, os_reltime* -> sec, os_time_t\na2[1]: b, os_reltime* -> usec, os_time_t", | |
def field_prompt(code):
    """Build the field-decoder prompt for a piece of decompiled code.

    The remote field-helper service is asked for the memory-access
    expressions in *code*; the non-empty ``'expr'`` values from the first
    element of its reply are de-duplicated and sorted, then listed in the
    question that forms the prompt.

    Args:
        code: Decompiled source text to analyse.

    Returns:
        A tuple ``(prompt, fields, helper_response)``:
        ``prompt`` is the prompt text, ending with ``"<first field>:"`` when at
        least one field was found; ``fields`` is the sorted list of unique
        expressions; ``helper_response`` is the raw reply from the helper.
        If the helper raises ``AppError`` the result is ``(None, [], None)``.
    """
    try:
        client = make_gradio_client()
        helper_response = client.predict(decompiled_code=code, api_name="/predict")
    except AppError as err:
        print(f"AppError: {err}")
        return None, [], None

    print(f"field helper result: {helper_response}")

    # Unique, non-empty access expressions in a stable (sorted) order.
    unique_exprs = {entry['expr'] for entry in helper_response[0] if entry['expr'] != ''}
    fields = sorted(unique_exprs)
    print(f"fields: {fields}")

    question = f"```\n{code}\n```\nWhat are the variable name and type for the following memory accesses:{', '.join(fields)}?\n"
    # The prompt is primed with the first field so the completion starts there.
    answer_prefix = f"{fields[0]}:" if fields else ""
    prompt = question + answer_prefix
    print(f"field prompt: {prompt!r}")
    return prompt, fields, helper_response
def _encode_prompt(prompt):
    """Tokenize *prompt* onto the GPU, truncated so MAX_NEW_TOKENS still fit in the context."""
    token_budget = MAX_CONTEXT_LENGTH - MAX_NEW_TOKENS
    return tokenizer.encode(prompt, return_tensors="pt").cuda()[:, :token_budget]


def _beam_search(model, input_ids):
    """Run 4-beam, non-sampling generation and return the raw ``model.generate`` output."""
    return model.generate(
        input_ids=input_ids,
        max_new_tokens=MAX_NEW_TOKENS,
        num_beams=4,
        num_return_sequences=1,
        do_sample=False,
        early_stopping=False,
        pad_token_id=0,
        eos_token_id=0,
    )


def _decode_completion(sequence, input_ids):
    """Decode only the tokens of *sequence* that come after the prompt *input_ids*."""
    prompt_length = input_ids.size(1)
    return tokenizer.decode(
        sequence[prompt_length:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )


def infer(code):
    """Predict variable and field names/types for one decompiled function.

    Args:
        code: Decompiled source text of a function (the UI labels this input
            "Hex-Rays Decompilation").

    Returns:
        A 4-tuple of strings ``(var_output, field_output, varstring,
        fieldstring)``:

        * ``var_output`` -- the variable decoder's answer, prefixed with the
          first variable name, or ``"No variables"`` when no variable names
          could be extracted from *code*.
        * ``field_output`` -- the field decoder's answer, or one of the status
          messages ``"Failed to parse fields"``, ``"No fields"`` or
          ``"TEMPORARILY DISABLED"``.
        * ``varstring`` -- comma-separated, backtick-quoted variable names.
        * ``fieldstring`` -- comma-separated field access expressions.
    """
    # NOTE(review): this file imports `spaces` but nothing visible here uses
    # it. If the app runs on ZeroGPU hardware, GPU work is normally wrapped in
    # `@spaces.GPU` -- confirm whether this function needs the decorator.
    source_lines = code.splitlines()
    body_names = [
        v["name"] for v in prep_decompiled.extract_comments(source_lines) if "name" in v
    ]
    arg_names = [
        v["name"] for v in prep_decompiled.parse_signature(source_lines) if "name" in v
    ]
    # Arguments first, then variables declared in the body.
    var_names = arg_names + body_names
    varstring = ", ".join(f"`{name}`" for name in var_names)

    if var_names:
        first_var = var_names[0]
        # ejs: Yeah, this var_name thing is really bizarre. But look at https://github.com/lt-asset/resym/blob/main/training_src/fielddecoder_inf.py
        # The prompt ends with "<first variable>:" so the completion starts
        # with that variable's answer; the prefix is re-attached below.
        var_prompt = f"What are the original name and data types of variables {varstring}?\n```\n{code}\n```{first_var}:"
        print(f"Prompt:\n{repr(var_prompt)}")
        var_input_ids = _encode_prompt(var_prompt)
        raw_var_output = _beam_search(vardecoder_model, var_input_ids)
        print(f"Pre Var output: {raw_var_output}")
        var_completion = _decode_completion(raw_var_output[0], var_input_ids)
        print(f"Var output: {repr(var_completion)}")
        var_output = first_var + ":" + var_completion
    else:
        # Nothing to ask the model about; report it instead of failing on an
        # empty variable list.
        var_output = "No variables"

    field_prompt_result, fields, _ = field_prompt(code)
    if not fields:
        # A None prompt means the field helper itself failed; otherwise the
        # helper ran but found no field accesses.
        field_output = "Failed to parse fields" if field_prompt_result is None else "No fields"
    elif fielddecoder_model is None:
        # Field decoder not loaded: skip tokenization and generation entirely.
        field_output = "TEMPORARILY DISABLED"
    else:
        field_input_ids = _encode_prompt(field_prompt_result)
        field_sequence = _beam_search(fielddecoder_model, field_input_ids)[0]
        field_output = fields[0] + ":" + _decode_completion(field_sequence, field_input_ids)

    fieldstring = ", ".join(fields)
    return var_output, field_output, varstring, fieldstring
# Markdown text displayed with the interface.
_DESCRIPTION = """This is a test space of the models from the [ReSym
artifacts](https://github.com/lt-asset/resym). For more information, please see
[the
README](https://huggingface.co/spaces/ejschwartz/resym/blob/main/README.md). If
you get an error, please make sure the [ReSym field helper
space](https://huggingface.co/spaces/ejschwartz/resym-field-helper) is
running.
The field decoder model is currently **not working** due to a HuggingFace accelerate library problem. I am investigating the issue.
"""

# Single input: the decompiled code, pre-filled with the first example.
_code_input = gr.Textbox(lines=10, value=examples[0], label="Hex-Rays Decompilation")

# Output boxes, in the same order as the tuple returned by ``infer``.
_result_boxes = [
    gr.Text(label=title)
    for title in (
        "Var Decoder Output",
        "Field Decoder Output",
        "Generated Variable List",
        "Generated Field Access List",
    )
]

demo = gr.Interface(
    fn=infer,
    inputs=[_code_input],
    outputs=_result_boxes,
    # description=frontmatter.load("README.md").content,
    description=_DESCRIPTION,
    examples=examples,
)

# Start the web app.
demo.launch()