Commit 036a85e
Parent(s): ebacf16

update final code

text2sql.py (+24 -28)

text2sql.py CHANGED
@@ -106,68 +106,64 @@ class ChatBot():
     def __init__(self) -> None:
         os.environ["CUDA_VISIBLE_DEVICES"] = "0"
         model_name = "seeklhy/codes-1b"
-
-        # Load tokenizer and model
         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
-
-        # Set the device for the model (this ensures it's on either GPU or CPU)
-        self.device = self.model.device # This will get the device the model is loaded on (either CUDA or CPU)
-
-        # Define other parameters
+        self.model = AutoModelForCausalLM.from_pretrained(model_name, device_map = "auto", torch_dtype = torch.float16)
         self.max_length = 4096
         self.max_new_tokens = 256
         self.max_prefix_length = self.max_length - self.max_new_tokens
 
-        #
+        # Directly loading the model from Hugging Face
         self.sic = SchemaItemClassifierInference("Roxanne-WANG/LangSQL")
-
-        # Initialize searcher for DB content (Whoosh index)
         self.db_id2content_searcher = dict()
         for db_id in os.listdir("db_contents_index"):
             index_dir = os.path.join("db_contents_index", db_id)
+
+            # Open existing Whoosh index directory
             if index.exists_in(index_dir):
                 ix = index.open_dir(index_dir)
-
+                # keep a searcher around for querying
+                self.db_id2content_searcher[db_id] = ix.searcher()
+            else:
+                raise ValueError(f"No Whoosh index found for '{db_id}' at '{index_dir}'")
+
+        self.db_ids = sorted(os.listdir("databases"))
+        self.db_id2schema = get_db_id2schema("databases", "data/tables.json")
+        self.db_id2ddl = get_db_id2ddl("databases")
 
     def get_response(self, question, db_id):
-        # Prepare the data for schema filtering
         data = {
             "text": question,
             "schema": copy.deepcopy(self.db_id2schema[db_id]),
             "matched_contents": get_matched_contents(question, self.db_id2content_searcher[db_id])
         }
-
-        # Filter schema based on predictions
         data = filter_schema(data, self.sic, 6, 10)
         data["schema_sequence"] = get_db_schema_sequence(data["schema"])
         data["content_sequence"] = get_matched_content_sequence(data["matched_contents"])
-
-        # Prepare input sequence for the model
+
         prefix_seq = data["schema_sequence"] + "\n" + data["content_sequence"] + "\n" + data["text"] + "\n"
+        print(prefix_seq)
 
-        input_ids = [self.tokenizer.bos_token_id] + self.tokenizer(prefix_seq, truncation=False)["input_ids"]
+        input_ids = [self.tokenizer.bos_token_id] + self.tokenizer(prefix_seq , truncation = False)["input_ids"]
         if len(input_ids) > self.max_prefix_length:
+            print("the current input sequence exceeds the max_tokens, we will truncate it.")
             input_ids = [self.tokenizer.bos_token_id] + input_ids[-(self.max_prefix_length-1):]
         attention_mask = [1] * len(input_ids)
 
-        # Move input tensors to the same device as the model
         inputs = {
-            "input_ids": torch.tensor([input_ids], dtype=torch.int64).to(self.device),
-            "attention_mask": torch.tensor([attention_mask], dtype=torch.int64).to(self.device)
+            "input_ids": torch.tensor([input_ids], dtype = torch.int64).to(self.model.device),
+            "attention_mask": torch.tensor([attention_mask], dtype = torch.int64).to(self.model.device)
         }
-
-
+        input_length = inputs["input_ids"].shape[1]
+
        with torch.no_grad():
             generate_ids = self.model.generate(
                 **inputs,
-                max_new_tokens=self.max_new_tokens,
-                num_beams=4,
-                num_return_sequences=4
+                max_new_tokens = self.max_new_tokens,
+                num_beams = 4,
+                num_return_sequences = 4
             )
 
-
-        generated_sqls = self.tokenizer.batch_decode(generate_ids[:, len(input_ids):], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        generated_sqls = self.tokenizer.batch_decode(generate_ids[:, input_length:], skip_special_tokens = True, clean_up_tokenization_spaces = False)
         final_generated_sql = None
         for generated_sql in generated_sqls:
             execution_error = check_sql_executability(generated_sql, os.path.join("databases", db_id, db_id + ".sqlite"))