| import sys | |
| import teradataml as tdml | |
| from tabulate import tabulate | |
| import json | |
# Load conversion settings produced by the model-conversion step:
# the model id, the embedding vector width, and the mapping of
# precision label -> exported ONNX file name.
with open('conversion_config.json', encoding='utf-8') as json_file:
    conversion_config = json.load(json_file)

model_id = conversion_config["model_id"]
number_of_generated_embeddings = conversion_config["number_of_generated_embeddings"]
precision_to_filename_map = conversion_config["precision_to_filename_map"]

# Connection credentials are taken positionally from the command line.
# Fail early with a usage hint instead of an opaque IndexError.
if len(sys.argv) < 4:
    sys.exit(f"Usage: {sys.argv[0]} <host> <username> <password>")

host = sys.argv[1]
username = sys.argv[2]
# NOTE(review): passing the password on argv exposes it to `ps` output and
# shell history — consider an environment variable or a getpass prompt.
password = sys.argv[3]

print("Setting up connection to teradata...")
tdml.create_context(host=host, username=username, password=password)
print("Done\n\n")
print("Deploying tokenizer...")
# Drop any previous deployment so save_byom starts from a clean table.
# Narrowed from a bare `except:` (which would also swallow
# KeyboardInterrupt/SystemExit); a missing table is the only expected case.
try:
    tdml.db_drop_table('tokenizer_table')
except Exception:
    print("Can't drop tokenizers table - it's not existing")
# Store the tokenizer definition as a BYOM artifact under id 'tokenizer'.
tdml.save_byom('tokenizer',
               'tokenizer.json',
               'tokenizer_table')
print("Done\n\n")
print("Testing models...")
# Start from a clean model table; a missing table is expected on first run.
# Narrowed from a bare `except:` (which would also swallow
# KeyboardInterrupt/SystemExit).
try:
    tdml.db_drop_table('model_table')
except Exception:
    print("Can't drop models table - it's not existing")
# For each exported precision: deploy the ONNX model into the database,
# compute embeddings in-database via ONNXEmbeddings, and smoke-test
# semantic search (cosine similarity) over the resulting vectors.
for precision, file_name in precision_to_filename_map.items():
    print(f"Deploying {precision} model...")
    # The precision label doubles as the model_id inside model_table.
    tdml.save_byom(precision,
                   file_name,
                   'model_table')
    print(f"Model {precision} is deployed\n")

    print(f"Calculating embeddings with {precision} model...")
    # The volatile table lives for the whole session, so it must be dropped
    # before each iteration recreates it. Narrowed from a bare `except:`;
    # a missing table (first iteration) is the only expected failure.
    try:
        tdml.db_drop_table('emails_embeddings_store')
    except Exception:
        print("Can't drop embeddings table - it's not existing")
    # NOTE(review): this SQL is assembled by f-string interpolation. The
    # interpolated values come from the local conversion_config.json, not
    # from untrusted input, so injection risk is low — but keep it that way.
    tdml.execute_sql(f"""
    create volatile table emails_embeddings_store as (
        select
            *
        from mldb.ONNXEmbeddings(
            on emails.emails as InputTable
            on (select * from model_table where model_id = '{precision}') as ModelTable DIMENSION
            on (select model as tokenizer from tokenizer_table where model_id = 'tokenizer') as TokenizerTable DIMENSION
            using
                Accumulate('id', 'txt')
                ModelOutputTensor('sentence_embedding')
                EnableMemoryCheck('false')
                OutputFormat('FLOAT32({number_of_generated_embeddings})')
                OverwriteCachedModel('true')
        ) a
    ) with data on commit preserve rows
    """)
    print("Embeddings calculated")

    print(f"Testing semantic search with cosine similiarity on the output of the model with precision '{precision}'...")
    # Target: email id 3; references: everything else. show_query() inlines
    # each DataFrame's SELECT into the TD_VECTORDISTANCE call below.
    tdf_embeddings_store = tdml.DataFrame('emails_embeddings_store')
    tdf_embeddings_store_tgt = tdf_embeddings_store[tdf_embeddings_store.id == 3]
    tdf_embeddings_store_ref = tdf_embeddings_store[tdf_embeddings_store.id != 3]

    # TD_VECTORDISTANCE returns a distance; 1 - distance converts the
    # cosine distance back into a similarity score for the report.
    cos_sim_pd = tdml.DataFrame.from_query(f"""
    SELECT
        dt.target_id,
        dt.reference_id,
        e_tgt.txt as target_txt,
        e_ref.txt as reference_txt,
        (1.0 - dt.distance) as similiarity
    FROM
        TD_VECTORDISTANCE (
            ON ({tdf_embeddings_store_tgt.show_query()}) AS TargetTable
            ON ({tdf_embeddings_store_ref.show_query()}) AS ReferenceTable DIMENSION
            USING
                TargetIDColumn('id')
                TargetFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                RefIDColumn('id')
                RefFeatureColumns('[emb_0:emb_{number_of_generated_embeddings - 1}]')
                DistanceMeasure('cosine')
                topk(3)
        ) AS dt
    JOIN emails.emails e_tgt on e_tgt.id = dt.target_id
    JOIN emails.emails e_ref on e_ref.id = dt.reference_id;
    """).to_pandas()

    print(tabulate(cos_sim_pd, headers='keys', tablefmt='fancy_grid'))
    print("Done\n\n")

# All precisions tested — tear down the Teradata session.
tdml.remove_context()