CRUD-SQL2Text / app.py
alfakat's picture
cleaned up the GUI
cc98bea verified
import os
import re
import gradio as gr
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
CRUD_VERB = {
"INSERT": "load",
"UPDATE": "changed",
"DELETE": "deleted",
"SELECT": "specified",
}
# ---------- helpers ----------
def plural(n: int, singular: str = "record", plural_word: str = "records") -> str:
return f"{n} {singular if n == 1 else plural_word}"
def detect_command(sql_text: str) -> str:
m = re.search(r"\b(INSERT|UPDATE|DELETE|SELECT)\b", sql_text, flags=re.IGNORECASE)
return m.group(1).upper() if m else "OTHER"
def parse_table_name(sql: str) -> str:
for pat in [
r"INSERT\s+INTO\s+([A-Za-z0-9\.\[\]_]+)",
r"UPDATE\s+([A-Za-z0-9\.\[\]_]+)",
r"DELETE\s+FROM\s+([A-Za-z0-9\.\[\]_]+)",
r"\bFROM\s+([A-Za-z0-9\.\[\]_]+)",
]:
m = re.search(pat, sql, flags=re.IGNORECASE)
if m:
return m.group(1)
return ""
def clean_statement(text: str) -> str:
text = re.sub(r"^(What|Which|How|Give|Provide)[^:]*:\s*", "", text, flags=re.IGNORECASE).strip()
if text and text[-1] not in ".!?":
text += "."
return text[0].upper() + text[1:] if text else text
def infer_in_list_count(where_clause: str) -> None:
if not where_clause:
return None
m = re.search(r"\bIN\s*\(\s*([^)]+?)\s*\)", where_clause, flags=re.IGNORECASE | re.DOTALL)
if not m:
return None
items = [x.strip() for x in m.group(1).split(",") if x.strip()]
return len(items) if items else None
# ---------- deterministic CRUD summaries ----------
def summarize_insert(sql: str) -> str:
table = parse_table_name(sql)
cols_match = re.search(r"\(\s*([^)]+?)\s*\)\s*VALUES", sql, flags=re.IGNORECASE | re.DOTALL)
cols = []
if cols_match:
cols = [c.strip().strip("[]") for c in cols_match.group(1).split(",")]
try:
uid_idx = cols.index("user_id")
except ValueError:
uid_idx = None
tuples = re.findall(r"VALUES\s*\(\s*([^)]+?)\s*\)", sql, flags=re.IGNORECASE | re.DOTALL)
if not tuples:
tuples = re.findall(r"\(\s*([^)]+?)\s*\)", sql, flags=re.IGNORECASE)
user_ids = []
for tup in tuples:
parts = [p.strip() for p in tup.split(",")]
if uid_idx is not None and uid_idx < len(parts):
uid_raw = parts[uid_idx].strip().strip("'").strip('"')
if re.fullmatch(r"-?\d+", uid_raw):
user_ids.append(int(uid_raw))
count = len(tuples)
verb = CRUD_VERB["INSERT"]
if user_ids:
groups = {}
for u in user_ids:
groups[u] = groups.get(u, 0) + 1
if len(groups) == 1:
uid = next(iter(groups))
return f"{count} record(s) {verb} into table {table} (column user_id {uid})."
else:
parts = [f"{n} with column user_id {uid}" for uid, n in sorted(groups.items())]
return f"{count} record(s) {verb} into table {table} ({', '.join(parts)})."
else:
return f"{count} record(s) {verb} into table {table}."
def summarize_update(sql: str) -> str:
table = parse_table_name(sql)
set_match = re.search(r"\bSET\b\s+(.+?)(\bWHERE\b|;|$)", sql, flags=re.IGNORECASE | re.DOTALL)
changed_cols = []
if set_match:
assigns = [a.strip() for a in set_match.group(1).split(",")]
for a in assigns:
col = a.split("=")[0].strip().strip("[]")
if col:
changed_cols.append(col)
where = ""
w = re.search(r"\bWHERE\b\s+(.+?)(;|$)", sql, flags=re.IGNORECASE | re.DOTALL)
if w:
where = re.sub(r"\s+", " ", w.group(1)).strip()
verb = CRUD_VERB["UPDATE"]
base = f"Record(s) {verb} in table {table}"
if changed_cols:
cols_txt = ", ".join(changed_cols)
base += f" (changed: column(s) {cols_txt})"
if where:
base += f" with {where}"
return base + "."
def summarize_delete(sql: str) -> str:
table = parse_table_name(sql)
where = ""
w = re.search(r"\bWHERE\b\s+(.+?)(;|$)", sql, flags=re.IGNORECASE | re.DOTALL)
if w:
where = re.sub(r"\s+", " ", w.group(1)).strip()
n_targets = infer_in_list_count(where)
verb = CRUD_VERB["DELETE"]
base = (f"{plural(n_targets, 'record', 'records')} {verb} from {table}"
if n_targets else f"Records {verb} from {table}")
if where:
base += f" that match {where}"
return base + "."
def summarize_select(sql: str) -> str:
table = parse_table_name(sql)
cols = "data"
cm = re.search(r"\bSELECT\b\s+(.+?)\bFROM\b", sql, flags=re.IGNORECASE | re.DOTALL)
if cm:
cols_raw = cm.group(1).strip()
cols = "all columns" if cols_raw == "*" else re.sub(r"\s+", " ", cols_raw)
where = ""
w = re.search(r"\bWHERE\b\s+(.+?)(;|$)", sql, flags=re.IGNORECASE | re.DOTALL)
if w:
where = re.sub(r"\s+", " ", w.group(1)).strip()
verb = CRUD_VERB["SELECT"]
base = f"{cols} will be {verb} from {table}"
if where:
base += f" that match {where}"
return base + "."
def deterministic_summary(sql_text: str) -> str:
cmd = detect_command(sql_text)
if cmd == "INSERT":
return summarize_insert(sql_text)
if cmd == "UPDATE":
return summarize_update(sql_text)
if cmd == "DELETE":
return summarize_delete(sql_text)
if cmd == "SELECT":
return summarize_select(sql_text)
return "Unrecognized SQL command."
# ---------- optional T5 rephrase ----------
_HAS_T5 = False
try:
from transformers import T5Tokenizer, T5ForConditionalGeneration
_HAS_T5 = True
except Exception:
_HAS_T5 = False
_T5_TOKENIZER = None
_T5_MODEL = None
CRUD_PROMPT = {
"INSERT": "Rewrite as a clear statement that new records will be added. Keep numbers the same.",
"UPDATE": "Rewrite as a clear statement that existing records will be updated. Keep names and conditions.",
"DELETE": "Rewrite as a clear statement that records will be deleted. Keep conditions if present.",
"SELECT": "Rewrite as a clear statement that data will be retrieved. Keep table/filters.",
"OTHER": "Rewrite as a short, clear statement for non-technical users.",
}
def load_t5():
global _T5_TOKENIZER, _T5_MODEL
if _T5_TOKENIZER is None or _T5_MODEL is None:
_T5_TOKENIZER = T5Tokenizer.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL-sql-to-en")
_T5_MODEL = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-base-finetuned-wikiSQL-sql-to-en")
def rephrase_with_t5(summary: str, cmd: str) -> str:
if not _HAS_T5:
return summary
load_t5()
instruction = CRUD_PROMPT.get(cmd, CRUD_PROMPT["OTHER"])
input_text = f"explain sql in plain english statement: {instruction} {summary}"
feats = _T5_TOKENIZER([input_text], return_tensors="pt")
out = _T5_MODEL.generate(
input_ids=feats["input_ids"],
attention_mask=feats["attention_mask"],
max_new_tokens=64,
do_sample=False,
)
decoded = _T5_TOKENIZER.decode(out[0], skip_special_tokens=True)
return clean_statement(decoded)
def _bad_rephrase(text: str) -> bool:
if text is None:
return True
t = str(text).strip()
if len(t) >= 2 and ((t[0] == t[-1] == '"') or (t[0] == t[-1] == "'")):
t = t[1:-1].strip()
t_norm = re.sub(r"[\s\.\!\?]+$", "", t).strip().lower()
# added "true" here to force fallback for True./False. answers
if t_norm in ("false", "true", "none", "null", "n/a", "na", ""):
return True
if re.match(r"^(what|which|how)\b", t_norm, re.IGNORECASE):
return True
return len(t_norm) < 3
def explain(sql_text: str):
sql_text = (sql_text or "").strip()
cmd = detect_command(sql_text)
deterministic = deterministic_summary(sql_text)
final = None
if _HAS_T5 and cmd != "OTHER":
try:
final = rephrase_with_t5(deterministic, cmd)
except Exception as e:
print(f"T5 rephrase failed: {e}")
final = None
if _bad_rephrase(final):
final = deterministic
return final
# ---------- UI ----------
EXAMPLES = [
# INSERT
"""INSERT INTO demo_database..user_records (record_id, person_id, created_at)
VALUES (101, 5, GETDATE()), (102, 5, GETDATE()), (103, 5, GETDATE());""",
# UPDATE
"""UPDATE users
SET status = 'active', last_login = GETDATE()
WHERE user_id IN (101, 102, 103);""",
# DELETE
"""DELETE FROM orders
WHERE order_date < '2024-01-01' AND status = 'cancelled';""",
# SELECT
"""SELECT user_id, email, created_at
FROM accounts
WHERE email LIKE '%@example.com' AND created_at >= '2025-01-01';""",
]
with gr.Blocks(theme=gr.themes.Glass()) as demo:
gr.Markdown("## πŸ“ CRUD-SQL2Text")
sql_in = gr.Textbox(label="Enter SQL Query", lines=8, placeholder="Paste your SQL statement here...")
final_out = gr.Textbox(label="Natural Language Output", lines=3)
btn = gr.Button("Explain SQL")
btn.click(explain, inputs=[sql_in], outputs=[final_out])
gr.Examples(examples=EXAMPLES, inputs=[sql_in], outputs=[final_out], fn=explain, cache_examples=False)
demo.launch(share=True)