Spaces:
Sleeping
Sleeping
File size: 3,619 Bytes
1849ee0 cb1ca41 a57db7e cb1ca41 ff9adfe cb1ca41 a57db7e cb1ca41 1849ee0 cb1ca41 a57db7e cb1ca41 1849ee0 cb1ca41 fb39cec cb1ca41 1849ee0 d0f094c cb1ca41 d0f094c cb1ca41 1849ee0 cb1ca41 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import gradio as gr
from transformers import BertTokenizerFast, BertForSequenceClassification
import torch
from PIL import Image
import pytesseract
from pdf2image import convert_from_bytes
import io
# -------------------------------
# 1️⃣ Load Hugging Face model
# -------------------------------
# Fine-tuned BERT binary classifier (per model card name: spam/fake job-posting
# detection). Loaded once at import time so the Gradio handler reuses it.
model_name = "AventIQ-AI/BERT-Spam-Job-Posting-Detection-Model"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
model.eval()  # inference mode: disables dropout
# Prefer GPU when available; `device` is also used by detect_job() below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# -------------------------------
# 2️⃣ Text extraction from files
# -------------------------------
def extract_text_from_file(file):
    """Extract text from an uploaded PDF, image, or plain-text file.

    Accepts either a file-like object (anything with ``.read()``) or a
    filesystem path. PDFs are rasterized with pdf2image and OCR'd page by
    page; PNG/JPEG images are OCR'd directly with pytesseract; anything
    else is decoded as text (undecodable bytes are ignored).

    Returns the extracted text, or an ``"Error reading file: ..."`` string
    on failure (callers treat the return value as plain text either way).
    """
    extracted_text = ""
    try:
        if hasattr(file, "read"):
            file_bytes = file.read()
            # File-like objects (e.g. Gradio uploads) carry the name attribute.
            filename = str(getattr(file, "name", ""))
        else:
            # BUG FIX: the original unconditionally used `file.name`, which
            # raises AttributeError for plain path strings — the filename must
            # come from the path itself in this branch.
            filename = str(file)
            with open(file, "rb") as f:
                file_bytes = f.read()

        lower_name = filename.lower()
        if lower_name.endswith(".pdf"):
            # OCR every rendered page and concatenate the results.
            pages = convert_from_bytes(file_bytes)
            for page in pages:
                extracted_text += pytesseract.image_to_string(page)
        elif lower_name.endswith((".png", ".jpg", ".jpeg")):
            img = Image.open(io.BytesIO(file_bytes))
            extracted_text = pytesseract.image_to_string(img)
        else:
            # Fallback: treat unknown formats as plain text.
            extracted_text = file_bytes.decode(errors="ignore")
    except Exception as e:
        # Best-effort contract: surface the error as a string, never raise.
        return f"Error reading file: {e}"
    return extracted_text
# -------------------------------
# 3️⃣ Detection function
# -------------------------------
def detect_job(text, file):
    """Classify a job posting as "Fake" or "Legitimate".

    Combines text extracted from an optional uploaded file with any pasted
    text, runs the BERT classifier, and maps class index 1 -> "Fake",
    0 -> "Legitimate". Returns a notice string when no text is available.
    """
    combined = ""
    if file:
        combined = extract_text_from_file(file)
    if text:
        combined += " " + text
    if not combined.strip():
        return "No text found to classify."

    # BERT accepts at most 512 tokens; 128 keeps inference fast.
    encoded = tokenizer(
        combined,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(device)

    # Inference only — no gradient bookkeeping needed.
    with torch.no_grad():
        logits = model(**encoded).logits
    predicted_class = torch.argmax(logits, dim=-1).item()

    return "Fake" if predicted_class == 1 else "Legitimate"
# -------------------------------
# 4️⃣ Gradio Interface
# -------------------------------
# -------------------------------
# 4️⃣ Gradio Interface
# -------------------------------
# Custom CSS injected into the Gradio page (theming only; no behavior).
css = """
body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background-color: #f7f9fc; color: #333; }
h1, h2 { color: #1a73e8; text-align: center; margin-bottom: 20px; }
input, textarea { width: 100%; padding: 12px 15px; margin: 10px 0 20px 0; border: 1px solid #ccc; border-radius: 8px; font-size: 16px; }
button { background-color: #1a73e8; color: #fff; border: none; padding: 12px 25px; font-size: 16px; border-radius: 8px; cursor: pointer; transition: 0.3s ease; }
button:hover { background-color: #155ab6; }
.output { background-color: #f1f3f5; border-left: 4px solid #1a73e8; padding: 15px 20px; border-radius: 8px; font-size: 16px; line-height: 1.5; margin-top: 20px; white-space: pre-wrap; }
"""

# Two inputs (pasted text and/or an uploaded file) map positionally onto
# detect_job(text, file); the single Textbox output shows the verdict string.
iface = gr.Interface(
    fn=detect_job,
    inputs=[
        gr.Textbox(label="Paste Job Description Here", placeholder="Type or paste job text..."),
        gr.File(label="Upload PDF/Image/Text file")
    ],
    outputs=gr.Textbox(label="Prediction"),
    title="AI Fake Job Detector",
    description="Detect if a job posting is potentially fake or scam using Hugging Face AI model.",
    css=css
)

# -------------------------------
# 5️⃣ Launch app
# -------------------------------
# Guard lets the module be imported (e.g. by tests) without starting a server.
if __name__ == "__main__":
    iface.launch()
|