Infinity-1995 committed on
Commit
cb1ca41
·
verified ·
1 Parent(s): fb39cec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -25
app.py CHANGED
@@ -1,42 +1,103 @@
1
  import gradio as gr
2
- from transformers import pipeline
 
3
  from PIL import Image
4
  import pytesseract
5
  from pdf2image import convert_from_bytes
 
6
 
7
- # Load classifier
8
- classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
 
 
 
 
 
9
 
10
- def detect_job(text, file):
 
 
 
 
 
 
11
  extracted_text = ""
12
- if file:
13
- filename = file.name if hasattr(file, "name") else "uploaded_file"
14
- if filename.endswith(".pdf"):
15
- # file is bytes, convert PDF to images
16
- images = convert_from_bytes(file.read() if hasattr(file, "read") else file)
17
- for img in images:
18
- extracted_text += pytesseract.image_to_string(img) + "\n"
19
  else:
20
- img = Image.open(file if hasattr(file, "read") else open(file, "rb"))
 
 
 
 
 
 
 
 
21
  extracted_text = pytesseract.image_to_string(img)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- full_text = text + "\n" + extracted_text
24
- if full_text.strip() == "":
25
- return "No text provided!"
26
- result = classifier(full_text)
27
- label = "Legitimate" if result[0]['label'] == "POSITIVE" else "Suspicious / Fake"
28
- score = result[0]['score']
29
- return f"Prediction: {label} (Confidence: {score:.2f})"
 
 
 
 
30
 
31
- # Gradio UI
32
  iface = gr.Interface(
33
  fn=detect_job,
34
  inputs=[
35
- gr.Textbox(lines=10, placeholder="Paste job description here..."),
36
- gr.File(label="Upload PDF/Image", file_types=[".pdf", ".png", ".jpg", ".jpeg"], type="binary")
37
  ],
38
- outputs="text",
39
- title="Fake Job Detector"
 
 
40
  )
41
 
42
- iface.launch()
 
 
 
 
 
1
  import gradio as gr
2
+ from transformers import BertTokenizerFast, BertForSequenceClassification
3
+ import torch
4
  from PIL import Image
5
  import pytesseract
6
  from pdf2image import convert_from_bytes
7
+ import io
8
 
9
+ # -------------------------------
10
+ # 1️⃣ Load Hugging Face model
11
+ # -------------------------------
12
+ model_name = "AventIQ-AI/BERT-Spam-Job-Posting-Detection-Model"
13
+ tokenizer = BertTokenizerFast.from_pretrained(model_name)
14
+ model = BertForSequenceClassification.from_pretrained(model_name)
15
+ model.eval()
16
 
17
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
18
+ model.to(device)
19
+
20
+ # -------------------------------
21
+ # 2️⃣ Text extraction from files
22
+ # -------------------------------
23
def _load_upload(file):
    """Return ``(raw_bytes, lowercased_name)`` for an uploaded file.

    Args:
        file: Either an object exposing ``read()`` (its ``name`` attribute is
            used when present) or a filesystem path that can be opened.

    Raises:
        OSError: If a path is given and it cannot be opened or read.
    """
    if hasattr(file, "read"):
        data = file.read()
        name = getattr(file, "name", "")
    else:
        # A plain path string has no ``.name`` attribute, so the path itself
        # is what carries the extension.
        with open(file, "rb") as f:
            data = f.read()
        name = file
    return data, str(name).lower()


def _extract_text(file):
    """Extract text from an upload, raising on any failure.

    PDFs are rasterised page by page and OCR'd, PNG/JPEG images are OCR'd
    directly, and anything else is decoded as text with undecodable bytes
    dropped.
    """
    file_bytes, name = _load_upload(file)

    if name.endswith(".pdf"):
        pages = convert_from_bytes(file_bytes)
        return "".join(pytesseract.image_to_string(page) for page in pages)

    if name.endswith((".png", ".jpg", ".jpeg")):
        img = Image.open(io.BytesIO(file_bytes))
        return pytesseract.image_to_string(img)

    return file_bytes.decode(errors="ignore")


def extract_text_from_file(file):
    """Extract text from an uploaded PDF, image, or text file.

    Args:
        file: A file-like object with ``read()`` or a filesystem path.

    Returns:
        The extracted text. On failure, a message of the form
        ``"Error reading file: <reason>"`` is returned instead of raising,
        so existing callers keep working.
    """
    try:
        return _extract_text(file)
    except Exception as e:  # UI boundary: report the failure, don't crash.
        return f"Error reading file: {e}"


# -------------------------------
# 3️⃣ Detection function
# -------------------------------
def detect_job(text, file):
    """Classify a job posting as ``"Fake"`` or ``"Legitimate"``.

    Args:
        text: Job description pasted by the user (may be empty or ``None``).
        file: Optional upload whose text is placed before ``text``.

    Returns:
        ``"Fake"`` or ``"Legitimate"``; ``"No text found to classify."`` when
        there is nothing to classify; or ``"Error reading file: <reason>"``
        when the upload could not be read.
    """
    extracted_text = ""
    if file:
        try:
            extracted_text = _extract_text(file)
        except Exception as e:
            # Report the failure to the user. Previously the error message
            # itself was fed to the model and classified as a job posting.
            return f"Error reading file: {e}"
    if text:
        extracted_text += " " + text

    if not extracted_text.strip():
        return "No text found to classify."

    # Tokenize and truncate for BERT (only the first 128 tokens are used).
    inputs = tokenizer(
        extracted_text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(device)

    # Model prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=-1).item()

    # NOTE(review): class index 1 is treated as "Fake" — confirm against the
    # model's id2label mapping.
    return "Fake" if prediction == 1 else "Legitimate"
74
 
75
+ # -------------------------------
76
+ # 4️⃣ Gradio Interface
77
+ # -------------------------------
78
+ css = """
79
+ body { font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; background-color: #f7f9fc; color: #333; }
80
+ h1, h2 { color: #1a73e8; text-align: center; margin-bottom: 20px; }
81
+ input, textarea { width: 100%; padding: 12px 15px; margin: 10px 0 20px 0; border: 1px solid #ccc; border-radius: 8px; font-size: 16px; }
82
+ button { background-color: #1a73e8; color: #fff; border: none; padding: 12px 25px; font-size: 16px; border-radius: 8px; cursor: pointer; transition: 0.3s ease; }
83
+ button:hover { background-color: #155ab6; }
84
+ .output { background-color: #f1f3f5; border-left: 4px solid #1a73e8; padding: 15px 20px; border-radius: 8px; font-size: 16px; line-height: 1.5; margin-top: 20px; white-space: pre-wrap; }
85
+ """
86
 
 
87
  iface = gr.Interface(
88
  fn=detect_job,
89
  inputs=[
90
+ gr.Textbox(label="Paste Job Description Here", placeholder="Type or paste job text..."),
91
+ gr.File(label="Upload PDF/Image/Text file")
92
  ],
93
+ outputs=gr.Textbox(label="Prediction"),
94
+ title="AI Fake Job Detector",
95
+ description="Detect if a job posting is potentially fake or scam using Hugging Face AI model.",
96
+ css=css
97
  )
98
 
99
+ # -------------------------------
100
+ # 5️⃣ Launch app
101
+ # -------------------------------
102
+ if __name__ == "__main__":
103
+ iface.launch()