Update utility/utils.py
utility/utils.py  CHANGED  (+46 -17)
@@ -24,6 +24,8 @@ logging.basicConfig(
 os.environ['PADDLEOCR_HOME'] = '/tmp/.paddleocr'
 
 RESULT_FOLDER = 'static/results/'
+JSON_FOLDER = 'static/json/'
+
 if not os.path.exists('/tmp/.paddleocr'):
     os.makedirs(RESULT_FOLDER, exist_ok=True)
 
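Note that the commit defines JSON_FOLDER but, at least in the hunks shown, never creates it, and the existing os.makedirs call only runs when the PaddleOCR cache directory is missing. A minimal sketch of creating both output folders unconditionally, using the constant names from this hunk (the behaviour is an assumption, not part of the commit):

import os

# Sketch only: ensure both output directories exist at startup,
# independent of whether the PaddleOCR cache is already present.
os.makedirs(RESULT_FOLDER, exist_ok=True)
os.makedirs(JSON_FOLDER, exist_ok=True)
os.makedirs('/tmp/.paddleocr', exist_ok=True)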
@@ -45,11 +47,13 @@ client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HFT)
 
 # Load image using OpenCV
 def load_image(image_path):
-
-    if
-
-
-
+    ext = os.path.splitext(image_path)[1].lower()
+    if ext in ['.png', '.jpg', '.jpeg', '.webp', '.tiff']:
+        image = cv2.imread(image_path)
+        return image
+    else:
+        raise ValueError(f"Could not load image from {image_path}. It may be corrupted, the path may be incorrect, or the format is not supported.")
+
 # Function for upscaling image using OpenCV's INTER_CUBIC
 def upscale_image(image, scale=2):
     height, width = image.shape[:2]
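cv2.imread returns None rather than raising when a file with a supported extension cannot be decoded, so the ValueError above only fires for unknown extensions. A hedged sketch of an extra guard (an assumption about the intended behaviour, not code from the commit):

import os
import cv2

def load_image(image_path):
    # Sketch only: also treat unreadable or corrupted files as errors.
    ext = os.path.splitext(image_path)[1].lower()
    if ext not in ['.png', '.jpg', '.jpeg', '.webp', '.tiff']:
        raise ValueError(f"Unsupported image format: {image_path}")
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not load image from {image_path}; the file may be corrupted or the path incorrect.")
    return image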
@@ -171,7 +175,7 @@ def extract_text_from_images(image_paths):
 # Function to call the Gemma model and process the output as Json
 def Data_Extractor(data, client=client):
     text = f'''Act as a Text extractor for the following text given in text: {data}
-
+    Extract text in the following output JSON string:
     {{
     "Name": ["Identify and Extract All the person's name from the text."],
     "Designation": ["Extract All the designation or job title mentioned in the text."],
@@ -180,17 +184,19 @@ def Data_Extractor(data, client=client):
     "Address": ["Extract All the full postal address or location mentioned in the text."],
     "Email": ["Identify and Extract All valid email addresses mentioned in the text else 'Not found'."],
     "Link": ["Identify and Extract any website URLs or social media links present in the text."]
-    }}
-
+    }}
+
+    Output:
     '''
     # Call the API for inference
-    response = client.text_generation(text, max_new_tokens=
+    response = client.text_generation(text, max_new_tokens=1000, temperature=0.4, top_k=50, top_p=0.9, repetition_penalty=1.2)
 
     print("parse in text ---:",response)
 
     # Convert the response text to JSON
     try:
         json_data = json.loads(response)
+        print("Json_data-------------->",json_data)
         return json_data
     except json.JSONDecodeError as e:
         return {"error": f"Error decoding JSON: {e}"}
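client.text_generation returns the generated text as a plain string by default, and instruct models often wrap the requested JSON in extra prose, in which case json.loads on the raw response raises JSONDecodeError. A hedged sketch of trimming the response before parsing (parse_llm_json is a hypothetical helper, not part of the commit):

import json
import re

def parse_llm_json(response: str):
    # Sketch only: grab everything from the first '{' to the last '}'
    # in the completion before handing it to json.loads.
    match = re.search(r"\{.*\}", response, re.DOTALL)
    if not match:
        return {"error": "No JSON object found in model output"}
    try:
        return json.loads(match.group(0))
    except json.JSONDecodeError as e:
        return {"error": f"Error decoding JSON: {e}"}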
@@ -228,8 +234,22 @@ def extract_contact_details(text):
     \+91\s\d{5}-\d{5} |          # India Intl +91 XXXXX-XXXXX
     \+91\s\d{4}-\d{6} |          # India Intl +91 XXXX-XXXXXX
     \+91\s\d{10} |               # India Intl +91 XXXXXXXXXX
+    \+91\s\d{3}\s\d{3}\s\d{4} |  # India Intl +91 XXX XXX XXXX
+    \+91\s\d{3}-\d{3}-\d{4} |    # India Intl +91 XXX-XXX-XXXX
+    \+91\s\d{2}\s\d{4}\s\d{4} |  # India Intl +91 XX XXXX XXXX
+    \+91\s\d{2}-\d{4}-\d{4} |    # India Intl +91 XX-XXXX-XXXX
+    \+91\s\d{5}\s\d{5} |         # India Intl +91 XXXXX XXXXX
+    \d{5}\s\d{5} |               # India XXXXX XXXXX
+    \d{5}-\d{5} |                # India XXXXX-XXXXX
     0\d{2}-\d{7} |               # India STD 0XX-XXXXXXX
     \+91\d{10} |                 # +91 XXXXXXXXXX
+    \d{10} |                     # XXXXXXXXXX # Here is the regex to handle all possible combination of the contact
+    \d{6}-\d{4} |                # XXXXXX-XXXX
+    \d{4}-\d{6} |                # XXXX-XXXXXX
+    \d{3}\s\d{3}\s\d{4} |        # XXX XXX XXXX
+    \d{3}-\d{3}-\d{4} |          # XXX-XXX-XXXX
+    \d{4}\s\d{3}\s\d{3} |        # XXXX XXX XXX
+    \d{4}-\d{3}-\d{3} |          # XXXX-XXX-XXX #-----
     \+49\s\d{4}\s\d{8} |         # Germany Intl +49 XXXX XXXXXXXX
     \+49\s\d{3}\s\d{7} |         # Germany Intl +49 XXX XXXXXXX
     0\d{3}\s\d{8} |              # Germany STD 0XXX XXXXXXXX
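These alternatives are fragments of a verbose regex: the inline whitespace and # comments are only ignored when the enclosing pattern is compiled with re.VERBOSE, and the bare \d{10} fallback sits after the more specific formats because Python's re module takes the first alternative that matches at a given position. A minimal standalone sketch of that usage (phone_pattern and the three-branch excerpt are illustrative, not the commit's full pattern):

import re

# Sketch only: excerpt of the alternation, compiled the way the function implies.
phone_pattern = re.compile(r"""
    \+91\s\d{5}\s\d{5} |   # India Intl +91 XXXXX XXXXX
    \d{5}\s\d{5}       |   # XXXXX XXXXX
    \d{10}                 # XXXXXXXXXX (generic fallback, kept last)
""", re.VERBOSE)

phone_numbers = phone_pattern.findall("Call +91 98765 43210 or 9876543210")
# -> ['+91 98765 43210', '9876543210']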
@@ -385,16 +405,25 @@ def process_resume_data(LLMdata,cont_data,extracted_text):
 
     # Initialize the processed data dictionary
     processed_data = {
-        "name": [
-        "contact_number": [
-        "Designation":[
-        "email": [
-        "Location": [
-        "Link": [
-        "Company":[
+        "name": [],
+        "contact_number": [],
+        "Designation":[],
+        "email": [],
+        "Location": [],
+        "Link": [],
+        "Company":[],
         "extracted_text": extracted_text
     }
+    #LLM
+    processed_data['name'].extend(LLMdata.get('Name', []))
+    processed_data['contact_number'].extend(LLMdata.get('Contact', []))
+    processed_data['Designation'].extend(LLMdata.get('Designation', []))
+    processed_data['email'].extend(LLMdata.get("Email", []))
+    processed_data['Location'].extend(LLMdata.get('Address', []))
+    processed_data['Link'].extend(LLMdata.get('Link', []))
+    processed_data['Company'].extend(LLMdata.get('Company', []))
+    #Contact
     processed_data['email'].extend(cont_data.get("emails", []))
     processed_data['contact_number'].extend(cont_data.get("phone_numbers", []))
     processed_data['Link'].extend(cont_data.get("links_RE", []))
-    return processed_data
+    return processed_data
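For orientation, a hedged sketch of how the functions touched by this commit appear to fit together; the wiring is inferred from the hunk headers and function signatures above, not code from the repository, and the input path is hypothetical:

# Sketch only: assumed flow from OCR text to the merged dictionary.
image_paths = ["static/results/card.png"]             # hypothetical input image
extracted_text = extract_text_from_images(image_paths)

LLMdata = Data_Extractor(extracted_text)               # fields extracted via the LLM prompt
cont_data = extract_contact_details(extracted_text)    # emails/phone numbers/links via regex

processed = process_resume_data(LLMdata, cont_data, extracted_text)
print(processed["contact_number"])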