Spaces:
Running
on
Zero
Running
on
Zero
Sean-Case
committed on
Commit
·
d80c8f5
1
Parent(s):
e1c1f68
Minor cleaning, csv formatting changes
Browse files
- funcs/clean_funcs.py +2 -16
- funcs/topic_core_funcs.py +16 -11
funcs/clean_funcs.py
CHANGED
|
@@ -8,32 +8,18 @@ custom_words = []
|
|
| 8 |
my_stop_words = custom_words
|
| 9 |
|
| 10 |
# #### Some of my cleaning functions
|
| 11 |
-
email_start_pattern_regex = r'.*(?i)importance:|.*(?i)subject:'
|
| 12 |
-
email_end_pattern_regex = r'(?i)kind regards.*|(?i)many thanks.*|(?i)sincerely.*'
|
| 13 |
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
|
|
|
| 14 |
email_pattern_regex = r'\S*@\S*\s?'
|
| 15 |
num_pattern_regex = r'[0-9]+'
|
| 16 |
nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
|
| 17 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
| 18 |
-
warning_pattern_regex = r'(?i)caution: this email originated from outside of the organization. do not click links or open attachments unless you recognize the sender and know the content is safe.'
|
| 19 |
-
egress_pattern_regex = r'(?i)has been securely delivered by egress switch and was securely decoded on'
|
| 20 |
-
nbsp_pattern_regex = r' '
|
| 21 |
multiple_spaces_regex = r'\s{2,}'
|
| 22 |
|
| 23 |
-
# Pre-compiling the regular expressions for efficiency (not actually used)
|
| 24 |
-
# email_start_pattern = re.compile(email_start_pattern_regex)
|
| 25 |
-
# email_end_pattern = re.compile(email_end_pattern_regex)
|
| 26 |
-
# html_pattern = re.compile(html_pattern_regex)
|
| 27 |
-
# email_pattern = re.compile(email_end_pattern_regex)
|
| 28 |
-
# num_pattern = re.compile(num_pattern_regex)
|
| 29 |
-
# nums_two_more_regex_pattern = re.compile(nums_two_more_regex)
|
| 30 |
-
# postcode_pattern = re.compile(postcode_pattern_regex)
|
| 31 |
-
# warning_pattern = re.compile(warning_pattern_regex)
|
| 32 |
-
# nbsp_pattern = re.compile(nbsp_pattern_regex)
|
| 33 |
-
|
| 34 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
| 35 |
texts = pl.Series(texts).str.strip_chars()
|
| 36 |
text = texts.str.replace_all(html_pattern_regex, ' ')
|
|
|
|
| 37 |
text = text.str.replace_all(email_pattern_regex, ' ')
|
| 38 |
text = text.str.replace_all(nums_two_more_regex, ' ')
|
| 39 |
text = text.str.replace_all(postcode_pattern_regex, ' ')
|
|
|
|
| 8 |
my_stop_words = custom_words
|
| 9 |
|
| 10 |
# #### Some of my cleaning functions
|
|
|
|
|
|
|
| 11 |
html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0| '
|
| 12 |
+
html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
|
| 13 |
email_pattern_regex = r'\S*@\S*\s?'
|
| 14 |
num_pattern_regex = r'[0-9]+'
|
| 15 |
nums_two_more_regex = r'\b[0-9]{2,}\b|\b[0-9]+\s[0-9]+\b'
|
| 16 |
postcode_pattern_regex = r'(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9][A-Z]{2})|((GIR ?0A{2})\b$)|(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]? ?[0-9]{1}?)$)|(\b(?:[A-Z][A-HJ-Y]?[0-9][0-9A-Z]?)\b$)'
|
|
|
|
|
|
|
|
|
|
| 17 |
multiple_spaces_regex = r'\s{2,}'
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
def initial_clean(texts, custom_regex, progress=gr.Progress()):
|
| 20 |
texts = pl.Series(texts).str.strip_chars()
|
| 21 |
text = texts.str.replace_all(html_pattern_regex, ' ')
|
| 22 |
+
text = text.str.replace_all(html_start_pattern_end_dots_regex, ' ')
|
| 23 |
text = text.str.replace_all(email_pattern_regex, ' ')
|
| 24 |
text = text.str.replace_all(nums_two_more_regex, ' ')
|
| 25 |
text = text.str.replace_all(postcode_pattern_regex, ' ')
|
funcs/topic_core_funcs.py
CHANGED
|
@@ -494,19 +494,24 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
|
|
| 494 |
|
| 495 |
hierarchical_topics = hierarchical_topics_custom(topic_model, docs)
|
| 496 |
|
| 497 |
-
# Print topic tree
|
| 498 |
-
|
| 499 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
|
|
|
| 504 |
|
| 505 |
-
output_list.append(tree_name)
|
| 506 |
|
| 507 |
# Save new hierarchical topic model to file
|
| 508 |
-
hierarchical_topics_name = data_file_name_no_ext + '_' + '
|
| 509 |
-
hierarchical_topics.to_csv(hierarchical_topics_name)
|
| 510 |
output_list.append(hierarchical_topics_name)
|
| 511 |
|
| 512 |
|
|
@@ -516,12 +521,12 @@ def visualise_topics(topic_model, data, data_file_name_no_ext, low_resource_mode
|
|
| 516 |
|
| 517 |
# Write hierarchical topics levels to df
|
| 518 |
hierarchy_df_name = data_file_name_no_ext + '_' + 'hierarchy_topics_df_' + today_rev + '.csv'
|
| 519 |
-
hierarchy_df.to_csv(hierarchy_df_name)
|
| 520 |
output_list.append(hierarchy_df_name)
|
| 521 |
|
| 522 |
# Write hierarchical topics names to df
|
| 523 |
hierarchy_topic_names_name = data_file_name_no_ext + '_' + 'hierarchy_topics_names_' + today_rev + '.csv'
|
| 524 |
-
hierarchy_topic_names.to_csv(hierarchy_topic_names_name)
|
| 525 |
output_list.append(hierarchy_topic_names_name)
|
| 526 |
|
| 527 |
#except:
|
|
|
|
| 494 |
|
| 495 |
hierarchical_topics = hierarchical_topics_custom(topic_model, docs)
|
| 496 |
|
| 497 |
+
# Print topic tree - may get encoding errors, so doing try except
|
| 498 |
+
try:
|
| 499 |
+
tree = topic_model.get_topic_tree(hierarchical_topics, tight_layout = True)
|
| 500 |
+
tree_name = data_file_name_no_ext + '_' + 'vis_hierarchy_tree_' + today_rev + '.txt'
|
| 501 |
+
|
| 502 |
+
with open(tree_name, "w") as file:
|
| 503 |
+
# Write the string to the file
|
| 504 |
+
file.write(tree)
|
| 505 |
|
| 506 |
+
output_list.append(tree_name)
|
| 507 |
+
|
| 508 |
+
except Exception as error:
|
| 509 |
+
print("An exception occurred when making topic tree document, skipped:", error)
|
| 510 |
|
|
|
|
| 511 |
|
| 512 |
# Save new hierarchical topic model to file
|
| 513 |
+
hierarchical_topics_name = data_file_name_no_ext + '_' + 'vis_hierarchy_topics_dist_' + today_rev + '.csv'
|
| 514 |
+
hierarchical_topics.to_csv(hierarchical_topics_name, index = None)
|
| 515 |
output_list.append(hierarchical_topics_name)
|
| 516 |
|
| 517 |
|
|
|
|
| 521 |
|
| 522 |
# Write hierarchical topics levels to df
|
| 523 |
hierarchy_df_name = data_file_name_no_ext + '_' + 'hierarchy_topics_df_' + today_rev + '.csv'
|
| 524 |
+
hierarchy_df.to_csv(hierarchy_df_name, index = None)
|
| 525 |
output_list.append(hierarchy_df_name)
|
| 526 |
|
| 527 |
# Write hierarchical topics names to df
|
| 528 |
hierarchy_topic_names_name = data_file_name_no_ext + '_' + 'hierarchy_topics_names_' + today_rev + '.csv'
|
| 529 |
+
hierarchy_topic_names.to_csv(hierarchy_topic_names_name, index = None)
|
| 530 |
output_list.append(hierarchy_topic_names_name)
|
| 531 |
|
| 532 |
#except:
|