Spaces:
Runtime error
Runtime error
File size: 5,950 Bytes
25d7670 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import re
from dateutil.parser import parse
from num2words import num2words
import inflect
from ftfy import fix_text
# Shared inflect engine, created once at import time.
# NOTE(review): no function in the visible part of this file references
# `inflect_engine` -- confirm it is still needed before relying on it.
inflect_engine = inflect.engine()
# Spoken-form spelling for each uppercase ASCII letter. Every value is padded
# with a space on both sides so that spellings joined back-to-back stay
# separated as distinct words.
alphabet_map = {
    "A": " Eh ",
    "B": " Bee ",
    "C": " See ",
    "D": " Dee ",
    "E": " Eee ",
    "F": " Eff ",
    "G": " Jee ",
    "H": " Aitch ",
    "I": " Eye ",
    "J": " Jay ",
    "K": " Kay ",
    "L": " El ",
    "M": " Emm ",
    "N": " Enn ",
    "O": " Ohh ",
    "P": " Pee ",
    "Q": " Queue ",
    "R": " Are ",
    "S": " Ess ",
    "T": " Tee ",
    "U": " You ",
    "V": " Vee ",
    "W": " Double You ",
    "X": " Ex ",
    "Y": " Why ",
    "Z": " Zed ",
}
# Function to add ordinal suffix to a number
def add_ordinal_suffix(day):
    """Return *day* followed by its English ordinal suffix (e.g. 13 -> "13th").

    Args:
        day: Integer to decorate (typically a day of the month).

    Returns:
        The number rendered as a string with "st", "nd", "rd" or "th" appended.
    """
    # Numbers whose last two digits are 11, 12 or 13 always take "th"
    # (11th, 112th, ...), so test the last two digits rather than the raw value.
    if 11 <= day % 100 <= 13:
        return f"{day}th"
    suffix = {1: "st", 2: "nd", 3: "rd"}.get(day % 10, "th")
    return f"{day}{suffix}"
# Function to format dates in a human-readable form
def format_date(parsed_date, include_time=True):
    """Format a parsed datetime as a TTS-friendly string.

    Args:
        parsed_date: Object exposing ``day``, ``hour``, ``minute`` and
            ``strftime``. A falsy value short-circuits to ``None``.
        include_time: When true, append the clock time, but only if the value
            is not exactly midnight (hour and minute both zero), which is
            treated as "no time was given".

    Returns:
        ``"<Month> <day-with-ordinal>, <year>"``, optionally followed by
        ``" at <h>:<mm> <AM|PM>"``; ``None`` when *parsed_date* is falsy.
    """
    if not parsed_date:
        return None

    # Convert the day into an ordinal (e.g., 13 -> 13th)
    day = add_ordinal_suffix(parsed_date.day)
    date_text = parsed_date.strftime(f"%B {day}, %Y")

    # Either component being non-zero means a real time is present; requiring
    # both would drop times such as 10:00 or 00:30.
    has_clock_time = parsed_date.hour != 0 or parsed_date.minute != 0
    if include_time and has_clock_time:
        # Compute the unpadded 12-hour value directly: the "%-I" strftime
        # directive is a platform extension and is not available everywhere.
        hour_12 = parsed_date.hour % 12 or 12
        return f"{date_text} at {hour_12}{parsed_date.strftime(':%M %p')}"
    return date_text
# Normalize dates in the text
def normalize_dates(text):
    """
    Find date strings in *text* and replace each with a TTS-friendly version.

    Recognized shapes are ``YYYY-MM-DD`` (optionally followed by a space or
    ``T`` and ``HH:MM:SS``), ``NN/NN/YYYY`` and ``D[D] <word> YYYY``. A
    candidate that the date parser rejects is left in the text unchanged.

    Args:
        text: Input string.

    Returns:
        *text* with every successfully parsed date replaced by the output of
        ``format_date``.
    """
    def replace_date(match):
        raw_date = match.group(0)
        try:
            parsed_date = parse(raw_date)
        except (ValueError, OverflowError):
            # The parser signals unparseable input with ValueError and
            # out-of-range numeric fields with OverflowError; in both cases
            # keep the original text instead of aborting the substitution.
            return raw_date
        # Include time only if the match could carry one (ISO "T" or a space).
        include_time = "T" in raw_date or " " in raw_date
        return format_date(parsed_date, include_time) or raw_date

    # Match common date formats
    date_pattern = r"\b(\d{4}-\d{2}-\d{2}(?:[ T]\d{2}:\d{2}:\d{2})?|\d{2}/\d{2}/\d{4}|\d{1,2} \w+ \d{4})\b"
    return re.sub(date_pattern, replace_date, text)
# Replace invalid characters and clean text
def replace_invalid_chars(string):
    """Repair the text with ``fix_text`` and apply fixed literal substitutions.

    Substitutions are applied one after another in the order listed, each as
    a plain (non-regex) ``str.replace`` over the whole string.

    Args:
        string: Input text.

    Returns:
        The cleaned text.
    """
    string = fix_text(string)
    replacements = {
        "**": "",
        # NOTE(review): this key was an unterminated ''' in the file as found,
        # which is a syntax error. It is presumably an HTML-escaped apostrophe
        # that got decoded in transit; both common spellings are mapped here.
        # Confirm against the original source.
        "&#x27;": "'",
        "&#39;": "'",
        'AI;': 'Artificial Intelligence!',
        'iddqd;': 'Immortality cheat code',
        '😉;': 'wink wink!',
        ':D': '*laughs* Ahahaha!',
        ';D': '*laughs* Ahahaha!',
    }
    for old, new in replacements.items():
        string = string.replace(old, new)
    return string
# Replace numbers with their word equivalents
def replace_numbers(string):
    """Spell out standalone integers in *string* as words.

    The whole string is returned untouched if it contains anything that looks
    like an IPv4 or IPv6 address, a numeric range (``1-4``), an ISO date, or a
    token mixing letters and digits. Otherwise every standalone integer is
    replaced via ``num2words``. The two halves of a decimal such as ``3.14``
    are deliberately left as digits so the number is not torn apart into
    "three.fourteen" and later stages can still recognize it as a decimal.

    Args:
        string: Input text.

    Returns:
        The text with standalone integers converted to words, or the input
        unchanged when one of the skip patterns is present.
    """
    ipv4_pattern = r'(\b\d{1,3}(\.\d{1,3}){3}\b)'
    ipv6_pattern = r'([0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{1,4}'
    range_pattern = r'\b\d+-\d+\b'  # Detect ranges like 1-4
    date_pattern = r'\b\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2})?\b'
    alphanumeric_pattern = r'\b[A-Za-z]+\d+|\d+[A-Za-z]+\b'

    # Do not process IP addresses, ranges, date patterns, or alphanumerics
    skip_patterns = (
        ipv4_pattern,
        ipv6_pattern,
        range_pattern,
        date_pattern,
        alphanumeric_pattern,
    )
    if any(re.search(pattern, string) for pattern in skip_patterns):
        return string

    # Convert standalone numbers and port numbers
    def convert_number(match):
        # The pattern only ever matches runs of digits, so int() is safe here.
        return num2words(int(match.group()))

    # A digit run preceded by "<digit>." or followed by ".<digit>" is part of
    # a decimal (or dotted version) and is skipped.
    standalone_integer = r'(?<!\d\.)\b\d+\b(?!\.\d)'
    return re.sub(standalone_integer, convert_number, string)
# Replace abbreviations with expanded form
def replace_abbreviations(string):
    """Spell out short all-caps words letter by letter.

    The text is split on whitespace. A word is expanded when it is uppercase,
    at most four characters long, contains no digit, and is not one of
    "ID", "AM" or "PM". Each character of such a word is looked up in
    ``alphabet_map`` (characters without an entry are kept as they are).
    Words are re-joined with single spaces.
    """
    def spell_out(token):
        is_short_caps = token.isupper() and len(token) <= 4
        has_digit = any(ch.isdigit() for ch in token)
        if not is_short_caps or has_digit or token in ["ID", "AM", "PM"]:
            return token
        return ''.join([alphabet_map.get(ch, ch) for ch in token])

    return ' '.join(spell_out(token) for token in string.split())
def clean_whitespace(string):
    """Tidy spacing: no whitespace before . , ? ! and no runs of whitespace.

    Whitespace directly in front of one of the four punctuation marks is
    dropped, any run of two or more whitespace characters becomes a single
    space, and the result is stripped at both ends.
    """
    # Pull punctuation back against the preceding token.
    tightened = re.sub(r'\s+([.,?!])', r'\1', string)
    # Only runs of 2+ are collapsed; a lone whitespace character is kept as is.
    single_spaced = re.sub(r'\s{2,}', ' ', tightened)
    return single_spaced.strip()
def make_dots_tts_friendly(text):
    """Replace dots that should be spoken with the words "dot" or "point".

    Rules are applied in this order:
      1. IPv4-looking addresses: every dot becomes " dot ".
      2. Names ending in one of a fixed list of suffixes (com, net, org, io,
         gov, edu, exe, dll, local): the dot becomes " dot ".
      3. Decimals (digits.digits): the dot becomes " point ".
      4. Any remaining dot directly followed by a word character becomes
         "dot", set off by spaces so it never fuses with the preceding token
         ("file.txt" -> "file dot txt", ".Net" -> "dot Net").

    A dot that is not followed by a word character (e.g. a sentence-ending
    period) is left alone.

    Args:
        text: Input text.

    Returns:
        The text with speakable dots spelled out.
    """
    # Handle IP addresses (force "dot")
    ipv4_pattern = r'\b\d{1,3}(\.\d{1,3}){3}\b'
    text = re.sub(ipv4_pattern, lambda m: m.group(0).replace('.', ' dot '), text)

    # Handle domain-like endings (force "dot")
    domain_pattern = r'\b([\w-]+)\.(com|net|org|io|gov|edu|exe|dll|local)\b'
    text = re.sub(domain_pattern, lambda m: m.group(0).replace('.', ' dot '), text)

    # Handle decimals (use "point")
    decimal_pattern = r'\b\d+\.\d+\b'
    text = re.sub(decimal_pattern, lambda m: m.group(0).replace('.', ' point '), text)

    # Dot embedded in a token (file.txt): needs a space on both sides,
    # otherwise the result would read "filedot txt".
    text = re.sub(r'(?<=\S)\.(?=\w)', ' dot ', text)

    # Handle leading dot words (.Net -> dot Net); nothing precedes the dot
    # except whitespace or the start of the text, so no leading space is added.
    text = re.sub(r'\.(?=\w)', 'dot ', text)
    return text
# Main preprocessing pipeline
def preprocess_all(string):
    """Run every preprocessing stage over *string* and return the result.

    Stages run in a fixed order, each receiving the previous stage's output:
    dates, invalid characters, numbers, abbreviations, dots, whitespace.
    """
    pipeline = (
        normalize_dates,
        replace_invalid_chars,
        replace_numbers,
        replace_abbreviations,
        make_dots_tts_friendly,
        clean_whitespace,
    )
    for stage in pipeline:
        string = stage(string)
    return string
# Expose a testing function for external use
def test_preprocessing(file_path):
    """Print each line of a text file next to its preprocessed form.

    The file is read completely first; then, for every line, the stripped
    original and the output of ``preprocess_all`` are printed, followed by a
    blank line.

    Args:
        file_path: Path of the text file to read.
    """
    with open(file_path, 'r') as handle:
        raw_lines = handle.readlines()

    for original in (raw_line.strip() for raw_line in raw_lines):
        # Process before printing so a failing line prints nothing at all.
        processed = preprocess_all(original)
        print(f"Original: {original}")
        print(f"Processed: {processed}\n")
# Script entry point: preprocess the file named by the first CLI argument.
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if cli_args:
        test_preprocessing(cli_args[0])
    else:
        print("Please provide a file path as an argument.")
|