Spaces:
Running
Running
| from flask import Flask, render_template, request, redirect, url_for | |
| import pandas as pd | |
| from fuzzywuzzy import fuzz | |
| import os | |
| from flask import send_file | |
# Flask application object shared by the view functions in this module.
app = Flask(__name__)

# Folder names are relative paths; the rest of the module joins them with
# file names via os.path.join.
app.config.update(
    UPLOAD_FOLDER='uploads',
    OUTPUT_FOLDER='output',
)

# Most recent result produced by process_csv (None until a file has been
# processed); upload_file passes it to the index.html template.
output_file = None
def _adjacent_ratios(values):
    """Return the fuzzy ratio of each value against the value before it.

    The first value has no predecessor and is given a ratio of 100. An
    empty input yields an empty list.
    """
    if not values:
        return []
    return [100] + [fuzz.ratio(prev, cur) for prev, cur in zip(values, values[1:])]


def _name_groups(ratios, threshold):
    """Number consecutive rows, starting a new group when the ratio is <= threshold."""
    groups = []
    counter = 0
    for position, ratio in enumerate(ratios):
        # The first row always opens group 1, whatever its ratio is.
        if position == 0 or ratio <= threshold:
            counter += 1
        groups.append(counter)
    return groups


def _address_groups(name_groups, ratios, threshold):
    """Number address clusters inside each name group, restarting at 1 per name group."""
    labels = []
    counter = 0
    current_name_group = None
    for name_group, ratio in zip(name_groups, ratios):
        if name_group != current_name_group:
            # A new name group always restarts the numbering, even when the
            # address happens to resemble the last row of the previous name
            # group; otherwise labels from different clusters could collide.
            counter = 1
            current_name_group = name_group
        elif ratio <= threshold:
            counter += 1
        labels.append(str(counter))
    return labels


def process_csv(input_path, name_threshold=80, address_threshold=70):
    """Group vendor rows of a CSV file by fuzzy name and address similarity.

    Rows are sorted by lower-cased vendor name and each row is compared with
    the row before it using ``fuzz.ratio``; a ratio above ``name_threshold``
    keeps the row in the same name group. Inside each name group the rows are
    then sorted by address and clustered the same way with
    ``address_threshold``. The result gets three extra columns,
    ``name_based_group``, ``address_based_group`` and ``Group``
    (``Group_<name group>_<address group>``), and is written to
    ``output.csv`` inside the configured output folder.

    Args:
        input_path: Path of the CSV file to read. It must contain the columns
            'Vendor Name', 'Address (street)', 'Postal code', 'City',
            'Country' and 'Region'.
        name_threshold: Ratio a name must exceed to join the previous row's
            name group.
        address_threshold: Ratio an address must exceed to join the previous
            row's address group.

    Returns:
        The path of the written CSV file.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    global output_file
    df = pd.read_csv(input_path)

    # Build one comparable address string per row. Every part is separated by
    # '-', including Country and Region.
    address = df['Address (street)'].astype(str)
    for column in ('Postal code', 'City', 'Country', 'Region'):
        address = address + '-' + df[column].astype(str)
    # NOTE: 'Address' and 'Name' are working columns; they are dropped again
    # before the file is written.
    df['Address'] = address.str.lower()
    df['Name'] = df['Vendor Name'].astype(str).str.lower()

    # Pass 1: similar names become neighbours once sorted, so comparing each
    # row with its predecessor is enough. Every row takes part, including the
    # last one.
    df.sort_values(['Name'], inplace=True)
    df = df.reset_index(drop=True)
    df['name_based_group'] = _name_groups(
        _adjacent_ratios(df['Name'].tolist()), name_threshold
    )

    # Pass 2: the address ratios are computed only after this sort so that
    # they describe the final row order.
    df.sort_values(['name_based_group', 'Address'], inplace=True)
    df = df.reset_index(drop=True)
    df['address_based_group'] = _address_groups(
        df['name_based_group'].tolist(),
        _adjacent_ratios(df['Address'].tolist()),
        address_threshold,
    )

    # Concatenate both group numbers into one unique group name.
    df['Group'] = (
        'Group_'
        + df['name_based_group'].astype(str)
        + '_'
        + df['address_based_group'].astype(str)
    )
    df.drop(columns=['Address', 'Name'], inplace=True)

    # The output folder may not exist yet on a fresh checkout.
    os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
    output_path = os.path.join(app.config['OUTPUT_FOLDER'], 'output.csv')
    df.to_csv(output_path, index=False)
    output_file = 'output.csv'
    return output_path
def upload_file():
    """Handle the upload form: process a posted CSV, otherwise render the page.

    On POST the uploaded file is stored in the configured upload folder,
    passed to ``process_csv``, and the client is redirected back to this
    view. On any other method ``index.html`` is rendered with the current
    ``output_file`` value.

    NOTE(review): no route decorator is visible in this view of the file;
    confirm the view is registered elsewhere.
    """
    global output_file
    if request.method == 'POST':
        file = request.files['file']
        if file:
            # The file name is chosen by the client. Keep only its last path
            # component so it cannot point outside the upload folder.
            filename = os.path.basename(file.filename.replace('\\', '/'))
            if filename not in ('', '.', '..'):
                # The upload folder may not exist yet on a fresh checkout.
                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
                file.save(file_path)
                output_file = process_csv(file_path)
        return redirect(url_for('upload_file'))
    return render_template('index.html', output_file=output_file)
def download_file():
    """Send the generated ``output.csv`` to the client as an attachment.

    Responds with 404 when no output file has been written yet.

    NOTE(review): no route decorator is visible in this view of the file;
    confirm the view is registered elsewhere.
    """
    from flask import abort

    # Use an absolute path based on the working directory, which is where
    # process_csv writes its relative output path.
    output_path = os.path.abspath(
        os.path.join(app.config['OUTPUT_FOLDER'], 'output.csv')
    )
    if not os.path.isfile(output_path):
        abort(404)
    return send_file(output_path, as_attachment=True)
if __name__ == '__main__':
    # Debug mode enables the interactive debugger, which must not be reachable
    # on a deployed service. It is therefore opt-in: set FLASK_DEBUG=1 (or
    # "true"/"yes") for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled)