# DUPLICATE_REM / app.py
# Sambit20030731's picture
# Upload 8 files
# f218e1e verified
from flask import Flask, render_template, request, redirect, url_for
import pandas as pd
from fuzzywuzzy import fuzz
import os
from flask import send_file
# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)

# Directory names (relative paths) used for uploaded CSVs and for the
# generated result file respectively.
app.config.update(
    UPLOAD_FOLDER='uploads',
    OUTPUT_FOLDER='output',
)

# Module-level record of the most recently produced output file; None until
# a CSV has been processed. It is shared by every request served by this
# process.
output_file = None
def process_csv(input_path, name_threshold=80, address_threshold=70):
    """Group likely-duplicate vendor rows of a CSV file and write the result.

    The input CSV must contain the columns 'Vendor Name',
    'Address (street)', 'Postal code', 'City', 'Country' and 'Region'.

    Rows are sorted by lower-cased vendor name and chained into
    "name groups": a row joins the group of the row before it when the
    fuzzy ratio of their names is strictly greater than ``name_threshold``.
    Inside each name group the rows are sorted by a lower-cased composite
    address and chained into "address groups" the same way using
    ``address_threshold``. Address group numbering restarts at 1 for every
    name group.

    The output keeps the original columns (rows re-ordered) and adds
    'name_based_group', 'address_based_group' and
    'Group' ('Group_<name group>_<address group>').

    Args:
        input_path: Path of the CSV file to read.
        name_threshold: Two adjacent names belong together when their
            fuzzy ratio exceeds this value.
        address_threshold: Two adjacent addresses belong together when
            their fuzzy ratio exceeds this value.

    Returns:
        Path of the written file, ``<OUTPUT_FOLDER>/output.csv``.

    Raises:
        KeyError: If a required column is missing from the CSV.

    Side effects:
        Creates the output folder if needed, writes the CSV, and sets the
        module-level ``output_file`` to 'output.csv'.
    """
    global output_file

    df = pd.read_csv(input_path)

    # Composite comparison keys. Every address part is separated by '-'
    # (the separator between country and region used to be missing).
    address_columns = ['Address (street)', 'Postal code', 'City', 'Country', 'Region']
    address = df[address_columns[0]].astype(str)
    for column in address_columns[1:]:
        address = address + '-' + df[column].astype(str)
    df['Address'] = address.str.lower()
    df['Name'] = df['Vendor Name'].astype(str).str.lower()

    # Pass 1: chain rows with similar names. Sorting first puts similar
    # names next to each other, so only neighbours need comparing.
    df = df.sort_values('Name').reset_index(drop=True)
    name_groups = []
    group_counter = 0
    previous_name = None
    for name in df['Name']:
        # Every row, including the last one, is compared with its
        # predecessor; the first row always opens group 1.
        if previous_name is None or fuzz.ratio(previous_name, name) <= name_threshold:
            group_counter += 1
        name_groups.append(group_counter)
        previous_name = name
    df['name_based_group'] = name_groups

    # Pass 2: within each name group, chain rows with similar addresses.
    df = df.sort_values(['name_based_group', 'Address']).reset_index(drop=True)
    address_groups = []
    address_counter = 0
    previous = None  # (name group, address) of the preceding row
    for name_group, row_address in zip(df['name_based_group'], df['Address']):
        if previous is None or name_group != previous[0]:
            # A new name group always restarts address numbering, even when
            # its first address resembles the previous group's last one;
            # otherwise two unrelated rows could share a Group label.
            address_counter = 1
        elif fuzz.ratio(previous[1], row_address) <= address_threshold:
            address_counter += 1
        address_groups.append(address_counter)
        previous = (name_group, row_address)
    df['address_based_group'] = address_groups

    # Concatenate for unique group name
    df['Group'] = (
        'Group_' + df['name_based_group'].astype(str)
        + '_' + df['address_based_group'].astype(str)
    )

    # The comparison keys are working columns only.
    df.drop(columns=['Address', 'Name'], inplace=True)

    os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
    output_path = os.path.join(app.config['OUTPUT_FOLDER'], 'output.csv')
    df.to_csv(output_path, index=False)
    output_file = 'output.csv'
    return output_path
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Show the upload page and process a CSV submitted through it.

    GET renders 'index.html', passing the current ``output_file`` value.
    POST saves the uploaded 'file' form field into the upload folder, runs
    ``process_csv`` on it, stores the returned path in the module-level
    ``output_file`` and redirects back to this page. A POST without a
    usable file simply re-renders the page.
    """
    global output_file
    if request.method == 'POST':
        file = request.files.get('file')
        # The filename is client-controlled: keep only its last path
        # component so a name such as '../../app.py' cannot escape the
        # upload folder.
        filename = ''
        if file:
            filename = os.path.basename(file.filename.replace('\\', '/'))
        if filename and filename not in ('.', '..'):
            os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
            file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
            file.save(file_path)
            output_file = process_csv(file_path)
            # Redirect so a browser refresh does not re-submit the upload.
            return redirect(url_for('upload_file'))
    return render_template('index.html', output_file=output_file)
@app.route('/downloads/output.csv')
def download_file():
    """Send the generated ``output.csv`` as a file download.

    Responds with 404 when no output file has been produced yet.
    """
    # Imported locally so this handler does not depend on the module's
    # import block being changed.
    from flask import abort

    # process_csv writes relative to the working directory; resolve the
    # same way here so both functions refer to the same file.
    output_path = os.path.abspath(
        os.path.join(app.config['OUTPUT_FOLDER'], 'output.csv')
    )
    if not os.path.isfile(output_path):
        abort(404)
    return send_file(output_path, as_attachment=True)
if __name__ == '__main__':
    # Debug mode stays on by default, as before, but can be switched off
    # without editing the code by setting FLASK_DEBUG to 0/false/no/off.
    # NOTE(review): the interactive debugger must not be exposed on a
    # publicly reachable deployment.
    debug_enabled = os.environ.get('FLASK_DEBUG', '1').strip().lower() not in (
        '0', 'false', 'no', 'off',
    )
    app.run(debug=debug_enabled)