# File size: 4,176 Bytes
# f218e1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from flask import Flask, render_template, request, redirect, url_for
import pandas as pd
from fuzzywuzzy import fuzz
import os
from flask import send_file

# Flask application object shared by every route in this module.
app = Flask(__name__)

# Folder names (relative paths) used for uploaded inputs and generated results.
app.config.update(
    UPLOAD_FOLDER='uploads',
    OUTPUT_FOLDER='output',
)

# Module-level record of the most recent result; stays None until an upload is processed.
output_file = None

def _group_adjacent(values, threshold, partitions=None):
    """Label runs of similar neighbours in an already-sorted list of strings.

    Each value is compared with its immediate predecessor using ``fuzz.ratio``.
    A score strictly greater than ``threshold`` keeps the value in the
    predecessor's run; otherwise a new run starts.

    Args:
        values: Sorted list of strings to label.
        threshold: Similarity score that must be exceeded to stay in a run.
        partitions: Optional list, parallel to ``values``. When given, values
            are never compared across a change of partition key and the run
            counter restarts at 1 for every new partition.

    Returns:
        A list of 1-based integer labels, one per input value (empty for
        empty input).
    """
    labels = []
    counter = 0
    for i, value in enumerate(values):
        if i == 0:
            counter = 1
        elif partitions is not None and partitions[i] != partitions[i - 1]:
            # New partition: numbering starts over and the previous row,
            # which belongs to another partition, is not a valid neighbour.
            counter = 1
        elif fuzz.ratio(values[i - 1], value) > threshold:
            pass  # similar enough to the previous row: same run
        else:
            counter += 1
        labels.append(counter)
    return labels


def process_csv(input_path, name_threshold=80, address_threshold=70):
    """Group likely-duplicate vendor rows of a CSV and write the result.

    Rows are sorted by lower-cased vendor name and consecutive names whose
    fuzzy ratio exceeds ``name_threshold`` share a ``name_based_group``.
    Inside each name group, rows are sorted by a lower-cased composite address
    and consecutive addresses whose ratio exceeds ``address_threshold`` share
    an ``address_based_group`` (numbered from 1 within every name group).
    The final ``Group`` column is ``Group_<name group>_<address group>``.

    Every row, including the last one, is compared with its predecessor, and
    addresses are only compared between rows of the same name group.

    Args:
        input_path: Path of the CSV to read. It must contain the columns
            'Vendor Name', 'Address (street)', 'Postal code', 'City',
            'Country' and 'Region'.
        name_threshold: Ratio a name must exceed to join the previous row's
            name group. Defaults to 80.
        address_threshold: Ratio an address must exceed to join the previous
            row's address group. Defaults to 70.

    Returns:
        The path of the written file: 'output.csv' inside
        ``app.config['OUTPUT_FOLDER']``.

    Side effects:
        Creates the output folder if needed, writes the CSV and sets the
        module-level ``output_file`` to 'output.csv'.
    """
    global output_file
    df = pd.read_csv(input_path)

    # Temporary comparison keys; both are dropped again before writing.
    # NOTE(review): 'Country' and 'Region' are joined without a '-' separator,
    # unlike the other parts. Kept as-is; confirm whether that is intended.
    df['Address'] = (
        df['Address (street)'].astype(str) + '-'
        + df['Postal code'].astype(str) + '-'
        + df['City'].astype(str) + '-'
        + df['Country'].astype(str) + df['Region'].astype(str)
    ).str.lower()
    df['Name'] = df['Vendor Name'].astype(str).str.lower()

    # A stable sort keeps rows with identical names in their input order, so
    # the output is deterministic.
    df = df.sort_values('Name', kind='mergesort').reset_index(drop=True)
    df['name_based_group'] = _group_adjacent(df['Name'].tolist(), name_threshold)

    df = df.sort_values(['name_based_group', 'Address']).reset_index(drop=True)
    df['address_based_group'] = _group_adjacent(
        df['Address'].tolist(),
        address_threshold,
        partitions=df['name_based_group'].tolist(),
    )

    # Concatenate for unique group name
    df['Group'] = [
        'Group_{}_{}'.format(name_group, address_group)
        for name_group, address_group in zip(df['name_based_group'], df['address_based_group'])
    ]

    df = df.drop(columns=['Address', 'Name'])

    # to_csv does not create missing directories.
    os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
    output_path = os.path.join(app.config['OUTPUT_FOLDER'], 'output.csv')
    df.to_csv(output_path, index=False)
    output_file = 'output.csv'
    return output_path

@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Show the upload form and handle CSV uploads.

    GET renders 'index.html'. POST saves the uploaded 'file' field into
    ``app.config['UPLOAD_FOLDER']``, runs ``process_csv`` on it, stores the
    returned path in the module-level ``output_file`` and redirects back to
    this view. A POST without a usable file falls through to rendering the
    form again.

    Returns:
        A redirect response after a successful upload, otherwise the rendered
        'index.html' template with the current ``output_file`` value.
    """
    global output_file
    if request.method == 'POST':
        # .get() lets a POST without a 'file' field re-render the form
        # instead of raising on the missing key.
        file = request.files.get('file')
        if file:
            # The filename is client-controlled: keep only its last path
            # component so it cannot point outside the upload folder.
            safe_name = os.path.basename(file.filename.replace('\\', '/'))
            if safe_name not in ('', '.', '..'):
                # file.save() does not create missing directories.
                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                file_path = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
                file.save(file_path)
                output_file = process_csv(file_path)
                return redirect(url_for('upload_file'))

    return render_template('index.html', output_file=output_file)

@app.route('/downloads/output.csv')
def download_file():
    """Send the generated 'output.csv' as a file download.

    Returns:
        The file as an attachment, or a plain-text 404 response when no
        output file exists yet.
    """
    # Make the path absolute from the working directory, which is the base the
    # relative path in process_csv is written against.
    # NOTE(review): send_file may resolve relative paths against the
    # application root instead; confirm against the installed Flask version.
    output_path = os.path.abspath(os.path.join(app.config['OUTPUT_FOLDER'], 'output.csv'))
    if not os.path.isfile(output_path):
        # Nothing has been processed yet: report "not found" rather than
        # letting send_file fail with a server error.
        return 'No output file is available yet. Upload a CSV first.', 404
    return send_file(output_path, as_attachment=True)


if __name__ == '__main__':
    # Executed directly as a script: start the app's built-in server with
    # debug mode switched on.
    launch_options = dict(debug=True)
    app.run(**launch_options)