File size: 2,493 Bytes
1c3d5f7
 
 
0f96a0a
1c3d5f7
70635df
0f96a0a
 
8c8c685
 
 
fa3105f
0f96a0a
 
 
 
 
 
 
fa3105f
 
 
 
 
0f96a0a
1c3d5f7
 
 
 
 
 
fa3105f
 
0f96a0a
1c3d5f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f96a0a
 
 
1c3d5f7
0f96a0a
1c3d5f7
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from collections import Counter
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree

# Load the amenity and bank tables. The paths are relative, so both CSV files
# must be present in the process's current working directory.
df_amenities = pd.read_csv("df_amenities.csv")
df_banks = pd.read_csv("df_banks.csv")

# The label column comes back from the CSV as text; each cell is evaluated
# back into a Python object (compute_features indexes it as a sequence of
# "A > B > ..." strings -- presumably a list; verify against the CSV).
# NOTE(review): eval() executes whatever expression the cell contains, so a
# tampered CSV can run arbitrary code. If the cells are always plain literals,
# ast.literal_eval would be the safe equivalent -- confirm the cell format
# before switching.
df_amenities["fsq_category_labels"] = df_amenities["fsq_category_labels"].apply(
    lambda x: eval(x)
)

# Spatial indexes over the raw (lat, lon) column values of each table.
# Row i of a coordinate array is row i of its source frame, so the indices
# returned by a tree query can be fed straight to ``DataFrame.iloc``.
bank_coords = df_banks[["lat", "lon"]].to_numpy()
amenity_coords = df_amenities[["lat", "lon"]].to_numpy()

tree_banks = cKDTree(bank_coords)
tree_amenities = cKDTree(amenity_coords)

# Top-level category names (the text before the first ">" of a label) for
# which compute_features emits a ``num_<name>`` count, in output order.
DATASET_COLUMNS = [
    "Dining and Drinking",
    "Community and Government",
    "Retail",
    "Business and Professional Services",
    "Landmarks and Outdoors",
    "Arts and Entertainment",
    "Health and Medicine",
    "Travel and Transportation",
    "Sports and Recreation",
    "Event",
]

def compute_features(candidate_point, radius: float = 0.005) -> dict:
    """Summarise the banks and amenities surrounding a candidate location.

    Both module-level KD-trees are queried for rows within ``radius`` of the
    candidate. The trees are built from the raw ``lat``/``lon`` column values,
    so distances are plain Euclidean distances in coordinate units
    (presumably degrees -- no geodesic correction is applied).

    Args:
        candidate_point: ``(lat, lon)`` pair of the location to describe.
        radius: Search radius, in the same units as the coordinates.

    Returns:
        A dict with the keys

        * ``num_banks_in_radius`` -- number of banks inside the radius,
        * ``mean_dist_banks`` / ``min_dist_bank`` -- mean / smallest distance
          to those banks; both are set to ``radius`` when no bank is in range,
        * ``total_amenities`` -- number of amenities inside the radius,
        * ``category_diversity`` -- number of distinct top-level categories
          among those amenities,
        * ``num_<category>`` -- one count per entry of ``DATASET_COLUMNS``
          (0 when the category does not occur).
    """
    lat, lon = candidate_point
    query_point = [lat, lon]

    # Banks
    bank_idxs = tree_banks.query_ball_point(query_point, r=radius)
    n_banks = len(bank_idxs)
    if n_banks > 0:
        neighbors = df_banks.iloc[bank_idxs]
        # Compute the distance vector once and derive both statistics from it.
        bank_dists = np.sqrt(
            (neighbors['lat'] - lat) ** 2 + (neighbors['lon'] - lon) ** 2
        )
        mean_dist_banks = np.mean(bank_dists)
        min_dist_bank = np.min(bank_dists)
    else:
        # No bank in range: report the search radius itself as the distance.
        mean_dist_banks = radius
        min_dist_bank = radius

    # Amenities
    amenity_idxs = tree_amenities.query_ball_point(query_point, r=radius)
    amenities = df_amenities.iloc[amenity_idxs]
    total_amenities = len(amenities)

    # Each amenity contributes only the top-level part of its FIRST label
    # ("A > B > C" -> "A"); amenities with no labels are skipped.
    top_level_categories = [
        labels[0].split(">")[0].strip()
        for labels in amenities['fsq_category_labels']
        if len(labels) > 0
    ]
    category_counts = Counter(top_level_categories)

    features = {
        'num_banks_in_radius': n_banks,
        'mean_dist_banks': mean_dist_banks,
        'min_dist_bank': min_dist_bank,
        'total_amenities': total_amenities,
        'category_diversity': len(category_counts),
    }

    # Emit a fixed set of count columns so every call yields the same keys,
    # regardless of which categories happen to be present nearby.
    for feat in DATASET_COLUMNS:
        features[f'num_{feat}'] = category_counts.get(feat, 0)

    return features