File size: 2,493 Bytes
1c3d5f7 0f96a0a 1c3d5f7 70635df 0f96a0a 8c8c685 fa3105f 0f96a0a fa3105f 0f96a0a 1c3d5f7 fa3105f 0f96a0a 1c3d5f7 0f96a0a 1c3d5f7 0f96a0a 1c3d5f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
from collections import Counter
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
# Load the amenity and bank tables from CSV files in the working directory.
df_amenities = pd.read_csv("df_amenities.csv")
df_banks = pd.read_csv("df_banks.csv")
# Each "fsq_category_labels" cell is passed through eval() to turn it back
# into a Python object (presumably the text of a list of label strings, since
# compute_features indexes and splits the entries -- confirm against the CSV).
# NOTE(review): eval() executes whatever expression the CSV cell contains. If
# the cells are plain literals, ast.literal_eval would parse them without
# running code -- confirm the stored format before switching.
df_amenities["fsq_category_labels"] = df_amenities["fsq_category_labels"].apply(
lambda x: eval(x)
)
# KD-trees built over the (lat, lon) columns of each table; compute_features
# uses them for fixed-radius neighbour lookups. Row order of each coordinate
# array matches its DataFrame, so tree indices can be used with .iloc.
bank_coords = df_banks[['lat','lon']].values
tree_banks = cKDTree(bank_coords)
amenity_coords = df_amenities[['lat','lon']].values
tree_amenities = cKDTree(amenity_coords)
# Category names for which compute_features always emits a ``num_<name>``
# count (0 when no amenity of that category is found). The order here is the
# order in which those keys are added to the feature dict.
DATASET_COLUMNS = [
    'Dining and Drinking',
    'Community and Government',
    'Retail',
    'Business and Professional Services',
    'Landmarks and Outdoors',
    'Arts and Entertainment',
    'Health and Medicine',
    'Travel and Transportation',
    'Sports and Recreation',
    'Event',
]
def compute_features(candidate_point, radius=0.005):
    """Describe a candidate point by the banks and amenities around it.

    Neighbours are looked up in the module-level KD-trees ``tree_banks`` and
    ``tree_amenities``. The search radius and every reported distance are
    plain Euclidean distances in the units of the ``lat``/``lon`` columns
    (NOTE(review): presumably decimal degrees, so these are not metric
    distances -- confirm).

    Args:
        candidate_point: ``(lat, lon)`` pair of the location to describe.
        radius: Search radius around the point, in coordinate units.

    Returns:
        A dict with, in this order:
          * ``num_banks_in_radius``: number of banks within ``radius``.
          * ``mean_dist_banks`` / ``min_dist_bank``: mean and minimum
            distance to those banks; both fall back to ``radius`` when no
            bank is in range.
          * ``total_amenities``: number of amenities within ``radius``.
          * ``category_diversity``: number of distinct top-level categories
            among those amenities.
          * ``num_<category>``: one count per entry of ``DATASET_COLUMNS``
            (0 when the category does not occur).
    """
    lat, lon = candidate_point
    query_point = [lat, lon]

    # Banks
    bank_idxs = tree_banks.query_ball_point(query_point, r=radius)
    n_banks = len(bank_idxs)
    if n_banks > 0:
        neighbors = df_banks.iloc[bank_idxs]
        # Compute the distance vector once and derive both statistics from it.
        bank_dists = np.sqrt(
            (neighbors['lat'] - lat) ** 2 + (neighbors['lon'] - lon) ** 2
        )
        mean_dist_banks = np.mean(bank_dists)
        min_dist_bank = np.min(bank_dists)
    else:
        # No bank in range: report the search radius instead of leaving the
        # distance features undefined.
        mean_dist_banks = radius
        min_dist_bank = radius

    # Amenities
    amenity_idxs = tree_amenities.query_ball_point(query_point, r=radius)
    amenities = df_amenities.iloc[amenity_idxs]
    total_amenities = len(amenities)

    # Only the first label of each amenity is used, reduced to the part
    # before the first ">" (its top-level category). Amenities with an empty
    # label list are skipped.
    top_level_categories = [
        labels[0].split(">")[0].strip()
        for labels in amenities['fsq_category_labels']
        if len(labels) > 0
    ]
    count_per_category = Counter(top_level_categories)

    features = {
        'num_banks_in_radius': n_banks,
        'mean_dist_banks': mean_dist_banks,
        'min_dist_bank': min_dist_bank,
        'total_amenities': total_amenities,
        # Distinct categories == number of keys in the counter.
        'category_diversity': len(count_per_category),
    }

    # Emit a count for every known category so the feature set has the same
    # keys regardless of what is found around the point.
    for feat in DATASET_COLUMNS:
        features[f'num_{feat}'] = count_per_category.get(feat, 0)

    return features