amitom commited on
Commit
73e9c25
·
0 Parent(s):

Minimal app for HF Space

Browse files
Files changed (44) hide show
  1. .gitignore +56 -0
  2. Dockerfile +20 -0
  3. README.md +62 -0
  4. app.py +112 -0
  5. data/Los Angeles/coordinates/CA 110 North.csv +3 -0
  6. data/Los Angeles/coordinates/CA 110 South.csv +3 -0
  7. data/Los Angeles/coordinates/CA 118 East.csv +3 -0
  8. data/Los Angeles/coordinates/CA 118 West.csv +3 -0
  9. data/Los Angeles/coordinates/CA 134 East.csv +3 -0
  10. data/Los Angeles/coordinates/CA 134 West.csv +3 -0
  11. data/Los Angeles/coordinates/CA 170 North.csv +3 -0
  12. data/Los Angeles/coordinates/CA 170 South.csv +3 -0
  13. data/Los Angeles/coordinates/CA 2 North.csv +3 -0
  14. data/Los Angeles/coordinates/CA 2 South.csv +3 -0
  15. data/Los Angeles/coordinates/I 10 East.csv +3 -0
  16. data/Los Angeles/coordinates/I 10 West.csv +3 -0
  17. data/Los Angeles/coordinates/I 110 North.csv +3 -0
  18. data/Los Angeles/coordinates/I 110 South.csv +3 -0
  19. data/Los Angeles/coordinates/I 210 East.csv +3 -0
  20. data/Los Angeles/coordinates/I 210 West.csv +3 -0
  21. data/Los Angeles/coordinates/I 405 North.csv +3 -0
  22. data/Los Angeles/coordinates/I 405 South.csv +3 -0
  23. data/Los Angeles/coordinates/I 5 North.csv +3 -0
  24. data/Los Angeles/coordinates/I 5 South.csv +3 -0
  25. data/Los Angeles/coordinates/I 605 North.csv +3 -0
  26. data/Los Angeles/coordinates/I 605 South.csv +3 -0
  27. data/Los Angeles/coordinates/US 101 North.csv +3 -0
  28. data/Los Angeles/coordinates/US 101 South.csv +3 -0
  29. data_collection/collect.py +98 -0
  30. data_process/process.py +138 -0
  31. data_process/split.py +9 -0
  32. data_process/unified.py +93 -0
  33. model_v3/encode.py +332 -0
  34. model_v3/evaluate.py +385 -0
  35. model_v3/experiments.md +17 -0
  36. model_v3/final_encoder.pkl +3 -0
  37. model_v3/final_lstm.pt +3 -0
  38. model_v3/predict_road.py +437 -0
  39. model_v3/train_lstm.py +498 -0
  40. requirements.txt +33 -0
  41. roadmap/RoadMap.py +513 -0
  42. roadmap/__init__.py +0 -0
  43. roadmap/mock_predictor.py +43 -0
  44. roadmap/utils.py +167 -0
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual environment
7
+ .venv/
8
+ env/
9
+ venv/
10
+
11
+ # Logs
12
+ *.log
13
+
14
+ # Mac / Windows / Linux OS generated files
15
+ .DS_Store
16
+ Thumbs.db
17
+
18
+ # IDEs and editors
19
+ .vscode/
20
+ .idea/
21
+
22
+ # Cache and large files
23
+ cache/
24
+ map/cache/
25
+
26
+ # Common large data formats
27
+ *.sqlite
28
+ *.h5
29
+ *.tar
30
+ *.zip
31
+ *.gz
32
+ *.npz
33
+ *.ckpt
34
+ *.pdf
35
+ *.mp4
36
+ *.mov
37
+ *.avi
38
+ *.tiff
39
+ *.jpg
40
+ *.jpeg
41
+ *.png
42
+ *.webp
43
+ *.html
44
+
45
+ # Block heavy .json and .graphml
46
+ *.json
47
+ *.graphml
48
+
49
+ # Project-specific exclusions
50
+ roadmap/old_implmentation/road_network.graphml
51
+ roadmap/data/Los Angeles/maps/Los_Angeles_network.graphml
52
+ data_process/exmaple.csv
53
+
54
+ # Prevent Git LFS tracked files from being committed directly
55
+ .gitattributes
56
+ data_process/balanced_example.csv
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.13.5-slim
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get install -y \
6
+ build-essential \
7
+ curl \
8
+ git \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ COPY requirements.txt ./
12
+ COPY src/ ./src/
13
+
14
+ RUN pip3 install -r requirements.txt
15
+
16
+ EXPOSE 8501
17
+
18
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
+
20
+ ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ sdk: streamlit
3
+ app_file: app.py
4
+ ---
5
+
6
+ # TrafCast
7
+
8
+ A traffic speed prediction system for Los Angeles using LSTM neural networks.
9
+
10
+ ## Overview
11
+
12
+ TrafCast predicts real-time traffic speeds across major Los Angeles highways and roads using deep learning. The system uses an LSTM (Long Short-Term Memory) model trained on historical traffic data to forecast speed patterns.
13
+
14
+ ## Model Details
15
+
16
+ - **Architecture**: LSTM neural network with 2,191,617 parameters
17
+ - **Training Data**: 32+ million data points from LA traffic sensors
18
+ - **Performance**: Best validation loss of 6.6276, test loss of 6.0229
19
+ - **Features**: Weather data, road characteristics, time patterns, and historical speeds
20
+
21
+ ## Quick Start
22
+
23
+ ### Prerequisites
24
+ - Python 3.8+
25
+ - Virtual environment (recommended)
26
+
27
+ ### Installation
28
+
29
+ 1. **Clone the repository**
30
+ ```bash
31
+ git clone <repository-url>
32
+ cd TrafCast
33
+ ```
34
+
35
+ 2. **Create and activate virtual environment**
36
+ ```bash
37
+ python -m venv .venv
38
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
39
+ ```
40
+
41
+ 3. **Install dependencies**
42
+ ```bash
43
+ pip install -r requirements.txt
44
+ ```
45
+
46
+ 4. **Run the application**
47
+ ```bash
48
+ streamlit run app.py
49
+ ```
50
+
51
+ The app will be available at `http://localhost:8501`
52
+
53
+ ## Usage
54
+
55
+ 1. Select roads from the available LA highways
56
+ 2. Choose a date and time for prediction
57
+ 3. Select visualization mode (Predicted, Real, or Comparison)
58
+ 4. Click "Apply Prediction" to generate traffic speed maps
59
+
60
+ ## Data
61
+
62
+ The model was trained on compressed CSV files containing traffic sensor data from major LA roads including I-405, US-101, I-5, and state highways.
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ from datetime import datetime
4
+ from streamlit_folium import st_folium
5
+ from roadmap.RoadMap import RoadMapManager
6
+
7
+ SUPPORTED_CITIES = ["Los Angeles"]
8
+ LA_ROADS = [
9
+ 'I 405 North', 'I 405 South',
10
+ 'US 101 North', 'US 101 South',
11
+ 'I 5 North', 'I 5 South',
12
+ 'I 110 North', 'I 110 South',
13
+ 'CA 170 North', 'CA 170 South',
14
+ 'CA 118 East', 'CA 118 West',
15
+ 'CA 134 East', 'CA 134 West',
16
+ 'I 605 North', 'I 605 South',
17
+ 'I 210 East', 'I 210 West'
18
+ ]
19
+ LA_BBOX = (-118.569946, 33.252470, -116.976929, 34.388779)
20
+
21
+ st.title("TrafCast: Traffic Forecasting for Los Angeles")
22
+
23
+ city = st.selectbox("Select City", SUPPORTED_CITIES)
24
+
25
+ @st.cache_resource
26
+ def get_map_manager(city_name):
27
+ return RoadMapManager(city_name, LA_BBOX)
28
+
29
+ map_manager = get_map_manager(city)
30
+
31
+ selected_roads = st.multiselect("Select Roads to Load", LA_ROADS)
32
+
33
+ if selected_roads and st.button("Load Road Data"):
34
+ with st.spinner("Loading road data..."):
35
+ map_manager.set_roads(selected_roads)
36
+ st.session_state["roads_loaded"] = True
37
+ st.success("Road data loaded successfully.")
38
+
39
+ if st.session_state.get("roads_loaded"):
40
+ default_date = st.session_state.get("selected_date", datetime.now().date())
41
+ default_time = st.session_state.get("selected_time", datetime.now().time())
42
+
43
+ st.date_input("Choose Date", value=default_date, key="selected_date")
44
+ st.time_input("Choose Time", value=default_time, key="selected_time")
45
+
46
+ predict_time = datetime.combine(
47
+ st.session_state["selected_date"],
48
+ st.session_state["selected_time"]
49
+ )
50
+
51
+
52
+ map_option = st.radio(
53
+ "Choose map visualization:",
54
+ ["Predicted Speed Only", "Real Speed Only", "Side by Side Comparison"],
55
+ key="map_option"
56
+ )
57
+
58
+ if st.button("Apply Prediction"):
59
+ with st.spinner("Running prediction and generating map..."):
60
+ map_manager.apply_prediction_data(predict_time)
61
+
62
+ if map_option == "Predicted Speed Only":
63
+ folium_map = map_manager.draw_map_offset()
64
+ st.session_state["folium_map"] = folium_map
65
+ st.session_state["map_type"] = "predicted"
66
+ elif map_option == "Real Speed Only":
67
+ folium_map = map_manager.draw_map_with_real_speed()
68
+ st.session_state["folium_map"] = folium_map
69
+ st.session_state["map_type"] = "real"
70
+ else: # Side by Side Comparison
71
+ predicted_map, real_map = map_manager.draw_side_by_side_maps()
72
+ st.session_state["predicted_map"] = predicted_map
73
+ st.session_state["real_map"] = real_map
74
+ st.session_state["map_type"] = "side_by_side"
75
+ st.success("Map updated!")
76
+
77
+ if st.session_state.get("map_type") == "side_by_side":
78
+ if "predicted_map" in st.session_state and "real_map" in st.session_state:
79
+ # Use container to control spacing
80
+ with st.container():
81
+ st.subheader("🟢 Predicted Speed")
82
+ st_folium(
83
+ st.session_state["predicted_map"],
84
+ width=1200,
85
+ height=600,
86
+ returned_objects=[],
87
+ key="predicted_map"
88
+ )
89
+
90
+ # Minimal spacing
91
+ st.markdown("<div style='margin-top: 10px;'></div>", unsafe_allow_html=True)
92
+
93
+ with st.container():
94
+ st.subheader("🔴 Real Speed")
95
+ st_folium(
96
+ st.session_state["real_map"],
97
+ width=1200,
98
+ height=600,
99
+ returned_objects=[],
100
+ key="real_map"
101
+ )
102
+
103
+ elif "folium_map" in st.session_state:
104
+ map_title = "Predicted Speed" if st.session_state.get("map_type") == "predicted" else "Real Speed"
105
+ st.subheader(f"🗺️ {map_title}")
106
+ st_folium(
107
+ st.session_state["folium_map"],
108
+ width=1000,
109
+ height=1000,
110
+ returned_objects=[],
111
+ key="traffic_map"
112
+ )
data/Los Angeles/coordinates/CA 110 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e343e9c6e8222b00ee1078c75c54a3511ed2009eae77a3146bf34ffd42f77343
3
+ size 25792
data/Los Angeles/coordinates/CA 110 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50348bf9e65c9dc44f9a7900deb539baa494bcca855c0abc10eeeb45f6d618f4
3
+ size 28368
data/Los Angeles/coordinates/CA 118 East.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97b274ac0b44681b9768774e53280db2003b2eeda670915d0455b79a3cb241c9
3
+ size 10637
data/Los Angeles/coordinates/CA 118 West.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c564c75b547dc96c480c957467e46b6b1d3775a98159b93d8539cc81a06f9ef
3
+ size 11257
data/Los Angeles/coordinates/CA 134 East.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:419e83eec0a4b448b66443210c5384bb3ab692d5c48a06d7501f7d37d4e8a101
3
+ size 28810
data/Los Angeles/coordinates/CA 134 West.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b19c456b566ba7c06ba5a00809632eb72b17ace442deefdc97079346f2ddfc
3
+ size 30711
data/Los Angeles/coordinates/CA 170 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8eb7225f8319965002d9b304349b15fb82a59e804353cdffcd70bf528b50f05
3
+ size 11192
data/Los Angeles/coordinates/CA 170 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3490d44731131ab2bbfcd1c2f76c03705b1d0ecc9cbaee9ceaba353dc4d4c25
3
+ size 10316
data/Los Angeles/coordinates/CA 2 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4d9e165b566c129bfd30f41faa36f30aa8ccfabc5d3dcb1aa69c6cab8a6a58b
3
+ size 129078
data/Los Angeles/coordinates/CA 2 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:419c5cdafe432fbc968c98fec5e5e6774fc0155427c8a7b2d5958ed4affb6294
3
+ size 128876
data/Los Angeles/coordinates/I 10 East.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44e09007d08b1c0f078a8c42a8a7260a4b996e7990818dc8e4e0b6bb76a7aef1
3
+ size 146208
data/Los Angeles/coordinates/I 10 West.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:333b59cb3426fcc2766d1798d1d924211a9bb260bd148bfa436da9b3fb34ba50
3
+ size 139940
data/Los Angeles/coordinates/I 110 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b137934c12d5b5be71fa37b3cc8327c26550d7e716bb926623cf0bbfbc75467c
3
+ size 37294
data/Los Angeles/coordinates/I 110 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3a3e9ff223dced2f8c1e3d5ef456ea78a4a5896fd8f7e5efdecf3a74ac7842e
3
+ size 37664
data/Los Angeles/coordinates/I 210 East.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77662bfdd3e2f52055b830918649099a11895ad0183cf913f7e671cd6be34b7e
3
+ size 57529
data/Los Angeles/coordinates/I 210 West.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db1667f653de6d15d3c483c77485c396426b49f452b3a0aa00d3e08f4fff366e
3
+ size 55855
data/Los Angeles/coordinates/I 405 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdd36a44ab90041704654717bf9ec09a4431ccdc231ccbfb84b9fb956021d26b
3
+ size 135243
data/Los Angeles/coordinates/I 405 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dda435dd55be2789e05e8ad1a1cad5312495916a9ec7bd2a900aea5f043ea9b
3
+ size 140106
data/Los Angeles/coordinates/I 5 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b0a6f76b99ac3755f12d92661190bdd0e1071ef0c5ed21a060c557f121f2a35
3
+ size 141981
data/Los Angeles/coordinates/I 5 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47ff4e2b871bb69ddbca3510c55d407ca6c9da0d227f1f18ec297b6efdd273c1
3
+ size 141235
data/Los Angeles/coordinates/I 605 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c5ce7940b7cf70f9fdcb7c373ce0f851631b618ef60d822eb8bb1e925a81a3c
3
+ size 34769
data/Los Angeles/coordinates/I 605 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a85c66aac9b5f23a30199edfc5c0181ef2ccc1d7112ea0a4667330a69201b02a
3
+ size 31924
data/Los Angeles/coordinates/US 101 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11106ce640fec84cd685511a9e35d532a8255d8e79768dd9a38f7d19a389edaa
3
+ size 49616
data/Los Angeles/coordinates/US 101 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41f589bb93d40f90c2806fb577142247bc3a21d0d36dca5b9b5bf4e2d974d5fb
3
+ size 52135
data_collection/collect.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import os
3
+ from urllib.parse import urlparse, parse_qs
4
+ from datetime import datetime
5
+ from selenium import webdriver
6
+ from selenium.webdriver.chrome.service import Service
7
+ from selenium.webdriver.chrome.options import Options
8
+ from selenium.webdriver.common.by import By
9
+ from selenium.webdriver.common.keys import Keys
10
+
11
+ # Specify the download directory
12
+ download_path = '/Users/noamcohen/Downloads/data collection/data'
13
+ ROADS = {'405': ['N', 'S'], 101: ['N', 'S'], 5: ['N', 'S'], 110: ['N', 'S'], 170: ['N', 'S'], 118: ['E', 'W'],
14
+ 134: ['E', 'W'], 605: ['N', 'S'], 210: ['E', 'W']}
15
+ # Set up Chrome options
16
+ chrome_options = Options()
17
+ chrome_prefs = {
18
+ "download.default_directory": download_path, # Change the download directory
19
+ "download.prompt_for_download": False, # Disable download prompt
20
+ "download.directory_upgrade": True, # Automatically upgrade download path
21
+ }
22
+ chrome_options.add_experimental_option("prefs", chrome_prefs)
23
+
24
+ # Set up ChromeDriver with the path to your chromedriver and options
25
+ driver = webdriver.Chrome(service=Service('/opt/homebrew/bin/chromedriver'), options=chrome_options)
26
+
27
+ # Continue with your script as normal...
28
+
29
+ # Open the login page
30
+ url = "https://pems.dot.ca.gov/"
31
+ driver.get(url)
32
+
33
+ # Wait for the page to load
34
+ time.sleep(20)
35
+
36
+ # Find the username and password fields
37
+ username_field = driver.find_element(By.ID, "username")
38
+ password_field = driver.find_element(By.ID, "password")
39
+
40
+ # Enter your credentials
41
+ username_field.send_keys("amitomer1912@gmail.com")
42
+ password_field.send_keys("5^applel?X")
43
+
44
+ # Find and click the login button (using the 'login' name attribute)
45
+ login_button = driver.find_element(By.NAME, "login")
46
+ login_button.click()
47
+
48
+ # Wait for login to complete
49
+ time.sleep(20)
50
+ i = 1
51
+ s_time_id=1740787200
52
+ while(i <= 1):
53
+ # Once logged in, navigate to the report page
54
+ if i <= 9:
55
+ day = f"0{i}"
56
+ else:
57
+ day = i
58
+ for road in ROADS.keys():
59
+ for dir in ROADS.get(road):
60
+
61
+ report_url = f"https://pems.dot.ca.gov/?report_form=1&dnode=Freeway&content=spatial&tab=contours&export=&fwy={road}&dir={dir}&s_time_id={s_time_id}&s_time_id_f=03%2F{day}%2F2025&from_hh=0&to_hh=23&start_pm=.0&end_pm=1000.09&lanes=&station_type=ml&q=speed&colormap=30%2C31%2C32&sc=auto&ymin=&ymax=&view_d=2&chart.x=93&chart.y=20"
62
+ driver.get(report_url)
63
+ time.sleep(60)
64
+
65
+ # Find the "Export XLS" button by its 'name' or 'alt' attribute
66
+ export_button = driver.find_element(By.NAME, "xls")
67
+
68
+ # Click the "Export XLS" button
69
+ export_button.click()
70
+
71
+ # Wait for the download to complete (you can increase or decrease this based on your network speed)
72
+ time.sleep(100)
73
+
74
+ # Extract highway number and date from the report URL
75
+ parsed_url = urlparse(report_url)
76
+ query_params = parse_qs(parsed_url.query)
77
+ highway_number = query_params.get('fwy', ['unknown'])[0] # Default to 'unknown' if not found
78
+ date_taken = query_params.get('s_time_id_f', ['unknown_date'])[0] # Default to 'unknown_date' if not found
79
+
80
+ # Construct the new file name
81
+ new_file_name = rf"{highway_number}_{dir}_{date_taken.replace("/", "*")}.xlsx"
82
+ new_file_path = os.path.join(download_path, new_file_name)
83
+
84
+ # Original file path (before renaming)
85
+ original_file_path = os.path.join(download_path, 'pems_output.xlsx')
86
+
87
+ # Rename the file if it exists
88
+ if os.path.exists(original_file_path):
89
+ os.rename(original_file_path, new_file_path)
90
+ print(f"File renamed to: {new_file_path}")
91
+ else:
92
+ print("Original file not found!")
93
+
94
+ print(f"Download {i} to March complete!")
95
+ i+=1
96
+ s_time_id += 86400
97
+
98
+ driver.quit()
data_process/process.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import glob
3
+ import os
4
+ import sys
5
+ import numpy as np
6
+ from sklearn.neighbors import BallTree
7
+ from datetime import datetime
8
+
9
+ EARTH_RADIUS_M = 6371000
10
+ PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
11
+ sys.path.append(PROJECT_ROOT)
12
+
13
+ from roadmap.utils import add_weather_to_df
14
+
15
+ def compute_sensor_id(df: pd.DataFrame,
16
+ lat_col: str = "Latitude",
17
+ lon_col: str = "Longitude",
18
+ decimals: int = 6,
19
+ out_col: str = "sensor_id") -> pd.DataFrame:
20
+ df[out_col] = (
21
+ df[lat_col].round(decimals).astype(str)
22
+ + ";" +
23
+ df[lon_col].round(decimals).astype(str)
24
+ )
25
+ return df
26
+
27
+ def prepare_data_df(df_data: pd.DataFrame, coordinate: pd.DataFrame, date: str):
28
+ """"
29
+ first remove points with no observations, add date to the table and weather
30
+ """
31
+ #df_data.drop(df_data[df_data["% Observed"] < 50].index, inplace=True)
32
+ df_data["Time"] = pd.to_datetime(date + " " + df_data["Time"].astype(str),format="%Y-%m-%d %H:%M")
33
+
34
+ df_data = add_coordinate(coordinate, df_data)
35
+ df_data = compute_sensor_id(df_data)
36
+
37
+ df_data["Time_hour"] = df_data["Time"].dt.round("h")
38
+
39
+ try:
40
+ df_data = enrich_weather_hourly(df_data) #TODO find better way to do this
41
+ except Exception as e:
42
+ print(f"Error enriching weather: {e}")
43
+ df_data["weather"] = None
44
+
45
+ df_data["Day"] = df_data["Time"].dt.dayofweek
46
+
47
+ return df_data
48
+
49
+ def add_coordinate(df_coord: pd.DataFrame, df_data: pd.DataFrame):
50
+ df_coord = df_coord.sort_values(by="Abs PM").reset_index(drop=True)
51
+ df_data = df_data.sort_values(by="Postmile (Abs)").reset_index(drop=True)
52
+
53
+ coord_abs_pm = df_coord["Abs PM"].values
54
+ coord_lat = df_coord["Latitude"].values
55
+ coord_lon = df_coord["Longitude"].values
56
+
57
+ def find_closest_index(target):
58
+ return np.abs(coord_abs_pm - target).argmin()
59
+
60
+ closest_indices = df_data["Postmile (Abs)"].apply(find_closest_index)
61
+
62
+ df_data["Latitude"] = closest_indices.apply(lambda idx: coord_lat[idx]) # type: ignore
63
+ df_data["Longitude"] = closest_indices.apply(lambda idx: coord_lon[idx]) # type: ignore
64
+
65
+ return df_data
66
+
67
+ def enrich_weather_hourly(full_df: pd.DataFrame) -> pd.DataFrame:
68
+ pieces = []
69
+ for t_hour, chunk in full_df.groupby("Time_hour", sort=False):
70
+ enriched = add_weather_to_df(chunk.copy(),time=t_hour.to_pydatetime()) # type: ignore
71
+ pieces.append(enriched)
72
+
73
+ return pd.concat(pieces, ignore_index=True).sort_values("Time")
74
+
75
+ def build_sensor_index(data_df: pd.DataFrame) -> pd.DataFrame:
76
+ sensors = (
77
+ data_df
78
+ .drop_duplicates(subset=["Latitude", "Longitude"])
79
+ .loc[:, ["Latitude", "Longitude"]]
80
+ .copy()
81
+ .reset_index(drop=True)
82
+ )
83
+ sensors = compute_sensor_id(sensors) # adds 'sensor_id' as "lat;lon"
84
+ # keep only what we need; you can also keep an integer 'sensor_idx' if you like
85
+ return sensors[["sensor_id", "Latitude", "Longitude"]]
86
+
87
+
88
+ def build_enriched_time_series(data_df: pd.DataFrame,
89
+ sensor_map: pd.DataFrame,
90
+ sensors: pd.DataFrame) -> pd.DataFrame:
91
+ # Ensure both sides have the same sensor_id key
92
+ data_df = compute_sensor_id(data_df) # from its own lat/lon
93
+
94
+ enriched = (
95
+ sensor_map[["sensor_id", "Latitude", "Longitude", "lanes", "maxspeed", "ref", "direction"]]
96
+ .merge(
97
+ data_df[["sensor_id", "Time", "AggSpeed", "% Observed", "weather"]],
98
+ on="sensor_id",
99
+ how="left"
100
+ )
101
+ .sort_values(["sensor_id", "Time"])
102
+ .reset_index(drop=True)
103
+ )
104
+ return enriched
105
+
106
+
107
+ def normalize_lanes(value):
108
+ if isinstance(value, list):
109
+ try:
110
+ return min(int(x) for x in value)
111
+ except ValueError:
112
+ return None
113
+ try:
114
+ return int(value)
115
+ except ValueError:
116
+ return None
117
+
118
+
119
+ def map_pms_to_sensors(network_df: pd.DataFrame, sensors: pd.DataFrame, max_distance_m: float | None = None) -> pd.DataFrame:
120
+ net = network_df.dropna(subset=["Latitude", "Longitude"]).copy()
121
+
122
+ sensor_rad = np.radians(sensors[["Latitude", "Longitude"]].to_numpy())
123
+ net_rad = np.radians(net[["Latitude", "Longitude"]].to_numpy())
124
+
125
+ tree = BallTree(sensor_rad, metric="haversine")
126
+ dist_rad, idx = tree.query(net_rad,k=1)
127
+ dist_m = dist_rad[:, 0] * EARTH_RADIUS_M
128
+
129
+ matched = net.copy()
130
+ matched["sensor_id"] = sensors.iloc[idx[:, 0]].sensor_id.values
131
+ matched["matched_sensor_lat"] = sensors.iloc[idx[:, 0]].Latitude.values
132
+ matched["matched_sensor_lon"] = sensors.iloc[idx[:, 0]].Longitude.values
133
+ matched["distance_m"] = dist_m
134
+
135
+ if max_distance_m is not None:
136
+ matched = matched[matched["distance_m"] <= max_distance_m].copy()
137
+
138
+ return matched
data_process/split.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ allowed = ["CA 134", "CA 170","I 605"]
4
+
5
+ df = pd.read_csv("/Users/amitomer/Desktop/Personal/University/deep_learning/TrafCast/old_data/split_df_v2.csv")
6
+
7
+ df = df[df["road_name"].isin(allowed)]
8
+
9
+ df.to_csv("weights_test.csv", index=False)
data_process/unified.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import glob
4
+ import pandas as pd
5
+ from process import prepare_data_df, build_sensor_index, map_pms_to_sensors
6
+
7
+ index_map = {
8
+ '405': 'I',
9
+ '101': 'US',
10
+ '101': 'US',
11
+ '110': 'I',
12
+ '170': 'CA',
13
+ '118': 'CA',
14
+ '134': 'CA',
15
+ '605': 'I',
16
+ '210': 'I',
17
+ '5': 'I'
18
+ }
19
+ direction_map = {
20
+ 'E': 'East',
21
+ 'W': 'West',
22
+ 'N': 'North',
23
+ 'S': 'South'
24
+ }
25
+
26
+
27
+ PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
28
+ sys.path.append(PROJECT_ROOT)
29
+
30
+
31
+ DATA_DIR = os.path.join(PROJECT_ROOT, "data_collection", "data")
32
+ COORDINATE_DIR = os.path.join(PROJECT_ROOT,"data_collection", "coordinates")
33
+ full_df = pd.DataFrame()
34
+
35
+ for entry in os.scandir(DATA_DIR):
36
+ if entry.is_dir():
37
+ road_number = entry.name
38
+ road_name = index_map[road_number] +" "+ road_number
39
+
40
+ with os.scandir(entry.path) as it:
41
+ for sub in it:
42
+ if sub.is_dir() and sub.name in {'E', 'W', 'N', 'S'}:
43
+ direction = direction_map[sub.name]
44
+ print(f"Processing {road_name} {direction}")
45
+ data_dir = os.path.join(entry.path, sub.name)
46
+ coordinate_dir = os.path.join(COORDINATE_DIR, f"{road_name} {direction}.xlsx")
47
+
48
+ for i in range(1,32):
49
+ if i <=9:
50
+ raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*0{i}*2025.xlsx")
51
+ date = f"2025-03-0{i}"
52
+ else:
53
+ raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*{i}*2025.xlsx")
54
+ date = f"2025-03-{i}"
55
+
56
+ matching_files = glob.glob(raw_data_pattern)
57
+ if not matching_files:
58
+ print(f"No data file found for {road_name} {direction} on {date}, skipping...")
59
+ continue
60
+
61
+ raw_data = matching_files[0]
62
+
63
+ df_coord = pd.read_excel(coordinate_dir)
64
+ df_data = pd.read_excel(raw_data)
65
+ clean_data_df = prepare_data_df(df_data, df_coord,date)
66
+ sensors = build_sensor_index(clean_data_df)
67
+
68
+ enriched = map_pms_to_sensors(clean_data_df, sensors)
69
+ enriched["road_name"] = road_name
70
+ enriched["direction"] = direction
71
+
72
+ full_df = pd.concat([full_df, enriched], ignore_index=True)
73
+ print(f"finished {date}")
74
+
75
+
76
+ full_df.drop(columns=["Postmile (Abs)", "Postmile (CA)", "VDS", "Time_hour", "matched_sensor_lat", "matched_sensor_lon", "distance_m"], inplace=True)
77
+ desired_order = [
78
+ "Time","sensor_id", "Latitude", "Longitude",
79
+ "road_name", "direction", "# Lane Points",
80
+ "% Observed", "weather", "Day", "AggSpeed"
81
+ ]
82
+ full_df = full_df[desired_order]
83
+ full_df.rename(columns={
84
+ "AggSpeed": "speed_mph",
85
+ "# Lane Points": "lanes"
86
+ }, inplace=True)
87
+ full_df.to_csv('full_df_weather.csv',index=False)
88
+
89
+
90
+
91
+
92
+
93
+
model_v3/encode.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ encode.py – Traffic data encoder for LSTM traffic flow prediction
3
+
4
+ This module provides TrafficDataEncoder for processing 5-minute traffic sensor data
5
+ into sequences suitable for LSTM training. Key features:
6
+ - Sensor-safe windowing (no cross-sensor leakage)
7
+ - Feature engineering (time, geographic, categorical)
8
+ - Speed-based class weighting support
9
+ - Robust missing value handling
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ from sklearn.preprocessing import OrdinalEncoder, StandardScaler
17
+ from sklearn.utils.validation import check_is_fitted
18
+ from typing import List, Tuple, Dict, Optional
19
+ import joblib
20
+ from pathlib import Path
21
+
22
+
23
+ class TrafficDataEncoder:
24
+ """
25
+ Encodes traffic sensor data into sequences for LSTM training.
26
+
27
+ Features:
28
+ - Geographic coordinates (lat/lon -> x/y km)
29
+ - Time features (hour, day of week)
30
+ - Categorical encoding (direction, weather)
31
+ - Speed-based class weighting
32
+ - Sensor-safe windowing
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ seq_len: int = 12, # 12 * 5min = 1 hour history
38
+ horizon: int = 1, # predict 1 step ahead (5 minutes)
39
+ target_col: str = "speed_mph"
40
+ ):
41
+ self.seq_len = seq_len
42
+ self.horizon = horizon
43
+ self.target_col = target_col
44
+
45
+ # Feature columns
46
+ self.cat_cols = ["direction", "weather"]
47
+ self.num_cols = [
48
+ "lanes", "% Observed", "Latitude", "Longitude",
49
+ "hour_sin", "hour_cos", "dow_sin", "dow_cos"
50
+ ]
51
+
52
+ # Fitted components
53
+ self.ordinal_encoder: Optional[OrdinalEncoder] = None
54
+ self.scaler: Optional[StandardScaler] = None
55
+ self.num_medians: Dict[str, float] = {}
56
+ self.is_fitted = False
57
+
58
+ def _ensure_sensor_id_and_sort(self, df: pd.DataFrame) -> pd.DataFrame:
59
+ """Create sensor_id and sort by sensor and time."""
60
+ df = df.copy()
61
+
62
+ # Create sensor_id from coordinates
63
+ if "sensor_id" not in df.columns:
64
+ df["sensor_id"] = (
65
+ df["Latitude"].round(6).astype(str) + ";" +
66
+ df["Longitude"].round(6).astype(str)
67
+ )
68
+
69
+ # Parse time and sort
70
+ df["Time"] = pd.to_datetime(df["Time"], errors="coerce")
71
+ return df.sort_values(["sensor_id", "Time"]).reset_index(drop=True)
72
+
73
+ def _add_time_features(self, df: pd.DataFrame) -> pd.DataFrame:
74
+ """Add cyclical time features."""
75
+ dt = pd.to_datetime(df["Time"], errors="coerce")
76
+ hour = dt.dt.hour + dt.dt.minute / 60.0
77
+ dow = dt.dt.dayofweek
78
+
79
+ df["hour_sin"] = np.sin(2 * np.pi * hour / 24)
80
+ df["hour_cos"] = np.cos(2 * np.pi * hour / 24)
81
+ df["dow_sin"] = np.sin(2 * np.pi * dow / 7)
82
+ df["dow_cos"] = np.cos(2 * np.pi * dow / 7)
83
+
84
+ return df
85
+
86
+ def _clean_numeric(self, df: pd.DataFrame) -> pd.DataFrame:
87
+ """Clean and convert numeric columns."""
88
+ # Ensure lanes is numeric
89
+ df["lanes"] = pd.to_numeric(df.get("lanes", 0), errors="coerce")
90
+
91
+ # Ensure % Observed is numeric
92
+ df["% Observed"] = pd.to_numeric(df.get("% Observed", 100), errors="coerce")
93
+
94
+ return df
95
+
96
+ def _compute_speed_weights(self, y: np.ndarray) -> Dict[str, float]:
97
+ """Compute class weights for speed-based weighting."""
98
+ # Define speed classes based on user's experience
99
+ low_mask = y <= 30
100
+ high_mask = y >= 60
101
+ medium_mask = ~(low_mask | high_mask)
102
+
103
+ n_low = low_mask.sum()
104
+ n_medium = medium_mask.sum()
105
+ n_high = high_mask.sum()
106
+ n_total = len(y)
107
+
108
+ print(f"Speed distribution:")
109
+ print(f" Low (≤30): {n_low} samples ({n_low/n_total*100:.1f}%)")
110
+ print(f" Medium (30-60): {n_medium} samples ({n_medium/n_total*100:.1f}%)")
111
+ print(f" High (≥60): {n_high} samples ({n_high/n_total*100:.1f}%)")
112
+
113
+ # Compute inverse frequency weights
114
+ if n_low > 0 and n_medium > 0 and n_high > 0:
115
+ weight_low = n_total / (3 * n_low)
116
+ weight_medium = n_total / (3 * n_medium)
117
+ weight_high = n_total / (3 * n_high)
118
+ else:
119
+ weight_low = weight_medium = weight_high = 1.0
120
+
121
+ print(f"Class weights: Low={weight_low:.2f}, Medium={weight_medium:.2f}, High={weight_high:.2f}")
122
+
123
+ return {
124
+ "weight_low": weight_low,
125
+ "weight_medium": weight_medium,
126
+ "weight_high": weight_high,
127
+ "low_threshold": 30,
128
+ "high_threshold": 60
129
+ }
130
+
131
+ def fit(self, df: pd.DataFrame) -> "TrafficDataEncoder":
132
+ """Fit the encoder on training data."""
133
+ print("Fitting encoder...")
134
+
135
+ # Preprocess data
136
+ df = self._ensure_sensor_id_and_sort(df)
137
+ df = self._add_time_features(df)
138
+ df = self._clean_numeric(df)
139
+
140
+ # Handle missing values
141
+ df[self.cat_cols] = df[self.cat_cols].fillna("UNK")
142
+ self.num_medians = df[self.num_cols].median(numeric_only=True).to_dict()
143
+ df[self.num_cols] = df[self.num_cols].fillna(self.num_medians)
144
+
145
+ # Fit encoders
146
+ self.ordinal_encoder = OrdinalEncoder(
147
+ handle_unknown="use_encoded_value",
148
+ unknown_value=-1
149
+ )
150
+ self.ordinal_encoder.fit(df[self.cat_cols])
151
+
152
+ self.scaler = StandardScaler()
153
+ self.scaler.fit(df[self.num_cols])
154
+
155
+ self.is_fitted = True
156
+ print("Encoder fitted successfully")
157
+ return self
158
+
159
+ def _preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
160
+ """Apply preprocessing steps."""
161
+ df = self._ensure_sensor_id_and_sort(df)
162
+ df = self._add_time_features(df)
163
+ df = self._clean_numeric(df)
164
+
165
+ # Handle missing values using fitted medians
166
+ df[self.cat_cols] = df[self.cat_cols].fillna("UNK")
167
+ df[self.num_cols] = df[self.num_cols].fillna(self.num_medians)
168
+
169
+ return df
170
+
171
+ def transform(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
172
+ """
173
+ Transform data into sequences.
174
+
175
+ Returns:
176
+ X: (N, seq_len, n_features) - input sequences
177
+ y: (N, horizon) - target values
178
+ target_indices: (N,) - indices of target rows in original df
179
+ timestamps: (N,) - timestamps of target rows
180
+ """
181
+ check_is_fitted(self, ["ordinal_encoder", "scaler", "num_medians"])
182
+
183
+ df = self._preprocess(df)
184
+
185
+ X_chunks = []
186
+ y_chunks = []
187
+ target_indices = []
188
+ timestamps = []
189
+
190
+ # Process each sensor separately to avoid cross-sensor leakage
191
+ for sensor_id, group in df.groupby("sensor_id", sort=False):
192
+ if len(group) < self.seq_len + self.horizon:
193
+ continue # Not enough data for this sensor
194
+
195
+ # Encode features
196
+ cat_features = self.ordinal_encoder.transform(group[self.cat_cols]).astype(np.float32)
197
+ num_features = self.scaler.transform(group[self.num_cols]).astype(np.float32)
198
+ features = np.concatenate([num_features, cat_features], axis=1)
199
+
200
+ # Get targets
201
+ targets = group[self.target_col].to_numpy(dtype=np.float32)
202
+ group_timestamps = group["Time"].to_numpy()
203
+ group_indices = group.index.to_numpy()
204
+
205
+ # Create sliding windows
206
+ n_windows = len(group) - self.seq_len - self.horizon + 1
207
+ for i in range(n_windows):
208
+ X_chunks.append(features[i:i + self.seq_len])
209
+ y_chunks.append(targets[i + self.seq_len:i + self.seq_len + self.horizon])
210
+ target_indices.append(group_indices[i + self.seq_len + self.horizon - 1])
211
+ timestamps.append(group_timestamps[i + self.seq_len + self.horizon - 1])
212
+
213
+ if not X_chunks:
214
+ # Return empty arrays with correct shapes
215
+ n_features = len(self.num_cols) + len(self.cat_cols)
216
+ return (
217
+ np.empty((0, self.seq_len, n_features), dtype=np.float32),
218
+ np.empty((0, self.horizon), dtype=np.float32),
219
+ np.empty(0, dtype=int),
220
+ np.empty(0, dtype=object)
221
+ )
222
+
223
+ X = np.stack(X_chunks, axis=0)
224
+ y = np.stack(y_chunks, axis=0)
225
+ target_indices = np.array(target_indices, dtype=int)
226
+ timestamps = np.array(timestamps)
227
+
228
+ print(f"Created {len(X)} sequences from {len(df.groupby('sensor_id'))} sensors")
229
+ return X, y, target_indices, timestamps
230
+
231
+ def fit_transform(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
232
+ """Fit encoder and transform data in one step."""
233
+ return self.fit(df).transform(df)
234
+
235
+ def get_speed_weights(self, y: np.ndarray) -> Dict[str, float]:
236
+ """Get speed-based class weights for weighted loss."""
237
+ return self._compute_speed_weights(y)
238
+
239
+ def save(self, filepath: str) -> None:
240
+ """Save the fitted encoder."""
241
+ if not self.is_fitted:
242
+ raise ValueError("Encoder must be fitted before saving")
243
+
244
+ joblib.dump(self, filepath)
245
+ print(f"Encoder saved to {filepath}")
246
+
247
+ @classmethod
248
+ def load(cls, filepath: str) -> "TrafficDataEncoder":
249
+ """Load a fitted encoder."""
250
+
251
+ try:
252
+ encoder = joblib.load(filepath)
253
+ if not isinstance(encoder, cls):
254
+ raise ValueError(f"Loaded object is not a {cls.__name__}")
255
+ return encoder
256
+ except AttributeError as e:
257
+ if "TrafficDataEncoder" in str(e):
258
+ # Handle the case where encoder was saved from a different module context
259
+ print("Warning: Encoder was saved from different module context. Reconstructing...")
260
+
261
+ # Use a more robust approach with joblib
262
+ import sys
263
+ import types
264
+
265
+ # Temporarily modify sys.modules to include our class
266
+ original_main = sys.modules.get('__main__')
267
+ temp_module = types.ModuleType('temp_encode')
268
+ temp_module.TrafficDataEncoder = cls
269
+ sys.modules['__main__'] = temp_module
270
+
271
+ try:
272
+ # Now try loading with the modified module context
273
+ encoder = joblib.load(filepath)
274
+ if not isinstance(encoder, cls):
275
+ raise ValueError(f"Loaded object is not a {cls.__name__}")
276
+ return encoder
277
+ finally:
278
+ # Restore original __main__ module
279
+ if original_main is not None:
280
+ sys.modules['__main__'] = original_main
281
+ else:
282
+ del sys.modules['__main__']
283
+ else:
284
+ raise e
285
+
286
+
287
+ def main():
288
+ """CLI interface for encoding data."""
289
+ import argparse
290
+
291
+ parser = argparse.ArgumentParser(description="Encode traffic data for LSTM training")
292
+ parser.add_argument("csv_file", help="Path to CSV file with traffic data")
293
+ parser.add_argument("--seq_len", type=int, default=12, help="Sequence length (default: 12)")
294
+ parser.add_argument("--horizon", type=int, default=1, help="Prediction horizon (default: 1)")
295
+ parser.add_argument("--target_col", default="speed_mph", help="Target column name")
296
+ parser.add_argument("--save_encoder", help="Path to save fitted encoder")
297
+ parser.add_argument("--output", help="Path to save encoded data (optional)")
298
+
299
+ args = parser.parse_args()
300
+
301
+ # Load data
302
+ print(f"Loading data from {args.csv_file}")
303
+ df = pd.read_csv(args.csv_file)
304
+
305
+ # Create and fit encoder
306
+ encoder = TrafficDataEncoder(
307
+ seq_len=args.seq_len,
308
+ horizon=args.horizon,
309
+ target_col=args.target_col
310
+ )
311
+
312
+ X, y, target_indices, timestamps = encoder.fit_transform(df)
313
+
314
+ print(f"Encoded data shapes:")
315
+ print(f" X: {X.shape}")
316
+ print(f" y: {y.shape}")
317
+ print(f" Target indices: {len(target_indices)}")
318
+ print(f" Timestamps: {len(timestamps)}")
319
+
320
+ # Save encoder if requested
321
+ if args.save_encoder:
322
+ encoder.save(args.save_encoder)
323
+
324
+ # Save encoded data if requested
325
+ if args.output:
326
+ np.savez(args.output, X=X, y=y, target_indices=target_indices, timestamps=timestamps)
327
+ print(f"Encoded data saved to {args.output}")
328
+
329
+
330
+ if __name__ == "__main__":
331
+ main()
332
+
model_v3/evaluate.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ evaluate.py – Model evaluation and prediction for traffic flow prediction
3
+
4
+ Features:
5
+ - Load trained model and encoder
6
+ - Generate predictions on new data
7
+ - Comprehensive evaluation metrics
8
+ - Visualization support
9
+ - Batch prediction for large datasets
10
+ """
11
+
12
+ import argparse
13
+ import numpy as np
14
+ import pandas as pd
15
+ import torch
16
+ import torch.nn as nn
17
+ from torch.utils.data import DataLoader, TensorDataset
18
+ from pathlib import Path
19
+ import joblib
20
+ from typing import Dict, List, Tuple, Optional
21
+ import matplotlib.pyplot as plt
22
+ try:
23
+ import seaborn as sns
24
+ sns.set_style("whitegrid")
25
+ except ImportError:
26
+ print("Warning: seaborn not available, using matplotlib defaults")
27
+
28
+ from encode import TrafficDataEncoder
29
+ from train_lstm import LSTMRegressor
30
+
31
+
32
+ def load_model_and_encoder(
33
+ model_path: str,
34
+ encoder_path: str,
35
+ device: torch.device
36
+ ) -> Tuple[LSTMRegressor, TrafficDataEncoder]:
37
+ """Load trained model and encoder."""
38
+ print(f"Loading encoder from {encoder_path}")
39
+ encoder = TrafficDataEncoder.load(encoder_path)
40
+
41
+ print(f"Loading model from {model_path}")
42
+ model_state = torch.load(model_path, map_location=device)
43
+
44
+ # Infer model architecture from the saved state_dict
45
+ n_features = len(encoder.num_cols) + len(encoder.cat_cols)
46
+
47
+ # Infer hidden_size from the first LSTM layer weights
48
+ # lstm.weight_ih_l0 shape is [4*hidden_size, n_features]
49
+ first_layer_weight_shape = model_state['lstm.weight_ih_l0'].shape
50
+ hidden_size = first_layer_weight_shape[0] // 4
51
+
52
+ # Check if bidirectional by looking for reverse weights
53
+ bidirectional = 'lstm.weight_ih_l0_reverse' in model_state
54
+
55
+ # Infer number of layers by counting unique layer indices
56
+ layer_keys = [k for k in model_state.keys() if k.startswith('lstm.weight_ih_l')]
57
+ n_layers = len(set([k.split('_l')[1].split('_')[0] for k in layer_keys]))
58
+
59
+ # Infer dropout from the model structure (this is harder to infer, so we'll use a default)
60
+ dropout = 0.3 # Default value
61
+
62
+ print(f"Inferred model architecture:")
63
+ print(f" n_features: {n_features}")
64
+ print(f" hidden_size: {hidden_size}")
65
+ print(f" n_layers: {n_layers}")
66
+ print(f" bidirectional: {bidirectional}")
67
+ print(f" dropout: {dropout}")
68
+
69
+ # Create model with inferred architecture
70
+ model = LSTMRegressor(
71
+ n_features=n_features,
72
+ hidden_size=hidden_size,
73
+ n_layers=n_layers,
74
+ dropout=dropout,
75
+ bidirectional=bidirectional
76
+ ).to(device)
77
+
78
+ model.load_state_dict(model_state)
79
+ model.eval()
80
+
81
+ print("Model and encoder loaded successfully")
82
+ return model, encoder
83
+
84
+
85
+ def compute_metrics(predictions: np.ndarray, targets: np.ndarray) -> Dict[str, float]:
86
+ """Compute comprehensive evaluation metrics."""
87
+ predictions = predictions.flatten()
88
+ targets = targets.flatten()
89
+
90
+ # Basic metrics
91
+ mae = np.mean(np.abs(predictions - targets))
92
+ mse = np.mean((predictions - targets) ** 2)
93
+ rmse = np.sqrt(mse)
94
+
95
+ # Percentage metrics
96
+ mape = np.mean(np.abs((targets - predictions) / (targets + 1e-8))) * 100
97
+
98
+ # R-squared
99
+ ss_res = np.sum((targets - predictions) ** 2)
100
+ ss_tot = np.sum((targets - np.mean(targets)) ** 2)
101
+ r2 = 1 - (ss_res / (ss_tot + 1e-8))
102
+
103
+ # Speed-specific metrics
104
+ speed_ranges = {
105
+ 'low (≤30)': targets <= 30,
106
+ 'medium (30-60)': (targets > 30) & (targets <= 60),
107
+ 'high (≥60)': targets >= 60
108
+ }
109
+
110
+ range_metrics = {}
111
+ for range_name, mask in speed_ranges.items():
112
+ if np.sum(mask) > 0:
113
+ range_pred = predictions[mask]
114
+ range_target = targets[mask]
115
+ range_metrics[f'mae_{range_name.replace(" ", "_").replace("(", "").replace(")", "")}'] = np.mean(np.abs(range_pred - range_target))
116
+ range_metrics[f'count_{range_name.replace(" ", "_").replace("(", "").replace(")", "")}'] = np.sum(mask)
117
+
118
+ metrics = {
119
+ 'mae': mae,
120
+ 'mse': mse,
121
+ 'rmse': rmse,
122
+ 'mape': mape,
123
+ 'r2': r2,
124
+ **range_metrics
125
+ }
126
+
127
+ return metrics
128
+
129
+
130
+ def predict_batch(
131
+ model: LSTMRegressor,
132
+ encoder: TrafficDataEncoder,
133
+ df: pd.DataFrame,
134
+ batch_size: int = 256,
135
+ device: torch.device = torch.device('cpu'),
136
+ train_ratio: float = 0.7,
137
+ val_ratio: float = 0.15
138
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
139
+ """
140
+ Generate predictions for the TEST portion of a dataset in batches.
141
+ Uses the same chronological split as training to ensure we only evaluate on test data.
142
+
143
+ Returns:
144
+ predictions: (N,) - predicted values (test set only)
145
+ targets: (N,) - actual values (test set only)
146
+ target_indices: (N,) - indices of target rows in original df (test set only)
147
+ """
148
+ print("Encoding data for prediction...")
149
+ X, y, target_indices, timestamps = encoder.transform(df)
150
+
151
+ if len(X) == 0:
152
+ print("No valid sequences found in data")
153
+ return np.array([]), np.array([]), np.array([])
154
+
155
+ # Apply the same chronological split as training
156
+ print("Applying chronological split to match training...")
157
+ sorted_indices = np.argsort(timestamps)
158
+ X_sorted = X[sorted_indices]
159
+ y_sorted = y[sorted_indices]
160
+ target_indices_sorted = target_indices[sorted_indices]
161
+ timestamps_sorted = timestamps[sorted_indices]
162
+
163
+ # Calculate split points (same as training)
164
+ n_total = len(X_sorted)
165
+ n_train = int(n_total * train_ratio)
166
+ n_val = int(n_total * val_ratio)
167
+
168
+ # Get test indices
169
+ test_indices = sorted_indices[n_train + n_val:]
170
+ X_test = X[test_indices]
171
+ y_test = y[test_indices]
172
+ target_indices_test = target_indices[test_indices]
173
+
174
+ print(f"Using test set: {len(X_test):,} samples ({(1-train_ratio-val_ratio)*100:.0f}%)")
175
+ if len(X_test) > 0:
176
+ test_timestamps = pd.to_datetime(timestamps[test_indices])
177
+ print(f"Test date range: {test_timestamps.min()} to {test_timestamps.max()}")
178
+
179
+ if len(X_test) == 0:
180
+ print("No test data available")
181
+ return np.array([]), np.array([]), np.array([])
182
+
183
+ print(f"Generating predictions for {len(X_test)} test sequences...")
184
+
185
+ # Create data loader for test set only
186
+ dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())
187
+ data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
188
+
189
+ predictions = []
190
+ targets = []
191
+
192
+ model.eval()
193
+ with torch.no_grad():
194
+ for batch_X, batch_y in data_loader:
195
+ batch_X = batch_X.to(device)
196
+ batch_pred = model(batch_X).cpu().numpy()
197
+ predictions.append(batch_pred)
198
+ targets.append(batch_y.numpy())
199
+
200
+ predictions = np.concatenate(predictions, axis=0).flatten()
201
+ targets = np.concatenate(targets, axis=0).flatten()
202
+
203
+ return predictions, targets, target_indices_test
204
+
205
+
206
+ def create_evaluation_plots(
207
+ predictions: np.ndarray,
208
+ targets: np.ndarray,
209
+ save_path: Optional[str] = None
210
+ ) -> None:
211
+ """Create comprehensive evaluation plots."""
212
+ fig, axes = plt.subplots(2, 2, figsize=(15, 12))
213
+
214
+ # Scatter plot: predictions vs targets
215
+ axes[0, 0].scatter(targets, predictions, alpha=0.5, s=1)
216
+ axes[0, 0].plot([targets.min(), targets.max()], [targets.min(), targets.max()], 'r--', lw=2)
217
+ axes[0, 0].set_xlabel('Actual Speed (mph)')
218
+ axes[0, 0].set_ylabel('Predicted Speed (mph)')
219
+ axes[0, 0].set_title('Predictions vs Actual')
220
+ axes[0, 0].grid(True, alpha=0.3)
221
+
222
+ # Residuals plot
223
+ residuals = predictions - targets
224
+ axes[0, 1].scatter(targets, residuals, alpha=0.5, s=1)
225
+ axes[0, 1].axhline(y=0, color='r', linestyle='--')
226
+ axes[0, 1].set_xlabel('Actual Speed (mph)')
227
+ axes[0, 1].set_ylabel('Residuals (mph)')
228
+ axes[0, 1].set_title('Residuals vs Actual')
229
+ axes[0, 1].grid(True, alpha=0.3)
230
+
231
+ # Error distribution
232
+ axes[1, 0].hist(residuals, bins=50, alpha=0.7, edgecolor='black')
233
+ axes[1, 0].set_xlabel('Residuals (mph)')
234
+ axes[1, 0].set_ylabel('Frequency')
235
+ axes[1, 0].set_title('Error Distribution')
236
+ axes[1, 0].grid(True, alpha=0.3)
237
+
238
+ # Speed range performance
239
+ speed_ranges = {
240
+ 'Low (≤30)': targets <= 30,
241
+ 'Medium (30-60)': (targets > 30) & (targets <= 60),
242
+ 'High (≥60)': targets >= 60
243
+ }
244
+
245
+ range_maes = []
246
+ range_names = []
247
+ for name, mask in speed_ranges.items():
248
+ if np.sum(mask) > 0:
249
+ range_mae = np.mean(np.abs(predictions[mask] - targets[mask]))
250
+ range_maes.append(range_mae)
251
+ range_names.append(name)
252
+
253
+ axes[1, 1].bar(range_names, range_maes, alpha=0.7)
254
+ axes[1, 1].set_ylabel('MAE (mph)')
255
+ axes[1, 1].set_title('MAE by Speed Range')
256
+ axes[1, 1].grid(True, alpha=0.3)
257
+
258
+ plt.tight_layout()
259
+
260
+ if save_path:
261
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
262
+ print(f"Evaluation plots saved to {save_path}")
263
+ else:
264
+ plt.show()
265
+
266
+
267
+ def main():
268
+ """Main evaluation function."""
269
+ parser = argparse.ArgumentParser(description="Evaluate trained LSTM model")
270
+
271
+ # Required arguments
272
+ parser.add_argument("--csv", required=True, help="Path to CSV file with test data")
273
+ parser.add_argument("--model", required=True, help="Path to trained model (.pt file)")
274
+ parser.add_argument("--encoder", required=True, help="Path to fitted encoder (.pkl file)")
275
+
276
+ # Optional arguments
277
+ parser.add_argument("--batch_size", type=int, default=256, help="Batch size for prediction")
278
+ parser.add_argument("--train_ratio", type=float, default=0.7, help="Training data ratio (must match training)")
279
+ parser.add_argument("--val_ratio", type=float, default=0.15, help="Validation data ratio (must match training)")
280
+ parser.add_argument("--output", help="Path to save predictions CSV")
281
+ parser.add_argument("--metrics_output", help="Path to save metrics JSON")
282
+ parser.add_argument("--plots_output", help="Path to save evaluation plots")
283
+ parser.add_argument("--device", default="auto", help="Device to use (auto, cpu, cuda, mps)")
284
+
285
+ args = parser.parse_args()
286
+
287
+ # Device selection
288
+ if args.device == "auto":
289
+ if torch.backends.mps.is_available():
290
+ device = torch.device("mps")
291
+ elif torch.cuda.is_available():
292
+ device = torch.device("cuda")
293
+ else:
294
+ device = torch.device("cpu")
295
+ else:
296
+ device = torch.device(args.device)
297
+
298
+ print(f"Using device: {device}")
299
+
300
+ # Load model and encoder
301
+ model, encoder = load_model_and_encoder(args.model, args.encoder, device)
302
+
303
+ # Load test data
304
+ print(f"Loading test data from {args.csv}")
305
+ df = pd.read_csv(args.csv)
306
+ print(f"Loaded {len(df):,} rows")
307
+
308
+ # Generate predictions (using same split ratios as training)
309
+ predictions, targets, target_indices = predict_batch(
310
+ model, encoder, df, args.batch_size, device,
311
+ train_ratio=args.train_ratio, val_ratio=args.val_ratio
312
+ )
313
+
314
+ if len(predictions) == 0:
315
+ print("No predictions generated. Check your data format.")
316
+ return
317
+
318
+ # Compute metrics
319
+ print("Computing evaluation metrics...")
320
+ metrics = compute_metrics(predictions, targets)
321
+
322
+ # Print metrics
323
+ print("\n" + "="*50)
324
+ print("EVALUATION METRICS")
325
+ print("="*50)
326
+ print(f"MAE (Mean Absolute Error): {metrics['mae']:.4f} mph")
327
+ print(f"RMSE (Root Mean Square Error): {metrics['rmse']:.4f} mph")
328
+ print(f"MAPE (Mean Absolute Percentage Error): {metrics['mape']:.2f}%")
329
+ print(f"R² (Coefficient of Determination): {metrics['r2']:.4f}")
330
+
331
+ # Speed range metrics
332
+ print("\nSpeed Range Performance:")
333
+ for key, value in metrics.items():
334
+ if key.startswith('mae_') and key.endswith('_count'):
335
+ continue
336
+ elif key.startswith('mae_'):
337
+ range_name = key.replace('mae_', '').replace('_', ' ')
338
+ count_key = f"count_{key.replace('mae_', '')}"
339
+ count = metrics.get(count_key, 0)
340
+ print(f" {range_name.title()}: {value:.4f} mph (n={count})")
341
+
342
+ # Save predictions if requested
343
+ if args.output:
344
+ print(f"\nSaving predictions to {args.output}")
345
+
346
+ # Create detailed prediction DataFrame
347
+ pred_df = pd.DataFrame({
348
+ 'prediction': predictions,
349
+ 'target': targets,
350
+ 'error': predictions - targets,
351
+ 'abs_error': np.abs(predictions - targets),
352
+ 'target_index': target_indices
353
+ })
354
+
355
+ # Add original data columns if possible
356
+ if len(target_indices) > 0 and max(target_indices) < len(df):
357
+ for col in ['Time', 'Latitude', 'Longitude', 'direction', 'weather']:
358
+ if col in df.columns:
359
+ pred_df[col] = df.iloc[target_indices][col].values
360
+
361
+ pred_df.to_csv(args.output, index=False)
362
+ print(f"Predictions saved with {len(pred_df)} rows")
363
+
364
+ # Save metrics if requested
365
+ if args.metrics_output:
366
+ import json
367
+
368
+ # right before json.dump
369
+ metrics = {k: (float(v) if isinstance(v, (np.floating, np.float32, np.float64)) else int(v) if isinstance(v, (np.integer,)) else v)
370
+ for k, v in metrics.items()}
371
+
372
+ with open(args.metrics_output, 'w') as f:
373
+ json.dump(metrics, f, indent=2)
374
+
375
+
376
+ # Create and save plots if requested
377
+ if args.plots_output:
378
+ print(f"Creating evaluation plots...")
379
+ create_evaluation_plots(predictions, targets, args.plots_output)
380
+
381
+ print("\nEvaluation completed successfully!")
382
+
383
+
384
+ if __name__ == "__main__":
385
+ main()
model_v3/experiments.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🏆 BEST PERFORMANCE MODEL (Lowest MAE)
2
+ python train_lstm.py --csv test1.csv --epochs 20 --batch_size 128 --hidden_size 256 --bidirectional --loss_type weighted_huber --model_out lstm_model_v3.pt
3
+ ==================================================
4
+ EVALUATION METRICS
5
+ ==================================================
6
+ MAE (Mean Absolute Error): 4.1815 mph
7
+ RMSE (Root Mean Square Error): 7.6657 mph
8
+ MAPE (Mean Absolute Percentage Error): 8.20%
9
+ R² (Coefficient of Determination): 0.4294
10
+
11
+ Speed Range Performance:
12
+ Low ≤30: 11.5730 mph (n=1889)
13
+ Medium 30-60: 7.1722 mph (n=15717)
14
+ High ≥60: 3.2165 mph (n=63171)
15
+
16
+ Saving predictions to test_predictions.csv
17
+ Predictions saved with 80539 rows
model_v3/final_encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73fbe65e3e7acbe6bd4a31d26bd53fec9b316340343805845b0ba2f5b910ce15
3
+ size 2493
model_v3/final_lstm.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d0f959fd9a7e62507407b7c2b72e90989ee14fad7c7f287271c339764d9460b
3
+ size 8770554
model_v3/predict_road.py ADDED
@@ -0,0 +1,437 @@
1
+ """
2
+ predict_road.py – Predict traffic speeds for all sensors on a specific road and direction
3
+
4
+ This module provides functions to predict traffic speeds for all sensors on a given road
5
+ and direction at a specific time. Designed for map visualization and real-time prediction.
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+ import torch
11
+ from typing import List, Dict, Tuple, Optional
12
+ from datetime import datetime, timedelta
13
+ import joblib
14
+ import sys
15
+ import os
16
+
17
+ # Add current directory to path for local imports
18
+ current_dir = os.path.dirname(os.path.abspath(__file__))
19
+ sys.path.append(current_dir)
20
+
21
+ from encode import TrafficDataEncoder
22
+ from train_lstm import LSTMRegressor
23
+
24
+
25
+ class RoadPredictor:
26
+ """
27
+ Predictor for traffic speeds on specific roads and directions.
28
+
29
+ This class loads a trained model and encoder, then provides methods to predict
30
+ speeds for all sensors on a given road/direction at a specific time.
31
+ """
32
+
33
+ def __init__(self, model_path: str, encoder_path: str, device: str = "auto"):
34
+ """
35
+ Initialize the road predictor.
36
+
37
+ Args:
38
+ model_path: Path to trained model (.pt file)
39
+ encoder_path: Path to fitted encoder (.pkl file)
40
+ device: Device to use (auto, cpu, cuda, mps)
41
+ """
42
+ # Device selection
43
+ if device == "auto":
44
+ if torch.backends.mps.is_available():
45
+ self.device = torch.device("mps")
46
+ elif torch.cuda.is_available():
47
+ self.device = torch.device("cuda")
48
+ else:
49
+ self.device = torch.device("cpu")
50
+ else:
51
+ self.device = torch.device(device)
52
+
53
+ print(f"Using device: {self.device}")
54
+
55
+ # Load encoder
56
+ print(f"Loading encoder from {encoder_path}")
57
+ self.encoder = TrafficDataEncoder.load(encoder_path)
58
+
59
+ # Load model
60
+ print(f"Loading model from {model_path}")
61
+ model_state = torch.load(model_path, map_location=self.device)
62
+
63
+ # Infer model architecture from saved state
64
+ n_features = len(self.encoder.num_cols) + len(self.encoder.cat_cols)
65
+
66
+ # Infer hidden_size from first LSTM layer weights
67
+ first_layer_weight_shape = model_state['lstm.weight_ih_l0'].shape
68
+ hidden_size = first_layer_weight_shape[0] // 4
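+ # (PyTorch stacks the input/forget/cell/output gate weights in weight_ih_l0, so its first dimension is 4 * hidden_size)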
69
+
70
+ # Check if bidirectional
71
+ bidirectional = 'lstm.weight_ih_l0_reverse' in model_state
72
+
73
+ # Infer number of layers
74
+ layer_keys = [k for k in model_state.keys() if k.startswith('lstm.weight_ih_l')]
75
+ n_layers = len(set([k.split('_l')[1].split('_')[0] for k in layer_keys]))
76
+
77
+ print(f"Model architecture: hidden_size={hidden_size}, n_layers={n_layers}, bidirectional={bidirectional}")
78
+
79
+ # Create and load model
80
+ self.model = LSTMRegressor(
81
+ n_features=n_features,
82
+ hidden_size=hidden_size,
83
+ n_layers=n_layers,
84
+ dropout=0.3,
85
+ bidirectional=bidirectional
86
+ ).to(self.device)
87
+
88
+ self.model.load_state_dict(model_state)
89
+ self.model.eval()
90
+
91
+ print("Model and encoder loaded successfully")
92
+
93
+ def get_road_sensors(self, df: pd.DataFrame, road_name: str, direction: str) -> pd.DataFrame:
94
+ """
95
+ Get all sensors for a specific road and direction.
96
+
97
+ Args:
98
+ df: DataFrame with traffic data
99
+ road_name: Name of the road (e.g., "I 405")
100
+ direction: Direction (e.g., "North", "South", "East", "West")
101
+
102
+ Returns:
103
+ DataFrame with unique sensors for the road/direction
104
+ """
105
+ # Filter for the specific road and direction
106
+ road_data = df[(df['road_name'] == road_name) & (df['direction'] == direction)].copy()
107
+
108
+ if len(road_data) == 0:
109
+ raise ValueError(f"No data found for road '{road_name}' direction '{direction}'")
110
+
111
+ # Get unique sensors using the actual sensor_id from the data
112
+ sensors = road_data.groupby('sensor_id').agg({
113
+ 'Latitude': 'first',
114
+ 'Longitude': 'first',
115
+ 'road_name': 'first',
116
+ 'direction': 'first',
117
+ 'lanes': 'first'
118
+ }).reset_index()
119
+
120
+ print(f"Found {len(sensors)} sensors on {road_name} {direction}")
121
+ return sensors
122
+
123
+ def prepare_prediction_data(
124
+ self,
125
+ df: pd.DataFrame,
126
+ road_name: str,
127
+ direction: str,
128
+ target_time: datetime,
129
+ seq_len: int = 12
130
+ ) -> Tuple[pd.DataFrame, List[str]]:
131
+ """
132
+ Prepare data for prediction by getting historical sequences for each sensor.
133
+
134
+ Args:
135
+ df: DataFrame with traffic data
136
+ road_name: Name of the road
137
+ direction: Direction
138
+ target_time: Time to predict for
139
+ seq_len: Length of historical sequence needed
140
+
141
+ Returns:
142
+ Tuple of (prepared_data, sensor_ids)
143
+ """
144
+ # Get sensors for this road/direction
145
+ sensors = self.get_road_sensors(df, road_name, direction)
146
+
147
+ # Prepare data for each sensor
148
+ prepared_data = []
149
+ sensor_ids = []
150
+
151
+ for _, sensor in sensors.iterrows():
152
+ sensor_id = sensor['sensor_id']
153
+
154
+ # Get all data for this sensor
155
+ sensor_data = df[df['sensor_id'] == sensor_id].copy()
156
+ if len(sensor_data) == 0:
157
+ print(f"Warning: No data found for sensor {sensor_id}")
158
+ continue
159
+ sensor_data = sensor_data.sort_values('Time').reset_index(drop=True)
160
+
161
+ # Convert Time to datetime
162
+ sensor_data['Time'] = pd.to_datetime(sensor_data['Time'])
163
+
164
+ # Find the closest time to target_time
165
+ time_diffs = abs(sensor_data['Time'] - target_time)
166
+ if len(time_diffs) == 0:
167
+ print(f"Warning: No time data for sensor {sensor_id}")
168
+ continue
169
+ closest_idx = time_diffs.idxmin()
170
+
171
+ # Get historical sequence ending at closest time
172
+ start_idx = max(0, closest_idx - seq_len + 1)
173
+ end_idx = closest_idx + 1
174
+
175
+ if end_idx - start_idx < seq_len:
176
+ # Not enough historical data, skip this sensor
177
+ print(f"Warning: Not enough historical data for sensor {sensor_id} (need {seq_len}, have {end_idx - start_idx})")
178
+ continue
179
+
180
+ # Get the sequence
181
+ sequence_data = sensor_data.iloc[start_idx:end_idx].copy()
182
+
183
+ # Ensure we have exactly seq_len points
184
+ if len(sequence_data) > seq_len:
185
+ sequence_data = sequence_data.tail(seq_len)
186
+
187
+ # Verify we have the right number of points
188
+ if len(sequence_data) != seq_len:
189
+ print(f"Warning: Sequence length mismatch for sensor {sensor_id} (expected {seq_len}, got {len(sequence_data)})")
190
+ continue
191
+
192
+ # Add to prepared data
193
+ prepared_data.append(sequence_data)
194
+ sensor_ids.append(sensor_id)
195
+
196
+ if not prepared_data:
197
+ raise ValueError(f"No sensors with sufficient historical data for {road_name} {direction}")
198
+
199
+ # Combine all sensor data and ensure proper sorting
200
+ combined_data = pd.concat(prepared_data, ignore_index=True)
201
+
202
+ # Ensure the data is sorted by sensor_id and Time (required by encoder)
203
+ combined_data = combined_data.sort_values(['sensor_id', 'Time']).reset_index(drop=True)
204
+
205
+ # Add time features that the encoder expects
206
+ combined_data = self.encoder._add_time_features(combined_data)
207
+
208
+ print(f"Prepared data for {len(sensor_ids)} sensors")
209
+ print(f"Combined data shape: {combined_data.shape}")
210
+ print(f"Unique sensors in prepared data: {combined_data['sensor_id'].nunique()}")
211
+
212
+ return combined_data, sensor_ids
213
+
214
+ def predict_road_speeds(
215
+ self,
216
+ df: pd.DataFrame,
217
+ road_name: str,
218
+ direction: str,
219
+ target_time: datetime
220
+ ) -> pd.DataFrame:
221
+ """
222
+ Predict speeds for all sensors on a specific road and direction.
223
+
224
+ Args:
225
+ df: DataFrame with traffic data
226
+ road_name: Name of the road (e.g., "I 405")
227
+ direction: Direction (e.g., "North", "South", "East", "West")
228
+ target_time: Time to predict for
229
+
230
+ Returns:
231
+ DataFrame with predictions for each sensor
232
+ """
233
+ print(f"Predicting speeds for {road_name} {direction} at {target_time}")
234
+
235
+ # Prepare data
236
+ prepared_data, sensor_ids = self.prepare_prediction_data(
237
+ df, road_name, direction, target_time
238
+ )
239
+
240
+ # Instead of using the encoder's transform method, let's create sequences manually
241
+ # since we already have the exact sequences we want
242
+ print(f"Creating sequences manually from {len(prepared_data)} rows...")
243
+
244
+ # Group by sensor and create sequences
245
+ X_sequences = []
246
+ y_targets = []
247
+ sensor_mapping = []
248
+
249
+ for sensor_id in sensor_ids:
250
+ sensor_data = prepared_data[prepared_data['sensor_id'] == sensor_id].sort_values('Time')
251
+
252
+ if len(sensor_data) >= self.encoder.seq_len:
253
+ # Get the last seq_len points as input sequence
254
+ sequence_data = sensor_data.tail(self.encoder.seq_len)
255
+
256
+ # Prepare features
257
+ cat_features = self.encoder.ordinal_encoder.transform(sequence_data[self.encoder.cat_cols]).astype(np.float32)
258
+ num_features = self.encoder.scaler.transform(sequence_data[self.encoder.num_cols]).astype(np.float32)
259
+ features = np.concatenate([num_features, cat_features], axis=1)
260
+
261
+ X_sequences.append(features)
262
+
263
+ # For prediction, we don't have a target, so we'll use the last speed as placeholder
264
+ last_speed = sequence_data[self.encoder.target_col].iloc[-1]
265
+ y_targets.append([last_speed])
266
+
267
+ sensor_mapping.append(sensor_id)
268
+
269
+ if not X_sequences:
270
+ raise ValueError("No valid sequences found for prediction")
271
+
272
+ X = np.stack(X_sequences, axis=0)
273
+ y = np.array(y_targets)
274
+
275
+ print(f"Created {len(X_sequences)} sequences with shape {X.shape}")
276
+
277
+ # Make predictions for each sequence
278
+ predictions = []
279
+ sensor_info = []
280
+
281
+ for i, sensor_id in enumerate(sensor_mapping):
282
+ # Get the sequence for this sensor
283
+ sensor_sequence = X[i:i+1] # Keep batch dimension
284
+
285
+ # Make prediction
286
+ with torch.no_grad():
287
+ sensor_sequence_tensor = torch.from_numpy(sensor_sequence).float().to(self.device)
288
+ prediction = self.model(sensor_sequence_tensor).cpu().numpy()[0, 0]
289
+
290
+ predictions.append(prediction)
291
+
292
+ # Get sensor info
293
+ sensor_data = prepared_data[prepared_data['sensor_id'] == sensor_id].iloc[0]
294
+
295
+ # Get real speed from the most recent data point
296
+ real_speed = sensor_data.get('speed_mph', None)
297
+ if real_speed is None and 'speed' in sensor_data:
298
+ real_speed = sensor_data['speed']
299
+ elif real_speed is None and 'Speed' in sensor_data:
300
+ real_speed = sensor_data['Speed']
301
+
302
+ sensor_info.append({
303
+ 'sensor_id': sensor_id,
304
+ 'Latitude': sensor_data['Latitude'],
305
+ 'Longitude': sensor_data['Longitude'],
306
+ 'road_name': sensor_data['road_name'],
307
+ 'direction': sensor_data['direction'],
308
+ 'lanes': sensor_data['lanes'],
309
+ 'predicted_speed': prediction,
310
+ 'real_speed': real_speed,
311
+ 'target_time': target_time
312
+ })
313
+
314
+ # Create results DataFrame
315
+ results_df = pd.DataFrame(sensor_info)
316
+
317
+ print(f"Generated predictions for {len(results_df)} sensors")
318
+ print(f"Predicted speed range: {results_df['predicted_speed'].min():.1f} - {results_df['predicted_speed'].max():.1f} mph")
319
+
320
+ # Print real speed statistics if available
321
+ real_speeds = results_df['real_speed'].dropna()
322
+ if len(real_speeds) > 0:
323
+ print(f"Real speed range: {real_speeds.min():.1f} - {real_speeds.max():.1f} mph")
324
+ print(f"Real speed available for {len(real_speeds)}/{len(results_df)} sensors")
325
+ else:
326
+ print("No real speed data available")
327
+
328
+ return results_df
329
+
330
+ def predict_multiple_times(
331
+ self,
332
+ df: pd.DataFrame,
333
+ road_name: str,
334
+ direction: str,
335
+ target_times: List[datetime]
336
+ ) -> pd.DataFrame:
337
+ """
338
+ Predict speeds for multiple time points.
339
+
340
+ Args:
341
+ df: DataFrame with traffic data
342
+ road_name: Name of the road
343
+ direction: Direction
344
+ target_times: List of times to predict for
345
+
346
+ Returns:
347
+ DataFrame with predictions for all sensors at all times
348
+ """
349
+ all_predictions = []
350
+
351
+ for target_time in target_times:
352
+ try:
353
+ predictions = self.predict_road_speeds(df, road_name, direction, target_time)
354
+ all_predictions.append(predictions)
355
+ except Exception as e:
356
+ print(f"Error predicting for {target_time}: {e}")
357
+ continue
358
+
359
+ if not all_predictions:
360
+ raise ValueError("No successful predictions generated")
361
+
362
+ # Combine all predictions
363
+ combined_df = pd.concat(all_predictions, ignore_index=True)
364
+
365
+ return combined_df
366
+
367
+
368
+ def predict_road_speeds(
369
+ df: pd.DataFrame,
370
+ road_name: str,
371
+ direction: str,
372
+ target_time: datetime,
373
+ model_path: str,
374
+ encoder_path: str,
375
+ device: str = "auto"
376
+ ) -> pd.DataFrame:
377
+ """
378
+ Convenience function to predict speeds for all sensors on a road.
379
+
380
+ Args:
381
+ df: DataFrame with traffic data
382
+ road_name: Name of the road (e.g., "I 405")
383
+ direction: Direction (e.g., "North", "South", "East", "West")
384
+ target_time: Time to predict for
385
+ model_path: Path to trained model (.pt file)
386
+ encoder_path: Path to fitted encoder (.pkl file)
387
+ device: Device to use (auto, cpu, cuda, mps)
388
+
389
+ Returns:
390
+ DataFrame with predictions for each sensor
391
+ """
392
+ predictor = RoadPredictor(model_path, encoder_path, device)
393
+ return predictor.predict_road_speeds(df, road_name, direction, target_time)
394
+
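+ # Programmatic example (illustrative CSV path and timestamp; final_lstm.pt / final_encoder.pkl ship with this repo):
+ #   df = pd.read_csv("unified_data.csv")
+ #   preds = predict_road_speeds(df, "I 405", "North", datetime(2025, 1, 15, 8, 0),
+ #                               "final_lstm.pt", "final_encoder.pkl")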
395
+
396
+ def main():
397
+ """Example usage of the road predictor."""
398
+ import argparse
399
+
400
+ parser = argparse.ArgumentParser(description="Predict traffic speeds for a specific road and direction")
401
+ parser.add_argument("--csv", required=True, help="Path to CSV file with traffic data")
402
+ parser.add_argument("--model", required=True, help="Path to trained model (.pt file)")
403
+ parser.add_argument("--encoder", required=True, help="Path to fitted encoder (.pkl file)")
404
+ parser.add_argument("--road", required=True, help="Road name (e.g., 'I 405')")
405
+ parser.add_argument("--direction", required=True, help="Direction (e.g., 'North', 'South', 'East', 'West')")
406
+ parser.add_argument("--time", required=True, help="Target time (YYYY-MM-DD HH:MM:SS)")
407
+ parser.add_argument("--output", help="Path to save predictions CSV")
408
+
409
+ args = parser.parse_args()
410
+
411
+ # Load data
412
+ print(f"Loading data from {args.csv}")
413
+ df = pd.read_csv(args.csv)
414
+
415
+ # Parse target time
416
+ target_time = datetime.strptime(args.time, "%Y-%m-%d %H:%M:%S")
417
+
418
+ # Make predictions
419
+ predictions = predict_road_speeds(
420
+ df, args.road, args.direction, target_time,
421
+ args.model, args.encoder
422
+ )
423
+
424
+ # Print results
425
+ print(f"\nPredictions for {args.road} {args.direction} at {target_time}:")
426
+ print("=" * 60)
427
+ for _, row in predictions.iterrows():
428
+ print(f"Sensor {row['sensor_id']}: {row['predicted_speed']:.1f} mph")
429
+
430
+ # Save if requested
431
+ if args.output:
432
+ predictions.to_csv(args.output, index=False)
433
+ print(f"\nPredictions saved to {args.output}")
434
+
435
+
436
+ if __name__ == "__main__":
437
+ main()
model_v3/train_lstm.py ADDED
@@ -0,0 +1,498 @@
1
+ """
2
+ train_lstm.py – LSTM model training for traffic flow prediction
3
+
4
+ Features:
5
+ - LSTM model with configurable architecture
6
+ - Weighted loss for handling speed class imbalance
7
+ - Huber loss option (found to work better than plain MSE in our experiments)
8
+ - CLI interface for hyperparameter tuning
9
+ - Model and encoder saving
10
+ - Chronological train/val/test splits
11
+ """
12
+
13
+ import argparse
14
+ import numpy as np
15
+ import pandas as pd
16
+ import torch
17
+ import torch.nn as nn
18
+ from torch.utils.data import DataLoader, TensorDataset
19
+ from pathlib import Path
20
+ import joblib
21
+ from typing import Dict, Tuple, Optional
22
+
23
+ from encode import TrafficDataEncoder
24
+
25
+
26
+ # Device selection
27
+ if torch.backends.mps.is_available():
28
+ DEVICE = torch.device("mps")
29
+ elif torch.cuda.is_available():
30
+ DEVICE = torch.device("cuda")
31
+ else:
32
+ DEVICE = torch.device("cpu")
33
+
34
+ print(f"Using device: {DEVICE}")
35
+
36
+
37
+ class LSTMRegressor(nn.Module):
38
+ """LSTM model for traffic speed prediction."""
39
+
40
+ def __init__(
41
+ self,
42
+ n_features: int,
43
+ hidden_size: int = 128,
44
+ n_layers: int = 2,
45
+ dropout: float = 0.3,
46
+ bidirectional: bool = False
47
+ ):
48
+ super().__init__()
49
+
50
+ self.hidden_size = hidden_size
51
+ self.n_layers = n_layers
52
+ self.bidirectional = bidirectional
53
+
54
+ # LSTM layer
55
+ self.lstm = nn.LSTM(
56
+ input_size=n_features,
57
+ hidden_size=hidden_size,
58
+ num_layers=n_layers,
59
+ batch_first=True,
60
+ dropout=dropout if n_layers > 1 else 0,
61
+ bidirectional=bidirectional
62
+ )
63
+
64
+ # Output layer
65
+ lstm_output_size = hidden_size * (2 if bidirectional else 1)
66
+ self.head = nn.Sequential(
67
+ nn.Linear(lstm_output_size, hidden_size // 2),
68
+ nn.ReLU(),
69
+ nn.Dropout(dropout),
70
+ nn.Linear(hidden_size // 2, 1)
71
+ )
72
+
73
+ def forward(self, x):
74
+ """Forward pass through the LSTM."""
75
+ # LSTM forward pass
76
+ lstm_out, _ = self.lstm(x)
77
+
78
+ # Use the last timestep output
79
+ last_output = lstm_out[:, -1, :]
80
+
81
+ # Final prediction
82
+ prediction = self.head(last_output)
83
+ return prediction
84
+
85
+
86
+ class WeightedHuberLoss(nn.Module):
87
+ """Weighted Huber loss for handling speed class imbalance."""
88
+
89
+ def __init__(self, weight_dict: Dict[str, float], delta: float = 1.0, boost_low: float = 1.0):
90
+ super().__init__()
91
+ self.delta = delta
92
+ self.weight_low = weight_dict["weight_low"] * boost_low # Additional boost for low speeds
93
+ self.weight_medium = weight_dict["weight_medium"]
94
+ self.weight_high = weight_dict["weight_high"]
95
+ self.low_threshold = weight_dict["low_threshold"]
96
+ self.high_threshold = weight_dict["high_threshold"]
97
+
98
+ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
99
+ """Compute weighted Huber loss."""
100
+ # Ensure target is 1D
101
+ if target.dim() > 1:
102
+ target = target.squeeze()
103
+ if pred.dim() > 1:
104
+ pred = pred.squeeze()
105
+
106
+ # Compute Huber loss
107
+ diff = torch.abs(pred - target)
108
+ huber_loss = torch.where(
109
+ diff <= self.delta,
110
+ 0.5 * diff ** 2,
111
+ self.delta * (diff - 0.5 * self.delta)
112
+ )
113
+
114
+ # Compute weights based on speed classes
115
+ weights = torch.ones_like(target)
116
+ low_mask = target <= self.low_threshold
117
+ high_mask = target >= self.high_threshold
118
+ medium_mask = ~(low_mask | high_mask)
119
+
120
+ weights[low_mask] = self.weight_low
121
+ weights[medium_mask] = self.weight_medium
122
+ weights[high_mask] = self.weight_high
123
+
124
+ # Apply weights
125
+ weighted_loss = huber_loss * weights
126
+ return weighted_loss.mean()
127
+
128
+
129
+ class FocalHuberLoss(nn.Module):
130
+ """Focal loss variant for Huber loss to focus on hard examples."""
131
+
132
+ def __init__(self, weight_dict: Dict[str, float], delta: float = 1.0, alpha: float = 2.0, gamma: float = 2.0):
133
+ super().__init__()
134
+ self.delta = delta
135
+ self.alpha = alpha
136
+ self.gamma = gamma
137
+ self.weight_low = weight_dict["weight_low"]
138
+ self.weight_medium = weight_dict["weight_medium"]
139
+ self.weight_high = weight_dict["weight_high"]
140
+ self.low_threshold = weight_dict["low_threshold"]
141
+ self.high_threshold = weight_dict["high_threshold"]
142
+
143
+ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
144
+ """Compute focal Huber loss."""
145
+ if target.dim() > 1:
146
+ target = target.squeeze()
147
+ if pred.dim() > 1:
148
+ pred = pred.squeeze()
149
+
150
+ # Compute Huber loss
151
+ diff = torch.abs(pred - target)
152
+ huber_loss = torch.where(
153
+ diff <= self.delta,
154
+ 0.5 * diff ** 2,
155
+ self.delta * (diff - 0.5 * self.delta)
156
+ )
157
+
158
+ # Compute focal weights (higher loss = harder example)
159
+ focal_weights = self.alpha * (huber_loss ** self.gamma)
160
+
161
+ # Apply class weights
162
+ class_weights = torch.ones_like(target)
163
+ low_mask = target <= self.low_threshold
164
+ high_mask = target >= self.high_threshold
165
+ medium_mask = ~(low_mask | high_mask)
166
+
167
+ class_weights[low_mask] = self.weight_low
168
+ class_weights[medium_mask] = self.weight_medium
169
+ class_weights[high_mask] = self.weight_high
170
+
171
+ # Combine focal and class weights
172
+ total_weights = focal_weights * class_weights
173
+ weighted_loss = huber_loss * total_weights
174
+
175
+ return weighted_loss.mean()
176
+
177
+
178
+ def create_data_loaders(
179
+ X: np.ndarray,
180
+ y: np.ndarray,
181
+ timestamps: np.ndarray,
182
+ batch_size: int,
183
+ train_ratio: float = 0.7,
184
+ val_ratio: float = 0.15
185
+ ) -> Tuple[DataLoader, DataLoader, DataLoader, np.ndarray]:
186
+ """
187
+ Create chronological train/validation/test data loaders.
188
+
189
+ Args:
190
+ X: Input sequences (N, seq_len, n_features)
191
+ y: Target values (N, horizon)
192
+ timestamps: Timestamps for each sample
193
+ batch_size: Batch size for data loaders
194
+ train_ratio: Fraction of data for training
195
+ val_ratio: Fraction of data for validation
196
+
197
+ Returns:
198
+ train_loader, val_loader, test_loader, test_indices
199
+ """
200
+ # Sort by timestamp to ensure chronological order
201
+ sorted_indices = np.argsort(timestamps)
202
+ X_sorted = X[sorted_indices]
203
+ y_sorted = y[sorted_indices]
204
+
205
+ # Calculate split points
206
+ n_total = len(X_sorted)
207
+ n_train = int(n_total * train_ratio)
208
+ n_val = int(n_total * val_ratio)
209
+
210
+ # Split indices
211
+ train_indices = sorted_indices[:n_train]
212
+ val_indices = sorted_indices[n_train:n_train + n_val]
213
+ test_indices = sorted_indices[n_train + n_val:]
214
+
215
+ # Convert timestamps to datetime for date range display
216
+ timestamps_dt = pd.to_datetime(timestamps)
217
+
218
+ print(f"Data split:")
219
+ print(f" Train: {len(train_indices):,} samples ({train_ratio*100:.0f}%)")
220
+ if len(train_indices) > 0:
221
+ train_dates = timestamps_dt[train_indices]
222
+ print(f" Date range: {train_dates.min()} to {train_dates.max()}")
223
+
224
+ print(f" Val: {len(val_indices):,} samples ({val_ratio*100:.0f}%)")
225
+ if len(val_indices) > 0:
226
+ val_dates = timestamps_dt[val_indices]
227
+ print(f" Date range: {val_dates.min()} to {val_dates.max()}")
228
+
229
+ print(f" Test: {len(test_indices):,} samples ({(1-train_ratio-val_ratio)*100:.0f}%)")
230
+ if len(test_indices) > 0:
231
+ test_dates = timestamps_dt[test_indices]
232
+ print(f" Date range: {test_dates.min()} to {test_dates.max()}")
233
+
234
+ # Create data loaders
235
+ def create_loader(indices, shuffle=False):
236
+ X_subset = torch.from_numpy(X[indices]).float()
237
+ y_subset = torch.from_numpy(y[indices]).float()
238
+ dataset = TensorDataset(X_subset, y_subset)
239
+ return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
240
+
241
+ train_loader = create_loader(train_indices, shuffle=True)
242
+ val_loader = create_loader(val_indices, shuffle=False)
243
+ test_loader = create_loader(test_indices, shuffle=False)
244
+
245
+ return train_loader, val_loader, test_loader, test_indices
246
+
247
+
248
+ def train_epoch(
249
+ model: LSTMRegressor,
250
+ train_loader: DataLoader,
251
+ optimizer: torch.optim.Optimizer,
252
+ loss_fn: nn.Module,
253
+ device: torch.device
254
+ ) -> float:
255
+ """Train the model for one epoch."""
256
+ model.train()
257
+ total_loss = 0.0
258
+ num_batches = 0
259
+
260
+ for batch_X, batch_y in train_loader:
261
+ batch_X = batch_X.to(device)
262
+ batch_y = batch_y.to(device)
263
+
264
+ # Forward pass
265
+ optimizer.zero_grad()
266
+ predictions = model(batch_X)
267
+ loss = loss_fn(predictions, batch_y)
268
+
269
+ # Backward pass
270
+ loss.backward()
271
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
272
+ optimizer.step()
273
+
274
+ total_loss += loss.item()
275
+ num_batches += 1
276
+
277
+ return total_loss / num_batches
278
+
279
+
280
+ def evaluate(
281
+ model: LSTMRegressor,
282
+ data_loader: DataLoader,
283
+ loss_fn: nn.Module,
284
+ device: torch.device
285
+ ) -> float:
286
+ """Evaluate the model on a dataset."""
287
+ model.eval()
288
+ total_loss = 0.0
289
+ num_batches = 0
290
+
291
+ with torch.no_grad():
292
+ for batch_X, batch_y in data_loader:
293
+ batch_X = batch_X.to(device)
294
+ batch_y = batch_y.to(device)
295
+
296
+ predictions = model(batch_X)
297
+ loss = loss_fn(predictions, batch_y)
298
+
299
+ total_loss += loss.item()
300
+ num_batches += 1
301
+
302
+ return total_loss / num_batches
303
+
304
+
305
+ def main():
306
+ """Main training function."""
307
+ parser = argparse.ArgumentParser(description="Train LSTM model for traffic prediction")
308
+
309
+ # Data parameters
310
+ parser.add_argument("--csv", required=True, help="Path to CSV file with traffic data")
311
+ parser.add_argument("--seq_len", type=int, default=12, help="Sequence length (default: 12)")
312
+ parser.add_argument("--horizon", type=int, default=1, help="Prediction horizon (default: 1)")
313
+ parser.add_argument("--target_col", default="speed_mph", help="Target column name")
314
+
315
+ # Model parameters
316
+ parser.add_argument("--hidden_size", type=int, default=128, help="LSTM hidden size")
317
+ parser.add_argument("--n_layers", type=int, default=2, help="Number of LSTM layers")
318
+ parser.add_argument("--dropout", type=float, default=0.3, help="Dropout rate")
319
+ parser.add_argument("--bidirectional", action="store_true", help="Use bidirectional LSTM")
320
+
321
+ # Training parameters
322
+ parser.add_argument("--epochs", type=int, default=50, help="Number of training epochs")
323
+ parser.add_argument("--batch_size", type=int, default=256, help="Batch size")
324
+ parser.add_argument("--lr", type=float, default=1e-3, help="Learning rate")
325
+ parser.add_argument("--weight_decay", type=float, default=1e-5, help="Weight decay")
326
+
327
+ # Loss parameters
328
+ parser.add_argument("--loss_type", choices=["mse", "mae", "huber", "weighted_huber", "focal_huber"],
329
+ default="weighted_huber", help="Loss function type")
330
+ parser.add_argument("--huber_delta", type=float, default=1.0, help="Huber loss delta")
331
+ parser.add_argument("--boost_low", type=float, default=1.0, help="Additional boost for low-speed loss (weighted_huber only)")
332
+ parser.add_argument("--focal_alpha", type=float, default=2.0, help="Focal loss alpha parameter")
333
+ parser.add_argument("--focal_gamma", type=float, default=2.0, help="Focal loss gamma parameter")
334
+
335
+ # Data split parameters
336
+ parser.add_argument("--train_ratio", type=float, default=0.7, help="Training data ratio")
337
+ parser.add_argument("--val_ratio", type=float, default=0.15, help="Validation data ratio")
338
+
339
+ # Output parameters
340
+ parser.add_argument("--model_out", help="Path to save the best model")
341
+ parser.add_argument("--encoder_out", help="Path to save the fitted encoder")
342
+ parser.add_argument("--pred_csv", help="Path to save test predictions")
343
+ parser.add_argument("--log_file", help="Path to save training log")
344
+
345
+ args = parser.parse_args()
346
+
347
+ # Load and encode data
348
+ print("Loading data...")
349
+ df = pd.read_csv(args.csv)
350
+ print(f"Loaded {len(df):,} rows from {args.csv}")
351
+
352
+ # Create encoder
353
+ encoder = TrafficDataEncoder(
354
+ seq_len=args.seq_len,
355
+ horizon=args.horizon,
356
+ target_col=args.target_col
357
+ )
358
+
359
+ # Fit encoder and transform data
360
+ print("Encoding data...")
361
+ X, y, target_indices, timestamps = encoder.fit_transform(df)
362
+ print(f"Encoded data shapes: X={X.shape}, y={y.shape}")
363
+
364
+ # Save encoder if requested
365
+ if args.encoder_out:
366
+ encoder.save(args.encoder_out)
367
+
368
+ # Create data loaders
369
+ print("Creating data loaders...")
370
+ train_loader, val_loader, test_loader, test_indices = create_data_loaders(
371
+ X, y, timestamps, args.batch_size, args.train_ratio, args.val_ratio
372
+ )
373
+
374
+ # Initialize model
375
+ print("Initializing model...")
376
+ model = LSTMRegressor(
377
+ n_features=X.shape[2],
378
+ hidden_size=args.hidden_size,
379
+ n_layers=args.n_layers,
380
+ dropout=args.dropout,
381
+ bidirectional=args.bidirectional
382
+ ).to(DEVICE)
383
+
384
+ print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
385
+
386
+ # Initialize optimizer
387
+ optimizer = torch.optim.Adam(
388
+ model.parameters(),
389
+ lr=args.lr,
390
+ weight_decay=args.weight_decay
391
+ )
392
+
393
+ # Initialize loss function
394
+ if args.loss_type == "weighted_huber":
395
+ # Get speed weights from encoder
396
+ weight_dict = encoder.get_speed_weights(y.flatten())
397
+ loss_fn = WeightedHuberLoss(weight_dict, args.huber_delta, args.boost_low)
398
+ print(f"Using weighted Huber loss with low-speed boost: {args.boost_low}")
399
+ elif args.loss_type == "focal_huber":
400
+ # Get speed weights from encoder
401
+ weight_dict = encoder.get_speed_weights(y.flatten())
402
+ loss_fn = FocalHuberLoss(weight_dict, args.huber_delta, args.focal_alpha, args.focal_gamma)
403
+ print(f"Using focal Huber loss (alpha={args.focal_alpha}, gamma={args.focal_gamma})")
404
+ elif args.loss_type == "huber":
405
+ loss_fn = nn.SmoothL1Loss(beta=args.huber_delta)
406
+ print("Using Huber loss")
407
+ elif args.loss_type == "mae":
408
+ loss_fn = nn.L1Loss()
409
+ print("Using MAE loss")
410
+ else: # mse
411
+ loss_fn = nn.MSELoss()
412
+ print("Using MSE loss")
413
+
414
+ # Training loop
415
+ print("Starting training...")
416
+ best_val_loss = float('inf')
417
+ best_model_state = None
418
+ train_losses = []
419
+ val_losses = []
420
+
421
+ for epoch in range(1, args.epochs + 1):
422
+ # Train
423
+ train_loss = train_epoch(model, train_loader, optimizer, loss_fn, DEVICE)
424
+
425
+ # Validate
426
+ val_loss = evaluate(model, val_loader, loss_fn, DEVICE)
427
+
428
+ train_losses.append(train_loss)
429
+ val_losses.append(val_loss)
430
+
431
+ print(f"Epoch {epoch:3d}/{args.epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")
432
+
433
+ # Save best model
434
+ if val_loss < best_val_loss:
435
+ best_val_loss = val_loss
436
+ best_model_state = model.state_dict().copy()
437
+ print(f" -> New best validation loss: {best_val_loss:.4f}")
438
+
439
+ # Load best model and evaluate on test set
440
+ print("\nEvaluating on test set...")
441
+ model.load_state_dict(best_model_state)
442
+ test_loss = evaluate(model, test_loader, loss_fn, DEVICE)
443
+ print(f"Test Loss: {test_loss:.4f}")
444
+
445
+ # Save best model
446
+ if args.model_out:
447
+ torch.save(best_model_state, args.model_out)
448
+ print(f"Best model saved to {args.model_out}")
449
+
450
+ # Save predictions if requested
451
+ if args.pred_csv:
452
+ print("Generating test predictions...")
453
+ model.eval()
454
+ predictions = []
455
+ targets = []
456
+
457
+ with torch.no_grad():
458
+ for batch_X, batch_y in test_loader:
459
+ batch_X = batch_X.to(DEVICE)
460
+ batch_pred = model(batch_X).cpu().numpy()
461
+ predictions.append(batch_pred)
462
+ targets.append(batch_y.numpy())
463
+
464
+ predictions = np.concatenate(predictions, axis=0)
465
+ targets = np.concatenate(targets, axis=0)
466
+
467
+ # Create prediction DataFrame
468
+ pred_df = pd.DataFrame({
469
+ 'prediction': predictions.flatten(),
470
+ 'target': targets.flatten(),
471
+ 'error': predictions.flatten() - targets.flatten(),
472
+ 'abs_error': np.abs(predictions.flatten() - targets.flatten())
473
+ })
474
+
475
+ pred_df.to_csv(args.pred_csv, index=False)
476
+ print(f"Predictions saved to {args.pred_csv}")
477
+
478
+ # Print some statistics
479
+ mae = pred_df['abs_error'].mean()
480
+ rmse = np.sqrt((pred_df['error'] ** 2).mean())
481
+ print(f"Test MAE: {mae:.4f}")
482
+ print(f"Test RMSE: {rmse:.4f}")
483
+
484
+ # Save training log if requested
485
+ if args.log_file:
486
+ log_df = pd.DataFrame({
487
+ 'epoch': range(1, len(train_losses) + 1),
488
+ 'train_loss': train_losses,
489
+ 'val_loss': val_losses
490
+ })
491
+ log_df.to_csv(args.log_file, index=False)
492
+ print(f"Training log saved to {args.log_file}")
493
+
494
+
495
+ if __name__ == "__main__":
496
+ main()
497
+
498
+
requirements.txt ADDED
@@ -0,0 +1,33 @@
1
+ # Core Data Science
2
+ numpy==2.3.1
3
+ pandas==2.3.0
4
+ scikit-learn==1.7.2
5
+ scipy==1.16.0
6
+
7
+ # Deep Learning
8
+ torch
9
+
10
+ # Geographic Processing
11
+ geopandas==1.1.1
12
+ osmnx==2.0.4
13
+ folium==0.20.0
14
+ shapely==2.1.1
15
+ geopy==2.4.1
16
+
17
+ # Web Scraping (for data collection)
18
+ selenium
19
+
20
+ # Web Interface
21
+ streamlit==1.46.1
22
+ streamlit_folium==0.25.0
23
+
24
+ # Utilities
25
+ joblib==1.5.1
26
+ requests==2.32.4
27
+ openpyxl==3.1.5
28
+
29
+ # Visualization (for evaluation)
30
+ matplotlib
31
+
32
+ # Hugging Face
33
+ altair
roadmap/RoadMap.py ADDED
@@ -0,0 +1,513 @@
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import geopandas as gpd
5
+ import osmnx as ox
6
+ import folium
7
+ from shapely.geometry import LineString, MultiLineString, Point
8
+ from shapely import ops
9
+ from sklearn.neighbors import BallTree
10
+ from typing import Tuple, List
11
+ from .utils import get_coordinates_from_network, sort_gps_by_greedy_path, add_weather_to_df
12
+ from .mock_predictor import MockTrafficPredictor
13
+ from geopy.distance import geodesic
14
+ import re
15
+ import math
16
+ from datetime import datetime
17
+ import sys
18
+ import os
19
+ # Add parent directory to path to import model_v3
20
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
21
+ from model_v3.predict_road import RoadPredictor
22
+
23
+ # Define model paths relative to the project root
24
+ PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
25
+ MODEL_PATH = os.path.join(PROJECT_ROOT, "model_v3", "final_lstm.pt")
26
+ ENCODER_PATH = os.path.join(PROJECT_ROOT, "model_v3", "final_encoder.pkl")
27
+
28
+ # Validate that model files exist
29
+ if not os.path.exists(MODEL_PATH):
30
+ raise FileNotFoundError(f"Model file not found at: {MODEL_PATH}")
31
+ if not os.path.exists(ENCODER_PATH):
32
+ raise FileNotFoundError(f"Encoder file not found at: {ENCODER_PATH}")
33
+
34
+ DIST_THRESHOLD_METERS_MAX = 1200 #2000
35
+ DIST_THRESHOLD_METERS_MIN = 10 #10
36
+
37
+
38
+ class RoadMapManager:
39
+
40
+ def __init__(self, city: str,bbox: Tuple[float,float,float,float], base_data_dir: str = "data"):
41
+ self.city = city
42
+ self.bbox = bbox
43
+ self.base_data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", base_data_dir))
44
+ self.city_path = os.path.join(self.base_data_dir, self.city)
45
+ self.coordinates_path = os.path.join(self.city_path, 'coordinates')
46
+ self.roads_path = os.path.join(self.city_path, 'roads')
47
+ self.road_network_path = os.path.join(self.city_path, 'maps')
48
+ self.visualizations_path = os.path.join(self.city_path, 'visualizations')
49
+
50
+ self.roads = dict()
51
+
52
+ self._validate_structure()
53
+ self._load_road_network(self.bbox)
54
+
55
+
56
+ def _validate_structure(self):
57
+ for path in [self.coordinates_path, self.roads_path, self.road_network_path, self.visualizations_path]:
58
+ os.makedirs(path, exist_ok=True)
59
+
60
+ @staticmethod
61
+ def split_road_name_direction(road_name: str) -> Tuple[str, str]:
62
+ parts = road_name.split()
63
+ return " ".join(parts[:-1]), parts[-1]
64
+
65
+ def set_roads(self, roads: List[str]):
66
+ os.makedirs(self.coordinates_path, exist_ok=True)
67
+
68
+ for road in roads:
69
+ road_name, direction = self.split_road_name_direction(road)
70
+ file_name = f"{road_name} {direction}.csv"
71
+ file_path = os.path.join(self.coordinates_path,file_name)
72
+
73
+ if os.path.exists(file_path):
74
+ print(f"DataFrame for {road_name} - {direction} already exists")
75
+ df = pd.read_csv(file_path)
76
+ else:
77
+ print(f"Downloading DataFrame for {road_name} - {direction}")
78
+ df = get_coordinates_from_network(self.road_network, road_name, direction)
79
+ df.to_csv(file_path, index=False)
80
+
81
+ self.roads[(road_name,direction)] = df
82
+
83
+ def get_roads(self):
84
+ """
85
+ Used for testing
86
+ """
87
+ for (road_name, direction), df in self.roads.items():
88
+ print(f"road name: {road_name} - {direction}")
89
+ print(df.head())
90
+ print("\n" + "-"*40 + "\n")
91
+
92
+ def _load_road_network(self, bbox: Tuple[float,float,float,float]):
93
+ network_filename = f"{self.city.replace(' ', '_')}_network.graphml"
94
+ network_path = os.path.join(self.road_network_path, network_filename)
95
+
96
+ if os.path.exists(network_path):
97
+ print("map already exists")
98
+ self.road_network = ox.load_graphml(network_path)
99
+ else:
100
+ print("Downloading map")
101
+ self.road_network = ox.graph_from_bbox(
102
+ bbox=bbox,
103
+ network_type='drive'
104
+ )
105
+
106
+ self.road_network = ox.bearing.add_edge_bearings(self.road_network)
107
+
108
+ ox.save_graphml(self.road_network, filepath=network_path)
109
+
110
+ def apply_prediction_data(self, predict_time: datetime | None = None):
111
+ """
112
+ Data required for prediction:
113
+ Weather - conditions for the target day and time, gathered per point
114
+ Speed limit - from the road network
115
+ Road name, Direction - key of the roads dict
116
+ Coordinates - from the coordinates DataFrame
117
+ Lanes - from the coordinates if given, otherwise from the road network
118
+ Time - supplied by the caller
119
+ """
120
+ predictions = {}
121
+ road_predictor = RoadPredictor(MODEL_PATH, ENCODER_PATH)
122
+ for (road_name, direction) in self.roads.keys():
123
+ road_under = road_name.replace(" ", "_")
124
+ df = pd.read_csv(os.path.join(self.roads_path, f"{road_under}_{direction.lower()}.csv.gz"), compression='gzip')
125
+ predictions[(road_name, direction)] = road_predictor.predict_road_speeds(df, road_name, direction, predict_time)
126
+
127
+ # Map predictions to road coordinates
128
+ self._map_predictions_to_roads(predictions)
129
+
130
+ for (road_name, direction), df in self.roads.items():
131
+ print(df.head())
132
+ df = sort_gps_by_greedy_path(df)
133
+ self.roads[(road_name, direction)] = df
134
+
135
+ def _map_predictions_to_roads(self, predictions: dict):
136
+ """
137
+ Map predicted speeds to the closest points in self.roads coordinates.
138
+
139
+ Args:
140
+ predictions: Dictionary with (road_name, direction) keys and prediction DataFrames as values
141
+ """
142
+ from sklearn.neighbors import BallTree
143
+
144
+ for (road_name, direction), road_df in self.roads.items():
145
+ if (road_name, direction) not in predictions:
146
+ print(f"No predictions found for {road_name} {direction}")
147
+ continue
148
+
149
+ pred_df = predictions[(road_name, direction)]
150
+
151
+ if pred_df.empty:
152
+ print(f"Empty predictions for {road_name} {direction}")
153
+ continue
154
+
155
+ # Extract coordinates from road data
156
+ road_coords = road_df[['Latitude', 'Longitude']].values
157
+
158
+ # Extract coordinates from predictions
159
+ pred_coords = pred_df[['Latitude', 'Longitude']].values
160
+
161
+ # Create BallTree for efficient nearest neighbor search
162
+ # Convert to radians for haversine distance
163
+ road_coords_rad = np.radians(road_coords)
164
+ pred_coords_rad = np.radians(pred_coords)
165
+
166
+ tree = BallTree(pred_coords_rad, metric='haversine')
167
+
168
+ # Find closest prediction for each road point
169
+ distances, indices = tree.query(road_coords_rad, k=1)
170
+
171
+ # Convert distances from radians to meters (approximate)
172
+ distances_meters = distances.flatten() * 6371000 # Earth radius in meters
173
+
174
+ # Get predicted speeds for closest points
175
+ closest_pred_speeds = pred_df.iloc[indices.flatten()]['predicted_speed'].values
176
+
177
+ # Get real speeds for closest points (if available)
178
+ if 'real_speed' in pred_df.columns:
179
+ closest_real_speeds = pred_df.iloc[indices.flatten()]['real_speed'].values
180
+ road_df['real_speed'] = closest_real_speeds
181
+ else:
182
+ road_df['real_speed'] = None
183
+
184
+ # Add predicted speeds to road DataFrame
185
+ road_df['predicted_speed'] = closest_pred_speeds
186
+ road_df['prediction_distance_m'] = distances_meters
187
+
188
+ # Use predicted speed as the main speed for visualization
189
+ road_df['speed'] = road_df['predicted_speed']
190
+
191
+ # Check for points that are too far from any prediction
192
+ max_distance_threshold = 1000 # 1km threshold
193
+ far_points = distances_meters > max_distance_threshold
194
+
195
+ if far_points.any():
196
+ print(f"Warning: {far_points.sum()} points in {road_name} {direction} are >{max_distance_threshold}m from predictions")
197
+ # For points too far, use a default speed or interpolate
198
+ road_df.loc[far_points, 'predicted_speed'] = road_df.loc[~far_points, 'predicted_speed'].mean()
199
+ road_df.loc[far_points, 'speed'] = road_df.loc[~far_points, 'speed'].mean()
200
+
201
+ print(f"Mapped predictions for {road_name} {direction}: "
202
+ f"{len(road_df)} points, avg distance: {distances_meters.mean():.1f}m")
203
+
204
+ def get_prediction_statistics(self) -> dict:
205
+ """
206
+ Get statistics about the prediction mapping for all roads.
207
+
208
+ Returns:
209
+ Dictionary with statistics for each road
210
+ """
211
+ stats = {}
212
+
213
+ for (road_name, direction), road_df in self.roads.items():
214
+ if 'predicted_speed' not in road_df.columns:
215
+ continue
216
+
217
+ stats[(road_name, direction)] = {
218
+ 'total_points': len(road_df),
219
+ 'avg_predicted_speed': road_df['predicted_speed'].mean(),
220
+ 'min_predicted_speed': road_df['predicted_speed'].min(),
221
+ 'max_predicted_speed': road_df['predicted_speed'].max(),
222
+ 'avg_distance_to_prediction': road_df.get('prediction_distance_m', pd.Series([0])).mean(),
223
+ 'max_distance_to_prediction': road_df.get('prediction_distance_m', pd.Series([0])).max(),
224
+ 'points_with_predictions': road_df['predicted_speed'].notna().sum()
225
+ }
226
+
227
+ return stats
228
+
229
+ def print_prediction_summary(self):
230
+ """Print a summary of prediction statistics for all roads."""
231
+ stats = self.get_prediction_statistics()
232
+
233
+ if not stats:
234
+ print("No prediction statistics available. Run apply_prediction_data() first.")
235
+ return
236
+
237
+ print("\n" + "="*80)
238
+ print("PREDICTION MAPPING SUMMARY")
239
+ print("="*80)
240
+
241
+ for (road_name, direction), stat in stats.items():
242
+ print(f"\n{road_name} {direction}:")
243
+ print(f" Points: {stat['points_with_predictions']}/{stat['total_points']}")
244
+ print(f" Speed: {stat['avg_predicted_speed']:.1f} mph (range: {stat['min_predicted_speed']:.1f}-{stat['max_predicted_speed']:.1f})")
245
+ print(f" Avg distance to prediction: {stat['avg_distance_to_prediction']:.1f}m")
246
+ print(f" Max distance to prediction: {stat['max_distance_to_prediction']:.1f}m")
247
+
248
+
249
+
250
+ def draw_map(self):
251
+
252
+ def get_color(speed, max_speed):
253
+ if speed >= 0.85 * max_speed:
254
+ return '#00FF00' # Bright neon green
255
+ elif speed >= 0.55 * max_speed:
256
+ return '#FFA500' # Bright orange
257
+ else:
258
+ return '#FF0000' # Bright red
259
+
260
+ center_lon = (self.bbox[0] + self.bbox[2]) / 2
261
+ center_lat = (self.bbox[1] + self.bbox[3]) / 2
262
+
263
+ m = folium.Map(
264
+ location=[center_lat, center_lon],
265
+ zoom_start=13,
266
+ tiles='CartoDB dark_matter'
267
+ )
268
+
269
+ for (road_name, direction), df in self.roads.items():
270
+ for i in range(len(df) - 1):
271
+ lat1, lon1, speed1 = df.loc[i, ['Latitude', 'Longitude', 'speed']] # type: ignore
272
+ lat2, lon2, speed2 = df.loc[i+1, ['Latitude', 'Longitude', 'speed']] # type: ignore
273
+ raw_speed = df.loc[i, 'maxspeed']
274
+ match = re.search(r'\d+', str(raw_speed))
275
+ if match:
276
+ max_speed = float(match.group())
277
+ else:
278
+ max_speed = 60
279
+
280
+ dist = geodesic((lat1, lon1), (lat2, lon2)).meters
281
+ if dist > DIST_THRESHOLD_METERS_MAX or dist < DIST_THRESHOLD_METERS_MIN:
282
+ continue # Skip if too far or too close
283
+
284
+ avg_speed = (speed1 + speed2) / 2
285
+ color = get_color(avg_speed,max_speed)
286
+
287
+ folium.PolyLine(
288
+ locations=[(lat1, lon1), (lat2, lon2)],
289
+ color=color,
290
+ weight=1,
291
+ opacity=0.9
292
+ ).add_to(m)
293
+
294
+ output_path = os.path.join(self.visualizations_path, "sorted_path.html")
295
+ m.save(output_path)
296
+ print("Saved map with distance filtering to 'sorted_path.html'")
297
+
298
+
299
+
300
+
301
+ def draw_map_offset(self):
302
+
303
+ def get_color(speed, max_speed):
304
+ if speed >= 0.85 * max_speed:
305
+ return '#00FF00' # Neon green
306
+ elif speed >= 0.55 * max_speed:
307
+ return '#FFA500' # Bright orange
308
+ else:
309
+ return '#FF0000' # Bright red
310
+
311
+ def get_maxspeed(raw_speed):
312
+ match = re.search(r'\d+', str(raw_speed))
313
+ return float(match.group()) if match else 60
314
+
315
+ def apply_offset(lat, lon, bearing, direction):
316
+ """Offset lat/lon a little perpendicular to bearing, based on direction."""
317
+ offset_meters = -600 if direction.lower() in ["north", "east"] else 600
318
+
319
+ # Convert bearing to radians and rotate 90°
320
+ angle_rad = math.radians((bearing + 90) % 360)
321
+ delta_lat = offset_meters * math.cos(angle_rad) / 111111
322
+ delta_lon = offset_meters * math.sin(angle_rad) / (111111 * math.cos(math.radians(lat)))
323
+
324
+ return lat + delta_lat, lon + delta_lon
325
+
326
+ # Create dark base map
327
+ center_lon = (self.bbox[0] + self.bbox[2]) / 2
328
+ center_lat = (self.bbox[1] + self.bbox[3]) / 2
329
+
330
+ m = folium.Map(
331
+ location=[center_lat, center_lon],
332
+ zoom_start=13,
333
+ tiles='CartoDB dark_matter'
334
+ )
335
+
336
+ # Group by road name
337
+ road_groups = {}
338
+ for (road_name, direction), df in self.roads.items():
339
+ road_groups.setdefault(road_name, {})[direction] = df
340
+
341
+ for road_name, direction_map in road_groups.items():
342
+ for direction, df in direction_map.items():
343
+ for i in range(len(df) - 1):
344
+ lat1, lon1, speed1 = df.loc[i, ['Latitude', 'Longitude', 'speed']]
345
+ lat2, lon2, speed2 = df.loc[i + 1, ['Latitude', 'Longitude', 'speed']]
346
+ raw_speed = df.loc[i, 'maxspeed']
347
+ max_speed = get_maxspeed(raw_speed)
348
+ bearing = df.loc[i, 'bearing'] if 'bearing' in df.columns else 0
349
+
350
+ dist = geodesic((lat1, lon1), (lat2, lon2)).meters
351
+ if dist > DIST_THRESHOLD_METERS_MAX or dist < DIST_THRESHOLD_METERS_MIN:
352
+ continue
353
+
354
+ avg_speed = (speed1 + speed2) / 2
355
+ color = get_color(avg_speed, max_speed)
356
+
357
+ # Apply visual offset if road has both directions
358
+ has_opposite = len(direction_map) > 1
359
+ if has_opposite:
360
+ lat1, lon1 = apply_offset(lat1, lon1, bearing, direction)
361
+ lat2, lon2 = apply_offset(lat2, lon2, bearing, direction)
362
+
363
+ folium.PolyLine(
364
+ locations=[(lat1, lon1), (lat2, lon2)],
365
+ color=color,
366
+ weight=2,
367
+ opacity=0.95
368
+ ).add_to(m)
369
+
370
+ output_path = os.path.join(self.visualizations_path, "direction_offset_map.html")
371
+ m.save(output_path)
372
+ print("✅ Saved map with directional offsets to 'direction_offset_map.html'")
373
+ return m
374
+
375
+ def draw_map_with_real_speed(self):
376
+ """
377
+ Draw map using real speed data instead of predicted speed.
378
+ """
379
+ def get_color(speed, max_speed):
380
+ if speed >= 0.85 * max_speed:
381
+ return '#00FF00' # Neon green
382
+ elif speed >= 0.55 * max_speed:
383
+ return '#FFA500' # Bright orange
384
+ else:
385
+ return '#FF0000' # Bright red
386
+
387
+ def get_maxspeed(raw_speed):
388
+ match = re.search(r'\d+', str(raw_speed))
389
+ return float(match.group()) if match else 60
390
+
391
+ def apply_offset(lat, lon, bearing, direction):
392
+ """Offset lat/lon a little perpendicular to bearing, based on direction."""
393
+ offset_meters = -600 if direction.lower() in ["north", "east"] else 600
394
+
395
+ # Convert bearing to radians and rotate 90°
396
+ angle_rad = math.radians((bearing + 90) % 360)
397
+ delta_lat = offset_meters * math.cos(angle_rad) / 111111
398
+ delta_lon = offset_meters * math.sin(angle_rad) / (111111 * math.cos(math.radians(lat)))
399
+
400
+ return lat + delta_lat, lon + delta_lon
401
+
402
+ # Create dark base map
403
+ center_lon = (self.bbox[0] + self.bbox[2]) / 2
404
+ center_lat = (self.bbox[1] + self.bbox[3]) / 2
405
+
406
+ m = folium.Map(
407
+ location=[center_lat, center_lon],
408
+ zoom_start=13,
409
+ tiles='CartoDB dark_matter'
410
+ )
411
+
412
+ # Group by road name
413
+ road_groups = {}
414
+ for (road_name, direction), df in self.roads.items():
415
+ road_groups.setdefault(road_name, {})[direction] = df
416
+
417
+ for road_name, direction_map in road_groups.items():
418
+ for direction, df in direction_map.items():
419
+ for i in range(len(df) - 1):
420
+ lat1, lon1 = df.loc[i, ['Latitude', 'Longitude']]
421
+ lat2, lon2 = df.loc[i + 1, ['Latitude', 'Longitude']]
422
+
423
+ # Use real speed if available, otherwise fall back to predicted speed
424
+ if 'real_speed' in df.columns and pd.notna(df.loc[i, 'real_speed']):
425
+ speed1 = df.loc[i, 'real_speed']
426
+ speed2 = df.loc[i + 1, 'real_speed'] if i + 1 < len(df) and pd.notna(df.loc[i + 1, 'real_speed']) else speed1
427
+ else:
428
+ speed1 = df.loc[i, 'speed']
429
+ speed2 = df.loc[i + 1, 'speed']
430
+
431
+ raw_speed = df.loc[i, 'maxspeed']
432
+ max_speed = get_maxspeed(raw_speed)
433
+ bearing = df.loc[i, 'bearing'] if 'bearing' in df.columns else 0
434
+
435
+ dist = geodesic((lat1, lon1), (lat2, lon2)).meters
436
+ if dist > DIST_THRESHOLD_METERS_MAX or dist < DIST_THRESHOLD_METERS_MIN:
437
+ continue
438
+
439
+ avg_speed = (speed1 + speed2) / 2
440
+ color = get_color(avg_speed, max_speed)
441
+
442
+ # Apply visual offset if road has both directions
443
+ has_opposite = len(direction_map) > 1
444
+ if has_opposite:
445
+ lat1, lon1 = apply_offset(lat1, lon1, bearing, direction)
446
+ lat2, lon2 = apply_offset(lat2, lon2, bearing, direction)
447
+
448
+ folium.PolyLine(
449
+ locations=[(lat1, lon1), (lat2, lon2)],
450
+ color=color,
451
+ weight=2,
452
+ opacity=0.95
453
+ ).add_to(m)
454
+
455
+ output_path = os.path.join(self.visualizations_path, "real_speed_map.html")
456
+ m.save(output_path)
457
+ print("✅ Saved map with real speed data to 'real_speed_map.html'")
458
+ return m
459
+
460
+ def draw_side_by_side_maps(self):
461
+ """
462
+ Create side-by-side maps showing both predicted and real speeds.
463
+ Returns a tuple of (predicted_map, real_map) for use in Streamlit.
464
+ """
465
+ # Create predicted speed map
466
+ predicted_map = self.draw_map_offset()
467
+
468
+ # Create real speed map
469
+ real_map = self.draw_map_with_real_speed()
470
+
471
+ return predicted_map, real_map
472
+
473
+
474
+ """
475
+ mock_predictor = MockTrafficPredictor({
476
+ 'I 405 North': 'moderate',
477
+ 'I 405 South': 'free',
478
+ 'US 101 North': 'busy',
479
+ 'US 101 South': 'free',
480
+ 'I 5 North': 'busy',
481
+ 'I 5 South': 'free',
482
+ 'I 10 East': 'moderate',
483
+ 'I 10 West': 'moderate',
484
+ 'I 110 North': 'busy',
485
+ 'I 110 South': 'busy',
486
+ 'CA 110 North': 'busy',
487
+ 'CA 110 South': 'busy',
488
+ 'CA 170 North': 'moderate',
489
+ 'CA 170 South': 'free',
490
+ 'CA 118 East': 'free',
491
+ 'CA 118 West': 'free',
492
+ 'CA 134 East': 'moderate',
493
+ 'CA 134 West': 'free',
494
+ 'CA 2 North': 'moderate',
495
+ 'CA 2 South': 'moderate',
496
+ 'I 605 North': 'busy',
497
+ 'I 605': 'free',
498
+ 'I 210 East' : 'free',
499
+ 'I 210 West' : 'busy'
500
+ })
501
+
502
+ if predict_time is None:
503
+ predict_time = datetime.now()
504
+
505
+ for (road_name, direction), df in self.roads.items():
506
+ #self.roads[(road_name, direction)] = add_weather_to_df(self.roads[(road_name, direction)], time = predict_time)
507
+ print(f"Mocking for {road_name} - {direction}")
508
+ df = mock_predictor.predict(df)
509
+ print(df.head())
510
+ df = sort_gps_by_greedy_path(df)
511
+ self.roads[(road_name, direction)] = df
512
+
513
+ """
roadmap/__init__.py ADDED
File without changes
roadmap/mock_predictor.py ADDED
@@ -0,0 +1,43 @@
+ import pandas as pd
+ import numpy as np
+
+ class MockTrafficPredictor:
+     def __init__(self, road_classification_map: dict, seed: int = 42):
+         """
+         road_classification_map: dict mapping road ref plus direction (e.g., 'I 405 North')
+         to a classification ('busy', 'moderate', 'free')
+         """
+         valid_classes = {'busy', 'moderate', 'free'}
+         for cls in road_classification_map.values():
+             if cls not in valid_classes:
+                 raise ValueError(f"Invalid classification '{cls}', must be one of {valid_classes}")
+         self.road_classification_map = road_classification_map
+         self.random = np.random.default_rng(seed)
+
+         self.speed_range = {
+             'busy': (0.2, 0.5),
+             'moderate': (0.5, 0.8),
+             'free': (0.8, 1.0)
+         }
+
+     def predict(self, df: pd.DataFrame) -> pd.DataFrame:
+         df = df.copy()
+
+         if 'ref' not in df.columns:
+             raise ValueError("Input DataFrame must contain a 'ref' column with road names")
+
+         road_ref = df['ref'].iloc[0]
+         road_direction = df['direction'].iloc[0]
+         classification_name = road_ref + " " + road_direction
+         classification = self.road_classification_map.get(classification_name, 'moderate')
+         min_r, max_r = self.speed_range[classification]
+
+         # Parse the numeric part of maxspeed strings such as '65 mph'
+         df['maxspeed_numeric'] = df['maxspeed'].str.extract(r'(\d+)', expand=False).astype(float)
+
+         # Generate base values with slight spatial variation
+         base = self.random.uniform(min_r, max_r)
+         noise = self.random.normal(loc=0, scale=0.05, size=len(df))  # small Gaussian noise
+         raw_factors = np.clip(base + noise, min_r, max_r)
+
+         df['speed'] = df['maxspeed_numeric'] * raw_factors
+
+         return df.drop(columns=['maxspeed_numeric'])
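The predict method only needs 'ref', 'direction', and 'maxspeed' columns, so a minimal smoke test is easy to write. The sketch below is hypothetical: the toy coordinates and maxspeed strings are invented for illustration, and real inputs come from the road DataFrames built elsewhere in this commit.

    # Hypothetical smoke test for MockTrafficPredictor; the toy DataFrame is invented.
    import pandas as pd
    from roadmap.mock_predictor import MockTrafficPredictor

    df = pd.DataFrame({
        'Latitude': [34.05, 34.06, 34.07],
        'Longitude': [-118.25, -118.26, -118.27],
        'ref': ['I 405'] * 3,
        'direction': ['North'] * 3,
        'maxspeed': ['65 mph', '65 mph', '55 mph'],
    })

    predictor = MockTrafficPredictor({'I 405 North': 'busy'})
    out = predictor.predict(df)
    # For a 'busy' road, 'speed' is drawn from 0.2-0.5x of the parsed maxspeed.
    print(out[['ref', 'direction', 'maxspeed', 'speed']])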
roadmap/utils.py ADDED
@@ -0,0 +1,167 @@
+ from geopandas import GeoDataFrame
+ from networkx import MultiDiGraph
+ import pandas as pd
+ import numpy as np
+ import osmnx as ox
+ from shapely.geometry import LineString, MultiLineString
+ from sklearn.neighbors import BallTree
+ import requests
+ from sklearn.cluster import KMeans
+ from datetime import datetime, timedelta
+
+ def filter_by_direction(selected_road: GeoDataFrame, road_direction: str) -> GeoDataFrame:
+     if road_direction == 'North':
+         return selected_road[
+             (selected_road['bearing'] >= 270) | (selected_road['bearing'] <= 90)
+         ]
+     elif road_direction == 'South':
+         return selected_road[
+             (selected_road['bearing'] > 90) & (selected_road['bearing'] < 270)
+         ]
+     elif road_direction == 'East':
+         return selected_road[
+             (selected_road['bearing'] >= 0) & (selected_road['bearing'] <= 180)
+         ]
+     elif road_direction == 'West':
+         return selected_road[
+             (selected_road['bearing'] > 180) & (selected_road['bearing'] < 360)
+         ]
+     else:
+         raise ValueError(f"Invalid road_direction: {road_direction}. Must be one of: North, South, East, West.")
+
+ def add_weather_to_df(df: pd.DataFrame, num_clusters: int = 4, api_key: str = 'FLMEW5QEEB8WT8YGUJXF6KAPK', time: datetime | None = None) -> pd.DataFrame:
+     if df.empty:
+         df['weather'] = None
+         return df
+
+     if time is None:
+         time = datetime.now()
+
+     coords = df[['Latitude', 'Longitude']].dropna().values
+     kmeans = KMeans(n_clusters=min(num_clusters, len(coords)), random_state=42)
+     df['weather_cluster'] = kmeans.fit_predict(coords)
+
+     weather_data = {}
+     date_str = time.strftime("%Y-%m-%d")
+     target_hour = time.strftime("%H:%M:%S")
+
+     for cluster_id in range(kmeans.n_clusters):  # type: ignore
+         lat, lon = kmeans.cluster_centers_[cluster_id]
+         url = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{lat},{lon}/{date_str}"
+         params = {
+             "key": api_key,
+             "unitGroup": "metric",
+             "contentType": "json"
+         }
+
+         try:
+             response = requests.get(url=url, params=params)
+             response.raise_for_status()
+             data = response.json()
+             hours = data.get("days", [{}])[0].get("hours", [])
+
+             def hour_diff(hour_entry):
+                 try:
+                     return abs(datetime.strptime(hour_entry["datetime"], "%H:%M:%S") - datetime.strptime(target_hour, "%H:%M:%S"))
+                 except (KeyError, TypeError, ValueError):
+                     return timedelta.max
+
+             if hours:
+                 best_match = min(hours, key=hour_diff)
+                 weather = best_match.get("conditions", "Unknown")
+                 weather_time = best_match.get("datetime", None)
+             else:
+                 weather = "Unknown"
+                 weather_time = None
+
+         except Exception as e:
+             print(f"Weather API error for cluster {cluster_id}: {e}")
+             weather = "Unknown"
+             weather_time = None
+
+         weather_data[cluster_id] = {
+             "conditions": weather,
+             "datetime": weather_time
+         }
+
+     df['time'] = time
+     df['weather'] = df['weather_cluster'].map(lambda x: weather_data[x]["conditions"])
+     df['weather_time'] = df['weather_cluster'].map(lambda x: weather_data[x]["datetime"])
+     df.drop(columns=['weather_cluster'], inplace=True)
+     return df
+
+ def get_coordinates_from_network(G: MultiDiGraph, road_name: str, road_direction: str):
+
+     edges = ox.graph_to_gdfs(G, nodes=False, edges=True)
+
+     edges_motorway = edges[edges['highway'].isin(['motorway', 'motorway_link'])]
+
+     selected_road = edges_motorway[
+         edges_motorway['ref'].str.contains(road_name, na=False, case=False)
+     ]
+
+     selected_road = filter_by_direction(selected_road, road_direction)
+
+     rows = []
+
+     for _, row in selected_road.iterrows():
+         lanes = row.get("lanes", None)
+         maxspeed = row.get("maxspeed", None)
+         segment_name = row.get("name", None)  # renamed to avoid shadowing the road_name argument
+         ref = row.get("ref", None)
+         geometry = row.geometry
+
+         if isinstance(geometry, LineString):
+             coords = geometry.coords
+         elif isinstance(geometry, MultiLineString):
+             coords = [pt for line in geometry.geoms for pt in line.coords]
+         else:
+             continue
+
+         for lon, lat in coords:
+             rows.append({
+                 "Longitude": lon,
+                 "Latitude": lat,
+                 "lanes": lanes,
+                 "maxspeed": maxspeed,
+                 "road_name": segment_name,
+                 "ref": ref,
+                 "direction": road_direction
+             })
+
+     # Build the output DataFrame
+     road_df = pd.DataFrame(rows)
+     print(f"Total points in {road_name} - {road_direction}: {len(road_df)}")
+     return road_df
+
+
+ def sort_gps_by_greedy_path(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Greedy nearest-neighbor sorting of GPS coordinates.
+
+     Args:
+         df (pd.DataFrame): DataFrame with 'Latitude' and 'Longitude' columns.
+
+     Returns:
+         pd.DataFrame: Reordered DataFrame.
+     """
+     coords_rad = np.radians(df[['Latitude', 'Longitude']].values)
+     tree = BallTree(coords_rad, metric='haversine')
+
+     visited = np.zeros(len(df), dtype=bool)
+     path = []
+     current_idx = 0  # or use farthest-point-start logic
+
+     for _ in range(len(df)):
+         visited[current_idx] = True
+         path.append(current_idx)
+
+         dist, ind = tree.query([coords_rad[current_idx]], k=len(df))
+
+         for next_idx in ind[0]:
+             if not visited[next_idx]:
+                 current_idx = next_idx
+                 break
+
+     return df.iloc[path].reset_index(drop=True)
+
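As a sanity check for the greedy ordering, points lying along a line but supplied shuffled should come back in path order. The sketch below is hypothetical: the coordinates are invented, and it assumes the project's dependencies (scikit-learn, geopandas, osmnx) are importable, since roadmap.utils pulls them in at import time.

    # Hypothetical check of sort_gps_by_greedy_path on invented, shuffled points.
    import pandas as pd
    from roadmap.utils import sort_gps_by_greedy_path

    scrambled = pd.DataFrame({
        'Latitude':  [34.00, 34.04, 34.01, 34.03, 34.02],
        'Longitude': [-118.20, -118.24, -118.21, -118.23, -118.22],
    })

    ordered = sort_gps_by_greedy_path(scrambled)
    # Starting from row 0, nearest-neighbor hops should visit the points in
    # increasing latitude order for this layout.
    print(ordered)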