amitom commited on
Commit
73e9c25
·
0 Parent(s):

Minimal app for HF Space

Browse files
Files changed (44) hide show
  1. .gitignore +56 -0
  2. Dockerfile +20 -0
  3. README.md +62 -0
  4. app.py +112 -0
  5. data/Los Angeles/coordinates/CA 110 North.csv +3 -0
  6. data/Los Angeles/coordinates/CA 110 South.csv +3 -0
  7. data/Los Angeles/coordinates/CA 118 East.csv +3 -0
  8. data/Los Angeles/coordinates/CA 118 West.csv +3 -0
  9. data/Los Angeles/coordinates/CA 134 East.csv +3 -0
  10. data/Los Angeles/coordinates/CA 134 West.csv +3 -0
  11. data/Los Angeles/coordinates/CA 170 North.csv +3 -0
  12. data/Los Angeles/coordinates/CA 170 South.csv +3 -0
  13. data/Los Angeles/coordinates/CA 2 North.csv +3 -0
  14. data/Los Angeles/coordinates/CA 2 South.csv +3 -0
  15. data/Los Angeles/coordinates/I 10 East.csv +3 -0
  16. data/Los Angeles/coordinates/I 10 West.csv +3 -0
  17. data/Los Angeles/coordinates/I 110 North.csv +3 -0
  18. data/Los Angeles/coordinates/I 110 South.csv +3 -0
  19. data/Los Angeles/coordinates/I 210 East.csv +3 -0
  20. data/Los Angeles/coordinates/I 210 West.csv +3 -0
  21. data/Los Angeles/coordinates/I 405 North.csv +3 -0
  22. data/Los Angeles/coordinates/I 405 South.csv +3 -0
  23. data/Los Angeles/coordinates/I 5 North.csv +3 -0
  24. data/Los Angeles/coordinates/I 5 South.csv +3 -0
  25. data/Los Angeles/coordinates/I 605 North.csv +3 -0
  26. data/Los Angeles/coordinates/I 605 South.csv +3 -0
  27. data/Los Angeles/coordinates/US 101 North.csv +3 -0
  28. data/Los Angeles/coordinates/US 101 South.csv +3 -0
  29. data_collection/collect.py +98 -0
  30. data_process/process.py +138 -0
  31. data_process/split.py +9 -0
  32. data_process/unified.py +93 -0
  33. model_v3/encode.py +332 -0
  34. model_v3/evaluate.py +385 -0
  35. model_v3/experiments.md +17 -0
  36. model_v3/final_encoder.pkl +3 -0
  37. model_v3/final_lstm.pt +3 -0
  38. model_v3/predict_road.py +437 -0
  39. model_v3/train_lstm.py +498 -0
  40. requirements.txt +33 -0
  41. roadmap/RoadMap.py +513 -0
  42. roadmap/__init__.py +0 -0
  43. roadmap/mock_predictor.py +43 -0
  44. roadmap/utils.py +167 -0
.gitignore ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Virtual environment
7
+ .venv/
8
+ env/
9
+ venv/
10
+
11
+ # Logs
12
+ *.log
13
+
14
+ # Mac / Windows / Linux OS generated files
15
+ .DS_Store
16
+ Thumbs.db
17
+
18
+ # IDEs and editors
19
+ .vscode/
20
+ .idea/
21
+
22
+ # Cache and large files
23
+ cache/
24
+ map/cache/
25
+
26
+ # Common large data formats
27
+ *.sqlite
28
+ *.h5
29
+ *.tar
30
+ *.zip
31
+ *.gz
32
+ *.npz
33
+ *.ckpt
34
+ *.pdf
35
+ *.mp4
36
+ *.mov
37
+ *.avi
38
+ *.tiff
39
+ *.jpg
40
+ *.jpeg
41
+ *.png
42
+ *.webp
43
+ *.html
44
+
45
+ # Block heavy .json and .graphml
46
+ *.json
47
+ *.graphml
48
+
49
+ # Project-specific exclusions
50
+ roadmap/old_implmentation/road_network.graphml
51
+ roadmap/data/Los Angeles/maps/Los_Angeles_network.graphml
52
+ data_process/exmaple.csv
53
+
54
+ # Prevent Git LFS tracked files from being committed directly
55
+ .gitattributes
56
+ data_process/balanced_example.csv
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.13.5-slim
2
+
3
+ WORKDIR /app
4
+
5
+ RUN apt-get update && apt-get install -y \
6
+ build-essential \
7
+ curl \
8
+ git \
9
+ && rm -rf /var/lib/apt/lists/*
10
+
11
+ COPY requirements.txt ./
12
+ COPY src/ ./src/
13
+
14
+ RUN pip3 install -r requirements.txt
15
+
16
+ EXPOSE 8501
17
+
18
+ HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
+
20
+ ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ sdk: streamlit
3
+ app_file: app.py
4
+ ---
5
+
6
+ # TrafCast
7
+
8
+ A traffic speed prediction system for Los Angeles using LSTM neural networks.
9
+
10
+ ## Overview
11
+
12
+ TrafCast predicts real-time traffic speeds across major Los Angeles highways and roads using deep learning. The system uses an LSTM (Long Short-Term Memory) model trained on historical traffic data to forecast speed patterns.
13
+
14
+ ## Model Details
15
+
16
+ - **Architecture**: LSTM neural network with 2,191,617 parameters
17
+ - **Training Data**: 32+ million data points from LA traffic sensors
18
+ - **Performance**: Best validation loss of 6.6276, test loss of 6.0229
19
+ - **Features**: Weather data, road characteristics, time patterns, and historical speeds
20
+
21
+ ## Quick Start
22
+
23
+ ### Prerequisites
24
+ - Python 3.8+
25
+ - Virtual environment (recommended)
26
+
27
+ ### Installation
28
+
29
+ 1. **Clone the repository**
30
+ ```bash
31
+ git clone <repository-url>
32
+ cd TrafCast
33
+ ```
34
+
35
+ 2. **Create and activate virtual environment**
36
+ ```bash
37
+ python -m venv .venv
38
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
39
+ ```
40
+
41
+ 3. **Install dependencies**
42
+ ```bash
43
+ pip install -r requirements.txt
44
+ ```
45
+
46
+ 4. **Run the application**
47
+ ```bash
48
+ streamlit run app.py
49
+ ```
50
+
51
+ The app will be available at `http://localhost:8501`
52
+
53
+ ## Usage
54
+
55
+ 1. Select roads from the available LA highways
56
+ 2. Choose a date and time for prediction
57
+ 3. Select visualization mode (Predicted, Real, or Comparison)
58
+ 4. Click "Apply Prediction" to generate traffic speed maps
59
+
60
+ ## Data
61
+
62
+ The model was trained on compressed CSV files containing traffic sensor data from major LA roads including I-405, US-101, I-5, and state highways.
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ from datetime import datetime
4
+ from streamlit_folium import st_folium
5
+ from roadmap.RoadMap import RoadMapManager
6
+
7
+ SUPPORTED_CITIES = ["Los Angeles"]
8
+ LA_ROADS = [
9
+ 'I 405 North', 'I 405 South',
10
+ 'US 101 North', 'US 101 South',
11
+ 'I 5 North', 'I 5 South',
12
+ 'I 110 North', 'I 110 South',
13
+ 'CA 170 North', 'CA 170 South',
14
+ 'CA 118 East', 'CA 118 West',
15
+ 'CA 134 East', 'CA 134 West',
16
+ 'I 605 North', 'I 605 South',
17
+ 'I 210 East', 'I 210 West'
18
+ ]
19
+ LA_BBOX = (-118.569946, 33.252470, -116.976929, 34.388779)
20
+
21
+ st.title("TrafCast: Traffic Forecasting for Los Angeles")
22
+
23
+ city = st.selectbox("Select City", SUPPORTED_CITIES)
24
+
25
+ @st.cache_resource
26
+ def get_map_manager(city_name):
27
+ return RoadMapManager(city_name, LA_BBOX)
28
+
29
+ map_manager = get_map_manager(city)
30
+
31
+ selected_roads = st.multiselect("Select Roads to Load", LA_ROADS)
32
+
33
+ if selected_roads and st.button("Load Road Data"):
34
+ with st.spinner("Loading road data..."):
35
+ map_manager.set_roads(selected_roads)
36
+ st.session_state["roads_loaded"] = True
37
+ st.success("Road data loaded successfully.")
38
+
39
+ if st.session_state.get("roads_loaded"):
40
+ default_date = st.session_state.get("selected_date", datetime.now().date())
41
+ default_time = st.session_state.get("selected_time", datetime.now().time())
42
+
43
+ st.date_input("Choose Date", value=default_date, key="selected_date")
44
+ st.time_input("Choose Time", value=default_time, key="selected_time")
45
+
46
+ predict_time = datetime.combine(
47
+ st.session_state["selected_date"],
48
+ st.session_state["selected_time"]
49
+ )
50
+
51
+
52
+ map_option = st.radio(
53
+ "Choose map visualization:",
54
+ ["Predicted Speed Only", "Real Speed Only", "Side by Side Comparison"],
55
+ key="map_option"
56
+ )
57
+
58
+ if st.button("Apply Prediction"):
59
+ with st.spinner("Running prediction and generating map..."):
60
+ map_manager.apply_prediction_data(predict_time)
61
+
62
+ if map_option == "Predicted Speed Only":
63
+ folium_map = map_manager.draw_map_offset()
64
+ st.session_state["folium_map"] = folium_map
65
+ st.session_state["map_type"] = "predicted"
66
+ elif map_option == "Real Speed Only":
67
+ folium_map = map_manager.draw_map_with_real_speed()
68
+ st.session_state["folium_map"] = folium_map
69
+ st.session_state["map_type"] = "real"
70
+ else: # Side by Side Comparison
71
+ predicted_map, real_map = map_manager.draw_side_by_side_maps()
72
+ st.session_state["predicted_map"] = predicted_map
73
+ st.session_state["real_map"] = real_map
74
+ st.session_state["map_type"] = "side_by_side"
75
+ st.success("Map updated!")
76
+
77
+ if st.session_state.get("map_type") == "side_by_side":
78
+ if "predicted_map" in st.session_state and "real_map" in st.session_state:
79
+ # Use container to control spacing
80
+ with st.container():
81
+ st.subheader("🟢 Predicted Speed")
82
+ st_folium(
83
+ st.session_state["predicted_map"],
84
+ width=1200,
85
+ height=600,
86
+ returned_objects=[],
87
+ key="predicted_map"
88
+ )
89
+
90
+ # Minimal spacing
91
+ st.markdown("<div style='margin-top: 10px;'></div>", unsafe_allow_html=True)
92
+
93
+ with st.container():
94
+ st.subheader("🔴 Real Speed")
95
+ st_folium(
96
+ st.session_state["real_map"],
97
+ width=1200,
98
+ height=600,
99
+ returned_objects=[],
100
+ key="real_map"
101
+ )
102
+
103
+ elif "folium_map" in st.session_state:
104
+ map_title = "Predicted Speed" if st.session_state.get("map_type") == "predicted" else "Real Speed"
105
+ st.subheader(f"🗺️ {map_title}")
106
+ st_folium(
107
+ st.session_state["folium_map"],
108
+ width=1000,
109
+ height=1000,
110
+ returned_objects=[],
111
+ key="traffic_map"
112
+ )
data/Los Angeles/coordinates/CA 110 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e343e9c6e8222b00ee1078c75c54a3511ed2009eae77a3146bf34ffd42f77343
3
+ size 25792
data/Los Angeles/coordinates/CA 110 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50348bf9e65c9dc44f9a7900deb539baa494bcca855c0abc10eeeb45f6d618f4
3
+ size 28368
data/Los Angeles/coordinates/CA 118 East.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97b274ac0b44681b9768774e53280db2003b2eeda670915d0455b79a3cb241c9
3
+ size 10637
data/Los Angeles/coordinates/CA 118 West.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c564c75b547dc96c480c957467e46b6b1d3775a98159b93d8539cc81a06f9ef
3
+ size 11257
data/Los Angeles/coordinates/CA 134 East.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:419e83eec0a4b448b66443210c5384bb3ab692d5c48a06d7501f7d37d4e8a101
3
+ size 28810
data/Los Angeles/coordinates/CA 134 West.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b19c456b566ba7c06ba5a00809632eb72b17ace442deefdc97079346f2ddfc
3
+ size 30711
data/Los Angeles/coordinates/CA 170 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8eb7225f8319965002d9b304349b15fb82a59e804353cdffcd70bf528b50f05
3
+ size 11192
data/Los Angeles/coordinates/CA 170 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3490d44731131ab2bbfcd1c2f76c03705b1d0ecc9cbaee9ceaba353dc4d4c25
3
+ size 10316
data/Los Angeles/coordinates/CA 2 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4d9e165b566c129bfd30f41faa36f30aa8ccfabc5d3dcb1aa69c6cab8a6a58b
3
+ size 129078
data/Los Angeles/coordinates/CA 2 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:419c5cdafe432fbc968c98fec5e5e6774fc0155427c8a7b2d5958ed4affb6294
3
+ size 128876
data/Los Angeles/coordinates/I 10 East.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44e09007d08b1c0f078a8c42a8a7260a4b996e7990818dc8e4e0b6bb76a7aef1
3
+ size 146208
data/Los Angeles/coordinates/I 10 West.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:333b59cb3426fcc2766d1798d1d924211a9bb260bd148bfa436da9b3fb34ba50
3
+ size 139940
data/Los Angeles/coordinates/I 110 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b137934c12d5b5be71fa37b3cc8327c26550d7e716bb926623cf0bbfbc75467c
3
+ size 37294
data/Los Angeles/coordinates/I 110 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3a3e9ff223dced2f8c1e3d5ef456ea78a4a5896fd8f7e5efdecf3a74ac7842e
3
+ size 37664
data/Los Angeles/coordinates/I 210 East.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77662bfdd3e2f52055b830918649099a11895ad0183cf913f7e671cd6be34b7e
3
+ size 57529
data/Los Angeles/coordinates/I 210 West.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db1667f653de6d15d3c483c77485c396426b49f452b3a0aa00d3e08f4fff366e
3
+ size 55855
data/Los Angeles/coordinates/I 405 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bdd36a44ab90041704654717bf9ec09a4431ccdc231ccbfb84b9fb956021d26b
3
+ size 135243
data/Los Angeles/coordinates/I 405 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dda435dd55be2789e05e8ad1a1cad5312495916a9ec7bd2a900aea5f043ea9b
3
+ size 140106
data/Los Angeles/coordinates/I 5 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b0a6f76b99ac3755f12d92661190bdd0e1071ef0c5ed21a060c557f121f2a35
3
+ size 141981
data/Los Angeles/coordinates/I 5 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47ff4e2b871bb69ddbca3510c55d407ca6c9da0d227f1f18ec297b6efdd273c1
3
+ size 141235
data/Los Angeles/coordinates/I 605 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c5ce7940b7cf70f9fdcb7c373ce0f851631b618ef60d822eb8bb1e925a81a3c
3
+ size 34769
data/Los Angeles/coordinates/I 605 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a85c66aac9b5f23a30199edfc5c0181ef2ccc1d7112ea0a4667330a69201b02a
3
+ size 31924
data/Los Angeles/coordinates/US 101 North.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11106ce640fec84cd685511a9e35d532a8255d8e79768dd9a38f7d19a389edaa
3
+ size 49616
data/Los Angeles/coordinates/US 101 South.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41f589bb93d40f90c2806fb577142247bc3a21d0d36dca5b9b5bf4e2d974d5fb
3
+ size 52135
data_collection/collect.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import os
3
+ from urllib.parse import urlparse, parse_qs
4
+ from datetime import datetime
5
+ from selenium import webdriver
6
+ from selenium.webdriver.chrome.service import Service
7
+ from selenium.webdriver.chrome.options import Options
8
+ from selenium.webdriver.common.by import By
9
+ from selenium.webdriver.common.keys import Keys
10
+
11
+ # Specify the download directory
12
+ download_path = '/Users/noamcohen/Downloads/data collection/data'
13
+ ROADS = {'405': ['N', 'S'], 101: ['N', 'S'], 5: ['N', 'S'], 110: ['N', 'S'], 170: ['N', 'S'], 118: ['E', 'W'],
14
+ 134: ['E', 'W'], 605: ['N', 'S'], 210: ['E', 'W']}
15
+ # Set up Chrome options
16
+ chrome_options = Options()
17
+ chrome_prefs = {
18
+ "download.default_directory": download_path, # Change the download directory
19
+ "download.prompt_for_download": False, # Disable download prompt
20
+ "download.directory_upgrade": True, # Automatically upgrade download path
21
+ }
22
+ chrome_options.add_experimental_option("prefs", chrome_prefs)
23
+
24
+ # Set up ChromeDriver with the path to your chromedriver and options
25
+ driver = webdriver.Chrome(service=Service('/opt/homebrew/bin/chromedriver'), options=chrome_options)
26
+
27
+ # Continue with your script as normal...
28
+
29
+ # Open the login page
30
+ url = "https://pems.dot.ca.gov/"
31
+ driver.get(url)
32
+
33
+ # Wait for the page to load
34
+ time.sleep(20)
35
+
36
+ # Find the username and password fields
37
+ username_field = driver.find_element(By.ID, "username")
38
+ password_field = driver.find_element(By.ID, "password")
39
+
40
+ # Enter your credentials
41
+ username_field.send_keys("amitomer1912@gmail.com")
42
+ password_field.send_keys("5^applel?X")
43
+
44
+ # Find and click the login button (using the 'login' name attribute)
45
+ login_button = driver.find_element(By.NAME, "login")
46
+ login_button.click()
47
+
48
+ # Wait for login to complete
49
+ time.sleep(20)
50
+ i = 1
51
+ s_time_id=1740787200
52
+ while(i <= 1):
53
+ # Once logged in, navigate to the report page
54
+ if i <= 9:
55
+ day = f"0{i}"
56
+ else:
57
+ day = i
58
+ for road in ROADS.keys():
59
+ for dir in ROADS.get(road):
60
+
61
+ report_url = f"https://pems.dot.ca.gov/?report_form=1&dnode=Freeway&content=spatial&tab=contours&export=&fwy={road}&dir={dir}&s_time_id={s_time_id}&s_time_id_f=03%2F{day}%2F2025&from_hh=0&to_hh=23&start_pm=.0&end_pm=1000.09&lanes=&station_type=ml&q=speed&colormap=30%2C31%2C32&sc=auto&ymin=&ymax=&view_d=2&chart.x=93&chart.y=20"
62
+ driver.get(report_url)
63
+ time.sleep(60)
64
+
65
+ # Find the "Export XLS" button by its 'name' or 'alt' attribute
66
+ export_button = driver.find_element(By.NAME, "xls")
67
+
68
+ # Click the "Export XLS" button
69
+ export_button.click()
70
+
71
+ # Wait for the download to complete (you can increase or decrease this based on your network speed)
72
+ time.sleep(100)
73
+
74
+ # Extract highway number and date from the report URL
75
+ parsed_url = urlparse(report_url)
76
+ query_params = parse_qs(parsed_url.query)
77
+ highway_number = query_params.get('fwy', ['unknown'])[0] # Default to 'unknown' if not found
78
+ date_taken = query_params.get('s_time_id_f', ['unknown_date'])[0] # Default to 'unknown_date' if not found
79
+
80
+ # Construct the new file name
81
+ new_file_name = rf"{highway_number}_{dir}_{date_taken.replace("/", "*")}.xlsx"
82
+ new_file_path = os.path.join(download_path, new_file_name)
83
+
84
+ # Original file path (before renaming)
85
+ original_file_path = os.path.join(download_path, 'pems_output.xlsx')
86
+
87
+ # Rename the file if it exists
88
+ if os.path.exists(original_file_path):
89
+ os.rename(original_file_path, new_file_path)
90
+ print(f"File renamed to: {new_file_path}")
91
+ else:
92
+ print("Original file not found!")
93
+
94
+ print(f"Download {i} to March complete!")
95
+ i+=1
96
+ s_time_id += 86400
97
+
98
+ driver.quit()
data_process/process.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import glob
3
+ import os
4
+ import sys
5
+ import numpy as np
6
+ from sklearn.neighbors import BallTree
7
+ from datetime import datetime
8
+
9
+ EARTH_RADIUS_M = 6371000
10
+ PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
11
+ sys.path.append(PROJECT_ROOT)
12
+
13
+ from roadmap.utils import add_weather_to_df
14
+
15
+ def compute_sensor_id(df: pd.DataFrame,
16
+ lat_col: str = "Latitude",
17
+ lon_col: str = "Longitude",
18
+ decimals: int = 6,
19
+ out_col: str = "sensor_id") -> pd.DataFrame:
20
+ df[out_col] = (
21
+ df[lat_col].round(decimals).astype(str)
22
+ + ";" +
23
+ df[lon_col].round(decimals).astype(str)
24
+ )
25
+ return df
26
+
27
+ def prepare_data_df(df_data: pd.DataFrame, coordinate: pd.DataFrame, date: str):
28
+ """"
29
+ first remove points with no observations, add date to the table and weather
30
+ """
31
+ #df_data.drop(df_data[df_data["% Observed"] < 50].index, inplace=True)
32
+ df_data["Time"] = pd.to_datetime(date + " " + df_data["Time"].astype(str),format="%Y-%m-%d %H:%M")
33
+
34
+ df_data = add_coordinate(coordinate, df_data)
35
+ df_data = compute_sensor_id(df_data)
36
+
37
+ df_data["Time_hour"] = df_data["Time"].dt.round("h")
38
+
39
+ try:
40
+ df_data = enrich_weather_hourly(df_data) #TODO find better way to do this
41
+ except Exception as e:
42
+ print(f"Error enriching weather: {e}")
43
+ df_data["weather"] = None
44
+
45
+ df_data["Day"] = df_data["Time"].dt.dayofweek
46
+
47
+ return df_data
48
+
49
+ def add_coordinate(df_coord: pd.DataFrame, df_data: pd.DataFrame):
50
+ df_coord = df_coord.sort_values(by="Abs PM").reset_index(drop=True)
51
+ df_data = df_data.sort_values(by="Postmile (Abs)").reset_index(drop=True)
52
+
53
+ coord_abs_pm = df_coord["Abs PM"].values
54
+ coord_lat = df_coord["Latitude"].values
55
+ coord_lon = df_coord["Longitude"].values
56
+
57
+ def find_closest_index(target):
58
+ return np.abs(coord_abs_pm - target).argmin()
59
+
60
+ closest_indices = df_data["Postmile (Abs)"].apply(find_closest_index)
61
+
62
+ df_data["Latitude"] = closest_indices.apply(lambda idx: coord_lat[idx]) # type: ignore
63
+ df_data["Longitude"] = closest_indices.apply(lambda idx: coord_lon[idx]) # type: ignore
64
+
65
+ return df_data
66
+
67
+ def enrich_weather_hourly(full_df: pd.DataFrame) -> pd.DataFrame:
68
+ pieces = []
69
+ for t_hour, chunk in full_df.groupby("Time_hour", sort=False):
70
+ enriched = add_weather_to_df(chunk.copy(),time=t_hour.to_pydatetime()) # type: ignore
71
+ pieces.append(enriched)
72
+
73
+ return pd.concat(pieces, ignore_index=True).sort_values("Time")
74
+
75
+ def build_sensor_index(data_df: pd.DataFrame) -> pd.DataFrame:
76
+ sensors = (
77
+ data_df
78
+ .drop_duplicates(subset=["Latitude", "Longitude"])
79
+ .loc[:, ["Latitude", "Longitude"]]
80
+ .copy()
81
+ .reset_index(drop=True)
82
+ )
83
+ sensors = compute_sensor_id(sensors) # adds 'sensor_id' as "lat;lon"
84
+ # keep only what we need; you can also keep an integer 'sensor_idx' if you like
85
+ return sensors[["sensor_id", "Latitude", "Longitude"]]
86
+
87
+
88
+ def build_enriched_time_series(data_df: pd.DataFrame,
89
+ sensor_map: pd.DataFrame,
90
+ sensors: pd.DataFrame) -> pd.DataFrame:
91
+ # Ensure both sides have the same sensor_id key
92
+ data_df = compute_sensor_id(data_df) # from its own lat/lon
93
+
94
+ enriched = (
95
+ sensor_map[["sensor_id", "Latitude", "Longitude", "lanes", "maxspeed", "ref", "direction"]]
96
+ .merge(
97
+ data_df[["sensor_id", "Time", "AggSpeed", "% Observed", "weather"]],
98
+ on="sensor_id",
99
+ how="left"
100
+ )
101
+ .sort_values(["sensor_id", "Time"])
102
+ .reset_index(drop=True)
103
+ )
104
+ return enriched
105
+
106
+
107
+ def normalize_lanes(value):
108
+ if isinstance(value, list):
109
+ try:
110
+ return min(int(x) for x in value)
111
+ except ValueError:
112
+ return None
113
+ try:
114
+ return int(value)
115
+ except ValueError:
116
+ return None
117
+
118
+
119
+ def map_pms_to_sensors(network_df: pd.DataFrame, sensors: pd.DataFrame, max_distance_m: float | None = None) -> pd.DataFrame:
120
+ net = network_df.dropna(subset=["Latitude", "Longitude"]).copy()
121
+
122
+ sensor_rad = np.radians(sensors[["Latitude", "Longitude"]].to_numpy())
123
+ net_rad = np.radians(net[["Latitude", "Longitude"]].to_numpy())
124
+
125
+ tree = BallTree(sensor_rad, metric="haversine")
126
+ dist_rad, idx = tree.query(net_rad,k=1)
127
+ dist_m = dist_rad[:, 0] * EARTH_RADIUS_M
128
+
129
+ matched = net.copy()
130
+ matched["sensor_id"] = sensors.iloc[idx[:, 0]].sensor_id.values
131
+ matched["matched_sensor_lat"] = sensors.iloc[idx[:, 0]].Latitude.values
132
+ matched["matched_sensor_lon"] = sensors.iloc[idx[:, 0]].Longitude.values
133
+ matched["distance_m"] = dist_m
134
+
135
+ if max_distance_m is not None:
136
+ matched = matched[matched["distance_m"] <= max_distance_m].copy()
137
+
138
+ return matched
data_process/split.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ allowed = ["CA 134", "CA 170","I 605"]
4
+
5
+ df = pd.read_csv("/Users/amitomer/Desktop/Personal/University/deep_learning/TrafCast/old_data/split_df_v2.csv")
6
+
7
+ df = df[df["road_name"].isin(allowed)]
8
+
9
+ df.to_csv("weights_test.csv", index=False)
data_process/unified.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import glob
4
+ import pandas as pd
5
+ from process import prepare_data_df, build_sensor_index, map_pms_to_sensors
6
+
7
+ index_map = {
8
+ '405': 'I',
9
+ '101': 'US',
10
+ '101': 'US',
11
+ '110': 'I',
12
+ '170': 'CA',
13
+ '118': 'CA',
14
+ '134': 'CA',
15
+ '605': 'I',
16
+ '210': 'I',
17
+ '5': 'I'
18
+ }
19
+ direction_map = {
20
+ 'E': 'East',
21
+ 'W': 'West',
22
+ 'N': 'North',
23
+ 'S': 'South'
24
+ }
25
+
26
+
27
+ PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
28
+ sys.path.append(PROJECT_ROOT)
29
+
30
+
31
+ DATA_DIR = os.path.join(PROJECT_ROOT, "data_collection", "data")
32
+ COORDINATE_DIR = os.path.join(PROJECT_ROOT,"data_collection", "coordinates")
33
+ full_df = pd.DataFrame()
34
+
35
+ for entry in os.scandir(DATA_DIR):
36
+ if entry.is_dir():
37
+ road_number = entry.name
38
+ road_name = index_map[road_number] +" "+ road_number
39
+
40
+ with os.scandir(entry.path) as it:
41
+ for sub in it:
42
+ if sub.is_dir() and sub.name in {'E', 'W', 'N', 'S'}:
43
+ direction = direction_map[sub.name]
44
+ print(f"Processing {road_name} {direction}")
45
+ data_dir = os.path.join(entry.path, sub.name)
46
+ coordinate_dir = os.path.join(COORDINATE_DIR, f"{road_name} {direction}.xlsx")
47
+
48
+ for i in range(1,32):
49
+ if i <=9:
50
+ raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*0{i}*2025.xlsx")
51
+ date = f"2025-03-0{i}"
52
+ else:
53
+ raw_data_pattern = os.path.join(data_dir, f"{road_number}_{sub.name}_03*{i}*2025.xlsx")
54
+ date = f"2025-03-{i}"
55
+
56
+ matching_files = glob.glob(raw_data_pattern)
57
+ if not matching_files:
58
+ print(f"No data file found for {road_name} {direction} on {date}, skipping...")
59
+ continue
60
+
61
+ raw_data = matching_files[0]
62
+
63
+ df_coord = pd.read_excel(coordinate_dir)
64
+ df_data = pd.read_excel(raw_data)
65
+ clean_data_df = prepare_data_df(df_data, df_coord,date)
66
+ sensors = build_sensor_index(clean_data_df)
67
+
68
+ enriched = map_pms_to_sensors(clean_data_df, sensors)
69
+ enriched["road_name"] = road_name
70
+ enriched["direction"] = direction
71
+
72
+ full_df = pd.concat([full_df, enriched], ignore_index=True)
73
+ print(f"finished {date}")
74
+
75
+
76
+ full_df.drop(columns=["Postmile (Abs)", "Postmile (CA)", "VDS", "Time_hour", "matched_sensor_lat", "matched_sensor_lon", "distance_m"], inplace=True)
77
+ desired_order = [
78
+ "Time","sensor_id", "Latitude", "Longitude",
79
+ "road_name", "direction", "# Lane Points",
80
+ "% Observed", "weather", "Day", "AggSpeed"
81
+ ]
82
+ full_df = full_df[desired_order]
83
+ full_df.rename(columns={
84
+ "AggSpeed": "speed_mph",
85
+ "# Lane Points": "lanes"
86
+ }, inplace=True)
87
+ full_df.to_csv('full_df_weather.csv',index=False)
88
+
89
+
90
+
91
+
92
+
93
+
model_v3/encode.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ encode.py – Traffic data encoder for LSTM traffic flow prediction
3
+
4
+ This module provides TrafficDataEncoder for processing 5-minute traffic sensor data
5
+ into sequences suitable for LSTM training. Key features:
6
+ - Sensor-safe windowing (no cross-sensor leakage)
7
+ - Feature engineering (time, geographic, categorical)
8
+ - Speed-based class weighting support
9
+ - Robust missing value handling
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ from sklearn.preprocessing import OrdinalEncoder, StandardScaler
17
+ from sklearn.utils.validation import check_is_fitted
18
+ from typing import List, Tuple, Dict, Optional
19
+ import joblib
20
+ from pathlib import Path
21
+
22
+
23
+ class TrafficDataEncoder:
24
+ """
25
+ Encodes traffic sensor data into sequences for LSTM training.
26
+
27
+ Features:
28
+ - Geographic coordinates (lat/lon -> x/y km)
29
+ - Time features (hour, day of week)
30
+ - Categorical encoding (direction, weather)
31
+ - Speed-based class weighting
32
+ - Sensor-safe windowing
33
+ """
34
+
35
+ def __init__(
36
+ self,
37
+ seq_len: int = 12, # 12 * 5min = 1 hour history
38
+ horizon: int = 1, # predict 1 step ahead (5 minutes)
39
+ target_col: str = "speed_mph"
40
+ ):
41
+ self.seq_len = seq_len
42
+ self.horizon = horizon
43
+ self.target_col = target_col
44
+
45
+ # Feature columns
46
+ self.cat_cols = ["direction", "weather"]
47
+ self.num_cols = [
48
+ "lanes", "% Observed", "Latitude", "Longitude",
49
+ "hour_sin", "hour_cos", "dow_sin", "dow_cos"
50
+ ]
51
+
52
+ # Fitted components
53
+ self.ordinal_encoder: Optional[OrdinalEncoder] = None
54
+ self.scaler: Optional[StandardScaler] = None
55
+ self.num_medians: Dict[str, float] = {}
56
+ self.is_fitted = False
57
+
58
+ def _ensure_sensor_id_and_sort(self, df: pd.DataFrame) -> pd.DataFrame:
59
+ """Create sensor_id and sort by sensor and time."""
60
+ df = df.copy()
61
+
62
+ # Create sensor_id from coordinates
63
+ if "sensor_id" not in df.columns:
64
+ df["sensor_id"] = (
65
+ df["Latitude"].round(6).astype(str) + ";" +
66
+ df["Longitude"].round(6).astype(str)
67
+ )
68
+
69
+ # Parse time and sort
70
+ df["Time"] = pd.to_datetime(df["Time"], errors="coerce")
71
+ return df.sort_values(["sensor_id", "Time"]).reset_index(drop=True)
72
+
73
+ def _add_time_features(self, df: pd.DataFrame) -> pd.DataFrame:
74
+ """Add cyclical time features."""
75
+ dt = pd.to_datetime(df["Time"], errors="coerce")
76
+ hour = dt.dt.hour + dt.dt.minute / 60.0
77
+ dow = dt.dt.dayofweek
78
+
79
+ df["hour_sin"] = np.sin(2 * np.pi * hour / 24)
80
+ df["hour_cos"] = np.cos(2 * np.pi * hour / 24)
81
+ df["dow_sin"] = np.sin(2 * np.pi * dow / 7)
82
+ df["dow_cos"] = np.cos(2 * np.pi * dow / 7)
83
+
84
+ return df
85
+
86
+ def _clean_numeric(self, df: pd.DataFrame) -> pd.DataFrame:
87
+ """Clean and convert numeric columns."""
88
+ # Ensure lanes is numeric
89
+ df["lanes"] = pd.to_numeric(df.get("lanes", 0), errors="coerce")
90
+
91
+ # Ensure % Observed is numeric
92
+ df["% Observed"] = pd.to_numeric(df.get("% Observed", 100), errors="coerce")
93
+
94
+ return df
95
+
96
+ def _compute_speed_weights(self, y: np.ndarray) -> Dict[str, float]:
97
+ """Compute class weights for speed-based weighting."""
98
+ # Define speed classes based on user's experience
99
+ low_mask = y <= 30
100
+ high_mask = y >= 60
101
+ medium_mask = ~(low_mask | high_mask)
102
+
103
+ n_low = low_mask.sum()
104
+ n_medium = medium_mask.sum()
105
+ n_high = high_mask.sum()
106
+ n_total = len(y)
107
+
108
+ print(f"Speed distribution:")
109
+ print(f" Low (≤30): {n_low} samples ({n_low/n_total*100:.1f}%)")
110
+ print(f" Medium (30-60): {n_medium} samples ({n_medium/n_total*100:.1f}%)")
111
+ print(f" High (≥60): {n_high} samples ({n_high/n_total*100:.1f}%)")
112
+
113
+ # Compute inverse frequency weights
114
+ if n_low > 0 and n_medium > 0 and n_high > 0:
115
+ weight_low = n_total / (3 * n_low)
116
+ weight_medium = n_total / (3 * n_medium)
117
+ weight_high = n_total / (3 * n_high)
118
+ else:
119
+ weight_low = weight_medium = weight_high = 1.0
120
+
121
+ print(f"Class weights: Low={weight_low:.2f}, Medium={weight_medium:.2f}, High={weight_high:.2f}")
122
+
123
+ return {
124
+ "weight_low": weight_low,
125
+ "weight_medium": weight_medium,
126
+ "weight_high": weight_high,
127
+ "low_threshold": 30,
128
+ "high_threshold": 60
129
+ }
130
+
131
+ def fit(self, df: pd.DataFrame) -> "TrafficDataEncoder":
132
+ """Fit the encoder on training data."""
133
+ print("Fitting encoder...")
134
+
135
+ # Preprocess data
136
+ df = self._ensure_sensor_id_and_sort(df)
137
+ df = self._add_time_features(df)
138
+ df = self._clean_numeric(df)
139
+
140
+ # Handle missing values
141
+ df[self.cat_cols] = df[self.cat_cols].fillna("UNK")
142
+ self.num_medians = df[self.num_cols].median(numeric_only=True).to_dict()
143
+ df[self.num_cols] = df[self.num_cols].fillna(self.num_medians)
144
+
145
+ # Fit encoders
146
+ self.ordinal_encoder = OrdinalEncoder(
147
+ handle_unknown="use_encoded_value",
148
+ unknown_value=-1
149
+ )
150
+ self.ordinal_encoder.fit(df[self.cat_cols])
151
+
152
+ self.scaler = StandardScaler()
153
+ self.scaler.fit(df[self.num_cols])
154
+
155
+ self.is_fitted = True
156
+ print("Encoder fitted successfully")
157
+ return self
158
+
159
+ def _preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
160
+ """Apply preprocessing steps."""
161
+ df = self._ensure_sensor_id_and_sort(df)
162
+ df = self._add_time_features(df)
163
+ df = self._clean_numeric(df)
164
+
165
+ # Handle missing values using fitted medians
166
+ df[self.cat_cols] = df[self.cat_cols].fillna("UNK")
167
+ df[self.num_cols] = df[self.num_cols].fillna(self.num_medians)
168
+
169
+ return df
170
+
171
+ def transform(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
172
+ """
173
+ Transform data into sequences.
174
+
175
+ Returns:
176
+ X: (N, seq_len, n_features) - input sequences
177
+ y: (N, horizon) - target values
178
+ target_indices: (N,) - indices of target rows in original df
179
+ timestamps: (N,) - timestamps of target rows
180
+ """
181
+ check_is_fitted(self, ["ordinal_encoder", "scaler", "num_medians"])
182
+
183
+ df = self._preprocess(df)
184
+
185
+ X_chunks = []
186
+ y_chunks = []
187
+ target_indices = []
188
+ timestamps = []
189
+
190
+ # Process each sensor separately to avoid cross-sensor leakage
191
+ for sensor_id, group in df.groupby("sensor_id", sort=False):
192
+ if len(group) < self.seq_len + self.horizon:
193
+ continue # Not enough data for this sensor
194
+
195
+ # Encode features
196
+ cat_features = self.ordinal_encoder.transform(group[self.cat_cols]).astype(np.float32)
197
+ num_features = self.scaler.transform(group[self.num_cols]).astype(np.float32)
198
+ features = np.concatenate([num_features, cat_features], axis=1)
199
+
200
+ # Get targets
201
+ targets = group[self.target_col].to_numpy(dtype=np.float32)
202
+ group_timestamps = group["Time"].to_numpy()
203
+ group_indices = group.index.to_numpy()
204
+
205
+ # Create sliding windows
206
+ n_windows = len(group) - self.seq_len - self.horizon + 1
207
+ for i in range(n_windows):
208
+ X_chunks.append(features[i:i + self.seq_len])
209
+ y_chunks.append(targets[i + self.seq_len:i + self.seq_len + self.horizon])
210
+ target_indices.append(group_indices[i + self.seq_len + self.horizon - 1])
211
+ timestamps.append(group_timestamps[i + self.seq_len + self.horizon - 1])
212
+
213
+ if not X_chunks:
214
+ # Return empty arrays with correct shapes
215
+ n_features = len(self.num_cols) + len(self.cat_cols)
216
+ return (
217
+ np.empty((0, self.seq_len, n_features), dtype=np.float32),
218
+ np.empty((0, self.horizon), dtype=np.float32),
219
+ np.empty(0, dtype=int),
220
+ np.empty(0, dtype=object)
221
+ )
222
+
223
+ X = np.stack(X_chunks, axis=0)
224
+ y = np.stack(y_chunks, axis=0)
225
+ target_indices = np.array(target_indices, dtype=int)
226
+ timestamps = np.array(timestamps)
227
+
228
+ print(f"Created {len(X)} sequences from {len(df.groupby('sensor_id'))} sensors")
229
+ return X, y, target_indices, timestamps
230
+
231
+ def fit_transform(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
232
+ """Fit encoder and transform data in one step."""
233
+ return self.fit(df).transform(df)
234
+
235
+ def get_speed_weights(self, y: np.ndarray) -> Dict[str, float]:
236
+ """Get speed-based class weights for weighted loss."""
237
+ return self._compute_speed_weights(y)
238
+
239
+ def save(self, filepath: str) -> None:
240
+ """Save the fitted encoder."""
241
+ if not self.is_fitted:
242
+ raise ValueError("Encoder must be fitted before saving")
243
+
244
+ joblib.dump(self, filepath)
245
+ print(f"Encoder saved to {filepath}")
246
+
247
+ @classmethod
248
+ def load(cls, filepath: str) -> "TrafficDataEncoder":
249
+ """Load a fitted encoder."""
250
+
251
+ try:
252
+ encoder = joblib.load(filepath)
253
+ if not isinstance(encoder, cls):
254
+ raise ValueError(f"Loaded object is not a {cls.__name__}")
255
+ return encoder
256
+ except AttributeError as e:
257
+ if "TrafficDataEncoder" in str(e):
258
+ # Handle the case where encoder was saved from a different module context
259
+ print("Warning: Encoder was saved from different module context. Reconstructing...")
260
+
261
+ # Use a more robust approach with joblib
262
+ import sys
263
+ import types
264
+
265
+ # Temporarily modify sys.modules to include our class
266
+ original_main = sys.modules.get('__main__')
267
+ temp_module = types.ModuleType('temp_encode')
268
+ temp_module.TrafficDataEncoder = cls
269
+ sys.modules['__main__'] = temp_module
270
+
271
+ try:
272
+ # Now try loading with the modified module context
273
+ encoder = joblib.load(filepath)
274
+ if not isinstance(encoder, cls):
275
+ raise ValueError(f"Loaded object is not a {cls.__name__}")
276
+ return encoder
277
+ finally:
278
+ # Restore original __main__ module
279
+ if original_main is not None:
280
+ sys.modules['__main__'] = original_main
281
+ else:
282
+ del sys.modules['__main__']
283
+ else:
284
+ raise e
285
+
286
+
287
+ def main():
288
+ """CLI interface for encoding data."""
289
+ import argparse
290
+
291
+ parser = argparse.ArgumentParser(description="Encode traffic data for LSTM training")
292
+ parser.add_argument("csv_file", help="Path to CSV file with traffic data")
293
+ parser.add_argument("--seq_len", type=int, default=12, help="Sequence length (default: 12)")
294
+ parser.add_argument("--horizon", type=int, default=1, help="Prediction horizon (default: 1)")
295
+ parser.add_argument("--target_col", default="speed_mph", help="Target column name")
296
+ parser.add_argument("--save_encoder", help="Path to save fitted encoder")
297
+ parser.add_argument("--output", help="Path to save encoded data (optional)")
298
+
299
+ args = parser.parse_args()
300
+
301
+ # Load data
302
+ print(f"Loading data from {args.csv_file}")
303
+ df = pd.read_csv(args.csv_file)
304
+
305
+ # Create and fit encoder
306
+ encoder = TrafficDataEncoder(
307
+ seq_len=args.seq_len,
308
+ horizon=args.horizon,
309
+ target_col=args.target_col
310
+ )
311
+
312
+ X, y, target_indices, timestamps = encoder.fit_transform(df)
313
+
314
+ print(f"Encoded data shapes:")
315
+ print(f" X: {X.shape}")
316
+ print(f" y: {y.shape}")
317
+ print(f" Target indices: {len(target_indices)}")
318
+ print(f" Timestamps: {len(timestamps)}")
319
+
320
+ # Save encoder if requested
321
+ if args.save_encoder:
322
+ encoder.save(args.save_encoder)
323
+
324
+ # Save encoded data if requested
325
+ if args.output:
326
+ np.savez(args.output, X=X, y=y, target_indices=target_indices, timestamps=timestamps)
327
+ print(f"Encoded data saved to {args.output}")
328
+
329
+
330
+ if __name__ == "__main__":
331
+ main()
332
+
model_v3/evaluate.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ evaluate.py – Model evaluation and prediction for traffic flow prediction
3
+
4
+ Features:
5
+ - Load trained model and encoder
6
+ - Generate predictions on new data
7
+ - Comprehensive evaluation metrics
8
+ - Visualization support
9
+ - Batch prediction for large datasets
10
+ """
11
+
12
+ import argparse
13
+ import numpy as np
14
+ import pandas as pd
15
+ import torch
16
+ import torch.nn as nn
17
+ from torch.utils.data import DataLoader, TensorDataset
18
+ from pathlib import Path
19
+ import joblib
20
+ from typing import Dict, List, Tuple, Optional
21
+ import matplotlib.pyplot as plt
22
+ try:
23
+ import seaborn as sns
24
+ sns.set_style("whitegrid")
25
+ except ImportError:
26
+ print("Warning: seaborn not available, using matplotlib defaults")
27
+
28
+ from encode import TrafficDataEncoder
29
+ from train_lstm import LSTMRegressor
30
+
31
+
32
+ def load_model_and_encoder(
33
+ model_path: str,
34
+ encoder_path: str,
35
+ device: torch.device
36
+ ) -> Tuple[LSTMRegressor, TrafficDataEncoder]:
37
+ """Load trained model and encoder."""
38
+ print(f"Loading encoder from {encoder_path}")
39
+ encoder = TrafficDataEncoder.load(encoder_path)
40
+
41
+ print(f"Loading model from {model_path}")
42
+ model_state = torch.load(model_path, map_location=device)
43
+
44
+ # Infer model architecture from the saved state_dict
45
+ n_features = len(encoder.num_cols) + len(encoder.cat_cols)
46
+
47
+ # Infer hidden_size from the first LSTM layer weights
48
+ # lstm.weight_ih_l0 shape is [4*hidden_size, n_features]
49
+ first_layer_weight_shape = model_state['lstm.weight_ih_l0'].shape
50
+ hidden_size = first_layer_weight_shape[0] // 4
51
+
52
+ # Check if bidirectional by looking for reverse weights
53
+ bidirectional = 'lstm.weight_ih_l0_reverse' in model_state
54
+
55
+ # Infer number of layers by counting unique layer indices
56
+ layer_keys = [k for k in model_state.keys() if k.startswith('lstm.weight_ih_l')]
57
+ n_layers = len(set([k.split('_l')[1].split('_')[0] for k in layer_keys]))
58
+
59
+ # Infer dropout from the model structure (this is harder to infer, so we'll use a default)
60
+ dropout = 0.3 # Default value
61
+
62
+ print(f"Inferred model architecture:")
63
+ print(f" n_features: {n_features}")
64
+ print(f" hidden_size: {hidden_size}")
65
+ print(f" n_layers: {n_layers}")
66
+ print(f" bidirectional: {bidirectional}")
67
+ print(f" dropout: {dropout}")
68
+
69
+ # Create model with inferred architecture
70
+ model = LSTMRegressor(
71
+ n_features=n_features,
72
+ hidden_size=hidden_size,
73
+ n_layers=n_layers,
74
+ dropout=dropout,
75
+ bidirectional=bidirectional
76
+ ).to(device)
77
+
78
+ model.load_state_dict(model_state)
79
+ model.eval()
80
+
81
+ print("Model and encoder loaded successfully")
82
+ return model, encoder
83
+
84
+
85
+ def compute_metrics(predictions: np.ndarray, targets: np.ndarray) -> Dict[str, float]:
86
+ """Compute comprehensive evaluation metrics."""
87
+ predictions = predictions.flatten()
88
+ targets = targets.flatten()
89
+
90
+ # Basic metrics
91
+ mae = np.mean(np.abs(predictions - targets))
92
+ mse = np.mean((predictions - targets) ** 2)
93
+ rmse = np.sqrt(mse)
94
+
95
+ # Percentage metrics
96
+ mape = np.mean(np.abs((targets - predictions) / (targets + 1e-8))) * 100
97
+
98
+ # R-squared
99
+ ss_res = np.sum((targets - predictions) ** 2)
100
+ ss_tot = np.sum((targets - np.mean(targets)) ** 2)
101
+ r2 = 1 - (ss_res / (ss_tot + 1e-8))
102
+
103
+ # Speed-specific metrics
104
+ speed_ranges = {
105
+ 'low (≤30)': targets <= 30,
106
+ 'medium (30-60)': (targets > 30) & (targets <= 60),
107
+ 'high (≥60)': targets >= 60
108
+ }
109
+
110
+ range_metrics = {}
111
+ for range_name, mask in speed_ranges.items():
112
+ if np.sum(mask) > 0:
113
+ range_pred = predictions[mask]
114
+ range_target = targets[mask]
115
+ range_metrics[f'mae_{range_name.replace(" ", "_").replace("(", "").replace(")", "")}'] = np.mean(np.abs(range_pred - range_target))
116
+ range_metrics[f'count_{range_name.replace(" ", "_").replace("(", "").replace(")", "")}'] = np.sum(mask)
117
+
118
+ metrics = {
119
+ 'mae': mae,
120
+ 'mse': mse,
121
+ 'rmse': rmse,
122
+ 'mape': mape,
123
+ 'r2': r2,
124
+ **range_metrics
125
+ }
126
+
127
+ return metrics
128
+
129
+
130
+ def predict_batch(
131
+ model: LSTMRegressor,
132
+ encoder: TrafficDataEncoder,
133
+ df: pd.DataFrame,
134
+ batch_size: int = 256,
135
+ device: torch.device = torch.device('cpu'),
136
+ train_ratio: float = 0.7,
137
+ val_ratio: float = 0.15
138
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
139
+ """
140
+ Generate predictions for the TEST portion of a dataset in batches.
141
+ Uses the same chronological split as training to ensure we only evaluate on test data.
142
+
143
+ Returns:
144
+ predictions: (N,) - predicted values (test set only)
145
+ targets: (N,) - actual values (test set only)
146
+ target_indices: (N,) - indices of target rows in original df (test set only)
147
+ """
148
+ print("Encoding data for prediction...")
149
+ X, y, target_indices, timestamps = encoder.transform(df)
150
+
151
+ if len(X) == 0:
152
+ print("No valid sequences found in data")
153
+ return np.array([]), np.array([]), np.array([])
154
+
155
+ # Apply the same chronological split as training
156
+ print("Applying chronological split to match training...")
157
+ sorted_indices = np.argsort(timestamps)
158
+ X_sorted = X[sorted_indices]
159
+ y_sorted = y[sorted_indices]
160
+ target_indices_sorted = target_indices[sorted_indices]
161
+ timestamps_sorted = timestamps[sorted_indices]
162
+
163
+ # Calculate split points (same as training)
164
+ n_total = len(X_sorted)
165
+ n_train = int(n_total * train_ratio)
166
+ n_val = int(n_total * val_ratio)
167
+
168
+ # Get test indices
169
+ test_indices = sorted_indices[n_train + n_val:]
170
+ X_test = X[test_indices]
171
+ y_test = y[test_indices]
172
+ target_indices_test = target_indices[test_indices]
173
+
174
+ print(f"Using test set: {len(X_test):,} samples ({(1-train_ratio-val_ratio)*100:.0f}%)")
175
+ if len(X_test) > 0:
176
+ test_timestamps = pd.to_datetime(timestamps[test_indices])
177
+ print(f"Test date range: {test_timestamps.min()} to {test_timestamps.max()}")
178
+
179
+ if len(X_test) == 0:
180
+ print("No test data available")
181
+ return np.array([]), np.array([]), np.array([])
182
+
183
+ print(f"Generating predictions for {len(X_test)} test sequences...")
184
+
185
+ # Create data loader for test set only
186
+ dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())
187
+ data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
188
+
189
+ predictions = []
190
+ targets = []
191
+
192
+ model.eval()
193
+ with torch.no_grad():
194
+ for batch_X, batch_y in data_loader:
195
+ batch_X = batch_X.to(device)
196
+ batch_pred = model(batch_X).cpu().numpy()
197
+ predictions.append(batch_pred)
198
+ targets.append(batch_y.numpy())
199
+
200
+ predictions = np.concatenate(predictions, axis=0).flatten()
201
+ targets = np.concatenate(targets, axis=0).flatten()
202
+
203
+ return predictions, targets, target_indices_test
204
+
205
+
206
+ def create_evaluation_plots(
207
+ predictions: np.ndarray,
208
+ targets: np.ndarray,
209
+ save_path: Optional[str] = None
210
+ ) -> None:
211
+ """Create comprehensive evaluation plots."""
212
+ fig, axes = plt.subplots(2, 2, figsize=(15, 12))
213
+
214
+ # Scatter plot: predictions vs targets
215
+ axes[0, 0].scatter(targets, predictions, alpha=0.5, s=1)
216
+ axes[0, 0].plot([targets.min(), targets.max()], [targets.min(), targets.max()], 'r--', lw=2)
217
+ axes[0, 0].set_xlabel('Actual Speed (mph)')
218
+ axes[0, 0].set_ylabel('Predicted Speed (mph)')
219
+ axes[0, 0].set_title('Predictions vs Actual')
220
+ axes[0, 0].grid(True, alpha=0.3)
221
+
222
+ # Residuals plot
223
+ residuals = predictions - targets
224
+ axes[0, 1].scatter(targets, residuals, alpha=0.5, s=1)
225
+ axes[0, 1].axhline(y=0, color='r', linestyle='--')
226
+ axes[0, 1].set_xlabel('Actual Speed (mph)')
227
+ axes[0, 1].set_ylabel('Residuals (mph)')
228
+ axes[0, 1].set_title('Residuals vs Actual')
229
+ axes[0, 1].grid(True, alpha=0.3)
230
+
231
+ # Error distribution
232
+ axes[1, 0].hist(residuals, bins=50, alpha=0.7, edgecolor='black')
233
+ axes[1, 0].set_xlabel('Residuals (mph)')
234
+ axes[1, 0].set_ylabel('Frequency')
235
+ axes[1, 0].set_title('Error Distribution')
236
+ axes[1, 0].grid(True, alpha=0.3)
237
+
238
+ # Speed range performance
239
+ speed_ranges = {
240
+ 'Low (≤30)': targets <= 30,
241
+ 'Medium (30-60)': (targets > 30) & (targets <= 60),
242
+ 'High (≥60)': targets >= 60
243
+ }
244
+
245
+ range_maes = []
246
+ range_names = []
247
+ for name, mask in speed_ranges.items():
248
+ if np.sum(mask) > 0:
249
+ range_mae = np.mean(np.abs(predictions[mask] - targets[mask]))
250
+ range_maes.append(range_mae)
251
+ range_names.append(name)
252
+
253
+ axes[1, 1].bar(range_names, range_maes, alpha=0.7)
254
+ axes[1, 1].set_ylabel('MAE (mph)')
255
+ axes[1, 1].set_title('MAE by Speed Range')
256
+ axes[1, 1].grid(True, alpha=0.3)
257
+
258
+ plt.tight_layout()
259
+
260
+ if save_path:
261
+ plt.savefig(save_path, dpi=300, bbox_inches='tight')
262
+ print(f"Evaluation plots saved to {save_path}")
263
+ else:
264
+ plt.show()
265
+
266
+
267
+ def main():
268
+ """Main evaluation function."""
269
+ parser = argparse.ArgumentParser(description="Evaluate trained LSTM model")
270
+
271
+ # Required arguments
272
+ parser.add_argument("--csv", required=True, help="Path to CSV file with test data")
273
+ parser.add_argument("--model", required=True, help="Path to trained model (.pt file)")
274
+ parser.add_argument("--encoder", required=True, help="Path to fitted encoder (.pkl file)")
275
+
276
+ # Optional arguments
277
+ parser.add_argument("--batch_size", type=int, default=256, help="Batch size for prediction")
278
+ parser.add_argument("--train_ratio", type=float, default=0.7, help="Training data ratio (must match training)")
279
+ parser.add_argument("--val_ratio", type=float, default=0.15, help="Validation data ratio (must match training)")
280
+ parser.add_argument("--output", help="Path to save predictions CSV")
281
+ parser.add_argument("--metrics_output", help="Path to save metrics JSON")
282
+ parser.add_argument("--plots_output", help="Path to save evaluation plots")
283
+ parser.add_argument("--device", default="auto", help="Device to use (auto, cpu, cuda, mps)")
284
+
285
+ args = parser.parse_args()
286
+
287
+ # Device selection
288
+ if args.device == "auto":
289
+ if torch.backends.mps.is_available():
290
+ device = torch.device("mps")
291
+ elif torch.cuda.is_available():
292
+ device = torch.device("cuda")
293
+ else:
294
+ device = torch.device("cpu")
295
+ else:
296
+ device = torch.device(args.device)
297
+
298
+ print(f"Using device: {device}")
299
+
300
+ # Load model and encoder
301
+ model, encoder = load_model_and_encoder(args.model, args.encoder, device)
302
+
303
+ # Load test data
304
+ print(f"Loading test data from {args.csv}")
305
+ df = pd.read_csv(args.csv)
306
+ print(f"Loaded {len(df):,} rows")
307
+
308
+ # Generate predictions (using same split ratios as training)
309
+ predictions, targets, target_indices = predict_batch(
310
+ model, encoder, df, args.batch_size, device,
311
+ train_ratio=args.train_ratio, val_ratio=args.val_ratio
312
+ )
313
+
314
+ if len(predictions) == 0:
315
+ print("No predictions generated. Check your data format.")
316
+ return
317
+
318
+ # Compute metrics
319
+ print("Computing evaluation metrics...")
320
+ metrics = compute_metrics(predictions, targets)
321
+
322
+ # Print metrics
323
+ print("\n" + "="*50)
324
+ print("EVALUATION METRICS")
325
+ print("="*50)
326
+ print(f"MAE (Mean Absolute Error): {metrics['mae']:.4f} mph")
327
+ print(f"RMSE (Root Mean Square Error): {metrics['rmse']:.4f} mph")
328
+ print(f"MAPE (Mean Absolute Percentage Error): {metrics['mape']:.2f}%")
329
+ print(f"R² (Coefficient of Determination): {metrics['r2']:.4f}")
330
+
331
+ # Speed range metrics
332
+ print("\nSpeed Range Performance:")
333
+ for key, value in metrics.items():
334
+ if key.startswith('mae_') and key.endswith('_count'):
335
+ continue
336
+ elif key.startswith('mae_'):
337
+ range_name = key.replace('mae_', '').replace('_', ' ')
338
+ count_key = f"count_{key.replace('mae_', '')}"
339
+ count = metrics.get(count_key, 0)
340
+ print(f" {range_name.title()}: {value:.4f} mph (n={count})")
341
+
342
+ # Save predictions if requested
343
+ if args.output:
344
+ print(f"\nSaving predictions to {args.output}")
345
+
346
+ # Create detailed prediction DataFrame
347
+ pred_df = pd.DataFrame({
348
+ 'prediction': predictions,
349
+ 'target': targets,
350
+ 'error': predictions - targets,
351
+ 'abs_error': np.abs(predictions - targets),
352
+ 'target_index': target_indices
353
+ })
354
+
355
+ # Add original data columns if possible
356
+ if len(target_indices) > 0 and max(target_indices) < len(df):
357
+ for col in ['Time', 'Latitude', 'Longitude', 'direction', 'weather']:
358
+ if col in df.columns:
359
+ pred_df[col] = df.iloc[target_indices][col].values
360
+
361
+ pred_df.to_csv(args.output, index=False)
362
+ print(f"Predictions saved with {len(pred_df)} rows")
363
+
364
+ # Save metrics if requested
365
+ if args.metrics_output:
366
+ import json
367
+
368
+ # right before json.dump
369
+ metrics = {k: (float(v) if isinstance(v, (np.floating, np.float32, np.float64)) else int(v) if isinstance(v, (np.integer,)) else v)
370
+ for k, v in metrics.items()}
371
+
372
+ with open(args.metrics_output, 'w') as f:
373
+ json.dump(metrics, f, indent=2)
374
+
375
+
376
+ # Create and save plots if requested
377
+ if args.plots_output:
378
+ print(f"Creating evaluation plots...")
379
+ create_evaluation_plots(predictions, targets, args.plots_output)
380
+
381
+ print("\nEvaluation completed successfully!")
382
+
383
+
384
+ if __name__ == "__main__":
385
+ main()
model_v3/experiments.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🏆 BEST PERFORMANCE MODEL (Lowest MAE)
2
+ python train_lstm.py --csv test1.csv --epochs 20 --batch_size 128 --hidden_size 256 --bidirectional --loss_type weighted_huber --model_out lstm_model_v3.pt
3
+ ==================================================
4
+ EVALUATION METRICS
5
+ ==================================================
6
+ MAE (Mean Absolute Error): 4.1815 mph
7
+ RMSE (Root Mean Square Error): 7.6657 mph
8
+ MAPE (Mean Absolute Percentage Error): 8.20%
9
+ R² (Coefficient of Determination): 0.4294
10
+
11
+ Speed Range Performance:
12
+ Low ≤30: 11.5730 mph (n=1889)
13
+ Medium 30-60: 7.1722 mph (n=15717)
14
+ High ≥60: 3.2165 mph (n=63171)
15
+
16
+ Saving predictions to test_predictions.csv
17
+ Predictions saved with 80539 rows
model_v3/final_encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73fbe65e3e7acbe6bd4a31d26bd53fec9b316340343805845b0ba2f5b910ce15
3
+ size 2493
model_v3/final_lstm.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d0f959fd9a7e62507407b7c2b72e90989ee14fad7c7f287271c339764d9460b
3
+ size 8770554
model_v3/predict_road.py ADDED
@@ -0,0 +1,437 @@
1
+ """
2
+ predict_road.py – Predict traffic speeds for all sensors on a specific road and direction
3
+
4
+ This module provides functions to predict traffic speeds for all sensors on a given road
5
+ and direction at a specific time. Designed for map visualization and real-time prediction.
6
+ """
7
+
8
+ import pandas as pd
9
+ import numpy as np
10
+ import torch
11
+ from typing import List, Dict, Tuple, Optional
12
+ from datetime import datetime, timedelta
13
+ import joblib
14
+ import sys
15
+ import os
16
+
17
+ # Add current directory to path for local imports
18
+ current_dir = os.path.dirname(os.path.abspath(__file__))
19
+ sys.path.append(current_dir)
20
+
21
+ from encode import TrafficDataEncoder
22
+ from train_lstm import LSTMRegressor
23
+
24
+
25
+ class RoadPredictor:
26
+ """
27
+ Predictor for traffic speeds on specific roads and directions.
28
+
29
+ This class loads a trained model and encoder, then provides methods to predict
30
+ speeds for all sensors on a given road/direction at a specific time.
31
+ """
32
+
33
+ def __init__(self, model_path: str, encoder_path: str, device: str = "auto"):
34
+ """
35
+ Initialize the road predictor.
36
+
37
+ Args:
38
+ model_path: Path to trained model (.pt file)
39
+ encoder_path: Path to fitted encoder (.pkl file)
40
+ device: Device to use (auto, cpu, cuda, mps)
41
+ """
42
+ # Device selection
43
+ if device == "auto":
44
+ if torch.backends.mps.is_available():
45
+ self.device = torch.device("mps")
46
+ elif torch.cuda.is_available():
47
+ self.device = torch.device("cuda")
48
+ else:
49
+ self.device = torch.device("cpu")
50
+ else:
51
+ self.device = torch.device(device)
52
+
53
+ print(f"Using device: {self.device}")
54
+
55
+ # Load encoder
56
+ print(f"Loading encoder from {encoder_path}")
57
+ self.encoder = TrafficDataEncoder.load(encoder_path)
58
+
59
+ # Load model
60
+ print(f"Loading model from {model_path}")
61
+ model_state = torch.load(model_path, map_location=self.device)
62
+
63
+ # Infer model architecture from saved state
64
+ n_features = len(self.encoder.num_cols) + len(self.encoder.cat_cols)
65
+
66
+ # Infer hidden_size from first LSTM layer weights
67
+ first_layer_weight_shape = model_state['lstm.weight_ih_l0'].shape
68
+ hidden_size = first_layer_weight_shape[0] // 4
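+ # (PyTorch stacks the input/forget/cell/output gate weights in weight_ih_l0, so its first dimension is 4 * hidden_size)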
69
+
70
+ # Check if bidirectional
71
+ bidirectional = 'lstm.weight_ih_l0_reverse' in model_state
72
+
73
+ # Infer number of layers
74
+ layer_keys = [k for k in model_state.keys() if k.startswith('lstm.weight_ih_l')]
75
+ n_layers = len(set([k.split('_l')[1].split('_')[0] for k in layer_keys]))
76
+
77
+ print(f"Model architecture: hidden_size={hidden_size}, n_layers={n_layers}, bidirectional={bidirectional}")
78
+
79
+ # Create and load model
80
+ self.model = LSTMRegressor(
81
+ n_features=n_features,
82
+ hidden_size=hidden_size,
83
+ n_layers=n_layers,
84
+ dropout=0.3,
85
+ bidirectional=bidirectional
86
+ ).to(self.device)
87
+
88
+ self.model.load_state_dict(model_state)
89
+ self.model.eval()
90
+
91
+ print("Model and encoder loaded successfully")
92
+
93
+ def get_road_sensors(self, df: pd.DataFrame, road_name: str, direction: str) -> pd.DataFrame:
94
+ """
95
+ Get all sensors for a specific road and direction.
96
+
97
+ Args:
98
+ df: DataFrame with traffic data
99
+ road_name: Name of the road (e.g., "I 405")
100
+ direction: Direction (e.g., "North", "South", "East", "West")
101
+
102
+ Returns:
103
+ DataFrame with unique sensors for the road/direction
104
+ """
105
+ # Filter for the specific road and direction
106
+ road_data = df[(df['road_name'] == road_name) & (df['direction'] == direction)].copy()
107
+
108
+ if len(road_data) == 0:
109
+ raise ValueError(f"No data found for road '{road_name}' direction '{direction}'")
110
+
111
+ # Get unique sensors using the actual sensor_id from the data
112
+ sensors = road_data.groupby('sensor_id').agg({
113
+ 'Latitude': 'first',
114
+ 'Longitude': 'first',
115
+ 'road_name': 'first',
116
+ 'direction': 'first',
117
+ 'lanes': 'first'
118
+ }).reset_index()
119
+
120
+ print(f"Found {len(sensors)} sensors on {road_name} {direction}")
121
+ return sensors
122
+
123
+ def prepare_prediction_data(
124
+ self,
125
+ df: pd.DataFrame,
126
+ road_name: str,
127
+ direction: str,
128
+ target_time: datetime,
129
+ seq_len: int = 12
130
+ ) -> Tuple[pd.DataFrame, List[str]]:
131
+ """
132
+ Prepare data for prediction by getting historical sequences for each sensor.
133
+
134
+ Args:
135
+ df: DataFrame with traffic data
136
+ road_name: Name of the road
137
+ direction: Direction
138
+ target_time: Time to predict for
139
+ seq_len: Length of historical sequence needed
140
+
141
+ Returns:
142
+ Tuple of (prepared_data, sensor_ids)
143
+ """
144
+ # Get sensors for this road/direction
145
+ sensors = self.get_road_sensors(df, road_name, direction)
146
+
147
+ # Prepare data for each sensor
148
+ prepared_data = []
149
+ sensor_ids = []
150
+
151
+ for _, sensor in sensors.iterrows():
152
+ sensor_id = sensor['sensor_id']
153
+
154
+ # Get all data for this sensor
155
+ sensor_data = df[df['sensor_id'] == sensor_id].copy()
156
+ if len(sensor_data) == 0:
157
+ print(f"Warning: No data found for sensor {sensor_id}")
158
+ continue
159
+ sensor_data = sensor_data.sort_values('Time').reset_index(drop=True)
160
+
161
+ # Convert Time to datetime
162
+ sensor_data['Time'] = pd.to_datetime(sensor_data['Time'])
163
+
164
+ # Find the closest time to target_time
165
+ time_diffs = abs(sensor_data['Time'] - target_time)
166
+ if len(time_diffs) == 0:
167
+ print(f"Warning: No time data for sensor {sensor_id}")
168
+ continue
169
+ closest_idx = time_diffs.idxmin()
170
+
171
+ # Get historical sequence ending at closest time
172
+ start_idx = max(0, closest_idx - seq_len + 1)
173
+ end_idx = closest_idx + 1
174
+
175
+ if end_idx - start_idx < seq_len:
176
+ # Not enough historical data, skip this sensor
177
+ print(f"Warning: Not enough historical data for sensor {sensor_id} (need {seq_len}, have {end_idx - start_idx})")
178
+ continue
179
+
180
+ # Get the sequence
181
+ sequence_data = sensor_data.iloc[start_idx:end_idx].copy()
182
+
183
+ # Ensure we have exactly seq_len points
184
+ if len(sequence_data) > seq_len:
185
+ sequence_data = sequence_data.tail(seq_len)
186
+
187
+ # Verify we have the right number of points
188
+ if len(sequence_data) != seq_len:
189
+ print(f"Warning: Sequence length mismatch for sensor {sensor_id} (expected {seq_len}, got {len(sequence_data)})")
190
+ continue
191
+
192
+ # Add to prepared data
193
+ prepared_data.append(sequence_data)
194
+ sensor_ids.append(sensor_id)
195
+
196
+ if not prepared_data:
197
+ raise ValueError(f"No sensors with sufficient historical data for {road_name} {direction}")
198
+
199
+ # Combine all sensor data and ensure proper sorting
200
+ combined_data = pd.concat(prepared_data, ignore_index=True)
201
+
202
+ # Ensure the data is sorted by sensor_id and Time (required by encoder)
203
+ combined_data = combined_data.sort_values(['sensor_id', 'Time']).reset_index(drop=True)
204
+
205
+ # Add time features that the encoder expects
206
+ combined_data = self.encoder._add_time_features(combined_data)
207
+
208
+ print(f"Prepared data for {len(sensor_ids)} sensors")
209
+ print(f"Combined data shape: {combined_data.shape}")
210
+ print(f"Unique sensors in prepared data: {combined_data['sensor_id'].nunique()}")
211
+
212
+ return combined_data, sensor_ids
213
+
214
+ def predict_road_speeds(
215
+ self,
216
+ df: pd.DataFrame,
217
+ road_name: str,
218
+ direction: str,
219
+ target_time: datetime
220
+ ) -> pd.DataFrame:
221
+ """
222
+ Predict speeds for all sensors on a specific road and direction.
223
+
224
+ Args:
225
+ df: DataFrame with traffic data
226
+ road_name: Name of the road (e.g., "I 405")
227
+ direction: Direction (e.g., "North", "South", "East", "West")
228
+ target_time: Time to predict for
229
+
230
+ Returns:
231
+ DataFrame with predictions for each sensor
232
+ """
233
+ print(f"Predicting speeds for {road_name} {direction} at {target_time}")
234
+
235
+ # Prepare data
236
+ prepared_data, sensor_ids = self.prepare_prediction_data(
237
+ df, road_name, direction, target_time
238
+ )
239
+
240
+ # Instead of using the encoder's transform method, let's create sequences manually
241
+ # since we already have the exact sequences we want
242
+ print(f"Creating sequences manually from {len(prepared_data)} rows...")
243
+
244
+ # Group by sensor and create sequences
245
+ X_sequences = []
246
+ y_targets = []
247
+ sensor_mapping = []
248
+
249
+ for sensor_id in sensor_ids:
250
+ sensor_data = prepared_data[prepared_data['sensor_id'] == sensor_id].sort_values('Time')
251
+
252
+ if len(sensor_data) >= self.encoder.seq_len:
253
+ # Get the last seq_len points as input sequence
254
+ sequence_data = sensor_data.tail(self.encoder.seq_len)
255
+
256
+ # Prepare features
257
+ cat_features = self.encoder.ordinal_encoder.transform(sequence_data[self.encoder.cat_cols]).astype(np.float32)
258
+ num_features = self.encoder.scaler.transform(sequence_data[self.encoder.num_cols]).astype(np.float32)
259
+ features = np.concatenate([num_features, cat_features], axis=1)
260
+
261
+ X_sequences.append(features)
262
+
263
+ # For prediction, we don't have a target, so we'll use the last speed as placeholder
264
+ last_speed = sequence_data[self.encoder.target_col].iloc[-1]
265
+ y_targets.append([last_speed])
266
+
267
+ sensor_mapping.append(sensor_id)
268
+
269
+ if not X_sequences:
270
+ raise ValueError("No valid sequences found for prediction")
271
+
272
+ X = np.stack(X_sequences, axis=0)
273
+ y = np.array(y_targets)
274
+
275
+ print(f"Created {len(X_sequences)} sequences with shape {X.shape}")
276
+
277
+ # Make predictions for each sequence
278
+ predictions = []
279
+ sensor_info = []
280
+
281
+ for i, sensor_id in enumerate(sensor_mapping):
282
+ # Get the sequence for this sensor
283
+ sensor_sequence = X[i:i+1] # Keep batch dimension
284
+
285
+ # Make prediction
286
+ with torch.no_grad():
287
+ sensor_sequence_tensor = torch.from_numpy(sensor_sequence).float().to(self.device)
288
+ prediction = self.model(sensor_sequence_tensor).cpu().numpy()[0, 0]
289
+
290
+ predictions.append(prediction)
291
+
292
+ # Get sensor info
293
+ sensor_data = prepared_data[prepared_data['sensor_id'] == sensor_id].iloc[0]
294
+
295
+ # Get real speed from the most recent data point
296
+ real_speed = sensor_data.get('speed_mph', None)
297
+ if real_speed is None and 'speed' in sensor_data:
298
+ real_speed = sensor_data['speed']
299
+ elif real_speed is None and 'Speed' in sensor_data:
300
+ real_speed = sensor_data['Speed']
301
+
302
+ sensor_info.append({
303
+ 'sensor_id': sensor_id,
304
+ 'Latitude': sensor_data['Latitude'],
305
+ 'Longitude': sensor_data['Longitude'],
306
+ 'road_name': sensor_data['road_name'],
307
+ 'direction': sensor_data['direction'],
308
+ 'lanes': sensor_data['lanes'],
309
+ 'predicted_speed': prediction,
310
+ 'real_speed': real_speed,
311
+ 'target_time': target_time
312
+ })
313
+
314
+ # Create results DataFrame
315
+ results_df = pd.DataFrame(sensor_info)
316
+
317
+ print(f"Generated predictions for {len(results_df)} sensors")
318
+ print(f"Predicted speed range: {results_df['predicted_speed'].min():.1f} - {results_df['predicted_speed'].max():.1f} mph")
319
+
320
+ # Print real speed statistics if available
321
+ real_speeds = results_df['real_speed'].dropna()
322
+ if len(real_speeds) > 0:
323
+ print(f"Real speed range: {real_speeds.min():.1f} - {real_speeds.max():.1f} mph")
324
+ print(f"Real speed available for {len(real_speeds)}/{len(results_df)} sensors")
325
+ else:
326
+ print("No real speed data available")
327
+
328
+ return results_df
329
+
330
+ def predict_multiple_times(
331
+ self,
332
+ df: pd.DataFrame,
333
+ road_name: str,
334
+ direction: str,
335
+ target_times: List[datetime]
336
+ ) -> pd.DataFrame:
337
+ """
338
+ Predict speeds for multiple time points.
339
+
340
+ Args:
341
+ df: DataFrame with traffic data
342
+ road_name: Name of the road
343
+ direction: Direction
344
+ target_times: List of times to predict for
345
+
346
+ Returns:
347
+ DataFrame with predictions for all sensors at all times
348
+ """
349
+ all_predictions = []
350
+
351
+ for target_time in target_times:
352
+ try:
353
+ predictions = self.predict_road_speeds(df, road_name, direction, target_time)
354
+ all_predictions.append(predictions)
355
+ except Exception as e:
356
+ print(f"Error predicting for {target_time}: {e}")
357
+ continue
358
+
359
+ if not all_predictions:
360
+ raise ValueError("No successful predictions generated")
361
+
362
+ # Combine all predictions
363
+ combined_df = pd.concat(all_predictions, ignore_index=True)
364
+
365
+ return combined_df
366
+
367
+
368
+ def predict_road_speeds(
369
+ df: pd.DataFrame,
370
+ road_name: str,
371
+ direction: str,
372
+ target_time: datetime,
373
+ model_path: str,
374
+ encoder_path: str,
375
+ device: str = "auto"
376
+ ) -> pd.DataFrame:
377
+ """
378
+ Convenience function to predict speeds for all sensors on a road.
379
+
380
+ Args:
381
+ df: DataFrame with traffic data
382
+ road_name: Name of the road (e.g., "I 405")
383
+ direction: Direction (e.g., "North", "South", "East", "West")
384
+ target_time: Time to predict for
385
+ model_path: Path to trained model (.pt file)
386
+ encoder_path: Path to fitted encoder (.pkl file)
387
+ device: Device to use (auto, cpu, cuda, mps)
388
+
389
+ Returns:
390
+ DataFrame with predictions for each sensor
391
+ """
392
+ predictor = RoadPredictor(model_path, encoder_path, device)
393
+ return predictor.predict_road_speeds(df, road_name, direction, target_time)
394
+
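+ # Programmatic example (illustrative CSV path and timestamp; final_lstm.pt / final_encoder.pkl ship with this repo):
+ #   df = pd.read_csv("unified_data.csv")
+ #   preds = predict_road_speeds(df, "I 405", "North", datetime(2025, 1, 15, 8, 0),
+ #                               "final_lstm.pt", "final_encoder.pkl")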
395
+
396
+ def main():
397
+ """Example usage of the road predictor."""
398
+ import argparse
399
+
400
+ parser = argparse.ArgumentParser(description="Predict traffic speeds for a specific road and direction")
401
+ parser.add_argument("--csv", required=True, help="Path to CSV file with traffic data")
402
+ parser.add_argument("--model", required=True, help="Path to trained model (.pt file)")
403
+ parser.add_argument("--encoder", required=True, help="Path to fitted encoder (.pkl file)")
404
+ parser.add_argument("--road", required=True, help="Road name (e.g., 'I 405')")
405
+ parser.add_argument("--direction", required=True, help="Direction (e.g., 'North', 'South', 'East', 'West')")
406
+ parser.add_argument("--time", required=True, help="Target time (YYYY-MM-DD HH:MM:SS)")
407
+ parser.add_argument("--output", help="Path to save predictions CSV")
408
+
409
+ args = parser.parse_args()
410
+
411
+ # Load data
412
+ print(f"Loading data from {args.csv}")
413
+ df = pd.read_csv(args.csv)
414
+
415
+ # Parse target time
416
+ target_time = datetime.strptime(args.time, "%Y-%m-%d %H:%M:%S")
417
+
418
+ # Make predictions
419
+ predictions = predict_road_speeds(
420
+ df, args.road, args.direction, target_time,
421
+ args.model, args.encoder
422
+ )
423
+
424
+ # Print results
425
+ print(f"\nPredictions for {args.road} {args.direction} at {target_time}:")
426
+ print("=" * 60)
427
+ for _, row in predictions.iterrows():
428
+ print(f"Sensor {row['sensor_id']}: {row['predicted_speed']:.1f} mph")
429
+
430
+ # Save if requested
431
+ if args.output:
432
+ predictions.to_csv(args.output, index=False)
433
+ print(f"\nPredictions saved to {args.output}")
434
+
435
+
436
+ if __name__ == "__main__":
437
+ main()
model_v3/train_lstm.py ADDED
@@ -0,0 +1,498 @@
1
+ """
2
+ train_lstm.py – LSTM model training for traffic flow prediction
3
+
4
+ Features:
5
+ - LSTM model with configurable architecture
6
+ - Weighted loss for handling speed class imbalance
7
+ - Huber loss option (found to work better than plain MSE in our experiments)
8
+ - CLI interface for hyperparameter tuning
9
+ - Model and encoder saving
10
+ - Chronological train/val/test splits
11
+ """
12
+
13
+ import argparse
14
+ import numpy as np
15
+ import pandas as pd
16
+ import torch
17
+ import torch.nn as nn
18
+ from torch.utils.data import DataLoader, TensorDataset
19
+ from pathlib import Path
20
+ import joblib
21
+ from typing import Dict, Tuple, Optional
22
+
23
+ from encode import TrafficDataEncoder
24
+
25
+
26
+ # Device selection
27
+ if torch.backends.mps.is_available():
28
+ DEVICE = torch.device("mps")
29
+ elif torch.cuda.is_available():
30
+ DEVICE = torch.device("cuda")
31
+ else:
32
+ DEVICE = torch.device("cpu")
33
+
34
+ print(f"Using device: {DEVICE}")
35
+
36
+
37
+ class LSTMRegressor(nn.Module):
38
+ """LSTM model for traffic speed prediction."""
39
+
40
+ def __init__(
41
+ self,
42
+ n_features: int,
43
+ hidden_size: int = 128,
44
+ n_layers: int = 2,
45
+ dropout: float = 0.3,
46
+ bidirectional: bool = False
47
+ ):
48
+ super().__init__()
49
+
50
+ self.hidden_size = hidden_size
51
+ self.n_layers = n_layers
52
+ self.bidirectional = bidirectional
53
+
54
+ # LSTM layer
55
+ self.lstm = nn.LSTM(
56
+ input_size=n_features,
57
+ hidden_size=hidden_size,
58
+ num_layers=n_layers,
59
+ batch_first=True,
60
+ dropout=dropout if n_layers > 1 else 0,
61
+ bidirectional=bidirectional
62
+ )
63
+
64
+ # Output layer
65
+ lstm_output_size = hidden_size * (2 if bidirectional else 1)
66
+ self.head = nn.Sequential(
67
+ nn.Linear(lstm_output_size, hidden_size // 2),
68
+ nn.ReLU(),
69
+ nn.Dropout(dropout),
70
+ nn.Linear(hidden_size // 2, 1)
71
+ )
72
+
73
+ def forward(self, x):
74
+ """Forward pass through the LSTM."""
75
+ # LSTM forward pass
76
+ lstm_out, _ = self.lstm(x)
77
+
78
+ # Use the last timestep output
79
+ last_output = lstm_out[:, -1, :]
80
+
81
+ # Final prediction
82
+ prediction = self.head(last_output)
83
+ return prediction
84
+
85
+
86
+ class WeightedHuberLoss(nn.Module):
87
+ """Weighted Huber loss for handling speed class imbalance."""
88
+
89
+ def __init__(self, weight_dict: Dict[str, float], delta: float = 1.0, boost_low: float = 1.0):
90
+ super().__init__()
91
+ self.delta = delta
92
+ self.weight_low = weight_dict["weight_low"] * boost_low # Additional boost for low speeds
93
+ self.weight_medium = weight_dict["weight_medium"]
94
+ self.weight_high = weight_dict["weight_high"]
95
+ self.low_threshold = weight_dict["low_threshold"]
96
+ self.high_threshold = weight_dict["high_threshold"]
97
+
98
+ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
99
+ """Compute weighted Huber loss."""
100
+ # Ensure target is 1D
101
+ if target.dim() > 1:
102
+ target = target.squeeze()
103
+ if pred.dim() > 1:
104
+ pred = pred.squeeze()
105
+
106
+ # Compute Huber loss
107
+ diff = torch.abs(pred - target)
108
+ huber_loss = torch.where(
109
+ diff <= self.delta,
110
+ 0.5 * diff ** 2,
111
+ self.delta * (diff - 0.5 * self.delta)
112
+ )
113
+
114
+ # Compute weights based on speed classes
115
+ weights = torch.ones_like(target)
116
+ low_mask = target <= self.low_threshold
117
+ high_mask = target >= self.high_threshold
118
+ medium_mask = ~(low_mask | high_mask)
119
+
120
+ weights[low_mask] = self.weight_low
121
+ weights[medium_mask] = self.weight_medium
122
+ weights[high_mask] = self.weight_high
123
+
124
+ # Apply weights
125
+ weighted_loss = huber_loss * weights
126
+ return weighted_loss.mean()
127
+
128
+
129
+ class FocalHuberLoss(nn.Module):
130
+ """Focal loss variant for Huber loss to focus on hard examples."""
131
+
132
+ def __init__(self, weight_dict: Dict[str, float], delta: float = 1.0, alpha: float = 2.0, gamma: float = 2.0):
133
+ super().__init__()
134
+ self.delta = delta
135
+ self.alpha = alpha
136
+ self.gamma = gamma
137
+ self.weight_low = weight_dict["weight_low"]
138
+ self.weight_medium = weight_dict["weight_medium"]
139
+ self.weight_high = weight_dict["weight_high"]
140
+ self.low_threshold = weight_dict["low_threshold"]
141
+ self.high_threshold = weight_dict["high_threshold"]
142
+
143
+ def forward(self, pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
144
+ """Compute focal Huber loss."""
145
+ if target.dim() > 1:
146
+ target = target.squeeze()
147
+ if pred.dim() > 1:
148
+ pred = pred.squeeze()
149
+
150
+ # Compute Huber loss
151
+ diff = torch.abs(pred - target)
152
+ huber_loss = torch.where(
153
+ diff <= self.delta,
154
+ 0.5 * diff ** 2,
155
+ self.delta * (diff - 0.5 * self.delta)
156
+ )
157
+
158
+ # Compute focal weights (higher loss = harder example)
159
+ focal_weights = self.alpha * (huber_loss ** self.gamma)
160
+
161
+ # Apply class weights
162
+ class_weights = torch.ones_like(target)
163
+ low_mask = target <= self.low_threshold
164
+ high_mask = target >= self.high_threshold
165
+ medium_mask = ~(low_mask | high_mask)
166
+
167
+ class_weights[low_mask] = self.weight_low
168
+ class_weights[medium_mask] = self.weight_medium
169
+ class_weights[high_mask] = self.weight_high
170
+
171
+ # Combine focal and class weights
172
+ total_weights = focal_weights * class_weights
173
+ weighted_loss = huber_loss * total_weights
174
+
175
+ return weighted_loss.mean()
176
+
177
+
178
+ def create_data_loaders(
179
+ X: np.ndarray,
180
+ y: np.ndarray,
181
+ timestamps: np.ndarray,
182
+ batch_size: int,
183
+ train_ratio: float = 0.7,
184
+ val_ratio: float = 0.15
185
+ ) -> Tuple[DataLoader, DataLoader, DataLoader, np.ndarray]:
186
+ """
187
+ Create chronological train/validation/test data loaders.
188
+
189
+ Args:
190
+ X: Input sequences (N, seq_len, n_features)
191
+ y: Target values (N, horizon)
192
+ timestamps: Timestamps for each sample
193
+ batch_size: Batch size for data loaders
194
+ train_ratio: Fraction of data for training
195
+ val_ratio: Fraction of data for validation
196
+
197
+ Returns:
198
+ train_loader, val_loader, test_loader, test_indices
199
+ """
200
+ # Sort by timestamp to ensure chronological order
201
+ sorted_indices = np.argsort(timestamps)
202
+ X_sorted = X[sorted_indices]
203
+ y_sorted = y[sorted_indices]
204
+
205
+ # Calculate split points
206
+ n_total = len(X_sorted)
207
+ n_train = int(n_total * train_ratio)
208
+ n_val = int(n_total * val_ratio)
209
+
210
+ # Split indices
211
+ train_indices = sorted_indices[:n_train]
212
+ val_indices = sorted_indices[n_train:n_train + n_val]
213
+ test_indices = sorted_indices[n_train + n_val:]
214
+
215
+ # Convert timestamps to datetime for date range display
216
+ timestamps_dt = pd.to_datetime(timestamps)
217
+
218
+ print(f"Data split:")
219
+ print(f" Train: {len(train_indices):,} samples ({train_ratio*100:.0f}%)")
220
+ if len(train_indices) > 0:
221
+ train_dates = timestamps_dt[train_indices]
222
+ print(f" Date range: {train_dates.min()} to {train_dates.max()}")
223
+
224
+ print(f" Val: {len(val_indices):,} samples ({val_ratio*100:.0f}%)")
225
+ if len(val_indices) > 0:
226
+ val_dates = timestamps_dt[val_indices]
227
+ print(f" Date range: {val_dates.min()} to {val_dates.max()}")
228
+
229
+ print(f" Test: {len(test_indices):,} samples ({(1-train_ratio-val_ratio)*100:.0f}%)")
230
+ if len(test_indices) > 0:
231
+ test_dates = timestamps_dt[test_indices]
232
+ print(f" Date range: {test_dates.min()} to {test_dates.max()}")
233
+
234
+ # Create data loaders
235
+ def create_loader(indices, shuffle=False):
236
+ X_subset = torch.from_numpy(X[indices]).float()
237
+ y_subset = torch.from_numpy(y[indices]).float()
238
+ dataset = TensorDataset(X_subset, y_subset)
239
+ return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
240
+
241
+ train_loader = create_loader(train_indices, shuffle=True)
242
+ val_loader = create_loader(val_indices, shuffle=False)
243
+ test_loader = create_loader(test_indices, shuffle=False)
244
+
245
+ return train_loader, val_loader, test_loader, test_indices
246
+
247
+
248
+ def train_epoch(
249
+ model: LSTMRegressor,
250
+ train_loader: DataLoader,
251
+ optimizer: torch.optim.Optimizer,
252
+ loss_fn: nn.Module,
253
+ device: torch.device
254
+ ) -> float:
255
+ """Train the model for one epoch."""
256
+ model.train()
257
+ total_loss = 0.0
258
+ num_batches = 0
259
+
260
+ for batch_X, batch_y in train_loader:
261
+ batch_X = batch_X.to(device)
262
+ batch_y = batch_y.to(device)
263
+
264
+ # Forward pass
265
+ optimizer.zero_grad()
266
+ predictions = model(batch_X)
267
+ loss = loss_fn(predictions, batch_y)
268
+
269
+ # Backward pass
270
+ loss.backward()
271
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
272
+ optimizer.step()
273
+
274
+ total_loss += loss.item()
275
+ num_batches += 1
276
+
277
+ return total_loss / num_batches
278
+
279
+
280
+ def evaluate(
281
+ model: LSTMRegressor,
282
+ data_loader: DataLoader,
283
+ loss_fn: nn.Module,
284
+ device: torch.device
285
+ ) -> float:
286
+ """Evaluate the model on a dataset."""
287
+ model.eval()
288
+ total_loss = 0.0
289
+ num_batches = 0
290
+
291
+ with torch.no_grad():
292
+ for batch_X, batch_y in data_loader:
293
+ batch_X = batch_X.to(device)
294
+ batch_y = batch_y.to(device)
295
+
296
+ predictions = model(batch_X)
297
+ loss = loss_fn(predictions, batch_y)
298
+
299
+ total_loss += loss.item()
300
+ num_batches += 1
301
+
302
+ return total_loss / num_batches
303
+
304
+
305
+ def main():
306
+ """Main training function."""
307
+ parser = argparse.ArgumentParser(description="Train LSTM model for traffic prediction")
308
+
309
+ # Data parameters
310
+ parser.add_argument("--csv", required=True, help="Path to CSV file with traffic data")
311
+ parser.add_argument("--seq_len", type=int, default=12, help="Sequence length (default: 12)")
312
+ parser.add_argument("--horizon", type=int, default=1, help="Prediction horizon (default: 1)")
313
+ parser.add_argument("--target_col", default="speed_mph", help="Target column name")
314
+
315
+ # Model parameters
316
+ parser.add_argument("--hidden_size", type=int, default=128, help="LSTM hidden size")
317
+ parser.add_argument("--n_layers", type=int, default=2, help="Number of LSTM layers")
318
+ parser.add_argument("--dropout", type=float, default=0.3, help="Dropout rate")
319
+ parser.add_argument("--bidirectional", action="store_true", help="Use bidirectional LSTM")
320
+
321
+ # Training parameters
322
+ parser.add_argument("--epochs", type=int, default=50, help="Number of training epochs")
323
+ parser.add_argument("--batch_size", type=int, default=256, help="Batch size")
324
+ parser.add_argument("--lr", type=float, default=1e-3, help="Learning rate")
325
+ parser.add_argument("--weight_decay", type=float, default=1e-5, help="Weight decay")
326
+
327
+ # Loss parameters
328
+ parser.add_argument("--loss_type", choices=["mse", "mae", "huber", "weighted_huber", "focal_huber"],
329
+ default="weighted_huber", help="Loss function type")
330
+ parser.add_argument("--huber_delta", type=float, default=1.0, help="Huber loss delta")
331
+ parser.add_argument("--boost_low", type=float, default=1.0, help="Additional boost for low-speed loss (weighted_huber only)")
332
+ parser.add_argument("--focal_alpha", type=float, default=2.0, help="Focal loss alpha parameter")
333
+ parser.add_argument("--focal_gamma", type=float, default=2.0, help="Focal loss gamma parameter")
334
+
335
+ # Data split parameters
336
+ parser.add_argument("--train_ratio", type=float, default=0.7, help="Training data ratio")
337
+ parser.add_argument("--val_ratio", type=float, default=0.15, help="Validation data ratio")
338
+
339
+ # Output parameters
340
+ parser.add_argument("--model_out", help="Path to save the best model")
341
+ parser.add_argument("--encoder_out", help="Path to save the fitted encoder")
342
+ parser.add_argument("--pred_csv", help="Path to save test predictions")
343
+ parser.add_argument("--log_file", help="Path to save training log")
344
+
345
+ args = parser.parse_args()
346
+
347
+ # Load and encode data
348
+ print("Loading data...")
349
+ df = pd.read_csv(args.csv)
350
+ print(f"Loaded {len(df):,} rows from {args.csv}")
351
+
352
+ # Create encoder
353
+ encoder = TrafficDataEncoder(
354
+ seq_len=args.seq_len,
355
+ horizon=args.horizon,
356
+ target_col=args.target_col
357
+ )
358
+
359
+ # Fit encoder and transform data
360
+ print("Encoding data...")
361
+ X, y, target_indices, timestamps = encoder.fit_transform(df)
362
+ print(f"Encoded data shapes: X={X.shape}, y={y.shape}")
363
+
364
+ # Save encoder if requested
365
+ if args.encoder_out:
366
+ encoder.save(args.encoder_out)
367
+
368
+ # Create data loaders
369
+ print("Creating data loaders...")
370
+ train_loader, val_loader, test_loader, test_indices = create_data_loaders(
371
+ X, y, timestamps, args.batch_size, args.train_ratio, args.val_ratio
372
+ )
373
+
374
+ # Initialize model
375
+ print("Initializing model...")
376
+ model = LSTMRegressor(
377
+ n_features=X.shape[2],
378
+ hidden_size=args.hidden_size,
379
+ n_layers=args.n_layers,
380
+ dropout=args.dropout,
381
+ bidirectional=args.bidirectional
382
+ ).to(DEVICE)
383
+
384
+ print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
385
+
386
+ # Initialize optimizer
387
+ optimizer = torch.optim.Adam(
388
+ model.parameters(),
389
+ lr=args.lr,
390
+ weight_decay=args.weight_decay
391
+ )
392
+
393
+ # Initialize loss function
394
+ if args.loss_type == "weighted_huber":
395
+ # Get speed weights from encoder
396
+ weight_dict = encoder.get_speed_weights(y.flatten())
397
+ loss_fn = WeightedHuberLoss(weight_dict, args.huber_delta, args.boost_low)
398
+ print(f"Using weighted Huber loss with low-speed boost: {args.boost_low}")
399
+ elif args.loss_type == "focal_huber":
400
+ # Get speed weights from encoder
401
+ weight_dict = encoder.get_speed_weights(y.flatten())
402
+ loss_fn = FocalHuberLoss(weight_dict, args.huber_delta, args.focal_alpha, args.focal_gamma)
403
+ print(f"Using focal Huber loss (alpha={args.focal_alpha}, gamma={args.focal_gamma})")
404
+ elif args.loss_type == "huber":
405
+ loss_fn = nn.SmoothL1Loss(beta=args.huber_delta)
406
+ print("Using Huber loss")
407
+ elif args.loss_type == "mae":
408
+ loss_fn = nn.L1Loss()
409
+ print("Using MAE loss")
410
+ else: # mse
411
+ loss_fn = nn.MSELoss()
412
+ print("Using MSE loss")
413
+
414
+ # Training loop
415
+ print("Starting training...")
416
+ best_val_loss = float('inf')
417
+ best_model_state = None
418
+ train_losses = []
419
+ val_losses = []
420
+
421
+ for epoch in range(1, args.epochs + 1):
422
+ # Train
423
+ train_loss = train_epoch(model, train_loader, optimizer, loss_fn, DEVICE)
424
+
425
+ # Validate
426
+ val_loss = evaluate(model, val_loader, loss_fn, DEVICE)
427
+
428
+ train_losses.append(train_loss)
429
+ val_losses.append(val_loss)
430
+
431
+ print(f"Epoch {epoch:3d}/{args.epochs}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")
432
+
433
+ # Save best model
434
+ if val_loss < best_val_loss:
435
+ best_val_loss = val_loss
436
+ best_model_state = model.state_dict().copy()
437
+ print(f" -> New best validation loss: {best_val_loss:.4f}")
438
+
439
+ # Load best model and evaluate on test set
440
+ print("\nEvaluating on test set...")
441
+ model.load_state_dict(best_model_state)
442
+ test_loss = evaluate(model, test_loader, loss_fn, DEVICE)
443
+ print(f"Test Loss: {test_loss:.4f}")
444
+
445
+ # Save best model
446
+ if args.model_out:
447
+ torch.save(best_model_state, args.model_out)
448
+ print(f"Best model saved to {args.model_out}")
449
+
450
+ # Save predictions if requested
451
+ if args.pred_csv:
452
+ print("Generating test predictions...")
453
+ model.eval()
454
+ predictions = []
455
+ targets = []
456
+
457
+ with torch.no_grad():
458
+ for batch_X, batch_y in test_loader:
459
+ batch_X = batch_X.to(DEVICE)
460
+ batch_pred = model(batch_X).cpu().numpy()
461
+ predictions.append(batch_pred)
462
+ targets.append(batch_y.numpy())
463
+
464
+ predictions = np.concatenate(predictions, axis=0)
465
+ targets = np.concatenate(targets, axis=0)
466
+
467
+ # Create prediction DataFrame
468
+ pred_df = pd.DataFrame({
469
+ 'prediction': predictions.flatten(),
470
+ 'target': targets.flatten(),
471
+ 'error': predictions.flatten() - targets.flatten(),
472
+ 'abs_error': np.abs(predictions.flatten() - targets.flatten())
473
+ })
474
+
475
+ pred_df.to_csv(args.pred_csv, index=False)
476
+ print(f"Predictions saved to {args.pred_csv}")
477
+
478
+ # Print some statistics
479
+ mae = pred_df['abs_error'].mean()
480
+ rmse = np.sqrt((pred_df['error'] ** 2).mean())
481
+ print(f"Test MAE: {mae:.4f}")
482
+ print(f"Test RMSE: {rmse:.4f}")
483
+
484
+ # Save training log if requested
485
+ if args.log_file:
486
+ log_df = pd.DataFrame({
487
+ 'epoch': range(1, len(train_losses) + 1),
488
+ 'train_loss': train_losses,
489
+ 'val_loss': val_losses
490
+ })
491
+ log_df.to_csv(args.log_file, index=False)
492
+ print(f"Training log saved to {args.log_file}")
493
+
494
+
495
+ if __name__ == "__main__":
496
+ main()
497
+
498
+
requirements.txt ADDED
@@ -0,0 +1,33 @@
1
+ # Core Data Science
2
+ numpy==2.3.1
3
+ pandas==2.3.0
4
+ scikit-learn==1.7.2
5
+ scipy==1.16.0
6
+
7
+ # Deep Learning
8
+ torch
9
+
10
+ # Geographic Processing
11
+ geopandas==1.1.1
12
+ osmnx==2.0.4
13
+ folium==0.20.0
14
+ shapely==2.1.1
15
+ geopy==2.4.1
16
+
17
+ # Web Scraping (for data collection)
18
+ selenium
19
+
20
+ # Web Interface
21
+ streamlit==1.46.1
22
+ streamlit_folium==0.25.0
23
+
24
+ # Utilities
25
+ joblib==1.5.1
26
+ requests==2.32.4
27
+ openpyxl==3.1.5
28
+
29
+ # Visualization (for evaluation)
30
+ matplotlib
31
+
32
+ # Hugging Face
33
+ altair
roadmap/RoadMap.py ADDED
@@ -0,0 +1,513 @@
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import geopandas as gpd
5
+ import osmnx as ox
6
+ import folium
7
+ from shapely.geometry import LineString, MultiLineString, Point
8
+ from shapely import ops
9
+ from sklearn.neighbors import BallTree
10
+ from typing import Tuple, List
11
+ from .utils import get_coordinates_from_network, sort_gps_by_greedy_path, add_weather_to_df
12
+ from .mock_predictor import MockTrafficPredictor
13
+ from geopy.distance import geodesic
14
+ import re
15
+ import math
16
+ from datetime import datetime
17
+ import sys
18
+ import os
19
+ # Add parent directory to path to import model_v3
20
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
21
+ from model_v3.predict_road import RoadPredictor
22
+
23
+ # Define model paths relative to the project root
24
+ PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
25
+ MODEL_PATH = os.path.join(PROJECT_ROOT, "model_v3", "final_lstm.pt")
26
+ ENCODER_PATH = os.path.join(PROJECT_ROOT, "model_v3", "final_encoder.pkl")
27
+
28
+ # Validate that model files exist
29
+ if not os.path.exists(MODEL_PATH):
30
+ raise FileNotFoundError(f"Model file not found at: {MODEL_PATH}")
31
+ if not os.path.exists(ENCODER_PATH):
32
+ raise FileNotFoundError(f"Encoder file not found at: {ENCODER_PATH}")
33
+
34
+ DIST_THRESHOLD_METERS_MAX = 1200 #2000
35
+ DIST_THRESHOLD_METERS_MIN = 10 #10
36
+
37
+
38
+ class RoadMapManager:
39
+
40
+ def __init__(self, city: str,bbox: Tuple[float,float,float,float], base_data_dir: str = "data"):
41
+ self.city = city
42
+ self.bbox = bbox
43
+ self.base_data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", base_data_dir))
44
+ self.city_path = os.path.join(self.base_data_dir, self.city)
45
+ self.coordinates_path = os.path.join(self.city_path, 'coordinates')
46
+ self.roads_path = os.path.join(self.city_path, 'roads')
47
+ self.road_network_path = os.path.join(self.city_path, 'maps')
48
+ self.visualizations_path = os.path.join(self.city_path, 'visualizations')
49
+
50
+ self.roads = dict()
51
+
52
+ self._validate_structure()
53
+ self._load_road_network(self.bbox)
54
+
55
+
56
+ def _validate_structure(self):
57
+ for path in [self.coordinates_path, self.roads_path, self.road_network_path, self.visualizations_path]:
58
+ os.makedirs(path, exist_ok=True)
59
+
60
+ @staticmethod
61
+ def split_road_name_direction(road_name: str) -> Tuple[str, str]:
62
+ parts = road_name.split()
63
+ return " ".join(parts[:-1]), parts[-1]
64
+
65
+ def set_roads(self, roads: List[str]):
66
+ os.makedirs(self.coordinates_path, exist_ok=True)
67
+
68
+ for road in roads:
69
+ road_name, direction = self.split_road_name_direction(road)
70
+ file_name = f"{road_name} {direction}.csv"
71
+ file_path = os.path.join(self.coordinates_path,file_name)
72
+
73
+ if os.path.exists(file_path):
74
+ print(f"DataFrame for {road_name} - {direction} already exists")
75
+ df = pd.read_csv(file_path)
76
+ else:
77
+ print(f"Downloading DataFrame for {road_name} - {direction}")
78
+ df = get_coordinates_from_network(self.road_network, road_name, direction)
79
+ df.to_csv(file_path, index=False)
80
+
81
+ self.roads[(road_name,direction)] = df
82
+
83
+ def get_roads(self):
84
+ """
85
+ Used for testing
86
+ """
87
+ for (road_name, direction), df in self.roads.items():
88
+ print(f"road name: {road_name} - {direction}")
89
+ print(df.head())
90
+ print("\n" + "-"*40 + "\n")
91
+
92
+ def _load_road_network(self, bbox: Tuple[float,float,float,float]):
93
+ network_filename = f"{self.city.replace(' ', '_')}_network.graphml"
94
+ network_path = os.path.join(self.road_network_path, network_filename)
95
+
96
+ if os.path.exists(network_path):
97
+ print("map already exists")
98
+ self.road_network = ox.load_graphml(network_path)
99
+ else:
100
+ print("Downloading map")
101
+ self.road_network = ox.graph_from_bbox(
102
+ bbox=bbox,
103
+ network_type='drive'
104
+ )
105
+
106
+ self.road_network = ox.bearing.add_edge_bearings(self.road_network)
107
+
108
+ ox.save_graphml(self.road_network, filepath=network_path)
109
+
110
+ def apply_prediction_data(self, predict_time: datetime | None = None):
111
+ """
112
+ Data required for prediction:
113
+ Weather - conditions for the target day and time, gathered per point
114
+ Speed limit - from the road network
115
+ Road name, Direction - key of the roads dict
116
+ Coordinates - from the coordinates DataFrame
117
+ Lanes - from the coordinates if given, otherwise from the road network
118
+ Time - supplied by the caller
119
+ """
120
+ predictions = {}
121
+ road_predictor = RoadPredictor(MODEL_PATH, ENCODER_PATH)
122
+ for (road_name, direction) in self.roads.keys():
123
+ road_under = road_name.replace(" ", "_")
124
+ df = pd.read_csv(os.path.join(self.roads_path, f"{road_under}_{direction.lower()}.csv.gz"), compression='gzip')
125
+ predictions[(road_name, direction)] = road_predictor.predict_road_speeds(df, road_name, direction, predict_time)
126
+
127
+ # Map predictions to road coordinates
128
+ self._map_predictions_to_roads(predictions)
129
+
130
+ for (road_name, direction), df in self.roads.items():
131
+ print(df.head())
132
+ df = sort_gps_by_greedy_path(df)
133
+ self.roads[(road_name, direction)] = df
134
+
135
+ def _map_predictions_to_roads(self, predictions: dict):
136
+ """
137
+ Map predicted speeds to the closest points in self.roads coordinates.
138
+
139
+ Args:
140
+ predictions: Dictionary with (road_name, direction) keys and prediction DataFrames as values
141
+ """
142
+ from sklearn.neighbors import BallTree
143
+
144
+ for (road_name, direction), road_df in self.roads.items():
145
+ if (road_name, direction) not in predictions:
146
+ print(f"No predictions found for {road_name} {direction}")
147
+ continue
148
+
149
+ pred_df = predictions[(road_name, direction)]
150
+
151
+ if pred_df.empty:
152
+ print(f"Empty predictions for {road_name} {direction}")
153
+ continue
154
+
155
+ # Extract coordinates from road data
156
+ road_coords = road_df[['Latitude', 'Longitude']].values
157
+
158
+ # Extract coordinates from predictions
159
+ pred_coords = pred_df[['Latitude', 'Longitude']].values
160
+
161
+ # Create BallTree for efficient nearest neighbor search
162
+ # Convert to radians for haversine distance
163
+ road_coords_rad = np.radians(road_coords)
164
+ pred_coords_rad = np.radians(pred_coords)
165
+
166
+ tree = BallTree(pred_coords_rad, metric='haversine')
167
+
168
+ # Find closest prediction for each road point
169
+ distances, indices = tree.query(road_coords_rad, k=1)
170
+
171
+ # Convert distances from radians to meters (approximate)
172
+ distances_meters = distances.flatten() * 6371000 # Earth radius in meters
173
+
174
+ # Get predicted speeds for closest points
175
+ closest_pred_speeds = pred_df.iloc[indices.flatten()]['predicted_speed'].values
176
+
177
+ # Get real speeds for closest points (if available)
178
+ if 'real_speed' in pred_df.columns:
179
+ closest_real_speeds = pred_df.iloc[indices.flatten()]['real_speed'].values
180
+ road_df['real_speed'] = closest_real_speeds
181
+ else:
182
+ road_df['real_speed'] = None
183
+
184
+ # Add predicted speeds to road DataFrame
185
+ road_df['predicted_speed'] = closest_pred_speeds
186
+ road_df['prediction_distance_m'] = distances_meters
187
+
188
+ # Use predicted speed as the main speed for visualization
189
+ road_df['speed'] = road_df['predicted_speed']
190
+
191
+ # Check for points that are too far from any prediction
192
+ max_distance_threshold = 1000 # 1km threshold
193
+ far_points = distances_meters > max_distance_threshold
194
+
195
+ if far_points.any():
196
+ print(f"Warning: {far_points.sum()} points in {road_name} {direction} are >{max_distance_threshold}m from predictions")
197
+ # For points too far, use a default speed or interpolate
198
+ road_df.loc[far_points, 'predicted_speed'] = road_df.loc[~far_points, 'predicted_speed'].mean()
199
+ road_df.loc[far_points, 'speed'] = road_df.loc[~far_points, 'speed'].mean()
200
+
201
+ print(f"Mapped predictions for {road_name} {direction}: "
202
+ f"{len(road_df)} points, avg distance: {distances_meters.mean():.1f}m")
203
+
204
+ def get_prediction_statistics(self) -> dict:
205
+ """
206
+ Get statistics about the prediction mapping for all roads.
207
+
208
+ Returns:
209
+ Dictionary with statistics for each road
210
+ """
211
+ stats = {}
212
+
213
+ for (road_name, direction), road_df in self.roads.items():
214
+ if 'predicted_speed' not in road_df.columns:
215
+ continue
216
+
217
+ stats[(road_name, direction)] = {
218
+ 'total_points': len(road_df),
219
+ 'avg_predicted_speed': road_df['predicted_speed'].mean(),
220
+ 'min_predicted_speed': road_df['predicted_speed'].min(),
221
+ 'max_predicted_speed': road_df['predicted_speed'].max(),
222
+ 'avg_distance_to_prediction': road_df.get('prediction_distance_m', pd.Series([0])).mean(),
223
+ 'max_distance_to_prediction': road_df.get('prediction_distance_m', pd.Series([0])).max(),
224
+ 'points_with_predictions': road_df['predicted_speed'].notna().sum()
225
+ }
226
+
227
+ return stats
228
+
229
+ def print_prediction_summary(self):
230
+ """Print a summary of prediction statistics for all roads."""
231
+ stats = self.get_prediction_statistics()
232
+
233
+ if not stats:
234
+ print("No prediction statistics available. Run apply_prediction_data() first.")
235
+ return
236
+
237
+ print("\n" + "="*80)
238
+ print("PREDICTION MAPPING SUMMARY")
239
+ print("="*80)
240
+
241
+ for (road_name, direction), stat in stats.items():
242
+ print(f"\n{road_name} {direction}:")
243
+ print(f" Points: {stat['points_with_predictions']}/{stat['total_points']}")
244
+ print(f" Speed: {stat['avg_predicted_speed']:.1f} mph (range: {stat['min_predicted_speed']:.1f}-{stat['max_predicted_speed']:.1f})")
245
+ print(f" Avg distance to prediction: {stat['avg_distance_to_prediction']:.1f}m")
246
+ print(f" Max distance to prediction: {stat['max_distance_to_prediction']:.1f}m")
247
+
248
+
249
+
250
+ def draw_map(self):
251
+
252
+ def get_color(speed, max_speed):
253
+ if speed >= 0.85 * max_speed:
254
+ return '#00FF00' # Bright neon green
255
+ elif speed >= 0.55 * max_speed:
256
+ return '#FFA500' # Bright orange
257
+ else:
258
+ return '#FF0000' # Bright red
259
+
260
+ center_lon = (self.bbox[0] + self.bbox[2]) / 2
261
+ center_lat = (self.bbox[1] + self.bbox[3]) / 2
262
+
263
+ m = folium.Map(
264
+ location=[center_lat, center_lon],
265
+ zoom_start=13,
266
+ tiles='CartoDB dark_matter'
267
+ )
268
+
269
+ for (road_name, direction), df in self.roads.items():
270
+ for i in range(len(df) - 1):
271
+ lat1, lon1, speed1 = df.loc[i, ['Latitude', 'Longitude', 'speed']] # type: ignore
272
+ lat2, lon2, speed2 = df.loc[i+1, ['Latitude', 'Longitude', 'speed']] # type: ignore
273
+ raw_speed = df.loc[i, 'maxspeed']
274
+ match = re.search(r'\d+', str(raw_speed))
275
+ if match:
276
+ max_speed = float(match.group())
277
+ else:
278
+ max_speed = 60
279
+
280
+ dist = geodesic((lat1, lon1), (lat2, lon2)).meters
281
+ if dist > DIST_THRESHOLD_METERS_MAX or dist < DIST_THRESHOLD_METERS_MIN:
282
+ continue # Skip if too far or too close
283
+
284
+ avg_speed = (speed1 + speed2) / 2
285
+ color = get_color(avg_speed,max_speed)
286
+
287
+ folium.PolyLine(
288
+ locations=[(lat1, lon1), (lat2, lon2)],
289
+ color=color,
290
+ weight=1,
291
+ opacity=0.9
292
+ ).add_to(m)
293
+
294
+ output_path = os.path.join(self.visualizations_path, "sorted_path.html")
295
+ m.save(output_path)
296
+ print("Saved map with distance filtering to 'sorted_path.html'")
297
+
298
+
299
+
300
+
301
+ def draw_map_offset(self):
302
+
303
+ def get_color(speed, max_speed):
304
+ if speed >= 0.85 * max_speed:
305
+ return '#00FF00' # Neon green
306
+ elif speed >= 0.55 * max_speed:
307
+ return '#FFA500' # Bright orange
308
+ else:
309
+ return '#FF0000' # Bright red
310
+
311
+ def get_maxspeed(raw_speed):
312
+ match = re.search(r'\d+', str(raw_speed))
313
+ return float(match.group()) if match else 60
314
+
315
+ def apply_offset(lat, lon, bearing, direction):
316
+ """Offset lat/lon a little perpendicular to bearing, based on direction."""
317
+ offset_meters = -600 if direction.lower() in ["north", "east"] else 600
318
+
319
+ # Convert bearing to radians and rotate 90°
320
+ angle_rad = math.radians((bearing + 90) % 360)
321
+ delta_lat = offset_meters * math.cos(angle_rad) / 111111
322
+ delta_lon = offset_meters * math.sin(angle_rad) / (111111 * math.cos(math.radians(lat)))
323
+
324
+ return lat + delta_lat, lon + delta_lon
325
+
326
+ # Create dark base map
327
+ center_lon = (self.bbox[0] + self.bbox[2]) / 2
328
+ center_lat = (self.bbox[1] + self.bbox[3]) / 2
329
+
330
+ m = folium.Map(
331
+ location=[center_lat, center_lon],
332
+ zoom_start=13,
333
+ tiles='CartoDB dark_matter'
334
+ )
335
+
336
+ # Group by road name
337
+ road_groups = {}
338
+ for (road_name, direction), df in self.roads.items():
339
+ road_groups.setdefault(road_name, {})[direction] = df
340
+
341
+ for road_name, direction_map in road_groups.items():
342
+ for direction, df in direction_map.items():
343
+ for i in range(len(df) - 1):
344
+ lat1, lon1, speed1 = df.loc[i, ['Latitude', 'Longitude', 'speed']]
345
+ lat2, lon2, speed2 = df.loc[i + 1, ['Latitude', 'Longitude', 'speed']]
346
+ raw_speed = df.loc[i, 'maxspeed']
347
+ max_speed = get_maxspeed(raw_speed)
348
+ bearing = df.loc[i, 'bearing'] if 'bearing' in df.columns else 0
349
+
350
+ dist = geodesic((lat1, lon1), (lat2, lon2)).meters
351
+ if dist > DIST_THRESHOLD_METERS_MAX or dist < DIST_THRESHOLD_METERS_MIN:
352
+ continue
353
+
354
+ avg_speed = (speed1 + speed2) / 2
355
+ color = get_color(avg_speed, max_speed)
356
+
357
+ # Apply visual offset if road has both directions
358
+ has_opposite = len(direction_map) > 1
359
+ if has_opposite:
360
+ lat1, lon1 = apply_offset(lat1, lon1, bearing, direction)
361
+ lat2, lon2 = apply_offset(lat2, lon2, bearing, direction)
362
+
363
+ folium.PolyLine(
364
+ locations=[(lat1, lon1), (lat2, lon2)],
365
+ color=color,
366
+ weight=2,
367
+ opacity=0.95
368
+ ).add_to(m)
369
+
370
+ output_path = os.path.join(self.visualizations_path, "direction_offset_map.html")
371
+ m.save(output_path)
372
+ print("✅ Saved map with directional offsets to 'direction_offset_map.html'")
373
+ return m
374
+
375
+ def draw_map_with_real_speed(self):
376
+ """
377
+ Draw map using real speed data instead of predicted speed.
378
+ """
379
+ def get_color(speed, max_speed):
380
+ if speed >= 0.85 * max_speed:
381
+ return '#00FF00' # Neon green
382
+ elif speed >= 0.55 * max_speed:
383
+ return '#FFA500' # Bright orange
384
+ else:
385
+ return '#FF0000' # Bright red
386
+
387
+ def get_maxspeed(raw_speed):
388
+ match = re.search(r'\d+', str(raw_speed))
389
+ return float(match.group()) if match else 60
390
+
391
+ def apply_offset(lat, lon, bearing, direction):
392
+ """Offset lat/lon a little perpendicular to bearing, based on direction."""
393
+ offset_meters = -600 if direction.lower() in ["north", "east"] else 600
394
+
395
+ # Convert bearing to radians and rotate 90°
396
+ angle_rad = math.radians((bearing + 90) % 360)
397
+ delta_lat = offset_meters * math.cos(angle_rad) / 111111
398
+ delta_lon = offset_meters * math.sin(angle_rad) / (111111 * math.cos(math.radians(lat)))
399
+
400
+ return lat + delta_lat, lon + delta_lon
401
+
402
+ # Create dark base map
403
+ center_lon = (self.bbox[0] + self.bbox[2]) / 2
404
+ center_lat = (self.bbox[1] + self.bbox[3]) / 2
405
+
406
+ m = folium.Map(
407
+ location=[center_lat, center_lon],
408
+ zoom_start=13,
409
+ tiles='CartoDB dark_matter'
410
+ )
411
+
412
+ # Group by road name
413
+ road_groups = {}
414
+ for (road_name, direction), df in self.roads.items():
415
+ road_groups.setdefault(road_name, {})[direction] = df
416
+
417
+ for road_name, direction_map in road_groups.items():
418
+ for direction, df in direction_map.items():
419
+ for i in range(len(df) - 1):
420
+ lat1, lon1 = df.loc[i, ['Latitude', 'Longitude']]
421
+ lat2, lon2 = df.loc[i + 1, ['Latitude', 'Longitude']]
422
+
423
+ # Use real speed if available, otherwise fall back to predicted speed
424
+ if 'real_speed' in df.columns and pd.notna(df.loc[i, 'real_speed']):
425
+ speed1 = df.loc[i, 'real_speed']
426
+ speed2 = df.loc[i + 1, 'real_speed'] if i + 1 < len(df) and pd.notna(df.loc[i + 1, 'real_speed']) else speed1
427
+ else:
428
+ speed1 = df.loc[i, 'speed']
429
+ speed2 = df.loc[i + 1, 'speed']
430
+
431
+ raw_speed = df.loc[i, 'maxspeed']
432
+ max_speed = get_maxspeed(raw_speed)
433
+ bearing = df.loc[i, 'bearing'] if 'bearing' in df.columns else 0
434
+
435
+ dist = geodesic((lat1, lon1), (lat2, lon2)).meters
436
+ if dist > DIST_THRESHOLD_METERS_MAX or dist < DIST_THRESHOLD_METERS_MIN:
437
+ continue
438
+
439
+ avg_speed = (speed1 + speed2) / 2
440
+ color = get_color(avg_speed, max_speed)
441
+
442
+ # Apply visual offset if road has both directions
443
+ has_opposite = len(direction_map) > 1
444
+ if has_opposite:
445
+ lat1, lon1 = apply_offset(lat1, lon1, bearing, direction)
446
+ lat2, lon2 = apply_offset(lat2, lon2, bearing, direction)
447
+
448
+ folium.PolyLine(
449
+ locations=[(lat1, lon1), (lat2, lon2)],
450
+ color=color,
451
+ weight=2,
452
+ opacity=0.95
453
+ ).add_to(m)
454
+
455
+ output_path = os.path.join(self.visualizations_path, "real_speed_map.html")
456
+ m.save(output_path)
457
+ print("✅ Saved map with real speed data to 'real_speed_map.html'")
458
+ return m
459
+
460
+ def draw_side_by_side_maps(self):
461
+ """
462
+ Create side-by-side maps showing both predicted and real speeds.
463
+ Returns a tuple of (predicted_map, real_map) for use in Streamlit.
464
+ """
465
+ # Create predicted speed map
466
+ predicted_map = self.draw_map_offset()
467
+
468
+ # Create real speed map
469
+ real_map = self.draw_map_with_real_speed()
470
+
471
+ return predicted_map, real_map
472
+
473
+
474
+ """
475
+ mock_predictor = MockTrafficPredictor({
476
+ 'I 405 North': 'moderate',
477
+ 'I 405 South': 'free',
478
+ 'US 101 North': 'busy',
479
+ 'US 101 South': 'free',
480
+ 'I 5 North': 'busy',
481
+ 'I 5 South': 'free',
482
+ 'I 10 East': 'moderate',
483
+ 'I 10 West': 'moderate',
484
+ 'I 110 North': 'busy',
485
+ 'I 110 South': 'busy',
486
+ 'CA 110 North': 'busy',
487
+ 'CA 110 South': 'busy',
488
+ 'CA 170 North': 'moderate',
489
+ 'CA 170 South': 'free',
490
+ 'CA 118 East': 'free',
491
+ 'CA 118 West': 'free',
492
+ 'CA 134 East': 'moderate',
493
+ 'CA 134 West': 'free',
494
+ 'CA 2 North': 'moderate',
495
+ 'CA 2 South': 'moderate',
496
+ 'I 605 North': 'busy',
497
+ 'I 605': 'free',
498
+ 'I 210 East' : 'free',
499
+ 'I 210 West' : 'busy'
500
+ })
501
+
502
+ if predict_time is None:
503
+ predict_time = datetime.now()
504
+
505
+ for (road_name, direction), df in self.roads.items():
506
+ #self.roads[(road_name, direction)] = add_weather_to_df(self.roads[(road_name, direction)], time = predict_time)
507
+ print(f"Mocking for {road_name} - {direction}")
508
+ df = mock_predictor.predict(df)
509
+ print(df.head())
510
+ df = sort_gps_by_greedy_path(df)
511
+ self.roads[(road_name, direction)] = df
512
+
513
+ """
roadmap/__init__.py ADDED
File without changes
roadmap/mock_predictor.py ADDED
@@ -0,0 +1,43 @@
+ import pandas as pd
+ import numpy as np
+
+ class MockTrafficPredictor:
+     def __init__(self, road_classification_map: dict, seed: int = 42):
+         """
+         road_classification_map: dict mapping road ref plus direction (e.g., 'I 405 North')
+         to a classification ('busy', 'moderate', 'free')
+         """
+         valid_classes = {'busy', 'moderate', 'free'}
+         for cls in road_classification_map.values():
+             if cls not in valid_classes:
+                 raise ValueError(f"Invalid classification '{cls}', must be one of {valid_classes}")
+         self.road_classification_map = road_classification_map
+         self.random = np.random.default_rng(seed)
+
+         self.speed_range = {
+             'busy': (0.2, 0.5),
+             'moderate': (0.5, 0.8),
+             'free': (0.8, 1.0)
+         }
+
+     def predict(self, df: pd.DataFrame) -> pd.DataFrame:
+         df = df.copy()
+
+         if 'ref' not in df.columns:
+             raise ValueError("Input DataFrame must contain a 'ref' column with road names")
+
+         road_ref = df['ref'].iloc[0]
+         road_direction = df['direction'].iloc[0]
+         classification_name = road_ref + " " + road_direction
+         classification = self.road_classification_map.get(classification_name, 'moderate')
+         min_r, max_r = self.speed_range[classification]
+
+         # Parse the numeric part of maxspeed strings such as '65 mph'
+         df['maxspeed_numeric'] = df['maxspeed'].str.extract(r'(\d+)', expand=False).astype(float)
+
+         # Generate base values with slight spatial variation
+         base = self.random.uniform(min_r, max_r)
+         noise = self.random.normal(loc=0, scale=0.05, size=len(df))  # small Gaussian noise
+         raw_factors = np.clip(base + noise, min_r, max_r)
+
+         df['speed'] = df['maxspeed_numeric'] * raw_factors
+
+         return df.drop(columns=['maxspeed_numeric'])
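The predict method only needs 'ref', 'direction', and 'maxspeed' columns, so a minimal smoke test is easy to write. The sketch below is hypothetical: the toy coordinates and maxspeed strings are invented for illustration, and real inputs come from the road DataFrames built elsewhere in this commit.

    # Hypothetical smoke test for MockTrafficPredictor; the toy DataFrame is invented.
    import pandas as pd
    from roadmap.mock_predictor import MockTrafficPredictor

    df = pd.DataFrame({
        'Latitude': [34.05, 34.06, 34.07],
        'Longitude': [-118.25, -118.26, -118.27],
        'ref': ['I 405'] * 3,
        'direction': ['North'] * 3,
        'maxspeed': ['65 mph', '65 mph', '55 mph'],
    })

    predictor = MockTrafficPredictor({'I 405 North': 'busy'})
    out = predictor.predict(df)
    # For a 'busy' road, 'speed' is drawn from 0.2-0.5x of the parsed maxspeed.
    print(out[['ref', 'direction', 'maxspeed', 'speed']])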
roadmap/utils.py ADDED
@@ -0,0 +1,167 @@
+ from geopandas import GeoDataFrame
+ from networkx import MultiDiGraph
+ import pandas as pd
+ import numpy as np
+ import osmnx as ox
+ from shapely.geometry import LineString, MultiLineString
+ from sklearn.neighbors import BallTree
+ import requests
+ from sklearn.cluster import KMeans
+ from datetime import datetime, timedelta
+
+ def filter_by_direction(selected_road: GeoDataFrame, road_direction: str) -> GeoDataFrame:
+     if road_direction == 'North':
+         return selected_road[
+             (selected_road['bearing'] >= 270) | (selected_road['bearing'] <= 90)
+         ]
+     elif road_direction == 'South':
+         return selected_road[
+             (selected_road['bearing'] > 90) & (selected_road['bearing'] < 270)
+         ]
+     elif road_direction == 'East':
+         return selected_road[
+             (selected_road['bearing'] >= 0) & (selected_road['bearing'] <= 180)
+         ]
+     elif road_direction == 'West':
+         return selected_road[
+             (selected_road['bearing'] > 180) & (selected_road['bearing'] < 360)
+         ]
+     else:
+         raise ValueError(f"Invalid road_direction: {road_direction}. Must be one of: North, South, East, West.")
+
+ def add_weather_to_df(df: pd.DataFrame, num_clusters: int = 4, api_key: str = 'FLMEW5QEEB8WT8YGUJXF6KAPK', time: datetime | None = None) -> pd.DataFrame:
+     if df.empty:
+         df['weather'] = None
+         return df
+
+     if time is None:
+         time = datetime.now()
+
+     coords = df[['Latitude', 'Longitude']].dropna().values
+     kmeans = KMeans(n_clusters=min(num_clusters, len(coords)), random_state=42)
+     df['weather_cluster'] = kmeans.fit_predict(coords)
+
+     weather_data = {}
+     date_str = time.strftime("%Y-%m-%d")
+     target_hour = time.strftime("%H:%M:%S")
+
+     for cluster_id in range(kmeans.n_clusters):  # type: ignore
+         lat, lon = kmeans.cluster_centers_[cluster_id]
+         url = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{lat},{lon}/{date_str}"
+         params = {
+             "key": api_key,
+             "unitGroup": "metric",
+             "contentType": "json"
+         }
+
+         try:
+             response = requests.get(url=url, params=params)
+             response.raise_for_status()
+             data = response.json()
+             hours = data.get("days", [{}])[0].get("hours", [])
+
+             def hour_diff(hour_entry):
+                 try:
+                     return abs(datetime.strptime(hour_entry["datetime"], "%H:%M:%S") - datetime.strptime(target_hour, "%H:%M:%S"))
+                 except (KeyError, TypeError, ValueError):
+                     return timedelta.max
+
+             if hours:
+                 best_match = min(hours, key=hour_diff)
+                 weather = best_match.get("conditions", "Unknown")
+                 weather_time = best_match.get("datetime", None)
+             else:
+                 weather = "Unknown"
+                 weather_time = None
+
+         except Exception as e:
+             print(f"Weather API error for cluster {cluster_id}: {e}")
+             weather = "Unknown"
+             weather_time = None
+
+         weather_data[cluster_id] = {
+             "conditions": weather,
+             "datetime": weather_time
+         }
+
+     df['time'] = time
+     df['weather'] = df['weather_cluster'].map(lambda x: weather_data[x]["conditions"])
+     df['weather_time'] = df['weather_cluster'].map(lambda x: weather_data[x]["datetime"])
+     df.drop(columns=['weather_cluster'], inplace=True)
+     return df
+
+ def get_coordinates_from_network(G: MultiDiGraph, road_name: str, road_direction: str):
+
+     edges = ox.graph_to_gdfs(G, nodes=False, edges=True)
+
+     edges_motorway = edges[edges['highway'].isin(['motorway', 'motorway_link'])]
+
+     selected_road = edges_motorway[
+         edges_motorway['ref'].str.contains(road_name, na=False, case=False)
+     ]
+
+     selected_road = filter_by_direction(selected_road, road_direction)
+
+     rows = []
+
+     for _, row in selected_road.iterrows():
+         lanes = row.get("lanes", None)
+         maxspeed = row.get("maxspeed", None)
+         segment_name = row.get("name", None)  # renamed to avoid shadowing the road_name argument
+         ref = row.get("ref", None)
+         geometry = row.geometry
+
+         if isinstance(geometry, LineString):
+             coords = geometry.coords
+         elif isinstance(geometry, MultiLineString):
+             coords = [pt for line in geometry.geoms for pt in line.coords]
+         else:
+             continue
+
+         for lon, lat in coords:
+             rows.append({
+                 "Longitude": lon,
+                 "Latitude": lat,
+                 "lanes": lanes,
+                 "maxspeed": maxspeed,
+                 "road_name": segment_name,
+                 "ref": ref,
+                 "direction": road_direction
+             })
+
+     # Build the output DataFrame
+     road_df = pd.DataFrame(rows)
+     print(f"Total points in {road_name} - {road_direction}: {len(road_df)}")
+     return road_df
+
+
+ def sort_gps_by_greedy_path(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Greedy nearest-neighbor sorting of GPS coordinates.
+
+     Args:
+         df (pd.DataFrame): DataFrame with 'Latitude' and 'Longitude' columns.
+
+     Returns:
+         pd.DataFrame: Reordered DataFrame.
+     """
+     coords_rad = np.radians(df[['Latitude', 'Longitude']].values)
+     tree = BallTree(coords_rad, metric='haversine')
+
+     visited = np.zeros(len(df), dtype=bool)
+     path = []
+     current_idx = 0  # or use farthest-point-start logic
+
+     for _ in range(len(df)):
+         visited[current_idx] = True
+         path.append(current_idx)
+
+         dist, ind = tree.query([coords_rad[current_idx]], k=len(df))
+
+         for next_idx in ind[0]:
+             if not visited[next_idx]:
+                 current_idx = next_idx
+                 break
+
+     return df.iloc[path].reset_index(drop=True)
+
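As a sanity check for the greedy ordering, points lying along a line but supplied shuffled should come back in path order. The sketch below is hypothetical: the coordinates are invented, and it assumes the project's dependencies (scikit-learn, geopandas, osmnx) are importable, since roadmap.utils pulls them in at import time.

    # Hypothetical check of sort_gps_by_greedy_path on invented, shuffled points.
    import pandas as pd
    from roadmap.utils import sort_gps_by_greedy_path

    scrambled = pd.DataFrame({
        'Latitude':  [34.00, 34.04, 34.01, 34.03, 34.02],
        'Longitude': [-118.20, -118.24, -118.21, -118.23, -118.22],
    })

    ordered = sort_gps_by_greedy_path(scrambled)
    # Starting from row 0, nearest-neighbor hops should visit the points in
    # increasing latitude order for this layout.
    print(ordered)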