# Hugging Face Space: Risha15/steam-game-recommender
# Space status: Sleeping
# File size: 19,760 Bytes

import html
import io
import os
import tempfile

import numpy as np
import pandas as pd
import requests
import streamlit as st

# Page configuration.
# This runs before anything else that talks to Streamlit: set_page_config()
# is documented as having to be the first Streamlit command of the script, and
# the plotly fallback below may emit an st.warning().
st.set_page_config(
    page_title="Steam Game Recommender",
    page_icon="🎮",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Plotly is optional. When the import fails for any reason the app keeps
# running and falls back to text-based charts (see PLOTLY_AVAILABLE checks).
try:
    import plotly.express as px
    import plotly.graph_objects as go
    PLOTLY_AVAILABLE = True
except Exception as e:
    st.warning(f"Plotly import warning: {e}")
    PLOTLY_AVAILABLE = False

# Custom CSS injected once at import time. It defines the classes referenced
# by the HTML snippets emitted elsewhere in this file: .main-header (page
# title), .recommendation-card plus the .similarity-high/-medium/-low colour
# variants and .game-title (game cards), and .stat-card (sidebar counter).
st.markdown("""
<style>
    .main-header {
        font-size: 3rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .recommendation-card {
        padding: 1.5rem;
        border-radius: 10px;
        border: 1px solid #ddd;
        margin: 1rem 0;
        background-color: #f9f9f9;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .similarity-high { 
        background-color: #d4edda; 
        border-left: 5px solid #28a745;
    }
    .similarity-medium { 
        background-color: #fff3cd; 
        border-left: 5px solid #ffc107;
    }
    .similarity-low { 
        background-color: #f8d7da; 
        border-left: 5px solid #dc3545;
    }
    .game-title {
        color: #1f77b4;
        margin-bottom: 0.5rem;
    }
    .stat-card {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 1rem;
        border-radius: 10px;
        text-align: center;
    }
</style>
""", unsafe_allow_html=True)

@st.cache_data(ttl=3600)
def _download_steam_dataset():
    """Download the Steam games parquet data and return it as one DataFrame.

    This is the cached part of ``load_data``. It deliberately *raises* on any
    failure instead of returning an empty frame: Streamlit only caches values
    that are returned, so a transient network error is retried on the next
    rerun rather than being served from the cache for the whole TTL.

    Returns:
        pandas.DataFrame with the rows of every ``data/*.parquet`` file of the
        dataset repository, concatenated in sorted file-name order.

    Raises:
        FileNotFoundError: the snapshot has no ``data`` directory or it
            contains no parquet files.
        Exception: whatever ``snapshot_download`` or ``pd.read_parquet`` raise.
    """
    # Imported lazily so a missing huggingface_hub surfaces as a handled
    # error in load_data() rather than breaking module import.
    from huggingface_hub import snapshot_download

    repo_id = "FronkonGames/steam-games-dataset"

    progress_bar = st.progress(0)
    status_text = st.empty()

    status_text.text("📥 Downloading Steam games dataset from Hugging Face...")
    progress_bar.progress(20)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Download only the parquet payload of the dataset repository.
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            allow_patterns="data/*.parquet",
            local_dir=tmpdir,
            local_dir_use_symlinks=False
        )

        status_text.text("🔍 Locating data files...")
        progress_bar.progress(50)

        data_dir = os.path.join(tmpdir, "data")
        if not os.path.isdir(data_dir):
            raise FileNotFoundError("Data directory not found")

        # os.listdir() order is arbitrary; sort so the result is deterministic
        # and read every shard instead of whichever file happens to be first.
        parquet_files = sorted(
            name for name in os.listdir(data_dir) if name.endswith('.parquet')
        )
        if not parquet_files:
            raise FileNotFoundError("No parquet files found in the dataset")

        status_text.text("📊 Loading dataset into memory...")
        progress_bar.progress(80)

        # Files must be read before the temporary directory is removed.
        frames = [
            pd.read_parquet(os.path.join(data_dir, name))
            for name in parquet_files
        ]

    df = frames[0] if len(frames) == 1 else pd.concat(frames, ignore_index=True)

    progress_bar.progress(100)
    status_text.text("✅ Dataset loaded successfully!")

    st.success(f"🎉 Successfully loaded {len(df):,} Steam games!")
    return df


def load_data():
    """Load the full Steam games dataset using huggingface_hub.

    Successful downloads are cached for one hour by the helper above; failed
    attempts are reported with ``st.error`` and are not cached.

    Returns:
        The games DataFrame, or an empty DataFrame when loading failed.
    """
    try:
        st.info("🚀 Initializing dataset download...")
        return _download_steam_dataset()
    except Exception as e:
        # Top-level UI boundary: report the problem and let the caller show
        # its "unable to load" screen instead of crashing the app.
        st.error(f"❌ Error loading dataset: {str(e)}")
        return pd.DataFrame()


# Keep `load_data.clear()` working for code that relied on load_data being the
# cached function itself.
load_data.clear = _download_steam_dataset.clear

@st.cache_data
def preprocess_data(df):
    """Return a cleaned copy of the raw games table.

    Steps, each applied only to columns that are present:
      * 'Price', 'Positive', 'Negative' are coerced to numbers, with anything
        unparseable or missing replaced by 0.
      * 'Total_Reviews' and 'Rating_Score' (percentage of positive reviews,
        0 when a game has no reviews) are derived; when the review columns
        are absent 'Rating_Score' is set to a flat 50.
      * 'Genres', 'Tags', 'Categories', 'About the game', 'Name' become
        strings with missing values as ''.
      * 'Windows', 'Mac', 'Linux' become booleans with missing values False.

    An empty input frame is returned unchanged.
    """
    if df.empty:
        return df

    st.info("🔄 Preprocessing data...")

    # Work on a private copy so the caller's (possibly cached) frame is untouched.
    cleaned = df.copy()

    # Numeric columns: coerce, then zero-fill the gaps.
    for name in ('Price', 'Positive', 'Negative'):
        if name not in cleaned.columns:
            continue
        as_number = pd.to_numeric(cleaned[name], errors='coerce')
        cleaned[name] = as_number.fillna(0)

    # Review totals and approval percentage.
    has_reviews = 'Positive' in cleaned.columns and 'Negative' in cleaned.columns
    if not has_reviews:
        cleaned['Rating_Score'] = 50  # Default rating
    else:
        cleaned['Total_Reviews'] = cleaned['Positive'] + cleaned['Negative']
        approval = cleaned['Positive'] / cleaned['Total_Reviews'] * 100
        cleaned['Rating_Score'] = np.where(cleaned['Total_Reviews'] > 0, approval, 0)

    # Free-text columns: no NaN, always str.
    for name in ('Genres', 'Tags', 'Categories', 'About the game', 'Name'):
        if name not in cleaned.columns:
            continue
        cleaned[name] = cleaned[name].fillna('').astype(str)

    # Platform flags: no NaN, always bool.
    for name in ('Windows', 'Mac', 'Linux'):
        if name not in cleaned.columns:
            continue
        cleaned[name] = cleaned[name].fillna(False).astype(bool)

    st.success("✅ Data preprocessing complete!")
    return cleaned

class SteamGameRecommender:
    """Filter-and-rank recommender over a games DataFrame.

    The methods read the columns 'Genres', 'Tags', 'Price' and 'Rating_Score'
    directly, and use 'Total_Reviews', 'Windows', 'Mac' and 'Linux' only when
    they are present. The wrapped frame is never modified.
    """

    # Columns returned by recommend_by_features, in display order. Columns
    # missing from the data are silently skipped.
    _FEATURE_RESULT_COLUMNS = (
        'Name', 'Genres', 'Price', 'Rating_Score', 'Positive', 'Negative',
        'Release date', 'About the game', 'Total_Reviews',
    )
    # get_popular_games returns the same set without 'Total_Reviews'.
    _POPULAR_RESULT_COLUMNS = _FEATURE_RESULT_COLUMNS[:-1]
    # Platform flag columns a caller may ask for.
    _PLATFORM_COLUMNS = ('Windows', 'Mac', 'Linux')
    # A game needs strictly more reviews than this to count as "popular".
    _POPULAR_MIN_REVIEWS = 100

    def __init__(self, games_df):
        """Wrap *games_df*; the frame is stored by reference as ``self.df``."""
        self.df = games_df

    @staticmethod
    def _top_ranked(games, top_n, columns):
        """Return the *top_n* rows by rating (then review count), keeping only *columns*."""
        sort_keys = ['Rating_Score']
        if 'Total_Reviews' in games.columns:
            sort_keys.append('Total_Reviews')
        ranked = games.sort_values(
            sort_keys, ascending=[False] * len(sort_keys)
        ).head(top_n)
        return ranked[[col for col in columns if col in ranked.columns]]

    def recommend_by_features(self, genres='', tags='', price_max=60,
                            platforms=None, min_rating=0, min_reviews=0, top_n=10):
        """Type 1: Feature-based recommendations.

        Args:
            genres: Text that must occur (case-insensitively, as a literal
                substring) in the 'Genres' column; '' disables the filter.
            tags: Same as *genres*, applied to the 'Tags' column.
            price_max: Upper price bound (inclusive); None disables the filter.
            platforms: Iterable of 'Windows'/'Mac'/'Linux'. A game matches
                when it supports at least one requested platform whose column
                exists. None or empty disables the filter.
            min_rating: Minimum 'Rating_Score'; applied only when > 0.
            min_reviews: Minimum 'Total_Reviews'; applied only when > 0 and
                the column exists.
            top_n: Maximum number of rows to return.

        Returns:
            DataFrame of the best matches sorted by rating and review count
            (both descending), or an empty DataFrame when nothing matches.
        """
        games = self.df
        if games.empty:
            return pd.DataFrame()

        # Accumulate one boolean mask instead of copying and re-slicing the
        # whole table for every filter.
        keep = np.ones(len(games), dtype=bool)

        # regex=False: genre/tag names come straight from the data and may
        # contain regex metacharacters (e.g. "C++"), which must match literally.
        if genres:
            keep &= games['Genres'].str.contains(
                genres, case=False, na=False, regex=False
            ).to_numpy(dtype=bool)

        if tags:
            keep &= games['Tags'].str.contains(
                tags, case=False, na=False, regex=False
            ).to_numpy(dtype=bool)

        if price_max is not None:
            keep &= (games['Price'] <= price_max).to_numpy(dtype=bool)

        if platforms:
            wanted = [
                col for col in self._PLATFORM_COLUMNS
                if col in platforms and col in games.columns
            ]
            # No usable platform column means no platform filtering at all.
            if wanted:
                keep &= games[wanted].eq(True).any(axis=1).to_numpy(dtype=bool)

        if min_rating > 0:
            keep &= (games['Rating_Score'] >= min_rating).to_numpy(dtype=bool)

        if min_reviews > 0 and 'Total_Reviews' in games.columns:
            keep &= (games['Total_Reviews'] >= min_reviews).to_numpy(dtype=bool)

        matches = games[keep]
        if len(matches) == 0:
            return pd.DataFrame()

        return self._top_ranked(matches, top_n, self._FEATURE_RESULT_COLUMNS)

    def get_popular_games(self, top_n=10):
        """Get popular games based on reviews and ratings.

        Only games with more than 100 total reviews are considered when the
        'Total_Reviews' column exists; otherwise every game is a candidate.

        Returns:
            DataFrame of up to *top_n* games sorted by rating and review
            count (both descending), or an empty DataFrame.
        """
        games = self.df
        if games.empty:
            return pd.DataFrame()

        if 'Total_Reviews' in games.columns:
            games = games[games['Total_Reviews'] > self._POPULAR_MIN_REVIEWS]

        if len(games) == 0:
            return pd.DataFrame()

        return self._top_ranked(games, top_n, self._POPULAR_RESULT_COLUMNS)

def display_game_card(game, index):
    """Display a game card with consistent formatting.

    Args:
        game: Row-like object exposing ``.get(key, default)`` (a pandas
            Series row or a dict). Keys read: 'Name', 'Genres',
            'About the game', 'Price', 'Rating_Score', 'Release date',
            'Positive', 'Negative'.
        index: Zero-based position of the game in the result list; shown
            one-based in the card title.
    """
    rating = game.get('Rating_Score', 0)

    # Determine card color based on rating
    if rating >= 80:
        card_class = "similarity-high"
        rating_emoji = "🔥"
    elif rating >= 60:
        card_class = "similarity-medium"
        rating_emoji = "⭐"
    else:
        card_class = "similarity-low"
        rating_emoji = "⚠️"

    with st.container():
        # NOTE(review): each st.markdown call is presumably rendered as its
        # own element, so this opening <div> may not actually wrap the
        # columns below — confirm in the browser.
        st.markdown(f'<div class="recommendation-card {card_class}">', unsafe_allow_html=True)

        col1, col2, col3 = st.columns([3, 1, 1])

        with col1:
            # The name comes from the dataset and is rendered as raw HTML, so
            # it must be escaped to keep markup in a title from being injected.
            safe_name = html.escape(str(game.get("Name", "Unknown")))
            st.markdown(f'<h3 class="game-title">{index + 1}. {safe_name}</h3>', unsafe_allow_html=True)
            st.write(f"**Genres:** {game.get('Genres', 'N/A')}")

            # Very short descriptions (50 characters or fewer) are not shown.
            description = game.get('About the game', '')
            if description and len(description) > 50:
                # Only add an ellipsis when text was actually cut off.
                ellipsis = "..." if len(description) > 250 else ""
                st.write(f"**Description:** {description[:250]}{ellipsis}")

        with col2:
            price = game.get('Price', 0)
            st.metric("💰 Price", f"${price:.2f}" if price > 0 else "Free")
            st.metric(f"{rating_emoji} Rating", f"{rating:.1f}%")

        with col3:
            release_date = game.get('Release date', 'N/A')
            st.write(f"**Release Date:** {release_date}")

            positive = game.get('Positive', 0)
            negative = game.get('Negative', 0)
            total_reviews = positive + negative

            # Review lines are omitted for games without any reviews.
            if total_reviews > 0:
                st.write(f"**Reviews:** 👍 {int(positive):,} | 👎 {int(negative):,}")
                st.write(f"**Approval:** {(positive/total_reviews*100):.1f}%")

        st.markdown('</div>', unsafe_allow_html=True)

def display_text_chart(data, title):
    """Plain-text stand-in for a bar chart, used when plotly cannot be used.

    Writes *title* as a subheader, then one "label: count games" line for
    every (label, count) pair yielded by ``data.items()``.
    """
    st.subheader(title)
    # Lazy generator: each line is formatted right before it is written.
    rendered_lines = (
        f"**{label}:** {total:,} games" for label, total in data.items()
    )
    for line in rendered_lines:
        st.write(line)

# Only the leading rows are scanned when building the sidebar option lists and
# the genre chart, which keeps every Streamlit rerun cheap.
_SIDEBAR_SAMPLE_ROWS = 5000
_GENRE_CHART_SAMPLE_ROWS = 10000
# Upper bound on the number of entries offered in the tag select box.
_MAX_TAG_OPTIONS = 150


def _iter_tokens(values):
    """Yield the stripped comma-separated tokens (longer than one character) of each string in *values*."""
    for value in values:
        if not isinstance(value, str):
            continue
        for raw in value.split(','):
            token = raw.strip()
            if len(token) > 1:
                yield token


def _most_common_tokens(values, limit):
    """Return the *limit* most frequent tokens of *values*, sorted alphabetically."""
    counts = pd.Series(list(_iter_tokens(values)), dtype=object).value_counts()
    return sorted(counts.head(limit).index)


def main():
    """Render the Streamlit page: load data, build sidebar filters, show results.

    The last computed recommendation table is kept in
    ``st.session_state.last_recommendations`` so it survives reruns triggered
    by other widgets. Until a search was run, dataset-wide statistics and a
    genre chart are shown instead.
    """
    # Header
    st.markdown('<h1 class="main-header">🎮 Steam Game Recommendation System</h1>',
                unsafe_allow_html=True)
    st.markdown("### Type 1: Feature-Based Recommendations | Full Dataset (80,000+ Games)")

    # Initialize session state
    if 'last_recommendations' not in st.session_state:
        st.session_state.last_recommendations = None

    # Load data
    df = load_data()

    if df.empty:
        st.error("""
        ❌ Unable to load the dataset. This might be due to:
        - Network connectivity issues
        - Hugging Face API limitations
        - Dataset availability
        
        Please try refreshing the page or check back later.
        """)
        return

    # Preprocess data
    df = preprocess_data(df)
    if df.empty:
        return

    recommender = SteamGameRecommender(df)

    # Sidebar for filters
    st.sidebar.title("🔍 Filter Games")
    st.sidebar.markdown(f"<div class='stat-card'><h3>📊 Loaded</h3><h2>{len(df):,}</h2><p>Games</p></div>",
                       unsafe_allow_html=True)

    # Genre selection
    st.sidebar.subheader("🎯 Genres & Tags")
    if 'Genres' in df.columns:
        all_genres = sorted(set(_iter_tokens(df['Genres'].head(_SIDEBAR_SAMPLE_ROWS))))
        selected_genre = st.sidebar.selectbox("Select Genre", [""] + all_genres)
    else:
        selected_genre = ""

    # Tag selection: offer the most common tags rather than the alphabetically
    # first ones, so popular tags late in the alphabet remain selectable.
    if 'Tags' in df.columns:
        all_tags = _most_common_tokens(
            df['Tags'].head(_SIDEBAR_SAMPLE_ROWS), _MAX_TAG_OPTIONS
        )
        selected_tag = st.sidebar.selectbox("Select Tag (Optional)", [""] + all_tags)
    else:
        selected_tag = ""

    # Price and rating filters
    st.sidebar.subheader("💰 Price & Rating")
    max_price = st.sidebar.slider("Maximum Price ($)", 0, 100, 60)
    min_rating = st.sidebar.slider("Minimum Rating (%)", 0, 100, 70)
    min_reviews = st.sidebar.slider("Minimum Reviews", 0, 1000, 10)

    # Platforms
    st.sidebar.subheader("🖥️ Platforms")
    windows = st.sidebar.checkbox("Windows", value=True)
    mac = st.sidebar.checkbox("Mac")
    linux = st.sidebar.checkbox("Linux")

    platform_choices = (('Windows', windows), ('Mac', mac), ('Linux', linux))
    platforms = [name for name, checked in platform_choices if checked]

    # Number of recommendations
    st.sidebar.subheader("📋 Results")
    num_recommendations = st.sidebar.slider("Number of Recommendations", 5, 50, 15)

    # Recommendation buttons
    col1, col2 = st.sidebar.columns(2)

    with col1:
        if st.button("🎯 Get Recommendations", type="primary", use_container_width=True):
            with st.spinner(f'🔍 Searching through {len(df):,} games...'):
                recommendations = recommender.recommend_by_features(
                    genres=selected_genre,
                    tags=selected_tag,
                    price_max=max_price,
                    platforms=platforms,
                    min_rating=min_rating,
                    min_reviews=min_reviews,
                    top_n=num_recommendations
                )
                st.session_state.last_recommendations = recommendations

    with col2:
        if st.button("🔥 Popular Games", use_container_width=True):
            with st.spinner('Finding popular games...'):
                recommendations = recommender.get_popular_games(top_n=num_recommendations)
                st.session_state.last_recommendations = recommendations

    # Display recommendations
    if st.session_state.last_recommendations is not None:
        recommendations = st.session_state.last_recommendations

        if len(recommendations) > 0:
            st.success(f"🎉 Found {len(recommendations)} games matching your criteria!")

            for idx, (_, game) in enumerate(recommendations.iterrows()):
                display_game_card(game, idx)

            # Statistics
            st.subheader("📊 Recommendation Statistics")
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                avg_price = recommendations['Price'].mean()
                st.metric("Average Price", f"${avg_price:.2f}")
            with col2:
                avg_rating = recommendations['Rating_Score'].mean()
                st.metric("Average Rating", f"{avg_rating:.1f}%")
            with col3:
                total_positive = recommendations.get('Positive', pd.Series([0])).sum()
                st.metric("Total 👍 Reviews", f"{int(total_positive):,}")
            with col4:
                st.metric("Games Found", len(recommendations))

        else:
            st.warning("No games found matching your criteria. Try adjusting your filters.")

    # Main area when no search is performed
    else:
        st.info("""
        👋 **Welcome to the Steam Game Recommender!**
        
        Use the sidebar filters to find your perfect games from our database of **80,000+ games**.
        """)

        # Dataset statistics (df is known to be non-empty at this point)
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            total_games = len(df)
            st.metric("Total Games", f"{total_games:,}")

        with col2:
            avg_price = df['Price'].mean()
            st.metric("Average Price", f"${avg_price:.2f}")

        with col3:
            # Games without reviews have a score of 0 and would drag the mean down.
            rated_games = df[df['Rating_Score'] > 0]
            avg_rating = rated_games['Rating_Score'].mean() if len(rated_games) > 0 else 0
            st.metric("Average Rating", f"{avg_rating:.1f}%")

        with col4:
            free_games = len(df[df['Price'] == 0])
            st.metric("Free Games", f"{free_games:,}")

        # Popular genres display
        st.subheader("🎯 Most Popular Genres")
        if 'Genres' in df.columns:
            # Tokens are stripped before counting so "Indie" and " Indie"
            # land in the same bucket, matching the sidebar's genre parsing.
            genre_counts = pd.Series(
                list(_iter_tokens(df['Genres'].head(_GENRE_CHART_SAMPLE_ROWS))),
                dtype=object,
            ).value_counts().head(15)

            if len(genre_counts) > 0:
                if PLOTLY_AVAILABLE:
                    try:
                        fig_genres = px.bar(
                            x=genre_counts.values,
                            y=genre_counts.index,
                            orientation='h',
                            title='Top 15 Game Genres in Database',
                            labels={'x': 'Number of Games', 'y': 'Genre'},
                            color=genre_counts.values,
                            color_continuous_scale='viridis'
                        )
                        fig_genres.update_layout(showlegend=False)
                        st.plotly_chart(fig_genres, use_container_width=True)
                    except Exception as e:
                        # Charting is best-effort: fall back to the text listing.
                        st.warning(f"Plotly chart error: {e}")
                        display_text_chart(genre_counts.head(10), "Top 10 Genres")
                else:
                    display_text_chart(genre_counts.head(10), "Top 10 Genres")

# Script entry point: build the page only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()