Spaces:

Risha15
/

steam-game-recommender

Sleeping

App Files Files Community

steam-game-recommender / app.py

Risha15

Update app.py

cbbf6b8 verified 11 days ago

raw

history blame contribute delete

19.8 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import requests
	import io
	import tempfile
	import os

	# Page configuration.
	# Kept ahead of the optional plotly import below: that import's failure path
	# calls st.warning(), and Streamlit has historically required
	# set_page_config() to be the first Streamlit command of a script run.
	st.set_page_config(
	    page_title="Steam Game Recommender",
	    page_icon="🎮",
	    layout="wide",
	    initial_sidebar_state="expanded",
	)

	# Plotly is optional: when it cannot be imported the app falls back to
	# text-only charts (see PLOTLY_AVAILABLE) instead of failing at start-up.
	try:
	    import plotly.express as px
	    import plotly.graph_objects as go
	    PLOTLY_AVAILABLE = True
	except Exception as e:
	    # Deliberately broad: any import-time failure should only disable charts.
	    st.warning(f"Plotly import warning: {e}")
	    PLOTLY_AVAILABLE = False

	# Custom CSS
	# The stylesheet is kept in a named constant so the injection call below
	# reads as a single statement; the CSS text itself is unchanged.
	_CUSTOM_CSS = """
	<style>
	.main-header {
	font-size: 3rem;
	color: #1f77b4;
	text-align: center;
	margin-bottom: 2rem;
	}
	.recommendation-card {
	padding: 1.5rem;
	border-radius: 10px;
	border: 1px solid #ddd;
	margin: 1rem 0;
	background-color: #f9f9f9;
	box-shadow: 0 2px 4px rgba(0,0,0,0.1);
	}
	.similarity-high {
	background-color: #d4edda;
	border-left: 5px solid #28a745;
	}
	.similarity-medium {
	background-color: #fff3cd;
	border-left: 5px solid #ffc107;
	}
	.similarity-low {
	background-color: #f8d7da;
	border-left: 5px solid #dc3545;
	}
	.game-title {
	color: #1f77b4;
	margin-bottom: 0.5rem;
	}
	.stat-card {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: white;
	padding: 1rem;
	border-radius: 10px;
	text-align: center;
	}
	</style>
	"""
	st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

	class _DatasetLayoutError(Exception):
	    """The downloaded snapshot does not contain the expected parquet data."""


	@st.cache_data(ttl=3600)
	def _download_dataset():
	    """Download the dataset snapshot and read its parquet file into a DataFrame.

	    Failures are raised instead of being turned into an empty DataFrame:
	    st.cache_data stores return values only, so raising keeps a failed
	    attempt out of the cache and lets the next rerun try again.

	    Raises:
	        _DatasetLayoutError: The snapshot has no ``data`` directory, or that
	            directory holds no ``.parquet`` file.
	        Exception: Whatever the download or ``pd.read_parquet`` raises.
	    """
	    st.info("🚀 Initializing dataset download...")

	    # Imported here so that a missing huggingface_hub surfaces as a load
	    # error reported by load_data() rather than breaking the module import.
	    from huggingface_hub import snapshot_download

	    repo_id = "FronkonGames/steam-games-dataset"

	    progress_bar = st.progress(0)
	    status_text = st.empty()

	    status_text.text("📥 Downloading Steam games dataset from Hugging Face...")
	    progress_bar.progress(20)

	    # The frame is fully read into memory before the temporary directory
	    # (and the downloaded files in it) is removed.
	    with tempfile.TemporaryDirectory() as tmpdir:
	        snapshot_download(
	            repo_id=repo_id,
	            repo_type="dataset",
	            allow_patterns="data/*.parquet",
	            local_dir=tmpdir,
	            local_dir_use_symlinks=False
	        )

	        status_text.text("🔍 Locating data files...")
	        progress_bar.progress(50)

	        data_dir = os.path.join(tmpdir, "data")
	        if not os.path.exists(data_dir):
	            raise _DatasetLayoutError("Data directory not found")

	        # os.listdir() order is arbitrary; sorting makes the choice of file
	        # deterministic when more than one parquet file is present.
	        parquet_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.parquet'))
	        if not parquet_files:
	            raise _DatasetLayoutError("No parquet files found in the dataset")

	        status_text.text("📊 Loading dataset into memory...")
	        progress_bar.progress(80)

	        df = pd.read_parquet(os.path.join(data_dir, parquet_files[0]))

	        progress_bar.progress(100)
	        status_text.text("✅ Dataset loaded successfully!")

	    st.success(f"🎉 Successfully loaded {len(df):,} Steam games!")
	    return df


	def load_data():
	    """Load the full Steam games dataset using huggingface_hub.

	    Successful loads are cached for one hour by the helper above; failed
	    loads are reported with st.error and are not cached.

	    Returns:
	        The dataset as a DataFrame, or an empty DataFrame when loading failed.
	    """
	    try:
	        return _download_dataset()
	    except _DatasetLayoutError as e:
	        st.error(f"❌ {e}")
	    except Exception as e:
	        # Top-level boundary: any failure becomes a visible error message
	        # plus an empty result instead of crashing the app.
	        st.error(f"❌ Error loading dataset: {str(e)}")
	    return pd.DataFrame()


	# Keep `load_data.clear()` working for callers that reset the cache.
	load_data.clear = _download_dataset.clear

	@st.cache_data
	def preprocess_data(df):
	    """Return a cleaned copy of the raw games table, ready for filtering.

	    An empty input is returned as-is. Otherwise, on a copy of ``df``:
	    'Price', 'Positive' and 'Negative' are coerced to numbers (unparseable
	    or missing values become 0); 'Total_Reviews' and a percentage
	    'Rating_Score' are derived when both review columns exist (else
	    'Rating_Score' is a flat 50); text columns become strings with missing
	    values as ''; platform columns become booleans with missing as False.
	    Columns that are absent are skipped.
	    """
	    if df.empty:
	        return df

	    st.info("🔄 Preprocessing data...")

	    # Work on a copy so the cached input frame is never mutated.
	    cleaned = df.copy()

	    # Numeric columns: anything non-numeric or missing counts as zero.
	    for name in ('Price', 'Positive', 'Negative'):
	        if name not in cleaned.columns:
	            continue
	        as_number = pd.to_numeric(cleaned[name], errors='coerce')
	        cleaned[name] = as_number.fillna(0)

	    # Rating: share of positive reviews, in percent; 0 when there are no reviews.
	    if {'Positive', 'Negative'}.issubset(cleaned.columns):
	        cleaned['Total_Reviews'] = cleaned['Positive'] + cleaned['Negative']
	        has_reviews = cleaned['Total_Reviews'] > 0
	        positive_share = cleaned['Positive'] / cleaned['Total_Reviews'] * 100
	        cleaned['Rating_Score'] = np.where(has_reviews, positive_share, 0)
	    else:
	        # Default rating
	        cleaned['Rating_Score'] = 50

	    # Text columns: missing -> '', everything forced to str.
	    for name in ('Genres', 'Tags', 'Categories', 'About the game', 'Name'):
	        if name not in cleaned.columns:
	            continue
	        cleaned[name] = cleaned[name].fillna('').astype(str)

	    # Platform flags: missing -> False, everything forced to bool.
	    for name in ('Windows', 'Mac', 'Linux'):
	        if name not in cleaned.columns:
	            continue
	        cleaned[name] = cleaned[name].fillna(False).astype(bool)

	    st.success("✅ Data preprocessing complete!")
	    return cleaned

	class SteamGameRecommender:
	    """Filter-and-rank recommender over a table of Steam games.

	    The table must provide a 'Rating_Score' column, plus the column behind
	    every filter that is actually used ('Genres', 'Tags', 'Price').
	    'Total_Reviews' and the platform columns are optional: filters that
	    depend on them are skipped when the column is absent.
	    """

	    # Platform columns that the `platforms` filter understands.
	    _PLATFORMS = ('Windows', 'Mac', 'Linux')

	    # Columns returned to callers, in order; absent ones are left out.
	    _POPULAR_COLUMNS = (
	        'Name', 'Genres', 'Price', 'Rating_Score', 'Positive', 'Negative',
	        'Release date', 'About the game',
	    )
	    _FEATURE_COLUMNS = _POPULAR_COLUMNS + ('Total_Reviews',)

	    def __init__(self, games_df):
	        self.df = games_df

	    @staticmethod
	    def _top_rated(games, top_n, columns):
	        """Return the `top_n` best rows of `games`, restricted to `columns`.

	        Rows are ordered by 'Rating_Score' descending, with 'Total_Reviews'
	        descending as tie-breaker when that column exists.
	        """
	        sort_keys = ['Rating_Score']
	        if 'Total_Reviews' in games.columns:
	            sort_keys.append('Total_Reviews')
	        ranked = games.sort_values(
	            sort_keys, ascending=[False] * len(sort_keys)
	        ).head(top_n)
	        return ranked[[col for col in columns if col in ranked.columns]]

	    def recommend_by_features(self, genres='', tags='', price_max=60,
	                              platforms=None, min_rating=0, min_reviews=0, top_n=10):
	        """Type 1: Feature-based recommendations.

	        Args:
	            genres: Text that must occur in 'Genres' (case-insensitive,
	                literal substring); empty disables the filter.
	            tags: Same as `genres`, applied to 'Tags'.
	            price_max: Highest accepted 'Price'; None disables the filter.
	            platforms: Iterable of platform names out of 'Windows', 'Mac',
	                'Linux'; a game passes when it supports at least one of
	                them. Empty/None disables the filter.
	            min_rating: Lowest accepted 'Rating_Score'; 0 disables the filter.
	            min_reviews: Lowest accepted 'Total_Reviews'; 0 disables the filter.
	            top_n: Maximum number of rows returned.

	        Returns:
	            The best-rated matching games (new DataFrame), or an empty
	            column-less DataFrame when the table is empty or nothing matches.
	        """
	        games = self.df
	        if games.empty:
	            return pd.DataFrame()

	        # No up-front copy: every filter below builds a new frame and never
	        # writes into self.df.

	        # regex=False: the search text is a plain label, so regex
	        # metacharacters in it (".", "+", "(") must match literally.
	        if genres:
	            games = games[
	                games['Genres'].str.contains(genres, case=False, na=False, regex=False)
	            ]

	        if tags:
	            games = games[
	                games['Tags'].str.contains(tags, case=False, na=False, regex=False)
	            ]

	        if price_max is not None:
	            games = games[games['Price'] <= price_max]

	        if platforms:
	            # OR together the flags of every requested platform that has a column.
	            mask = None
	            for platform in self._PLATFORMS:
	                if platform in platforms and platform in games.columns:
	                    supported = games[platform] == True  # noqa: E712 (element-wise)
	                    mask = supported if mask is None else mask | supported
	            if mask is not None:
	                games = games[mask]

	        if min_rating > 0:
	            games = games[games['Rating_Score'] >= min_rating]

	        if min_reviews > 0 and 'Total_Reviews' in games.columns:
	            games = games[games['Total_Reviews'] >= min_reviews]

	        if len(games) == 0:
	            return pd.DataFrame()
	        return self._top_rated(games, top_n, self._FEATURE_COLUMNS)

	    def get_popular_games(self, top_n=10):
	        """Get popular games based on reviews and ratings.

	        When 'Total_Reviews' exists only games with more than 100 reviews are
	        considered. Returns at most `top_n` rows, best rated first, or an
	        empty column-less DataFrame when no game qualifies.
	        """
	        games = self.df
	        if games.empty:
	            return pd.DataFrame()

	        if 'Total_Reviews' in games.columns:
	            games = games[games['Total_Reviews'] > 100]

	        if len(games) == 0:
	            return pd.DataFrame()
	        return self._top_rated(games, top_n, self._POPULAR_COLUMNS)

	def display_game_card(game, index):
	    """Display a game card with consistent formatting.

	    Args:
	        game: Row-like object offering ``.get(key, default)``; the keys read
	            are 'Rating_Score', 'Name', 'Genres', 'About the game', 'Price',
	            'Release date', 'Positive' and 'Negative'.
	        index: Zero-based position of the game; shown to the user as index + 1.
	    """
	    # Function-scope import: html is only needed for escaping the title below.
	    import html

	    rating = game.get('Rating_Score', 0)

	    # Determine card color based on rating
	    if rating >= 80:
	        card_class = "similarity-high"
	        rating_emoji = "🔥"
	    elif rating >= 60:
	        card_class = "similarity-medium"
	        rating_emoji = "⭐"
	    else:
	        card_class = "similarity-low"
	        rating_emoji = "⚠️"

	    with st.container():
	        # NOTE(review): the opening and closing <div> are emitted by separate
	        # st.markdown calls; confirm the card styling really wraps the
	        # widgets rendered in between.
	        st.markdown(f'<div class="recommendation-card {card_class}">', unsafe_allow_html=True)

	        col1, col2, col3 = st.columns([3, 1, 1])

	        with col1:
	            # The name comes from the dataset and is placed inside raw HTML,
	            # so it is escaped to keep markup in a title from being rendered.
	            safe_name = html.escape(str(game.get("Name", "Unknown")))
	            st.markdown(f'<h3 class="game-title">{index + 1}. {safe_name}</h3>', unsafe_allow_html=True)
	            st.write(f"Genres: {game.get('Genres', 'N/A')}")

	            # Only string descriptions longer than 50 characters are shown;
	            # the ellipsis is added only when text was actually cut off.
	            description = game.get('About the game', '')
	            if isinstance(description, str) and len(description) > 50:
	                excerpt = description[:250]
	                if len(description) > 250:
	                    excerpt += "..."
	                st.write(f"Description: {excerpt}")

	        with col2:
	            price = game.get('Price', 0)
	            st.metric("💰 Price", f"${price:.2f}" if price > 0 else "Free")
	            st.metric(f"{rating_emoji} Rating", f"{rating:.1f}%")

	        with col3:
	            release_date = game.get('Release date', 'N/A')
	            st.write(f"Release Date: {release_date}")

	            positive = game.get('Positive', 0)
	            negative = game.get('Negative', 0)
	            total_reviews = positive + negative

	            # Review figures are shown only when there is at least one
	            # review, which also avoids dividing by zero.
	            if total_reviews > 0:
	                st.write(f"Reviews: 👍 {int(positive):,} | 👎 {int(negative):,}")
	                st.write(f"Approval: {(positive/total_reviews*100):.1f}%")

	        st.markdown('</div>', unsafe_allow_html=True)

	def display_text_chart(data, title):
	    """Text fallback for a bar chart, used when plotly is not available.

	    Writes `title` as a subheader, then one "<label>: <count> games" line
	    for each (label, count) pair yielded by ``data.items()``.
	    """
	    st.subheader(title)
	    for label, total in data.items():
	        line = f"{label}: {total:,} games"
	        st.write(line)

	def _split_labels(entries):
	    """Split comma-separated strings into a flat list of stripped labels.

	    Non-string entries are skipped, as are labels that are one character or
	    shorter after stripping.
	    """
	    return [
	        label
	        for entry in entries
	        if isinstance(entry, str)
	        for label in map(str.strip, entry.split(','))
	        if len(label) > 1
	    ]


	def main():
	    """Streamlit entry point.

	    Loads and preprocesses the dataset, builds the sidebar filters, runs a
	    search when one of the two buttons is pressed (the result is kept in
	    ``st.session_state.last_recommendations`` so it survives reruns), and
	    renders either the stored results with summary metrics or, before any
	    search, a welcome page with dataset statistics.
	    """
	    # Only the first rows are scanned when building the selectable lists and
	    # the genre chart, which keeps each rerun cheap.
	    filter_sample_rows = 5000
	    chart_sample_rows = 10000
	    max_tag_options = 150

	    # Header
	    st.markdown('<h1 class="main-header">🎮 Steam Game Recommendation System</h1>',
	                unsafe_allow_html=True)
	    st.markdown("### Type 1: Feature-Based Recommendations | Full Dataset (80,000+ Games)")

	    # Initialize session state
	    if 'last_recommendations' not in st.session_state:
	        st.session_state.last_recommendations = None

	    # Load data
	    df = load_data()

	    if df.empty:
	        st.error("""
	❌ Unable to load the dataset. This might be due to:
	- Network connectivity issues
	- Hugging Face API limitations
	- Dataset availability

	Please try refreshing the page or check back later.
	""")
	        return

	    # Preprocess data
	    df = preprocess_data(df)
	    recommender = SteamGameRecommender(df)

	    if df.empty:
	        return

	    # Sidebar for filters
	    st.sidebar.title("🔍 Filter Games")
	    st.sidebar.markdown(f"<div class='stat-card'><h3>📊 Loaded</h3><h2>{len(df):,}</h2><p>Games</p></div>",
	                        unsafe_allow_html=True)

	    # Genre selection
	    st.sidebar.subheader("🎯 Genres & Tags")
	    if 'Genres' in df.columns:
	        all_genres = sorted(set(_split_labels(df['Genres'].head(filter_sample_rows))))
	        selected_genre = st.sidebar.selectbox("Select Genre", [""] + all_genres)
	    else:
	        selected_genre = ""

	    # Tag selection: offer the most frequent tags (listed alphabetically).
	    # Cutting the alphabetically sorted list instead would hide every tag
	    # that sorts after the first `max_tag_options` names.
	    if 'Tags' in df.columns:
	        tag_counts = pd.Series(
	            _split_labels(df['Tags'].head(filter_sample_rows)), dtype=object
	        ).value_counts()
	        all_tags = sorted(tag_counts.head(max_tag_options).index)
	        selected_tag = st.sidebar.selectbox("Select Tag (Optional)", [""] + all_tags)
	    else:
	        selected_tag = ""

	    # Price and rating filters
	    st.sidebar.subheader("💰 Price & Rating")
	    max_price = st.sidebar.slider("Maximum Price ($)", 0, 100, 60)
	    min_rating = st.sidebar.slider("Minimum Rating (%)", 0, 100, 70)
	    min_reviews = st.sidebar.slider("Minimum Reviews", 0, 1000, 10)

	    # Platforms
	    st.sidebar.subheader("🖥️ Platforms")
	    windows = st.sidebar.checkbox("Windows", value=True)
	    mac = st.sidebar.checkbox("Mac")
	    linux = st.sidebar.checkbox("Linux")

	    platform_choices = (('Windows', windows), ('Mac', mac), ('Linux', linux))
	    platforms = [name for name, checked in platform_choices if checked]

	    # Number of recommendations
	    st.sidebar.subheader("📋 Results")
	    num_recommendations = st.sidebar.slider("Number of Recommendations", 5, 50, 15)

	    # Recommendation buttons
	    col1, col2 = st.sidebar.columns(2)

	    with col1:
	        if st.button("🎯 Get Recommendations", type="primary", use_container_width=True):
	            with st.spinner(f'🔍 Searching through {len(df):,} games...'):
	                recommendations = recommender.recommend_by_features(
	                    genres=selected_genre,
	                    tags=selected_tag,
	                    price_max=max_price,
	                    platforms=platforms,
	                    min_rating=min_rating,
	                    min_reviews=min_reviews,
	                    top_n=num_recommendations
	                )
	                st.session_state.last_recommendations = recommendations

	    with col2:
	        if st.button("🔥 Popular Games", use_container_width=True):
	            with st.spinner('Finding popular games...'):
	                recommendations = recommender.get_popular_games(top_n=num_recommendations)
	                st.session_state.last_recommendations = recommendations

	    # Display recommendations
	    if st.session_state.last_recommendations is not None:
	        recommendations = st.session_state.last_recommendations

	        if len(recommendations) > 0:
	            st.success(f"🎉 Found {len(recommendations)} games matching your criteria!")

	            for idx, (_, game) in enumerate(recommendations.iterrows()):
	                display_game_card(game, idx)

	            # Statistics
	            st.subheader("📊 Recommendation Statistics")
	            col1, col2, col3, col4 = st.columns(4)

	            with col1:
	                avg_price = recommendations['Price'].mean()
	                st.metric("Average Price", f"${avg_price:.2f}")
	            with col2:
	                avg_rating = recommendations['Rating_Score'].mean()
	                st.metric("Average Rating", f"{avg_rating:.1f}%")
	            with col3:
	                total_positive = recommendations.get('Positive', pd.Series([0])).sum()
	                st.metric("Total 👍 Reviews", f"{int(total_positive):,}")
	            with col4:
	                st.metric("Games Found", len(recommendations))

	        else:
	            st.warning("No games found matching your criteria. Try adjusting your filters.")

	    # Main area when no search is performed
	    else:
	        st.info("""
	👋 Welcome to the Steam Game Recommender!

	Use the sidebar filters to find your perfect games from our database of 80,000+ games.
	""")

	        # Dataset statistics (df is known to be non-empty at this point)
	        col1, col2, col3, col4 = st.columns(4)

	        with col1:
	            total_games = len(df)
	            st.metric("Total Games", f"{total_games:,}")

	        with col2:
	            avg_price = df['Price'].mean()
	            st.metric("Average Price", f"${avg_price:.2f}")

	        with col3:
	            # Games with a zero score are left out of the average.
	            rated_games = df[df['Rating_Score'] > 0]
	            avg_rating = rated_games['Rating_Score'].mean() if len(rated_games) > 0 else 0
	            st.metric("Average Rating", f"{avg_rating:.1f}%")

	        with col4:
	            free_games = len(df[df['Price'] == 0])
	            st.metric("Free Games", f"{free_games:,}")

	        # Popular genres display
	        st.subheader("🎯 Most Popular Genres")
	        if 'Genres' in df.columns:
	            # Labels are stripped before counting so that "Indie" and
	            # " Indie" end up in the same bar.
	            genre_counts = pd.Series(
	                _split_labels(df['Genres'].head(chart_sample_rows).astype(str)),
	                dtype=object,
	            ).value_counts().head(15)

	            if len(genre_counts) > 0:
	                if PLOTLY_AVAILABLE:
	                    try:
	                        fig_genres = px.bar(
	                            x=genre_counts.values,
	                            y=genre_counts.index,
	                            orientation='h',
	                            title='Top 15 Game Genres in Database',
	                            labels={'x': 'Number of Games', 'y': 'Genre'},
	                            color=genre_counts.values,
	                            color_continuous_scale='viridis'
	                        )
	                        fig_genres.update_layout(showlegend=False)
	                        st.plotly_chart(fig_genres, use_container_width=True)
	                    except Exception as e:
	                        # A chart failure should not take the page down;
	                        # fall back to the text listing.
	                        st.warning(f"Plotly chart error: {e}")
	                        display_text_chart(genre_counts.head(10), "Top 10 Genres")
	                else:
	                    display_text_chart(genre_counts.head(10), "Top 10 Genres")

	# Script entry point: run the app only when this module is executed as the
	# main script, not when it is imported.
	if __name__ == "__main__":
	main()