Excel_CSV_Analysis_and_Insights

Build error

App Files Files Community

Excel_CSV_Analysis_and_Insights / app.py

NCTCMumbai

Update app.py

343f097 verified over 1 year ago

raw

history blame contribute delete

11.2 kB

	import pandas as pd
	import ydata_profiling
	import gradio as gr
	from pydantic_settings import BaseSettings
	from tempfile import NamedTemporaryFile
	import sweetviz as sv
	import sketch
	#from datatile.summary.df import DataFrameSummary
	from autoviz.AutoViz_Class import AutoViz_Class
	from traceml.summary.df import DataFrameSummary
	import os
	df1=gr.State()

	def variable_table(df):
	"""
	Analyzes a DataFrame and categorizes variables with colorful HTML formatting.

	Args:
	df (pandas.DataFrame): The DataFrame to analyze.

	Returns:
	str: HTML code representing the analysis results with colorful highlights.
	"""
	# Analyze variable types
	numerical_vars=[]
	text_vars=[]
	categorical_vars=[]
	# Identify categorical, numerical, and text columns
	# Iterate over columns
	for col in df.columns:
	print(col )

	unique_values = df[col].nunique()
	total_values = df[col].count()
	dtype_kind = df[col].dtype.kind
	print( 'unique value-',unique_values,' total_values ',total_values,dtype_kind)
	# Check criteria for numerical, text, and categorical variables
	if (dtype_kind == 'f' or dtype_kind == 'i') and unique_values >= total_values / 10:
	numerical_vars.append(col)
	print('Numerical')
	elif (dtype_kind == 'O' or dtype_kind == 'b') and unique_values > total_values / 10:
	text_vars.append(col)
	print('Text')
	else:
	categorical_vars.append(col)
	print('categorical')
	print(categorical_vars,numerical_vars,text_vars)
	gr.Info(f"Column Analysis completed. {len(categorical_vars)} categorical variables , {len(numerical_vars)} numerical variables , {len(text_vars)} Text variables found!")
	# Build HTML table with styles
	table_style = 'border: 1px solid #ddd; border-collapse: collapse; text-align: left; font-size: 14px;'
	header_style = 'background-color: #f2f2f2; padding: 5px 10px;'
	data_style = 'padding: 5px 10px; border-bottom: 1px solid #ddd;'
	category_color = '#90ee90' # Light green for categorical
	numerical_color = '#add8e6' # Light blue for numerical
	text_color = '#ffd9b3' # Light yellow for text

	html = f"<table style='{table_style}'>"
	html += f"<tr><th style='{header_style}'>Variable Type</th><th style='{header_style}'>Columns</th></tr>"

	# Add rows for each variable type with coloring
	if categorical_vars:
	html += f"<tr style='background-color: {category_color};'><td>Categorical</td><td style='{data_style}'>{', '.join(categorical_vars)}</td></tr>"
	if numerical_vars:
	html += f"<tr style='background-color: {numerical_color};'><td>Numerical</td><td style='{data_style}'>{', '.join(numerical_vars)}</td></tr>"
	if text_vars:
	html += f"<tr style='background-color: {text_color};'><td>Text</td><td style='{data_style}'>{', '.join(text_vars)}</td></tr>"

	# Handle cases where no variables are found
	if not (categorical_vars or numerical_vars or text_vars):
	html += "<tr><td>No variables found!</td></tr>"

	html += "</table>"
	return html

	def generate_report(file, type):
	df = pd.read_csv(file) if file.name.endswith(".csv") else pd.read_excel(file)


	pandas_html_report =ydata_profiling.ProfileReport(df).to_html()
	temp_file1 = NamedTemporaryFile(delete=False, suffix=".html")
	temp_file1.write(pandas_html_report.encode('utf-8'))
	temp_file1.close()
	print('Pandas completed')
	gr.Info('Pandas Profiling report completed')
	# dataprep_report = create_report(df)
	# temp_file2 = NamedTemporaryFile(delete=False, suffix=".html")
	# temp_file2.write(dataprep_report.encode('utf-8'))
	# temp_file2.close()

	def convert_mixed_to_str(df):

	mixed_cols = df.select_dtypes(include=['object']).columns # Detect object dtype (usually mixed)
	for col in mixed_cols:
	df[col] = df[col].astype(str) # Convert mixed columns to strings
	return df

	df = convert_mixed_to_str(df.copy())
	df.columns = df.columns.str.upper()
	df1.value=df
	sweetviz_report = sv.analyze(df)
	#sweetviz_report=sweetviz_report.show_html()
	#print(type(sweetviz_report))
	report=sweetviz_report.show_html( filepath='SWEETVIZ_REPORT.html',
	open_browser=False,
	layout='widescreen',
	scale=None)
	HTMLFileToBeOpened = open('SWEETVIZ_REPORT.html', "r")

	# Reading the file and storing in a variable
	contents = HTMLFileToBeOpened.read()
	print('sweetviz completed')
	gr.Info('Sweetviz report completed')


	temp_file3 = NamedTemporaryFile(delete=False, suffix=".html")
	temp_file3.write(contents.encode('utf-8'))
	temp_file3.close()
	dfviz=df
	try:
	AV = AutoViz_Class()
	gr.Info('Starting Autoviz ..it may take a while..pls be patient')
	dft = AV.AutoViz(filename="", sep=',', dfte=df, header=0, verbose=1,
	lowess=False,chart_format='html',max_rows_analyzed=150000,
	max_cols_analyzed=30,save_plot_dir=None)
	reports = []
	print(os.listdir("./"))
	output_dir_html="./AutoViz_Plots/AutoViz"
	for filename in os.listdir(output_dir_html):
	if filename.endswith(".html"):
	title = os.path.splitext(filename)[0] # Extract title from filename (without extension)
	reports.append(title)
	reports.append(output_dir_html+'/'+filename)
	print(reports)

	dft = AV.AutoViz(filename='', sep=',', header=0, verbose=2,dfte=df,
	lowess=False,chart_format='png',max_rows_analyzed=150000,
	max_cols_analyzed=30,save_plot_dir=None)

	imgs = []
	output_dir_img="./AutoViz_Plots/AutoViz"
	for filename in os.listdir(output_dir_img):
	if filename.endswith(".png"):
	title = os.path.splitext(filename)[0] # Extract title from filename (without extension)
	imgs.append(title)
	imgs.append(output_dir_img+'/'+filename)
	print(imgs)

	except Exception as e:
	dfviz=df
	print(e)
	print('Exception during Autoviz')



	while len(reports)<10:
	# Comparing the lengths and appending t and m to output if needed

	t = gr.Markdown(visible=False)
	m = gr.File(visible=False)
	reports.append(t)
	reports.append(m)

	while len(imgs)<10:
	# Comparing the lengths and appending t and m to output if needed

	t = gr.Markdown(visible=False)
	m = gr.Image(visible=False)
	imgs.append(t)
	imgs.append(m)

	dfs = DataFrameSummary(df)
	sd=dfs.summary()
	sd.index.name = 'PARAMETERS'
	sd1= sd.reset_index(drop=False)
	def highlight_cols(x):
	df = x.copy()
	df.loc[:, :] = 'color: purple'
	df[['PARAMETERS']] = 'color: red'
	return df

	# Applying the style function
	sd1 = sd1.style.apply(highlight_cols, axis = None)
	sd1.columns = sd1.columns.str.upper()



	return [temp_file1.name ,temp_file3.name ,variable_table(df),dfviz,sd1]+reports+imgs

	def query_chat(query):
	df=df1.value
	print(df.sketch.ask(query))
	return df.sketch.ask(query)

	def handle_click(file, sheet_name):
	print('Handle click')
	results = generate_report(file, sheet_name)
	return results

	def update_sheets(file):
	sheet_names = pd.ExcelFile(file).sheet_names

	print('Sheet updated')
	sheet_dropdown = gr.Dropdown(
	label="Select sheet", choices=[sheet_names], visible=True # Initially disabled
	)
	return sheet_dropdown


	with gr.Blocks() as cluster:
	with gr.Column():
	gr.HTML(value="""<h1 style="color: #1999FF; text-shadow: 1px 1px 2px #ddd;">EXCEL ANALYSIS AND INSIGHTS</h1>""")
	with gr.Row():
	file=gr.File(file_types=['.csv', '.xlsx'], label="Upload a CSV or Excel file")
	btn=gr.Button(value="Download Report")

	with gr.Row():


	gr.HTML(value="""<h2 style="color: #3399FF; text-shadow: 1px 1px 2px #ddd;">PANDAS REPORT</h2>""")
	out1=gr.File(label="Download CSV")
	gr.HTML(value="""<h2 style="color: #3399FF; text-shadow: 1px 1px 2px #ddd;">SWEETVIZ REPORT</h2>""")
	out2=gr.File(label="Download CSV")
	sheet_dropdown = gr.Dropdown(
	label="Select sheet", choices=[], visible=False # Initially disabled
	)


	with gr.Column():
	gr.HTML(value="""<h2 style="color: #BA4A00; text-shadow: 1px 1px 2px #ddd;"COLUMNS IN FILE</h2>""")
	#gr.Markdown("*Uploaded File*")
	var=gr.HTML()
	with gr.Column():
	gr.HTML(value="""<h2 style="color: #BA4A00; text-shadow: 1px 1px 2px #ddd;"UPLOADED FILE</h2>""")
	#gr.Markdown("#Uploaded File")
	dataframe1=gr.Dataframe()
	with gr.Column():
	gr.HTML(value="""<h2 style="color: #BA4A00; text-shadow: 1px 1px 2px #ddd;"COLUMNS STATS AND ANALYSIS</h2>""")
	#gr.Markdown("#Columns Analysis")
	dataframe2=gr.Dataframe()

	with gr.Column():
	gr.HTML(value="""<h2 style="color: #BA4A00; text-shadow: 1px 1px 2px #ddd;"ASK QUESTIONS ON THE COLUMNS </h2>""")
	with gr.Row():
	query=gr.Textbox(label="Enter any question on data columns",
	info="Natural language query",
	lines=1,
	value="What is maximum value of first column")
	#outlabel=gr.HighlightedText(label='Query Result ')
	outbox=gr.Textbox(label='Query Result')
	query_btn=gr.Button(value='Submit Query')
	query_btn.click(query_chat,inputs=query,outputs=outbox)



	#Autoviz insights graphs

	gr.HTML(value="""<h3 style="color: #1999FF; text-shadow: 1px 1px 2px #ddd;">Insight Visualisations</h3>""")
	with gr.Column():
	plots2 = []
	for i in range(5):
	t = gr.Markdown()
	with gr.Column(scale=1):
	m=gr.Image()
	plots2.append(t)
	plots2.append(m)
	print(type(plots2[0]))
	gr.HTML(value="""<h3 style="color: #1999FF; text-shadow: 1px 1px 2px #ddd;">Insight graphs</h3>""")
	with gr.Row():
	plots1 = []
	for i in range(5):
	with gr.Row():
	t = gr.Markdown()
	m=gr.File()
	plots1.append(t)
	plots1.append(m)
	print(type(plots1[0]))

	btn.click(handle_click, inputs=[file, sheet_dropdown], outputs=[out1, out2, var,dataframe1, dataframe2]+plots1+plots2)
	file.change(update_sheets, inputs=file, outputs=sheet_dropdown)
	#btn.click(generate_report,inputs=[file],outputs=[out1,out2,var,dataframe1,dataframe2])
	cluster.launch(debug=True)