Updated assessment and solutions
Files changed:
- notebooks/assesment.ipynb (+31 -15)
- notebooks/solutions.ipynb (+16 -1)
notebooks/assesment.ipynb (CHANGED)
@@ -87,8 +87,8 @@
 "# spark_df = ?\n",
 "\n",
 "# Check schema and row count\n",
-"# spark_df.
-"# print(\"spark_df count:\", spark_df.
+"# spark_df. ...\n",
+"# print(\"spark_df count:\", spark_df. ...)\n"
 ]
 },
 {
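For reference, a minimal sketch of how this TODO might be completed, assuming a SparkSession named `spark` (as used elsewhere in the notebook) and a hypothetical CSV path `data/titanic.csv`:

```python
# Hypothetical completion of the cell above: load the Titanic CSV into a
# Spark DataFrame, then inspect its schema and row count.
spark_df = spark.read.csv("data/titanic.csv", header=True, inferSchema=True)

spark_df.printSchema()                       # column names and inferred types
print("spark_df count:", spark_df.count())  # number of rows
```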
@@ -99,19 +99,19 @@
 "source": [
 "# 3. Split Data into Two Subsets for Merging/Joining\n",
 "# ==================================================\n",
-"#
-"#
+"# Split the dataset into two df's by column, then merge them \n",
+"# back together\n",
 "# df_part1: subset of columns -> PassengerId, Name, Sex, Age\n",
 "# df_part2: subset of columns -> PassengerId, Fare, Survived, Pclass\n",
 "#\n",
-"#
+"# \n",
 "\n",
 "# 3.1 Pandas Split\n",
 "# ----------------\n",
 "\n",
 "# TODO: Create two new DataFrames from pd_df:\n",
 "# pd_part1 = pd_df[[\"PassengerId\", \"Name\", \"Sex\", \"Age\"]]\n",
-"# pd_part2 = pd_df[
+"# pd_part2 = pd_df[...]\n",
 "\n",
 "# pd_part1 = ?\n",
 "# pd_part2 = ?\n",
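A sketch of the pandas split this cell asks for, assuming `pd_df` is the pandas Titanic DataFrame loaded earlier in the notebook:

```python
# Two column subsets of the same DataFrame; both keep PassengerId so they
# can be merged back together later.
pd_part1 = pd_df[["PassengerId", "Name", "Sex", "Age"]]
pd_part2 = pd_df[["PassengerId", "Fare", "Survived", "Pclass"]]

print("pd_part1 shape:", pd_part1.shape)
print("pd_part2 shape:", pd_part2.shape)
```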
@@ -129,8 +129,8 @@
 "# 3.2 Spark Split\n",
 "# ---------------\n",
 "# TODO: Create two new DataFrames from spark_df:\n",
-"# spark_part1 = spark_df.
-"# spark_part2 = spark_df.
+"# spark_part1 = spark_df. ...\n",
+"# spark_part2 = spark_df. ...\n",
 "\n",
 "# spark_part1 = ?\n",
 "# spark_part2 = ?\n",
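The Spark counterpart, sketched under the same assumptions (with the `spark_df` loaded earlier):

```python
# select() projects a subset of columns; PassengerId stays in both halves
# so the later join has a key to use.
spark_part1 = spark_df.select("PassengerId", "Name", "Sex", "Age")
spark_part2 = spark_df.select("PassengerId", "Fare", "Survived", "Pclass")

spark_part1.show(5)
spark_part2.show(5)
```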
@@ -153,7 +153,6 @@
 "# TODO: Merge pd_part1 and pd_part2 on \"PassengerId\"\n",
 "# We'll call the merged DataFrame \"pd_merged\".\n",
 "#\n",
-"# pd_merged = pd_part1.merge(pd_part2, on=\"PassengerId\", how=\"inner\")\n",
 "\n",
 "# pd_merged = ?\n",
 "# print(\"pd_merged shape:\", pd_merged.shape)\n",
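The removed hint already spells out the pandas merge; the completed cell would look roughly like this:

```python
# Inner merge on the shared key: only PassengerIds present in both parts survive.
pd_merged = pd_part1.merge(pd_part2, on="PassengerId", how="inner")
print("pd_merged shape:", pd_merged.shape)
```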
@@ -171,8 +170,9 @@
 "# TODO: Join spark_part1 with spark_part2 on \"PassengerId\"\n",
 "# We'll call the joined DataFrame \"spark_merged\".\n",
 "#\n",
-"# spark_merged = spark_part1.join(spark_part2, on=\"PassengerId\", how=\"inner\")\n",
 "\n",
+"\n",
+"#Uncomment below\n",
 "# spark_merged = ?\n",
 "# print(\"spark_merged count:\", spark_merged.count())\n",
 "# spark_merged.show(5)\n",
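Likewise for the Spark join hinted at in the removed line:

```python
# Joining on the column name (rather than a join expression) keeps a single
# PassengerId column in the result.
spark_merged = spark_part1.join(spark_part2, on="PassengerId", how="inner")
print("spark_merged count:", spark_merged.count())
spark_merged.show(5)
```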
@@ -242,7 +242,7 @@
 "outputs": [],
 "source": [
 "# 6.2 TODO: Spark - Survival rate by Sex and Pclass\n",
-"#
+"# Average survival rate by Sex and Pclass\n",
 "#\n",
 "# spark_survival_rate = ?\n",
 "# spark_survival_rate.show()\n"
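One way the 6.2 TODO could be answered; since Survived is a 0/1 flag, its mean per group is the survival rate. Which DataFrame the notebook intends here is an assumption (the joined `spark_merged` is used in this sketch; the cleaned variant from a later section would work the same way):

```python
from pyspark.sql import functions as F

# Average of the 0/1 Survived column per (Sex, Pclass) group = survival rate.
spark_survival_rate = (
    spark_merged
    .groupBy("Sex", "Pclass")
    .agg(F.avg("Survived").alias("survival_rate"))
    .orderBy("Sex", "Pclass")
)
spark_survival_rate.show()
```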
@@ -261,11 +261,20 @@
 "# 7.1 TODO: Write spark_merged_clean to Parquet\n",
 "# e.g., spark_merged_clean.write. ...\n",
 "\n",
+"\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
 "# 7.2 TODO: Read it back into a new Spark DataFrame called 'spark_parquet_df'\n",
 "# spark_parquet_df = ?\n",
 "\n",
 "# print(\"spark_parquet_df count:\", spark_parquet_df.count())\n",
-"# spark_parquet_df.show(5)
+"# spark_parquet_df.show(5)"
 ]
 },
 {
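A sketch of the 7.1/7.2 Parquet round trip, with a hypothetical output path `titanic_merged.parquet`:

```python
# 7.1: write the cleaned DataFrame to Parquet (overwrite so reruns don't fail).
spark_merged_clean.write.mode("overwrite").parquet("titanic_merged.parquet")

# 7.2: read it back into a new DataFrame and sanity-check it.
spark_parquet_df = spark.read.parquet("titanic_merged.parquet")
print("spark_parquet_df count:", spark_parquet_df.count())
spark_parquet_df.show(5)
```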
@@ -274,14 +283,21 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# 8.
+"# 8. Create a Temp View and Query\n",
 "# ========================================\n",
 "# 8.1 TODO: Create a temp view with 'spark_merged_clean' (e.g. \"titanic_merged\")\n",
 "# spark_merged_clean.createOrReplaceTempView(\"titanic_merged\")\n",
 "\n",
-"# 8.2 TODO: Spark SQL query
+"# 8.2 TODO: Spark SQL query examples\n",
+"\n",
+"#Get the average passenger age grouped by PClass\n",
 "# result_df = spark.sql(\"SELECT ... FROM titanic_merged GROUP BY ...\")\n",
-"# result_df.show()\n"
+"# result_df.show()\n",
+"\n",
+"# Calculate the Pearson correlation between passenger Fare and Survival\n",
+"# using either SQL or another method\n",
+"# Corr.(X, Y) = cov(X,Y)/(std(X)*std(Y))\n",
+"# corr = ..."
 ]
 },
 {
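A sketch of how the section 8 TODOs could be filled in, using the view name the cell itself suggests ("titanic_merged"); the correlation part follows the Corr(X, Y) = cov(X, Y) / (std(X) * std(Y)) formula quoted in the new comment, here via Spark's built-in helper:

```python
# 8.1: register the cleaned DataFrame as a SQL temp view.
spark_merged_clean.createOrReplaceTempView("titanic_merged")

# 8.2a: average passenger age per class via Spark SQL.
result_df = spark.sql("""
    SELECT Pclass, AVG(Age) AS avg_age
    FROM titanic_merged
    GROUP BY Pclass
    ORDER BY Pclass
""")
result_df.show()

# 8.2b: Pearson correlation between Fare and Survived (pearson is the default method).
corr = spark_merged_clean.stat.corr("Fare", "Survived")
print("Corr(Fare, Survived):", corr)
```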
notebooks/solutions.ipynb (CHANGED)

@@ -256,7 +256,22 @@
 " GROUP BY Pclass\n",
 " ORDER BY Pclass\n",
 " \"\"\")\n",
-"result_df.show()\n"
+"result_df.show()\n",
+"\n",
+"#Correlation between Fare and Survival\n",
+"# Compute the Pearson correlation between Fare and Survived\n",
+"\n",
+"correlation1 = spark_merged_clean.stat.corr(\"Fare\", \"Survived\", \"pearson\")\n",
+"\n",
+"print(\"Pearson correlation between Fare and Survived:\", correlation1)\n",
+"\n",
+"correlation2 = spark.sql(\n",
+" '''\n",
+" SELECT\n",
+" covar_samp(Fare, Survived) / (stddev_samp(Fare)*stddev_samp(Survived)) as correlation\n",
+" FROM titanic_merged_clean\n",
+" '''\n",
+")\n"
 ]
 },
 {
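A brief aside on this addition: the hand-written covar_samp / (stddev_samp * stddev_samp) expression is exactly the sample Pearson correlation, so, assuming the same `titanic_merged_clean` view the solution queries, Spark SQL's built-in `corr` aggregate should return the same value:

```python
# Equivalent one-liner using Spark SQL's corr() aggregate (Pearson by default).
correlation3 = spark.sql(
    "SELECT corr(Fare, Survived) AS correlation FROM titanic_merged_clean"
)
correlation3.show()
```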