06 Two-way (column or row) relative frequency tables
%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/_ETPMszULXc/" frameborder="0" allowfullscreen></iframe>
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import findspark
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.appName("statistics").master("local").getOrCreate()
# 264 cars in total: 125 SUVs (28 with an accident, 97 without)
# and 139 sport cars (35 with an accident, 104 without)
dataset = {
    "Car": 28 * ["SUV"] + 35 * ["Sport car"] + 97 * ["SUV"] + 104 * ["Sport car"],
    "Accident": 28 * ["yes"] + 35 * ["yes"] + 97 * ["no"] + 104 * ["no"],
}
df = pd.DataFrame(dataset)
df
|     | Car       | Accident |
|-----|-----------|----------|
| 0   | SUV       | yes      |
| 1   | SUV       | yes      |
| 2   | SUV       | yes      |
| 3   | SUV       | yes      |
| 4   | SUV       | yes      |
| ... | ...       | ...      |
| 259 | Sport car | no       |
| 260 | Sport car | no       |
| 261 | Sport car | no       |
| 262 | Sport car | no       |
| 263 | Sport car | no       |

264 rows × 2 columns
sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))
sdf.createOrReplaceTempView("sdf_table")
sdf.show()
+---+--------+
|Car|Accident|
+---+--------+
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
|SUV| yes|
+---+--------+
only showing top 20 rows
df.groupby(["Accident", "Car"]).size()
Accident Car
no SUV 97
Sport car 104
yes SUV 28
Sport car 35
dtype: int64
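As a quick sketch of an alternative, the same two-way counts can be produced by unstacking the Car level of the grouped index into columns; this is equivalent to the crosstab built further below.

# Pivot the inner "Car" index level into columns to get the two-way count table
df.groupby(["Accident", "Car"]).size().unstack()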
sdf.groupby("Accident", "Car").count().show()
+--------+---------+-----+
|Accident| Car|count|
+--------+---------+-----+
| no| SUV| 97|
| yes|Sport car| 35|
| no|Sport car| 104|
| yes| SUV| 28|
+--------+---------+-----+
spark.sql(
"select Accident, Car, count(*) as count from sdf_table group by Accident, Car"
).show()
+--------+---------+-----+
|Accident| Car|count|
+--------+---------+-----+
| no| SUV| 97|
| yes|Sport car| 35|
| no|Sport car| 104|
| yes| SUV| 28|
+--------+---------+-----+
two_way_table = pd.crosstab(df["Accident"], df["Car"])
two_way_table
| Accident \ Car | SUV | Sport car |
|----------------|-----|-----------|
| no             | 97  | 104       |
| yes            | 28  | 35        |
s_two_way_table = sdf.crosstab("Accident", "Car")
s_two_way_table.show()
+------------+---+---------+
|Accident_Car|SUV|Sport car|
+------------+---+---------+
| yes| 28| 35|
| no| 97| 104|
+------------+---+---------+
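For reference, a sketch of an equivalent way to build this table in Spark is to group on Accident and pivot the Car values into columns:

# Group on Accident, pivot Car values into columns, and count each combination
sdf.groupBy("Accident").pivot("Car").count().show()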
# Column relative frequencies: divide each count by its column (car type) total
freq_table = two_way_table.copy()
freq_table["SUV"] = two_way_table["SUV"] / two_way_table["SUV"].sum()
freq_table["Sport car"] = two_way_table["Sport car"] / two_way_table["Sport car"].sum()
freq_table
| Accident \ Car | SUV   | Sport car |
|----------------|-------|-----------|
| no             | 0.776 | 0.748201  |
| yes            | 0.224 | 0.251799  |
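The same column relative frequency table can also be obtained in a single step by letting crosstab normalize over the columns; a minimal sketch:

# normalize="columns" divides every cell by its column total
pd.crosstab(df["Accident"], df["Car"], normalize="columns")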
# Same column relative frequencies in Spark: divide each column by its total
# (each total is collected back to the driver as a plain Python number)
s_freq_table = s_two_way_table
s_freq_table = s_freq_table.withColumn(
    "SUV", F.col("SUV") / s_freq_table.select(F.sum("SUV")).collect()[0][0]
)
s_freq_table = s_freq_table.withColumn(
    "Sport car",
    F.col("Sport car") / s_freq_table.select(F.sum("Sport car")).collect()[0][0],
)
s_freq_table.show()
+------------+-----+------------------+
|Accident_Car| SUV| Sport car|
+------------+-----+------------------+
| yes|0.224|0.2517985611510791|
| no|0.776|0.7482014388489209|
+------------+-----+------------------+
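As a sketch of an alternative that avoids the two collect() round trips above, the column totals can be attached with a cross join and both columns scaled in one select (the column names are those produced by the Spark crosstab):

# Compute both column totals once, then divide each column by its total
totals = s_two_way_table.select(
    F.sum("SUV").alias("suv_total"), F.sum("Sport car").alias("sport_total")
)
s_two_way_table.crossJoin(totals).select(
    "Accident_Car",
    (F.col("SUV") / F.col("suv_total")).alias("SUV"),
    (F.col("Sport car") / F.col("sport_total")).alias("Sport car"),
).show()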
# DataFrame.append is deprecated; build the Total row with pd.concat instead
freq_table = pd.concat([freq_table, freq_table.sum().rename("Total").to_frame().T])
freq_table
| Accident \ Car | SUV   | Sport car |
|----------------|-------|-----------|
| no             | 0.776 | 0.748201  |
| yes            | 0.224 | 0.251799  |
| Total          | 1.000 | 1.000000  |
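The table above conditions on the car type, i.e. it is a column relative frequency table. To condition on whether an accident happened instead, each cell is divided by its row total; a minimal pandas sketch of the corresponding row relative frequency table:

# normalize="index" divides every cell by its row total
pd.crosstab(df["Accident"], df["Car"], normalize="index")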