05 Two-way frequency tables and Venn diagrams
05 Two-way frequency tables and Venn diagrams#
%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/l5MrtV7ZN88/" frameborder="0" allowfullscreen></iframe>
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from matplotlib_venn import venn2
import findspark
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.appName("statistics").master("local").getOrCreate()
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/runner/work/statistics/spark-3.1.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.3.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
22/07/21 02:31:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
dataset = {
"chocolate": [
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
"yes",
"no",
"no",
"no",
],
"coconut": [
"yes",
"yes",
"yes",
"no",
"no",
"no",
"no",
"no",
"no",
"yes",
"no",
"no",
],
}
df = pd.DataFrame(dataset)
df
chocolate | coconut | |
---|---|---|
0 | yes | yes |
1 | yes | yes |
2 | yes | yes |
3 | yes | no |
4 | yes | no |
5 | yes | no |
6 | yes | no |
7 | yes | no |
8 | yes | no |
9 | no | yes |
10 | no | no |
11 | no | no |
sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))
sdf.registerTempTable("sdf_table")
sdf.show()
[Stage 0:> (0 + 1) / 1]
+---------+-------+
|chocolate|coconut|
+---------+-------+
| yes| yes|
| yes| yes|
| yes| yes|
| yes| no|
| yes| no|
| yes| no|
| yes| no|
| yes| no|
| yes| no|
| no| yes|
| no| no|
| no| no|
+---------+-------+
df.groupby(["chocolate"]).size()
chocolate
no 3
yes 9
dtype: int64
sdf.groupby("chocolate").count().show()
[Stage 8:============================> (51 + 2) / 100]
[Stage 8:========================================> (72 + 1) / 100]
+---------+-----+
|chocolate|count|
+---------+-----+
| no| 3|
| yes| 9|
+---------+-----+
spark.sql(
"select chocolate, count(*) as count from sdf_table group by chocolate"
).show()
[Stage 18:=========================================> (75 + 1) / 100]
+---------+-----+
|chocolate|count|
+---------+-----+
| no| 3|
| yes| 9|
+---------+-----+
df.groupby(["coconut"]).size()
coconut
no 8
yes 4
dtype: int64
sdf.groupby("coconut").count().show()
+-------+-----+
|coconut|count|
+-------+-----+
| no| 8|
| yes| 4|
+-------+-----+
spark.sql("select coconut, count(*) as count from sdf_table group by coconut").show()
+-------+-----+
|coconut|count|
+-------+-----+
| no| 8|
| yes| 4|
+-------+-----+
df.groupby(["chocolate", "coconut"]).size()
chocolate coconut
no no 2
yes 1
yes no 6
yes 3
dtype: int64
sdf.groupby("chocolate", "coconut").count().show()
+---------+-------+-----+
|chocolate|coconut|count|
+---------+-------+-----+
| no| no| 2|
| no| yes| 1|
| yes| yes| 3|
| yes| no| 6|
+---------+-------+-----+
spark.sql(
"select chocolate, coconut, count(*) as count from sdf_table group by chocolate, coconut"
).show()
+---------+-------+-----+
|chocolate|coconut|count|
+---------+-------+-----+
| no| no| 2|
| no| yes| 1|
| yes| yes| 3|
| yes| no| 6|
+---------+-------+-----+
pd.crosstab(df["chocolate"], df["coconut"], margins=True)
coconut | no | yes | All |
---|---|---|---|
chocolate | |||
no | 2 | 1 | 3 |
yes | 6 | 3 | 9 |
All | 8 | 4 | 12 |
sdf.crosstab("chocolate", "coconut").show()
+-----------------+---+---+
|chocolate_coconut| no|yes|
+-----------------+---+---+
| yes| 6| 3|
| no| 2| 1|
+-----------------+---+---+
venn2(
subsets=[i for i in df.groupby(["chocolate", "coconut"]).size()[1:]],
set_labels=("coconut", "chocolate", "both"),
set_colors=("purple", "skyblue"),
alpha=0.7,
);