05 Two-way frequency tables and Venn diagrams#

%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/l5MrtV7ZN88/" frameborder="0" allowfullscreen></iframe>
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from matplotlib_venn import venn2
import findspark

findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

spark = SparkSession.builder.appName("statistics").master("local").getOrCreate()
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/runner/work/statistics/spark-3.1.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.3.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
22/07/21 02:31:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).

khanacademy

Two-way frequency tables and Venn diagrams fig 1

dataset = {
    "chocolate": [
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "no",
        "no",
        "no",
    ],
    "coconut": [
        "yes",
        "yes",
        "yes",
        "no",
        "no",
        "no",
        "no",
        "no",
        "no",
        "yes",
        "no",
        "no",
    ],
}
df = pd.DataFrame(dataset)
df
chocolate coconut
0 yes yes
1 yes yes
2 yes yes
3 yes no
4 yes no
5 yes no
6 yes no
7 yes no
8 yes no
9 no yes
10 no no
11 no no
sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))
sdf.registerTempTable("sdf_table")
sdf.show()
[Stage 0:>                                                          (0 + 1) / 1]
+---------+-------+
|chocolate|coconut|
+---------+-------+
|      yes|    yes|
|      yes|    yes|
|      yes|    yes|
|      yes|     no|
|      yes|     no|
|      yes|     no|
|      yes|     no|
|      yes|     no|
|      yes|     no|
|       no|    yes|
|       no|     no|
|       no|     no|
+---------+-------+
                                                                                
df.groupby(["chocolate"]).size()
chocolate
no     3
yes    9
dtype: int64
sdf.groupby("chocolate").count().show()
[Stage 8:============================>                           (51 + 2) / 100]

[Stage 8:========================================>               (72 + 1) / 100]
                                                                                
+---------+-----+
|chocolate|count|
+---------+-----+
|       no|    3|
|      yes|    9|
+---------+-----+
spark.sql(
    "select chocolate, count(*) as count from sdf_table group by chocolate"
).show()
[Stage 18:=========================================>             (75 + 1) / 100]

                                                                                
+---------+-----+
|chocolate|count|
+---------+-----+
|       no|    3|
|      yes|    9|
+---------+-----+
df.groupby(["coconut"]).size()
coconut
no     8
yes    4
dtype: int64
sdf.groupby("coconut").count().show()
+-------+-----+
|coconut|count|
+-------+-----+
|     no|    8|
|    yes|    4|
+-------+-----+
spark.sql("select coconut, count(*) as count from sdf_table group by coconut").show()
+-------+-----+
|coconut|count|
+-------+-----+
|     no|    8|
|    yes|    4|
+-------+-----+
df.groupby(["chocolate", "coconut"]).size()
chocolate  coconut
no         no         2
           yes        1
yes        no         6
           yes        3
dtype: int64
sdf.groupby("chocolate", "coconut").count().show()
+---------+-------+-----+
|chocolate|coconut|count|
+---------+-------+-----+
|       no|     no|    2|
|       no|    yes|    1|
|      yes|    yes|    3|
|      yes|     no|    6|
+---------+-------+-----+
spark.sql(
    "select chocolate, coconut, count(*) as count from sdf_table group by chocolate, coconut"
).show()
+---------+-------+-----+
|chocolate|coconut|count|
+---------+-------+-----+
|       no|     no|    2|
|       no|    yes|    1|
|      yes|    yes|    3|
|      yes|     no|    6|
+---------+-------+-----+
pd.crosstab(df["chocolate"], df["coconut"], margins=True)
coconut no yes All
chocolate
no 2 1 3
yes 6 3 9
All 8 4 12
sdf.crosstab("chocolate", "coconut").show()
+-----------------+---+---+
|chocolate_coconut| no|yes|
+-----------------+---+---+
|              yes|  6|  3|
|               no|  2|  1|
+-----------------+---+---+
venn2(
    subsets=[i for i in df.groupby(["chocolate", "coconut"]).size()[1:]],
    set_labels=("coconut", "chocolate", "both"),
    set_colors=("purple", "skyblue"),
    alpha=0.7,
);
../_images/05 Two-way frequency tables and Venn diagrams_20_0.png