05 Two-way frequency tables and Venn diagrams

%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/l5MrtV7ZN88/" frameborder="0" allowfullscreen></iframe>

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from matplotlib_venn import venn2

import findspark

findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

spark = SparkSession.builder.appName("statistics").master("local").getOrCreate()

WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/runner/work/statistics/spark-3.1.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.3.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release

22/07/21 02:31:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).

dataset = {
    "chocolate": [
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "yes",
        "no",
        "no",
        "no",
    ],
    "coconut": [
        "yes",
        "yes",
        "yes",
        "no",
        "no",
        "no",
        "no",
        "no",
        "no",
        "yes",
        "no",
        "no",
    ],
}

df = pd.DataFrame(dataset)
df

	chocolate	coconut
0	yes	yes
1	yes	yes
2	yes	yes
3	yes	no
4	yes	no
5	yes	no
6	yes	no
7	yes	no
8	yes	no
9	no	yes
10	no	no
11	no	no

sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))
sdf.registerTempTable("sdf_table")
sdf.show()

[Stage 0:>                                                          (0 + 1) / 1]

+---------+-------+
|chocolate|coconut|
+---------+-------+
|      yes|    yes|
|      yes|    yes|
|      yes|    yes|
|      yes|     no|
|      yes|     no|
|      yes|     no|
|      yes|     no|
|      yes|     no|
|      yes|     no|
|       no|    yes|
|       no|     no|
|       no|     no|
+---------+-------+

df.groupby(["chocolate"]).size()

chocolate
no     3
yes    9
dtype: int64

sdf.groupby("chocolate").count().show()

[Stage 8:============================>                           (51 + 2) / 100]

[Stage 8:========================================>               (72 + 1) / 100]

+---------+-----+
|chocolate|count|
+---------+-----+
|       no|    3|
|      yes|    9|
+---------+-----+

spark.sql(
    "select chocolate, count(*) as count from sdf_table group by chocolate"
).show()

[Stage 18:=========================================>             (75 + 1) / 100]

                                                                                

+---------+-----+
|chocolate|count|
+---------+-----+
|       no|    3|
|      yes|    9|
+---------+-----+

df.groupby(["coconut"]).size()

coconut
no     8
yes    4
dtype: int64

sdf.groupby("coconut").count().show()

+-------+-----+
|coconut|count|
+-------+-----+
|     no|    8|
|    yes|    4|
+-------+-----+

spark.sql("select coconut, count(*) as count from sdf_table group by coconut").show()

+-------+-----+
|coconut|count|
+-------+-----+
|     no|    8|
|    yes|    4|
+-------+-----+

df.groupby(["chocolate", "coconut"]).size()

chocolate  coconut
no         no         2
           yes        1
yes        no         6
           yes        3
dtype: int64

sdf.groupby("chocolate", "coconut").count().show()

+---------+-------+-----+
|chocolate|coconut|count|
+---------+-------+-----+
|       no|     no|    2|
|       no|    yes|    1|
|      yes|    yes|    3|
|      yes|     no|    6|
+---------+-------+-----+

spark.sql(
    "select chocolate, coconut, count(*) as count from sdf_table group by chocolate, coconut"
).show()

+---------+-------+-----+
|chocolate|coconut|count|
+---------+-------+-----+
|       no|     no|    2|
|       no|    yes|    1|
|      yes|    yes|    3|
|      yes|     no|    6|
+---------+-------+-----+

pd.crosstab(df["chocolate"], df["coconut"], margins=True)

coconut	no	yes	All
chocolate
no	2	1	3
yes	6	3	9
All	8	4	12

sdf.crosstab("chocolate", "coconut").show()

+-----------------+---+---+
|chocolate_coconut| no|yes|
+-----------------+---+---+
|              yes|  6|  3|
|               no|  2|  1|
+-----------------+---+---+

venn2(
    subsets=[i for i in df.groupby(["chocolate", "coconut"]).size()[1:]],
    set_labels=("coconut", "chocolate", "both"),
    set_colors=("purple", "skyblue"),
    alpha=0.7,
);

../_images/05 Two-way frequency tables and Venn diagrams_20_0.png

Statstics with Python

05 Two-way frequency tables and Venn diagrams

05 Two-way frequency tables and Venn diagrams#

	chocolate	coconut
0	yes	yes
1	yes	yes
2	yes	yes
3	yes	no
4	yes	no
5	yes	no
6	yes	no
7	yes	no
8	yes	no
9	no	yes
10	no	no
11	no	no

	chocolate	coconut
0	yes	yes
1	yes	yes
2	yes	yes
3	yes	no
4	yes	no
5	yes	no
6	yes	no
7	yes	no
8	yes	no
9	no	yes
10	no	no
11	no	no

	chocolate	coconut
0	yes	yes
1	yes	yes
2	yes	yes
3	yes	no
4	yes	no
5	yes	no
6	yes	no
7	yes	no
8	yes	no
9	no	yes
10	no	no
11	no	no