10 Analyzing mosaic plots

%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/2sHkluggZp8/" frameborder="0" allowfullscreen></iframe>
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
import findspark

# findspark.init() is deliberately called before the pyspark imports below;
# keep this ordering.
# NOTE(review): presumably this is what makes the standalone Spark install
# (the spark-3.1.3-bin-hadoop3.2 directory named in the startup log)
# importable — confirm.
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession

# Get (or create) the Spark session used by the rest of the notebook:
# application name "statistics", master "local".
spark = SparkSession.builder.appName("statistics").master("local").getOrCreate()
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/runner/work/statistics/spark-3.1.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.3.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
22/07/21 02:33:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).

khanacademy

Analyzing mosaic plots fig 1

# Survey answer counts stored column-wise: one column with the group labels
# and one column of counts per answer ("No" / "Yes").
dataset = {
    "surveyees": ["Students", "Staff", "Parents"],
    "No": [800, 60, 150],
    "Yes": [200, 240, 150],
}

# Build the table first, then promote the group labels to the row index so
# that only the answer counts remain as columns.
df = pd.DataFrame(data=dataset)
df = df.set_index("surveyees")
df
No Yes
surveyees
Students 800 200
Staff 60 240
Parents 150 150
# Mirror the same survey table as a Spark DataFrame. `dataset` is
# column-oriented, so transpose its value lists into one tuple per row and
# reuse the dict keys as the column names.
column_names = list(dataset)
rows = list(zip(*dataset.values()))
sdf = spark.createDataFrame(rows, schema=column_names)
sdf.show()
[Stage 0:>                                                          (0 + 1) / 1]
+---------+---+---+
|surveyees| No|Yes|
+---------+---+---+
| Students|800|200|
|    Staff| 60|240|
|  Parents|150|150|
+---------+---+---+
                                                                                
# Stack the count table into a Series keyed by (surveyees, answer) pairs and
# draw one mosaic tile per pair.
counts_by_group_and_answer = df.stack()
mosaic(counts_by_group_and_answer)
(<Figure size 432x288 with 3 Axes>,
 {('Students', 'No'): (0.0, 0.0, 0.6188118811881188, 0.7973421926910299),
  ('Students', 'Yes'): (0.0,
   0.8006644518272424,
   0.6188118811881188,
   0.19933554817275742),
  ('Staff', 'No'): (0.6237623762376238,
   0.0,
   0.18564356435643564,
   0.19933554817275748),
  ('Staff', 'Yes'): (0.6237623762376238,
   0.2026578073089701,
   0.18564356435643564,
   0.7973421926910299),
  ('Parents', 'No'): (0.8143564356435643,
   0.0,
   0.18564356435643564,
   0.4983388704318937),
  ('Parents', 'Yes'): (0.8143564356435643,
   0.5016611295681063,
   0.18564356435643564,
   0.4983388704318937)})
../_images/10 Analyzing mosaic plots_9_1.png