# 09 Mosaic plots and segmented bar charts

%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/b9G5pWuxb1Y/" frameborder="0" allowfullscreen></iframe>
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
import findspark

# findspark.init() makes the local Spark installation's Python bindings
# importable, which is why it is called before the pyspark imports below.
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession

# Get (or reuse) a Spark session named "statistics" with master "local".
spark = SparkSession.builder.appName("statistics").master("local").getOrCreate()
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/runner/work/statistics/spark-3.1.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.3.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
22/07/21 02:32:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).

Source: Khan Academy

Mosaic plots and segmented bar charts fig 1

# Contingency table: counts of "Yes"/"No" answers for each age group.
# `dataset` stays column-oriented (key order Age, Yes, No) because later
# cells read its keys and values directly.
dataset = dict(
    Age=["Adults", "Children", "Infants"],
    Yes=[114, 54, 8],
    No=[6, 6, 12],
)
# Same table as a pandas frame, with the age group as the (named) index.
age_index = pd.Index(dataset["Age"], name="Age")
df = pd.DataFrame(
    {answer: dataset[answer] for answer in ("Yes", "No")},
    index=age_index,
)
df
Yes No
Age
Adults 114 6
Children 54 6
Infants 8 12
# Transpose the column-oriented dict into one row tuple per age group and
# reuse the dict keys, in order, as the Spark column names.
rows = zip(*dataset.values())
columns = list(dataset)
sdf = spark.createDataFrame(rows, schema=columns)
sdf.show()
[Stage 0:>                                                          (0 + 1) / 1]
+--------+---+---+
|     Age|Yes| No|
+--------+---+---+
|  Adults|114|  6|
|Children| 54|  6|
| Infants|  8| 12|
+--------+---+---+
                                                                                
# Row-wise percentages: scale the counts by 100 first, then divide each age
# group's row by its own Yes + No total, so every row adds up to 100.
row_totals = df["Yes"] + df["No"]
df_pct = df.mul(100).div(row_totals, axis=0)
df_pct
Yes No
Age
Adults 95.0 5.0
Children 90.0 10.0
Infants 40.0 60.0

# Segmented bar chart

# Stacked bars straight from pandas: one bar per age group, one segment per
# answer column.
df_pct.plot.bar(stacked=True)
<AxesSubplot:xlabel='Age'>
../_images/09 Mosaic plots and segmented bar charts_11_1.png
# Segmented (stacked) bars drawn directly with matplotlib. The "No" segment
# has to start where the "Yes" segment ends, so it is offset with `bottom`;
# without the offset both bars start at 0 and merely overlap each other.
plt.bar(df_pct.index, df_pct["Yes"], color="red")
plt.bar(df_pct.index, df_pct["No"], bottom=df_pct["Yes"], alpha=0.6)
# Legend labels follow the order in which the two bar groups were drawn.
plt.legend(["Yes", "No"])
plt.ylabel("Percent")
Text(0, 0.5, 'Percent')
../_images/09 Mosaic plots and segmented bar charts_12_1.png
# Reshape wide -> long: one row per (age group, answer) pair.
# NOTE: the column name "Anwser" is misspelled, but it is kept as-is because
# the seaborn cell that follows refers to the column by this exact name.
melt_df = pd.melt(
    df_pct.reset_index(),
    id_vars="Age",
    var_name="Anwser",
    value_name="Percentage",
)
melt_df
Age Anwser Percentage
0 Adults Yes 95.0
1 Children Yes 90.0
2 Infants Yes 40.0
3 Adults No 5.0
4 Children No 10.0
5 Infants No 60.0
# Bar plot of the long-format table: age group on the x axis, percentage on
# the y axis, bars coloured by answer.
sns.barplot(data=melt_df, x="Age", y="Percentage", hue="Anwser")
<AxesSubplot:xlabel='Age', ylabel='Percentage'>
../_images/09 Mosaic plots and segmented bar charts_14_1.png
# One plotly bar trace per answer column, sharing the age groups on the x axis.
data = [
    go.Bar(x=df_pct.index, y=df_pct[answer], name=answer)
    for answer in ("Yes", "No")
]
# barmode="stack" places the traces on top of each other instead of side by side.
layout = go.Layout(barmode="stack")
fig = go.Figure(data=data, layout=layout)
fig.show()

# Mosaic plot

# Mosaic plot of the raw counts; df.stack() turns the table into a Series
# keyed by (age group, answer), which is the input handed to mosaic().
mosaic(data=df.stack(), title="has antibiotics/age")
(<Figure size 432x288 with 3 Axes>,
 {('Adults', 'Yes'): (0.0, 0.0, 0.594059405940594, 0.9468438538205979),
  ('Adults', 'No'): (0.0,
   0.9501661129568105,
   0.594059405940594,
   0.04983388704318941),
  ('Children', 'Yes'): (0.599009900990099,
   0.0,
   0.29702970297029707,
   0.8970099667774086),
  ('Children', 'No'): (0.599009900990099,
   0.9003322259136212,
   0.29702970297029707,
   0.09966777408637871),
  ('Infants', 'Yes'): (0.900990099009901,
   0.0,
   0.09900990099009899,
   0.39867109634551495),
  ('Infants', 'No'): (0.900990099009901,
   0.4019933554817276,
   0.09900990099009899,
   0.5980066445182723)})
../_images/09 Mosaic plots and segmented bar charts_17_1.png
# Small integer sample array [1, 2, 3]; not referenced by any other visible cell.
x: np.ndarray = np.array(range(1, 4))
# Write a single empty line to stdout.
print()