03 Reading bar charts - comparing two sets of data

%%html
<!-- Embed a 700x400 YouTube player (video id gnyHsgTFXIY) in the notebook output. -->
<iframe width="700" height="400" src="https://www.youtube.com/embed/gnyHsgTFXIY/" frameborder="0" allowfullscreen></iframe>

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns

import findspark

# findspark.init() is deliberately called before the pyspark imports below.
# NOTE(review): presumably it is what makes the Spark installation importable
# in this environment -- keep this ordering.
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Get (or create) a session running against the "local" master, registered
# under the application name "statistics".
spark = (
    SparkSession.builder
    .master("local")
    .appName("statistics")
    .getOrCreate()
)

WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/runner/work/statistics/spark-3.1.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.3.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release

22/07/21 02:31:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).

# Column-oriented exam scores: one list per column, aligned by position.
# The key order (Student, Midterm, Final) also fixes the column order used
# when this mapping is turned into data frames.
dataset = dict(
    Student=["Brandon", "Vanessa", "Daniel", "Kevin", "Wiliam"],
    Midterm=[85, 60, 60, 65, 100],
    Final=[90, 90, 65, 80, 95],
)

# Build the pandas frame, then promote the student names to the row index so
# that only the two score columns remain as data.
df = pd.DataFrame(data=dataset)
df = df.set_index(keys="Student")
df

	Midterm	Final
Student
Brandon	85	90
Vanessa	60	90
Daniel	60	65
Kevin	65	80
Wiliam	100	95

# Transpose the column-oriented dict into one tuple per student, and reuse the
# dict keys as the Spark column names, then print the resulting table.
sdf = spark.createDataFrame(
    data=list(zip(*dataset.values())),
    schema=list(dataset),
)
sdf.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------+-------+-----+
|Student|Midterm|Final|
+-------+-------+-----+
|Brandon|     85|   90|
|Vanessa|     60|   90|
| Daniel|     60|   65|
|  Kevin|     65|   80|
| Wiliam|    100|   95|
+-------+-------+-----+

[Stage 0:===========================================================(1 + 0) / 1]

                                                                                

# Bar chart straight from the frame: one group per student (the index),
# one bar per score column.
df.plot.bar(title="Scores on Midterm and Final Exams")

<AxesSubplot:title={'center':'Scores on Midterm and Final Exams'}, xlabel='Student'>

../_images/03 Reading bar charts - comparing two sets of data_9_1.png

# Both series are drawn at the same x positions; the Final bars are
# half-transparent so the Midterm bars drawn first stay visible underneath.
plt.bar(x=df.index, height=df["Midterm"], label="Midterm")
plt.bar(x=df.index, height=df["Final"], alpha=0.5, label="Final")

# The legend entries come from the labels attached to the bars above.
plt.legend()

plt.xlabel("Student")
plt.ylabel("Scores(points)")
plt.title("Scores on Midterm and Final Exams")
plt.show()

../_images/03 Reading bar charts - comparing two sets of data_10_0.png

# Reshape wide -> long: one row per (student, exam) pair, with the former
# column name stored in "Exam" and its value in "Score".
melt_df = pd.melt(
    df.reset_index(),
    id_vars="Student",
    var_name="Exam",
    value_name="Score",
)
melt_df

	Student	Exam	Score
0	Brandon	Midterm	85
1	Vanessa	Midterm	60
2	Daniel	Midterm	60
3	Kevin	Midterm	65
4	Wiliam	Midterm	100
5	Brandon	Final	90
6	Vanessa	Final	90
7	Daniel	Final	65
8	Kevin	Final	80
9	Wiliam	Final	95

# Grouped bar chart from the long-format frame: students along x, one bar
# per exam (hue) within each student group.
grid = sns.catplot(kind="bar", data=melt_df, x="Student", y="Score", hue="Exam")

# Title and axis labels are applied to the plot's axes in one call.
grid.set(
    title="Scores on Midterm and Final Exams",
    xlabel="Student",
    ylabel="Scores(points)",
)
plt.show()

../_images/03 Reading bar charts - comparing two sets of data_12_0.png

# One bar trace per exam column; the mapping order sets the trace order
# (Midterm first, then Final) and pairs each exam with its bar colour.
exam_colors = {"Midterm": "cornflowerblue", "Final": "orange"}
data = [
    go.Bar(x=df.index, y=df[exam], name=exam, marker={"color": color})
    for exam, color in exam_colors.items()
]

# Chart title and axis titles.
layout = go.Layout(
    title="Scores on Midterm and Final Exams",
    xaxis={"title": "Student"},
    yaxis={"title": "Scores(points)"},
)

fig = go.Figure(data=data, layout=layout)
fig.show()

Statistics with Python

03 Reading bar charts - comparing two sets of data

03 Reading bar charts - comparing two sets of data