# 04 Reading bar graphs — movies
%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/R48KimqkYTA/" frameborder="0" allowfullscreen></iframe>
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import findspark
# Locate the local Spark installation and put pyspark on sys.path
# before any pyspark import is attempted.
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Single local Spark session shared by every Spark example below.
spark = SparkSession.builder.appName("statistics").master("local").getOrCreate()
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/runner/work/statistics/spark-3.1.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.3.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
22/07/21 02:31:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
# Survey results: favorite movie type vs. number of respondents.
# (Category spellings kept exactly as collected, typos included.)
dataset = {
    'Favorite type of movie': ['Comdey', 'Scary', 'Adenvture', 'Cartoon', 'Mystery'],
    'Number of people': [20, 6, 10, 10, 16],
}
df = pd.DataFrame(dataset)
df = df.set_index('Favorite type of movie')
df
Number of people | |
---|---|
Favorite type of movie | |
Comdey | 20 |
Scary | 6 |
Adenvture | 10 |
Cartoon | 10 |
Mystery | 16 |
# Mirror the pandas frame as a Spark DataFrame: rows come from zipping the
# column lists together, column names from the dict keys.
rows = list(zip(*dataset.values()))
sdf = spark.createDataFrame(rows, list(dataset))
sdf.show()
[Stage 0:> (0 + 1) / 1]
+----------------------+----------------+
|Favorite type of movie|Number of people|
+----------------------+----------------+
| Comdey| 20|
| Scary| 6|
| Adenvture| 10|
| Cartoon| 10|
| Mystery| 16|
+----------------------+----------------+
# Movie types chosen by fewer than 14 people.
few_votes = df['Number of people'] < 14
df[few_votes]
Number of people | |
---|---|
Favorite type of movie | |
Scary | 6 |
Adenvture | 10 |
Cartoon | 10 |
# Spark equivalent of the pandas filter above.
sdf.filter(sdf['Number of people'] < 14).show()
+----------------------+----------------+
|Favorite type of movie|Number of people|
+----------------------+----------------+
| Scary| 6|
| Adenvture| 10|
| Cartoon| 10|
+----------------------+----------------+
# Bar chart straight from the DataFrame: index on x, counts on y.
df.plot.bar()
<AxesSubplot:xlabel='Favorite type of movie'>
# Same chart drawn with the raw matplotlib API.
labels = dataset['Favorite type of movie']
counts = dataset['Number of people']
plt.bar(labels, counts)
plt.xlabel('Favorite type of movie')
plt.ylabel('Number of people')
plt.show()
# Same chart via seaborn; label the axes on the returned Axes object.
ax = sns.barplot(x=dataset['Favorite type of movie'], y=dataset['Number of people'])
ax.set_xlabel('Favorite type of movie')
ax.set_ylabel('Number of people')
plt.show()
# Interactive version of the same chart with plotly.
bars = go.Bar(x=dataset['Favorite type of movie'], y=dataset['Number of people'])
fig = go.Figure(
    data=[bars],
    layout=go.Layout(
        xaxis=dict(title='Favorite type of movie'),
        yaxis=dict(title='Number of people'),
    ),
)
fig.show()
# Second example: dogs seen at the park per weekday.
dataset = {
    'Day': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],
    'Dogs': [80, 160, 80, 140, 180],
}
df = pd.DataFrame(dataset)
df = df.set_index('Day')
df
Dogs | |
---|---|
Day | |
Monday | 80 |
Tuesday | 160 |
Wednesday | 80 |
Thursday | 140 |
Friday | 180 |
# Spark copy of the dogs-per-day frame.
day_rows = list(zip(*dataset.values()))
sdf = spark.createDataFrame(day_rows, list(dataset))
sdf.show()
+---------+----+
| Day|Dogs|
+---------+----+
| Monday| 80|
| Tuesday| 160|
|Wednesday| 80|
| Thursday| 140|
| Friday| 180|
+---------+----+
# Which day's count equals Monday + Wednesday combined?
target_days = df.index.isin(['Monday', 'Wednesday'])
sum_of_monday_and_wednesday = df.loc[target_days, 'Dogs'].sum()
df[df['Dogs'] == sum_of_monday_and_wednesday]
Dogs | |
---|---|
Day | |
Tuesday | 160 |
# Sum the Monday and Wednesday dog counts in ONE global aggregation.
# The previous groupby('Dogs').sum() grouped by the VALUE column and only
# produced the right total because both days happened to share the same
# count (80, a single group); with differing counts, collect()[0] would
# have returned an arbitrary partial sum. groupby() with no columns
# aggregates over all filtered rows.
day_filter = (sdf['Day'] == 'Monday') | (sdf['Day'] == 'Wednesday')
sum_of_monday_and_wednesday = (
    sdf[day_filter].groupby().sum('Dogs').collect()[0]['sum(Dogs)']
)
sdf[sdf['Dogs'] == sum_of_monday_and_wednesday].show()
[Stage 4:========> (31 + 1) / 200]
[Stage 4:==============> (52 + 1) / 200]
[Stage 4:===================> (70 + 1) / 200]
[Stage 4:=======================> (84 + 1) / 200]
[Stage 4:============================> (102 + 2) / 200]
[Stage 4:==================================> (126 + 1) / 200]
[Stage 4:==========================================> (154 + 1) / 200]
[Stage 4:==================================================> (183 + 1) / 200]
+-------+----+
| Day|Dogs|
+-------+----+
|Tuesday| 160|
+-------+----+
# Bar chart of dogs per day, index on x.
df.plot.bar()
<AxesSubplot:xlabel='Day'>
# Third example: customers per ice cream flavor.
dataset = {
    'Ice cream flavor': ['Vanilla', 'Chocolate', 'Strawberry', 'Cookie dough'],
    'Number of customers': [175, 225, 75, 200],
}
df = pd.DataFrame(dataset)
df = df.set_index('Ice cream flavor')
df
Number of customers | |
---|---|
Ice cream flavor | |
Vanilla | 175 |
Chocolate | 225 |
Strawberry | 75 |
Cookie dough | 200 |
# Spark copy of the flavor counts.
flavor_rows = list(zip(*dataset.values()))
sdf = spark.createDataFrame(flavor_rows, list(dataset))
sdf.show()
+----------------+-------------------+
|Ice cream flavor|Number of customers|
+----------------+-------------------+
| Vanilla| 175|
| Chocolate| 225|
| Strawberry| 75|
| Cookie dough| 200|
+----------------+-------------------+
# Range of the counts: most popular minus least popular flavor.
customers = df['Number of customers']
customers.max() - customers.min()
150
# Range of the counts, computed in a SINGLE Spark aggregation.
# The previous version launched two separate jobs (one collect() for max,
# another for min); agg() computes both in one pass.
from pyspark.sql import functions as F

extremes = sdf.groupby().agg(
    F.max('Number of customers').alias('max_customers'),
    F.min('Number of customers').alias('min_customers'),
).collect()[0]
extremes['max_customers'] - extremes['min_customers']
150