04 Reading bar graphs: movies

%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/R48KimqkYTA/" frameborder="0" allowfullscreen></iframe>
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import findspark

findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

spark = SparkSession.builder.appName("statistics").master("local").getOrCreate()
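Spark prints a number of startup warnings by default; they are harmless, and the log level can be raised through the standard SparkContext API if the noise is unwanted:

spark.sparkContext.setLogLevel("ERROR")  # silence WARN-level startup messages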

Based on the Khan Academy exercise "Reading bar graphs: movies".

dataset = {
    'Favorite type of movie': ['Comedy', 'Scary', 'Adventure', 'Cartoon', 'Mystery'],
    'Number of people': [20, 6, 10, 10, 16],
}
df = pd.DataFrame(dataset).set_index('Favorite type of movie')
df
                        Number of people
Favorite type of movie
Comedy                                20
Scary                                  6
Adventure                             10
Cartoon                               10
Mystery                               16
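The next cell builds the same table as a Spark DataFrame by transposing the column dict with zip(*dataset.values()). As an aside, spark.createDataFrame also accepts a pandas frame directly, so an equivalent sketch is:

# Alternative route: Spark can consume the pandas frame as-is.
# reset_index() turns the 'Favorite type of movie' index back into a column.
spark.createDataFrame(df.reset_index()).show()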
sdf = spark.createDataFrame(zip(*dataset.values()), list(dataset.keys()))
sdf.show()
+----------------------+----------------+
|Favorite type of movie|Number of people|
+----------------------+----------------+
|                Comedy|              20|
|                 Scary|               6|
|             Adventure|              10|
|               Cartoon|              10|
|               Mystery|              16|
+----------------------+----------------+
df[df['Number of people'] < 14]
                        Number of people
Favorite type of movie
Scary                                  6
Adventure                             10
Cartoon                               10
sdf[sdf['Number of people'] < 14].show()
+----------------------+----------------+
|Favorite type of movie|Number of people|
+----------------------+----------------+
|                 Scary|               6|
|             Adventure|              10|
|               Cartoon|              10|
+----------------------+----------------+
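The bracket-indexing filter above is shorthand; the same selection can be written with the explicit filter/col API from pyspark.sql.functions:

from pyspark.sql import functions as F

# Equivalent to sdf[sdf['Number of people'] < 14]
sdf.filter(F.col('Number of people') < 14).show()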
df.plot(kind="bar")
<AxesSubplot:xlabel='Favorite type of movie'>
[figure: pandas bar chart of Number of people by Favorite type of movie]
plt.bar(x=dataset['Favorite type of movie'], height=dataset['Number of people'])
plt.xlabel('Favorite type of movie')
plt.ylabel('Number of people')
plt.show()
[figure: matplotlib bar chart of Number of people by Favorite type of movie]
sns.barplot(x=dataset['Favorite type of movie'], y=dataset['Number of people'])
plt.xlabel('Favorite type of movie')
plt.ylabel('Number of people')
plt.show()
[figure: seaborn bar chart of Number of people by Favorite type of movie]
data = [go.Bar(x=dataset['Favorite type of movie'], y=dataset['Number of people'])]
layout = go.Layout(
    xaxis=dict(title='Favorite type of movie'),
    yaxis=dict(title='Number of people')
)
fig = go.Figure(data, layout)
fig.show()
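plotly.express offers a one-call shorthand for the same chart; as a sketch, it infers the axis titles from the column names:

import plotly.express as px

# Same bar chart via the high-level plotly.express API
fig = px.bar(df.reset_index(), x='Favorite type of movie', y='Number of people')
fig.show()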
dataset = {
    'Day':['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'],
    'Dogs': [80, 160, 80, 140, 180]
}
df = pd.DataFrame(dataset).set_index('Day')
df
           Dogs
Day
Monday       80
Tuesday     160
Wednesday    80
Thursday    140
Friday      180
sdf = spark.createDataFrame(zip(*dataset.values()), list(dataset.keys()))
sdf.show()
+---------+----+
|      Day|Dogs|
+---------+----+
|   Monday|  80|
|  Tuesday| 160|
|Wednesday|  80|
| Thursday| 140|
|   Friday| 180|
+---------+----+
sum_of_monday_and_wednesday = df[(df.index == 'Monday') | (df.index == 'Wednesday')]['Dogs'].sum()
df[df['Dogs'] == sum_of_monday_and_wednesday]
         Dogs
Day
Tuesday   160
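Because the days are the index, .loc gives a more direct way to pick out Monday and Wednesday; a sketch of the same computation:

# Label-based lookup on the index, then sum the selected values
sum_of_monday_and_wednesday = df.loc[['Monday', 'Wednesday'], 'Dogs'].sum()
sum_of_monday_and_wednesday  # 160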
sum_of_monday_and_wednesday = (
    sdf[(sdf['Day'] == 'Monday') | (sdf['Day'] == 'Wednesday')]
    .groupby()  # aggregate over the whole filtered frame, not per value of Dogs
    .sum('Dogs')
    .collect()[0]['sum(Dogs)']
)
sdf[sdf['Dogs'] == sum_of_monday_and_wednesday].show()
+-------+----+
|    Day|Dogs|
+-------+----+
|Tuesday| 160|
+-------+----+
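On the Spark side, the same answer can be computed in one pass, using isin for the day filter and agg for the sum; a sketch:

from pyspark.sql import functions as F

total = (sdf.filter(F.col('Day').isin('Monday', 'Wednesday'))
            .agg(F.sum('Dogs').alias('total'))
            .collect()[0]['total'])
sdf.filter(sdf['Dogs'] == total).show()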
                                                                                
df.plot(kind='bar')
<AxesSubplot:xlabel='Day'>
[figure: pandas bar chart of Dogs by Day]
dataset = {
    'Ice cream flavor': ['Vanilla', 'Chocolate', 'Strawberry', 'Cookie dough'],
    'Number of customers': [175, 225, 75, 200]
}
df = pd.DataFrame(dataset).set_index('Ice cream flavor')
df
                  Number of customers
Ice cream flavor
Vanilla                           175
Chocolate                         225
Strawberry                         75
Cookie dough                      200
sdf = spark.createDataFrame(zip(*dataset.values()), list(dataset.keys()))
sdf.show()
+----------------+-------------------+
|Ice cream flavor|Number of customers|
+----------------+-------------------+
|         Vanilla|                175|
|       Chocolate|                225|
|      Strawberry|                 75|
|    Cookie dough|                200|
+----------------+-------------------+
df['Number of customers'].max() - df['Number of customers'].min()
150
(sdf.groupby().max('Number of customers').collect()[0]['max(Number of customers)']
 - sdf.groupby().min('Number of customers').collect()[0]['min(Number of customers)'])
150
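The expression above launches two separate Spark jobs; max and min can also be computed in a single aggregation, as a sketch:

from pyspark.sql import functions as F

row = sdf.agg(F.max('Number of customers').alias('mx'),
              F.min('Number of customers').alias('mn')).collect()[0]
row['mx'] - row['mn']  # 150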