08 Example - Comparing distributions

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import findspark; findspark.init()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
# Create (or reuse) the Spark session for this notebook, run against a
# local master under the application name 'statistics'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('statistics')
    .getOrCreate()
)
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/runner/work/statistics/spark-3.1.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.3.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
22/07/21 02:34:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).

Example - Comparing distributions fig 1 | Example - Comparing distributions fig 2

# Temperature samples (degrees C), written as `count * [temperature]` so
# that each list holds one entry per day.
poland = 8 * [4] + 12 * [7] + 7 * [10] + 4 * [13]
minneapolis = (
    1 * [-20] + 1 * [-17] + 3 * [-11] + 3 * [-8] + 5 * [-5]
    + 4 * [-2] + 5 * [1] + 7 * [4] + 2 * [7]
)

# Bin edges shared by both cities: every 3 degrees from -20 to 16.
# The edge at -14 is required even though no day falls in [-14, -11);
# leaving it out merges two bins and draws the -17 bar twice as wide.
bins = list(range(-20, 17, 3))

# One figure per city, with identical bins and y-range so the two
# distributions can be compared side by side.
for title, temps, colour in (
    ('Minneapolis', minneapolis, '#4ec7d6'),
    ('Poland', poland, '#57cb79'),
):
    plt.hist(temps, bins=bins, color=colour)
    plt.ylim([0, 14])
    plt.xlabel('Temperature (C)')
    plt.ylabel('Number of days')
    plt.title(title)
    plt.show()
../_images/08 Example - Comparing distributions_5_0.png ../_images/08 Example - Comparing distributions_5_1.png
# Seaborn version of the two temperature histograms, with a kernel density
# curve overlaid on each.
#
# `histplot` replaces the deprecated `distplot`. Its default statistic is
# the count per bin, which is what the 'Number of days' axis label
# describes (`distplot` with a KDE drew a normalised density instead).
for title, temps, colour in (
    ('Minneapolis', minneapolis, '#4ec7d6'),
    ('Poland', poland, '#57cb79'),
):
    sns.histplot(temps, bins=bins, color=colour, kde=True)
    # Same y-range as the matplotlib histograms so the cities are comparable.
    plt.ylim([0, 14])
    plt.xlabel('Temperature (C)')
    plt.ylabel('Number of days')
    plt.title(title)
    plt.show()
/opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
../_images/08 Example - Comparing distributions_6_1.png
/opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
../_images/08 Example - Comparing distributions_6_3.png
# Interactive plotly histograms of the same temperature samples, one figure
# per city. Each entry is (title, samples, requested max bin count, colour).
for city, temps, max_bins, colour in (
    ('Minneapolis', minneapolis, 10, '#4ec7d6'),
    ('Poland', poland, 5, '#57cb79'),
):
    trace = go.Histogram(x=temps, nbinsx=max_bins, marker_color=colour, name=city)
    data = [trace]
    fig = go.Figure(data=data)
    axis_and_title = {
        'xaxis_title': 'Temperature (C)',
        'yaxis_title': 'Number of days',
        'title': city,
    }
    fig.update_layout(**axis_and_title)
    fig.show()
# Central tendency: arithmetic mean of each city's temperatures.
poland_mean = np.mean(poland)
minneapolis_mean = np.mean(minneapolis)
print(f'Poland mean {poland_mean} \tMinneapolis mean : {minneapolis_mean}')

# Spread: range (max - min) and standard deviation (np.std default, ddof=0).
poland_range = max(poland) - min(poland)
minneapolis_range = max(minneapolis) - min(minneapolis)
print(f'Poland range {poland_range} \tMinneapolis range : {minneapolis_range}')

poland_std = np.std(poland)
minneapolis_std = np.std(minneapolis)
print(f'Poland std {poland_std} \tMinneapolis std : {minneapolis_std}')
Poland mean 7.67741935483871 	Minneapolis mean : -2.5806451612903225
Poland range 9 	Minneapolis range : 27
Poland std 2.9225165555886767 	Minneapolis std : 6.70471280349243
# Times (seconds) recorded in each round, written as `count * [time]` so
# that each list holds one entry per result.
first_round = (
    1 * [56] + 2 * [56.6] + 1 * [57.1] + 1 * [57.2]
    + 1 * [57.3] + 1 * [57.7] + 1 * [57.9]
)
# The original expression had a doubled `+ +`; Python read the second one as
# a unary plus on the count, so the data is unchanged by removing it.
semifinal_round = (
    1 * [56.7] + 1 * [57.2] + 2 * [57.3] + 1 * [57.4]
    + 1 * [57.5] + 1 * [57.7] + 1 * [57.8]
)

# Tenth-of-a-second bin edges starting at 56 s (the stop value 58.5 is
# excluded by np.arange).
bins = np.arange(56, 58.5, 0.1)

# Both rounds on the same axes so their distributions can be compared.
plt.hist(first_round, bins, color='lightgreen', label='First round')
plt.hist(semifinal_round, bins, color='cornflowerblue', label='Semifinal round')
plt.xlabel('Time(seconds)')
plt.legend()
plt.show()
../_images/08 Example - Comparing distributions_10_0.png
# Seaborn version of the round comparison: both rounds on one set of axes,
# each as a density-normalised histogram with a KDE curve.
#
# `histplot` replaces the deprecated `distplot`. The options below follow
# seaborn's migration advice for reproducing the old look: density
# statistic, KDE extended 3 bandwidths past the data, and translucent bars
# so the overlapping rounds both stay visible.
for label, times, colour in (
    ('First round', first_round, 'lightgreen'),
    ('Semifinal round', semifinal_round, 'cornflowerblue'),
):
    sns.histplot(
        times,
        bins=bins,
        color=colour,
        label=label,
        stat='density',
        kde=True,
        kde_kws={'cut': 3},
        alpha=0.4,
    )
plt.xlabel('Time(seconds)')
plt.legend()
plt.show()
/opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

/opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/seaborn/distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
../_images/08 Example - Comparing distributions_11_1.png
# Interactive plotly figure holding one histogram trace per round.
first_trace = go.Histogram(
    x=first_round,
    nbinsx=10,
    marker_color='lightgreen',
    name='First round',
)
semifinal_trace = go.Histogram(
    x=semifinal_round,
    nbinsx=10,
    marker_color='cornflowerblue',
    name='Semifinal round',
)
data = [first_trace, semifinal_trace]
fig = go.Figure(data=data)
fig.update_layout(xaxis_title='Time(seconds)')
fig.show()
# Central tendency: arithmetic mean of each round's times.
print(f'First round mean {np.mean(first_round)} \tSemifinal round mean : {np.mean(semifinal_round)}')
# Spread: range and standard deviation (np.std default, ddof=0).
# Each range must subtract that round's own minimum; using the temperature
# lists here (`poland`, `minneapolis`) mixes unrelated data sets.
print(f'First round range {max(first_round) - min(first_round)} \tsemifinal round range : {max(semifinal_round) - min(semifinal_round)}')
print(f'First round std {np.std(first_round)} \tSemifinal round std : {np.std(semifinal_round)}')
First round mean 57.05 	Semifinal round mean : 57.3625
First round range 1.8999999999999986 	semifinal round range : 1.0999999999999943
First round std 0.5852349955359811 	Semifinal round std : 0.3159806164941125