02 Creating a histogram#

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import findspark

# NOTE(review): findspark.init() is called before the pyspark imports below —
# presumably so the local Spark installation is importable; keep this order.
findspark.init()
from pyspark.context import SparkContext
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession

# Get (or create) a Spark session named "statistics" with master "local".
spark = SparkSession.builder.appName("statistics").master("local").getOrCreate()
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/runner/work/statistics/spark-3.1.3-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.3.jar) to constructor java.nio.DirectByteBuffer(long,int)
WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
22/07/21 02:33:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).

Creating a histogram fig 1

# Sample of 20 ages that every table and histogram below is built from.
ages = np.array([
    1, 3, 27, 32, 5, 63, 26, 25, 18, 16,
    4, 45, 29, 19, 22, 51, 58, 9, 42, 6,
])

Generating dataset using for loop#

# Each label doubles as the inclusive "low-high" age range it stands for.
buckets = ["0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69"]

# Parse every label into its (low, high) integer bounds.
bounds = [tuple(int(edge) for edge in label.split("-")) for label in buckets]

# Count the ages inside each inclusive range; int() keeps the counts as plain
# Python ints, exactly as len() produced.
ages_list = [
    int(np.count_nonzero((ages >= low) & (ages <= high)))
    for low, high in bounds
]
dataset = {"Buckets": buckets, "#": ages_list}

Pandas#

# One column per dict key ("Buckets" and "#"); the bare name displays the frame.
df = pd.DataFrame(data=dataset)
df
Buckets #
0 0-9 6
1 10-19 3
2 20-29 5
3 30-39 1
4 40-49 2
5 50-59 2
6 60-69 1

Spark#

# The dict is column-oriented; zip(*...) transposes it into one tuple per row.
columns = list(dataset)
rows = zip(*dataset.values())
sdf = spark.createDataFrame(rows, schema=columns)
sdf.show()
[Stage 0:>                                                          (0 + 1) / 1]
+-------+---+
|Buckets|  #|
+-------+---+
|    0-9|  6|
|  10-19|  3|
|  20-29|  5|
|  30-39|  1|
|  40-49|  2|
|  50-59|  2|
|  60-69|  1|
+-------+---+
                                                                                

Generating dataset using np histogram#

# Bin edges every 10 years: 0, 10, ..., 70.
bins = np.arange(0, 80, 10)

# np.histogram returns the per-bin counts and the bin edges.
ages_list, buckets = np.histogram(ages, bins=bins)

# There is one more edge than there are counts, so the first edge is dropped:
# each count is labelled by the upper edge of its bin.
dataset = {
    "Buckets": buckets[1:].tolist(),
    "#": ages_list.tolist(),
}

Pandas#

# One column per dict key ("Buckets" and "#"); the bare name displays the frame.
df = pd.DataFrame(data=dataset)
df
Buckets #
0 10 6
1 20 3
2 30 5
3 40 1
4 50 2
5 60 2
6 70 1

Spark#

# The dict is column-oriented; zip(*...) transposes it into one tuple per row.
columns = list(dataset.keys())
rows = zip(*dataset.values())
sdf = spark.createDataFrame(rows, schema=columns)
sdf.show()
+-------+---+
|Buckets|  #|
+-------+---+
|     10|  6|
|     20|  3|
|     30|  5|
|     40|  1|
|     50|  2|
|     60|  2|
|     70|  1|
+-------+---+

Histogram#

Matplotlib#

# Draw the histogram on the explicit bin edges held in `buckets`, keeping the
# returned counts (n) and edges (bins) for inspection afterwards.
n, bins, _ = plt.hist(ages, bins=buckets)

# Label the axes through the current Axes object, then render the figure.
axes = plt.gca()
axes.set_xlabel("Buckets")
axes.set_ylabel("#")
plt.show()
../_images/02 Creating a histogram_19_0.png
# Show the per-bin counts and the bin edges returned by plt.hist.
print(n, bins)
[6. 3. 5. 1. 2. 2. 1.] [ 0 10 20 30 40 50 60 70]

Seaborn#

# histplot draws on (and returns) the current Axes; use the explicit bin edges
# held in `buckets`.
ax = sns.histplot(ages, bins=buckets)
ax.set_xlabel("Buckets")
ax.set_ylabel("#")
plt.show()
../_images/02 Creating a histogram_22_0.png

Plotly#

# One colour per 10-year bucket, in bucket order (seven buckets, seven colours).
bucket_colors = [
    "lightyellow",
    "darkorange",
    "cornflowerblue",
    "magenta",
    "lightgreen",
    "darkviolet",
    "yellow",
]

data = [
    go.Histogram(
        x=ages,
        # Pin the bins explicitly instead of passing nbinsx=7: nbinsx is only
        # an upper bound fed to plotly's auto-binning, so it does not guarantee
        # the seven 10-wide buckets (0-10, ..., 60-70) that the other plots
        # and the colour list above rely on.
        xbins=dict(start=0, end=70, size=10),
        marker=dict(color=bucket_colors),
    )
]
layout = go.Layout(xaxis=dict(title="bucket"), yaxis=dict(title="#"))
fig = go.Figure(data, layout)
fig.show()