21 Judging outliers in a dataset

%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/FRlTh5HQORA/" frameborder="0" allowfullscreen></iframe>
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from scipy import stats

khanacademy

Judging outliers in a dataset fig 1
Judging outliers in a dataset fig 2

# Sample observations used by the outlier check and the plots that follow.
x = Series(
    data=np.array(
        [1, 1, 6, 13, 13, 14, 14, 14, 15, 15, 16, 18, 18, 18, 19]
    )
)
def is_outlier(num, dataset, whisker=1.5):
    """Return whether *num* lies outside the IQR fences of *dataset*.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles as computed by
    ``np.percentile`` and ``IQR = Q3 - Q1``.

    Args:
        num: The value to test.
        dataset: Array-like of numbers from which the quartiles are taken.
        whisker: Multiple of the IQR added beyond each quartile to form the
            fences. Defaults to 1.5.

    Returns:
        ``True`` if *num* is strictly below the lower fence or strictly
        above the upper fence, otherwise ``False``.
    """
    values = np.asarray(dataset)
    # A single percentile call yields both quartiles; their difference is
    # the IQR, so no separate IQR computation is needed.
    q1, q3 = np.percentile(values, [25, 75])
    reach = whisker * (q3 - q1)
    # Coerce to a built-in bool so callers do not receive a NumPy scalar.
    return bool(num < (q1 - reach) or num > (q3 + reach))
# Check a single value against the fences derived from x; 13 is not flagged.
is_outlier(13, x)
False
# Histogram of the sample via the pandas plotting accessor.
x.plot.hist()
<AxesSubplot:ylabel='Frequency'>
../_images/21 Judging outliers in a dataset_8_1.png
# Box plot of the sample via the pandas plotting accessor.
x.plot.box()
<AxesSubplot:>
../_images/21 Judging outliers in a dataset_9_1.png
plt.hist(x)
# Set the ticks on the axes the histogram was drawn on. plt.axes() would add
# a new axes to the figure instead of returning the current one (older
# Matplotlib releases reused the existing axes but emitted a warning).
plt.gca().set_xticks(range(1, 19))
[<matplotlib.axis.XTick at 0x7f694f000610>,
 <matplotlib.axis.XTick at 0x7f694f0005e0>,
 <matplotlib.axis.XTick at 0x7f694efd5cd0>,
 <matplotlib.axis.XTick at 0x7f694efa9d90>,
 <matplotlib.axis.XTick at 0x7f694efb7520>,
 <matplotlib.axis.XTick at 0x7f694efb7c70>,
 <matplotlib.axis.XTick at 0x7f694efbd400>,
 <matplotlib.axis.XTick at 0x7f694efb7d60>,
 <matplotlib.axis.XTick at 0x7f694efa9dc0>,
 <matplotlib.axis.XTick at 0x7f694efbdd90>,
 <matplotlib.axis.XTick at 0x7f694efc3520>,
 <matplotlib.axis.XTick at 0x7f694efc3c70>,
 <matplotlib.axis.XTick at 0x7f694efca400>,
 <matplotlib.axis.XTick at 0x7f694efcab50>,
 <matplotlib.axis.XTick at 0x7f694efca7c0>,
 <matplotlib.axis.XTick at 0x7f694efc3550>,
 <matplotlib.axis.XTick at 0x7f694ef51160>,
 <matplotlib.axis.XTick at 0x7f694ef517f0>]
../_images/21 Judging outliers in a dataset_10_1.png
plt.boxplot(x)
# Set the ticks on the axes the box plot was drawn on. plt.axes() would add
# a new axes to the figure instead of returning the current one (older
# Matplotlib releases reused the existing axes but emitted a warning).
plt.gca().set_yticks(range(1, 19))
[<matplotlib.axis.YTick at 0x7f694eeead90>,
 <matplotlib.axis.YTick at 0x7f694eeea730>,
 <matplotlib.axis.YTick at 0x7f694eee5670>,
 <matplotlib.axis.YTick at 0x7f694ee95760>,
 <matplotlib.axis.YTick at 0x7f694ee9f040>,
 <matplotlib.axis.YTick at 0x7f694ee9f640>,
 <matplotlib.axis.YTick at 0x7f694ee9fd90>,
 <matplotlib.axis.YTick at 0x7f694eea5520>,
 <matplotlib.axis.YTick at 0x7f694ee9fdf0>,
 <matplotlib.axis.YTick at 0x7f694ee95c40>,
 <matplotlib.axis.YTick at 0x7f694eea5580>,
 <matplotlib.axis.YTick at 0x7f694eeab370>,
 <matplotlib.axis.YTick at 0x7f694eeabac0>,
 <matplotlib.axis.YTick at 0x7f694eeb1250>,
 <matplotlib.axis.YTick at 0x7f694eeb19a0>,
 <matplotlib.axis.YTick at 0x7f694eeabb80>,
 <matplotlib.axis.YTick at 0x7f694ee95280>,
 <matplotlib.axis.YTick at 0x7f694eeba040>]
../_images/21 Judging outliers in a dataset_11_1.png
# TODO: fix the outlier check by using an "accurate" IQR function
# TODO: use a dot plot instead of a histogram
# TODO: fix the warning

Additional resources: Towards Data Science