21 Judging outliers in a dataset
21 Judging outliers in a dataset
%%html
<iframe width="700" height="400" src="https://www.youtube.com/embed/FRlTh5HQORA/" frameborder="0" allowfullscreen></iframe>
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from scipy import stats
# Sample observations used by every cell below (15 integer values).
x = Series(
    np.array(
        [1, 1, 6, 13, 13, 14, 14, 14, 15, 15, 16, 18, 18, 18, 19]
    )
)
def is_outlier(num, dataset, whisker=1.5):
    """Return True when *num* lies outside the IQR fences of *dataset*.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles from ``np.percentile``
    (NumPy's default linear interpolation) and ``IQR = Q3 - Q1``.  A value
    that sits exactly on a fence is not reported as an outlier.

    Note: linear-interpolation quartiles can differ from the
    "median of each half" quartiles used in some textbooks, so the fences
    may not match a hand calculation done with that method.

    Args:
        num: The value to test.
        dataset: Array-like of numeric observations (list, ndarray, Series).
        whisker: Multiplier applied to the IQR when building the fences.

    Returns:
        bool: True if *num* is below the lower fence or above the upper one.

    Raises:
        ValueError: If *dataset* contains no values.
    """
    values = np.asarray(dataset)
    if values.size == 0:
        raise ValueError("dataset must contain at least one value")

    # A single percentile call yields both quartiles; the IQR is simply their
    # difference, so a second pass over the data is unnecessary.
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1

    lower_fence = q1 - whisker * iqr
    upper_fence = q3 + whisker * iqr

    # Convert to a plain bool so callers do not receive a numpy.bool_.
    return bool(num < lower_fence or num > upper_fence)
# Check whether the value 13 is an outlier of the sample data.
is_outlier(num=13, dataset=x)
False
# Histogram of the sample via the pandas plotting accessor.
x.plot.hist()
<AxesSubplot:ylabel='Frequency'>
# Box plot of the sample via the pandas plotting accessor.
x.plot.box()
<AxesSubplot:>
# Histogram drawn with matplotlib directly.
plt.hist(x)
# plt.gca() returns the Axes the histogram was just drawn on.  plt.axes()
# would add a new Axes, so the ticks would not be applied to the histogram.
plt.gca().set_xticks(range(1, 19))
[<matplotlib.axis.XTick at 0x7f694f000610>,
<matplotlib.axis.XTick at 0x7f694f0005e0>,
<matplotlib.axis.XTick at 0x7f694efd5cd0>,
<matplotlib.axis.XTick at 0x7f694efa9d90>,
<matplotlib.axis.XTick at 0x7f694efb7520>,
<matplotlib.axis.XTick at 0x7f694efb7c70>,
<matplotlib.axis.XTick at 0x7f694efbd400>,
<matplotlib.axis.XTick at 0x7f694efb7d60>,
<matplotlib.axis.XTick at 0x7f694efa9dc0>,
<matplotlib.axis.XTick at 0x7f694efbdd90>,
<matplotlib.axis.XTick at 0x7f694efc3520>,
<matplotlib.axis.XTick at 0x7f694efc3c70>,
<matplotlib.axis.XTick at 0x7f694efca400>,
<matplotlib.axis.XTick at 0x7f694efcab50>,
<matplotlib.axis.XTick at 0x7f694efca7c0>,
<matplotlib.axis.XTick at 0x7f694efc3550>,
<matplotlib.axis.XTick at 0x7f694ef51160>,
<matplotlib.axis.XTick at 0x7f694ef517f0>]
# Box plot drawn with matplotlib directly.
plt.boxplot(x)
# plt.gca() returns the Axes the box plot was just drawn on.  plt.axes()
# would add a new Axes, so the ticks would not be applied to the box plot.
plt.gca().set_yticks(range(1, 19))
[<matplotlib.axis.YTick at 0x7f694eeead90>,
<matplotlib.axis.YTick at 0x7f694eeea730>,
<matplotlib.axis.YTick at 0x7f694eee5670>,
<matplotlib.axis.YTick at 0x7f694ee95760>,
<matplotlib.axis.YTick at 0x7f694ee9f040>,
<matplotlib.axis.YTick at 0x7f694ee9f640>,
<matplotlib.axis.YTick at 0x7f694ee9fd90>,
<matplotlib.axis.YTick at 0x7f694eea5520>,
<matplotlib.axis.YTick at 0x7f694ee9fdf0>,
<matplotlib.axis.YTick at 0x7f694ee95c40>,
<matplotlib.axis.YTick at 0x7f694eea5580>,
<matplotlib.axis.YTick at 0x7f694eeab370>,
<matplotlib.axis.YTick at 0x7f694eeabac0>,
<matplotlib.axis.YTick at 0x7f694eeb1250>,
<matplotlib.axis.YTick at 0x7f694eeb19a0>,
<matplotlib.axis.YTick at 0x7f694eeabb80>,
<matplotlib.axis.YTick at 0x7f694ee95280>,
<matplotlib.axis.YTick at 0x7f694eeba040>]
# TODO: fix is_outlier by using an "accurate" IQR function
# TODO: use a dot plot instead of a histogram
# TODO: fix the warning
Additional resources: Towards Data Science