Categorical Data

import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
import category_encoders as ce

%matplotlib inline

Identifying Categorical Data: Nominal, Ordinal and Continuous

df_flights = pd.read_csv('https://raw.githubusercontent.com/ismayc/pnwflights14/master/data/flights.csv')
df_flights.head()
year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight origin dest air_time distance hour minute
0 2014 1 1 1.0 96.0 235.0 70.0 AS N508AS 145 PDX ANC 194.0 1542 0.0 1.0
1 2014 1 1 4.0 -6.0 738.0 -23.0 US N195UW 1830 SEA CLT 252.0 2279 0.0 4.0
2 2014 1 1 8.0 13.0 548.0 -4.0 UA N37422 1609 PDX IAH 201.0 1825 0.0 8.0
3 2014 1 1 28.0 -2.0 800.0 -23.0 US N547UW 466 PDX CLT 251.0 2282 0.0 28.0
4 2014 1 1 34.0 44.0 325.0 43.0 AS N762AS 121 SEA ANC 201.0 1448 0.0 34.0
df_flights.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162049 entries, 0 to 162048
Data columns (total 16 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   year       162049 non-null  int64  
 1   month      162049 non-null  int64  
 2   day        162049 non-null  int64  
 3   dep_time   161192 non-null  float64
 4   dep_delay  161192 non-null  float64
 5   arr_time   161061 non-null  float64
 6   arr_delay  160748 non-null  float64
 7   carrier    162049 non-null  object 
 8   tailnum    161801 non-null  object 
 9   flight     162049 non-null  int64  
 10  origin     162049 non-null  object 
 11  dest       162049 non-null  object 
 12  air_time   160748 non-null  float64
 13  distance   162049 non-null  int64  
 14  hour       161192 non-null  float64
 15  minute     161192 non-null  float64
dtypes: float64(7), int64(5), object(4)
memory usage: 19.8+ MB
df_flights.boxplot('dep_time', 'origin',rot = 30, figsize=(5,6))
<AxesSubplot:title={'center':'dep_time'}, xlabel='origin'>
../_images/Categorical Data_6_1.png
cat_df_flights = df_flights.select_dtypes(include=['object']).copy()
cat_df_flights.head()
carrier tailnum origin dest
0 AS N508AS PDX ANC
1 US N195UW SEA CLT
2 UA N37422 PDX IAH
3 US N547UW PDX CLT
4 AS N762AS SEA ANC
cat_df_flights.isnull().values.sum()
248
cat_df_flights.isnull().sum()
carrier      0
tailnum    248
origin       0
dest         0
dtype: int64
# Impute missing tail numbers with the most frequent tail number. The fill is
# restricted to the 'tailnum' column so that a tail number can never be written
# into a different column that happens to contain missing values.
most_common_tailnum = cat_df_flights['tailnum'].value_counts().index[0]
cat_df_flights = cat_df_flights.fillna({'tailnum': most_common_tailnum})
cat_df_flights.isnull().values.sum()
0
carrier_count = cat_df_flights['carrier'].value_counts()
carrier_count
AS    62460
WN    23355
OO    18710
DL    16716
UA    16671
AA     7586
US     5946
B6     3540
VX     3272
F9     2698
HA     1095
Name: carrier, dtype: int64
carrier_count.count()
11
carrier_count.index
Index(['AS', 'WN', 'OO', 'DL', 'UA', 'AA', 'US', 'B6', 'VX', 'F9', 'HA'], dtype='object')
carrier_count.values
array([62460, 23355, 18710, 16716, 16671,  7586,  5946,  3540,  3272,
        2698,  1095])
# Bar chart of how many flights each carrier operated.
sns.set_style('darkgrid')
# Pass x and y by keyword: seaborn deprecated positional data arguments
# (FutureWarning in 0.11; an error or misinterpretation from 0.12 onwards).
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9)
plt.title('Frequency Distribution of Carriers')
plt.xlabel('Carriers', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()
/opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
../_images/Categorical Data_19_1.png
plt.pie(carrier_count.values,
        labels=carrier_count.index,
        autopct='%1.1f%%', shadow=True)
plt.axis('equal')
plt.show()
../_images/Categorical Data_20_0.png

Encoding Categorical Data

Replacing values

# Build a {column: {label: integer code}} mapping for DataFrame.replace.
# Codes are the positions of the labels in the column's category list.
carrier_labels = cat_df_flights['carrier'].astype('category').cat.categories.tolist()
carrier_codes = {label: code for code, label in enumerate(carrier_labels)}
replace_map_comp = {'carrier': carrier_codes}
replace_map_comp
{'carrier': {'AA': 0,
  'AS': 1,
  'B6': 2,
  'DL': 3,
  'F9': 4,
  'HA': 5,
  'OO': 6,
  'UA': 7,
  'US': 8,
  'VX': 9,
  'WN': 10}}
cat_df_flights_replace = cat_df_flights.copy()
cat_df_flights_replace.replace(replace_map_comp, inplace=True)
cat_df_flights_replace.head()
carrier tailnum origin dest
0 1 N508AS PDX ANC
1 8 N195UW SEA CLT
2 7 N37422 PDX IAH
3 8 N547UW PDX CLT
4 1 N762AS SEA ANC
cat_df_flights_replace['carrier'].dtypes
dtype('int64')

Encoding labels

# Label-encode 'carrier' using pandas' own category codes, on a copy so the
# original frame keeps its string labels.
cat_df_flights_lc = cat_df_flights.copy()
# Cast the categorical features to the category dtype, which is faster to work
# with than the object dtype. Both 'carrier' and 'origin' are cast here, but
# only 'carrier' is encoded below.
cat_df_flights_lc['carrier'] = cat_df_flights_lc['carrier'].astype('category')
cat_df_flights_lc['origin'] = cat_df_flights_lc['origin'].astype('category')
# Overwrite each carrier label with its integer category code.
cat_df_flights_lc['carrier'] = cat_df_flights_lc['carrier'].cat.codes
cat_df_flights_lc.head()
carrier tailnum origin dest
0 1 N508AS PDX ANC
1 8 N195UW SEA CLT
2 7 N37422 PDX IAH
3 8 N547UW PDX CLT
4 1 N762AS SEA ANC
cat_df_flights_specific = cat_df_flights.copy()
# Flag the rows flown by carrier 'US' (1) versus every other carrier (0).
# An equality test is used instead of str.contains, which performs a regex
# substring search (any code merely containing 'US' would match) and yields
# NaN for missing values, which np.where would treat as true.
cat_df_flights_specific['US_code'] = np.where(cat_df_flights_specific['carrier'] == 'US', 1, 0)
cat_df_flights_specific.head()
carrier tailnum origin dest US_code
0 AS N508AS PDX ANC 0
1 US N195UW SEA CLT 1
2 UA N37422 PDX IAH 0
3 US N547UW PDX CLT 1
4 AS N762AS SEA ANC 0
cat_df_flights_sklearn = cat_df_flights.copy()
lb_make = LabelEncoder()
cat_df_flights_sklearn['carrier_code'] = lb_make.fit_transform(cat_df_flights_sklearn['carrier'])
cat_df_flights_sklearn.head()
carrier tailnum origin dest carrier_code
0 AS N508AS PDX ANC 1
1 US N195UW SEA CLT 8
2 UA N37422 PDX IAH 7
3 US N547UW PDX CLT 8
4 AS N762AS SEA ANC 1

One-Hot encoding

cat_df_flights_onehot = cat_df_flights.copy()
# cat_df_flights_onehot = pd.get_dummies(cat_df_flights_onehot, columns=['carrier', 'dest'], prefix=['carrier', 'dest'])
cat_df_flights_onehot = pd.get_dummies(cat_df_flights_onehot, columns=['carrier'], prefix=['carrier'])
cat_df_flights_onehot.head()
tailnum origin dest carrier_AA carrier_AS carrier_B6 carrier_DL carrier_F9 carrier_HA carrier_OO carrier_UA carrier_US carrier_VX carrier_WN
0 N508AS PDX ANC 0 1 0 0 0 0 0 0 0 0 0
1 N195UW SEA CLT 0 0 0 0 0 0 0 0 1 0 0
2 N37422 PDX IAH 0 0 0 0 0 0 0 1 0 0 0
3 N547UW PDX CLT 0 0 0 0 0 0 0 0 1 0 0
4 N762AS SEA ANC 0 1 0 0 0 0 0 0 0 0 0
# One-hot encode 'carrier' with scikit-learn's LabelBinarizer instead of
# pd.get_dummies, working on a copy of the categorical frame.
cat_df_flights_onehot_sklearn = cat_df_flights.copy()
lb = LabelBinarizer()
# fit_transform returns a 0/1 array with one column per carrier label.
lb_result = lb.fit_transform(cat_df_flights_onehot_sklearn['carrier'])
# lb.classes_ holds the labels in column order, so use it as the column names.
lb_result_df = pd.DataFrame(lb_result, columns=lb.classes_)
lb_result_df.head()
AA AS B6 DL F9 HA OO UA US VX WN
0 0 1 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 1 0 0
2 0 0 0 0 0 0 0 1 0 0 0
3 0 0 0 0 0 0 0 0 1 0 0
4 0 1 0 0 0 0 0 0 0 0 0
result_df = pd.concat([cat_df_flights_onehot_sklearn, lb_result_df], axis=1)
result_df.head()
carrier tailnum origin dest AA AS B6 DL F9 HA OO UA US VX WN
0 AS N508AS PDX ANC 0 1 0 0 0 0 0 0 0 0 0
1 US N195UW SEA CLT 0 0 0 0 0 0 0 0 1 0 0
2 UA N37422 PDX IAH 0 0 0 0 0 0 0 1 0 0 0
3 US N547UW PDX CLT 0 0 0 0 0 0 0 0 1 0 0
4 AS N762AS SEA ANC 0 1 0 0 0 0 0 0 0 0 0

Binary encoding

cat_df_flights_ce = cat_df_flights.copy()
encoder = ce.BinaryEncoder(cols=['carrier'])
df_binary = encoder.fit_transform(cat_df_flights_ce)
df_binary.head()
carrier_0 carrier_1 carrier_2 carrier_3 tailnum origin dest
0 0 0 0 1 N508AS PDX ANC
1 0 0 1 0 N195UW SEA CLT
2 0 0 1 1 N37422 PDX IAH
3 0 0 1 0 N547UW PDX CLT
4 0 0 0 1 N762AS SEA ANC

Backward difference encoding

# Backward-difference encoding of 'carrier' with category_encoders.
# This rebinds `encoder` and reuses cat_df_flights_ce from the binary-encoding
# cell. In the result, 'carrier' is replaced by an intercept column plus ten
# carrier_N contrast columns (one fewer than the eleven carriers).
encoder = ce.BackwardDifferenceEncoder(cols=['carrier'])
df_bd = encoder.fit_transform(cat_df_flights_ce)
df_bd.head()
intercept carrier_0 carrier_1 carrier_2 carrier_3 carrier_4 carrier_5 carrier_6 carrier_7 carrier_8 carrier_9 tailnum origin dest
0 1 -0.909091 -0.818182 -0.727273 -0.636364 -0.545455 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N508AS PDX ANC
1 1 0.090909 -0.818182 -0.727273 -0.636364 -0.545455 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N195UW SEA CLT
2 1 0.090909 0.181818 -0.727273 -0.636364 -0.545455 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N37422 PDX IAH
3 1 0.090909 -0.818182 -0.727273 -0.636364 -0.545455 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N547UW PDX CLT
4 1 -0.909091 -0.818182 -0.727273 -0.636364 -0.545455 -0.454545 -0.363636 -0.272727 -0.181818 -0.090909 N762AS SEA ANC

Miscellaneous features

dummy_df_age = pd.DataFrame({'age':['0-20', '20-40', '40-60','60-80']})
dummy_df_age['start'], dummy_df_age['end'] = zip(*dummy_df_age['age'].map(lambda x: x.split('-')))
dummy_df_age.head()
age start end
0 0-20 0 20
1 20-40 20 40
2 40-60 40 60
3 60-80 60 80
dummy_df_age = pd.DataFrame({'age':['0-20', '20-40', '40-60','60-80']})
def split_mean(x):
    """Return the midpoint of a range written as ``'<low>-<high>'``.

    Args:
        x: Range label such as ``'20-40'``; the two bounds are separated by
            a hyphen and must each be parseable by ``float``.

    Returns:
        The arithmetic mean of the lower and upper bound, as a float.

    Raises:
        IndexError: If ``x`` contains no hyphen.
        ValueError: If either bound is not a valid number.
    """
    split_list = x.split('-')
    # Parenthesise the sum before halving. Without the parentheses only the
    # upper bound is divided, e.g. '20-40' gave 20 + 40 / 2 = 40.0, not 30.0.
    mean = (float(split_list[0]) + float(split_list[1])) / 2
    return mean
# Derive the numeric 'age_mean' column by running split_mean on each age label.
age_labels = dummy_df_age['age']
dummy_df_age['age_mean'] = age_labels.apply(split_mean)
dummy_df_age.head()
age age_mean
0 0-20 10.0
1 20-40 40.0
2 40-60 70.0
3 60-80 100.0