Categorical Data
Contents
Categorical Data#
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
import category_encoders as ce
%matplotlib inline
Identifying Categorical Data: Nominal, Ordinal and Continuous#
# Load the pnwflights14 flights table from its GitHub-hosted CSV and
# preview the first rows.
flights_csv_url = 'https://raw.githubusercontent.com/ismayc/pnwflights14/master/data/flights.csv'
df_flights = pd.read_csv(filepath_or_buffer=flights_csv_url)
df_flights.head()
year | month | day | dep_time | dep_delay | arr_time | arr_delay | carrier | tailnum | flight | origin | dest | air_time | distance | hour | minute | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2014 | 1 | 1 | 1.0 | 96.0 | 235.0 | 70.0 | AS | N508AS | 145 | PDX | ANC | 194.0 | 1542 | 0.0 | 1.0 |
1 | 2014 | 1 | 1 | 4.0 | -6.0 | 738.0 | -23.0 | US | N195UW | 1830 | SEA | CLT | 252.0 | 2279 | 0.0 | 4.0 |
2 | 2014 | 1 | 1 | 8.0 | 13.0 | 548.0 | -4.0 | UA | N37422 | 1609 | PDX | IAH | 201.0 | 1825 | 0.0 | 8.0 |
3 | 2014 | 1 | 1 | 28.0 | -2.0 | 800.0 | -23.0 | US | N547UW | 466 | PDX | CLT | 251.0 | 2282 | 0.0 | 28.0 |
4 | 2014 | 1 | 1 | 34.0 | 44.0 | 325.0 | 43.0 | AS | N762AS | 121 | SEA | ANC | 201.0 | 1448 | 0.0 | 34.0 |
df_flights.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162049 entries, 0 to 162048
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 year 162049 non-null int64
1 month 162049 non-null int64
2 day 162049 non-null int64
3 dep_time 161192 non-null float64
4 dep_delay 161192 non-null float64
5 arr_time 161061 non-null float64
6 arr_delay 160748 non-null float64
7 carrier 162049 non-null object
8 tailnum 161801 non-null object
9 flight 162049 non-null int64
10 origin 162049 non-null object
11 dest 162049 non-null object
12 air_time 160748 non-null float64
13 distance 162049 non-null int64
14 hour 161192 non-null float64
15 minute 161192 non-null float64
dtypes: float64(7), int64(5), object(4)
memory usage: 19.8+ MB
# Box plot of departure time, one box per origin airport.
df_flights.boxplot(column='dep_time', by='origin', rot=30, figsize=(5, 6))
<AxesSubplot:title={'center':'dep_time'}, xlabel='origin'>
# Work on an independent copy holding only the object-dtype (string) columns.
cat_df_flights = df_flights.select_dtypes(include='object').copy()
cat_df_flights.head()
carrier | tailnum | origin | dest | |
---|---|---|---|---|
0 | AS | N508AS | PDX | ANC |
1 | US | N195UW | SEA | CLT |
2 | UA | N37422 | PDX | IAH |
3 | US | N547UW | PDX | CLT |
4 | AS | N762AS | SEA | ANC |
cat_df_flights.isnull().values.sum()
248
cat_df_flights.isnull().sum()
carrier 0
tailnum 248
origin 0
dest 0
dtype: int64
# Impute missing tail numbers with the most frequent tail number.
# The fill is restricted to the `tailnum` column: passing the scalar to
# `fillna` directly would also write a tail number into any other column
# that happened to contain missing values.
most_common_tailnum = cat_df_flights['tailnum'].value_counts().index[0]
cat_df_flights = cat_df_flights.fillna({'tailnum': most_common_tailnum})
# Sanity check: total number of remaining missing cells.
cat_df_flights.isnull().values.sum()
0
carrier_count = cat_df_flights['carrier'].value_counts()
carrier_count
AS 62460
WN 23355
OO 18710
DL 16716
UA 16671
AA 7586
US 5946
B6 3540
VX 3272
F9 2698
HA 1095
Name: carrier, dtype: int64
carrier_count.count()
11
carrier_count.index
Index(['AS', 'WN', 'OO', 'DL', 'UA', 'AA', 'US', 'B6', 'VX', 'F9', 'HA'], dtype='object')
carrier_count.values
array([62460, 23355, 18710, 16716, 16671, 7586, 5946, 3540, 3272,
2698, 1095])
# Bar chart of the number of flights per carrier.
sns.set_style('darkgrid')
# x and y are passed by keyword: seaborn deprecated positional data
# arguments (FutureWarning in 0.11) and from 0.12 only `data` may be
# positional.
sns.barplot(x=carrier_count.index, y=carrier_count.values, alpha=0.9)
plt.title('Frequency Distribution of Carriers')
plt.xlabel('Carriers', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()
/opt/hostedtoolcache/Python/3.9.13/x64/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# Pie chart of each carrier's share of flights, labelled with percentages.
plt.pie(
    x=carrier_count.values,
    labels=carrier_count.index,
    autopct='%1.1f%%',
    shadow=True,
)
# Equal axis scaling so the pie is drawn as a circle.
plt.axis('equal')
plt.show()
Encoding Categorical Data#
Replacing values#
# Map every carrier label to an integer: its position in the category
# listing produced by the `category` dtype.
carrier_labels = cat_df_flights['carrier'].astype('category').cat.categories.tolist()
replace_map_comp = {
    'carrier': dict(zip(carrier_labels, range(len(carrier_labels)))),
}
replace_map_comp
{'carrier': {'AA': 0,
'AS': 1,
'B6': 2,
'DL': 3,
'F9': 4,
'HA': 5,
'OO': 6,
'UA': 7,
'US': 8,
'VX': 9,
'WN': 10}}
# Apply the label -> integer mapping to the `carrier` column.
# `replace` returns a new frame, so `cat_df_flights` itself is left untouched.
cat_df_flights_replace = cat_df_flights.replace(replace_map_comp)
cat_df_flights_replace.head()
carrier | tailnum | origin | dest | |
---|---|---|---|---|
0 | 1 | N508AS | PDX | ANC |
1 | 8 | N195UW | SEA | CLT |
2 | 7 | N37422 | PDX | IAH |
3 | 8 | N547UW | PDX | CLT |
4 | 1 | N762AS | SEA | ANC |
cat_df_flights_replace['carrier'].dtypes
dtype('int64')
Encoding labels#
# Typecast the categorical features to the category dtype (intended to be
# faster than object dtype); `astype` returns a new frame, leaving
# `cat_df_flights` unchanged.
cat_df_flights_lc = cat_df_flights.astype({'carrier': 'category', 'origin': 'category'})
# Label-encode `carrier` by replacing each value with its integer category code.
cat_df_flights_lc['carrier'] = cat_df_flights_lc['carrier'].cat.codes
cat_df_flights_lc.head()
carrier | tailnum | origin | dest | |
---|---|---|---|---|
0 | 1 | N508AS | PDX | ANC |
1 | 8 | N195UW | SEA | CLT |
2 | 7 | N37422 | PDX | IAH |
3 | 8 | N547UW | PDX | CLT |
4 | 1 | N762AS | SEA | ANC |
# Binary indicator column: 1 when the carrier is exactly 'US', else 0.
cat_df_flights_specific = cat_df_flights.copy()
# Exact equality is used instead of `str.contains('US')`: `contains` does a
# regex substring match (it would also flag any longer code containing 'US')
# and yields NaN for missing values, which `np.where` treats as truthy.
cat_df_flights_specific['US_code'] = np.where(cat_df_flights_specific['carrier'] == 'US', 1, 0)
cat_df_flights_specific.head()
carrier | tailnum | origin | dest | US_code | |
---|---|---|---|---|---|
0 | AS | N508AS | PDX | ANC | 0 |
1 | US | N195UW | SEA | CLT | 1 |
2 | UA | N37422 | PDX | IAH | 0 |
3 | US | N547UW | PDX | CLT | 1 |
4 | AS | N762AS | SEA | ANC | 0 |
# Label-encode `carrier` with scikit-learn's LabelEncoder and attach the
# codes as a new `carrier_code` column on a copy of the categorical frame.
lb_make = LabelEncoder()
carrier_codes = lb_make.fit_transform(cat_df_flights['carrier'])
cat_df_flights_sklearn = cat_df_flights.assign(carrier_code=carrier_codes)
cat_df_flights_sklearn.head()
carrier | tailnum | origin | dest | carrier_code | |
---|---|---|---|---|---|
0 | AS | N508AS | PDX | ANC | 1 |
1 | US | N195UW | SEA | CLT | 8 |
2 | UA | N37422 | PDX | IAH | 7 |
3 | US | N547UW | PDX | CLT | 8 |
4 | AS | N762AS | SEA | ANC | 1 |
One-Hot encoding#
# One-hot encode `carrier` into `carrier_<code>` indicator columns.
# `get_dummies` returns a new frame, so the source frame is not modified.
# To encode `dest` as well, pass columns=['carrier', 'dest'] and
# prefix=['carrier', 'dest'].
cat_df_flights_onehot = pd.get_dummies(
    cat_df_flights,
    columns=['carrier'],
    prefix=['carrier'],
)
cat_df_flights_onehot.head()
tailnum | origin | dest | carrier_AA | carrier_AS | carrier_B6 | carrier_DL | carrier_F9 | carrier_HA | carrier_OO | carrier_UA | carrier_US | carrier_VX | carrier_WN | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | N508AS | PDX | ANC | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | N195UW | SEA | CLT | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | N37422 | PDX | IAH | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | N547UW | PDX | CLT | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | N762AS | SEA | ANC | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# One-hot encode `carrier` with scikit-learn's LabelBinarizer and wrap the
# resulting indicator matrix in a DataFrame, one column per carrier class.
cat_df_flights_onehot_sklearn = cat_df_flights.copy()
lb = LabelBinarizer()
lb.fit(cat_df_flights_onehot_sklearn['carrier'])
lb_result = lb.transform(cat_df_flights_onehot_sklearn['carrier'])
lb_result_df = pd.DataFrame(data=lb_result, columns=lb.classes_)
lb_result_df.head()
AA | AS | B6 | DL | F9 | HA | OO | UA | US | VX | WN | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
# Append the indicator columns to the original categorical frame.
# axis=1 concatenation aligns rows on the index; both frames are expected to
# share the same default integer index here, so rows pair up positionally.
result_df = pd.concat([cat_df_flights_onehot_sklearn, lb_result_df], axis=1)
result_df.head()
carrier | tailnum | origin | dest | AA | AS | B6 | DL | F9 | HA | OO | UA | US | VX | WN | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AS | N508AS | PDX | ANC | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | US | N195UW | SEA | CLT | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | UA | N37422 | PDX | IAH | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | US | N547UW | PDX | CLT | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | AS | N762AS | SEA | ANC | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Binary encoding#
# Binary-encode `carrier` with category_encoders on a copy of the frame.
# In the output below, `carrier` is replaced by the columns
# carrier_0 .. carrier_3; the other columns are passed through unchanged.
cat_df_flights_ce = cat_df_flights.copy()
encoder = ce.BinaryEncoder(cols=['carrier'])
df_binary = encoder.fit_transform(cat_df_flights_ce)
df_binary.head()
carrier_0 | carrier_1 | carrier_2 | carrier_3 | tailnum | origin | dest | |
---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 1 | N508AS | PDX | ANC |
1 | 0 | 0 | 1 | 0 | N195UW | SEA | CLT |
2 | 0 | 0 | 1 | 1 | N37422 | PDX | IAH |
3 | 0 | 0 | 1 | 0 | N547UW | PDX | CLT |
4 | 0 | 0 | 0 | 1 | N762AS | SEA | ANC |
Backward difference encoding#
# Backward-difference (contrast) encoding of `carrier`.
# Reuses `cat_df_flights_ce` from the binary-encoding cell and rebinds
# `encoder`. The output below gains an `intercept` column plus
# carrier_0 .. carrier_9 in place of `carrier`.
encoder = ce.BackwardDifferenceEncoder(cols=['carrier'])
df_bd = encoder.fit_transform(cat_df_flights_ce)
df_bd.head()
intercept | carrier_0 | carrier_1 | carrier_2 | carrier_3 | carrier_4 | carrier_5 | carrier_6 | carrier_7 | carrier_8 | carrier_9 | tailnum | origin | dest | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | -0.909091 | -0.818182 | -0.727273 | -0.636364 | -0.545455 | -0.454545 | -0.363636 | -0.272727 | -0.181818 | -0.090909 | N508AS | PDX | ANC |
1 | 1 | 0.090909 | -0.818182 | -0.727273 | -0.636364 | -0.545455 | -0.454545 | -0.363636 | -0.272727 | -0.181818 | -0.090909 | N195UW | SEA | CLT |
2 | 1 | 0.090909 | 0.181818 | -0.727273 | -0.636364 | -0.545455 | -0.454545 | -0.363636 | -0.272727 | -0.181818 | -0.090909 | N37422 | PDX | IAH |
3 | 1 | 0.090909 | -0.818182 | -0.727273 | -0.636364 | -0.545455 | -0.454545 | -0.363636 | -0.272727 | -0.181818 | -0.090909 | N547UW | PDX | CLT |
4 | 1 | -0.909091 | -0.818182 | -0.727273 | -0.636364 | -0.545455 | -0.454545 | -0.363636 | -0.272727 | -0.181818 | -0.090909 | N762AS | SEA | ANC |
Miscellaneous features#
# Toy frame of age ranges written as 'low-high' strings.
dummy_df_age = pd.DataFrame({'age': ['0-20', '20-40', '40-60', '60-80']})
# Split each range on '-' into its two bounds; the parts stay strings.
dummy_df_age[['start', 'end']] = dummy_df_age['age'].str.split('-', expand=True)
dummy_df_age.head()
age | start | end | |
---|---|---|---|
0 | 0-20 | 0 | 20 |
1 | 20-40 | 20 | 40 |
2 | 40-60 | 40 | 60 |
3 | 60-80 | 60 | 80 |
# Toy frame of age ranges written as 'low-high' strings.
dummy_df_age = pd.DataFrame({'age': ['0-20', '20-40', '40-60', '60-80']})


def split_mean(x):
    """Return the midpoint of a 'low-high' range string.

    Args:
        x: Range such as '20-40'; the first two '-'-separated parts are
            parsed as floats.

    Returns:
        The arithmetic mean of the two bounds as a float, e.g. 30.0 for
        '20-40'.

    Raises:
        IndexError: If `x` contains no '-' separator.
        ValueError: If either bound is not a valid number.
    """
    split_list = x.split('-')
    # Parenthesise the sum: without it only the upper bound is halved
    # (division binds tighter than addition), giving low + high / 2.
    mean = (float(split_list[0]) + float(split_list[1])) / 2
    return mean


# Replace each range by its midpoint; the function is passed directly, no
# lambda wrapper needed.
dummy_df_age['age_mean'] = dummy_df_age['age'].apply(split_mean)
dummy_df_age.head()
age | age_mean | |
---|---|---|
0 | 0-20 | 10.0 |
1 | 20-40 | 40.0 |
2 | 40-60 | 70.0 |
3 | 60-80 | 100.0 |