US Flights of 2020 Jan
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
Load in your dataset and describe its properties through the questions below. Try and motivate your exploration goals through this section.
#load dataset
df = pd.read_csv('./62598021_T_ONTIME_REPORTING.csv')
df.head()
df.shape
df.dtypes
df.head(10)
# correcting dates formate
df['DEP_TIME'] = df.DEP_TIME.apply(lambda x: str(int(x)).zfill(4) if pd.notnull(x) else x)
df['CRS_DEP_TIME'] = df.CRS_DEP_TIME.apply(lambda x: str(int(x)).zfill(4) if pd.notnull(x) else x)
df['ARR_TIME'] = df.ARR_TIME.apply(lambda x: str(int(x)).zfill(4) if pd.notnull(x) else x)
df['CRS_ARR_TIME'] = df.CRS_ARR_TIME.apply(lambda x: str(int(x)).zfill(4) if pd.notnull(x) else x)
# creating AM/PM in df
def setPM(x):
if((x>=12) & (x<= 23)):
return 'PM'
elif((x>=0) & (x<= 11) | (x==24)):
return 'AM'
else:
return x
df.DEP_TIME.dropna(inplace=True)
df['DepTimePM'] = df.DEP_TIME.str[:2].astype('int64')
df['DepTimePM'] = df.DepTimePM.apply(setPM)
df.DepTimePM.dropna(inplace=True)
df.describe()
What is the structure of your dataset?
There are 607346 flight observations with 45 features in 2020 jan.
What is/are the main feature(s) of interest in your dataset?
delayed and canceled flights in terms fo carriers, locations & time.
What features in the dataset do you think will help support your investigation into your feature(s) of interest?
Arr_Delay, DepDelay, Cancelled, Month, Day_Of_Week, Dep_Time, Arr_Time, OP_UNIQUE_CARRIER, Origin & Dest.
#finding airline sizes
airlines = df.OP_UNIQUE_CARRIER.value_counts()
plt.title('Airline size')
plt.xlabel('Airline')
plt.ylabel('Flights')
plt.bar(airlines.index, airlines);
Flight quantity is different among airlines.
#finding flight size in every state
plt.figure(figsize=[20,4])
states = df.ORIGIN_STATE_ABR.value_counts()
# Add title and axis names
plt.title('Flights size in each state')
plt.xlabel('States')
plt.ylabel('Flights')
plt.bar(states.index, states);
Flight quantity is different among airlines.
#flights in week
width = 1
bins = np.arange(1, 9, width)
plt.hist(df.DAY_OF_WEEK, rwidth=.8 , bins=bins)
name_days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Add title and axis names
plt.title('Flights distributed during days of week')
plt.xlabel('Days')
plt.ylabel('Flights')
plt.xticks(np.arange(1.5,max(bins),width), name_days);
There are little drop of flights in weekend.
plt.title('Rate of cancelled flights')
plt.pie(df.CANCELLED.value_counts(), autopct='%.2f%%');
There are 1.14% flights were cancelled.
plt.figure(figsize=[15,4])
plt.subplot(2,1,1)
step = 5
x_min, x_max = -30, 110
bin_edges = np.arange(df.DEP_DELAY.min()-step, df.DEP_DELAY.max()+step, step)
plt.hist(df.DEP_DELAY, bins=bin_edges)
plt.xlim(x_min,x_max)
plt.xticks(np.arange(x_min, x_max, step))
# Add title and axis names
plt.title('Distribution of departure delay')
plt.ylabel('Flights')
plt.subplot(2,1,2)
plt.boxplot(x=df[pd.notnull(df.DEP_DELAY)].DEP_DELAY, vert=False, showfliers=False)
plt.xlim(x_min,x_max)
plt.xticks(np.arange(x_min, x_max, step));
There are delayed departures as well as earlier departures and the most of them are between -25 to 25 min differ of the scheduled time. However, there is a flight has been delayed more than 1 day and 17 hours, and also there is a flight had be earlier around 9 hours of the schedule time.
plt.figure(figsize=[15,4])
plt.subplot(2,1,1)
step = 5
x_min, x_max = -50, 160
bin_edges = np.arange(df.ARR_DELAY.min()-step, df.ARR_DELAY.max()+step, step)
plt.hist(df.ARR_DELAY, bins=bin_edges)
plt.xlim(x_min, x_max)
plt.xticks(np.arange(x_min, x_max, step))
# Add title and axis names
plt.title('Distribution of arrival delay')
plt.ylabel('Flights')
plt.subplot(2,1,2)
plt.boxplot(x=df[pd.notnull(df.ARR_DELAY)].ARR_DELAY, vert=False, showfliers=False)
plt.xlim(x_min, x_max)
plt.xticks(np.arange(x_min, x_max, step));
There are delayed arrivals as well as earlier arrivals and the most of them are between -45 to 45 min differ of the scheduled time.
sample = df.groupby('OP_UNIQUE_CARRIER')
sample = pd.merge(
pd.DataFrame(sample.CANCELLED.count()),
pd.DataFrame(sample.DEP_DELAY.sum()),
on='OP_UNIQUE_CARRIER'
).merge(
pd.DataFrame(sample.ARR_DELAY.sum()), on='OP_UNIQUE_CARRIER'
)
sb.pairplot(sample);
There are positive correlation coefficients among Cannelled, DepDelay, ArrDelay & flight quantity of airlines.
sample = df.groupby('ORIGIN_STATE_ABR')
sample = pd.merge(
pd.DataFrame(sample.CANCELLED.count()),
pd.DataFrame(sample.DEP_DELAY.sum()),
on='ORIGIN_STATE_ABR'
).merge(
pd.DataFrame(sample.ARR_DELAY.sum()), on='ORIGIN_STATE_ABR'
)
sb.pairplot(sample);
There are positive correlation coefficients among Cannelled, DepDelay, ArrDelay & flight quantity of states.
state_airlines = df.groupby(['ORIGIN_STATE_ABR', 'OP_UNIQUE_CARRIER'], as_index=False).count()
state_airlines = state_airlines[['ORIGIN_STATE_ABR', 'OP_UNIQUE_CARRIER']]
plt.figure(figsize=[17,5])
# Add title and axis names
plt.title('Relationship between states & airlines')
plt.xlabel('States')
plt.ylabel('Airlines')
plt.scatter(state_airlines.ORIGIN_STATE_ABR, state_airlines.OP_UNIQUE_CARRIER);
The most of airlines reach the most of states.
carriers = df.groupby(['OP_UNIQUE_CARRIER','CANCELLED'], as_index=False).count()
carriers = carriers.pivot_table('YEAR', 'OP_UNIQUE_CARRIER', 'CANCELLED', fill_value=0)
carriers.rename(columns={0:'cxl_0', 1:'cxl_1'}, inplace=True)
plt.figure(figsize=[12,10])
carriers
plt.subplot(1,4,2)
plt.barh(carriers.index, carriers.cxl_0, color='C0')
plt.subplot(1,4,1)
plt.barh(carriers.index, carriers.cxl_1*-1, color='C1')
plt.yticks(' ')
carriers.cxl_0 = np.log10(carriers.cxl_0)
carriers.cxl_1 = np.log10(carriers.cxl_1)
plt.subplot(1,4,4)
plt.barh(carriers.index, carriers.cxl_0, color='C0')
plt.xlim(0, 7)
plt.subplot(1,4,3)
plt.barh(carriers.index, carriers.cxl_1*-1, color='C1')
plt.xlim(-7, 0)
plt.yticks(' ');
Normalizing data by log10 make it more informative.
airlines_mean = df.groupby('OP_UNIQUE_CARRIER', as_index=False).mean()
airlines_count = df.groupby('OP_UNIQUE_CARRIER', as_index=False).count()
airlines = airlines_mean[['OP_UNIQUE_CARRIER','ARR_DELAY','CANCELLED']].merge(
airlines_count[['OP_UNIQUE_CARRIER','FLIGHTS']], on='OP_UNIQUE_CARRIER'
)
airlines.rename(columns={'FLIGHTS':'Count'}, inplace=True)
airlines['Size'] = airlines.Count.map(lambda x: round(x/120000)+1)
airlines['Size'] = airlines.Size.map(lambda x: 7 if x>7 else x)
plt.scatter(airlines.ARR_DELAY, airlines.CANCELLED, s=airlines.Size**3)
x_max = max(airlines.ARR_DELAY)
x_min = min(airlines.ARR_DELAY)
y_max = max(airlines.CANCELLED)
y_min = min(airlines.CANCELLED)
x_len = x_max - x_min
y_len = y_max - y_min
x = x_len*.05
y = y_len*.05
plt.xlim(x_min-x, x_max+x)
plt.ylim(y_min-y, y_max+y)
plt.axhline(y=y_max-(y_len/2))
plt.axvline(x_max - (x_len/2))
for i, txt in enumerate(airlines.OP_UNIQUE_CARRIER):
plt.annotate(txt, (airlines.ARR_DELAY[i], airlines.CANCELLED[i]));
delay factor is impact on the most airlines while cancellation is impact on around all of them.
state_mean = df.groupby('ORIGIN_STATE_ABR', as_index=False).mean()
state_count = df.groupby('ORIGIN_STATE_ABR', as_index=False).count()
states = state_mean[['ORIGIN_STATE_ABR','ARR_DELAY','CANCELLED']].merge(
state_count[['ORIGIN_STATE_ABR','FLIGHTS']], on='ORIGIN_STATE_ABR'
)
states.rename(columns={'FLIGHTS':'Count'}, inplace=True)
states['Size'] = states.Count.map(lambda x: round(x/120000)+1)
states['Size'] = states.Size.map(lambda x: 7 if x>7 else x)
plt.scatter(states.ARR_DELAY, states.CANCELLED, s=states.Size**3)
x_max = max(states.ARR_DELAY)
x_min = min(states.ARR_DELAY)
y_max = max(states.CANCELLED)
y_min = min(states.CANCELLED)
x_len = x_max - x_min
y_len = y_max - y_min
x = x_len*.05
y = y_len*.05
plt.xlim(x_min-x, x_max+x)
plt.ylim(y_min-y, y_max+y)
plt.axhline(y=y_max-(y_len/2))
plt.axvline(x_max - (x_len/2))
for i, txt in enumerate(states.ORIGIN_STATE_ABR):
plt.annotate(txt, (states.ARR_DELAY[i], states.CANCELLED[i]));
There is variation among states in terms of delayed and cancelled flightes and also the quantity of flights.
flights_mean = df.groupby(['OP_CARRIER_FL_NUM','DepTimePM'], as_index=False).mean()
flights_count = df.groupby('OP_CARRIER_FL_NUM', as_index=False).count()
flights = flights_mean[['OP_CARRIER_FL_NUM','DepTimePM', 'DEP_DELAY','CANCELLED']].merge(
flights_count[['OP_CARRIER_FL_NUM','FLIGHTS']], on='OP_CARRIER_FL_NUM'
)
flights.rename(columns={'FLIGHTS':'Count'}, inplace=True)
Q1 = flights.quantile(.25)
Q3 = flights.quantile(.75)
IQ = Q3-Q1
Q_max = Q3+IQ*1.5
Q_min = Q1-IQ*1.5
flights_filtered = flights[(flights.DEP_DELAY > Q_min.DEP_DELAY) & (flights.DEP_DELAY < Q_max.DEP_DELAY)]
flights_filtered = flights_filtered[
(flights_filtered.Count > Q_min.Count) & (flights_filtered.Count < Q_max.Count)
]
flights_no_cxl = flights_filtered[flights_filtered.CANCELLED == 0]
flights_filtered = flights_filtered[flights_filtered.CANCELLED > 0]
flights_filtered.reset_index(inplace=True)
flights_filtered = flights[flights.CANCELLED > 0]
sb.scatterplot(flights_filtered.DEP_DELAY, flights_filtered.CANCELLED, alpha=.5, hue=flights_filtered.DepTimePM)
# x_max = max(flights_filtered.DepDelay)
x_max = 60
x_min = min(flights_filtered.DEP_DELAY)
# y_max = max(flights_filtered.Cancelled)
y_max = .04
y_min = min(flights_filtered.CANCELLED)
x_len = x_max - x_min
y_len = y_max - y_min
x = x_len*.05
y = y_len*.05
plt.xlim(x_min-x, x_max+x)
plt.ylim(y_min-y, y_max+y)
plt.axhline(y=y_max-(y_len/2))
plt.axvline(x=x_max - (x_len/2));
Cancelled flights are in AM flights, while PM flights have more delayed flights.
At the end of your report, make sure that you export the notebook as an html file from the
File > Download as... > HTML
menu. Make sure you keep track of where the exported file goes, so you can put it in the same folder as this notebook for project submission. Also, make sure you remove all of the quote-formatted guide notes like this one before you finish your report!