Sean Hall and Julia Baratta's Data Science Project

The goal of this project is to evaluate whether we can predict the size of an airline's arrival delay from information a flyer knows before the plane takes off. We chose to evaluate arrival delay (the difference between scheduled arrival and actual arrival) because we believe it is more important to know when a plane will land than when it will take off. This information could come in handy for a variety of reasons, including making connections and getting to weddings, funerals, and interviews on time.

To achieve this goal, we imported, cleaned, analyzed, and merged a dataset about flight information from 2013 to 2023 with datasets about weather in major US cities over the same period. We called this final merged dataset 'MLready', and it can be found using this link. We then used MLready to create a random forest regression machine learning model that could predict flight delays with an R^2 value of 0.93 on average. Finally, we used our model to predict the flight delays for both of our flights home for winter break.

Here's the link to our website: Visit Sean and Julia's Website

For more information, here's the link to our github repo: Visit Sean and Julia's GitHub Repo

In this cell, we import the necessary libraries and initialize the starting year and ending year. We chose 2013-2023 to investigate a solid ten-year span of airline data without clouding our conclusions with data from too far in the past.
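As a rough sketch, the setup cell might look something like the following (the exact libraries and variable names, such as start_year and end_year, are our illustrative choices):

```python
# Setup sketch: libraries used throughout the notebook and the year range we analyze.
import time

import pandas as pd
import matplotlib.pyplot as plt

start_year = 2013   # first year of airline data to include
end_year = 2023     # last year (only 8 months of 2023 data are available)
```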

From the large csv files, we selected relevant columns to read in. We also limited our dataset to include only flights to and from 10 major airports. Airport codes can be found on the Bureau of Transportation Statistics website (via the 'Get Lookup Table' link to the right of the Origin Airport ID field): https://www.transtats.bts.gov/DL_SelectFields.aspx?gnoyr_VQ=FGJ&QO_fu146_anzr=b0-gvzr
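For illustration, the column selection and airport filter might be set up like this; the column names follow the BTS on-time performance schema, and the airport IDs shown are a hypothetical top-10 list that should be checked against the lookup table linked above:

```python
# Columns to read from each BTS on-time performance CSV (illustrative subset).
use_cols = [
    "Year", "Month", "DayofMonth", "Reporting_Airline",
    "OriginAirportID", "DestAirportID",
    "DepDelayMinutes", "ArrDelayMinutes", "Cancelled",
]

# Hypothetical BTS airport IDs for the ten major airports we kept
# (verify against the BTS 'Get Lookup Table' linked above).
Top10AirportCodes = [
    10397,  # ATL
    11292,  # DEN
    11298,  # DFW
    12478,  # JFK
    12889,  # LAS
    12892,  # LAX
    13487,  # MSP
    13930,  # ORD
    14747,  # SEA
    14771,  # SFO
]
```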

Additionally, we chose to start a timer and initialize an empty list where the dataframes will be stored.

The main purpose of this cell is to filter the airline data to only include years 2013 to 2023 and flights flown through our chosen airports. Below, a for loop reads in the files from our computer and combines them into a single dataframe. Note: the year 2023 only has 8 months of data. This code will print an exception if a file was not downloaded properly. Additionally, only rows where the columns "DestAirportID" and "OriginAirportID" have a value in Top10AirportCodes are read into the large file. This cell also documents the elapsed time and lets us know when the program is complete. The dataset is then written to a file (called 10Airlines.csv) in the folder airline_data.
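A sketch of how that loop might look, building on the names defined above (the file naming convention is an assumption):

```python
# Read each monthly BTS file, keep only flights between our ten airports,
# and combine everything into one DataFrame.
start = time.time()   # timer from the previous cell
frames = []           # list that will hold one filtered DataFrame per file

for year in range(start_year, end_year + 1):
    months = range(1, 9) if year == 2023 else range(1, 13)  # 2023 has only 8 months
    for month in months:
        path = f"airline_data/On_Time_{year}_{month}.csv"   # hypothetical file naming
        try:
            chunk = pd.read_csv(path, usecols=use_cols, low_memory=False)
        except Exception as e:
            print(f"Problem reading {path}: {e}")            # flag badly downloaded files
            continue
        mask = (chunk["OriginAirportID"].isin(Top10AirportCodes)
                & chunk["DestAirportID"].isin(Top10AirportCodes))
        frames.append(chunk[mask])

df = pd.concat(frames, ignore_index=True)
df.to_csv("airline_data/10Airlines.csv", index=False)
print(f"Done in {time.time() - start:.1f} seconds")
```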

This cell reads in the DataFrame and the foreign keys, which map codes to strings, cleans the data, and fills in missing values where they make sense. Notably, we decided to drop the cancellation rows, as these would confuse the machine learning model for predicting delays.

2% of rows have NaN values. We looked for a pattern among these rows and found none. We decided that dropping these rows would not meaningfully affect our data, so we dropped them.
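A minimal sketch of this cleaning step, assuming the BTS column names used above:

```python
import pandas as pd

df = pd.read_csv("airline_data/10Airlines.csv")

# Cancelled flights have no arrival delay to predict, so drop them.
df = df[df["Cancelled"] == 0]

# Roughly 2% of the remaining rows contain NaNs; drop them after confirming
# they show no obvious pattern.
print(f"{df.isna().any(axis=1).mean():.1%} of rows contain at least one NaN")
df = df.dropna()
```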

Exploratory analysis: A brief look at correlations via a correlation matrix for numeric columns.
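For example, a quick correlation check could look like this (the target column name is an assumption):

```python
# Pairwise correlations among the numeric columns, sorted by their
# correlation with the arrival delay.
corr = df.select_dtypes(include="number").corr()
print(corr["ArrDelayMinutes"].sort_values(ascending=False))
```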

Then, we thought it was important to quantify what a good threshold for 'delay' would be. The question we hoped to answer here was: If a flight was delayed for 1 minute, is it really considered delayed? What about 2 minutes? 5? 10?

A 5-minute delay is a bit above the median, so this will be our threshold for a delay. We then create a new Boolean column for whether the flight was delayed or not. Then we rewrite the DataFrame into a csv for convenience. This csv is titled 'PreWeatherML' because the next step here is to merge this df with weather data in the 10 cities that we focused on.
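A sketch of that thresholding step, under the same column-name assumptions:

```python
# The median arrival delay sits a bit below 5 minutes, so flag anything above
# 5 minutes as delayed and save the result for the weather merge.
print("Median arrival delay:", df["ArrDelayMinutes"].median())
df["Delayed"] = df["ArrDelayMinutes"] > 5
df.to_csv("PreWeatherML.csv", index=False)
```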

Here's some additional exploration of the Airlines dataset, which highlights the average arrival delay per airline per year.
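One way to compute and plot that comparison (the airline and delay column names are assumptions):

```python
import matplotlib.pyplot as plt

# Mean arrival delay per airline per year, one line per airline.
avg_delay = (
    df.groupby(["Year", "Reporting_Airline"])["ArrDelayMinutes"]
      .mean()
      .unstack("Reporting_Airline")
)
ax = avg_delay.plot(figsize=(10, 5))
ax.set_ylabel("Mean arrival delay (minutes)")
ax.set_xlabel("Year")
plt.show()
```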

Lastly, we found some fun facts out of curiosity.

To prepare for merging, we first import the necessary libraries. We also added a line to ignore warnings to clean up the notebook.

Our second dataset consists of snow and rain fall in major US cities every day of the year from 2013 to 2023. Here, we have read in the file for New York City. Our goal is to use weather in the top 10 busiest airports in the US to generalize the effect that rain and snow have on airport delays in the US. In the graph below, we can see the yearly and seasonal variation in total precipitation. Orange spikes clearly indicate winters.

Also, here's a link to the September 2023 floods in NYC, which correspond to the large rain spike at the end of the data: https://en.wikipedia.org/wiki/September_2023_New_York_floods

Zoomed in, we can see the daily rain and snow in January of 2023. We can see here that there was no snow in January of 2023 in NYC.

Here we can see the correlation between delays leaving NYC and precipitation. Some spikes line up perfectly (Jan 2nd and 12th) and some spikes are shifted by one day. This indicates that weather can cause delays the same day or the day after. To incorporate this observation, we decided to combine the precipitation and snow for the last two days into separate variables for machine learning.

Additionally, we can see that a similar correlation exists between arrivals in NYC and precipitation.

In fact, delays leaving NYC and delays going into NYC are almost perfectly correlated. Clearly, delays are largely unaffected by whether a flight is arriving at or departing from the airport. Below are some additional relationships that can be derived from the weather dataset.

2016 & 2022 did not get a lot of rain.

Here's a link to a wiki page that talks about the 2016 drought in NY:

https://en.wikipedia.org/wiki/2016_New_York_drought

Here's a link to a drought declaration in the summer months of 2022:

https://www.dec.ny.gov/press/126319.html#:~:text=In%20August%202022%2C%20Governor%20Kathy,flows%20and%20low%20groundwater%20levels.

We can see here that snow levels have been decreasing on average. This is possibly a small-scale reflection of atmospheric warming. Here's a link to a blog post about how snow in New York is decreasing over the years:

https://medium.com/talk-new-york-city/is-snow-in-new-york-city-becoming-more-or-less-common-in-recent-years-0d4a6ed9bcdc

Now that we have inspected some trends in a subset of the weather data, we move forward with merging the weather and airline datasets. We chose this approach so that we can feed one dataframe into the ML model by selecting dataframe columns as variables. To prepare for merging, we read in the 10 weather datasets and renamed the columns to match the names in the airline dataset.

Additionally, we created two columns in the weather datasets that account for the weather for the last two days. Then, we checked for any NaN values.
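A sketch of this preparation, assuming NOAA-style daily summaries with DATE, PRCP, and SNOW columns and a hypothetical airport-to-file mapping:

```python
import pandas as pd

weather_files = {                      # hypothetical mapping: airport code -> weather CSV
    "JFK": "weather_data/nyc.csv",
    "ATL": "weather_data/atlanta.csv",
    # ... remaining eight airports ...
}

weather_dfs = []
for airport, path in weather_files.items():
    w = pd.read_csv(path, parse_dates=["DATE"]).sort_values("DATE")
    w["ORIGIN"] = airport              # renamed to match the airline data
    # Sum precipitation and snow over the current and previous day, capturing
    # both same-day and next-day weather effects.
    w["PRCP_2DAY"] = w["PRCP"].rolling(2, min_periods=1).sum()
    w["SNOW_2DAY"] = w["SNOW"].rolling(2, min_periods=1).sum()
    weather_dfs.append(w)

# Check each dataset for missing values.
for airport, w in zip(weather_files, weather_dfs):
    print(airport, "has", int(w.isna().sum().sum()), "NaN values")
```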

3 datasets have NaN values. 2 of them (SEW and DEW) are a small enough portion of the dataset that we're going to fill them with zero. The other (LAX) has almost all SNOW days filled with NaN values. Here's a link (https://www.laalmanac.com/weather/we17.php) to weather data reporting snow days in LA. Snow is infrequent enough that we will fill the NaN values with 0.

Next, we adjusted the columns of the 10Airlines (df) dataset to match those of the weather data. Primarily, this consisted of converting the month column using a month map (mm) and converting Day of Month and Year to the right format. Then, we created a date column that matches the weather data's 'DATE' column.
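A sketch of that date construction, with the month map and column names as assumptions:

```python
# Map month names to numbers (if needed) and build a DATE column that matches
# the weather data's DATE column.
mm = {"January": 1, "February": 2, "March": 3, "April": 4, "May": 5, "June": 6,
      "July": 7, "August": 8, "September": 9, "October": 10, "November": 11,
      "December": 12}

if df["Month"].dtype == object:
    df["Month"] = df["Month"].map(mm)

df["DATE"] = pd.to_datetime(dict(year=df["Year"].astype(int),
                                 month=df["Month"].astype(int),
                                 day=df["DayofMonth"].astype(int)))
```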

Then, we renamed the airline columns to match the weather data and concatenated every weather dataset into one large dataset titled 'concat_weather'. Finally, we can merge the two datasets. The first merge was done for the origin airport and joined on 'ORIGIN' and 'DATE'; this added weather data for the airport that the flight was departing from. Then, columns were renamed to indicate that they were associated with the origin to avoid confusion.

For the second merge, the 'ORIGIN' column of the 'concat_weather' dataset was renamed to 'DEST'. After the columns were prepped, the dataset was merged on 'DEST' and 'DATE'. This allowed the same weather data to act as data for both the origin and destination.
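A sketch of the two merges, continuing with the column names assumed above:

```python
# Stack all ten weather tables, then merge them onto the airline data twice:
# once keyed on the origin airport, once keyed on the destination airport.
concat_weather = pd.concat(weather_dfs, ignore_index=True)

# First merge: weather at the departure airport.
merged = df.merge(concat_weather, on=["ORIGIN", "DATE"], how="left")
merged = merged.rename(columns={"PRCP": "ORIGIN_PRCP", "SNOW": "ORIGIN_SNOW",
                                "PRCP_2DAY": "ORIGIN_PRCP_2DAY",
                                "SNOW_2DAY": "ORIGIN_SNOW_2DAY"})

# Second merge: the same weather table, re-keyed on the destination airport.
dest_weather = concat_weather.rename(columns={"ORIGIN": "DEST"})
merged = merged.merge(dest_weather, on=["DEST", "DATE"], how="left")
merged = merged.rename(columns={"PRCP": "DEST_PRCP", "SNOW": "DEST_SNOW",
                                "PRCP_2DAY": "DEST_PRCP_2DAY",
                                "SNOW_2DAY": "DEST_SNOW_2DAY"})
```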

The following cells check the correctness of the merge, cross-referencing the original data present in each individual weather dataset for both the origin airport and the destination airport. We do this at three random places within the entire dataset.

Finally, we set the index to datetime and wrote this merged DataFrame to one large final csv file, which acts as the input for our machine learning model.

To prepare for machine learning, we first import the necessary libraries, which include a OneHotEncoder, ColumnTransformer, and StandardScaler to dummy encode and scale the data within the pipeline. Then, we import the Random Forest Regressor to act as an extension of the Decision Tree learning method that we learned in class. Finally, we import mean_absolute_error and r2_score to evaluate the effectiveness of the model.
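Those imports look roughly like this:

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
```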

Here, the cleaned and merged dataset 'MLReady.csv' gets read in. Relevant features are defined as well as the variable that we will be predicting. We are quantitatively predicting the delay time, so we chose a regressor.

It's important to note here that we do include DepDelayMinutes as a variable in the regression model because it is data that a flyer would know before the flight takes off. We envision this being useful because a flight is often delayed incrementally, so if a flyer first hears that their flight has been delayed 15 minutes, they might want to know the likelihood that the flight will be delayed further based on other factors about the flight.

We then define the categorical and numerical features of the model and create a list to store the evaluation metrics for each year.

We transformed the categorical variables using One Hot Encoding. We also defined the scaler for the pipeline (Z-standardizing) and the preprocessor to deal with the categorical features, and created the model.
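A sketch of the feature setup and pipeline; the exact feature lists are assumptions based on the columns described above:

```python
ml = pd.read_csv("MLReady.csv")

# Assumed feature lists and target.
categorical_features = ["ORIGIN", "DEST", "Reporting_Airline"]
numeric_features = ["DepDelayMinutes",
                    "ORIGIN_PRCP_2DAY", "ORIGIN_SNOW_2DAY",
                    "DEST_PRCP_2DAY", "DEST_SNOW_2DAY"]
features = categorical_features + numeric_features
target = "ArrDelayMinutes"

# One-hot encode the categorical features and Z-standardize the numeric ones.
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", StandardScaler(), numeric_features),
])

model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("forest", RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=0)),
])

metrics = []   # evaluation metrics collected for each year
```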

In order to account for trends from the previous year and predict the delays of the next, we implemented a for loop that iterates over each year of the dataset, trains a model on that year, and then tests how accurately that model predicts the next year. This requires a rolling regression approach. The loop prints the model's error for the current iteration, and after the for loop terminates, prints the average.
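A sketch of that rolling loop (assuming MLReady keeps a Year column):

```python
# Train on each year and test on the following year.
for year in range(2013, 2023):
    train = ml[ml["Year"] == year]
    test = ml[ml["Year"] == year + 1]

    model.fit(train[features], train[target])
    preds = model.predict(test[features])

    mae = mean_absolute_error(test[target], preds)
    r2 = r2_score(test[target], preds)
    metrics.append({"train_year": year, "MAE": mae, "R2": r2})
    print(f"Trained on {year}, tested on {year + 1}: MAE={mae:.2f}, R^2={r2:.3f}")

results = pd.DataFrame(metrics)
print("Average MAE:", round(results["MAE"].mean(), 2),
      "| Average R^2:", round(results["R2"].mean(), 3))
```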

In the model, "n_estimators = 10" can be increased for more accurate results. However, due to the size of our data, even n_estimators = 10 takes 20+ minutes to run on an average computer, so we considered this sufficient.

Below is a visualization of the MAE and R^2 values of our model evaluated every year.

As we can see, the results are fairly accurate, with R^2 values averaging around 0.93.

To further test the model, we will evaluate two real flights (both of our returning flights for winter break). Because 2023 was an incomplete year, we will train on 2022. Below you will find the input for our flights back home. We will take this input and turn it into a separate DataFrame.
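As an illustration, a single flight can be framed as a one-row DataFrame and fed through the same pipeline; every value below is a placeholder rather than our actual itinerary:

```python
# Retrain on 2022 (the last complete year), then predict one upcoming flight.
train_2022 = ml[ml["Year"] == 2022]
model.fit(train_2022[features], train_2022[target])

flight = pd.DataFrame([{
    "ORIGIN": "JFK",             # placeholder departure airport
    "DEST": "ATL",               # placeholder destination airport
    "Reporting_Airline": "DL",   # placeholder carrier code
    "DepDelayMinutes": 0,        # departure delay known so far
    "ORIGIN_PRCP_2DAY": 0.0,     # forecast precipitation at the origin
    "ORIGIN_SNOW_2DAY": 0.0,
    "DEST_PRCP_2DAY": 0.3,       # forecast rain at the destination
    "DEST_SNOW_2DAY": 0.0,
}])

print("Predicted arrival delay (minutes):", model.predict(flight)[0])
```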

We will use the same model as before. Here we can see that Sean's flight was predicted to be delayed 3 minutes and Julia's was predicted to be delayed 11 minutes. We attribute this larger delay to the rain that was predicted to be in Washington DC on the day of travel (determined through a weather app).

Notably, because New Orleans and Philadelphia (Julia's destination) were not cities we considered, we replaced them with Atlanta and Washington DC, respectively (weather in ATL and DC was used as well to keep it accurate). This highlights one fault of our model: it only considers the 10 most popular airports.