0. ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ & ๋ชจ๋“ˆ import

### ML๊ด€๋ จ
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### ์‚ฌ์ดํ‚ท๋Ÿฐ
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer # sklearn ๋ฒ„์ „์œผ๋กœ ์ธํ•ด ๋ฐ”๊ฟ”์ค˜์•ผ ํ•จ
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

### ์‹œ๊ฐ„ ๊ด€๋ จ
from datetime import datetime
import calendar
import matplotlib.dates as mdates
import matplotlib as mpl
from datetime import timedelta
import datetime as dt

### ์ˆ˜ํ•™์  ๊ณ„์‚ฐ ๊ด€๋ จ
from math import sin, cos, sqrt, atan2, radians

### ์ง€๋„ ์‹œ๊ฐํ™” ๊ด€๋ จ
import folium
from folium import FeatureGroup, LayerControl, Map, Marker
from folium.plugins import HeatMap

### ํŒŒ์ผ ์ž…์ถœ๋ ฅ ๊ด€๋ จ
import pickle

### ์˜ค๋ฅ˜ ๊ด€๋ จ
import warnings
warnings.filterwarnings('ignore')

### ์˜ต์…˜ ์„ค์ •
pd.set_option('display.max_colwidth', -1)
plt.style.use('fivethirtyeight')

1. ๋ฐ์ดํ„ฐ ์ค€๋น„ํ•˜๊ธฐ

  • ์ฃผ์ œ: ๋‰ด์š•์‹œ์—์„œ ํƒ์‹œ ์—ฌํ–‰์˜ ์ด ์Šน์ฐจ ์‹œ๊ฐ„์„ ์˜ˆ์ธกํ•˜๋Š” ๋ชจ๋ธ์„ ๊ตฌ์ถ•ํ•˜๋Š” ๊ฒƒ
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
### ๋ฐ์ดํ„ฐ ๋ถˆ๋Ÿฌ์˜ค๊ธฐ

train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ECC 48แ„€แ…ต แ„ƒแ…ฆแ„€แ…ชB/9แ„Œแ…ฎแ„Žแ…ก/data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ECC 48แ„€แ…ต แ„ƒแ…ฆแ„€แ…ชB/9แ„Œแ…ฎแ„Žแ…ก/data/test.csv")
### ๋ฐ์ดํ„ฐ ํ˜•ํƒœ ํ™•์ธํ•˜๊ธฐ

print('train shape : ', train.shape, 'test shape : ', test.shape)
train shape :  (1458644, 11) test shape :  (625134, 9)
### ๋ฐ์ดํ„ฐ ๋ฏธ๋ฆฌ๋ณด๊ธฐ

train.head()
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N 663
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N 2124
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 N 429
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 N 435
test.head()
id vendor_id pickup_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag
0 id3004672 1 2016-06-30 23:59:58 1 -73.988129 40.732029 -73.990173 40.756680 N
1 id3505355 1 2016-06-30 23:59:53 1 -73.964203 40.679993 -73.959808 40.655403 N
2 id1217141 1 2016-06-30 23:59:47 1 -73.997437 40.737583 -73.986160 40.729523 N
3 id2150126 2 2016-06-30 23:59:41 1 -73.956070 40.771900 -73.986427 40.730469 N
4 id1598245 1 2016-06-30 23:59:33 1 -73.970215 40.761475 -73.961510 40.755890 N

1-1. Data Description

  • id: ๊ฐ trip์— ๋Œ€ํ•œ ๊ณ ์œ  ์‹๋ณ„์ž

  • vendor_id: ์ฃผํ–‰ ๊ธฐ๋ก๊ณผ ์—ฐ๊ฒฐ๋œ ์ œ๊ณต์ž๋ฅผ ๋‚˜ํƒ€๋‚ด๋Š” ์ฝ”๋“œ

  • pick_datetime: ๋ฏธํ„ฐ๊ธฐ๊ฐ€ ์ž‘๋™๋œ ๋‚ ์งœ ๋ฐ ์‹œ๊ฐ„

  • dropoff_datetime: ๋ฏธํ„ฐ๊ธฐ๊ฐ€ ํ•ด์ œ๋œ ๋‚ ์งœ ๋ฐ ์‹œ๊ฐ„

  • passenger_count: ์ฐจ๋Ÿ‰์— ํƒ‘์Šนํ•œ ์Šน๊ฐ ์ˆ˜(์šด์ „์ž ์ž…๋ ฅ ๊ฐ’)

  • pickup_longitude: ๋ฏธํ„ฐ๊ธฐ๊ฐ€ ๊ฑธ๋ ค ์žˆ๋˜ ๊ฒฝ๋„

  • pickup_latitude: ๋ฏธํ„ฐ๊ธฐ๊ฐ€ ์ž‘๋™๋œ ์œ„๋„

  • dropoff_longitude: ๋ฏธํ„ฐ๊ธฐ๊ฐ€ ํ•ด์ œ๋œ ๊ฒฝ๋„

  • dropoff_latitude: ๋ฏธํ„ฐ๊ธฐ๊ฐ€ ํ•ด์ œ๋œ ์œ„๋„

  • store_and_fwd_flag: ์ด ํ”Œ๋ž˜๊ทธ๋Š” ์ฐจ๋Ÿ‰์ด ์„œ๋ฒ„์™€ ์—ฐ๊ฒฐ๋˜์ง€ ์•Š์•˜๊ธฐ ๋•Œ๋ฌธ์— ๊ณต๊ธ‰์—…์ฒด์— ์ „์†กํ•˜๊ธฐ ์ „์— ํŠธ๋ฆฝ ๋ ˆ์ฝ”๋“œ๋ฅผ ์ฐจ๋Ÿ‰ ๋ฉ”๋ชจ๋ฆฌ์— ๋ณด๊ด€ํ–ˆ๋Š”์ง€ ์—ฌ๋ถ€๋ฅผ ํ‘œ์‹œํ•จ

    • Y: store and forward/ N: store ๋ฐ Forward trip
  • trip_timeout: ์—ฌํ–‰ ๊ธฐ๊ฐ„(์ดˆ)

1-2. ์ ์ ˆํ•œ ๋ฐ์ดํ„ฐํ˜•์œผ๋กœ ๋ณ€ํ™˜

train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'],format = '%Y-%m-%d %H:%M:%S')
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'],format = '%Y-%m-%d %H:%M:%S')
train.head()
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N 663
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N 2124
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 N 429
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 N 435
  • ์ œ๋Œ€๋กœ ๋ณ€ํ™˜๋œ ๊ฒƒ์„ ํ™•์ธํ•  ์ˆ˜ ์žˆ๋‹ค.

2. ๋ฐ์ดํ„ฐ ์ •๋ฆฌ

2-1. ๊ฒฐ์ธก์น˜ ํ™•์ธ

train[pd.isnull(train)].sum()
id                    0  
vendor_id             0.0
passenger_count       0.0
pickup_longitude      0.0
pickup_latitude       0.0
dropoff_longitude     0.0
dropoff_latitude      0.0
store_and_fwd_flag    0  
trip_duration         0.0
dtype: object
  • ๊ฒฐ์ธก์น˜๋Š” ์—†์Œ์„ ํ™•์ธํ•  ์ˆ˜ ์žˆ๋‹ค.

2-2. ๋ฐ์ดํ„ฐ๊ฐ€ ์ธก์ •๋œ ๊ธฐ๊ฐ„

print("Min pickup time:",min(train['pickup_datetime']))
print("Max pickup time:",max(train['pickup_datetime']))
Min pickup time: 2016-01-01 00:00:17
Max pickup time: 2016-06-30 23:59:39
  • 2016/01/01๋ถ€ํ„ฐ 2016/06/30๊นŒ์ง€ ๊ธฐ๋ก๋œ ๋ฐ์ดํ„ฐ์ด๋‹ค.

2-3. ๋ฐ์ดํ„ฐ ๊ฐ€๊ณต

a) datetime์—์„œ day, month, hour ์ •๋ณด๋ฅผ ์ƒ์„ฑ

train['pickup_date'] = train['pickup_datetime'].dt.date
train['pickup_day'] = train['pickup_datetime'].apply(lambda x:x.day)
train['pickup_hour'] = train['pickup_datetime'].apply(lambda x:x.hour)
train['pickup_day_of_week'] = train['pickup_datetime'].apply(lambda x:calendar.day_name[x.weekday()])

train['dropoff_date'] = train['dropoff_datetime'].dt.date
train['dropoff_day'] = train['dropoff_datetime'].apply(lambda x:x.day)
train['dropoff_hour'] = train['dropoff_datetime'].apply(lambda x:x.hour)
train['dropoff_day_of_week'] = train['dropoff_datetime'].apply(lambda x:calendar.day_name[x.weekday()])
train.head(3)
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration pickup_date pickup_day pickup_hour pickup_day_of_week dropoff_date dropoff_day dropoff_hour dropoff_day_of_week
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N 455 2016-03-14 14 17 Monday 2016-03-14 14 17 Monday
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N 663 2016-06-12 12 0 Sunday 2016-06-12 12 0 Sunday
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N 2124 2016-01-19 19 11 Tuesday 2016-01-19 19 12 Tuesday

b) ์œ„๋„/๊ฒฝ๋„ ์ •๋ณด ์ˆ˜์ •

# ์œ„๋„ ๊ฒฝ๋„ ๋ณ€์ˆ˜ ์†Œ์ˆ˜์  ์ดํ•˜ 3์ž๋ฆฌ๊นŒ์ง€ ๋ฐ˜์˜ฌ๋ฆผ

train['pickup_latitude_round3'] = train['pickup_latitude'].apply(lambda x:round(x,3))
train['pickup_longitude_round3'] = train['pickup_longitude'].apply(lambda x:round(x,3))
train['dropoff_latitude_round3'] = train['dropoff_latitude'].apply(lambda x:round(x,3))
train['dropoff_longitude_round3'] = train['dropoff_longitude'].apply(lambda x:round(x,3))

train.head()
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... pickup_hour pickup_day_of_week dropoff_date dropoff_day dropoff_hour dropoff_day_of_week pickup_latitude_round3 pickup_longitude_round3 dropoff_latitude_round3 dropoff_longitude_round3
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N ... 17 Monday 2016-03-14 14 17 Monday 40.768 -73.982 40.766 -73.965
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N ... 0 Sunday 2016-06-12 12 0 Sunday 40.739 -73.980 40.731 -73.999
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N ... 11 Tuesday 2016-01-19 19 12 Tuesday 40.764 -73.979 40.710 -74.005
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 N ... 19 Wednesday 2016-04-06 6 19 Wednesday 40.720 -74.010 40.707 -74.012
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 N ... 13 Saturday 2016-03-26 26 13 Saturday 40.793 -73.973 40.783 -73.973

5 rows ร— 23 columns

c) ์œ„๋„ ๊ฒฝ๋„๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ km ๋‹จ์œ„ ๊ฑฐ๋ฆฌ๋กœ ๊ณ„์‚ฐํ•˜๊ธฐ

### ํ•จ์ˆ˜ ์ •์˜
# lambda ํ•จ์ˆ˜์œผ ์ธ์ž๋กœ ๋„ฃ์–ด ๋ฐ์ดํ„ฐ๋ฅผ ์‰ฝ๊ฒŒ ๋ณ€๊ฒฝํ•˜๊ธฐ ์œ„ํ•ด์„œ!

def calculateDistance(row):
    R = 6373.0 # ๋Œ€๋žต์ ์ธ ์ง€๊ตฌ์˜ ๋ฐ˜์ง€๋ฆ„(์ƒ์ˆ˜)
    
    # 60๋ถ„๋ฒ• -> ํ˜ธ๋„๋ฒ•(๋ผ๋””์•ˆ ๊ฐ)
    pickup_lat = radians(row['pickup_latitude'])
    pickup_lon = radians(row['pickup_longitude'])
    dropoff_lat = radians(row['dropoff_latitude'])
    dropoff_lon = radians(row['dropoff_longitude'])

    dlon = dropoff_lon - pickup_lon # ๊ฒฝ๋„
    dlat = dropoff_lat - pickup_lat # ์œ„๋„

    ### ๊ฑฐ๋ฆฌ ๊ณ„์‚ฐ
    a = sin(dlat / 2)**2 + cos(pickup_lat) * cos(dropoff_lat) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    
    return distance
train['trip_distance'] = train.apply(lambda row: calculateDistance(row), axis = 1)
train.head()
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... pickup_day_of_week dropoff_date dropoff_day dropoff_hour dropoff_day_of_week pickup_latitude_round3 pickup_longitude_round3 dropoff_latitude_round3 dropoff_longitude_round3 trip_distance
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N ... Monday 2016-03-14 14 17 Monday 40.768 -73.982 40.766 -73.965 1.498991
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N ... Sunday 2016-06-12 12 0 Sunday 40.739 -73.980 40.731 -73.999 1.806074
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N ... Tuesday 2016-01-19 19 12 Tuesday 40.764 -73.979 40.710 -74.005 6.387103
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 N ... Wednesday 2016-04-06 6 19 Wednesday 40.720 -74.010 40.707 -74.012 1.485965
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 N ... Saturday 2016-03-26 26 13 Saturday 40.793 -73.973 40.783 -73.973 1.188962

5 rows ร— 24 columns

train['trip_duration_in_hour'] = train['trip_duration'].apply(lambda x:x/3600)
train.head()
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... dropoff_date dropoff_day dropoff_hour dropoff_day_of_week pickup_latitude_round3 pickup_longitude_round3 dropoff_latitude_round3 dropoff_longitude_round3 trip_distance trip_duration_in_hour
0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 -73.982155 40.767937 -73.964630 40.765602 N ... 2016-03-14 14 17 Monday 40.768 -73.982 40.766 -73.965 1.498991 0.126389
1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 1 -73.980415 40.738564 -73.999481 40.731152 N ... 2016-06-12 12 0 Sunday 40.739 -73.980 40.731 -73.999 1.806074 0.184167
2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 1 -73.979027 40.763939 -74.005333 40.710087 N ... 2016-01-19 19 12 Tuesday 40.764 -73.979 40.710 -74.005 6.387103 0.590000
3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 1 -74.010040 40.719971 -74.012268 40.706718 N ... 2016-04-06 6 19 Wednesday 40.720 -74.010 40.707 -74.012 1.485965 0.119167
4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 1 -73.973053 40.793209 -73.972923 40.782520 N ... 2016-03-26 26 13 Saturday 40.793 -73.973 40.783 -73.973 1.188962 0.120833

5 rows ร— 25 columns

3. EDA(Exploratory Data Analysis)

3-1. ๋ฐ์ดํ„ฐ ๋ถ„ํฌ ํ™•์ธ

### ์—ฌํ–‰ ์ง€์† ์‹œ๊ฐ„ ๋ถ„ํฌ

plt.figure(figsize = (8,5))
sns.distplot(train['trip_duration_in_hour']).set_title("Distribution of Trip Duration")
plt.xlabel("Trip Duration (in hour)")
Text(0.5, 0, 'Trip Duration (in hour)')

  • ์—ฌํ–‰ ์ง€์†์‹œ๊ฐ„์ด 24์‹œ๊ฐ„ ์ด์ƒ์ธ ๋ฐ์ดํ„ฐ๋“ค์ด ์กด์žฌํ•จ
outlier_trip_duration = train.loc[train['trip_duration_in_hour'] > 24]
outlier_trip_duration
id vendor_id pickup_datetime dropoff_datetime passenger_count pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag ... dropoff_date dropoff_day dropoff_hour dropoff_day_of_week pickup_latitude_round3 pickup_longitude_round3 dropoff_latitude_round3 dropoff_longitude_round3 trip_distance trip_duration_in_hour
355003 id1864733 1 2016-01-05 00:19:42 2016-01-27 11:08:38 1 -73.789650 40.643559 -73.956810 40.773087 N ... 2016-01-27 27 11 Wednesday 40.644 -73.790 40.773 -73.957 20.154989 538.815556
680594 id0369307 1 2016-02-13 22:38:00 2016-03-08 15:57:38 2 -73.921677 40.735252 -73.984749 40.759979 N ... 2016-03-08 8 15 Tuesday 40.735 -73.922 40.760 -73.985 5.984365 569.327222
924150 id1325766 1 2016-01-05 06:14:15 2016-01-31 01:01:07 1 -73.983788 40.742325 -73.985489 40.727676 N ... 2016-01-31 31 1 Sunday 40.742 -73.984 40.728 -73.985 1.635641 618.781111
978383 id0053347 1 2016-02-13 22:46:52 2016-03-25 18:18:14 1 -73.783905 40.648632 -73.978271 40.750202 N ... 2016-03-25 25 18 Friday 40.649 -73.784 40.750 -73.978 19.906909 979.522778

4 rows ร— 25 columns

  • ์—ฌํ–‰ ๊ธฐ๊ฐ„์ด ๋งค์šฐ ๊ธด 4๊ฐœ์˜ ๊ธฐ๋ก์ด ์žˆ์Œ

    • ํ•˜์ง€๋งŒ ์—ฌํ–‰ ๊ฑฐ๋ฆฌ๋Š” ๋งค์šฐ ์งง์Œ

    • ํ•ด๋‹น ๋ฐ์ดํ„ฐ๋“ค์„ ์ด์ƒ์น˜๋ผ ํŒ๋‹จํ•  ์ˆ˜ ์žˆ์Œ

  • ์—ฌํ–‰ ๊ธฐ๊ฐ„ ๋˜ํ•œ ์™œ๊ณก๋˜์–ด ์žˆ์Œ

=> ๋กœ๊ทธ ๋ณ€ํ™˜์„ ์ˆ˜ํ–‰

### ๋กœ๊ทธ ๋ณ€ํ™˜ ํ›„ ๋ฐ์ดํ„ฐ ๋ถ„ํฌ ํ™•์ธ

plt.figure(figsize = (8,5))
sns.distplot(np.log(train['trip_duration'].values)).set_title("Distribution of Trip Duration")
plt.title("Distribution of trip duration (sec) in Log Scale")
Text(0.5, 1.0, 'Distribution of trip duration (sec) in Log Scale')

  • ์—ฌํ–‰ ์ง€์†์‹œ๊ฐ„ ์ปฌ๋Ÿผ์„ ๋กœ๊ทธ ๋ณ€ํ™˜ํ•œ ๊ฒฐ๊ณผ ๋ฐ์ดํ„ฐ์˜ ๋ถ„ํฌ๊ฐ€ ์ •๊ทœ๋ถ„ํฌ๋ฅผ ๋”ฐ๋ฆ„

  • ๋Œ€๋ถ€๋ถ„์˜ ์—ฌํ–‰์€ 54์ดˆ(4)์—์„œ 2980์ดˆ(8) ์‚ฌ์ด์— ์žˆ์Œ

    • ๋Œ€๋ถ€๋ถ„์˜ ์—ฌํ–‰์€ 1์‹œ๊ฐ„ ์ด๋‚ด๋ผ๋Š” ์ ์„ ์‹œ์‚ฌ
  • ํ•˜์ง€๋งŒ, 1๋ถ„์ด ์ฑ„ ๋˜์ง€ ์•Š๋Š” ์—ฌํ–‰๋“ค๊ณผ 100์‹œ๊ฐ„ ๋™์•ˆ ์ง€์†๋˜๋Š” ์—ฌํ–‰๋“ค๋„ ์กด์žฌ

    • ์ด์ƒ์น˜์ผ ๊ฐ€๋Šฅ์„ฑ์ด ๋†’์Œ

3-2. ์žฅ์†Œ

a) pickup๊ณผ dropoff์ด ๊ณตํ†ต์ ์œผ๋กœ ๋ฐœ์ƒํ•˜๋Š” ์žฅ์†Œ๋“ค์— ๋Œ€ํ•œ heatmap

pickup = train.groupby(['pickup_latitude_round3','pickup_longitude_round3'])['id'].count().reset_index().rename(columns = {'id':'Num_Trips'})
pickup.head()
pickup_latitude_round3 pickup_longitude_round3 Num_Trips
0 34.360 -65.848 1
1 34.712 -75.354 1
2 35.082 -71.800 1
3 35.310 -72.074 1
4 36.029 -77.441 1
### folium(์ง€๋„ ์‹œ๊ฐํ™” ํˆด)์„ ํ™œ์šฉํ•˜์—ฌ ์‹œ๊ฐํ™”
# pickup ์žฅ์†Œ

pickup_map = folium.Map(location = [40.730610,-73.935242],
                        zoom_start = 10,)

# print(pickup.shape)

### ๊ฐ pickup ์ง€์ ์„ ์›ํ˜• ๋งˆ์ปค๋กœ ํ‘œ๊ธฐ
'''
for index, row in pickup.iterrows():
    
    folium.CircleMarker([row['pickup_latitude_round3'], row['pickup_longitude_round3']],
                        radius=3,
                        
                        fill_color="#3db7e4", 
                        fill_opacity=0.9
                       ).add_to(pickup_map)
    count = count + 1


'''

hm_wide = HeatMap(list(zip(pickup.pickup_latitude_round3.values, pickup.pickup_longitude_round3.values, np.array(pickup.Num_Trips.values).astype('float64'))),
                     min_opacity = 0.2,
                     radius = 5, blur = 15,
                     max_zoom = 1 
                 )
pickup_map.add_child(hm_wide)

pickup_map 
Make this Notebook Trusted to load map: File -> Trust Notebook
city_long_border = (-74.03, -73.75) # ๊ฒฝ๋„ ๋ฒ”์œ„
city_lat_border = (40.63, 40.85) # ์œ„๋„ ๋ฒ”์œ„
fig, ax = plt.subplots(ncols = 1, sharex = True, sharey = True)
ax.scatter(train['pickup_longitude'], train['pickup_latitude'],
              color = 'blue', label = 'train', alpha = 0.1)

fig.suptitle('Lat Lng of Pickups in Train Data as Scatter Plot')

ax.set_ylabel('latitude')
ax.set_xlabel('longitude')
plt.ylim(city_lat_border)
plt.xlim(city_long_border)
(-74.03, -73.75)

  • JFK ๊ทผ์ฒ˜์˜ ํ”ฝ์—… ๋ฐ€๋„๊ฐ€ ๋†’์€ ๊ฒƒ์„ ๋ช…ํ™•ํ•˜๊ฒŒ ๋“œ๋Ÿฌ๋‚จ
drop = train.groupby(['dropoff_latitude_round3','dropoff_longitude_round3'])['id'].count().reset_index().rename(columns = {'id':'Num_Trips'})
### dropout ์žฅ์†Œ ์‹œ๊ฐํ™”

drop_map = folium.Map(location = [40.730610,-73.935242],zoom_start = 10,)
#print(pickup.shape)
### For each pickup point add a circlemarker
'''
for index, row in drop.iterrows():
    
    folium.CircleMarker([row['dropoff_latitude_round3'], row['dropoff_longitude_round3']],
                        radius=3,
                        
                        color="#008000", 
                        fill_opacity=0.9
                       ).add_to(drop_map)
    count=count + 1

'''
hm_wide = HeatMap(list(zip(drop.dropoff_latitude_round3.values, drop.dropoff_longitude_round3.values, np.array(drop.Num_Trips.values).astype('float64'))),
                  min_opacity = 0.2,
                  radius = 5, blur = 15,
                  max_zoom = 1 
                 )
drop_map.add_child(hm_wide)

drop_map
Make this Notebook Trusted to load map: File -> Trust Notebook
  • pickup ์žฅ์†Œ์™€ dropoff ์žฅ์†Œ๊ฐ€ ๋น„์Šทํ•จ

b) pickup์ด point์—์„œ ์‹œ์ž‘๋  ๋•Œ ์—ฌํ–‰ ์ง€์† ์‹œ๊ฐ„์˜ heatmap

pickup = train.groupby(['pickup_latitude_round3','pickup_longitude_round3'])['trip_duration'].mean().reset_index().rename(columns = {'trip_duration':'Avg_Trip_duration'})
### folium์œผ๋กœ ์‹œ๊ฐํ™”

pickup_map = folium.Map(location = [40.730610, -73.935242], zoom_start = 10,)


hm_wide = HeatMap(list(zip(pickup.pickup_latitude_round3.values, pickup.pickup_longitude_round3.values, 
                           pickup.Avg_Trip_duration.values)),
                     min_opacity = 0.2,
                     radius = 7, blur = 15,
                     max_zoom = 1 
                 )
pickup_map.add_child(hm_wide)

pickup_map
Make this Notebook Trusted to load map: File -> Trust Notebook
  • JFK์—์„œ ์ถœ๋ฐœํ•˜๋Š” ๊ฒฝ์šฐ ํ‰๊ท  ์—ฌํ–‰ ๊ธฐ๊ฐ„์ด ๋” ๊ธด ๊ฒฝํ–ฅ์ด ์žˆ๋‹ค.

  • ์ž์„ธํžˆ ๋“ค์—ฌ๋‹ค๋ณด๋ฉด ๋งจํ•˜ํƒ„ ์ดํ›„์ž„์„ ํ™•์ธํ•  ์ˆ˜ ์žˆ์Œ

3-3. ์‹œ๊ฐ„๋Œ€

a) pickup๊ณผ dropoff์ด ๋งŽ์ด ๋ฐœ์ƒํ•˜๋Š” ์‹œ๊ฐ„๋Œ€

plt.figure(figsize = (8,5))
sns.countplot(x = train['pickup_hour']).set_title("Pickup Hours Distribution")
Text(0.5, 1.0, 'Pickup Hours Distribution')

  • ์ด๋ฅธ ์•„์นจ์‹œ๊ฐ„์—๋Š” ์ ์Œ

  • ์˜คํ›„ 6 ~ 8์‹œ ์‚ฌ์ด๊ฐ€ ํ”ผํฌ์ž„

plt.figure(figsize = (8,5))
sns.countplot(x = train['dropoff_hour']).set_title("Dropoff Hours Distribution")
Text(0.5, 1.0, 'Dropoff Hours Distribution')

  • dropout ์‹œ๊ฐ„๋Œ€๋„ pickup ์‹œ๊ฐ„๋Œ€์™€ ๋น„์Šทํ•จ

b) ์ „์ฒด์ ์ธ pickup ์‹œ๊ฐ„๋Œ€ ๋ถ„ํฌ

plt.figure(figsize = (8,5))
plt.plot(train.groupby('pickup_date').count()[['id']], 
         'o-',label = 'train')

plt.title("Distribution of Pickups over time")
Text(0.5, 1.0, 'Distribution of Pickups over time')

  • 2016๋…„ 1์›” ๋ง pickup ์ˆ˜๊ฐ€ ๊ธ‰๊ฒฉํžˆ ๊ฐ์†Œํ•œ ๊ฒƒ์„ ํ™•์ธํ•  ์ˆ˜ ์žˆ์Œ

c) ์‹œ๊ฐ„๋Œ€ ๋ณ„ trip ๊ธฐ๊ฐ„

avg_duration_hour = train.groupby(['pickup_hour'])['trip_duration'].mean().reset_index().rename(columns = {'trip_duration':'avg_trip_duration'})
plt.figure(figsize = (8,5))
plt.plot(train.groupby(['pickup_hour'])['trip_duration'].mean(), 'o-')
[<matplotlib.lines.Line2D at 0x7f226bddd960>]

  • 10 ~ 15 ์‹œ๊ฐ„๋Œ€ ์‚ฌ์ด์— duration์ด ์ฆ๊ฐ€ํ•จ

d) ์š”์ผ๋ณ„ pickup ๋ถ„ํฌ ์‹œ๊ฐ„

plt.figure(figsize=(8,5))
sns.countplot(data = train['pickup_day_of_week'],
              order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 
                     'Friday', 'Saturday', 'Sunday'])
<Figure size 800x500 with 0 Axes>
  • ์™œ ์—๋Ÿฌโ€ฆ

image.png

e) ์š”์ผ๋ณ„ ํ‰๊ท  trip ๊ธฐ๊ฐ„

avg_duration_day = train.groupby(['pickup_day_of_week'])['trip_duration'].mean().reset_index().rename(columns = {'trip_duration' : 'avg_trip_duration'})
plt.figure(figsize = (8,5))
sns.barplot(x = 'pickup_day_of_week', y = 'avg_trip_duration', 
            data = avg_duration_day, 
            order = ['Monday','Tuesday','Wednesday','Thursday',
                     'Friday','Saturday', 'Sunday']).set_title("Avg Trip Duration vs Pickup Days of Week")
Text(0.5, 1.0, 'Avg Trip Duration vs Pickup Days of Week')

3-4. ๊ฑฐ๋ฆฌ, ์ง€์—ญ, ์†๋„

a) ๊ฑฐ๋ฆฌ ๋ถ„ํฌ ํ™•์ธ

plt.figure(figsize = (8,5))
sns.kdeplot(np.log(train['trip_distance'].values)).set_title("Trip Distance Distribution")
plt.xlabel("Trip Distance(log)") # ๋กœ๊ทธ ๋ณ€ํ™˜๋œ ๊ฑฐ๋ฆฌ
Text(0.5, 0, 'Trip Distance(log)')

b) ์—ฌํ–‰ ์ง€์† ์‹œ๊ฐ„ & ์—ฌํ–‰ ๊ฑฐ๋ฆฌ ๋น„๊ต

plt.scatter(np.log(train['trip_distance'].values), np.log(train['trip_duration'].values),
            color = 'blue', label = 'train')
plt.title("Distribution of Trip Distance vs Trip Duration")
plt.xlabel("Trip Distance(log scale)")
plt.ylabel("Trip Duration(log scale)")
Text(0, 0.5, 'Trip Duration(log scale)')

The number of pickups are very low on Monday.From Tuesday to Friday the number of pickups keep increasing

3-5. ์ œ๊ณตํ•ด ์ฃผ๋Š” ํ•จ์ˆ˜๋กœ ์—ฌํ–‰ ๋ฐฉํ–ฅ ์ธก์ •ํ•˜๊ธฐ

def calculateBearing(lat1,lng1,lat2,lng2):
    R = 6371 
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    
    return np.degrees(np.arctan2(y, x))
train['bearing'] = train.apply(lambda row:calculateBearing(row['pickup_latitude_round3'],
                                                           row['pickup_longitude_round3'],
                                                           row['dropoff_latitude_round3'],
                                                           row['dropoff_longitude_round3']),
                               axis = 1)

a) bearing ๋ถ„ํฌ ํ™•์ธ

sns.kdeplot(train['bearing'])
<Axes: xlabel='bearing', ylabel='Density'>

b) Bearing vs ์—ฌํ–‰ ์ง€์† ๊ธฐ๊ฐ„

plt.figure(figsize = (8,5))
plt.scatter(train['bearing'].values,
            y = np.log(train['trip_duration'].values))
plt.xlabel("Bearing")
plt.ylabel("Trip Duration(log scale)")
Text(0, 0.5, 'Trip Duration(log scale)')

  • ์—ฌํ–‰ ์ง€์† ์‹œ๊ฐ„ ์ค‘ ์ด์ƒ์น˜๋Š” ๋ชจ๋‘ bearing = -50 ์ฃผ๋ณ€์— ๋ถ„ํฌํ•จ

3-6. ์—ฌํ–‰ ๋ ˆ์ฝ”๋“œ

a) Store and FWD Flag ๋ถ„ํฌ ํ™•์ธ

train['store_and_fwd_flag'].value_counts()
N    1450599
Y    8045   
Name: store_and_fwd_flag, dtype: int64
plt.figure(figsize = (8,5))
sns.kdeplot(np.log(train.loc[train['store_and_fwd_flag'] == 'Y','trip_duration'].values),
            label = 'Store and Fwd = Yes')
sns.kdeplot(np.log(train.loc[train['store_and_fwd_flag'] == 'N','trip_duration'].values),
            label = 'Store and Fwd = No')
   
plt.title("Distribution of Store and Fwd Flag vs Trip Duration(log scale)")
plt.xlabel('Trip Duration(log scale)')
plt.ylabel('Density')
Text(0, 0.5, 'Density')

3-7. ์ง€์—ญ ๊ตฐ์ง‘ํ™”

  • ์ง€์—ญ์„ ์ƒ์„ฑํ•˜๋Š”๋ฐ ๋„์›€์ด ๋  ๊ฒƒ์ž„

  • k-means ๊ตฐ์ง‘ํ™” ์ˆ˜ํ–‰

### ์ขŒํ‘œ ์„ค์ •

coords = np.vstack((train[['pickup_latitude', 'pickup_longitude']].values,
                    train[['dropoff_latitude', 'dropoff_longitude']].values,
                    test[['pickup_latitude', 'pickup_longitude']].values,
                    test[['dropoff_latitude', 'dropoff_longitude']].values))
### k-means ๊ตฐ์ง‘ํ™”

kmeans = KMeans(n_clusters = 8, random_state = 0).fit(coords)
train.loc[:, 'pickup_neighbourhood'] = kmeans.predict(train[['pickup_latitude', 
                                                             'pickup_longitude']])
train.loc[:, 'dropoff_neighbourhood'] = kmeans.predict(train[['dropoff_latitude', 
                                                              'dropoff_longitude']])
### ๊ฒฝ๋„, ์œ„๋„ ๋ฒ”์œ„ ์„ค์ •

city_long_border = (-74.03, -73.75)
city_lat_border = (40.63, 40.85)
### ์‹œ๊ฐํ™”

fig, ax = plt.subplots(ncols = 1, sharex = True, sharey = True)
ax.scatter(train['pickup_longitude'], train['pickup_latitude'],
           c = train['pickup_neighbourhood'], label = 'train', alpha = 0.1)

fig.suptitle('Pickup Neighbourhood')

ax.set_ylabel('latitude')
ax.set_xlabel('longitude')

plt.ylim(city_lat_border)
plt.xlim(city_long_border)
(-74.03, -73.75)

a) ๊ฐ ์ง€์—ญ์—์„œ์˜ pickup ์ˆ˜

plt.figure(figsize = (8,5))

# countplot์œผ๋กœ ์‹œ๊ฐํ™”ํ•˜๋Š” ๊ฒฝ์šฐ ์ œ๋Œ€๋กœ ์‹œ๊ฐํ™”๊ฐ€ ๋˜์ง€ x
# histplot์œผ๋กœ ๋Œ€์ฒด
sns.histplot(train['pickup_neighbourhood']).set_title("Distribution of Number of Pickups across Neighbourhoods")
Text(0.5, 1.0, 'Distribution of Number of Pickups across Neighbourhoods')

  • ์ง€์—ญ 0, 3, 6 ์ˆœ์œผ๋กœ pickup ์ˆ˜์น˜๊ฐ€ ๋†’์Œ
avg_duration_neighbourhood = train.groupby(['pickup_neighbourhood'])['trip_duration'].mean().reset_index().rename(columns = {'trip_duration':'avg_trip_duration'})
plt.figure(figsize = (8,5))
sns.barplot(x = 'pickup_neighbourhood',y = 'avg_trip_duration',
            data = avg_duration_neighbourhood).set_title("Avg Trip Duration vs Neighbourhood")
Text(0.5, 1.0, 'Avg Trip Duration vs Neighbourhood')

  • 2, 3 ์ง€์—ญ ์ˆœ์œผ๋กœ ํ‰๊ท  ์—ฌํ–‰ ์ง€์†์‹œ๊ฐ„์ด ๊ธบ

  • 1, 6, 7์€ ์œ„์˜ pickup neighbourhood ์ˆซ์ž๊ฐ€ 0์— ๊ฐ€๊นŒ์šธ์ง€๋ผ๋„ ํ‰๊ท  ์ด์šฉ ๊ธฐ๊ฐ„์€ ๋†’์€ ์ถ•์— ์†ํ•จ

3-8. ์†๋„

a) ํ‰๊ท  ์†๋„ ๋ถ„ํฌ

train['avg_speed_kph'] = train['trip_distance'] / train['trip_duration_in_hour']

plt.figure(figsize = (8,5))

sns.kdeplot(train['avg_speed_kph'].values).set_title("Distribution of Average Speed (in kph)")
Text(0.5, 1.0, 'Distribution of Average Speed (in kph)')

print("Average speed is",np.mean(train['avg_speed_kph']),"kph") 

# ํ‰๊ท  ์†๋ ฅ์€ 14.4277kph ์ •๋„์ž„
Average speed is 14.427736738459107 kph

b) ์ผ์ฃผ์ผ์˜ ์š”์ผ๋ณ„ ํ‰๊ท  ์†๋„

  • ๊ตํ†ต ์†๋„๋ฅผ ์˜๋ฏธ
avg_speed_per_day = train.groupby(['pickup_day_of_week'])['avg_speed_kph'].mean().reset_index()

plt.figure(figsize = (8,5))
sns.barplot(x = 'pickup_day_of_week', y = 'avg_speed_kph',
            data = avg_speed_per_day, 
            order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday', 'Sunday']).set_title("Avg Speed (kph) vs Pickup Days of Week")
Text(0.5, 1.0, 'Avg Speed (kph) vs Pickup Days of Week')

  • ํ‰๊ท  ์†๋„์˜ ๊ฒฝ์šฐ ์ผ์š”์ผ๊ณผ ์›”์š”์ผ์ด ๋” ๋น ๋ฅธ ๊ฒฝํ–ฅ์„ ๋ณด์ž„

4. ๋ชจ๋ธ๋ง(Modeling)

4-1. Test data_ํŠน์„ฑ ๊ณตํ•™(Feature Engineering)

  • ๋ชจ๋ธ ์ ์šฉ์„ ์œ„ํ•ด test data์— ๋Œ€ํ•ด featrue engineering์„ ์ง„ํ–‰
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'],format = '%Y-%m-%d %H:%M:%S')
# test['dropoff_datetime'] = pd.to_datetime(test['dropoff_datetime'], format = '%Y-%m-%d %H:%M:%S')
test['pickup_date'] = test['pickup_datetime'].dt.date
test['pickup_day'] = test['pickup_datetime'].apply(lambda x:x.day)
test['pickup_hour'] = test['pickup_datetime'].apply(lambda x:x.hour)
test['pickup_day_of_week'] = test['pickup_datetime'].apply(lambda x:calendar.day_name[x.weekday()])
# test['dropoff_date'] = test['dropoff_datetime'].dt.date
# test['dropoff_day'] = test['dropoff_datetime'].apply(lambda x:x.day)
# test['dropoff_hour'] = test['dropoff_datetime'].apply(lambda x:x.hour)
# test['dropoff_day_of_week'] = test['dropoff_datetime'].apply(lambda x:calendar.day_name[x.weekday()])
test['pickup_latitude_round3'] = test['pickup_latitude'].apply(lambda x:round(x,3))
test['pickup_longitude_round3'] = test['pickup_longitude'].apply(lambda x:round(x,3))
test['dropoff_latitude_round3'] = test['dropoff_latitude'].apply(lambda x:round(x,3))
test['dropoff_longitude_round3'] = test['dropoff_longitude'].apply(lambda x:round(x,3))
test['trip_distance'] = test.apply(lambda row:calculateDistance(row), axis = 1)
# test['trip_duration_in_hour'] = test['trip_duration'].apply(lambda x:x/3600)
test['bearing'] = test.apply(lambda row:calculateBearing(row['pickup_latitude_round3'],
                                                         row['pickup_longitude_round3'],
                                                         row['dropoff_latitude_round3'],
                                                         row['dropoff_longitude_round3']),
                             axis = 1)
test.loc[:, 'pickup_neighbourhood'] = kmeans.predict(test[['pickup_latitude', 'pickup_longitude']])
test.loc[:, 'dropoff_neighbourhood'] = kmeans.predict(test[['dropoff_latitude', 'dropoff_longitude']])

4-2. ๋ชจ๋ธ ๊ตฌ์ถ•ํ•˜๊ธฐ

  • train ๋ฐ์ดํ„ฐ์—์„œ โ€œdropoff datetimeโ€ feature๋“ค์„ ์‚ญ์ œํ•ด์•ผ ํ•จ

  • lat lng์„ ์†Œ์ˆ˜์  3์ž๋ฆฌ๊นŒ์ง€ ๋ฐ˜์˜ฌ๋ฆผํ•˜์—ฌ ์ฒ˜๋ฆฌ

drop_cols = ['avg_speed_kph','trip_duration_in_hour',
             'dropoff_date','dropoff_day','dropoff_hour','dropoff_day_of_week','dropoff_datetime',
             'pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude']

training = train.drop(drop_cols, axis = 1)
testing = test.drop(['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude'],axis = 1)
  • ์šฐ๋ฆฌ๋Š” trip_duration์„ ์˜ˆ์ธกํ•ด์•ผ ํ•จ

    • log scale๋กœ ๋ณ€ํ™˜ํ•˜์—ฌ ์˜ˆ์ธกํ•˜์ž.
### ๋กœ๊ทธ ๋ณ€ํ™˜

training['log_trip_duration'] = training['trip_duration'].apply(lambda x:np.log(x))
training.drop(['trip_duration'], axis = 1, inplace = True)
print("Training Data Shape ", training.shape)
print("Testing Data Shape ", testing.shape)
Training Data Shape  (1458644, 18)
Testing Data Shape  (625134, 17)
  • ์š”์ผ์„ ์ˆซ์ž๋กœ encoding
def encodeDays(day_of_week):
    day_dict = {'Sunday':0, 'Monday':1, 'Tuesday':2, 'Wednesday':3,
                'Thursday':4, 'Friday':5, 'Saturday':6}
                
    return day_dict[day_of_week]
training['pickup_day_of_week'] = training['pickup_day_of_week'].apply(lambda x:encodeDays(x))
testing['pickup_day_of_week'] = testing['pickup_day_of_week'].apply(lambda x:encodeDays(x))
### ์ตœ์ข… ๋ฐ์ดํ„ฐ ์ €์žฅ
# ๊ฐ€๊ณต๋œ ๋ฐ์ดํ„ฐ๋ฅผ ์ตœ์ข… ํŒŒ์ผ๋กœ ์ €์žฅ

training.to_csv("/content/drive/MyDrive/Colab Notebooks/ECC 48แ„€แ…ต แ„ƒแ…ฆแ„€แ…ชB/9แ„Œแ…ฎแ„Žแ…ก/data/input_training.csv",index = False)
testing.to_csv("/content/drive/MyDrive/Colab Notebooks/ECC 48แ„€แ…ต แ„ƒแ…ฆแ„€แ…ชB/9แ„Œแ…ฎแ„Žแ…ก/data/input_testing.csv",index = False)

del training
del testing
del train
del test
### ํ•„์š”ํ•œ ํ•จ์ˆ˜ ์ •์˜

# 1) ๋ผ๋ฒจ ์ธ์ฝ”๋”ฉ
def LabelEncoding(train_df,test_df,max_levels = 2):
  for col in train_df:
    if train_df[col].dtype == 'object':
      if len(list(train_df[col].unique())) <= max_levels:
        le = preprocessing.LabelEncoder()
        le.fit(train_df[col])
        train_df[col] = le.transform(train_df[col])
        test_df[col] = le.transform(test_df[col])
      
  return [train_df,test_df]
                

def readInputAndEncode(input_path,train_file,test_file,target_column):
    training = pd.read_csv(input_path + train_file)
    testing = pd.read_csv(input_path + test_file)
   
    training,testing = LabelEncoding(training,testing)
    
    # print("Training Data Shape after Encoding ",training.shape)
    # print("Testing Data Shape after Encoding ",testing.shape)

    ### ๋ชจ๋“  train column์ด test ๋ฐ์ดํ„ฐ์— ์žˆ๋Š”์ง€ ํ™•์ธ
    # ๊ทธ๋ ‡์ง€ ์•Š๋‹ค๋ฉด test data์— column์„ ์ถ”๊ฐ€ํ•˜๊ณ  0์œผ๋กœ ๋Œ€์ฒด

    train_cols = training.columns.tolist()
    test_cols = testing.columns.tolist()
    
    col_in_train_not_test = set(train_cols) - set(test_cols)
    for col in col_in_train_not_test:
      if col != target_column:
        testing[col] = 0
    
    col_in_test_not_train = set(test_cols) - set(train_cols)
    for col in col_in_test_not_train:
      training[col] = 0
    
    print("Training Data Shape after Processing ",training.shape)
    print("Testing Data Shape after Processing ",testing.shape)
    
    return [training,testing]
train,test = readInputAndEncode("/content/drive/MyDrive/Colab Notebooks/ECC 48แ„€แ…ต แ„ƒแ…ฆแ„€แ…ชB/9แ„Œแ…ฎแ„Žแ…ก/data/",
                                'input_training.csv','input_testing.csv','log_trip_duration')
train.drop(['pickup_date'], axis = 1, inplace = True)
test.drop(['pickup_date'], axis = 1, inplace = True)

train.drop(['pickup_datetime'], axis = 1, inplace = True)
test.drop(['pickup_datetime'], axis = 1, inplace = True)

test_id = test['id']
train.drop(['id'], axis = 1,inplace = True)
test.drop(['id'], axis = 1, inplace = True)
Training Data Shape after Processing  (1458644, 18)
Testing Data Shape after Processing  (625134, 17)

4-3. ๋ชจ๋ธ ์ ์šฉํ•˜๊ธฐ

def GetFeaturesAndSplit(train, test, target,
                        imputing_strategy = 'median', split = 0.25, imputation = True):
    labels = np.array(train[target])
    training = train.drop(target, axis = 1)
    training = np.array(training)
    testing = np.array(test)
    
    if imputation == True:
        imputer = SimpleImputer(strategy = imputing_strategy, missing_values = np.nan)
        imputer.fit(training)
        
        training = imputer.transform(training)
        testing = imputer.transform(testing)
    
    train_features, validation_features, train_labels, validation_labels = train_test_split(training, labels, 
                                                                                            test_size = split, 
                                                                                            random_state = 42)
    
    return [train_features,validation_features,train_labels,validation_labels,testing]
train_features, validation_features, train_labels, validation_labels, testing = GetFeaturesAndSplit(train, test, 
                                                                                                    'log_trip_duration', imputation = False)

a) ์„ ํ˜• ํšŒ๊ท€(Linear Regression)

### ํ•™์Šต

lm = linear_model.LinearRegression()
lm.fit(train_features, train_labels)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
### ์˜ˆ์ธก

valid_pred = lm.predict(validation_features)
### ํ‰๊ฐ€

rmse = mean_squared_error(validation_labels, valid_pred)
print("Root Mean Squared Error for Linear Regression(log scale): ",rmse)
Root Mean Squared Error for Linear Regression(log scale):  0.4031176249688163
### ์ œ์ถœ์šฉ ํŒŒ์ผ ์ƒ์„ฑ

test_pred = lm.predict(testing)
submit = pd.DataFrame()
submit['id'] = test_id
submit['trip_duration'] = np.exp(test_pred)
submit.to_csv("/content/drive/MyDrive/Colab Notebooks/ECC 48แ„€แ…ต แ„ƒแ…ฆแ„€แ…ชB/9แ„Œแ…ฎแ„Žแ…ก/data/submission_linear_regression_baseline.csv",index=False) #0.64221 on Leader board

del submit

b) Random Forest Regressor

rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
### ํ•™์Šต

rf.fit(train_features, train_labels)
RandomForestRegressor(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
### ์˜ˆ์ธก

valid_pred_rf = rf.predict(validation_features)
rmse = mean_squared_error(validation_labels, valid_pred_rf)
print("Root Mean Squared Error for Random Forest", rmse)
Root Mean Squared Error for Random Forest 0.16585976592912732
test_pred = rf.predict(testing)
submit = pd.DataFrame()
submit['id'] = test_id
submit['trip_duration'] = np.exp(test_pred)
submit.to_csv("submission_random_forest_baseline.csv",index = False)

ํƒœ๊ทธ: , ,

์นดํ…Œ๊ณ ๋ฆฌ:

์—…๋ฐ์ดํŠธ: