Kaggle Bike Sharing Demand Analysis

I analyzed Kaggle's Bike Sharing Demand data

using PCA (Principal Component Analysis).

import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Mount Google Drive first so the CSV paths below are available
from google.colab import drive
drive.mount('/content/drive')

# header=0 uses the first row as column names; parse_dates parses the datetime column
train=pd.read_csv("/content/drive/My Drive/Colab Notebooks/bike/train.csv",header=0,parse_dates=["datetime"])
test=pd.read_csv("/content/drive/My Drive/Colab Notebooks/bike/test.csv",header=0,parse_dates=["datetime"])
Mounted at /content/drive
train.head()
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0.0 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0.0 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0.0 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0.0 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0.0 0 1 1
test.head()
datetime season holiday workingday weather temp atemp humidity windspeed
0 2011-01-20 00:00:00 1 0 1 1 10.66 11.365 56 26.0027
1 2011-01-20 01:00:00 1 0 1 1 10.66 13.635 56 0.0000
2 2011-01-20 02:00:00 1 0 1 1 10.66 13.635 56 0.0000
3 2011-01-20 03:00:00 1 0 1 1 10.66 12.880 56 11.0014
4 2011-01-20 04:00:00 1 0 1 1 10.66 12.880 56 11.0014
train.shape
(10886, 12)
test.shape
(6493, 9)
train.describe()
season holiday workingday weather temp atemp humidity windspeed casual registered count
count 10886.000000 10886.000000 10886.000000 10886.000000 10886.00000 10886.000000 10886.000000 10886.000000 10886.000000 10886.000000 10886.000000
mean 2.506614 0.028569 0.680875 1.418427 20.23086 23.655084 61.886460 12.799395 36.021955 155.552177 191.574132
std 1.116174 0.166599 0.466159 0.633839 7.79159 8.474601 19.245033 8.164537 49.960477 151.039033 181.144454
min 1.000000 0.000000 0.000000 1.000000 0.82000 0.760000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 2.000000 0.000000 0.000000 1.000000 13.94000 16.665000 47.000000 7.001500 4.000000 36.000000 42.000000
50% 3.000000 0.000000 1.000000 1.000000 20.50000 24.240000 62.000000 12.998000 17.000000 118.000000 145.000000
75% 4.000000 0.000000 1.000000 2.000000 26.24000 31.060000 77.000000 16.997900 49.000000 222.000000 284.000000
max 4.000000 1.000000 1.000000 4.000000 41.00000 45.455000 100.000000 56.996900 367.000000 886.000000 977.000000
test.describe()
season holiday workingday weather temp atemp humidity windspeed
count 6493.000000 6493.000000 6493.000000 6493.000000 6493.000000 6493.000000 6493.000000 6493.000000
mean 2.493300 0.029108 0.685815 1.436778 20.620607 24.012865 64.125212 12.631157
std 1.091258 0.168123 0.464226 0.648390 8.059583 8.782741 19.293391 8.250151
min 1.000000 0.000000 0.000000 1.000000 0.820000 0.000000 16.000000 0.000000
25% 2.000000 0.000000 0.000000 1.000000 13.940000 16.665000 49.000000 7.001500
50% 3.000000 0.000000 1.000000 1.000000 21.320000 25.000000 65.000000 11.001400
75% 3.000000 0.000000 1.000000 2.000000 27.060000 31.060000 81.000000 16.997900
max 4.000000 1.000000 1.000000 4.000000 40.180000 50.000000 100.000000 55.998600
train.isnull().sum()
datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64
test.isnull().sum()
datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
dtype: int64
weather_data = ['season', 'temp', 'weather' , 'atemp', 'humidity', 'windspeed']
train[weather_data].corr()
|           | season    | temp      | weather   | atemp     | humidity  | windspeed |
|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
| season    | 1.000000  | 0.258689  | 0.008879  | 0.264744  | 0.190610  | -0.147121 |
| temp      | 0.258689  | 1.000000  | -0.055035 | 0.984948  | -0.064949 | -0.017852 |
| weather   | 0.008879  | -0.055035 | 1.000000  | -0.055376 | 0.406244  | 0.007261  |
| atemp     | 0.264744  | 0.984948  | -0.055376 | 1.000000  | -0.043536 | -0.057473 |
| humidity  | 0.190610  | -0.064949 | 0.406244  | -0.043536 | 1.000000  | -0.318607 |
| windspeed | -0.147121 | -0.017852 | 0.007261  | -0.057473 | -0.318607 | 1.000000  |
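
The same correlation matrix is easier to scan as a heatmap (the strong temp-atemp correlation stands out immediately). A minimal sketch using the seaborn/matplotlib imports above; the figure size and color map are arbitrary choices:

plt.figure(figsize=(8, 6))
sns.heatmap(train[weather_data].corr(), annot=True, fmt=".2f", cmap="coolwarm", center=0)
plt.show()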

EDA / Preprocessing

Datetime
train["datetime-year"] = train["datetime"].dt.year
train["datetime-month"] = train["datetime"].dt.month
train["datetime-day"] = train["datetime"].dt.day
train["datetime-hour"] = train["datetime"].dt.hour
train["datetime-minute"] = train["datetime"].dt.minute
train["datetime-second"] = train["datetime"].dt.second

train["datetime-dayofweek"] = train["datetime"].dt.dayofweek
train.loc[train["datetime-dayofweek"] == 0, "datetime-dayofweek(humanized)"] = "Monday"
train.loc[train["datetime-dayofweek"] == 1, "datetime-dayofweek(humanized)"] = "Tuesday"
train.loc[train["datetime-dayofweek"] == 2, "datetime-dayofweek(humanized)"] = "Wednesday"
train.loc[train["datetime-dayofweek"] == 3, "datetime-dayofweek(humanized)"] = "Thursday"
train.loc[train["datetime-dayofweek"] == 4, "datetime-dayofweek(humanized)"] = "Friday"
train.loc[train["datetime-dayofweek"] == 5, "datetime-dayofweek(humanized)"] = "Saturday"
train.loc[train["datetime-dayofweek"] == 6, "datetime-dayofweek(humanized)"] = "Sunday"
test["datetime-year"] = test["datetime"].dt.year
test["datetime-month"] = test["datetime"].dt.month
test["datetime-day"] = test["datetime"].dt.day
test["datetime-hour"] = test["datetime"].dt.hour
test["datetime-minute"] = test["datetime"].dt.minute
test["datetime-second"] = test["datetime"].dt.second

test["datetime-dayofweek"] = test["datetime"].dt.dayofweek
test.loc[test["datetime-dayofweek"] == 0, "datetime-dayofweek(humanized)"] = "Monday"
test.loc[test["datetime-dayofweek"] == 1, "datetime-dayofweek(humanized)"] = "Tuesday"
test.loc[test["datetime-dayofweek"] == 2, "datetime-dayofweek(humanized)"] = "Wednesday"
test.loc[test["datetime-dayofweek"] == 3, "datetime-dayofweek(humanized)"] = "Thursday"
test.loc[test["datetime-dayofweek"] == 4, "datetime-dayofweek(humanized)"] = "Friday"
test.loc[test["datetime-dayofweek"] == 5, "datetime-dayofweek(humanized)"] = "Saturday"
test.loc[test["datetime-dayofweek"] == 6, "datetime-dayofweek(humanized)"] = "Sunday"

figure, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(nrows=2, ncols=3)

figure.set_size_inches(18, 8)

sns.barplot(data=train, x="datetime-year", y="count", ax=ax1)
sns.barplot(data=train, x="datetime-month", y="count", ax=ax2)
sns.barplot(data=train, x="datetime-day", y="count", ax=ax3)
sns.barplot(data=train, x="datetime-hour", y="count", ax=ax4)
sns.barplot(data=train, x="datetime-minute", y="count", ax=ax5)
sns.barplot(data=train, x="datetime-second", y="count", ax=ax6)
<matplotlib.axes._subplots.AxesSubplot at 0x7f7b3f03e080>

[Figure: bar plots of count by datetime-year, -month, -day, -hour, -minute, -second]

datetime-year

  • Bike rentals in 2012 are higher than in 2011. This can be read as Capital Bikeshare, the company behind the Bike Sharing Demand competition, growing steadily.

datetime-month

  • People rent the most bikes in summer (June-August) and the fewest in winter (December-February).
  • Even within winter, December's rentals are nearly twice January's (this likely reflects the year-over-year growth rather than the month itself).

datetime-day

  • Look closely at the x-axis: only days 1 through 19 appear; day 20 onward is in the test data. In other words, datetime-day is the column that splits the train and test data.
  • Putting datetime-day into the features would therefore make the machine learning algorithm overfit. As a rule, a column that defines the train/test split should not be used as a feature.

datetime-hour

  • People rarely rent bikes in the early-morning hours and rent relatively more in the afternoon.
  • Rentals spike in two windows: the morning commute (7-9) and the evening commute (16-19).

datetime-minute & datetime-second

  • Both columns are all zeros on the x-axis; datetime-minute and datetime-second are simply not recorded. They carry no useful signal, so they are not used as features.

So the takeaway from this visualization: of the six columns, only datetime-year, datetime-month, and datetime-hour are used.

train["datetime-year(str)"] = train["datetime-year"].astype('str')
train["datetime-month(str)"] = train["datetime-month"].astype('str')

train["datetime-year_month"] = train["datetime-year(str)"] + "-" + train["datetime-month(str)"]
figure, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

figure.set_size_inches(18, 4)

sns.barplot(data=train, x="datetime-year", y="count", ax=ax1)
sns.barplot(data=train, x="datetime-month", y="count", ax=ax2)

figure, ax3 = plt.subplots(nrows=1, ncols=1)

figure.set_size_inches(18, 4)

sns.barplot(data=train, x="datetime-year_month", y="count", ax=ax3)
<matplotlib.axes._subplots.AxesSubplot at 0x7f7b3d47bef0>

[Figure: bar plots of count by datetime-year and by datetime-month]

[Figure: bar plot of count by datetime-year_month]

  • Bike rentals are trending steadily upward.
  • Looking at the upper-right plot, December's rentals are nearly twice January's.
  • But in the plot below, rentals in December 2011 and January 2012 are roughly the same.
  • By contrast, rentals in January 2011 and December 2012 differ greatly.

In other words, December only looks twice as busy as January because 1) Capital Bikeshare's rentals are growing steadily over time, and 2) December simply falls later in the calendar than January, so it accumulates more of that growth. From the customer's point of view there is no inherent reason to rent more bikes just because it is December.

This, too, can lead a machine learning algorithm to overfit. There are several ways to deal with it:

  • One-hot encode datetime-year_month as a whole and use it as a feature (a sketch follows this list).
  • Adjust count for the steady growth trend.
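
A sketch of the first option, purely for illustration (this notebook does not use it; the "ym" prefix is just a hypothetical naming choice):

# Hypothetical: one-hot encode datetime-year_month, one 0/1 column per year-month
year_month_dummies = pd.get_dummies(train["datetime-year_month"], prefix="ym")
train_onehot = pd.concat([train, year_month_dummies], axis=1)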

The easiest and fastest way to improve the model's accuracy, though, is simply not to use datetime-month as a feature. So of the six year/month/day/hour/minute/second columns, only two are used: **datetime-year** and **datetime-hour**.

figure, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1)

figure.set_size_inches(18, 12)

sns.pointplot(data=train, x="datetime-hour", y="count", ax=ax1)


sns.pointplot(data=train, x="datetime-hour", y="count", hue="workingday", ax=ax2)


sns.pointplot(data=train, x="datetime-hour", y="count", hue="datetime-dayofweek(humanized)", ax=ax3)
<matplotlib.axes._subplots.AxesSubplot at 0x7f7b3d1c92b0>

[Figure: point plots of count by datetime-hour, overall, split by workingday, and split by day of week]

  • People mostly rent bikes during the morning commute (7-9) and the evening commute (16-19).
  • But this holds only on working days (workingday == 1). On non-working days (workingday == 0), people do not rent during commute hours and instead rent the most in the afternoon (10-16).

Rentals by day of week (datetime-dayofweek)

  • Friday stands out first: compared with the other weekdays (Mon-Thu), people rent relatively fewer bikes during the evening commute (17-19). Presumably they either cannot ride for some reason (e.g. drinking) or use other transport instead (e.g. bus, taxi).
  • On the other hand, Friday, despite being a weekday, shows relatively high afternoon (10-16) rentals; the next-highest weekday is Monday. In other words, Friday and Monday behave somewhat like weekend days even though they are weekdays.
  • As for the weekend, Sunday has noticeably lower rentals than Saturday, presumably because people cut back on outdoor activity with Monday's fatigue in mind.

What this analysis shows is that using the day of week (datetime-dayofweek) as a feature should perform better than using workingday alone, so the datetime-dayofweek column is added as a feature.

Starting PCA

list(train)
['datetime',
 'season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'casual',
 'registered',
 'count',
 'datetime-year',
 'datetime-month',
 'datetime-day',
 'datetime-hour',
 'datetime-minute',
 'datetime-second',
 'datetime-dayofweek',
 'datetime-dayofweek(humanized)',
 'datetime-year(str)',
 'datetime-month(str)',
 'datetime-year_month']

Dimensionality reduction maps the original axes (dimensions) onto new axes,

where each new axis is one that represents the whole data well, i.e. the direction of largest variance.

Below, the dimensions judged to have high multicollinearity (not independent, influencing one another) are grouped together and reduced.
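
For intuition, PCA amounts to an eigendecomposition of the covariance matrix of the standardized data: the eigenvectors are the new axes and the eigenvalues their variances. A minimal NumPy sketch of that idea (illustrative only; the actual analysis below uses scikit-learn's PCA):

# Illustrative sketch: PCA as eigendecomposition of the correlation matrix
X = train[['season', 'temp', 'atemp', 'humidity', 'windspeed']].values
X_std = (X - X.mean(axis=0)) / X.std(axis=0)        # standardize each column
cov = np.cov(X_std, rowvar=False)                   # covariance of standardized data = correlation matrix
eigvals, eigvecs = np.linalg.eigh(cov)              # eigh handles symmetric matrices, eigenvalues ascending
order = np.argsort(eigvals)[::-1]                   # reorder components by variance, largest first
components = eigvecs[:, order]                      # columns are the principal axes
explained_ratio = eigvals[order] / eigvals.sum()    # share of total variance per axis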

weather1 = ['season', 'temp', 'atemp', 'humidity', 'windspeed']
only_weather1 = train[weather1]
only_weather_test = test[weather1]
only_weather_test.head()
season temp atemp humidity windspeed
0 1 10.66 11.365 56 26.0027
1 1 10.66 13.635 56 0.0000
2 1 10.66 13.635 56 0.0000
3 1 10.66 12.880 56 11.0014
4 1 10.66 12.880 56 11.0014
only_weather1.head()
season temp atemp humidity windspeed
0 1 9.84 14.395 81 0.0
1 1 9.02 13.635 80 0.0
2 1 9.02 13.635 80 0.0
3 1 9.84 14.395 75 0.0
4 1 9.84 14.395 75 0.0
only_weather1.corr()
|           | season    | temp      | atemp     | humidity  | windspeed |
|-----------|-----------|-----------|-----------|-----------|-----------|
| season    | 1.000000  | 0.258689  | 0.264744  | 0.190610  | -0.147121 |
| temp      | 0.258689  | 1.000000  | 0.984948  | -0.064949 | -0.017852 |
| atemp     | 0.264744  | 0.984948  | 1.000000  | -0.043536 | -0.057473 |
| humidity  | 0.190610  | -0.064949 | -0.043536 | 1.000000  | -0.318607 |
| windspeed | -0.147121 | -0.017852 | -0.057473 | -0.318607 | 1.000000  |
x, v, d = np.linalg.svd(only_weather1)  # singular values are used below to check multicollinearity
print(v)

CI = max(v)/min(v)  # condition index = largest singular value / smallest singular value

print("CI = ", CI)  # a CI above roughly 30 indicates multicollinearity, so PCA is applied
[7567.22295894 1472.48842857  922.0582764   112.34592103  103.2939727 ]
CI =  73.25909499631051
# StandardScaler() rescales each column to mean 0 and standard deviation 1.
# Note: for data split from the same source, the scaler fit on train should be reused on test;
# here a separate scaler (and PCA) is fit on the test set, so train/test components need not match exactly.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaler.fit(only_weather1)
df_scaled = scaler.transform(only_weather1)
# Standardization of the test set (fit separately; see the note above)
scaler_test = StandardScaler()
scaler_test.fit(only_weather_test)
df_scaled_test = scaler_test.transform(only_weather_test)
# PCA fit: finds the principal components (the directions that capture the most variance)
pca = PCA()
pca.fit(df_scaled)
x_pca = pca.transform(df_scaled)
loading = pd.DataFrame(pca.components_, columns=only_weather1.columns)
# PCA fit on the test set
pca_test = PCA()
pca_test.fit(df_scaled_test)
x_pca_test = pca_test.transform(df_scaled_test)
loading_test = pd.DataFrame(pca_test.components_, columns=only_weather_test.columns)
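
As the comment above notes, the more standard pattern would reuse the train-fitted scaler and PCA on the test set; a minimal sketch of that alternative (not what the rest of this notebook does):

# Alternative, not used below: transform test with the objects fit on train
df_scaled_test_alt = scaler.transform(only_weather_test)   # reuse the train-fitted scaler
x_pca_test_alt = pca.transform(df_scaled_test_alt)         # reuse the train-fitted PCA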
pca.explained_variance_ratio_
array([0.42347553, 0.2830441 , 0.15980429, 0.13083354, 0.00284254])
pca_test.explained_variance_ratio_
array([0.45379644, 0.25790681, 0.14917466, 0.13766332, 0.00145877])
plt.plot(range(1,6), pca.explained_variance_ratio_, alpha=0.5, linewidth=3, color='red')
plt.bar(range(1,6), pca.explained_variance_ratio_, alpha=0.5)

plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')

plt.tight_layout()
plt.show()

[Figure: explained variance ratio by principal component (scree plot)]
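
To pick the number of components, the cumulative explained variance is also worth checking; per pca.explained_variance_ratio_ above, the first three components keep roughly 87% of the variance. A small sketch (the 0.85 threshold is an arbitrary choice):

cumulative = np.cumsum(pca.explained_variance_ratio_)
print(cumulative)                                    # first three entries sum to about 0.87
n_keep = np.argmax(cumulative >= 0.85) + 1           # smallest number of components reaching the threshold
print(n_keep)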

# Keep only the first three principal components (they explain most of the variance)
loading1 = loading[0:3]
loading1
|   | season    | temp      | atemp     | humidity  | windspeed |
|---|-----------|-----------|-----------|-----------|-----------|
| 0 | -0.326754 | -0.663497 | -0.666371 | -0.017713 | 0.092949  |
| 1 | -0.353285 | 0.153685  | 0.124359  | -0.673674 | 0.618279  |
| 2 | 0.808219  | -0.150243 | -0.171206 | 0.008608  | 0.542977  |
loading_test1 = loading_test[0:3]
loading_test1
|   | season    | temp      | atemp     | humidity  | windspeed |
|---|-----------|-----------|-----------|-----------|-----------|
| 0 | -0.418213 | -0.636772 | -0.639951 | 0.029681  | 0.095921  |
| 1 | -0.239429 | 0.127004  | 0.098151  | -0.687312 | 0.666717  |
| 2 | 0.115195  | 0.044911  | 0.020255  | 0.675724  | 0.726429  |
only_weather1.shape
(10886, 5)
a = loading1.T  # .T transposes (swaps rows and columns) so the matrix product below works; shape becomes 5x3
b = only_weather1
a
0 1 2
season -0.326754 -0.353285 0.808219
temp -0.663497 0.153685 -0.150243
atemp -0.666371 0.124359 -0.171206
humidity -0.017713 -0.673674 0.008608
windspeed 0.092949 0.618279 0.542977
b
season temp atemp humidity windspeed
0 1 9.84 14.395 81 0.0000
1 1 9.02 13.635 80 0.0000
2 1 9.02 13.635 80 0.0000
3 1 9.84 14.395 75 0.0000
4 1 9.84 14.395 75 0.0000
... ... ... ... ... ...
10881 4 15.58 19.695 50 26.0027
10882 4 14.76 17.425 57 15.0013
10883 4 13.94 15.910 61 15.0013
10884 4 13.94 17.425 61 6.0032
10885 4 13.12 16.665 66 8.9981

10886 rows × 5 columns

c = np.dot(b,a)  # matrix product: rotates the original weather matrix onto the new (principal) axes
print(c.shape)
c
(10886, 3)





array([[-17.88276344, -51.6184782 ,  -2.43747214],
       [-16.81454064, -51.16533881,  -2.19276395],
       [-16.81454064, -51.16533881,  -2.19276395],
       ...,
       [-20.84428983, -29.11134886,   7.08501887],
       [-22.69020635, -34.48628044,   1.9398822 ],
       [-21.44989127, -36.22350229,   3.86239674]])
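One caveat on the projection above: the loadings were learned from the standardized data (df_scaled), while b holds the raw values, so this rotation mixes scales. The projection consistent with the PCA fit is already available from the transform computed earlier; a minimal sketch of that alternative (not what the rest of this notebook uses):

# Consistent alternative: project the standardized data, or just reuse pca.transform
c_scaled = np.dot(df_scaled, a)      # same rotation applied to the standardized values
c_from_pca = x_pca[:, :3]            # matches c_scaled up to floating-point error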
a_test = loading_test1.T
b_test = only_weather_test
c_test = np.dot(b_test,a_test)
print(c_test.shape)
c_test
(6493, 3)





array([[-10.32288503, -18.92312279,  57.55382825],
       [-14.26978336, -36.03676681,  38.71068597],
       [-14.26978336, -36.03676681,  38.71068597],
       ...,
       [-12.61262779, -31.52529752,  49.39002927],
       [-13.40667494, -30.03757883,  45.2471688 ],
       [-13.13954298, -36.22338933,  51.32868787]])
pca_dataset = pd.DataFrame({'PC1': c[:, 0], 'PC2': c[:, 1], 'PC3': c[:, 2]})
pca_dataset
PC1 PC2 PC3
0 -17.882763 -51.618478 -2.437472
1 -16.814541 -51.165339 -2.192764
2 -16.814541 -51.165339 -2.192764
3 -17.776483 -47.576433 -2.489117
4 -17.776483 -47.576433 -2.489117
... ... ... ...
10881 -23.237219 -14.176257 12.069428
10882 -22.327055 -26.102227 6.668012
10883 -20.844290 -29.111349 7.085019
10884 -22.690206 -34.486280 1.939882
10885 -21.449891 -36.223502 3.862397

10886 rows × 3 columns

pca_dataset_test = pd.DataFrame({'PC1': c_test[:, 0], 'PC2': c_test[:, 1], 'PC3': c_test[:, 2]})
pca_dataset_test
PC1 PC2 PC3
0 -10.322885 -18.923123 57.553828
1 -14.269783 -36.036767 38.710686
2 -14.269783 -36.036767 38.710686
3 -12.731353 -28.776048 46.687132
4 -12.731353 -28.776048 46.687132
... ... ... ...
6488 -12.612628 -31.525298 49.390029
6489 -12.612628 -31.525298 49.390029
6490 -12.612628 -31.525298 49.390029
6491 -13.406675 -30.037579 45.247169
6492 -13.139543 -36.223389 51.328688

6493 rows × 3 columns

x, v, d = np.linalg.svd(pca_dataset)
print(v)

CI = max(v)/min(v)

print("CI = ", CI) #다중분산성이 30보다 낮으므로 3개의 축은 독립된다는것을 확인
[4525.71707689 1468.52443054  566.65495536]
CI =  7.986724609245162
train['PC1'] = pca_dataset['PC1']
train['PC2'] = pca_dataset['PC2']
train['PC3'] = pca_dataset['PC3']
test['PC1'] = pca_dataset_test['PC1']
test['PC2'] = pca_dataset_test['PC2']
test['PC3'] = pca_dataset_test['PC3']
train.head()
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count datetime-year datetime-month datetime-day datetime-hour datetime-minute datetime-second datetime-dayofweek datetime-dayofweek(humanized) datetime-year(str) datetime-month(str) datetime-year_month PC1 PC2 PC3
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0.0 3 13 16 2011 1 1 0 0 0 5 Saturday 2011 1 2011-1 -17.882763 -51.618478 -2.437472
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0.0 8 32 40 2011 1 1 1 0 0 5 Saturday 2011 1 2011-1 -16.814541 -51.165339 -2.192764
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0.0 5 27 32 2011 1 1 2 0 0 5 Saturday 2011 1 2011-1 -16.814541 -51.165339 -2.192764
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0.0 3 10 13 2011 1 1 3 0 0 5 Saturday 2011 1 2011-1 -17.776483 -47.576433 -2.489117
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0.0 0 1 1 2011 1 1 4 0 0 5 Saturday 2011 1 2011-1 -17.776483 -47.576433 -2.489117


Count
sns.distplot(train["count"])
<matplotlib.axes._subplots.AxesSubplot at 0x7f7b35df1668>

[Figure: distribution of count]

train["log_count"] = np.log(train["count"] + 1) #좀 더 정규분표와 비슷하게 만들어 신빙성을 높임
figure, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

figure.set_size_inches(18, 4)

sns.distplot(train["count"], ax=ax1)

sns.distplot(train["log_count"], ax=ax2)
<matplotlib.axes._subplots.AxesSubplot at 0x7f7b35a912b0>

[Figure: distributions of count and log_count]

train["count(recover)"] = np.exp(train["log_count"]) - 1

Fitting the model

# Replace the weather-related columns with the newly reduced axes (PC1, PC2, PC3)
features = ["season", "holiday", "workingday", "PC1", "PC2", "PC3",
                 "datetime-year", "datetime-hour", "datetime-dayofweek"]
features
['season',
 'holiday',
 'workingday',
 'PC1',
 'PC2',
 'PC3',
 'datetime-year',
 'datetime-hour',
 'datetime-dayofweek']
label = "log_count"
label
'log_count'
x_train = train[features]
x_test = test[features]
y_train = train[label]
Score method
import numpy as np


from sklearn.metrics import make_scorer


def rmse(predict, actual):

    predict = np.array(predict)
    actual = np.array(actual)

    distance = predict - actual  

    square_distance = distance ** 2   

    mean_square_distance = square_distance.mean()

    score = np.sqrt(mean_square_distance)
    
    return score

rmse_score = make_scorer(rmse)
rmse_score
make_scorer(rmse)
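One thing to note about make_scorer: by default it treats higher scores as better. That is fine here because the random search below records the raw RMSE and sorts it manually, but if this scorer were handed to an automatic search such as GridSearchCV, the sign would need flipping; a hedged sketch:

# Only needed if the scorer drives an automatic search (e.g. GridSearchCV / RandomizedSearchCV);
# the manual loops below sort by raw RMSE themselves.
rmse_scorer_minimize = make_scorer(rmse, greater_is_better=False)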
Hyperparameter tuning
# Import RandomForestRegressor from scikit-learn's ensemble module.
from sklearn.ensemble import RandomForestRegressor

# Import cross_val_score from scikit-learn's model_selection module.
from sklearn.model_selection import cross_val_score

# n_estimators is the number of trees.
# More is usually better, but training time grows with it, so a moderate value is used for now (300).
# Once the remaining hyperparameters are tuned, n_estimators can be raised as high as practical (e.g. 3,000).
n_estimators = 300

# Number of random-search iterations. 100 is a common choice.
num_epoch = 100

# Store the hyperparameter search results in a list.
coarse_hyperparameters_list = []

# Repeat the random search num_epoch times.
for epoch in range(num_epoch):
    # Draw a random integer between 2 and 100 and assign it to max_depth.
    max_depth = np.random.randint(low=2, high=100)

    # Draw a random float between 0.1 and 1.0 and assign it to max_features.
    max_features = np.random.uniform(low=0.1, high=1.0)

    # Build the Random Forest. The options are:
    # 1) n_estimators: number of trees.
    # 2) max_depth: maximum tree depth; trees branch down to this depth.
    # 3) max_features: feature sampling; a value between 0.0 and 1.0 uses that fraction of the features at each split.
    # 4) n_jobs: parallelism; -1 uses every available CPU core.
    # 5) random_state: fixes the randomness of the forest so results are reproducible (same idea as a seed number).
    model2 = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  max_features=max_features,
                                  n_jobs=-1,
                                  random_state=37)

    # Run cross_val_score with:
    # 1) model: the machine-learning model to score (here, the Random Forest).
    # 2) x_train: the train features.
    # 3) y_train: the train label.
    # 4) cv: number of cross-validation splits (20 here).
    # 5) scoring: the scoring function.
    # Finally, take the mean over the folds and store it in a new variable named score.
    score = cross_val_score(model2, x_train, y_train, cv=20, scoring=rmse_score).mean()

    # Collect this run's hyperparameters and score in a dictionary.
    hyperparameters = {
        'epoch': epoch,
        'score': score,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'max_features': max_features,
    }

    # Append the result to the list.
    coarse_hyperparameters_list.append(hyperparameters)

    # Print the result of this iteration.
    print(f"{epoch:2} n_estimators = {n_estimators}, max_depth = {max_depth:2}, max_features = {max_features:.6f}, Score = {score:.5f}")

# Convert coarse_hyperparameters_list into a pandas DataFrame.
coarse_hyperparameters_list = pd.DataFrame.from_dict(coarse_hyperparameters_list)

# Sort coarse_hyperparameters_list by score, best (lowest RMSE) first.
coarse_hyperparameters_list = coarse_hyperparameters_list.sort_values(by="score")

# Print the shape of the data in coarse_hyperparameters_list.
# The output is shown as (rows, columns).
print(coarse_hyperparameters_list.shape)

# Show the top 10 rows of coarse_hyperparameters_list.
coarse_hyperparameters_list.head(10)
 0 n_estimators = 300, max_depth = 60, max_features = 0.894427, Score = 0.37536
 1 n_estimators = 300, max_depth = 64, max_features = 0.824407, Score = 0.37384
 2 n_estimators = 300, max_depth = 38, max_features = 0.670715, Score = 0.37299
 3 n_estimators = 300, max_depth = 97, max_features = 0.541787, Score = 0.38654
 4 n_estimators = 300, max_depth = 17, max_features = 0.943065, Score = 0.37592
 5 n_estimators = 300, max_depth = 15, max_features = 0.883666, Score = 0.37421
 6 n_estimators = 300, max_depth = 59, max_features = 0.972636, Score = 0.37536
 7 n_estimators = 300, max_depth = 67, max_features = 0.587571, Score = 0.37658
 8 n_estimators = 300, max_depth = 89, max_features = 0.382686, Score = 0.40708
 9 n_estimators = 300, max_depth = 28, max_features = 0.904945, Score = 0.37514
10 n_estimators = 300, max_depth = 36, max_features = 0.269547, Score = 0.45493
11 n_estimators = 300, max_depth =  8, max_features = 0.393723, Score = 0.52784
12 n_estimators = 300, max_depth = 62, max_features = 0.181551, Score = 0.52206
13 n_estimators = 300, max_depth = 46, max_features = 0.135466, Score = 0.52206
14 n_estimators = 300, max_depth = 70, max_features = 0.693570, Score = 0.37299
15 n_estimators = 300, max_depth = 83, max_features = 0.281162, Score = 0.45492
16 n_estimators = 300, max_depth = 30, max_features = 0.629243, Score = 0.37645
17 n_estimators = 300, max_depth = 82, max_features = 0.741254, Score = 0.37299
18 n_estimators = 300, max_depth = 30, max_features = 0.170742, Score = 0.52246
19 n_estimators = 300, max_depth = 58, max_features = 0.833930, Score = 0.37384
20 n_estimators = 300, max_depth = 64, max_features = 0.157814, Score = 0.52206
21 n_estimators = 300, max_depth = 82, max_features = 0.622749, Score = 0.37658
22 n_estimators = 300, max_depth =  3, max_features = 0.104265, Score = 1.06773
23 n_estimators = 300, max_depth = 73, max_features = 0.307550, Score = 0.45492
24 n_estimators = 300, max_depth = 33, max_features = 0.984866, Score = 0.37542
25 n_estimators = 300, max_depth = 61, max_features = 0.213921, Score = 0.52206
26 n_estimators = 300, max_depth = 50, max_features = 0.322691, Score = 0.45492
27 n_estimators = 300, max_depth = 65, max_features = 0.222195, Score = 0.52206
28 n_estimators = 300, max_depth = 32, max_features = 0.288541, Score = 0.45490
29 n_estimators = 300, max_depth = 83, max_features = 0.635496, Score = 0.37658
30 n_estimators = 300, max_depth = 47, max_features = 0.333568, Score = 0.40708
31 n_estimators = 300, max_depth =  2, max_features = 0.369659, Score = 0.92161
32 n_estimators = 300, max_depth = 16, max_features = 0.887800, Score = 0.37411
33 n_estimators = 300, max_depth = 44, max_features = 0.778491, Score = 0.37384
34 n_estimators = 300, max_depth = 42, max_features = 0.835953, Score = 0.37384
35 n_estimators = 300, max_depth = 68, max_features = 0.502302, Score = 0.38654
36 n_estimators = 300, max_depth = 73, max_features = 0.486413, Score = 0.38654
37 n_estimators = 300, max_depth =  4, max_features = 0.555364, Score = 0.69367
38 n_estimators = 300, max_depth = 98, max_features = 0.834484, Score = 0.37384
39 n_estimators = 300, max_depth = 37, max_features = 0.575902, Score = 0.37657
40 n_estimators = 300, max_depth = 52, max_features = 0.583103, Score = 0.37658
41 n_estimators = 300, max_depth = 82, max_features = 0.628302, Score = 0.37658
42 n_estimators = 300, max_depth = 66, max_features = 0.880932, Score = 0.37384
43 n_estimators = 300, max_depth = 36, max_features = 0.355411, Score = 0.40711
44 n_estimators = 300, max_depth = 16, max_features = 0.583997, Score = 0.37762
45 n_estimators = 300, max_depth = 29, max_features = 0.862560, Score = 0.37360
46 n_estimators = 300, max_depth = 42, max_features = 0.334689, Score = 0.40708
47 n_estimators = 300, max_depth = 63, max_features = 0.494164, Score = 0.38654
48 n_estimators = 300, max_depth = 46, max_features = 0.188985, Score = 0.52206
49 n_estimators = 300, max_depth = 81, max_features = 0.558748, Score = 0.37658
50 n_estimators = 300, max_depth =  9, max_features = 0.949109, Score = 0.40934
51 n_estimators = 300, max_depth =  4, max_features = 0.154370, Score = 0.96037
52 n_estimators = 300, max_depth =  7, max_features = 0.989723, Score = 0.48421
53 n_estimators = 300, max_depth = 40, max_features = 0.370733, Score = 0.40708
54 n_estimators = 300, max_depth =  7, max_features = 0.671371, Score = 0.49037
55 n_estimators = 300, max_depth = 69, max_features = 0.688413, Score = 0.37299
56 n_estimators = 300, max_depth = 14, max_features = 0.221293, Score = 0.54334
57 n_estimators = 300, max_depth = 33, max_features = 0.670240, Score = 0.37290
58 n_estimators = 300, max_depth =  3, max_features = 0.644769, Score = 0.74844
59 n_estimators = 300, max_depth = 51, max_features = 0.120319, Score = 0.52206
60 n_estimators = 300, max_depth = 20, max_features = 0.852481, Score = 0.37286
61 n_estimators = 300, max_depth = 82, max_features = 0.855937, Score = 0.37384
62 n_estimators = 300, max_depth = 76, max_features = 0.463208, Score = 0.38654
63 n_estimators = 300, max_depth = 20, max_features = 0.774948, Score = 0.37371
64 n_estimators = 300, max_depth = 84, max_features = 0.491074, Score = 0.38654
65 n_estimators = 300, max_depth =  3, max_features = 0.139929, Score = 1.06773
66 n_estimators = 300, max_depth =  5, max_features = 0.537367, Score = 0.62741
67 n_estimators = 300, max_depth = 31, max_features = 0.867675, Score = 0.37368
68 n_estimators = 300, max_depth = 98, max_features = 0.392808, Score = 0.40708
69 n_estimators = 300, max_depth = 11, max_features = 0.437995, Score = 0.44879
70 n_estimators = 300, max_depth = 31, max_features = 0.443427, Score = 0.40705
71 n_estimators = 300, max_depth = 87, max_features = 0.838645, Score = 0.37384
72 n_estimators = 300, max_depth = 69, max_features = 0.119969, Score = 0.52206
73 n_estimators = 300, max_depth = 38, max_features = 0.355214, Score = 0.40707
74 n_estimators = 300, max_depth = 89, max_features = 0.434928, Score = 0.40708
75 n_estimators = 300, max_depth = 36, max_features = 0.885137, Score = 0.37381
76 n_estimators = 300, max_depth = 76, max_features = 0.254060, Score = 0.45492
77 n_estimators = 300, max_depth = 37, max_features = 0.486067, Score = 0.38653
78 n_estimators = 300, max_depth = 90, max_features = 0.748630, Score = 0.37299
79 n_estimators = 300, max_depth = 77, max_features = 0.805650, Score = 0.37384
80 n_estimators = 300, max_depth = 72, max_features = 0.554798, Score = 0.38654
81 n_estimators = 300, max_depth = 25, max_features = 0.778340, Score = 0.37411
82 n_estimators = 300, max_depth = 75, max_features = 0.800938, Score = 0.37384
83 n_estimators = 300, max_depth = 14, max_features = 0.936876, Score = 0.37611
84 n_estimators = 300, max_depth = 48, max_features = 0.138635, Score = 0.52206
85 n_estimators = 300, max_depth = 71, max_features = 0.169769, Score = 0.52206
86 n_estimators = 300, max_depth = 64, max_features = 0.312676, Score = 0.45492
87 n_estimators = 300, max_depth = 81, max_features = 0.541337, Score = 0.38654
88 n_estimators = 300, max_depth = 86, max_features = 0.423556, Score = 0.40708
89 n_estimators = 300, max_depth =  9, max_features = 0.476241, Score = 0.45649
90 n_estimators = 300, max_depth = 20, max_features = 0.937376, Score = 0.37603
91 n_estimators = 300, max_depth = 91, max_features = 0.937290, Score = 0.37536
92 n_estimators = 300, max_depth = 93, max_features = 0.534470, Score = 0.38654
93 n_estimators = 300, max_depth = 93, max_features = 0.520332, Score = 0.38654
94 n_estimators = 300, max_depth = 83, max_features = 0.221991, Score = 0.52206
95 n_estimators = 300, max_depth = 51, max_features = 0.408976, Score = 0.40708
96 n_estimators = 300, max_depth =  7, max_features = 0.553989, Score = 0.52635
97 n_estimators = 300, max_depth = 85, max_features = 0.793968, Score = 0.37384
98 n_estimators = 300, max_depth = 74, max_features = 0.134740, Score = 0.52206
99 n_estimators = 300, max_depth = 80, max_features = 0.383653, Score = 0.40708
(100, 5)
epoch score n_estimators max_depth max_features
60 60 0.372858 300 20 0.852481
57 57 0.372899 300 33 0.670240
2 2 0.372986 300 38 0.670715
17 17 0.372989 300 82 0.741254
55 55 0.372989 300 69 0.688413
78 78 0.372989 300 90 0.748630
14 14 0.372989 300 70 0.693570
45 45 0.373601 300 29 0.862560
67 67 0.373676 300 31 0.867675
63 63 0.373705 300 20 0.774948
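For reference, the same coarse search could be written with scikit-learn's RandomizedSearchCV instead of a hand-rolled loop; a minimal sketch under the same settings (the distributions, n_iter and cv mirror the loop above; this is not what the notebook actually ran):

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV

# RandomizedSearchCV maximizes its scorer, so RMSE goes in with greater_is_better=False.
search = RandomizedSearchCV(
    RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=37),
    param_distributions={
        "max_depth": randint(2, 100),        # same range as np.random.randint(low=2, high=100)
        "max_features": uniform(0.1, 0.9),   # uniform on [0.1, 1.0), same as np.random.uniform above
    },
    n_iter=100,
    cv=20,
    scoring=make_scorer(rmse, greater_is_better=False),
    random_state=37,
)
search.fit(x_train, y_train)
print(search.best_params_)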
# Finer random search: narrow the ranges around the best coarse results
# (max_depth roughly 20-90, max_features roughly 0.67-0.87).
n_estimators = 300

num_epoch = 100
finer_hyperparameters_list = []

for epoch in range(num_epoch):
    max_depth = np.random.randint(low=20, high=90)
    max_features = np.random.uniform(low=0.67, high=0.87)
    model2 = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  max_features=max_features,
                                  n_jobs=-1,
                                  random_state=37)

    score = cross_val_score(model2, x_train, y_train, cv=20, scoring=rmse_score).mean()

    hyperparameters = {
        'epoch': epoch,
        'score': score,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'max_features': max_features,
    }

    finer_hyperparameters_list.append(hyperparameters)

    print(f"{epoch:2} n_estimators = {n_estimators}, max_depth = {max_depth:2}, max_features = {max_features:.6f}, Score = {score:.5f}")

finer_hyperparameters_list = pd.DataFrame.from_dict(finer_hyperparameters_list)

finer_hyperparameters_list = finer_hyperparameters_list.sort_values(by="score")

print(finer_hyperparameters_list.shape)

finer_hyperparameters_list.head(10)
 0 n_estimators = 300, max_depth = 31, max_features = 0.701086, Score = 0.37267
 1 n_estimators = 300, max_depth = 23, max_features = 0.728162, Score = 0.37346
 2 n_estimators = 300, max_depth = 36, max_features = 0.866035, Score = 0.37381
 3 n_estimators = 300, max_depth = 46, max_features = 0.858805, Score = 0.37384
 4 n_estimators = 300, max_depth = 34, max_features = 0.674593, Score = 0.37296
 5 n_estimators = 300, max_depth = 51, max_features = 0.858732, Score = 0.37384
 6 n_estimators = 300, max_depth = 59, max_features = 0.807806, Score = 0.37384
 7 n_estimators = 300, max_depth = 79, max_features = 0.756331, Score = 0.37299
 8 n_estimators = 300, max_depth = 22, max_features = 0.718962, Score = 0.37331
 9 n_estimators = 300, max_depth = 47, max_features = 0.776317, Score = 0.37299
10 n_estimators = 300, max_depth = 84, max_features = 0.810409, Score = 0.37384
11 n_estimators = 300, max_depth = 55, max_features = 0.825679, Score = 0.37384
12 n_estimators = 300, max_depth = 70, max_features = 0.741438, Score = 0.37299
13 n_estimators = 300, max_depth = 53, max_features = 0.671523, Score = 0.37299
14 n_estimators = 300, max_depth = 41, max_features = 0.826184, Score = 0.37384
15 n_estimators = 300, max_depth = 57, max_features = 0.749898, Score = 0.37299
16 n_estimators = 300, max_depth = 61, max_features = 0.756896, Score = 0.37299
17 n_estimators = 300, max_depth = 40, max_features = 0.710294, Score = 0.37298
18 n_estimators = 300, max_depth = 52, max_features = 0.785932, Score = 0.37384
19 n_estimators = 300, max_depth = 89, max_features = 0.710544, Score = 0.37299
20 n_estimators = 300, max_depth = 25, max_features = 0.843766, Score = 0.37411
21 n_estimators = 300, max_depth = 37, max_features = 0.841488, Score = 0.37381
22 n_estimators = 300, max_depth = 68, max_features = 0.841873, Score = 0.37384
23 n_estimators = 300, max_depth = 64, max_features = 0.732711, Score = 0.37299
24 n_estimators = 300, max_depth = 50, max_features = 0.750902, Score = 0.37299
25 n_estimators = 300, max_depth = 22, max_features = 0.726276, Score = 0.37331
26 n_estimators = 300, max_depth = 68, max_features = 0.717181, Score = 0.37299
27 n_estimators = 300, max_depth = 25, max_features = 0.799544, Score = 0.37411
28 n_estimators = 300, max_depth = 73, max_features = 0.793114, Score = 0.37384
29 n_estimators = 300, max_depth = 47, max_features = 0.673632, Score = 0.37299
30 n_estimators = 300, max_depth = 29, max_features = 0.746753, Score = 0.37299
31 n_estimators = 300, max_depth = 43, max_features = 0.795795, Score = 0.37384
32 n_estimators = 300, max_depth = 35, max_features = 0.788314, Score = 0.37382
33 n_estimators = 300, max_depth = 29, max_features = 0.844361, Score = 0.37360
34 n_estimators = 300, max_depth = 27, max_features = 0.812489, Score = 0.37375
35 n_estimators = 300, max_depth = 66, max_features = 0.843089, Score = 0.37384
36 n_estimators = 300, max_depth = 79, max_features = 0.757582, Score = 0.37299
37 n_estimators = 300, max_depth = 33, max_features = 0.730597, Score = 0.37290
38 n_estimators = 300, max_depth = 72, max_features = 0.688434, Score = 0.37299
39 n_estimators = 300, max_depth = 60, max_features = 0.830167, Score = 0.37384
40 n_estimators = 300, max_depth = 89, max_features = 0.732375, Score = 0.37299
41 n_estimators = 300, max_depth = 84, max_features = 0.675079, Score = 0.37299
42 n_estimators = 300, max_depth = 69, max_features = 0.684302, Score = 0.37299
43 n_estimators = 300, max_depth = 43, max_features = 0.759886, Score = 0.37299
44 n_estimators = 300, max_depth = 69, max_features = 0.853411, Score = 0.37384
45 n_estimators = 300, max_depth = 22, max_features = 0.723072, Score = 0.37331
46 n_estimators = 300, max_depth = 88, max_features = 0.767050, Score = 0.37299
47 n_estimators = 300, max_depth = 33, max_features = 0.867405, Score = 0.37389
48 n_estimators = 300, max_depth = 32, max_features = 0.865525, Score = 0.37384
49 n_estimators = 300, max_depth = 23, max_features = 0.672447, Score = 0.37346
50 n_estimators = 300, max_depth = 20, max_features = 0.708345, Score = 0.37371
51 n_estimators = 300, max_depth = 67, max_features = 0.707316, Score = 0.37299
52 n_estimators = 300, max_depth = 44, max_features = 0.765370, Score = 0.37299
53 n_estimators = 300, max_depth = 43, max_features = 0.757173, Score = 0.37299
54 n_estimators = 300, max_depth = 75, max_features = 0.670581, Score = 0.37299
55 n_estimators = 300, max_depth = 44, max_features = 0.704542, Score = 0.37299
56 n_estimators = 300, max_depth = 76, max_features = 0.699946, Score = 0.37299
57 n_estimators = 300, max_depth = 73, max_features = 0.746892, Score = 0.37299
58 n_estimators = 300, max_depth = 65, max_features = 0.784508, Score = 0.37384
59 n_estimators = 300, max_depth = 22, max_features = 0.849242, Score = 0.37376
60 n_estimators = 300, max_depth = 31, max_features = 0.745068, Score = 0.37267
61 n_estimators = 300, max_depth = 77, max_features = 0.688783, Score = 0.37299
62 n_estimators = 300, max_depth = 42, max_features = 0.774949, Score = 0.37299
63 n_estimators = 300, max_depth = 67, max_features = 0.841762, Score = 0.37384
64 n_estimators = 300, max_depth = 84, max_features = 0.794596, Score = 0.37384
65 n_estimators = 300, max_depth = 48, max_features = 0.697236, Score = 0.37299
66 n_estimators = 300, max_depth = 56, max_features = 0.691465, Score = 0.37299
67 n_estimators = 300, max_depth = 59, max_features = 0.670501, Score = 0.37299
68 n_estimators = 300, max_depth = 89, max_features = 0.757681, Score = 0.37299
69 n_estimators = 300, max_depth = 68, max_features = 0.751281, Score = 0.37299
70 n_estimators = 300, max_depth = 61, max_features = 0.676839, Score = 0.37299
71 n_estimators = 300, max_depth = 77, max_features = 0.674109, Score = 0.37299
72 n_estimators = 300, max_depth = 24, max_features = 0.795705, Score = 0.37366
73 n_estimators = 300, max_depth = 47, max_features = 0.710028, Score = 0.37299
74 n_estimators = 300, max_depth = 76, max_features = 0.827661, Score = 0.37384
75 n_estimators = 300, max_depth = 71, max_features = 0.686260, Score = 0.37299
76 n_estimators = 300, max_depth = 23, max_features = 0.793720, Score = 0.37323
77 n_estimators = 300, max_depth = 61, max_features = 0.672520, Score = 0.37299
78 n_estimators = 300, max_depth = 21, max_features = 0.696986, Score = 0.37411
79 n_estimators = 300, max_depth = 48, max_features = 0.839572, Score = 0.37384
80 n_estimators = 300, max_depth = 48, max_features = 0.821993, Score = 0.37384
81 n_estimators = 300, max_depth = 88, max_features = 0.702965, Score = 0.37299
82 n_estimators = 300, max_depth = 43, max_features = 0.777918, Score = 0.37384
83 n_estimators = 300, max_depth = 37, max_features = 0.830345, Score = 0.37381
84 n_estimators = 300, max_depth = 45, max_features = 0.849349, Score = 0.37384
85 n_estimators = 300, max_depth = 81, max_features = 0.708333, Score = 0.37299
86 n_estimators = 300, max_depth = 67, max_features = 0.790888, Score = 0.37384
87 n_estimators = 300, max_depth = 41, max_features = 0.761437, Score = 0.37299
88 n_estimators = 300, max_depth = 86, max_features = 0.727960, Score = 0.37299
89 n_estimators = 300, max_depth = 65, max_features = 0.755233, Score = 0.37299
90 n_estimators = 300, max_depth = 89, max_features = 0.738501, Score = 0.37299
91 n_estimators = 300, max_depth = 26, max_features = 0.803400, Score = 0.37401
92 n_estimators = 300, max_depth = 32, max_features = 0.735997, Score = 0.37307
93 n_estimators = 300, max_depth = 70, max_features = 0.846867, Score = 0.37384
94 n_estimators = 300, max_depth = 44, max_features = 0.800791, Score = 0.37384
95 n_estimators = 300, max_depth = 69, max_features = 0.849007, Score = 0.37384
96 n_estimators = 300, max_depth = 86, max_features = 0.849849, Score = 0.37384
97 n_estimators = 300, max_depth = 67, max_features = 0.686199, Score = 0.37299
98 n_estimators = 300, max_depth = 53, max_features = 0.737380, Score = 0.37299
99 n_estimators = 300, max_depth = 69, max_features = 0.788243, Score = 0.37384
(100, 5)
epoch score n_estimators max_depth max_features
0 0 0.372665 300 31 0.701086
60 60 0.372665 300 31 0.745068
37 37 0.372899 300 33 0.730597
4 4 0.372957 300 34 0.674593
17 17 0.372984 300 40 0.710294
87 87 0.372988 300 41 0.761437
69 69 0.372989 300 68 0.751281
68 68 0.372989 300 89 0.757681
67 67 0.372989 300 59 0.670501
66 66 0.372989 300 56 0.691465
# Take the hyperparameters with the lowest score (= best accuracy)
# and store them in a variable named best_hyperparameters.
best_hyperparameters = finer_hyperparameters_list.iloc[0]

# Pull just the max_depth hyperparameter out of best_hyperparameters
# and store it in a variable named best_max_depth.
best_max_depth = best_hyperparameters["max_depth"]

# Pull just the max_features hyperparameter out of best_hyperparameters
# and store it in a variable named best_max_features.
best_max_features = best_hyperparameters["max_features"]

# Print best_max_depth and best_max_features.
print(f"max_depth(best) = {best_max_depth}, max_features(best) = {best_max_features:.6f}")
max_depth(best) = 31.0, max_features(best) = 0.701086
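A small caveat: because the row comes out of a mixed-dtype DataFrame, best_max_depth is the float 31.0 rather than the int 31 (visible in the model repr below). Older scikit-learn versions accept that, but newer ones validate max_depth as an integer, so casting is the safer habit; a minimal sketch (not applied in the run recorded below, where the repr still shows 31.0):

# Cast to int so the value is valid across scikit-learn versions (31.0 -> 31)
best_max_depth = int(best_max_depth)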
# Import RandomForestRegressor from scikit-learn's ensemble module (already imported above).
from sklearn.ensemble import RandomForestRegressor

# Now that hyperparameter tuning is done, raise n_estimators as high as practical.
best_n_estimators = 3000

# Note: if the hyperparameter search above takes too long,
# these values can be used instead (uncomment the two lines below).
# best_max_depth = 83
# best_max_features = 0.851358

# Build the RandomForestRegressor with the following options:
# 1) n_estimators: number of trees to grow.
# 2) max_depth: maximum tree depth; trees branch down to this depth.
# 3) max_features: feature sampling; a value between 0.0 and 1.0 uses that fraction of the features at each split.
# 4) n_jobs: parallelism; -1 uses every available CPU core.
# 5) random_state: fixes the randomness so results are reproducible (same idea as a seed number).
model2 = RandomForestRegressor(n_estimators=best_n_estimators,
                              max_depth=best_max_depth,
                              max_features=best_max_features,
                              random_state=37,
                              n_jobs=-1)
model2
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=31.0, max_features=0.7010856639268682,
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=3000,
                      n_jobs=-1, oob_score=False, random_state=37, verbose=0,
                      warm_start=False)
model2.fit(x_train, y_train)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=31.0, max_features=0.7010856639268682,
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=3000,
                      n_jobs=-1, oob_score=False, random_state=37, verbose=0,
                      warm_start=False)
log_predictions = model2.predict(x_test)
print(log_predictions.shape)
log_predictions 
(6493,)





array([2.47414611, 1.61229304, 1.3834309 , ..., 4.42168811, 4.19872127,
       3.63046707])
predictions = np.exp(log_predictions) - 1
print(predictions.shape)
predictions
(6493,)





array([10.87156579,  4.01429602,  2.98856254, ..., 82.23667949,
       65.6011116 , 36.73043521])
submission = pd.read_csv("/content/drive/My Drive/Colab Notebooks/bike/sampleSubmission.csv")
submission["count"] = predictions
submission.to_csv("/content/drive/My Drive/finish.csv", index=False)
