Kaggle 자전거 수요 분석

PCA(주성분 분석)를 이용하여 분석해 보았습니다.

import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

# Mount Google Drive first: both CSV paths below live under /content/drive,
# so reading them before the mount fails on a fresh Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# header=0: the first row of each file holds the column names.
# parse_dates converts the "datetime" column to datetime values so the
# .dt accessor can be used on it later in the notebook.
# NOTE(review): the original note "header None->0->1" suggests other header
# values were tried; 0 is the value the code actually uses.
test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/bike/test.csv", header=0, parse_dates=["datetime"])
train = pd.read_csv("/content/drive/My Drive/Colab Notebooks/bike/train.csv", header=0, parse_dates=["datetime"])

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive

train.head()

	datetime	season	weather	temp	atemp	humidity	casual	registered	count
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1

test.head()

	datetime	season	workingday	weather	temp	atemp	humidity	windspeed
0	2011-01-20 00:00:00	1	1	1	10.66	11.365	56	26.0027
1	2011-01-20 01:00:00	1	1	1	10.66	13.635	56	0.0000
2	2011-01-20 02:00:00	1	1	1	10.66	13.635	56	0.0000
3	2011-01-20 03:00:00	1	1	1	10.66	12.880	56	11.0014
4	2011-01-20 04:00:00	1	1	1	10.66	12.880	56	11.0014

train.shape

(10886, 12)

test.shape

(6493, 9)

train.describe()

	season	holiday	workingday	weather	temp	atemp	humidity	windspeed	casual	registered	count
count	10886.000000	10886.000000	10886.000000	10886.000000	10886.00000	10886.000000	10886.000000	10886.000000	10886.000000	10886.000000	10886.000000
mean	2.506614	0.028569	0.680875	1.418427	20.23086	23.655084	61.886460	12.799395	36.021955	155.552177	191.574132
std	1.116174	0.166599	0.466159	0.633839	7.79159	8.474601	19.245033	8.164537	49.960477	151.039033	181.144454
min	1.000000	0.000000	0.000000	1.000000	0.82000	0.760000	0.000000	0.000000	0.000000	0.000000	1.000000
25%	2.000000	0.000000	0.000000	1.000000	13.94000	16.665000	47.000000	7.001500	4.000000	36.000000	42.000000
50%	3.000000	0.000000	1.000000	1.000000	20.50000	24.240000	62.000000	12.998000	17.000000	118.000000	145.000000
75%	4.000000	0.000000	1.000000	2.000000	26.24000	31.060000	77.000000	16.997900	49.000000	222.000000	284.000000
max	4.000000	1.000000	1.000000	4.000000	41.00000	45.455000	100.000000	56.996900	367.000000	886.000000	977.000000

test.describe()

	season	holiday	workingday	weather	temp	atemp	humidity	windspeed
count	6493.000000	6493.000000	6493.000000	6493.000000	6493.000000	6493.000000	6493.000000	6493.000000
mean	2.493300	0.029108	0.685815	1.436778	20.620607	24.012865	64.125212	12.631157
std	1.091258	0.168123	0.464226	0.648390	8.059583	8.782741	19.293391	8.250151
min	1.000000	0.000000	0.000000	1.000000	0.820000	0.000000	16.000000	0.000000
25%	2.000000	0.000000	0.000000	1.000000	13.940000	16.665000	49.000000	7.001500
50%	3.000000	0.000000	1.000000	1.000000	21.320000	25.000000	65.000000	11.001400
75%	3.000000	0.000000	1.000000	2.000000	27.060000	31.060000	81.000000	16.997900
max	4.000000	1.000000	1.000000	4.000000	40.180000	50.000000	100.000000	55.998600

train.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

test.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
dtype: int64

weather_data = ['season', 'temp', 'weather' , 'atemp', 'humidity', 'windspeed']
train[weather_data].corr()

	season	temp	weather	atemp	humidity	windspeed
season	1.000000	0.258689	0.008879	0.264744	0.190610	-0.147121
temp	0.258689	1.000000	-0.055035	0.984948	-0.064949	-0.017852
weather	0.008879	-0.055035	1.000000	-0.055376	0.406244	0.007261
atemp	0.264744	0.984948	-0.055376	1.000000	-0.043536	-0.057473
humidity	0.190610	-0.064949	0.406244	-0.043536	1.000000	-0.318607
windspeed	-0.147121	-0.017852	0.007261	-0.057473	-0.318607	1.000000

EDA / Preprocessing

Datetime

def _add_datetime_features(frame):
    """Add calendar columns derived from ``frame["datetime"]``, in place.

    Columns are added in this order: datetime-year, -month, -day, -hour,
    -minute, -second, -dayofweek (0 = Monday ... 6 = Sunday) and
    "datetime-dayofweek(humanized)" (English day name, "Monday" ... "Sunday").

    Args:
        frame: DataFrame with a datetime-typed "datetime" column.

    Returns:
        None. ``frame`` is modified in place.
    """
    stamp = frame["datetime"].dt

    frame["datetime-year"] = stamp.year
    frame["datetime-month"] = stamp.month
    frame["datetime-day"] = stamp.day
    frame["datetime-hour"] = stamp.hour
    frame["datetime-minute"] = stamp.minute
    frame["datetime-second"] = stamp.second

    frame["datetime-dayofweek"] = stamp.dayofweek

    # day_name() yields the same English labels the original spelled out
    # one weekday at a time with seven .loc assignments.
    frame["datetime-dayofweek(humanized)"] = stamp.day_name()


# Apply the identical feature engineering to both splits so that train and
# test can never drift apart.
_add_datetime_features(train)
_add_datetime_features(test)

# 2 x 3 grid: one bar chart of "count" per date/time component.
figure, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(nrows=2, ncols=3)
figure.set_size_inches(18, 8)

# Pair every component column with the axes it is drawn on.
datetime_panels = (
    ("datetime-year", ax1),
    ("datetime-month", ax2),
    ("datetime-day", ax3),
    ("datetime-hour", ax4),
    ("datetime-minute", ax5),
    ("datetime-second", ax6),
)
for column_name, axis in datetime_panels:
    sns.barplot(data=train, x=column_name, y="count", ax=axis)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7b3f03e080>

png

datetime-year

2011년도의 자전거 대여량보다 2012년도의 자전거 대여량이 더 높음. 이는 Bike Sharing Demand 경진대회를 주최한 Capital Bikeshare사가 꾸준히 성장하고 있다고 간주할 수 있음.

datetime-month

주로 여름(6-8월)에 자전거를 많이 빌리며, 겨울(12-2월)에는 자전거를 많이 빌리지 않는다
같은 겨울이라도 12월의 자전거 대여량이 1월의 자전거 대여량보다 두 배 가까이 높다. (연도에 따른 성장세가 반영되어 있을 듯)

datetime-day

x축을 자세히 보면 1일부터 19일까지밖에 없다. 20일은 test 데이터에. 즉, train 데이터와 test 데이터를 나누는 기준이 되는 컬럼이 바로 datetime-day.
이런 경우 datetime-day를 feature로 집어넣으면 머신러닝 알고리즘이 과적합(overfitting)될 수 있다. 그러므로 train 데이터와 test 데이터를 나누는 기준이 되는 컬럼이 있으면, 이 컬럼은 feature로 사용하지 않는다.

datetime-hour

새벽 시간에는 사람들이 자전거를 빌리지 않으며, 오후 시간에 상대적으로 자전거를 많이 빌린다.
두 시간대, 출근 시간(7~9시)과 퇴근 시간(16시~19시)의 경우 사람들이 많이 빌린다.

datetime-minute & datetime-second

이 두 컬럼은 x축이 모두 0으로 datetime-minute과 datetime-second은 기록되고 있지 않다. 이 경우에는 feature로 넣어도 큰 의미가 없기 때문에 사용X

그러므로 이 시각화에서 알 수 있는 결론은, 전체 여섯개의 컬럼 중 datetime-year와 datetime-month, 그리고 datetime-hour만 사용.

# String copies of year and month, joined into a single "YYYY-M" label.
train["datetime-year(str)"] = train["datetime-year"].astype('str')
train["datetime-month(str)"] = train["datetime-month"].astype('str')
train["datetime-year_month"] = train["datetime-year(str)"].str.cat(
    train["datetime-month(str)"], sep="-"
)

# Top row: count by year and by month, side by side.
figure, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
figure.set_size_inches(18, 4)
for column_name, axis in (("datetime-year", ax1), ("datetime-month", ax2)):
    sns.barplot(data=train, x=column_name, y="count", ax=axis)

# Second figure: count by the combined year-month label.
figure, ax3 = plt.subplots(nrows=1, ncols=1)
figure.set_size_inches(18, 4)
sns.barplot(data=train, x="datetime-year_month", y="count", ax=ax3)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7b3d47bef0>

png

자전거 대여량은 꾸준히 상승하고 있는 추세
우상단 시각화를 보자면, 12월의 자전거 대여량이 1월의 자전거 대여량보다 두 배 가까이 높다.
하지만 아래의 시각화를 보면, 2011년 12월의 자전거 대여량과 2012년 1월의 자전거 대여량이 큰 차이가 없다.
반면에 2011년 1월의 자전거 대여량과 2012년 12월의 자전거 대여량은 큰 차이

즉, 12월이 1월에 비해 자전거 대여량이 두 배 가까이 높은 이유는, 1) [Capital Bikeshare]의 자전거 대여량이 꾸준히 상승하고 있는 추세이며, 2) 이 과정에서 시기상으로 12월이 1월보다 늦게 발생했기 때문이다. 즉, 자전거를 대여하는 고객 입장에서 12월이라고 자전거를 더 많이 빌려야 할 이유는 없다.

이 점 역시 머신러닝 알고리즘이 과적합(overfitting)될 소지 있으며. 이를 해결할 수 있는 다양한 방법이 있는데,

datetime-year_month를 통째로 One Hot Encoding해서 feature로 사용한다.
자전거 대여량이 꾸준히 성장하는 추세에 맞춰서 count를 보정한다.

가장 쉽고 빠르게 머신러닝 모델의 정확도를 늘리는 방법은 datetime-month를 feature로 사용하지 않는 것이다. 그러므로 연/월/일/시/분/초 여섯 개의 컬럼 중 연도(datetime-year)와 시간(datetime-hour), 이렇게 두 개의 컬럼만 사용한다.

# Three stacked point plots of count by hour: overall, split by working
# day, and split by day of week.
figure, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1)
figure.set_size_inches(18, 12)

hourly_panels = (
    (None, ax1),
    ("workingday", ax2),
    ("datetime-dayofweek(humanized)", ax3),
)
for hue_column, axis in hourly_panels:
    sns.pointplot(data=train, x="datetime-hour", y="count", hue=hue_column, ax=axis)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7b3d1c92b0>

png

사람들은 기본적으로 출근 시간(7-9시)과 퇴근 시간(16-19시)에 자전거를 많이 빌림
하지만 이는 근무일일 경우(workingday == 1)에만 한정. 근무일이 아닐 경우(workingday == 0), 사람들은 출/퇴근시간에 자전거를 빌리지 않고, 오후 시간(10 ~ 16시)에 자전거를 많이 빌린다.

요일(datetime-dayofweek)별 자전거 대여량

먼저 금요일을 살펴보면, 다른 주중(월-목)에 비해 퇴근 시간(17-19시)에 상대적으로 자전거를 덜 빌린다. 이는 추측건대 모종의 이유로 자전거를 탈 수 없거나(ex: 음주), 다른 교통수단(ex: 버스, 택시)을 대신 사용했기 때문일 것이다.
반면 금요일은 주중임에도 불구하고 상대적으로 오후 시간(10~16시)의 자전거 대여량이 높다. 그 다음으로 높은 주중은 월요일. 즉, 금요일과 월요일은 주중임에도 불구하고 어느정도 주말의 속성을 가지고 있다.
주말의 경우. 일요일을 보자면, 토요일에 비해 상대적으로 자전거 대여량이 낮다 이는 추측컨데 월요일의 피로도를 고려해서 토요일에 비해 대외 활동을 덜 가지는 것

이 분석을 통해 알 수 있는 사실은, 요일(datetime-dayofweek)을 feature로 집어넣으면 근무일(workingday)만 집어넣는 것에 비해 더 좋은 성능을 낼 수 있다. 그러므로 요일(datetime-dayofweek) 컬럼을 feature로 추가

PCA 시작

list(train)

['datetime',
 'season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'casual',
 'registered',
 'count',
 'datetime-year',
 'datetime-month',
 'datetime-day',
 'datetime-hour',
 'datetime-minute',
 'datetime-second',
 'datetime-dayofweek',
 'datetime-dayofweek(humanized)',
 'datetime-year(str)',
 'datetime-month(str)',
 'datetime-year_month']

차원축소란 원래 차원(x, y축)을 다른 축으로 바꿔주는데,

그 축은 전체데이터를 잘 표현할수 있는 변수(차원)임 (분산이 가장 큰)

차원 간에 다중공선성(multicollinearity, 변수들이 서로 독립적이지 않고 강한 상관관계를 가짐)이 높다고 판단되는 차원들을 묶겠습니다

# Weather-related columns that will be fed to the PCA.
weather1 = ['season', 'temp', 'atemp', 'humidity', 'windspeed']

# Same column subset taken from both splits.
only_weather1 = train.loc[:, weather1]
only_weather_test = test.loc[:, weather1]
only_weather_test.head()

	season	temp	atemp	humidity	windspeed
0	1	10.66	11.365	56	26.0027
1	1	10.66	13.635	56	0.0000
2	1	10.66	13.635	56	0.0000
3	1	10.66	12.880	56	11.0014
4	1	10.66	12.880	56	11.0014

only_weather1.head()

	season	temp	atemp	humidity
0	1	9.84	14.395	81
1	1	9.02	13.635	80
2	1	9.02	13.635	80
3	1	9.84	14.395	75
4	1	9.84	14.395	75

only_weather1.corr()

	season	temp	atemp	humidity	windspeed
season	1.000000	0.258689	0.264744	0.190610	-0.147121
temp	0.258689	1.000000	0.984948	-0.064949	-0.017852
atemp	0.264744	0.984948	1.000000	-0.043536	-0.057473
humidity	0.190610	-0.064949	-0.043536	1.000000	-0.318607
windspeed	-0.147121	-0.017852	-0.057473	-0.318607	1.000000

# Multicollinearity check via the condition index (CI): the ratio of the
# largest to the smallest singular value of the weather matrix.
# full_matrices=False: only the singular values are used, and the default
# would allocate a 10886 x 10886 left-singular matrix for nothing.
# NOTE(review): the SVD runs on the raw (unscaled, uncentered) columns, so
# the CI also reflects differences in column scale — confirm this is intended.
x, v, d = np.linalg.svd(only_weather1, full_matrices=False)
print(v)

CI = v.max() / v.min()  # largest singular value / smallest singular value

# Author's rule of thumb: CI >= 30 indicates multicollinearity, so PCA is used.
print("CI = ", CI)

[7567.22295894 1472.48842857  922.0582764   112.34592103  103.2939727 ]
CI =  73.25909499631051

# StandardScaler rescales every column to mean 0 and variance 1.
# Train and test are splits of the same data, so the scaler fitted on train
# must also be the one applied to test.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
scaler = StandardScaler()
scaler.fit(only_weather1)
df_scaled = scaler.transform(only_weather1)

# Standardize test with the TRAIN statistics. The original fitted a second
# scaler on test, which puts the two splits on different scales and
# contradicts the rule stated above.
scaler_test = scaler  # alias kept so later references to scaler_test still work
df_scaled_test = scaler_test.transform(only_weather_test)

# Fit the PCA on the standardized train data to find the principal axes
# (the directions of largest variance).
pca = PCA()
pca.fit(df_scaled)
x_pca = pca.transform(df_scaled)
# One row per principal component, one column per original feature.
loading = pd.DataFrame(pca.components_, columns = only_weather1.columns)

# Test must be projected onto the axes learned from train. The original
# fitted a separate PCA on test, which produces different axes, so "PC1..PC3"
# would not mean the same thing in train and test and a model trained on one
# could not be applied to the other.
pca_test = pca  # alias kept so later references to pca_test still work
x_pca_test = pca_test.transform(df_scaled_test)
# Identical values to `loading`; kept under its own name for later cells.
loading_test = pd.DataFrame(pca_test.components_, columns = only_weather_test.columns)

pca.explained_variance_ratio_

array([0.42347553, 0.2830441 , 0.15980429, 0.13083354, 0.00284254])

pca_test.explained_variance_ratio_

array([0.45379644, 0.25790681, 0.14917466, 0.13766332, 0.00145877])

# Scree plot: explained-variance ratio of each principal component, drawn
# both as a line and as bars.
variance_ratio = pca.explained_variance_ratio_
# Derive the x positions from the number of components instead of
# hard-coding range(1, 6), so the plot survives a change in feature count.
component_numbers = range(1, len(variance_ratio) + 1)

plt.plot(component_numbers, variance_ratio, alpha=0.5, linewidth=3, color='red')
plt.bar(component_numbers, variance_ratio, alpha=0.5)

plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')

plt.tight_layout()
plt.show()

png

#3개의 축만 사용하기로
loading1 = loading[0:3]
loading1

	season	temp	atemp	humidity	windspeed
0	-0.326754	-0.663497	-0.666371	-0.017713	0.092949
1	-0.353285	0.153685	0.124359	-0.673674	0.618279
2	0.808219	-0.150243	-0.171206	0.008608	0.542977

loading_test1 = loading_test[0:3]
loading_test1

	season	temp	atemp	humidity	windspeed
0	-0.418213	-0.636772	-0.639951	0.029681	0.095921
1	-0.239429	0.127004	0.098151	-0.687312	0.666717
2	0.115195	0.044911	0.020255	0.675724	0.726429

only_weather1.shape

(10886, 5)

# Transpose the retained loadings so features are rows and components are
# columns (5 x 3), which makes the matrix product with the data well-formed.
a = loading1.transpose()
# NOTE(review): b is the raw weather data, while the loadings were derived
# from standardized data — confirm whether df_scaled was intended here.
b = only_weather1

	0	1	2
season	-0.326754	-0.353285	0.808219
temp	-0.663497	0.153685	-0.150243
atemp	-0.666371	0.124359	-0.171206
humidity	-0.017713	-0.673674	0.008608
windspeed	0.092949	0.618279	0.542977

	season	temp	atemp	humidity	windspeed
0	1	9.84	14.395	81	0.0000
1	1	9.02	13.635	80	0.0000
2	1	9.02	13.635	80	0.0000
3	1	9.84	14.395	75	0.0000
4	1	9.84	14.395	75	0.0000
...	...	...	...	...	...
10881	4	15.58	19.695	50	26.0027
10882	4	14.76	17.425	57	15.0013
10883	4	13.94	15.910	61	15.0013
10884	4	13.94	17.425	61	6.0032
10885	4	13.12	16.665	66	8.9981

10886 rows × 5 columns

# Matrix product of the data (rows = observations) with the transposed
# loadings: expresses every observation in the new component axes.
c = np.matmul(np.asarray(b), np.asarray(a))
print(c.shape)
c

(10886, 3)





array([[-17.88276344, -51.6184782 ,  -2.43747214],
       [-16.81454064, -51.16533881,  -2.19276395],
       [-16.81454064, -51.16533881,  -2.19276395],
       ...,
       [-20.84428983, -29.11134886,   7.08501887],
       [-22.69020635, -34.48628044,   1.9398822 ],
       [-21.44989127, -36.22350229,   3.86239674]])

# Project test onto the axes learned from TRAIN. The original used
# loading_test1 (loadings of a PCA fitted on test), so PC1..PC3 in test were
# different directions from PC1..PC3 in train and the model's PC features
# were not comparable between the two splits.
a_test = loading1.T
# Raw test columns, matching how the train projection is built.
b_test = only_weather_test

c_test = np.dot(b_test,a_test)
print(c_test.shape)
c_test

(6493, 3)





array([[-10.32288503, -18.92312279,  57.55382825],
       [-14.26978336, -36.03676681,  38.71068597],
       [-14.26978336, -36.03676681,  38.71068597],
       ...,
       [-12.61262779, -31.52529752,  49.39002927],
       [-13.40667494, -30.03757883,  45.2471688 ],
       [-13.13954298, -36.22338933,  51.32868787]])

# Wrap the three projected train columns in a DataFrame named PC1..PC3.
pca_dataset = pd.DataFrame(c[:, :3], columns=['PC1', 'PC2', 'PC3'])
pca_dataset

	PC1	PC2	PC3
0	-17.882763	-51.618478	-2.437472
1	-16.814541	-51.165339	-2.192764
2	-16.814541	-51.165339	-2.192764
3	-17.776483	-47.576433	-2.489117
4	-17.776483	-47.576433	-2.489117
...	...	...	...
10881	-23.237219	-14.176257	12.069428
10882	-22.327055	-26.102227	6.668012
10883	-20.844290	-29.111349	7.085019
10884	-22.690206	-34.486280	1.939882
10885	-21.449891	-36.223502	3.862397

10886 rows × 3 columns

# Wrap the three projected test columns in a DataFrame named PC1..PC3.
pca_dataset_test = pd.DataFrame(c_test[:, :3], columns=['PC1', 'PC2', 'PC3'])
pca_dataset_test

	PC1	PC2	PC3
0	-10.322885	-18.923123	57.553828
1	-14.269783	-36.036767	38.710686
2	-14.269783	-36.036767	38.710686
3	-12.731353	-28.776048	46.687132
4	-12.731353	-28.776048	46.687132
...	...	...	...
6488	-12.612628	-31.525298	49.390029
6489	-12.612628	-31.525298	49.390029
6490	-12.612628	-31.525298	49.390029
6491	-13.406675	-30.037579	45.247169
6492	-13.139543	-36.223389	51.328688

6493 rows × 3 columns

# Repeat the condition-index check on the three projected components.
# full_matrices=False: only the singular values are used, and the default
# would allocate a 10886 x 10886 left-singular matrix for nothing.
x, v, d = np.linalg.svd(pca_dataset, full_matrices=False)
print(v)

CI = v.max() / v.min()

# CI is below the author's threshold of 30, so the three axes are taken to
# be free of multicollinearity.
print("CI = ", CI)

[4525.71707689 1468.52443054  566.65495536]
CI =  7.986724609245162

# Copy the three principal-component columns onto train and test.
for component in ('PC1', 'PC2', 'PC3'):
    train[component] = pca_dataset[component]
    test[component] = pca_dataset_test[component]

train.head()

	datetime	season	weather	temp	atemp	humidity	casual	registered	count	datetime-year	datetime-month	datetime-day	datetime-hour	datetime-dayofweek	datetime-dayofweek(humanized)	datetime-year(str)	datetime-month(str)	datetime-year_month	PC1	PC2	PC3
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16	2011	1	1	0	5	Saturday	2011	1	2011-1	-17.882763	-51.618478	-2.437472
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40	2011	1	1	1	5	Saturday	2011	1	2011-1	-16.814541	-51.165339	-2.192764
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32	2011	1	1	2	5	Saturday	2011	1	2011-1	-16.814541	-51.165339	-2.192764
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13	2011	1	1	3	5	Saturday	2011	1	2011-1	-17.776483	-47.576433	-2.489117
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1	2011	1	1	4	5	Saturday	2011	1	2011-1	-17.776483	-47.576433	-2.489117

Count

sns.distplot(train["count"])

<matplotlib.axes._subplots.AxesSubplot at 0x7f7b35df1668>

png

# log(count + 1) brings the target closer to a normal distribution.
train["log_count"] = np.log(1 + train["count"])

# Side-by-side distributions: raw count (left) and log-transformed (right).
figure, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
figure.set_size_inches(18, 4)

for column_name, axis in (("count", ax1), ("log_count", ax2)):
    sns.distplot(train[column_name], ax=axis)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7b35a912b0>

png

train["count(recover)"] = np.exp(train["log_count"]) - 1

Fitting the model

#weather과 관련된 축을 새로 축소된 축으로 바꿔줌
features = ["season", "holiday", "workingday", "PC1", "PC2", "PC3",
                 "datetime-year", "datetime-hour", "datetime-dayofweek"]
features

['season',
 'holiday',
 'workingday',
 'PC1',
 'PC2',
 'PC3',
 'datetime-year',
 'datetime-hour',
 'datetime-dayofweek']

label = "log_count"
label

'log_count'

# Feature matrices for both splits, and the training target.
x_train = train.loc[:, features]
x_test = test.loc[:, features]

y_train = train.loc[:, label]

Score method

import numpy as np

from sklearn.metrics import make_scorer

def rmse(predict, actual):
    """Return the root-mean-square error between two equally shaped inputs.

    Args:
        predict: Array-like of predicted values.
        actual: Array-like of reference values.

    Returns:
        Square root of the mean squared element-wise difference.
    """
    residual = np.array(predict) - np.array(actual)
    return np.sqrt(np.square(residual).mean())

rmse_score = make_scorer(rmse)
rmse_score

make_scorer(rmse)

Hyperparameter tuning

# Random forest regressor from scikit-learn's ensemble module.
from sklearn.ensemble import RandomForestRegressor

# cross_val_score from scikit-learn's model_selection module.
from sklearn.model_selection import cross_val_score

# Number of trees. More is usually better but slower, so a moderate value is
# used while searching; it can be raised (e.g. 3,000) once the other
# hyperparameters are tuned.
n_estimators = 300

# Number of random-search iterations.
num_epoch = 100

# One dict per iteration; turned into a DataFrame after the loop.
coarse_hyperparameters_list = []

for epoch in range(num_epoch):
    # Integer tree depth drawn from 2..99 (the upper bound is exclusive).
    max_depth = np.random.randint(low=2, high=100)

    # Fraction of the features considered per split, drawn from [0.1, 1.0).
    max_features = np.random.uniform(low=0.1, high=1.0)

    # n_jobs=-1 uses every available core; random_state fixes the forest's
    # own randomness so equal hyperparameters give equal scores.
    model2 = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        n_jobs=-1,
        random_state=37,
    )

    # 20-fold cross-validation scored with RMSE; the fold scores are averaged
    # into a single number for this hyperparameter draw.
    fold_scores = cross_val_score(model2, x_train, y_train, cv=20, scoring=rmse_score)
    score = fold_scores.mean()

    # Record this iteration's settings and score.
    hyperparameters = dict(
        epoch=epoch,
        score=score,
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
    )
    coarse_hyperparameters_list.append(hyperparameters)

    print(f"{epoch:2} n_estimators = {n_estimators}, max_depth = {max_depth:2}, max_features = {max_features:.6f}, Score = {score:.5f}")

# Convert the results to a DataFrame sorted by ascending score, so the
# lowest (best) RMSE comes first.
coarse_hyperparameters_list = pd.DataFrame.from_dict(coarse_hyperparameters_list)
coarse_hyperparameters_list = coarse_hyperparameters_list.sort_values(by="score")

# (rows, columns) of the result table.
print(coarse_hyperparameters_list.shape)

# The ten best draws.
coarse_hyperparameters_list.head(10)

n_estimators = 300, max_depth = 60, max_features = 0.894427, Score = 0.37536
n_estimators = 300, max_depth = 64, max_features = 0.824407, Score = 0.37384
n_estimators = 300, max_depth = 38, max_features = 0.670715, Score = 0.37299
n_estimators = 300, max_depth = 97, max_features = 0.541787, Score = 0.38654
n_estimators = 300, max_depth = 17, max_features = 0.943065, Score = 0.37592
n_estimators = 300, max_depth = 15, max_features = 0.883666, Score = 0.37421
n_estimators = 300, max_depth = 59, max_features = 0.972636, Score = 0.37536
n_estimators = 300, max_depth = 67, max_features = 0.587571, Score = 0.37658
n_estimators = 300, max_depth = 89, max_features = 0.382686, Score = 0.40708
n_estimators = 300, max_depth = 28, max_features = 0.904945, Score = 0.37514
n_estimators = 300, max_depth = 36, max_features = 0.269547, Score = 0.45493
n_estimators = 300, max_depth =  8, max_features = 0.393723, Score = 0.52784
n_estimators = 300, max_depth = 62, max_features = 0.181551, Score = 0.52206
n_estimators = 300, max_depth = 46, max_features = 0.135466, Score = 0.52206
n_estimators = 300, max_depth = 70, max_features = 0.693570, Score = 0.37299
n_estimators = 300, max_depth = 83, max_features = 0.281162, Score = 0.45492
n_estimators = 300, max_depth = 30, max_features = 0.629243, Score = 0.37645
n_estimators = 300, max_depth = 82, max_features = 0.741254, Score = 0.37299
n_estimators = 300, max_depth = 30, max_features = 0.170742, Score = 0.52246
n_estimators = 300, max_depth = 58, max_features = 0.833930, Score = 0.37384
n_estimators = 300, max_depth = 64, max_features = 0.157814, Score = 0.52206
n_estimators = 300, max_depth = 82, max_features = 0.622749, Score = 0.37658
n_estimators = 300, max_depth =  3, max_features = 0.104265, Score = 1.06773
n_estimators = 300, max_depth = 73, max_features = 0.307550, Score = 0.45492
n_estimators = 300, max_depth = 33, max_features = 0.984866, Score = 0.37542
n_estimators = 300, max_depth = 61, max_features = 0.213921, Score = 0.52206
n_estimators = 300, max_depth = 50, max_features = 0.322691, Score = 0.45492
n_estimators = 300, max_depth = 65, max_features = 0.222195, Score = 0.52206
n_estimators = 300, max_depth = 32, max_features = 0.288541, Score = 0.45490
n_estimators = 300, max_depth = 83, max_features = 0.635496, Score = 0.37658
n_estimators = 300, max_depth = 47, max_features = 0.333568, Score = 0.40708
n_estimators = 300, max_depth =  2, max_features = 0.369659, Score = 0.92161
n_estimators = 300, max_depth = 16, max_features = 0.887800, Score = 0.37411
n_estimators = 300, max_depth = 44, max_features = 0.778491, Score = 0.37384
n_estimators = 300, max_depth = 42, max_features = 0.835953, Score = 0.37384
n_estimators = 300, max_depth = 68, max_features = 0.502302, Score = 0.38654
n_estimators = 300, max_depth = 73, max_features = 0.486413, Score = 0.38654
n_estimators = 300, max_depth =  4, max_features = 0.555364, Score = 0.69367
n_estimators = 300, max_depth = 98, max_features = 0.834484, Score = 0.37384
n_estimators = 300, max_depth = 37, max_features = 0.575902, Score = 0.37657
n_estimators = 300, max_depth = 52, max_features = 0.583103, Score = 0.37658
n_estimators = 300, max_depth = 82, max_features = 0.628302, Score = 0.37658
n_estimators = 300, max_depth = 66, max_features = 0.880932, Score = 0.37384
n_estimators = 300, max_depth = 36, max_features = 0.355411, Score = 0.40711
n_estimators = 300, max_depth = 16, max_features = 0.583997, Score = 0.37762
n_estimators = 300, max_depth = 29, max_features = 0.862560, Score = 0.37360
n_estimators = 300, max_depth = 42, max_features = 0.334689, Score = 0.40708
n_estimators = 300, max_depth = 63, max_features = 0.494164, Score = 0.38654
n_estimators = 300, max_depth = 46, max_features = 0.188985, Score = 0.52206
n_estimators = 300, max_depth = 81, max_features = 0.558748, Score = 0.37658
n_estimators = 300, max_depth =  9, max_features = 0.949109, Score = 0.40934
n_estimators = 300, max_depth =  4, max_features = 0.154370, Score = 0.96037
n_estimators = 300, max_depth =  7, max_features = 0.989723, Score = 0.48421
n_estimators = 300, max_depth = 40, max_features = 0.370733, Score = 0.40708
n_estimators = 300, max_depth =  7, max_features = 0.671371, Score = 0.49037
n_estimators = 300, max_depth = 69, max_features = 0.688413, Score = 0.37299
n_estimators = 300, max_depth = 14, max_features = 0.221293, Score = 0.54334
n_estimators = 300, max_depth = 33, max_features = 0.670240, Score = 0.37290
n_estimators = 300, max_depth =  3, max_features = 0.644769, Score = 0.74844
n_estimators = 300, max_depth = 51, max_features = 0.120319, Score = 0.52206
n_estimators = 300, max_depth = 20, max_features = 0.852481, Score = 0.37286
n_estimators = 300, max_depth = 82, max_features = 0.855937, Score = 0.37384
n_estimators = 300, max_depth = 76, max_features = 0.463208, Score = 0.38654
n_estimators = 300, max_depth = 20, max_features = 0.774948, Score = 0.37371
n_estimators = 300, max_depth = 84, max_features = 0.491074, Score = 0.38654
n_estimators = 300, max_depth =  3, max_features = 0.139929, Score = 1.06773
n_estimators = 300, max_depth =  5, max_features = 0.537367, Score = 0.62741
n_estimators = 300, max_depth = 31, max_features = 0.867675, Score = 0.37368
n_estimators = 300, max_depth = 98, max_features = 0.392808, Score = 0.40708
n_estimators = 300, max_depth = 11, max_features = 0.437995, Score = 0.44879
n_estimators = 300, max_depth = 31, max_features = 0.443427, Score = 0.40705
n_estimators = 300, max_depth = 87, max_features = 0.838645, Score = 0.37384
n_estimators = 300, max_depth = 69, max_features = 0.119969, Score = 0.52206
n_estimators = 300, max_depth = 38, max_features = 0.355214, Score = 0.40707
n_estimators = 300, max_depth = 89, max_features = 0.434928, Score = 0.40708
n_estimators = 300, max_depth = 36, max_features = 0.885137, Score = 0.37381
n_estimators = 300, max_depth = 76, max_features = 0.254060, Score = 0.45492
n_estimators = 300, max_depth = 37, max_features = 0.486067, Score = 0.38653
n_estimators = 300, max_depth = 90, max_features = 0.748630, Score = 0.37299
n_estimators = 300, max_depth = 77, max_features = 0.805650, Score = 0.37384
n_estimators = 300, max_depth = 72, max_features = 0.554798, Score = 0.38654
n_estimators = 300, max_depth = 25, max_features = 0.778340, Score = 0.37411
n_estimators = 300, max_depth = 75, max_features = 0.800938, Score = 0.37384
n_estimators = 300, max_depth = 14, max_features = 0.936876, Score = 0.37611
n_estimators = 300, max_depth = 48, max_features = 0.138635, Score = 0.52206
n_estimators = 300, max_depth = 71, max_features = 0.169769, Score = 0.52206
n_estimators = 300, max_depth = 64, max_features = 0.312676, Score = 0.45492
n_estimators = 300, max_depth = 81, max_features = 0.541337, Score = 0.38654
n_estimators = 300, max_depth = 86, max_features = 0.423556, Score = 0.40708
n_estimators = 300, max_depth =  9, max_features = 0.476241, Score = 0.45649
n_estimators = 300, max_depth = 20, max_features = 0.937376, Score = 0.37603
n_estimators = 300, max_depth = 91, max_features = 0.937290, Score = 0.37536
n_estimators = 300, max_depth = 93, max_features = 0.534470, Score = 0.38654
n_estimators = 300, max_depth = 93, max_features = 0.520332, Score = 0.38654
n_estimators = 300, max_depth = 83, max_features = 0.221991, Score = 0.52206
n_estimators = 300, max_depth = 51, max_features = 0.408976, Score = 0.40708
n_estimators = 300, max_depth =  7, max_features = 0.553989, Score = 0.52635
n_estimators = 300, max_depth = 85, max_features = 0.793968, Score = 0.37384
n_estimators = 300, max_depth = 74, max_features = 0.134740, Score = 0.52206
n_estimators = 300, max_depth = 80, max_features = 0.383653, Score = 0.40708
(100, 5)

	epoch	score	n_estimators	max_depth	max_features
60	60	0.372858	300	20	0.852481
57	57	0.372899	300	33	0.670240
2	2	0.372986	300	38	0.670715
17	17	0.372989	300	82	0.741254
55	55	0.372989	300	69	0.688413
78	78	0.372989	300	90	0.748630
14	14	0.372989	300	70	0.693570
45	45	0.373601	300	29	0.862560
67	67	0.373676	300	31	0.867675
63	63	0.373705	300	20	0.774948

# Second, narrower random search around the best region of the coarse search.
n_estimators = 300

num_epoch = 100

# One dict per iteration; turned into a DataFrame after the loop.
finer_hyperparameters_list = []

for epoch in range(num_epoch):
    # Depth drawn from 20..89 (upper bound exclusive); feature fraction
    # drawn from [0.67, 0.87).
    max_depth = np.random.randint(low=20, high=90)
    max_features = np.random.uniform(low=0.67, high=0.87)

    model2 = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        n_jobs=-1,
        random_state=37,
    )

    # Mean RMSE over 20 cross-validation folds.
    fold_scores = cross_val_score(model2, x_train, y_train, cv=20, scoring=rmse_score)
    score = fold_scores.mean()

    # Record this iteration's settings and score.
    hyperparameters = dict(
        epoch=epoch,
        score=score,
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
    )
    finer_hyperparameters_list.append(hyperparameters)

    print(f"{epoch:2} n_estimators = {n_estimators}, max_depth = {max_depth:2}, max_features = {max_features:.6f}, Score = {score:.5f}")

# Results as a DataFrame sorted by ascending score (lowest RMSE first).
finer_hyperparameters_list = pd.DataFrame.from_dict(finer_hyperparameters_list)
finer_hyperparameters_list = finer_hyperparameters_list.sort_values(by="score")

print(finer_hyperparameters_list.shape)

finer_hyperparameters_list.head(10)

n_estimators = 300, max_depth = 31, max_features = 0.701086, Score = 0.37267
n_estimators = 300, max_depth = 23, max_features = 0.728162, Score = 0.37346
n_estimators = 300, max_depth = 36, max_features = 0.866035, Score = 0.37381
n_estimators = 300, max_depth = 46, max_features = 0.858805, Score = 0.37384
n_estimators = 300, max_depth = 34, max_features = 0.674593, Score = 0.37296
n_estimators = 300, max_depth = 51, max_features = 0.858732, Score = 0.37384
n_estimators = 300, max_depth = 59, max_features = 0.807806, Score = 0.37384
n_estimators = 300, max_depth = 79, max_features = 0.756331, Score = 0.37299
n_estimators = 300, max_depth = 22, max_features = 0.718962, Score = 0.37331
n_estimators = 300, max_depth = 47, max_features = 0.776317, Score = 0.37299
n_estimators = 300, max_depth = 84, max_features = 0.810409, Score = 0.37384
n_estimators = 300, max_depth = 55, max_features = 0.825679, Score = 0.37384
n_estimators = 300, max_depth = 70, max_features = 0.741438, Score = 0.37299
n_estimators = 300, max_depth = 53, max_features = 0.671523, Score = 0.37299
n_estimators = 300, max_depth = 41, max_features = 0.826184, Score = 0.37384
n_estimators = 300, max_depth = 57, max_features = 0.749898, Score = 0.37299
n_estimators = 300, max_depth = 61, max_features = 0.756896, Score = 0.37299
n_estimators = 300, max_depth = 40, max_features = 0.710294, Score = 0.37298
n_estimators = 300, max_depth = 52, max_features = 0.785932, Score = 0.37384
n_estimators = 300, max_depth = 89, max_features = 0.710544, Score = 0.37299
n_estimators = 300, max_depth = 25, max_features = 0.843766, Score = 0.37411
n_estimators = 300, max_depth = 37, max_features = 0.841488, Score = 0.37381
n_estimators = 300, max_depth = 68, max_features = 0.841873, Score = 0.37384
n_estimators = 300, max_depth = 64, max_features = 0.732711, Score = 0.37299
n_estimators = 300, max_depth = 50, max_features = 0.750902, Score = 0.37299
n_estimators = 300, max_depth = 22, max_features = 0.726276, Score = 0.37331
n_estimators = 300, max_depth = 68, max_features = 0.717181, Score = 0.37299
n_estimators = 300, max_depth = 25, max_features = 0.799544, Score = 0.37411
n_estimators = 300, max_depth = 73, max_features = 0.793114, Score = 0.37384
n_estimators = 300, max_depth = 47, max_features = 0.673632, Score = 0.37299
n_estimators = 300, max_depth = 29, max_features = 0.746753, Score = 0.37299
n_estimators = 300, max_depth = 43, max_features = 0.795795, Score = 0.37384
n_estimators = 300, max_depth = 35, max_features = 0.788314, Score = 0.37382
n_estimators = 300, max_depth = 29, max_features = 0.844361, Score = 0.37360
n_estimators = 300, max_depth = 27, max_features = 0.812489, Score = 0.37375
n_estimators = 300, max_depth = 66, max_features = 0.843089, Score = 0.37384
n_estimators = 300, max_depth = 79, max_features = 0.757582, Score = 0.37299
n_estimators = 300, max_depth = 33, max_features = 0.730597, Score = 0.37290
n_estimators = 300, max_depth = 72, max_features = 0.688434, Score = 0.37299
n_estimators = 300, max_depth = 60, max_features = 0.830167, Score = 0.37384
n_estimators = 300, max_depth = 89, max_features = 0.732375, Score = 0.37299
n_estimators = 300, max_depth = 84, max_features = 0.675079, Score = 0.37299
n_estimators = 300, max_depth = 69, max_features = 0.684302, Score = 0.37299
n_estimators = 300, max_depth = 43, max_features = 0.759886, Score = 0.37299
n_estimators = 300, max_depth = 69, max_features = 0.853411, Score = 0.37384
n_estimators = 300, max_depth = 22, max_features = 0.723072, Score = 0.37331
n_estimators = 300, max_depth = 88, max_features = 0.767050, Score = 0.37299
n_estimators = 300, max_depth = 33, max_features = 0.867405, Score = 0.37389
n_estimators = 300, max_depth = 32, max_features = 0.865525, Score = 0.37384
n_estimators = 300, max_depth = 23, max_features = 0.672447, Score = 0.37346
n_estimators = 300, max_depth = 20, max_features = 0.708345, Score = 0.37371
n_estimators = 300, max_depth = 67, max_features = 0.707316, Score = 0.37299
n_estimators = 300, max_depth = 44, max_features = 0.765370, Score = 0.37299
n_estimators = 300, max_depth = 43, max_features = 0.757173, Score = 0.37299
n_estimators = 300, max_depth = 75, max_features = 0.670581, Score = 0.37299
n_estimators = 300, max_depth = 44, max_features = 0.704542, Score = 0.37299
n_estimators = 300, max_depth = 76, max_features = 0.699946, Score = 0.37299
n_estimators = 300, max_depth = 73, max_features = 0.746892, Score = 0.37299
n_estimators = 300, max_depth = 65, max_features = 0.784508, Score = 0.37384
n_estimators = 300, max_depth = 22, max_features = 0.849242, Score = 0.37376
n_estimators = 300, max_depth = 31, max_features = 0.745068, Score = 0.37267
n_estimators = 300, max_depth = 77, max_features = 0.688783, Score = 0.37299
n_estimators = 300, max_depth = 42, max_features = 0.774949, Score = 0.37299
n_estimators = 300, max_depth = 67, max_features = 0.841762, Score = 0.37384
n_estimators = 300, max_depth = 84, max_features = 0.794596, Score = 0.37384
n_estimators = 300, max_depth = 48, max_features = 0.697236, Score = 0.37299
n_estimators = 300, max_depth = 56, max_features = 0.691465, Score = 0.37299
n_estimators = 300, max_depth = 59, max_features = 0.670501, Score = 0.37299
n_estimators = 300, max_depth = 89, max_features = 0.757681, Score = 0.37299
n_estimators = 300, max_depth = 68, max_features = 0.751281, Score = 0.37299
n_estimators = 300, max_depth = 61, max_features = 0.676839, Score = 0.37299
n_estimators = 300, max_depth = 77, max_features = 0.674109, Score = 0.37299
n_estimators = 300, max_depth = 24, max_features = 0.795705, Score = 0.37366
n_estimators = 300, max_depth = 47, max_features = 0.710028, Score = 0.37299
n_estimators = 300, max_depth = 76, max_features = 0.827661, Score = 0.37384
n_estimators = 300, max_depth = 71, max_features = 0.686260, Score = 0.37299
n_estimators = 300, max_depth = 23, max_features = 0.793720, Score = 0.37323
n_estimators = 300, max_depth = 61, max_features = 0.672520, Score = 0.37299
n_estimators = 300, max_depth = 21, max_features = 0.696986, Score = 0.37411
n_estimators = 300, max_depth = 48, max_features = 0.839572, Score = 0.37384
n_estimators = 300, max_depth = 48, max_features = 0.821993, Score = 0.37384
n_estimators = 300, max_depth = 88, max_features = 0.702965, Score = 0.37299
n_estimators = 300, max_depth = 43, max_features = 0.777918, Score = 0.37384
n_estimators = 300, max_depth = 37, max_features = 0.830345, Score = 0.37381
n_estimators = 300, max_depth = 45, max_features = 0.849349, Score = 0.37384
n_estimators = 300, max_depth = 81, max_features = 0.708333, Score = 0.37299
n_estimators = 300, max_depth = 67, max_features = 0.790888, Score = 0.37384
n_estimators = 300, max_depth = 41, max_features = 0.761437, Score = 0.37299
n_estimators = 300, max_depth = 86, max_features = 0.727960, Score = 0.37299
n_estimators = 300, max_depth = 65, max_features = 0.755233, Score = 0.37299
n_estimators = 300, max_depth = 89, max_features = 0.738501, Score = 0.37299
n_estimators = 300, max_depth = 26, max_features = 0.803400, Score = 0.37401
n_estimators = 300, max_depth = 32, max_features = 0.735997, Score = 0.37307
n_estimators = 300, max_depth = 70, max_features = 0.846867, Score = 0.37384
n_estimators = 300, max_depth = 44, max_features = 0.800791, Score = 0.37384
n_estimators = 300, max_depth = 69, max_features = 0.849007, Score = 0.37384
n_estimators = 300, max_depth = 86, max_features = 0.849849, Score = 0.37384
n_estimators = 300, max_depth = 67, max_features = 0.686199, Score = 0.37299
n_estimators = 300, max_depth = 53, max_features = 0.737380, Score = 0.37299
n_estimators = 300, max_depth = 69, max_features = 0.788243, Score = 0.37384
(100, 5)

	epoch	score	n_estimators	max_depth	max_features
0	0	0.372665	300	31	0.701086
60	60	0.372665	300	31	0.745068
37	37	0.372899	300	33	0.730597
4	4	0.372957	300	34	0.674593
17	17	0.372984	300	40	0.710294
87	87	0.372988	300	41	0.761437
69	69	0.372989	300	68	0.751281
68	68	0.372989	300	89	0.757681
67	67	0.372989	300	59	0.670501
66	66	0.372989	300	56	0.691465

# Pick the hyperparameter set with the lowest score (= best accuracy).
# The results table has already been sorted by score in ascending order,
# so that is simply its first row.
best_hyperparameters = finer_hyperparameters_list.iloc[0]

# Extract max_depth. A single row taken from a table that mixes integer and
# float columns comes back as a float Series, so the raw value would be
# e.g. 31.0. scikit-learn documents max_depth as an int (recent releases
# validate the type and reject floats), so cast it back explicitly.
best_max_depth = int(best_hyperparameters["max_depth"])

# Extract max_features (a fraction between 0 and 1, so float is correct here).
best_max_features = best_hyperparameters["max_features"]

# Report the chosen values.
print(f"max_depth(best) = {best_max_depth}, max_features(best) = {best_max_features:.6f}")

max_depth(best) = 31.0, max_features(best) = 0.701086

# Bring in RandomForestRegressor from scikit-learn's ensemble module.
from sklearn.ensemble import RandomForestRegressor

# Hyperparameter tuning is finished, so give the final model as many trees
# as is practical.
best_n_estimators = 3000

# NOTE: if the hyperparameter search takes too long to run, use the values
# below instead (uncomment the following two lines).
# best_max_depth = 83
# best_max_features = 0.851358

# Build the final RandomForestRegressor with these options:
#   n_estimators - number of trees to grow.
#   max_depth    - how deep each tree is allowed to branch.
#   max_features - feature sampling; a value between 0.0 and 1.0 means each
#                  tree is built using only that fraction of all features.
#   n_jobs       - parallelism; -1 uses every available CPU core.
#   random_state - fixes the forest's randomness so results are repeatable
#                  (same idea as a seed number); any integer works.
model2 = RandomForestRegressor(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    max_features=best_max_features,
    n_jobs=-1,
    random_state=37,
)
model2

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=31.0, max_features=0.7010856639268682,
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=3000,
                      n_jobs=-1, oob_score=False, random_state=37, verbose=0,
                      warm_start=False)

# Train the final random forest on the training features and targets.
model2.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=31.0, max_features=0.7010856639268682,
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=3000,
                      n_jobs=-1, oob_score=False, random_state=37, verbose=0,
                      warm_start=False)

# Predict for the test features. The result is kept under a "log_" name
# because it is exponentiated back in the following step
# (presumably the target was trained on a log(count + 1) scale - confirm
# against where y_train is built).
log_predictions = model2.predict(x_test)
# One prediction per test row is expected.
print(log_predictions.shape)
log_predictions 

(6493,)

array([2.47414611, 1.61229304, 1.3834309 , ..., 4.42168811, 4.19872127,
       3.63046707])

# Map the log-scale predictions back to plain counts: exp(x) - 1.
# np.expm1 computes exactly that in one step and keeps full precision for
# values of x close to zero, where exp(x) - 1 suffers from cancellation.
predictions = np.expm1(log_predictions)
print(predictions.shape)
predictions

(6493,)

array([10.87156579,  4.01429602,  2.98856254, ..., 82.23667949,
       65.6011116 , 36.73043521])

# Use the sample submission file as a template, replace its "count" column
# with our predictions, and save the result to Drive without the row index.
submission = pd.read_csv(
    "/content/drive/My Drive/Colab Notebooks/bike/sampleSubmission.csv"
).assign(count=predictions)

submission.to_csv("/content/drive/My Drive/finish.csv", index=False)

Twitter Facebook LinkedIn

YOON TAE YOUNG

Kaggle 자전거 수요 분석

PCA(주성분 분석)을 이용하여 분석해보았습니다

EDA / Preprocessing

Datetime

PCA 시작

차원 축소란 원래의 차원(x, y축)을 다른 축으로 바꿔 주는 것인데,

그 축은 전체 데이터를 가장 잘 표현할 수 있는(분산이 가장 큰) 변수(차원)임

차원 간에 다중공선성(독립되지 않고 서로 영향을 끼침)이 높다고 판단되는 차원들을 묶겠습니다

Count

Fitting the model

Score method

Hyperparameter tuning

Comments

You May Also Enjoy

lightsail을 이용한 django 프로젝트 배포

Github 블로그 UI 개선하기

케라스 CONVNET

케라스를 이용한 머신러닝-주택가격예측

YOON TAE YOUNG

kaggle의 bike sharing demand 데이터를

PCA(주성분 분석)을 이용하여 분석해보았습니다

EDA / Preprocessing

Datetime

PCA 시작

차원 축소란 원래의 차원(x, y축)을 다른 축으로 바꿔 주는 것인데,

그 축은 전체 데이터를 가장 잘 표현할 수 있는(분산이 가장 큰) 변수(차원)임

차원 간에 다중공선성(독립되지 않고 서로 영향을 끼침)이 높다고 판단되는 차원들을 묶겠습니다

Count

Fitting the model

Score method

Hyperparameter tuning

Comments

You May Also Enjoy

lightsail을 이용한 django 프로젝트 배포

Github 블로그 UI 개선하기

케라스 CONVNET

케라스를 이용한 머신러닝-주택가격예측