import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import warnings

# Load the training set, the test set and the submission template.
train, test, submission = map(
    pd.read_csv, ('train.csv', 'test.csv', 'submission.csv')
)


# Inspect column dtypes and non-null counts of the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      1459 non-null   int64  
 1   hour                    1459 non-null   int64  
 2   hour_bef_temperature    1457 non-null   float64
 3   hour_bef_precipitation  1457 non-null   float64
 4   hour_bef_windspeed      1450 non-null   float64
 5   hour_bef_humidity       1457 non-null   float64
 6   hour_bef_visibility     1457 non-null   float64
 7   hour_bef_ozone          1383 non-null   float64
 8   hour_bef_pm10           1369 non-null   float64
 9   hour_bef_pm2.5          1342 non-null   float64
 10  count                   1459 non-null   float64
dtypes: float64(9), int64(2)
memory usage: 125.5 KB


# Show the training rows ordered by id (the result is displayed, not stored).
train.sort_values(by='id')


# Count the missing values in every column.
train.isna().sum()

id                          0
hour                        0
hour_bef_temperature        2
hour_bef_precipitation      2
hour_bef_windspeed          9
hour_bef_humidity           2
hour_bef_visibility         2
hour_bef_ozone             76
hour_bef_pm10              90
hour_bef_pm2.5            117
count                       0
dtype: int64


# Mean rental count per hour, with one column per precipitation flag value.
(
    train
    .groupby(['hour', 'hour_bef_precipitation'])['count']
    .mean()
    .unstack()
)


# Rental counts differ sharply depending on whether it rained, so the
# submission is reshaped to hold one predicted count per scenario
# (rain / no rain) instead of a single `count` column.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
submission['count_Rain'] = np.nan
submission['count_nonRain'] = np.nan
submission.drop(columns='count', inplace=True)
submission


# Handle the missing precipitation flags.
train[train['hour_bef_precipitation'].isna()]  # missing at hour 0 and hour 18

train['count'].groupby(train['hour']).mean()  # hour 0 mean: 71.766 / hour 18 mean: 262.163

# The two rows with a missing flag had 39 rentals (hour 0) and 1 rental
# (hour 18); they are judged to be rainy hours and given the flag 1 (rain).
# The filled column is assigned back: calling fillna(..., inplace=True) on a
# column selection is chained assignment, which is not guaranteed to update
# `train` (it warns in pandas 2.x and has no effect under Copy-on-Write).
train['hour_bef_precipitation'] = train['hour_bef_precipitation'].fillna(1)


# Split the training data by precipitation flag so each scenario gets its own
# model. Explicit copies are taken because both subsets are modified later
# (missing-value imputation); without .copy() pandas emits
# SettingWithCopyWarning on those later writes.
train_Rain = train[train['hour_bef_precipitation'] == 1].copy()
train_nonRain = train[train['hour_bef_precipitation'] == 0].copy()


train_Rain.info()
train_nonRain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 1443
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      48 non-null     int64  
 1   hour                    48 non-null     int64  
 2   hour_bef_temperature    46 non-null     float64
 3   hour_bef_precipitation  48 non-null     float64
 4   hour_bef_windspeed      46 non-null     float64
 5   hour_bef_humidity       46 non-null     float64
 6   hour_bef_visibility     46 non-null     float64
 7   hour_bef_ozone          43 non-null     float64
 8   hour_bef_pm10           41 non-null     float64
 9   hour_bef_pm2.5          40 non-null     float64
 10  count                   48 non-null     float64
dtypes: float64(9), int64(2)
memory usage: 4.5 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1411 entries, 1 to 1458
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      1411 non-null   int64  
 1   hour                    1411 non-null   int64  
 2   hour_bef_temperature    1411 non-null   float64
 3   hour_bef_precipitation  1411 non-null   float64
 4   hour_bef_windspeed      1404 non-null   float64
 5   hour_bef_humidity       1411 non-null   float64
 6   hour_bef_visibility     1411 non-null   float64
 7   hour_bef_ozone          1340 non-null   float64
 8   hour_bef_pm10           1328 non-null   float64
 9   hour_bef_pm2.5          1302 non-null   float64
 10  count                   1411 non-null   float64
dtypes: float64(9), int64(2)
memory usage: 132.3 KB


# Mean rental count per hour on rainy hours.
train_Rain.groupby('hour')['count'].mean().plot()

# Mark the hours that show unusually high rentals.
for peak_hour in (8, 9, 17, 21):
    plt.axvline(peak_hour, c='red')

# Some hours show high rentals, but they share no obvious common factor.
# Rentals peak at 8, 9, 17 and 21; since it is raining, attributing these to
# commuting looks abnormal.

<matplotlib.lines.Line2D at 0x7fddacf4f370>


# Mean rental count per hour on non-rainy hours.
train_nonRain.groupby('hour')['count'].mean().plot()

# Mark the two commute hours.
for commute_hour in (8, 18):
    plt.axvline(commute_hour, c='red')

# Label each marker just to the right of its line.
for position, label in (((8.5, 140), 'Go to Work'), ((18.5, 250), 'End Work')):
    plt.text(*position, label)

Text(18.5, 250, 'End Work')


# Correlation matrices for each subset.
train_Rain.corr()
train_nonRain.corr()


# Correlation heatmap for non-rainy hours.
plt.figure(figsize=(10, 10))  # figure size
sns.heatmap(train_nonRain.corr(), annot=True)  # annot=True prints the values

# Correlation heatmap for rainy hours.
# Fixed: this cell previously plotted train_nonRain a second time, so the
# rainy-hour correlations were never actually shown.
plt.figure(figsize=(10, 10))  # figure size
sns.heatmap(train_Rain.corr(), annot=True)  # annot=True prints the values

<AxesSubplot:>


# Check the missing values in each subset.
# hour_bef_temperature, hour_bef_windspeed and hour_bef_humidity are the
# features used for modelling.
train_Rain.isna().sum()  # rows 934 and 1035 are missing every selected feature
train_nonRain.isna().sum()  # only wind speed is missing (7 rows)

0


# Missing-value handling: look up the per-hour mean of each feature.
# The column is selected before aggregating, so only that column is averaged.
train_Rain.groupby('hour')['hour_bef_temperature'].mean()
train_Rain.groupby('hour')['hour_bef_humidity'].mean()
train_Rain.groupby('hour')['hour_bef_windspeed'].mean()

train_nonRain.groupby('hour')['hour_bef_windspeed'].mean()


# Replace missing feature values with the mean of the same hour of day.
# The per-hour means are computed from the data with groupby/transform rather
# than being typed in per row index, so the fill stays correct if the rows or
# the index change. assign() builds new frames, so nothing is written in place
# to a slice of `train` (which is what triggered SettingWithCopyWarning).
train_Rain = train_Rain.assign(**{
    column: train_Rain[column].fillna(
        train_Rain.groupby('hour')[column].transform('mean')
    )
    for column in (
        'hour_bef_temperature',
        'hour_bef_humidity',
        'hour_bef_windspeed',
    )
})

train_nonRain = train_nonRain.assign(
    hour_bef_windspeed=train_nonRain['hour_bef_windspeed'].fillna(
        train_nonRain.groupby('hour')['hour_bef_windspeed'].transform('mean')
    )
)

/Users/seokholee/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py:6392: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


# Handle missing values in the test data.
test.isnull().sum()  # missing values found in row 653

test[test['hour_bef_temperature'].isna()]  # row 653, hour 19
test[test['hour_bef_windspeed'].isna()]  # row 653
test[test['hour_bef_humidity'].isna()]  # row 653

# Per-hour means of each feature; the trailing values are the ones the
# original notebook recorded for the missing row's hour.
test.groupby('hour')['hour_bef_temperature'].mean()  # 26.110345
test.groupby('hour')['hour_bef_windspeed'].mean()  # 3.541379
test.groupby('hour')['hour_bef_humidity'].mean()  # 47.689655


# Fill each missing value with the mean of the same hour of day.
# The means are computed with groupby/transform rather than hard-coded for
# index 653, and the result is assigned back instead of calling
# fillna(..., inplace=True) on a column selection (chained assignment, which
# is not guaranteed to modify `test`).
test = test.assign(**{
    column: test[column].fillna(
        test.groupby('hour')[column].transform('mean')
    )
    for column in (
        'hour_bef_temperature',
        'hour_bef_windspeed',
        'hour_bef_humidity',
    )
})


# Prediction for the rainy scenario.
use_columns = ['hour','hour_bef_temperature', 'hour_bef_windspeed', 'hour_bef_humidity']
X_train, Y_train = train_Rain[use_columns], train_Rain['count']
X_test = test[use_columns]

# RandomForestRegressor is already imported at the top of the file.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit on the rainy-hour training rows, then predict for every test row
# (fit() returns the fitted estimator, so the calls can be chained).
ypred1 = model.fit(X_train, Y_train).predict(X_test)


submission['count_Rain'] = ypred1
submission


# Prediction for the non-rainy scenario.
use_columns = ['hour','hour_bef_temperature', 'hour_bef_windspeed', 'hour_bef_humidity']
X_train, Y_train = train_nonRain[use_columns], train_nonRain['count']
X_test = test[use_columns]

# RandomForestRegressor is already imported at the top of the file.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Fit on the non-rainy training rows, then predict for every test row
# (fit() returns the fitted estimator, so the calls can be chained).
ypred1 = model.fit(X_train, Y_train).predict(X_test)


submission['count_nonRain'] = ypred1
submission

	id	hour	hour_bef_temperature	hour_bef_precipitation	hour_bef_windspeed	hour_bef_humidity	hour_bef_visibility	hour_bef_ozone	hour_bef_pm10	hour_bef_pm2.5	count
0	3	20	16.3	1.0	1.5	89.0	576.0	0.027	76.0	33.0	49.0
1	6	13	20.1	0.0	1.4	48.0	916.0	0.042	73.0	40.0	159.0
2	7	6	13.9	0.0	0.7	79.0	1382.0	0.033	32.0	19.0	26.0
3	8	23	8.1	0.0	2.7	54.0	946.0	0.040	75.0	64.0	57.0
4	9	18	29.5	0.0	4.8	7.0	2000.0	0.057	27.0	11.0	431.0
...	...	...	...	...	...	...	...	...	...	...	...
1454	2174	4	16.8	0.0	1.6	53.0	2000.0	0.031	37.0	27.0	21.0
1455	2175	3	10.8	0.0	3.8	45.0	2000.0	0.039	34.0	19.0	20.0
1456	2176	5	18.3	0.0	1.9	54.0	2000.0	0.009	30.0	21.0	22.0
1457	2178	21	20.7	0.0	3.7	37.0	1395.0	0.082	71.0	36.0	216.0
1458	2179	17	21.1	0.0	3.1	47.0	1973.0	0.046	38.0	17.0	170.0

hour_bef_precipitation	0.0	1.0
hour
0	73.275862	17.000000
1	48.844828	23.666667
2	32.431034	11.666667
3	21.813559	8.500000
4	13.716667	2.000000
5	13.283333	3.000000
6	24.916667	3.000000
7	62.783333	37.000000
8	137.830508	103.000000
9	93.250000	111.000000
10	79.616667	30.000000
11	88.327869	NaN
12	113.733333	2.000000
13	122.000000	5.000000
14	143.491228	7.750000
15	162.877193	11.750000
16	175.649123	44.666667
17	199.925926	72.000000
18	270.779661	15.000000
19	207.762712	20.000000
20	171.879310	29.333333
21	170.220339	86.000000
22	150.966102	68.000000
23	105.566667	1.000000

	id	hour	hour_bef_temperature	hour_bef_precipitation	hour_bef_windspeed	hour_bef_humidity	hour_bef_visibility	hour_bef_ozone	hour_bef_pm10	hour_bef_pm2.5	count
id	1.000000	-0.008835	-0.005731	NaN	-0.002075	-0.007240	-0.003851	0.052797	-0.019598	0.007852	-0.011242
hour	-0.008835	1.000000	0.416285	NaN	0.474299	-0.356118	0.184096	0.393164	-0.032076	-0.057489	0.648941
hour_bef_temperature	-0.005731	0.416285	1.000000	NaN	0.394710	-0.495191	0.183872	0.546061	-0.012448	-0.078857	0.618140
hour_bef_precipitation	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
hour_bef_windspeed	-0.002075	0.474299	0.394710	NaN	1.000000	-0.468876	0.266186	0.521368	0.017473	-0.195151	0.485488
hour_bef_humidity	-0.007240	-0.356118	-0.495191	NaN	-0.468876	1.000000	-0.563993	-0.425419	-0.091953	0.182254	-0.456960
hour_bef_visibility	-0.003851	0.184096	0.183872	NaN	0.266186	-0.563993	1.000000	0.084029	-0.435255	-0.672749	0.279097
hour_bef_ozone	0.052797	0.393164	0.546061	NaN	0.521368	-0.425419	0.084029	1.000000	0.113076	0.026256	0.481460
hour_bef_pm10	-0.019598	-0.032076	-0.012448	NaN	0.017473	-0.091953	-0.435255	0.113076	1.000000	0.488428	-0.124484
hour_bef_pm2.5	0.007852	-0.057489	-0.078857	NaN	-0.195151	0.182254	-0.672749	0.026256	0.488428	1.000000	-0.134993
count	-0.011242	0.648941	0.618140	NaN	0.485488	-0.456960	0.279097	0.481460	-0.124484	-0.134993	1.000000

	id	count_Rain	count_nonRain
0	0	9.346667	NaN
1	1	43.351667	NaN
2	2	82.604000	NaN
3	4	17.393667	NaN
4	5	94.530000	NaN
...	...	...	...
710	2148	23.557000	NaN
711	2149	17.855000	NaN
712	2165	12.053667	NaN
713	2166	18.128667	NaN
714	2177	13.506667	NaN

	id	count_Rain	count_nonRain
0	0	9.346667	81.62
1	1	43.351667	239.04
2	2	82.604000	91.30
3	4	17.393667	29.59
4	5	94.530000	102.97
...	...	...	...
710	2148	23.557000	64.07
711	2149	17.855000	66.55
712	2165	12.053667	119.63
713	2166	18.128667	129.07
714	2177	13.506667	215.22

분석하고싶은코코

분석하고싶은코코

데이터 분석(2) - 따릉이 수요량 예측 본문

데이터 분석(2) - 따릉이 수요량 예측

따릉이 예측 - 데이터 분석 연습(2)

우천 여부에 따른 수요 변화량에 큰 의미가 있어 두 가지 경우에 따른 수요량을 예측하기로 하였습니다.

'데이터분석' 카테고리의 다른 글

티스토리툴바

이디야는 스타벅스 근처에 입점한다? (0)	2022.11.25
제주도 교통량 예측(1)_EDA (0)	2022.10.11
데이터분석(4) - 타이타닉 생존자 구하기 (0)	2022.06.27
데이터분석(3) - 랜덤 포레스트 (0)	2022.06.07
데이터 분석 체험하기 - 영화 관객수 예측 (0)	2022.06.03

« 2026/03 »
일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30	31