์ค๋์ RNN ๋ชจ๋ธ์ค ํ๋์ธ LSTM์ ํตํด ์ผ์ฑ์ ์ ์ฃผ๊ฐ๋ฅผ ์์ธกํด๋ณด๊ฒ ์ต๋๋ค.
๋ฐ์ดํฐ์ ์์ฑ
๋ฐ์ดํฐ์ ์ ๋ค์ด๋ก๋ ํ๊ณ ์ ํ๋ ๊ฒฝ์ฐ ๋งํฌ์ ์ ์ํ์ฌ ๋ฐ์ ์ ์์ต๋๋ค.
# data_path is assumed to point at the folder holding the CSV
df_price = pd.read_csv(os.path.join(data_path, '01-삼성전자-주가.csv'), encoding='utf8')
df_price.describe()
์ปฌ๋ผ์ ์ผ์, ์๊ฐ, ๊ณ ๊ฐ, ์ ๊ฐ, ์ข
๊ฐ, ๊ฑฐ๋๋์ผ๋ก ๊ตฌ์ฑ ๋์ด์์ผ๋ฉฐ ์ด 9,288๊ฐ์ record๋ฅผ ๊ฐ๊ณ ์์ต๋๋ค.
ํด๋น ๋ฐ์ดํฐ ์คํค๋ง๋ฅผ ๊ฐ๊ณ ๋ฏธ๋ ํน์ ์์ ์ โ์ข
๊ฐโ๋ฅผ ์์ธกํด๋ณด๊ฒ ์ต๋๋ค.
import os                        # file-path handling
import numpy as np               # linear algebra
import pandas as pd              # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # plotting
import seaborn as sns            # statistical visualization
๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ ๋ฐ ์๊ฐํ
๋ ์งํ ๋ณํ(-> datetime)
pd.to_datetime(df_price['์ผ์'], format='%Y%m%d')
# 0 2020-01-07
# 1 2020-01-06
# 2 2020-01-03
# 3 2020-01-02
# 4 2019-12-30
df_price['์ผ์'] = pd.to_datetime(df_price['์ผ์'], format='%Y%m%d')
df_price['์ฐ๋'] =df_price['์ผ์'].dt.year
df_price['์'] =df_price['์ผ์'].dt.month
df_price['์ผ'] =df_price['์ผ์'].dt.day
1990๋ ๋ ์ดํ ์ฃผ๊ฐ ์๊ฐํ
df = df_price.loc[df_price['์ฐ๋']>=1990]
plt.figure(figsize=(16, 9))
sns.lineplot(y=df['์ข
๊ฐ'], x=df['์ผ์'])
plt.xlabel('time')
plt.ylabel('price')
๋ฐ์ดํฐ ์ ๊ทํ
๋ฅ๋ฌ๋ ๋ชจ๋ธ ํ์ต์ ์ํํ ํ๊ธฐ ์ํด ๋ ๋ฆฝ๋ณ์์ ์ข ์๋ณ์๋ฅผ ์ ๊ทํํด์ค๋ค.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scale_cols = ['์๊ฐ', '๊ณ ๊ฐ', '์ ๊ฐ', '์ข
๊ฐ', '๊ฑฐ๋๋']
df_scaled = scaler.fit_transform(df[scale_cols])
df_scaled = pd.DataFrame(df_scaled)
df_scaled.columns = scale_cols
print(df_scaled)
๋ชจ๋ ์ปฌ๋ผ์ ์ค์ผ์ผ์ด 0~1๋ก ๋ณ๊ฒฝ๋์ด ์ถ๋ ฅ๋ ๋ชจ์ต
ํ์ต ๋ฐ์ดํฐ์ ์์ฑ
window_size๋ฅผ ์ ์ํ์ฌ ํ์ต ๋ฐ์ดํฐ๋ฅผ ์์ฑํฉ๋๋ค.
window_size๋ ๋ด๊ฐ ์ผ๋ง๋์(๊ธฐ๊ฐ)์ ์ฃผ๊ฐ ๋ฐ์ดํฐ๋ฅผ ๊ธฐ๋ฐ์ผ๋ก ๋ค์๋ ์ข
๊ฐ๋ฅผ ์์ธกํ ๊ฒ์ธ๊ฐ๋ฅผ ์ ํ๋ ํ๋ผ๋ฏธํฐ์
๋๋ค.
GCP AutoML์์์ historical data feed size์ ๋์ผํ ๊ฐ๋
์
๋๋ค.
ํด๋น ์์ ์์๋ ๊ณผ๊ฑฐ 20์ผ์ ๊ธฐ์ค์ผ๋ก ๊ทธ ๋ค์๋ ์ ๋ฐ์ดํฐ๋ฅผ ์์ธกํด๋ณด๊ฒ ์ต๋๋ค.
TEST_SIZE = 200   # hold out the most recent 200 rows for testing
WINDOW_SIZE = 20
train = df_scaled[:-TEST_SIZE]
test = df_scaled[-TEST_SIZE:]
dataset ๋ง๋ค์ด์ฃผ๋ ํจ์ ์์ฑ
def make_dataset(data, label, window_size=20):
    feature_list = []
    label_list = []
    for i in range(len(data) - window_size):
        feature_list.append(np.array(data.iloc[i:i+window_size]))
        label_list.append(np.array(label.iloc[i+window_size]))
    return np.array(feature_list), np.array(label_list)
์ ํจ์๋ ์ ํด์ง window_size์ ๊ธฐ๋ฐํด์ 20์ผ ๊ธฐ๊ฐ์ ๋ฐ์ดํฐ์
์ ๋ฌถ์ด์ฃผ๋ ํจ์์
๋๋ค.
์์ฐจ์ ์ผ๋ก 20์ผ ๋์์ ๋ฐ์ดํฐ์
์ ๋ฌถ๊ณ , ์ด์ ๋ง๋ label์ ๋งคํํ์ฌ return ํด์ค๋๋ค.
feature์ label ์ ์
feature_cols = ['์๊ฐ', '๊ณ ๊ฐ', '์ ๊ฐ', '๊ฑฐ๋๋']
label_cols = ['์ข
๊ฐ']
train_feature = train[feature_cols]
train_label = train[label_cols]
# train dataset
train_feature, train_label = make_dataset(train_feature, train_label, 20)
# create train and validation sets
from sklearn.model_selection import train_test_split
x_train, x_valid, y_train, y_valid = train_test_split(train_feature, train_label, test_size=0.2)
x_train.shape, x_valid.shape
# ((6086, 20, 4), (1522, 20, 4))
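Note that train_test_split shuffles samples by default. Since each sample is already a self-contained 20-day window this is a common choice, but if you'd rather keep the validation set strictly after the training set in time, you can disable shuffling (an alternative of mine, not what the original post does):
x_train, x_valid, y_train, y_valid = train_test_split(
    train_feature, train_label, test_size=0.2, shuffle=False)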
# test dataset (the data we'll actually predict on)
test_feature = test[feature_cols]
test_label = test[label_cols]
test_feature, test_label = make_dataset(test_feature, test_label, 20)
test_feature.shape, test_label.shape
# ((180, 20, 4), (180, 1))
LSTM ๋ชจ๋ธ ์์ฑ
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping, ModelCheckpoint
model = Sequential()
model.add(LSTM(16,
input_shape=(train_feature.shape[1], train_feature.shape[2]),
activation='relu',
return_sequences=False)
)
model.add(Dense(1))
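Before training, you can sanity-check the architecture (my addition; output omitted here):
model.summary()  # one LSTM layer with 16 units followed by a single-unit Dense output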
๋ชจ๋ธ ํ์ต
model.compile(loss='mean_squared_error', optimizer='adam')
early_stop = EarlyStopping(monitor='val_loss', patience=5)
# model_path is assumed to be an existing directory for saving checkpoints
filename = os.path.join(model_path, 'tmp_checkpoint.h5')
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
history = model.fit(x_train, y_train,
epochs=200,
batch_size=16,
validation_data=(x_valid, y_valid),
callbacks=[early_stop, checkpoint])
# ...
# ...
# Epoch 00015: val_loss did not improve from 0.00002
# Epoch 16/200
# 6086/6086 [==============================] - 12s 2ms/step - loss: 3.1661e-05 - val_loss: 4.1063e-05
# Epoch 00016: val_loss did not improve from 0.00002
# Epoch 17/200
# 6086/6086 [==============================] - 13s 2ms/step - loss: 2.4644e-05 - val_loss: 4.0085e-05
# Epoch 00017: val_loss did not improve from 0.00002
# Epoch 18/200
# 6086/6086 [==============================] - 13s 2ms/step - loss: 2.2936e-05 - val_loss: 2.4692e-05
# Epoch 00018: val_loss did not improve from 0.00002
Thanks to the EarlyStopping option, training was halted at epoch 18, where the validation loss had saturated around a mean squared error of 0.00002.
๋ค์์ผ๋ก ํ์ตํ ๋ชจ๋ธ๋ก ๋ฏธ๋ ์ฃผ๊ฐ ์์ธก์ ํด๋ณด๊ฒ ์ต๋๋ค.
# weight ๋ก๋ฉ
model.load_weights(filename)
# ์์ธก
pred = model.predict(test_feature)
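The predictions come out in the scaled 0-1 space. If you want them back in won, one way is to invert the min-max scaling by hand (a sketch of mine; scaler and scale_cols come from the normalization step above):
close_idx = scale_cols.index('종가')       # position of the close column in the scaler
close_min = scaler.data_min_[close_idx]
close_max = scaler.data_max_[close_idx]
pred_won = pred * (close_max - close_min) + close_min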
์ค์ ๋ฐ์ดํฐ์ ์์ธกํ ๋ฐ์ดํฐ ์๊ฐํ
plt.figure(figsize=(12, 9))
plt.plot(test_label, label='actual')
plt.plot(pred, label='prediction')
plt.legend()
plt.show()
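Beyond eyeballing the plot, you can put a number on the error (my addition; this is measured in the scaled space, so apply the inverse-scaling sketch above if you want it in won):
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(test_label, pred))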
Reference
Lee, T. (2020, February 14). 딥러닝(LSTM)을 활용하여 삼성전자 주가 예측을 해보았습니다 [Predicting Samsung Electronics' stock price with deep learning (LSTM)]. Retrieved August 27, 2020, from https://teddylee777.github.io/tensorflow/LSTM으로-예측해보는-삼성전자-주가