# Pastebin export metadata (not Python code — commented out so the file parses):
# Untitled
# user_9363972
# python
# 2 years ago
# 7.5 kB
# 19
# Indexable
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.io as pio # Import plotly.io instead of plotly.offline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from keras import metrics
from plotly.offline import iplot
# Load the train/test CSVs; parse_dates makes 'date' a datetime64 column so
# the .dt accessors used below work.
train = pd.read_csv('train.csv', parse_dates=['date'])
test = pd.read_csv('test.csv', parse_dates=['date'])
# Forecast horizon in days: gap between the last test date and the last train date.
lag_size = (test['date'].max().date() - train['date'].max().date()).days
# Aggregated views of total sales: overall per day, per store per day, per item per day
# (used by the commented-out exploratory plots below).
dates = train.groupby('date', as_index=False)['sales'].sum()
stores = train.groupby(['store', 'date'], as_index=False)['sales'].sum()
items = train.groupby(['item', 'date'], as_index=False)['sales'].sum()
# Date
# plot_dates = go.Scatter(x=dates['date'], y=dates['sales'])
# layout = go.Layout(title='Daily sales', xaxis=dict(title='Date'), yaxis=dict(title='Sales'))
# fig = go.Figure(data=[plot_dates], layout=layout)
# Store
# plot_stores = []
# for store in stores['store'].unique():
# c_store = stores[(stores['store'] == store)]
# plot_stores.append(go.Scatter(x=c_store['date'], y=c_store['sales'], name=('Store ' + str(store))))
# layout = go.Layout(title='Store daily sales', xaxis=dict(title='Date'), yaxis=dict(title='Sales'))
# fig = go.Figure(data=plot_stores, layout=layout)
# Item
# plot_items = []
# for item in items['item'].unique():
# c_items = items[(items['item'] == item)]
# plot_items.append(go.Scatter(
# x=c_items['date'], y=c_items['sales'], name=('Item ' + str(item))))
# layout = go.Layout(title='Item daily sales', xaxis=dict(title='Date'), yaxis=dict(title='Sales'))
# fig = go.Figure(data=plot_items, layout=layout)
# pio.show(fig)
# Build a per-row day counter 0..D-1, repeated once per (store, item) series.
# The original grew the array with np.append inside a loop (quadratic) and
# carried a dead `i=+1` statement (it assigns +1 to the loop variable, which
# the `for` immediately overwrites); np.tile builds the vector in one shot.
def _make_day_indexes(df):
    """Return a float array: 0..n_dates-1 tiled over every store/item pair.

    Assumes *df* holds one row per (store, item, date) with no gaps, so the
    tiled counter lines up with the frame's row order.
    """
    n_dates = df.date.nunique()
    n_series = df.store.nunique() * df.item.nunique()
    # dtype=float matches the original (np.append onto [] promoted to float64).
    return np.tile(np.arange(n_dates, dtype=float), n_series)

indexes_train = _make_day_indexes(train)
indexes_test = _make_day_indexes(test)
train["indexes"] = indexes_train
test["indexes"] = indexes_test
# Stack train and test, then encode the yearly cycle of the day counter as
# sin/cos features so the model sees seasonality as a smooth periodic signal.
data = pd.concat([train, test], sort=False)

hour = 60 * 60          # seconds per hour
day = 24 * hour         # seconds per day
week = 7 * day          # seconds per week (defined for reference)
year = 365.2425 * day   # seconds per mean Gregorian year

df1 = data.copy()
df1["index_second"] = df1["indexes"] * day
# Phase angle in radians: one full 2*pi turn per year of elapsed seconds.
df1["YEAR_index_norm"] = 2 * np.pi * df1["index_second"] / year
df1["YEAR_cos_index"] = np.cos(df1["YEAR_index_norm"])
df1["YEAR_sin_index"] = np.sin(df1["YEAR_index_norm"])
# Keep only the sin/cos encodings; drop the intermediate columns.
df2 = df1.drop(columns=["indexes", "index_second", "YEAR_index_norm"])
def create_date_time_features(df):
    """Return a copy of *df* augmented with calendar features derived from 'date'.

    Adds dayofweek, quarter, month, year, dayofyear, dayofmonth, weekofyear
    (ISO), and a 'season' code: 0 = Dec/Jan/Feb, 1 = Mar/Apr/May,
    2 = Jun/Jul/Aug, 3 = Sep/Oct/Nov.
    """
    out = df.copy()
    dt = out.date.dt
    out['dayofweek'] = dt.dayofweek
    out['quarter'] = dt.quarter
    out['month'] = dt.month
    out['year'] = dt.year
    out['dayofyear'] = dt.dayofyear
    out['dayofmonth'] = dt.day
    out['weekofyear'] = dt.isocalendar().week.astype("int64")
    # Month -> season code; the month sets are disjoint, so these direct
    # assignments reproduce the original chain of np.where overrides.
    out["season"] = 1  # default: spring months (3, 4, 5)
    out.loc[out.month.isin([12, 1, 2]), "season"] = 0
    out.loc[out.month.isin([6, 7, 8]), "season"] = 2
    out.loc[out.month.isin([9, 10, 11]), "season"] = 3
    return out
def lag_features(df, lags):
    """Add 'sales_lag_<k>' columns: per-(store, item) sales shifted by k rows.

    Mutates *df* in place — note the trailing fillna(0) fills every NaN in the
    frame, not just the new lag columns — and returns it for chaining.
    """
    grouped = df.groupby(["store", "item"])['sales']
    for lag in lags:
        df[f'sales_lag_{lag}'] = grouped.shift(lag)
    df.fillna(0, inplace=True)
    return df
def roll_mean_features(df, windows):
    """Add 'sales_roll_mean_<w>' columns per (store, item): a triangular-weighted
    rolling mean over the previous *w* observations, shifted by one row so the
    current day never leaks into its own feature; at least 10 observations are
    required, otherwise the value is NaN (then filled with 0).

    Mutates *df* in place and returns it.
    """
    for window in windows:
        col = f'sales_roll_mean_{window}'
        df[col] = (
            df.groupby(["store", "item"])['sales']
              .transform(lambda s: s.shift(1)
                                    .rolling(window=window, min_periods=10, win_type="triang")
                                    .mean())
        )
    df.fillna(0, inplace=True)
    return df
def ewm_features(dataframe, alphas, lags):
    """Add exponentially weighted mean features per (store, item).

    For every (alpha, lag) pair, creates a column named
    'sales_ewm_alpha_<alpha-without-dot>_lag_<lag>' holding the EWM of sales
    shifted by *lag* rows. Mutates *dataframe* in place (NaNs -> 0) and
    returns it.
    """
    for alpha in alphas:
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:
            name = f'sales_ewm_alpha_{alpha_tag}_lag_{lag}'
            dataframe[name] = (
                dataframe.groupby(["store", "item"])['sales']
                         .transform(lambda s: s.shift(lag).ewm(alpha=alpha).mean())
            )
    dataframe.fillna(0, inplace=True)
    return dataframe
# Build the full feature frame on the concatenated train+test data: calendar
# features, then lag / rolling-mean / EWM features (the latter three mutate
# their argument in place).
df3 = create_date_time_features(df2)
df4= df3.copy()
# NOTE: lag_features mutates df4 in place; its return value is deliberately unused.
lag_features(df4, [91, 98, 105, 112, 119, 126, 182, 364, 546, 728])
df5 = df4.copy()
df5 = roll_mean_features(df5,[365,546])
df6= df5.copy()
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 105, 112, 180, 270, 365, 546, 728]
df6= ewm_features(df6, alphas, lags)
# One-hot encode the categorical / calendar columns.
df7 = pd.get_dummies(df6, columns=['store', 'item', 'dayofweek', "quarter", 'month', "year", "season"])
# Train on log1p(sales); predictions are mapped back with expm1 further down.
df7["sales"] = np.log1p(df7["sales"].values)
df_model = df7.copy()
df_model = df_model.sort_values("date").reset_index(drop=True)
# Time-based split: everything before 2017 trains, 2017 Q1 validates.
# NOTE: this rebinds `train` (and later code rebinds `test`) from the raw CSVs
# to model slices.
train= df_model.loc[(df_model["date"] < "2017-01-01"), :]
val= df_model.loc[(df_model["date"] >= "2017-01-01") & (df_model["date"] < "2017-04-01"), :]
# Feature columns: exclude identifiers and the target. 'year' was consumed by
# get_dummies above, so the raw name no longer exists; listing it is defensive.
cols = [col for col in train.columns if col not in ['date', 'id', "sales", "year"]]
X_train = train[cols]
Y_train = train['sales']
X_val = val[cols]
Y_val = val['sales']
# Feed-forward regressor: one 128-unit and three 256-unit ReLU layers,
# dropout for regularisation, and a single linear output (log1p-sales).
NN_model = Sequential([
    Dense(128, kernel_initializer='normal', input_dim=X_train.shape[1], activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dense(256, kernel_initializer='normal', activation='relu'),
    Dropout(0.2),
    Dense(1, kernel_initializer='normal', activation='linear'),
])
# MAE serves as both the training loss and the reported metric.
NN_model.compile(loss=tf.keras.losses.mae, optimizer="adam", metrics=['mae'])
NN_model.summary()
# Convert to float32 tensors explicitly (the dummied frame mixes bool/int/float
# columns, which Keras would otherwise have to coerce itself).
X_train_tensor = tf.convert_to_tensor(X_train.values, dtype=tf.float32)
Y_train_tensor = tf.convert_to_tensor(Y_train.values, dtype=tf.float32)
# NOTE(review): validation_split here carves a fraction out of the (date-sorted)
# training rows in addition to the explicit 2017-Q1 `val` slice — confirm this
# double validation is intended.
NN_model.fit(X_train_tensor, Y_train_tensor, epochs=2, batch_size=32, validation_split=0.2)
X_val_tensor = tf.convert_to_tensor(X_val.values, dtype=tf.float32)
# Predict on the 2017-Q1 holdout; despite the 'train_' names these are
# validation predictions.
train_predictions_tensor = NN_model.predict(X_val_tensor)
train_predictions = train_predictions_tensor.flatten()
# Undo the log1p target transform before comparing predictions to actuals.
train_results = pd.DataFrame(data={'Train Predictions': np.expm1(train_predictions), 'Actuals': np.expm1(Y_val)})
# Bare expression: displays the frame in a notebook, no effect in a script.
train_results
sns.lineplot(data=train_results.head(100))
plt.title("Sales vs Predicted Sales")
plt.show()
#
df_final_model = df7.copy()
# Rows dated 2018+ are exactly the rows that came from test.csv.
# NOTE: this rebinds `test` from the raw CSV frame to the feature slice.
test = df_final_model.loc[(df_final_model["date"] >= "2018-01-01"), :]
X_test = test[cols].astype(np.float32)
X_test_tensor = tf.convert_to_tensor(X_test)
test_predictions = NN_model.predict(X_test_tensor).flatten()
# NOTE(review): test_results is built but never used afterwards.
test_results = pd.DataFrame(
    data={'Test Predictions': np.expm1(test_predictions)})
new_test= pd.read_csv('test.csv', parse_dates = ['date'])
result_df = test.loc[:, ['id', 'sales']]
# Overwrite the placeholder sales values (filled to 0 by the earlier fillna
# calls) with the back-transformed predictions.
result_df['sales'] = np.expm1(test_predictions)
# NOTE(review): axis=1 concat aligns on the index; this assumes result_df kept
# the test rows' original 0..N-1 index through the initial pd.concat — verify.
result_df = pd.concat([ new_test, result_df], axis=1)
result_df = result_df.drop('id', axis=1)
result_df.to_csv('result_prediction.csv', index=False)
# Reload the raw training data and append the predictions so the forecast can
# be plotted as a continuation of the historical daily-sales series.
train= pd.read_csv('train.csv', parse_dates = ['date'])
plot_test = pd.concat([ train, result_df], axis=0).reset_index()
daily_result = plot_test.groupby('date', as_index=False)['sales'].sum()
sc = go.Scatter(x=daily_result['date'], y=daily_result['sales'])
layout = go.Layout(title='Daily sales', xaxis=dict(title='Date'), yaxis=dict(title='Sales'))
fig = go.Figure(data=[sc], layout=layout)
iplot(fig)
# Pastebin footer (not Python code — commented out so the file parses):
# Editor is loading...
# Leave a Comment