This notebook predicts social crowding behavior in Tippecanoe County. The provided data contains weekly visit counts for each POI (Place of Interest) from week 11 to week 43, and the goal is to predict each POI's visit count for week 44. The overall steps are: explore the data, test univariate baselines (average, linear regression, SVR, KNN, random forest) and classical time-series models (AR/ARIMA family, exponential smoothing), try LSTM-based models, and finally evaluate a univariate CNN.
The final model selected is a univariate CNN, which achieves an MAE of 9.2.
%%capture
%logstop
%logstart -t -r -q ipython_command_log.py global
#- IRONHACKS RESEARCH TRACKING CODE
#----------------------------------
# The following code is used to help our research team understand how you
# use our notebook environment. We do not collect any personal information with
# the following code, it is used to measure when and how often you work on
# your submission files.
import os
from datetime import datetime
import IPython.core.history as history
ha = history.HistoryAccessor()
ha_tail = ha.get_tail(1)
ha_cmd = next(ha_tail)
session_id = str(ha_cmd[0])
command_id = str(ha_cmd[1])
timestamp = datetime.utcnow().isoformat()
history_line = ','.join([session_id, command_id, timestamp]) + '\n'
logfile = open(os.environ['HOME']+'/ipython_session_log.csv', 'a')
logfile.write(history_line)
logfile.close()
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from google.cloud import bigquery
from google.oauth2 import service_account
from google.cloud.bigquery import magics
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] ='key.json'
bigquery_client = bigquery.Client(project='ironhacks-covid19-data')
Query and plot the number of data entries for each week.
QUERY = """
SELECT week_number, Count(*) as count
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
GROUP BY week_number
ORDER BY week_number
"""
query_job = bigquery_client.query(QUERY)
week_number_count = query_job.to_dataframe()
week_number_count.head(10)
## fill the missing weeks' data with 0
week_number_count_array = np.insert(week_number_count.to_numpy(), 7, np.array([[18, 0],[19,0]]), axis=0)
# draw a bar plot
week_number = week_number_count_array[:,0]
week_count = week_number_count_array[:,1]
plt.figure(figsize=(12,8))
plt.barh(week_number, week_count)
plt.yticks(week_number)
plt.ylabel("Week number")
plt.xlabel("Number of Data Entries")
for i, v in enumerate(week_count):
plt.text(v + 15, i + 10.7, str(v))
plt.show()
The query result shows that the week 18 and week 19 data are missing. Weeks 40, 41, 42, and 43 each have 1804 entries, the most of any week. Therefore, the week 40 data is used to explore the dataset further.
QUERY = """
SELECT Count(DISTINCT(poi_id)) as N_POI
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
WHERE week_number = 40
"""
query_job = bigquery_client.query(QUERY)
week_40_count = query_job.to_dataframe()
week_40_count
DISTINCT counts the number of unique POIs recorded in week 40; the result shows that all 1804 data entries come from different POIs.
QUERY = """
SELECT city, Count(1) as N_city
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
WHERE week_number = 40
GROUP BY city
ORDER BY N_city DESC
"""
query_job = bigquery_client.query(QUERY)
week_40_city_count = query_job.to_dataframe()
week_40_city_count
city_count = {}
for i in range(len(week_40_city_count)):
    city = week_40_city_count["city"][i]
    # merge the alternate spellings into the canonical city names
    if(city == "W Lafayette"):
        city = "West Lafayette"
    elif(city == "La Fayette"):
        city = "Lafayette"
    city_count[city] = city_count.get(city, 0) + week_40_city_count["N_city"][i]
city_label = list(city_count.keys())[::-1]
city_poi_count = list(city_count.values())[::-1]
plt.figure(figsize=(12,8))
plt.barh(city_label, city_poi_count)
plt.yticks(city_label)
plt.ylabel("City")
plt.xlabel("Number of POIs")
for i, v in enumerate(city_poi_count):
plt.text(v + 10, i, str(v))
plt.show()
Counting the POIs in each city, the bar plot shows that Lafayette has the most POIs (1255). West Lafayette ranks second with 475 POIs; the other cities fall far behind these two. The city can serve as a categorical feature.
QUERY = """
SELECT postal_code, Count(1) as N_postal
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
WHERE week_number = 40
GROUP BY postal_code
ORDER BY N_postal DESC
"""
query_job = bigquery_client.query(QUERY)
week_40_post_code_count = query_job.to_dataframe()
week_40_post_code_count
post_code_count = {}
for i in range(len(week_40_post_code_count)):
post_code_count[week_40_post_code_count["postal_code"][i]] = week_40_post_code_count["N_postal"][i]
post_code_label = list(map(str, list(post_code_count.keys())[::-1]))
post_code_value = list(post_code_count.values())[::-1]
plt.figure(figsize=(12,8))
plt.barh(post_code_label, post_code_value)
plt.ylabel("Postal Code")
plt.xlabel("Number of POIs")
for i, v in enumerate(post_code_value):
plt.text(v + 5, i - 0.15, str(v))
plt.show()
From the bar plot, postal code 47905 has the most POIs. 47905 is located in Lafayette, which aligns with the previous city result. The postal code can also serve as a categorical feature. Moreover, there are more postal codes than cities, and both describe a POI's location, so choosing the postal code as a feature gives more detailed location information than the city.
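If the postal code were used as a categorical feature in a downstream model, it could be one-hot encoded, for example with pandas get_dummies. A minimal sketch on made-up POI IDs (illustration only, not part of this pipeline):
import pandas as pd
toy = pd.DataFrame({
    "poi_id": ["poi_a", "poi_b", "poi_c"],   # hypothetical IDs for illustration
    "postal_code": [47905, 47906, 47905],
})
# each distinct postal code becomes its own 0/1 indicator column
print(pd.get_dummies(toy, columns=["postal_code"], prefix="zip"))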
QUERY = """
SELECT top_category, Count(1) as N_category
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
WHERE week_number = 40
GROUP BY top_category
ORDER BY N_category DESC
"""
query_job = bigquery_client.query(QUERY)
week_40_category_count = query_job.to_dataframe()
week_40_category_count.head(10)
category_count = {}
for i in range(len(week_40_category_count)):
if(week_40_category_count["top_category"][i] is None):
category_count["None"] = week_40_category_count["N_category"][i]
else:
category_count[week_40_category_count["top_category"][i]] = week_40_category_count["N_category"][i]
len(category_count)
category_label = list(category_count.keys())[:40][::-1]
category_value = list(category_count.values())[:40][::-1]
plt.figure(figsize=(12,8))
plt.barh(category_label, category_value)
plt.ylabel("Top Category")
plt.xlabel("Number of POIs")
for i, v in enumerate(category_value):
plt.text(v + 5, i - 0.5, str(v))
plt.show()
A summary of the top 40 of the 105 distinct categories among the listed POIs shows that "Restaurants and Other Eating Places" is the largest group: a total of 351 POIs fall under this category. The top category can serve as a categorical feature. Note that 9 POIs with a "None" category also appear in the top 40.
QUERY = """
SELECT week_count, COUNT(week_count) as number_of_POI
FROM (
SELECT poi_id, COUNT(week_number) as week_count
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
GROUP BY poi_id
ORDER BY week_count DESC
)
GROUP BY week_count
ORDER BY number_of_POI
"""
query_job = bigquery_client.query(QUERY)
poi_weeks = query_job.to_dataframe()
week_count_label = list(map(str, list(poi_weeks["week_count"])))
number_of_POI = list(poi_weeks["number_of_POI"])
plt.figure(figsize=(12,9))
plt.barh(week_count_label, number_of_POI)
plt.yticks(week_count_label)
plt.ylabel("The number of weeks of data included")
plt.xlabel("Number of POIs")
for i, v in enumerate(number_of_POI):
plt.text(v + 15, i - 0.2, str(v))
plt.show()
Considering that two weeks of data are missing, a POI with complete coverage has 31 weeks of data, and 1360 POIs have visit information for all of those weeks. Analyzing a sample of these POIs may reveal more about the numerical variables, which could serve as continuous features.
# select 10 of 1360 POIs with all weeks' data and further explore the numerical variables
QUERY = """
SELECT poi_id
FROM (
SELECT poi_id, COUNT(week_number) as week_count
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
GROUP BY poi_id
)
WHERE week_count = 31
LIMIT 10
"""
query_job = bigquery_client.query(QUERY)
ten_poi_with_all_weeks = query_job.to_dataframe()
ten_poi_with_all_weeks
# query the 10 selected POI data and store them as a list of dataframe with all the data
data_missing = []
for poi_id in ten_poi_with_all_weeks["poi_id"]:
QUERY="""
SELECT poi_id, week_number, visits_concentration, distance_from_home, median_dwell, raw_visit_counts
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
WHERE poi_id="%s"
"""%(poi_id)
query_job = bigquery_client.query(QUERY)
data_missing.append(query_job.to_dataframe())
# fill the missing weeks with 0
data_fill = data_missing.copy()
for i in range(len(data_fill)):
rows = pd.DataFrame([[data_fill[i]["poi_id"][0], 18, 0, 0, 0, 0], [data_fill[i]["poi_id"][0], 19, 0, 0, 0, 0]], columns=list(data_fill[i].columns))
data_fill[i] = data_fill[i].append(rows, ignore_index=True)
data_fill[i] = data_fill[i].sort_values(by=["week_number"])
data_fill[i] = data_fill[i].reset_index(drop=True)
data_fill[0]
def plot_visit_count(data):
plt.figure(figsize=(15,8))
for i in range(len(data)):
plt.plot(list(map(str, data[i]["week_number"])), data[i]["raw_visit_counts"], "o-", label=data[i]["poi_id"][0])
plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left', borderaxespad=0.)
plt.grid()
plt.show()
plot_visit_count(data_missing)
def plot_numerical_data(input_data):
for data in input_data:
x_time = list(map(str, data["week_number"]))
y1 = data["visits_concentration"]
y2 = data["distance_from_home"]
y3 = data["median_dwell"]
y = data["raw_visit_counts"]
plt.figure(figsize=(8,4))
plt.title(data["poi_id"][0])
plt.plot(x_time, y1 / 10, "*-", label="visits_concentration / 10")
plt.plot(x_time, y2 / 300, "^-", label="distance_from_home / 300")
plt.plot(x_time, y3, "+-", label="median_dwell")
plt.plot(x_time, y, "ro-", label='raw_visit_counts')
plt.xticks(x_time)
plt.grid()
plt.legend()
plt.show()
plot_numerical_data(data_missing)
QUERY = """
SELECT poi_id
FROM (
SELECT poi_id, COUNT(week_number) as week_count
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
GROUP BY poi_id
)
WHERE week_count <= 5
"""
query_job = bigquery_client.query(QUERY)
week_40_poi_less_5_weeks = query_job.to_dataframe()
# select all POIs data with no more than 5 weeks of data
data_less_5 = []
for poi_id in week_40_poi_less_5_weeks["poi_id"]:
QUERY="""
SELECT *
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
WHERE poi_id="%s"
"""%(poi_id)
query_job = bigquery_client.query(QUERY)
data_less_5.append(query_job.to_dataframe())
week_available = []
for i in range(len(data_less_5)):
week_available.append(list(data_less_5[i]["week_number"]))
plt.figure(figsize=(4,3))
plt.eventplot(week_available, linewidths=5)
plt.grid()
plt.show()
plot_numerical_data(data_less_5)
The event plot shows which weeks are present for the POIs with no more than 5 weeks of data. It indicates that most of those weeks are the last four weeks.
The first trial tests models on a single (univariate) variable, raw_visit_counts. The steps are: define MSE and MAE loss functions, start with the POIs that have no more than 5 weeks of data, compare a simple averaging baseline against linear regression, SVR, KNN, and random forest models, and then apply the best-performing model to all POIs.
def mse_loss(y_true, y_pred):
"""
MSE loss function
Inputs param
-------------------------
y_true: list
The ground truth values
y_pred: list
The predicted values
-------------------------
return: number
mean square error of the two input lists
"""
if(len(y_true) != len(y_pred)):
print("True label len:" + str(len(y_true)) + ", Predict label len: " + str(len(y_pred)))
raise Exception("Input lists have different length")
    mse = np.mean((np.array(y_true) - np.array(y_pred))**2)
return mse
def mae_loss(y_true, y_pred):
"""
MAE loss function
Inputs param
-------------------------
y_true: list
The ground truth values
y_pred: list
The predicted values
-------------------------
return: number
mean absolute error of the two input lists
"""
if(len(y_true) != len(y_pred)):
print("True label len:" + str(len(y_true)) + ", Predict label len: " + str(len(y_pred)))
raise Exception("Input lists have different length")
mae = np.mean(np.absolute(np.array(y_true) - np.array(y_pred)))
return mae
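As a quick sanity check, the two loss helpers can be verified on a few made-up values (toy numbers, not competition data):
y_true_demo = [3, 5, 10]
y_pred_demo = [2, 5, 14]
# absolute errors 1, 0, 4 -> MAE = 5/3 ≈ 1.67; squared errors 1, 0, 16 -> MSE = 17/3 ≈ 5.67
print(mae_loss(y_true_demo, y_pred_demo))
print(mse_loss(y_true_demo, y_pred_demo))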
Extract all weeks' raw_visit_counts data from the database.
week_visit_less_5 = {} # dict stores all visit count grouped by week, example: week_visit_less_5[40] retrieves week 40's raw_visit_count of all selected POIs
poi_ids_less_5 = [] # list stores all selected POIs' IDs
for data in data_less_5:
poi_ids_less_5.append(data["poi_id"][0])
if not week_visit_less_5:
for i in range(len(data["raw_visit_counts"])):
week_visit_less_5[data["week_number"][i]] = []
for i in range(len(data["raw_visit_counts"])):
week_visit_less_5[data["week_number"][i]].append(data["raw_visit_counts"][i])
print(poi_ids_less_5)
print(week_visit_less_5)
Calculate the average raw_visit_counts of the existing data and use it as the prediction.
average_predict_result_less_5 = []
for data in data_less_5:
average_predict_result_less_5.append(np.rint(data["raw_visit_counts"].sum() / len(data["raw_visit_counts"])))
sum_mae = 0
for i in range(4):
mae = mae_loss(week_visit_less_5[i + 40], np.array(average_predict_result_less_5))
sum_mae += mae
print("MAE Loss for week " + str(i + 40) + " is: " + str(mae))
print("Average MAE is: " + str(sum_mae / 4))
Fit a linear regression model and predict the result.
from sklearn.linear_model import LinearRegression
linear_predict_result_less_5 = []
for data in data_less_5:
X = np.array(data["week_number"]).reshape(-1, 1)
y = np.array(data["raw_visit_counts"]).reshape(-1, 1)
model = LinearRegression()
model.fit(X, y)
pred = model.predict(np.array([40, 41, 42, 43, 44]).reshape(-1, 1))
linear_predict_result_less_5.append(np.rint(pred.reshape(1, -1)[0]))
sum_mae = 0
for i in range(np.array(linear_predict_result_less_5).shape[1] - 1):
mae = mae_loss(week_visit_less_5[i + 40], np.array(linear_predict_result_less_5)[:, i])
sum_mae += mae
print("MAE Loss for week " + str(i + 40) + " is: " + str(mae))
print("Average MAE is: " + str(sum_mae / 4))
Fit an SVR model and predict the result.
from sklearn.svm import SVR
SVR_predict_result_less_5 = []
for data in data_less_5:
X = np.array(data["week_number"]).reshape(-1, 1)
y = np.array(data["raw_visit_counts"]).reshape(-1, 1)
model = SVR(kernel='linear')
model.fit(X, y)
pred = model.predict(np.array([40, 41, 42, 43, 44]).reshape(-1, 1))
SVR_predict_result_less_5.append(list(np.rint(pred)))
sum_mae = 0
for i in range(np.array(SVR_predict_result_less_5).shape[1] - 1):
mae = mae_loss(week_visit_less_5[i + 40], np.array(SVR_predict_result_less_5)[:, i])
sum_mae += mae
print("MAE Loss for week " + str(i + 40) + " is: " + str(mae))
print("Average MAE is: " + str(sum_mae / 4))
Fit a KNN regression model and predict the result. n_neighbors = 2
from sklearn.neighbors import KNeighborsRegressor
knn_predict_result_less_5 = []
for data in data_less_5:
X = np.array(data["week_number"]).reshape(-1, 1)
y = np.array(data["raw_visit_counts"]).reshape(-1, 1)
model = KNeighborsRegressor(n_neighbors = 2)
model.fit(X, y)
pred = model.predict(np.array([40, 41, 42, 43, 44]).reshape(-1, 1))
knn_predict_result_less_5.append(list((np.rint(pred).reshape(1, -1)[0])))
sum_mae = 0
for i in range(np.array(knn_predict_result_less_5).shape[1] - 1):
mae = mae_loss(week_visit_less_5[i + 40], np.array(knn_predict_result_less_5)[:, i])
sum_mae += mae
print("MAE Loss for week " + str(i + 40) + " is: " + str(mae))
print("Average MAE is: " + str(sum_mae / 4))
Fit a RandomForest regression model and predict the result.
from sklearn.ensemble import RandomForestRegressor
rf_predict_result_less_5 = []
for data in data_less_5:
X = np.array(data["week_number"]).reshape(-1, 1)
y = np.array(data["raw_visit_counts"]).reshape(-1, 1)
    model = RandomForestRegressor(max_features=None)
model.fit(X, y)
pred = model.predict(np.array([40, 41, 42, 43, 44]).reshape(-1, 1))
rf_predict_result_less_5.append(list((np.rint(pred).reshape(1, -1)[0])))
sum_mae = 0
for i in range(np.array(rf_predict_result_less_5).shape[1] - 1):
mae = mae_loss(week_visit_less_5[i + 40], np.array(rf_predict_result_less_5)[:, i])
sum_mae += mae
print("MAE Loss for week " + str(i + 40) + " is: " + str(mae))
print("Average MAE is: " + str(sum_mae / 4))
poi_less_5_week_44_pred = {"poi_id": poi_ids_less_5, "raw_visit_counts": np.array(knn_predict_result_less_5)[:, -1]}
poi_less_5_week_44_pred_df = pd.DataFrame.from_dict(poi_less_5_week_44_pred)
poi_less_5_week_44_pred_df
# select all POIs
QUERY = """
SELECT poi_id
FROM ironhacks-covid19-data.ironhacks_covid19_competition.prediction_list_poi
"""
query_job = bigquery_client.query(QUERY)
all_poi_id = query_job.to_dataframe()
data_all = []
for poi_id in all_poi_id["poi_id"]:
QUERY="""
SELECT poi_id, week_number, raw_visit_counts
FROM ironhacks-covid19-data.ironhacks_covid19_competition.weekly_patterns
WHERE poi_id="%s"
"""%(poi_id)
query_job = bigquery_client.query(QUERY)
data_all.append(query_job.to_dataframe())
week_all = {} # dict stores all visit count grouped by week
poi_ids_all = all_poi_id["poi_id"].tolist() # list stores all selected POIs' IDs
for data in data_all:
for i in range(len(data["raw_visit_counts"])):
week_all[data["week_number"][i]] = []
for data in data_all:
for i in range(len(data["raw_visit_counts"])):
week_all[data["week_number"][i]].append(data["raw_visit_counts"][i])
knn_predict_all = []
for data in data_all:
X = np.array(data["week_number"]).reshape(-1, 1)
y = np.array(data["raw_visit_counts"]).reshape(-1, 1)
model = KNeighborsRegressor(n_neighbors = 2)
model.fit(X, y)
pred = model.predict(np.array([40, 41, 42, 43, 44]).reshape(-1, 1))
knn_predict_all.append(list((np.rint(pred).reshape(1, -1)[0])))
sum_mae = 0
for i in range(4):
mae = mae_loss(week_all[i + 40], np.array(knn_predict_all)[:, i])
sum_mae += mae
print("MAE Loss for week " + str(i + 40) + " is: " + str(mae))
print("Average MAE is: " + str(sum_mae / 4))
knn_all_poi_univariate_predict = {"poi_id": poi_ids_all, "raw_visit_counts": np.array(knn_predict_all)[:, -1]}
knn_all_poi_univariate_predict_df = pd.DataFrame.from_dict(knn_all_poi_univariate_predict)
knn_all_poi_univariate_predict_df
Define the prediction function for the univariate time-series models. Specifically, the selected models are AR, MA, ARMA, ARIMA, SARIMA, SES (simple exponential smoothing), HWES (Holt-Winters exponential smoothing), and Holt's linear trend.
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.holtwinters import Holt
def univariate_model_predict(model_type, predict_week=44):
"""
univariate models prediction function.
param
-------------------
model_type : string
        Select the univariate model to be trained.
predict_week : number
Only 43 and 44. The week to be predicted. Default week is 44.
-------------------
return
predict : list
Return predicted result of all poi visit counts in a list.
model : object
Trained model.
"""
predict = []
for data in poi_visit_array:
if(predict_week == 43):
train_data = data[:-1]
elif(predict_week == 44):
train_data = data
if(model_type == "AR"):
model = AutoReg(train_data, lags=1)
elif(model_type == "MA"):
model = ARIMA(train_data, order=(0, 0, 1))
elif(model_type == "ARMA"):
model = ARIMA(train_data, order=(2, 0, 1))
elif(model_type == "ARIMA"):
model = ARIMA(train_data, order=(1, 1, 1))
elif(model_type == "SARIMA"):
model = SARIMAX(train_data, order=(1, 1, 1), seasonal_order=(0, 0, 0, 0))
elif(model_type == "SES"):
model = SimpleExpSmoothing(train_data)
elif(model_type == "HWES"):
model = ExponentialSmoothing(train_data)
elif(model_type == "HOLT"):
model = Holt(train_data)
if(model_type == "SARIMA"):
model_fit = model.fit(disp=False)
else:
model_fit = model.fit()
if(model_type == "ARIMA"):
pred = model_fit.predict(len(train_data), len(train_data), typ="levels")
else:
pred = model_fit.predict(len(train_data), len(train_data))
predict.append(np.rint(pred[0]))
return predict, model_fit
# AR model predict
ar_pred, ar_model = univariate_model_predict("AR", 43)
ar_mae = mae_loss(week_43_true, ar_pred)
print("AR predict MAE for week 43 is: " + str(ar_mae))
# MA model predict
ma_pred, ma_model = univariate_model_predict("MA", 43)
ma_mae = mae_loss(week_43_true, ma_pred)
print("MA predict MSE for week 43 is: " + str(ma_mae))
# ARMA model predict
arma_pred, arma_model = univariate_model_predict("ARMA", 43)
arma_mae = mae_loss(week_43_true, arma_pred)
print("ARMA predict MSE for week 43 is: " + str(arma_mae))
# ARIMA model predict
arima_pred, arima_model = univariate_model_predict("ARIMA", 43)
arima_mae = mae_loss(week_43_true, arima_pred)
print("ARIMA predict MSE for week 43 is: " + str(arima_mae))
# SARIMA model predict
sarima_pred, sarima_model = univariate_model_predict("SARIMA", 43)
sarima_mae = mae_loss(week_43_true, sarima_pred)
print("SARIMA predict MSE for week 43 is: " + str(sarima_mae))
# SES model predict
ses_pred, ses_model = univariate_model_predict("SES", 43)
ses_mae = mae_loss(week_43_true, ses_pred)
print("SES predict MSE for week 43 is: " + str(ses_mae))
# HWES model predict
hwes_pred, hwes_model = univariate_model_predict("HWES", 43)
hwes_mae = mae_loss(week_43_true, hwes_pred)
print("HWES predict MSE for week 43 is: " + str(hwes_mae))
# HOLT model predict
holt_pred, holt_model = univariate_model_predict("HOLT", 43)
holt_mae = mae_loss(week_43_true, holt_pred)
print("HOLT predict MSE for week 43 is: " + str(holt_mae))
# Select the best univariate model to predict week 44
best_pred, best_model = univariate_model_predict("SES")
week_44 = {'poi_id':poi_pred_id, 'raw_visit_counts': best_pred}
week_44_df = pd.DataFrame(week_44)
week_44_df
week_44_df.to_csv("prediction_list_poi.csv", index=False)
%%capture
!pip install pmdarima
from pmdarima import auto_arima
def auto_univariate_predict(predict_week=44):
prediction = []
for data in poi_visit_array:
if(predict_week == 43):
train_data = data[:-1]
elif(predict_week == 44):
train_data = data
result = auto_arima(train_data, start_p=1, start_q=1, max_p=10, max_d=7,max_q=10, max_P=4, max_D=4,max_Q=4)
pred = np.around(result.fit_predict(train_data, n_periods=1))
prediction.append(pred)
return prediction
auto_model_week_43_pred = auto_univariate_predict(predict_week=43)
print(mae_loss(week_43_true, auto_model_week_43_pred))
%%capture
!pip install keras
!pip install tensorflow
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
def lstm_univariate_model(n_feature):
model = Sequential()
# model.add(Dense(10, activation="relu"))
model.add(LSTM(4, input_shape=(1, n_feature)))
model.add(Dense(1))
model.compile(loss="mae", optimizer="adam", metrics=['accuracy'])
return model
# split into train and test sets
dataset = poi_visit_array[0][:-1]
train_size = int(len(dataset) * 0.7)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size], dataset[train_size:len(dataset)]
print(train, test)
# convert an array of values into a dataset matrix
def create_dataset(dataset, look_back=1):
dataX, dataY = [], []
for i in range(len(dataset)-look_back-1):
a = dataset[i:(i+look_back)]
dataX.append(a)
dataY.append(dataset[i + look_back])
return np.array(dataX), np.array(dataY)
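# Tiny illustration of create_dataset on a made-up sequence (not POI data):
# with look_back=2, each X row holds two consecutive values and y is the value
# that follows; note the loop stops one window short of the end of the series.
demo_seq = np.array([10, 11, 12, 13, 14, 15])
demo_X, demo_y = create_dataset(demo_seq, look_back=2)
print(demo_X)  # [[10 11] [11 12] [12 13]]
print(demo_y)  # [12 13 14]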
look_back=2
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
lstm_uni_model = lstm_univariate_model(look_back)
lstm_uni_model.summary()
lstm_uni_data = lstm_uni_model.fit(trainX, trainY, validation_data=(testX, testY), epochs=1000, batch_size=1, verbose=2)
np.rint(lstm_uni_model.predict(testX)[-1])
plt.figure()
plt.plot(lstm_uni_data.history['loss'])
plt.plot(lstm_uni_data.history['val_loss'])
plt.title('LSTM Loss vs Epoch')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'test'], loc='upper left')
POI_CONCEN = {k:[] for k in POI_ID}
POI_MED_DWELL = {k:[] for k in POI_ID}
# init with all 0
for k in POI_CONCEN:
for i in range(11,44):
POI_CONCEN[k].append({i:0})
POI_MED_DWELL[k].append({i:0})
for i in range(0,len(weekly_patterns_data)):
for week_concen, week_med_dwell in zip(POI_CONCEN[weekly_patterns_data["poi_id"][i]], POI_MED_DWELL[weekly_patterns_data["poi_id"][i]]):
if(weekly_patterns_data["week_number"][i] in week_concen):
week_concen[weekly_patterns_data["week_number"][i]] = weekly_patterns_data["visits_concentration"][i]
week_med_dwell[weekly_patterns_data["week_number"][i]] = weekly_patterns_data["median_dwell"][i]
poi_concen_df = pd.DataFrame.from_dict(POI_CONCEN)
poi_dwell_df = pd.DataFrame.from_dict(POI_MED_DWELL)
# unwrap the {week: value} dicts so each cell holds just the numeric value
for i in range(0, 33):
    for j in range(0, 1804):
        d1 = poi_concen_df.iloc[i,j]
        d2 = poi_dwell_df.iloc[i,j]
        poi_concen_df.iloc[i,j] = next(iter(d1.values()))
        poi_dwell_df.iloc[i,j] = next(iter(d2.values()))
print(poi_visit_df.shape)
print(poi_concen_df.shape)
print(poi_dwell_df.shape)
poi_concen_array = poi_concen_df.T.to_numpy()
poi_dwell_array = poi_dwell_df.T.to_numpy()
NUM_PLOTS = 1804
# fill the two missing weeks (18 and 19, i.e. indices 7 and 8) with a 5-week moving average
for l1, l2 in zip(poi_concen_array, poi_dwell_array):
    temp_sum1, temp_sum2 = 0, 0
    for i in range(2,7):
        temp_sum1 += l1[i]
        temp_sum2 += l2[i]
        if i == 6:
            # week 18 (index 7) = mean of weeks 13-17 (indices 2-6)
            l1[i+1] = temp_sum1 / 5
            l2[i+1] = temp_sum2 / 5
            # week 19 (index 8) = mean of weeks 14-18 (indices 3-7)
            temp_sum1 += l1[i+1] - l1[2]
            temp_sum2 += l2[i+1] - l2[2]
            l1[i+2] = temp_sum1 / 5
            l2[i+2] = temp_sum2 / 5
plt.figure(figsize=(15,5))
for i in range(0, NUM_PLOTS):
plt.plot(poi_concen_array[i])
plt.figure(figsize=(15,5))
for i in range(0, NUM_PLOTS):
plt.plot(poi_dwell_array[i])
# TODO: placeholder VAR example on synthetic data; replace with the POI features
from statsmodels.tsa.vector_ar.var_model import VAR
from random import random
data = []
for i in range(100):
v1 = i + random()
v2 = v1 + random()
v3 = v2 + random()
row = [v1, v2, v3]
data.append(row)
model = VAR(data)
model_fit = model.fit()
# make prediction
yhat = model_fit.forecast(model_fit.y, steps=10)
print(yhat)
# multivariate LSTM: inspect the feature and target arrays
print(poi_visit_array)
print(poi_concen_array)
print(poi_dwell_array)
# multivariate LSTM on a single POI
# convert each series to a [rows, 1] column; drop the last week (week 43) from each series
poi_idx = 0  # index of the POI used for this single-POI experiment
in_seq1 = poi_dwell_array[poi_idx][:-1].reshape((-1, 1))
in_seq2 = poi_concen_array[poi_idx][:-1].reshape((-1, 1))
out_seq = poi_visit_array[poi_idx][:-1].reshape((-1, 1))
in_seq1 = nor_seq(in_seq1)
in_seq2 = nor_seq(in_seq2)
# horizontally stack columns
dataset = np.hstack((in_seq1, in_seq2, out_seq))
# choose a number of time steps
n_steps = 2
# convert into input/output
X, y = split_sequences(dataset, n_steps)
print(X.shape, y.shape)
# summarize the data
for i in range(len(X)):
print(X[i], y[i])
# print(np.array(in_seq1).astype(None))
n_features = X.shape[2]
x_input = []
for n in range(n_steps):
x_input.append([in_seq1[n-n_steps], in_seq2[n-n_steps]])
x_input = np.array(x_input).reshape((1, n_steps, n_features)).astype(None)
print(x_input)
n_features = X.shape[2]
model = Sequential()
model.add(LSTM(5, activation='relu', input_shape=(n_steps, n_features)))
# model.add(Dense(8, activation='relu'))
# model.add(Dense(32, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mae')
model_data = model.fit(X, y, epochs=500, batch_size=8, verbose=0, shuffle=False)
yhat = model.predict(x_input)
print(np.rint(yhat).astype(int)[0][0])
plt.plot(model_data.history['loss'])
def multi_lstm(n_steps, n_features):
"""
Multivariate LSTM model
Input: param
------------------------
n_steps: number
Number of LSTM look back steps
n_features: number
Number of features used as X
------------------------
return: object
An LSTM model with input shape (n_steps, n_features)
"""
model = Sequential()
model.add(LSTM(5, activation='relu', input_shape=(n_steps, n_features)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')
return model
def nor_seq(seq):
"""
Normalization function, normalize a list to 0~1 scale
Input param
------------------------
seq: list
Input list to be normalized
------------------------
return: list
Normalized list
"""
return [i/max(seq) for i in seq]
def split_sequences(sequences, n_steps):
"""
split a multivariate sequence into samples
# https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/
Input param
------------------------
sequences: array
        Input array to be split, by looking back, into features X and labels y
n_steps: number
Number of look back steps to generate the model formula
------------------------
return
------------------------
X: numpy array
X feature array
y: numpy array
y labels
"""
X, y = list(), list()
for i in range(len(sequences)):
# find the end of this pattern
end_ix = i + n_steps
# check if we are beyond the dataset
if end_ix >= len(sequences):
break
seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix, -1]
X.append(seq_x)
y.append(seq_y)
return np.array(X).astype(None), np.array(y).astype(None)
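# Small illustration of split_sequences on made-up numbers (not POI data): the
# last column of the stacked array is the target, the other columns are the
# features, and each sample looks back n_steps rows with y taken at the row
# right after the window.
demo_f1 = np.array([0.1, 0.2, 0.3, 0.4, 0.5]).reshape(-1, 1)
demo_f2 = np.array([1.0, 2.0, 3.0, 4.0, 5.0]).reshape(-1, 1)
demo_target = np.array([10, 20, 30, 40, 50]).reshape(-1, 1)
demo_stack = np.hstack((demo_f1, demo_f2, demo_target))
demo_X, demo_y = split_sequences(demo_stack, n_steps=2)
print(demo_X.shape)  # (3, 2, 2): 3 samples, 2 time steps, 2 features
print(demo_y)        # [30. 40. 50.]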
def train_multi_lstm(n_steps = 2, epochs=500):
"""
Use data to train individual LSTM models for each POI and predict.
Input param
-------------------------
n_steps: number
Number of LSTM look back steps, default 2
epochs: number
Training epochs, default 500
-------------------------
return: list, list
        The LSTM-predicted visit counts of all POIs for week 43 and week 44.
"""
predict_43 = []
predict_44 = []
for i in range(len(poi_visit_array)):
in_seq1 = poi_dwell_array[i].reshape((len(poi_dwell_array[i]), 1))
in_seq2 = poi_concen_array[i].reshape((len(poi_concen_array[i]), 1))
out_seq = poi_visit_array[i].reshape((len(poi_visit_array[i]), 1))
# normalize input features
in_seq1 = nor_seq(in_seq1)
in_seq2 = nor_seq(in_seq2)
# horizontally stack columns
dataset = np.hstack((in_seq1, in_seq2, out_seq))
# convert into input/output
X, y = split_sequences(dataset, n_steps)
        # create the LSTM model and train it
        n_features = X.shape[2]
        model = multi_lstm(n_steps, n_features=n_features)
        model.fit(X, y, epochs=epochs, batch_size=16, verbose=0, shuffle=False)
        # build prediction inputs: the last n_steps feature rows ending at week 42
        # (to predict week 43) and ending at week 43 (to predict week 44)
        x_input_43 = []
        x_input_44 = []
        for n in range(n_steps):
            x_input_43.append([in_seq1[n-n_steps-1], in_seq2[n-n_steps-1]])
            x_input_44.append([in_seq1[n-n_steps], in_seq2[n-n_steps]])
        x_input_43 = np.array(x_input_43).reshape((1, n_steps, n_features)).astype(None)
        x_input_44 = np.array(x_input_44).reshape((1, n_steps, n_features)).astype(None)
y_pred_43 = np.rint(model.predict(x_input_43)).astype(int)[0][0]
y_pred_44 = np.rint(model.predict(x_input_44)).astype(int)[0][0]
predict_43.append(y_pred_43)
predict_44.append(y_pred_44)
print(str(round(100*i/1804, 2))+"% "+str(y_pred_43)+", "+str(y_pred_44))
return predict_43, predict_44
import os, logging, time, datetime
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
logging.getLogger("tensorflow").setLevel(logging.CRITICAL)
logging.getLogger("tensorflow_hub").setLevel(logging.CRITICAL)
start_time = time.time()
pred_43, pred_44 = train_multi_lstm()
print(datetime.timedelta(seconds=(time.time() - start_time)))
lstm_mae = mae_loss(week_43_true, pred_43)
print(lstm_mae)
week_44_lstm = {'poi_id':poi_pred_id, 'raw_visit_counts': pred_44}
week_44_lstm_df = pd.DataFrame(week_44_lstm)
week_44_lstm_df.to_csv("submission_prediction_output.csv", index=False)
week_44_lstm_df
from keras.layers import Conv1D, Flatten
def cnn_model(n_steps=1, n_features=1):
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(n_steps, n_features)))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mae')
return model
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
cnn_predit_all = []
n_steps = 3
n_features = 1
i = 0
for data in data_all:
X, y = create_dataset(data["raw_visit_counts"], look_back = n_steps)
X = X.reshape((X.shape[0], X.shape[1], 1))
model = cnn_model(n_steps = n_steps, n_features = n_features)
model.fit(X, y, epochs=1000, verbose = 0)
input_x = np.array(data["raw_visit_counts"][-3:]).reshape(1, n_steps, 1)
pred = np.rint(model.predict(input_x, verbose = 0))[0][0]
print(str(i + 1) + "/1804: " + str(pred))
i = i + 1
cnn_predit_all.append(pred)
print(mae_loss(week_all[43], cnn_predit_all))
for i in range(len(cnn_predit_all)):
if(cnn_predit_all[i] < 0):
cnn_predit_all[i] = 0
print(mae_loss(week_all[43], cnn_predit_all))
cnn_all_poi_univariate_predict = {"poi_id": poi_ids_all, "raw_visit_counts": np.array(cnn_predit_all)}
cnn_all_poi_univariate_predict_df = pd.DataFrame.from_dict(cnn_all_poi_univariate_predict)
cnn_all_poi_univariate_predict_df