Main Process (5 models)
- Logistic Regression
- Naive Bayes
- Suppot Vector Machine
- Random Forest Classification
- XGBoost Classification
- Comparison and Conclusion
# Main packages and functions
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.arima_model import ARIMAResults
from scipy.stats import boxcox
# Load timeseries dataset
timeseries = importfile(file_id = '1bx2sY8TGrJ7DVRoZ_yjXxp3tQgqn4Zix')
1. Logistic Regression
- GridSearchCV to find the best_estimator and best_parameter
- Print the classification_report
- Plot confusion_matrix
- Plot roc_auc_score
if __name__ == '__main__':
train_data=pd.read_csv(os.path.join("/content",'train.csv'), index_col =0)
test_data=pd.read_csv(os.path.join("/content",'test.csv'), index_col =0)
X_train, X_test, y_train, y_test = get_data(train_data,test_data)
# Set the parameters by cross-validation
tuned_parameters = [{'C': [0.001, 0.01,0.1]}]
grid = grid_search(tuned_parameters, X_train, y_train)
best_estimator = grid.best_estimator_
best_estimator.fit(X_train, y_train)
y_pred = best_estimator.predict(X_test)
report(y_test, y_pred)
plot_confusion_matrix(y_test, y_pred,normalize=False)
plot_roc_auc(X_test,y_test, y_pred)
Output:
2. Naive Bayes
- GridSearchCV to find the best_estimator and best_parameter
- Print the classification_report
- Plot confusion_matrix
- Plot roc_auc_score
if __name__ == '__main__':
train_data=pd.read_csv(os.path.join("/content",'train.csv'), index_col =0)
test_data=pd.read_csv(os.path.join("/content",'test.csv'), index_col =0)
X_train, X_test, y_train, y_test = get_data(train_data,test_data)
# Set the parameters by cross-validation
tuned_parameters = [{'alpha': [0.01, 0.1, 1]}]
grid_naiv = grid_search(tuned_parameters, X_train, y_train)
best_estimator = grid_naiv.best_estimator_
best_estimator.fit(X_train, y_train)
y_pred = best_estimator.predict(X_test)
report(y_test, y_pred)
plot_confusion_matrix(y_test, y_pred,normalize=False)
plot_roc_auc(X_test,y_test, y_pred)
Output:
3. SVM
- GridSearchCV to find the best_estimator and best_parameter
- Print the classification_report
- Plot confusion_matrix
- Plot roc_auc_score
if __name__ == '__main__':
train_data=pd.read_csv(os.path.join("/content",'train.csv'), index_col =0)
test_data=pd.read_csv(os.path.join("/content",'test.csv'), index_col =0)
X_train, X_test, y_train, y_test = get_data(train_data,test_data)
#Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],'C': [0.1, 1, 10]},
{'kernel': ['linear'], 'C': [0.1, 1, 10]}]
grid_svm = grid_search(tuned_parameters, X_train, y_train)
best_estimator = grid_svm.best_estimator_
best_estimator.fit(X_train, y_train)
y_pred = best_estimator.predict(X_test)
report(y_test, y_pred)
plot_confusion_matrix(y_test, y_pred,normalize=False)
plot_roc_auc(X_test,y_test, y_pred)
Output:
4. Random Forest Classification
- GridSearchCV to find the best_estimator and best_parameter
- Print the classification_report
- Plot confusion_matrix
- Plot roc_auc_score
if __name__ == '__main__':
train_data=pd.read_csv(os.path.join("/content",'train.csv'), index_col =0)
test_data=pd.read_csv(os.path.join("/content",'test.csv'), index_col =0)
X_train, X_test, y_train, y_test = get_data(train_data,test_data)
# Set the parameters by cross-validation
tuned_parameters = [{'n_estimators': [100, 200],
'max_depth': [10, 50, 100],
'min_samples_split': [2, 4],
'max_features': ['sqrt', 'log2']}]
grid_rf = grid_search(tuned_parameters, X_train, y_train)
best_estimator = grid_rf.best_estimator_
best_estimator.fit(X_train, y_train)
y_pred = best_estimator.predict(X_test)
report(y_test, y_pred)
imp = get_feature_importance(train_data)
plot_confusion_matrix(y_test, y_pred,normalize=False)
plot_roc_auc(X_test,y_test, y_pred)
Output:
5. XGBoost Classification
- GridSearchCV to find the best_estimator and best_parameter
- Print the classification_report
- Plot confusion_matrix
- Plot roc_auc_score
if __name__ == '__main__':
train_data=pd.read_csv('./train.csv', index_col =0)
test_data=pd.read_csv('./test.csv', index_col =0)
X_train, X_test, y_train, y_test = get_data(train_data,test_data)
#model before grid search
xgb = model(xgb.XGBClassifier(),X_train, y_train)
y_pred = xgb.predict(X_test)
report(y_test, y_pred)
#grid search
param = {'n_estimators': [100, 200, 300],
'max_depth': [3, 4, 5],
'min_child_weight': [1, 2, 3],
'gamma': [0.1, 0.2, 0.3]}
grid_search(grid_search(param,X_train,y_train))
#plot
plot_important_features(xgb)
plot_confusion_matrix(y_test, y_pred,normalize=False)
plot_roc_auc(X_test,y_test, y_pred,xgb)
Output:
6. Comparison and Conclusion
By looking at the roc_auc_score, it shows that Random Forest and XGBoost classification performs best.