import pandas as pd
import numpy as np
import os
import operator
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.preprocessing import LabelBinarizer,StandardScaler,OrdinalEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from scipy.stats import boxcox
from sklearn.linear_model import LogisticRegression,RidgeClassifier, PassiveAggressiveClassifier
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from fancyimpute import KNN,SoftImpute
from xgboost import plot_importance
from matplotlib import pyplot
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import joblib
%matplotlib inline

def classify(est, x, y, X_test, y_test):
    # Fit the model on the training data
    est.fit(x, y)
    # Predict class probabilities and class labels for the test data
    y2 = est.predict_proba(X_test)
    y1 = est.predict(X_test)
    # Optional: uncomment to print evaluation metrics for the fitted model
    '''print("Accuracy: ", metrics.accuracy_score(y_test, y1))
    print("Area under the ROC curve: ", metrics.roc_auc_score(y_test, y2[:, 1]))
    # Calculate different metrics
    print("F-metric: ", metrics.f1_score(y_test, y1))
    print(" ")
    print("Classification report:")
    print(metrics.classification_report(y_test, y1))
    print(" ")
    print("Evaluation by cross-validation:")
    print(cross_val_score(est, x, y))'''
    return est, y1, y2[:, 1]
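# Illustrative usage sketch for classify(), not part of the original pipeline:
# synthetic data from make_classification stands in for the loan features, which
# are only prepared further below, and LogisticRegression is just an example estimator.
from sklearn.datasets import make_classification
X_demo, y_demo = make_classification(n_samples=500, n_features=8, random_state=0)
Xd_train, Xd_test, yd_train, yd_test = train_test_split(X_demo, y_demo, test_size=0.3, random_state=0)
demo_est, demo_pred, demo_score = classify(LogisticRegression(max_iter=1000), Xd_train, yd_train, Xd_test, yd_test)
print("Demo ROC AUC:", metrics.roc_auc_score(yd_test, demo_score))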
# Rank features by importance from a fitted tree-based model
def feat_importance(estimator):
    # df_LC is the loan feature DataFrame built later in the notebook; its column
    # order must match the features the estimator was trained on
    feature_importance = {}
    for index, name in enumerate(df_LC.columns):
        feature_importance[name] = estimator.feature_importances_[index]
    sorted_x = sorted(feature_importance.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_x
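# Illustrative usage sketch for feat_importance(), commented out because it depends on
# the df_LC feature DataFrame and the train/test split built later in the notebook:
# rf_model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
# feat_importance(rf_model)[:10]   # ten most important features as (name, importance) pairs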
# Fit several candidate classifiers, collect their metrics, and plot their ROC curves to compare models
def run_models(X_train, y_train, X_test, y_test, model_type='Non-balanced'):
    clfs = {'GradientBoosting': GradientBoostingClassifier(max_depth=6, n_estimators=100, max_features=0.3),
            'LogisticRegression': LogisticRegression(),
            #'GaussianNB': GaussianNB(),
            'RandomForestClassifier': RandomForestClassifier(n_estimators=10),
            'XGBClassifier': XGBClassifier()
            }
    cols = ['model_type', 'model', 'matthews_corrcoef', 'roc_auc_score', 'precision_score', 'recall_score', 'f1_score']
    rows = []
    conf_matrix = dict()
    for clf_name, clf in clfs.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        y_score = clf.predict_proba(X_test)[:, 1]
        print('computing {} - {} '.format(clf_name, model_type))
        # Collect one row of metrics per model
        rows.append({'model_type': model_type,
                     'model': clf_name,
                     'roc_auc_score': metrics.roc_auc_score(y_test, y_score),
                     'matthews_corrcoef': metrics.matthews_corrcoef(y_test, y_pred),
                     'precision_score': metrics.precision_score(y_test, y_pred),
                     'recall_score': metrics.recall_score(y_test, y_pred),
                     'f1_score': metrics.f1_score(y_test, y_pred)})
        conf_matrix[clf_name] = metrics.confusion_matrix(y_test, y_pred)
        # Add this model's ROC curve to the shared figure
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, drop_intermediate=False, pos_label=1)
        plt.figure(1, figsize=(6, 6))
        plt.xlabel('false positive rate')
        plt.ylabel('true positive rate')
        plt.title('ROC curve - {}'.format(model_type))
        plt.plot(fpr, tpr, label=clf_name)
        plt.legend(loc=2, prop={'size': 11})
    # Diagonal reference line for a random classifier
    plt.plot([0, 1], [0, 1], color='black')
    models_report = pd.DataFrame(rows, columns=cols)
    return models_report, conf_matrix, plt
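# Illustrative usage sketch for run_models(), commented out because X_train, X_test,
# y_train and y_test are only created after the preprocessing steps below:
# models_report, conf_matrix, roc_plot = run_models(X_train, y_train, X_test, y_test, model_type='Non-balanced')
# models_report.sort_values('roc_auc_score', ascending=False)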
#############Reading the dataset############################
data = pd.read_csv("https://s3.amazonaws.com/hackerday.datascience/358/LoansTrainingSetV2.csv", low_memory=True)
###############EDA Starts here####################################
data.head()
len(data)
## Drop duplicate rows on "Loan ID", keeping only the first occurrence
data.drop_duplicates(subset="Loan ID", keep='first', inplace=True)
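# Sanity check (added for illustration): confirm that no duplicate Loan IDs remain
# and how many rows survive de-duplication.
print("Remaining duplicate Loan IDs:", data["Loan ID"].duplicated().sum())
print("Rows after dropping duplicates:", len(data))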