Question

I am using the Lending Club dataset to predict the probability of default. I am using the hyperopt library to tune the hyperparameters of an XGBClassifier, trying to maximize the ROC AUC score. I am also applying random oversampling inside the pipeline and performing the cross-validation on the whole pipeline. The problem is that the score I get from cross-validation with the parameters Hyperopt selects is very different from the score I get when I fit the model on the whole training data and compute the ROC AUC on the validation set.

The model seems to be over-fitting despite the cross-validation, and I don't know what to do.

Cross-validation score: 0.74

Validation score: 0.66

Find the code below:

# imports (make_pipeline must come from imblearn.pipeline so that
# RandomOverSampler can be used as a pipeline step)
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier

# creating lists of numerical, text and categorical features for the preprocessing step
numerical_features =(sorted(features.select_dtypes(include=['float64']).columns))
categorical_features = (sorted(features.select_dtypes(exclude=['float64']).columns))
text_features=['emp_title','title']
ordinal_features=['grade']
categorical_features.remove('emp_title')
categorical_features.remove('title')
categorical_features.remove('grade')
numerical_features.remove('int_rate')
#%%
numerical_features.remove('total_pymnt')
#label encoding label/target variable combining different classes
#le = preprocessing.LabelEncoder()
#eh=le.fit_transform(target)
#%%
#creating training and validation sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2,random_state=777)
#%%
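# preprocessing: impute + scale the numeric features, impute + one-hot encode
# the categorical features, and ordinal-encode the grade column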
preprocess = make_column_transformer(
    (make_pipeline(IterativeImputer(initial_strategy='median', add_indicator=True, verbose=2, max_iter=100),
                   StandardScaler()), numerical_features),
    (make_pipeline(SimpleImputer(strategy='constant', fill_value="Not Available", add_indicator=True),
                   OneHotEncoder(handle_unknown='ignore')), categorical_features),
    (OrdinalEncoder(), ordinal_features))
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin

#RandomOverSampler(sampling_strategy=sampling,random_state=777)
#%%
import numpy as np


unique, counts = np.unique(y_train, return_counts=True)

counts2=np.asarray((unique, counts)).T
#%%
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin
def objective(space):

    # preprocessing -> oversampling -> XGBoost; the oversampler is applied only
    # to the training folds during cross-validation
    classifier = make_pipeline(preprocess,
                               RandomOverSampler(random_state=777),
                               XGBClassifier(n_jobs=-1,
                                             verbosity=3,
                                             objective='binary:logistic',
                                             nthread=-1,
                                             scale_pos_weight=1,
                                             seed=27,
                                             tree_method='hist',
                                             n_estimators=space['n_estimators'],
                                             max_depth=int(space['max_depth']),
                                             learning_rate=space['learning_rate'],
                                             gamma=space['gamma'],
                                             min_child_weight=space['min_child_weight'],
                                             subsample=space['subsample'],
                                             colsample_bytree=space['colsample_bytree']))




    # applying k-fold cross-validation on the whole pipeline
    from sklearn.model_selection import cross_val_score

    # ROC AUC for each fold
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=3, scoring='roc_auc')
    CrossValMean = accuracies.mean()

    print("CrossValMean:", CrossValMean)

    return {'loss': 1 - CrossValMean, 'status': STATUS_OK}

# hyperopt search space: hp.choice samples from the listed values,
# hp.quniform samples quantized values from a uniform range
space = {
    'max_depth' : hp.choice('max_depth', range(5, 50, 1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators' : hp.choice('n_estimators', range(20, 500, 10)),
    'gamma' : hp.quniform('gamma', 0, 0.50, 0.01),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01)}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=300,
            trials=trials)

print("Best: ", best)
#%%
# training the model with the parameters obtained from hyperopt
grid_clf = make_pipeline(preprocess,
                         RandomOverSampler(random_state=777),
                         XGBClassifier(n_jobs=-1,
                                       verbosity=3,
                                       objective='binary:logistic',
                                       nthread=-1,
                                       scale_pos_weight=1,
                                       seed=27,
                                       tree_method='hist',
                                       n_estimators=370,
                                       max_depth=6,
                                       learning_rate=0.16,
                                       gamma=0.45,
                                       min_child_weight=7.0,
                                       subsample=0.52,
                                       colsample_bytree=0.76))


print(grid_clf)
#%%

clf=grid_clf.fit(X_train, y_train)
#, xgbclassifier__early_stopping_rounds=20, xgbclassifier__eval_set=[(X_test, y_test)],xgbclassifier__eval_metric='refit_score')
#%%
print(classification_report(y_test, grid_clf.predict(X_test) ))
#%%
print(confusion_matrix(y_test, grid_clf.predict(X_test) ))
#%%
from sklearn.metrics import roc_auc_score
y_predicted = grid_clf.predict(X_test)
r_adj_test = roc_auc_score(y_test, y_predicted)
print(r_adj_test )

Solution

Your test score is computed incorrectly. ROC AUC needs the probability scores from the model, not the hard class decisions. So replace

y_predicted = grid_clf.predict(X_test)

with

y_predicted = grid_clf.predict_proba(X_test)[:,1]
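
For completeness, here is a minimal sketch of the corrected evaluation (assuming grid_clf has already been fitted on X_train as above; the y_scores name is only illustrative):

from sklearn.metrics import roc_auc_score

# probability of the positive class (second column of predict_proba)
y_scores = grid_clf.predict_proba(X_test)[:, 1]
print("Validation ROC AUC:", roc_auc_score(y_test, y_scores))

# for comparison: AUC computed from hard 0/1 predictions evaluates only a
# single decision threshold, which is why it comes out lower
print("AUC from class labels:", roc_auc_score(y_test, grid_clf.predict(X_test)))

Note that the 'roc_auc' scorer used inside cross_val_score already works with probability scores, so the cross-validation score (0.74) and the score computed from hard predictions (0.66) were never directly comparable.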
Licensed under: CC-BY-SA with attribution