Magic Mushroom 🍄
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt
plt.style.use('seaborn-v0_8')  # the plain 'seaborn' style alias was removed in recent matplotlib releases; use 'seaborn' on older versions
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import shap
df = pd.read_csv('/kaggle/input/mushroom-classification/mushrooms.csv')
df[:3]
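Before encoding, a quick sanity check of the class balance and missing values in the raw data can be useful (an optional step, using the df just loaded):
# Optional sanity checks on the raw data
print(df['class'].value_counts())            # 'e' = edible, 'p' = poisonous
print('missing values:', df.isnull().sum().sum())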
df = df.apply(LabelEncoder().fit_transform)  # encode each categorical column to integer codes, column by column
df[:3]
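One caveat of df.apply(LabelEncoder().fit_transform) is that the fitted encoders are thrown away, so the integer codes cannot be mapped back to the original category strings. A minimal alternative sketch that keeps one encoder per column (it re-reads the CSV because df has already been overwritten above; encoders and df_alt are illustrative names, not part of the original notebook):
# Keep one fitted LabelEncoder per column so codes can be decoded later
raw = pd.read_csv('/kaggle/input/mushroom-classification/mushrooms.csv')
encoders = {col: LabelEncoder().fit(raw[col]) for col in raw.columns}
df_alt = pd.DataFrame({col: enc.transform(raw[col]) for col, enc in encoders.items()})
# e.g. encoders['class'].inverse_transform([0, 1])  # -> ['e', 'p']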
Features
features = [col for col in df.columns if col != 'class']  # keep the original column order (a set difference is unordered)
target = 'class'
Train/Test Split
train, test = train_test_split(df, test_size=0.33, random_state=42, stratify=df[target])
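A quick optional check that stratification kept the class ratio the same in both splits:
# Class proportions should match between train and test thanks to stratify=df[target]
print(train[target].value_counts(normalize=True))
print(test[target].value_counts(normalize=True))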
Looking for the best params. The grid below has 4 × 3 × 2 × 3 × 4 = 288 parameter combinations, so 3-fold cross-validation fits 864 models.
param_grid = {
    "max_depth": [7, 14, 20, 23],
    "min_child_weight": [3, 5, 7],
    "gamma": [0.1, 0.3],
    "colsample_bytree": [0.3, 0.5, 0.7],
    "n_estimators": [100, 300, 600, 1000],
}
xgc = xgb.XGBClassifier()
grid = GridSearchCV(xgc, param_grid, cv=3, verbose=10, n_jobs=-1)
grid.fit(train[features], train[target])
Best params
grid.best_params_
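grid.best_params_ only shows the winning combination; grid.cv_results_ holds the cross-validated score of every combination. A quick way to see the top few (the column names are the standard ones scikit-learn produces):
# Top 5 parameter combinations by mean cross-validated score
cv_results = pd.DataFrame(grid.cv_results_)
cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(cv_results.sort_values('rank_test_score')[cols].head())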
Results
results = test.copy()
results['y_pred'] = grid.best_estimator_.predict(test[features])
Classification Report
print(metrics.classification_report(results[target], results['y_pred']))
Confusion Matrix
sns.heatmap(metrics.confusion_matrix(results[target], results['y_pred']), annot=True, fmt='d');
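Accuracy on this dataset is usually close to perfect, so a probability-based metric such as ROC-AUC can be a useful complement (an optional addition, using the fitted best estimator):
# ROC-AUC from the predicted probability of the positive (poisonous) class
y_proba = grid.best_estimator_.predict_proba(test[features])[:, 1]
print('ROC-AUC:', metrics.roc_auc_score(results[target], y_proba))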
How does the model think?
fig, ax = plt.subplots(1, 1, figsize=(20, 20))
xgb.plot_tree(grid.best_estimator_, num_trees=10, ax=ax);  # draw the tree at index 10 of the ensemble (requires graphviz)
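A single tree is only one of hundreds in the ensemble, so XGBoost's built-in importance plot gives a quicker global view (gain is one of several importance_type options):
# Global feature importance aggregated over all trees (by information gain)
fig, ax = plt.subplots(figsize=(10, 8))
xgb.plot_importance(grid.best_estimator_, importance_type='gain', ax=ax);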
SHAP Values / Feature Impact
expl = shap.TreeExplainer(grid.best_estimator_)
shap_values = expl.shap_values(test[features])  # only the feature matrix is needed to explain predictions
shap.summary_plot(shap_values, test[features])
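To drill into a single feature, shap.dependence_plot shows how that feature's encoded value relates to its SHAP contribution; 'odor' is used here just as an example column from this dataset:
# How one feature's (encoded) value drives its SHAP contribution
shap.dependence_plot('odor', shap_values, test[features])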