Imports

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# apply both styles in one call; later entries override overlapping settings
plt.style.use(['dark_background', 'seaborn'])
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import metrics
import xgboost as xgb

Data

Input variables (based on physicochemical tests):

  • fixed acidity
  • volatile acidity
  • citric acid
  • residual sugar
  • chlorides
  • free sulfur dioxide
  • total sulfur dioxide
  • density
  • pH
  • sulphates
  • alcohol

Output variable (based on sensory data):

  • quality (score between 0 and 10)

An extra indicator column is added below: type 1/0 (red/white).
red = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
red['type'] = 1
white = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')
white['type'] = 0
df = pd.concat([red, white], ignore_index=True)
# normalize column names: lowercase, spaces -> underscores
df.columns = ['_'.join(x.split()) for x in df.columns.str.lower()]
df[:3]
   fixed_acidity  volatile_acidity  citric_acid  residual_sugar  chlorides  free_sulfur_dioxide  total_sulfur_dioxide  density    ph  sulphates  alcohol  quality  type
0            7.4              0.70         0.00             1.9      0.076                 11.0                  34.0   0.9978  3.51       0.56      9.4        5     1
1            7.8              0.88         0.00             2.6      0.098                 25.0                  67.0   0.9968  3.20       0.68      9.8        5     1
2            7.8              0.76         0.04             2.3      0.092                 15.0                  54.0   0.9970  3.26       0.65      9.8        5     1
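
A quick sanity check on the combined frame before any plotting; a minimal sketch (the expected shape assumes the standard UCI files, 1599 red + 4898 white rows):

df.shape               # (6497, 13): 11 features + quality + type
df.isna().sum().sum()  # 0 — the UCI wine-quality files ship without missing values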

EDA

Number of wine samples by quality

df.quality.value_counts().plot.bar(rot=0)

Average features by quality

feature_cols = [c for c in df.columns if c not in ('quality', 'type')]
df.groupby('quality')[feature_cols].mean().plot.bar(
    rot=0, figsize=(15, 5), cmap='Paired',
).legend(bbox_to_anchor=(1.0, 1.0))

Predicting Wine Quality

Split the data into Train & Test

train, test = train_test_split(df, test_size=0.33, random_state=42, stratify=df.quality)
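
Stratifying on quality keeps the rare grades (3 and 9) represented in both splits; a quick check (sketch):

pd.concat({
    'train': train.quality.value_counts(normalize=True),
    'test':  test.quality.value_counts(normalize=True),
}, axis=1).round(3)  # per-class shares should match closely between the splits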

Feature columns

x_cols = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides',
    'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol', 'type',
]

Grid Search Parameters

param_grid = {
    # "learning_rate"    : [0.05, 0.10],
    "max_depth"        : [1, 4, 7, 14, 20],
    # "min_child_weight" : [3, 5, 7],
    # "gamma"            : [0.1, 0.3],
    "colsample_bytree" : [0.3, 0.5, 0.7],
    "n_estimators"     : [1000],
    # quality has 7 distinct values, so XGBClassifier coerces the objective to
    # 'multi:softprob' even when 'binary:logistic' wins (see the fitted repr below)
    "objective"        : ['binary:logistic', 'multi:softmax', 'multi:softprob'],
    "num_class"        : [df.quality.nunique()],
}
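
As written, the grid has 5 × 3 × 1 × 3 × 1 = 45 candidates, so cv=2 below trains 90 models; ParameterGrid can confirm the count (sketch):

from sklearn.model_selection import ParameterGrid
len(ParameterGrid(param_grid))  # 45 candidates, × cv=2 folds = 90 fits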

XGBoost Classifier + Hyper-Parameter Tuning

Search

xgc = xgb.XGBClassifier()
grid = GridSearchCV(xgc, param_grid, cv=2, verbose=10, n_jobs=-1)
grid.fit(train[x_cols], train['quality'])
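
Once the search finishes, the winning configuration and its cross-validated score can be read off the fitted object; this is where the best_params dict below comes from:

print(grid.best_params_)  # source of the best_params dict below
print(grid.best_score_)   # mean cross-validated accuracy of the best candidate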

Best parameters from an earlier search

best_params = {
    'colsample_bytree': 0.5,
    'max_depth': 20,
    'n_estimators': 1000,
    'num_class': 7,
    'objective': 'binary:logistic',  # coerced to 'multi:softprob' for the 7-class target
}
xgc = xgb.XGBClassifier(**best_params)
xgc.fit(train[x_cols], train['quality'])
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, num_class=7, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1)

Feature importance

xgb.plot_importance(xgc)
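
plot_importance defaults to importance_type='weight' (number of splits per feature), which tends to favor continuous, high-cardinality features; comparing with 'gain' often reorders the ranking (a sketch using the same fitted model):

xgb.plot_importance(xgc, importance_type='gain', show_values=False)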

Results

results = pd.DataFrame({
    'y_pred': xgc.predict(test[x_cols]),
    'y_true': test['quality']})
results
y_pred y_true
5783 7 7
2962 5 4
1384 5 5
5905 6 6
3083 5 3
... ... ...
4066 6 6
1083 6 6
398 6 6
306 5 5
6139 6 6

2145 rows × 2 columns

Classification Report

print(metrics.classification_report(results.y_true, results.y_pred))
              precision    recall  f1-score   support

           3       0.00      0.00      0.00        10
           4       0.44      0.17      0.24        71
           5       0.71      0.69      0.70       706
           6       0.65      0.74      0.70       936
           7       0.64      0.60      0.62       356
           8       0.88      0.36      0.51        64
           9       0.00      0.00      0.00         2

    accuracy                           0.67      2145
   macro avg       0.48      0.37      0.40      2145
weighted avg       0.66      0.67      0.66      2145

/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
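
The zero scores for qualities 3 and 9 come from classes the model never predicts; a confusion matrix makes that explicit (sketch):

labels = sorted(results.y_true.unique())
cm = metrics.confusion_matrix(results.y_true, results.y_pred, labels=labels)
pd.DataFrame(cm, index=labels, columns=labels)  # rows = true quality, columns = predicted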

Anomaly Detection

from sklearn.ensemble import IsolationForest

Columns to search for anomalies

x_cols = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar', 'chlorides',
    'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density', 'ph', 'sulphates', 'alcohol',
]

Anomalies dataframe

%%time
adf = df.copy()
# fit_predict returns 1 for inliers and -1 for anomalies
adf['anomalies'] = IsolationForest(random_state=0, n_estimators=1000, n_jobs=-1).fit_predict(adf[x_cols])
adf[:3]
CPU times: user 4.53 s, sys: 255 ms, total: 4.79 s
Wall time: 4.3 s
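
Keeping a reference to the fitted forest also exposes continuous anomaly scores, handy for ranking the most unusual wines (a sketch; this refits the same configuration rather than reusing the inline call above):

iso = IsolationForest(random_state=0, n_estimators=1000, n_jobs=-1).fit(adf[x_cols])
adf['anomaly_score'] = iso.score_samples(adf[x_cols])  # lower = more anomalous
adf.nsmallest(5, 'anomaly_score')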

Total Anomalies

(adf.anomalies.value_counts(normalize=True)
    .rename(index={1: 'normal', -1: 'anomaly'})
    .plot.pie(autopct='%1.1f%%'))
adf.groupby(['anomalies', 'type', 'quality']).agg({x: 'mean' for x in x_cols})  # .style.background_gradient(cmap='Blues')
anomalies type quality fixed_acidity volatile_acidity citric_acid residual_sugar chlorides free_sulfur_dioxide total_sulfur_dioxide density ph sulphates alcohol
-1 0 3 8.487500 0.353750 0.335000 6.700000 0.068000 95.812500 240.125000 0.995799 3.107500 0.527500 10.162500
-1 0 4 7.791667 0.630417 0.352500 6.062500 0.064000 42.250000 167.958333 0.995680 3.181667 0.487500 10.341667
-1 0 5 7.231507 0.340137 0.462466 11.860274 0.083274 49.650685 182.136986 0.997487 3.103288 0.513014 9.487671
-1 0 6 7.312676 0.308873 0.506761 11.803521 0.066183 40.767606 154.112676 0.997205 3.145352 0.536761 10.210563
-1 0 7 5.419048 0.375476 0.221429 4.226190 0.033857 39.190476 130.285714 0.990191 3.341905 0.641429 12.921429
-1 0 8 5.200000 0.318333 0.307778 3.166667 0.032889 54.222222 148.222222 0.989728 3.384444 0.676667 12.688889
-1 1 3 8.700000 0.888125 0.205000 2.806250 0.135750 10.500000 23.500000 0.997699 3.390000 0.565000 9.987500
-1 1 4 7.500000 0.822750 0.142000 2.400000 0.110300 13.300000 34.300000 0.996449 3.461500 0.666500 10.530000
-1 1 5 9.346491 0.625263 0.333772 3.141228 0.147377 16.964912 54.307018 0.998277 3.246316 0.764298 10.075000
-1 1 6 9.688372 0.509496 0.385271 3.042248 0.110450 13.740310 38.767442 0.997853 3.274651 0.769302 10.820930
-1 1 7 9.648214 0.463393 0.445893 3.228571 0.086839 13.625000 45.964286 0.997024 3.269821 0.787857 11.638690
-1 1 8 8.383333 0.501667 0.360000 3.166667 0.063333 12.666667 47.666667 0.995000 3.351667 0.785000 12.783333
1 0 3 7.008333 0.319583 0.336667 6.187500 0.045167 25.000000 124.250000 0.994274 3.240833 0.439167 10.466667
1 0 4 7.076821 0.361424 0.300397 4.514238 0.048993 21.857616 121.887417 0.994165 3.182980 0.475232 10.137417
1 0 5 6.918280 0.300000 0.331069 7.096279 0.049873 35.734827 149.257225 0.995145 3.172290 0.480578 9.825780
1 0 6 6.821815 0.258952 0.332393 6.262623 0.044518 35.479784 136.477668 0.993853 3.190042 0.489582 10.587549
1 0 7 6.766880 0.260012 0.328172 5.209953 0.038297 34.001746 124.988359 0.992508 3.210768 0.499721 11.329957
1 0 8 6.736145 0.275181 0.327530 5.807229 0.038608 35.771084 124.969880 0.992372 3.209699 0.475904 11.578916
1 0 9 7.420000 0.298000 0.386000 4.120000 0.027400 33.400000 116.000000 0.991460 3.308000 0.466000 12.180000
1 1 3 7.000000 0.870000 0.035000 1.950000 0.069500 13.000000 30.500000 0.996525 3.430000 0.590000 9.825000
1 1 4 7.948485 0.615909 0.193636 2.872727 0.078788 11.636364 37.424242 0.996599 3.333030 0.553939 10.104545
1 1 5 7.930159 0.567346 0.225573 2.405732 0.081750 16.987654 56.957672 0.996868 3.316737 0.592152 9.864462
1 1 6 8.007269 0.494440 0.245580 2.333988 0.078495 16.211198 41.402750 0.996301 3.329077 0.651513 10.581009
1 1 7 8.568531 0.380629 0.347483 2.521678 0.072573 14.209790 30.734266 0.995744 3.298951 0.723007 11.398252
1 1 8 8.658333 0.384167 0.406667 2.283333 0.071000 13.583333 26.333333 0.995318 3.225000 0.759167 11.750000