Imports 
Code 
import  os,re,zipfile 
import  pandas as  pd 
import  numpy as  np 
from  matplotlib import  pyplot as  plt 
 plt.style.use('dark_background' ) 
 plt.style.use('seaborn' ) 
from  sklearn.model_selection import  train_test_split,GridSearchCV 
from  sklearn import  metrics 
import  xgboost as  xgb 
 
 
 
Data 
fixed acidity 
volatile acidity 
citric acid 
residual sugar 
chlorides 
free sulfur dioxide 
total sulfur dioxide 
density 
pH 
sulphates 
alcohol 
Output variable (based on sensory data): 
quality (score between 0 and 10) 
type 1/0 (red/white) 
 
Code 
 # Fetch the red and white wine-quality tables (semicolon-separated) and tag
 # every row with its wine type: 1 for red, 0 for white.
 red = pd.read_csv(
     'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
     sep=';',
 ).assign(type=1)
 white = pd.read_csv(
     'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',
     sep=';',
 ).assign(type=0)

 # Stack both tables under a fresh running index.
 df = pd.concat([red, white], ignore_index=True)

 # Lower-case the headers and join their words with underscores,
 # e.g. "fixed acidity" -> "fixed_acidity".
 df.columns = ['_'.join(name.lower().split()) for name in df.columns]

 # Preview the first three rows.
 df[:3]
 
 
0 
7.4 
0.70 
0.00 
1.9 
0.076 
11.0 
34.0 
0.9978 
3.51 
0.56 
9.4 
5 
1 
 
1 
7.8 
0.88 
0.00 
2.6 
0.098 
25.0 
67.0 
0.9968 
3.20 
0.68 
9.8 
5 
1 
 
2 
7.8 
0.76 
0.04 
2.3 
0.092 
15.0 
54.0 
0.9970 
3.26 
0.65 
9.8 
5 
1 
 
 
 
 
 
 
EDA 
number of wine samples by quality 
Code 
 # Number of samples per quality score. value_counts() orders by frequency,
 # so sort_index() is applied to lay the bars out in ascending quality order.
 df['quality'].value_counts().sort_index().plot.bar(rot=0)
 
 
Average features by quality 
Code 
 # Mean of each physico-chemical feature per quality score, drawn as grouped
 # bars with the legend box pinned to axes coordinate (1.0, 1.0).
 (
     df.groupby('quality')[[
         'fixed_acidity',
         'volatile_acidity',
         'citric_acid',
         'residual_sugar',
         'chlorides',
         'free_sulfur_dioxide',
         'total_sulfur_dioxide',
         'density',
         'ph',
         'sulphates',
         'alcohol',
     ]]
     .mean()
     .plot.bar(rot=0, figsize=(15, 5), cmap='Paired')
     .legend(bbox_to_anchor=(1.0, 1.0))
 )
 
 
 
Predicting Wine Quality 
Split the data into Train & Test 
Code 
 # Hold out 33% of the rows for evaluation. The split is stratified on the
 # quality score and seeded so that it is reproducible.
 train, test = train_test_split(
     df,
     test_size=0.33,
     random_state=42,
     stratify=df['quality'],
 )
 
 
Feature columns 
Code 
 # Model inputs: the eleven physico-chemical measurements plus the wine-type flag.
 x_cols = [
     'fixed_acidity',
     'volatile_acidity',
     'citric_acid',
     'residual_sugar',
     'chlorides',
     'free_sulfur_dioxide',
     'total_sulfur_dioxide',
     'density',
     'ph',
     'sulphates',
     'alcohol',
     'type',
 ]
 
 
Grid Search Parameters 
Code 
 # Hyper-parameter grid for the XGBoost classifier.
 # The target (quality) has more than two classes, so a binary objective does
 # not apply; searching over several objective names only multiplies the
 # number of fits. A single multi-class objective is used instead.
 param_grid = {
     "max_depth": [1, 4, 7, 14, 20],
     "colsample_bytree": [0.3, 0.5, 0.7],
     "n_estimators": [1000],
     "objective": ['multi:softprob'],
     # One class per distinct quality score present in the data.
     "num_class": [df['quality'].nunique()],
     # Further knobs, left out to keep the search small:
     # "learning_rate": [0.05, 0.10],
     # "min_child_weight": [3, 5, 7],
     # "gamma": [0.1, 0.3],
 }
 
 
XGBoost Classifier + Hyper-Parameter Tuning 
Search 
Code 
 # Exhaustive search over param_grid with 2-fold cross-validation,
 # using all available cores and verbose progress output.
 xgc = xgb.XGBClassifier()
 grid = GridSearchCV(
     estimator=xgc,
     param_grid=param_grid,
     cv=2,
     verbose=10,
     n_jobs=-1,
 )
 grid.fit(train[x_cols], train['quality'])
 
 
Best parameters (already found by the grid search) 
Code 
 # Parameter combination kept from an earlier grid search, hard-coded here so
 # the final model can be trained directly.
 best_params = {
     'colsample_bytree': 0.5,
     'max_depth': 20,
     'n_estimators': 1000,
     'num_class': 7,
     # The target has seven quality classes, so a binary objective does not
     # apply; state the multi-class objective explicitly.
     'objective': 'multi:softprob',
 }
 # Train the final classifier on the training partition.
 xgc = xgb.XGBClassifier(**best_params)
 xgc.fit(train[x_cols], train['quality'])
 
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, num_class=7, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=None, subsample=1, verbosity=1) 
 
 
Feature importance 
Results 
Code 
 # Predicted vs. true quality for the held-out rows, indexed like `test`.
 results = pd.DataFrame(index=test.index).assign(
     y_pred=xgc.predict(test[x_cols]),
     y_true=test['quality'],
 )
 results
 
 
5783 
7 
7 
 
2962 
5 
4 
 
1384 
5 
5 
 
5905 
6 
6 
 
3083 
5 
3 
 
... 
... 
... 
 
4066 
6 
6 
 
1083 
6 
6 
 
398 
6 
6 
 
306 
5 
5 
 
6139 
6 
6 
 
 
2145 rows × 2 columns
 
 
 
Classification Report 
Code 
# Per-class precision / recall / F1 on the held-out rows.
# zero_division=0 keeps the 0.0 score for classes that are never predicted
# while suppressing the UndefinedMetricWarning they would otherwise trigger.
print(metrics.classification_report(results['y_true'], results['y_pred'], zero_division=0))
 
              precision    recall  f1-score   support
           3       0.00      0.00      0.00        10
           4       0.44      0.17      0.24        71
           5       0.71      0.69      0.70       706
           6       0.65      0.74      0.70       936
           7       0.64      0.60      0.62       356
           8       0.88      0.36      0.51        64
           9       0.00      0.00      0.00         2
    accuracy                           0.67      2145
   macro avg       0.48      0.37      0.40      2145
weighted avg       0.66      0.67      0.66      2145
 
 
/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result)) 
 
 
 
Anomaly Detection 
Code 
from  sklearn.ensemble import  IsolationForest 
 
 
Columns to search for anomalies 
Code 
 # Columns scanned for anomalies: the eleven physico-chemical measurements only.
 # NOTE: this rebinds x_cols, which no longer contains the 'type' flag.
 x_cols = [
     'fixed_acidity',
     'volatile_acidity',
     'citric_acid',
     'residual_sugar',
     'chlorides',
     'free_sulfur_dioxide',
     'total_sulfur_dioxide',
     'density',
     'ph',
     'sulphates',
     'alcohol',
 ]
 
 
Anomalies dataframe 
Code 
%% time 
 adf =  df.copy() 
 adf['anomalies' ] =  IsolationForest(random_state= 0 ,n_estimators= 1000 ,n_jobs=- 1 ).fit_predict(adf[x_cols]) 
 adf[:3 ] 
 
CPU times: user 4.53 s, sys: 255 ms, total: 4.79 s
Wall time: 4.3 s 
 
 
Total Anomalies 
Code 
 # Share of rows flagged by the isolation forest. The flag is 1 for inliers
 # and -1 for outliers, so the slices are named explicitly: a boolean label
 # of True on the inlier slice would read as "is an anomaly".
 (
     (adf['anomalies'].value_counts() / adf['anomalies'].value_counts().sum())
     .rename(index={1: 'normal', -1: 'anomaly'})
     .plot.pie(autopct=' %1.1f%% ')
 )
 
 
Code 
 # Mean of every scanned feature for each (anomaly flag, wine type, quality) group.
 # Optionally append .style.background_gradient(cmap='Blues') to shade the table.
 adf.groupby(['anomalies', 'type', 'quality'])[x_cols].mean()
 
anomalies 
type 
quality 
 
 
 
 
 
 
 
 
 
 
 
 
 
-1 
0 
3 
8.487500 
0.353750 
0.335000 
6.700000 
0.068000 
95.812500 
240.125000 
0.995799 
3.107500 
0.527500 
10.162500 
 
4 
7.791667 
0.630417 
0.352500 
6.062500 
0.064000 
42.250000 
167.958333 
0.995680 
3.181667 
0.487500 
10.341667 
 
5 
7.231507 
0.340137 
0.462466 
11.860274 
0.083274 
49.650685 
182.136986 
0.997487 
3.103288 
0.513014 
9.487671 
 
6 
7.312676 
0.308873 
0.506761 
11.803521 
0.066183 
40.767606 
154.112676 
0.997205 
3.145352 
0.536761 
10.210563 
 
7 
5.419048 
0.375476 
0.221429 
4.226190 
0.033857 
39.190476 
130.285714 
0.990191 
3.341905 
0.641429 
12.921429 
 
8 
5.200000 
0.318333 
0.307778 
3.166667 
0.032889 
54.222222 
148.222222 
0.989728 
3.384444 
0.676667 
12.688889 
 
1 
3 
8.700000 
0.888125 
0.205000 
2.806250 
0.135750 
10.500000 
23.500000 
0.997699 
3.390000 
0.565000 
9.987500 
 
4 
7.500000 
0.822750 
0.142000 
2.400000 
0.110300 
13.300000 
34.300000 
0.996449 
3.461500 
0.666500 
10.530000 
 
5 
9.346491 
0.625263 
0.333772 
3.141228 
0.147377 
16.964912 
54.307018 
0.998277 
3.246316 
0.764298 
10.075000 
 
6 
9.688372 
0.509496 
0.385271 
3.042248 
0.110450 
13.740310 
38.767442 
0.997853 
3.274651 
0.769302 
10.820930 
 
7 
9.648214 
0.463393 
0.445893 
3.228571 
0.086839 
13.625000 
45.964286 
0.997024 
3.269821 
0.787857 
11.638690 
 
8 
8.383333 
0.501667 
0.360000 
3.166667 
0.063333 
12.666667 
47.666667 
0.995000 
3.351667 
0.785000 
12.783333 
 
1 
0 
3 
7.008333 
0.319583 
0.336667 
6.187500 
0.045167 
25.000000 
124.250000 
0.994274 
3.240833 
0.439167 
10.466667 
 
4 
7.076821 
0.361424 
0.300397 
4.514238 
0.048993 
21.857616 
121.887417 
0.994165 
3.182980 
0.475232 
10.137417 
 
5 
6.918280 
0.300000 
0.331069 
7.096279 
0.049873 
35.734827 
149.257225 
0.995145 
3.172290 
0.480578 
9.825780 
 
6 
6.821815 
0.258952 
0.332393 
6.262623 
0.044518 
35.479784 
136.477668 
0.993853 
3.190042 
0.489582 
10.587549 
 
7 
6.766880 
0.260012 
0.328172 
5.209953 
0.038297 
34.001746 
124.988359 
0.992508 
3.210768 
0.499721 
11.329957 
 
8 
6.736145 
0.275181 
0.327530 
5.807229 
0.038608 
35.771084 
124.969880 
0.992372 
3.209699 
0.475904 
11.578916 
 
9 
7.420000 
0.298000 
0.386000 
4.120000 
0.027400 
33.400000 
116.000000 
0.991460 
3.308000 
0.466000 
12.180000 
 
1 
3 
7.000000 
0.870000 
0.035000 
1.950000 
0.069500 
13.000000 
30.500000 
0.996525 
3.430000 
0.590000 
9.825000 
 
4 
7.948485 
0.615909 
0.193636 
2.872727 
0.078788 
11.636364 
37.424242 
0.996599 
3.333030 
0.553939 
10.104545 
 
5 
7.930159 
0.567346 
0.225573 
2.405732 
0.081750 
16.987654 
56.957672 
0.996868 
3.316737 
0.592152 
9.864462 
 
6 
8.007269 
0.494440 
0.245580 
2.333988 
0.078495 
16.211198 
41.402750 
0.996301 
3.329077 
0.651513 
10.581009 
 
7 
8.568531 
0.380629 
0.347483 
2.521678 
0.072573 
14.209790 
30.734266 
0.995744 
3.298951 
0.723007 
11.398252 
 
8 
8.658333 
0.384167 
0.406667 
2.283333 
0.071000 
13.583333 
26.333333 
0.995318 
3.225000 
0.759167 
11.750000