Wine Quality
import os
import re
import zipfile

import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split

# Dark base theme, then the seaborn look on top.
# NOTE: the 'seaborn' style was renamed 'seaborn-v0_8' in matplotlib 3.6
# and the old alias was removed later, so try the new name first and fall
# back to the legacy one for older matplotlib installs.
plt.style.use('dark_background')
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
- fixed acidity
- volatile acidity
- citric acid
- residual sugar
- chlorides
- free sulfur dioxide
- total sulfur dioxide
- density
- pH
- sulphates
- alcohol

Output variable (based on sensory data):
- quality (score between 0 and 10)
- type 1/0 (red/white)
# The red and white datasets share the same schema; tag each with a binary
# 'type' flag (1 = red, 0 = white) before stacking them into one frame.
red = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep=';')
red['type'] = 1
white = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')
white['type'] = 0
df = pd.concat([red, white], ignore_index=True)
# Normalize headers: lower-case with whitespace collapsed to underscores
# ('fixed acidity' -> 'fixed_acidity').
df.columns = ['_'.join(col.lower().split()) for col in df.columns]
df[:3]
Number of wine samples by quality
# Bar chart of how many samples fall in each quality score.
df['quality'].value_counts().plot(kind='bar', rot=0)
Average features by quality
# Mean of every physico-chemical feature within each quality level,
# plotted as grouped bars with the legend pushed outside the axes.
feature_means = df.groupby('quality')[[
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'ph',
    'sulphates',
    'alcohol',
]].mean()
ax = feature_means.plot.bar(rot=0, figsize=(15, 5), cmap='Paired')
ax.legend(bbox_to_anchor=(1.0, 1.0))
Split the data into Train & Test
# Hold out a third of the rows, stratified so every quality level keeps
# its proportion in both splits; fixed seed for reproducibility.
train, test = train_test_split(
    df,
    test_size=0.33,
    random_state=42,
    stratify=df['quality'],
)
Feature columns
# Predictor columns: all physico-chemical measurements plus the
# red/white indicator added during loading.
x_cols = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'ph',
    'sulphates',
    'alcohol',
    'type',
]
Grid Search Parameters
# Hyper-parameter grid for GridSearchCV.
# NOTE: quality has 7 distinct classes, so the two-class objective
# 'binary:logistic' was removed from the search — XGBoost rejects a
# binary objective when num_class > 2, which would abort the whole grid.
param_grid = {
    # "learning_rate": [0.05, 0.10],
    "max_depth": [1, 4, 7, 14, 20],
    # "min_child_weight": [3, 5, 7],
    # "gamma": [0.1, 0.3],
    "colsample_bytree": [0.3, 0.5, 0.7],
    "n_estimators": [1000],
    "objective": ['multi:softmax', 'multi:softprob'],
    "num_class": [df.quality.nunique()],
}
XGBoost Classifier + Hyper-Parameter Tuning
Search
# Exhaustive 2-fold search over param_grid; every CPU core is used and
# progress is logged verbosely (this is slow — one refit per combination).
base_clf = xgb.XGBClassifier()
grid = GridSearchCV(estimator=base_clf, param_grid=param_grid, cv=2, verbose=10, n_jobs=-1)
grid.fit(train[x_cols], train['quality'])
Best parameters found by the earlier grid search
# Winning combination from the grid search above, hard-coded so the
# expensive search does not need to be re-run.
# NOTE: the originally recorded 'binary:logistic' cannot model 7 quality
# classes and modern XGBoost raises when it is paired with num_class=7,
# so the multiclass probability objective is used instead.
best_params = {
    'colsample_bytree': 0.5,
    'max_depth': 20,
    'n_estimators': 1000,
    'num_class': 7,
    'objective': 'multi:softprob',
}
# Refit a single classifier on the training split using the winning
# hyper-parameters.
xgc = xgb.XGBClassifier(**best_params)
xgc.fit(X=train[x_cols], y=train['quality'])
Feature importance
# Feature-importance bar chart from the fitted booster.
xgb.plot_importance(booster=xgc)
Results
# Side-by-side predictions vs. ground truth on the held-out test split.
predictions = xgc.predict(test[x_cols])
results = pd.DataFrame({
    'y_pred': predictions,
    'y_true': test['quality'],
})
results
Classification Report
# Per-class precision / recall / F1 on the held-out split.
print(metrics.classification_report(results['y_true'], results['y_pred']))
from sklearn.ensemble import IsolationForest
Columns to search for anomalies
# Columns scanned for anomalies: physico-chemical measurements only
# (the red/white 'type' flag is deliberately excluded here).
x_cols = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'ph',
    'sulphates',
    'alcohol',
]
Anomalies dataframe
%%time
adf = df.copy()
adf['anomalies'] = IsolationForest(random_state=0,n_estimators=1000,n_jobs=-1).fit_predict(adf[x_cols])
adf[:3]
Total Anomalies
# Share of rows flagged by IsolationForest. fit_predict returns +1 for
# inliers and -1 for outliers, so -1 is the anomaly class — the original
# mapping {1: True, -1: False} labelled the classes the wrong way round.
# value_counts(normalize=True) replaces the manual division by the total.
adf['anomalies'].value_counts(normalize=True).rename(index={1: 'normal', -1: 'anomaly'}).plot.pie(autopct='%1.1f%%')
# Per-group feature means: anomaly flag x wine type x quality score.
adf.groupby(['anomalies', 'type', 'quality']).agg({col: 'mean' for col in x_cols})  # .style.background_gradient(cmap='Blues')