Used Cars Database
import os, re, zipfile
import pandas as pd
import numpy as np
from types import SimpleNamespace
from matplotlib import pyplot as plt
plt.style.use(['dark_background', 'seaborn'])  # apply both styles; later entries override overlapping settings
base_size = 10
sizes = SimpleNamespace(
    small=(base_size, base_size),
    medium=(2 * base_size, 2 * base_size),
    large=(3 * base_size, 3 * base_size),
)
%%time
zip_name = 'used-cars-database.zip'
if not os.path.exists(zip_name):
    os.environ['KAGGLE_USERNAME'] = ""  # username from the json file
    os.environ['KAGGLE_KEY'] = ""  # key from the json file
    !kaggle datasets download orgesleka/used-cars-database
with zipfile.ZipFile(zip_name, 'r') as zip_ref:
    zip_ref.extractall(zip_name.split('.')[0])
os.listdir(zip_name.split('.')[0])
auto = pd.read_csv(zip_name.split('.')[0] + '/autos.csv', encoding='ISO-8859-1')  # the dump is Latin-1 encoded
auto
missing_values_cols = auto.isnull().sum()
missing_values_cols = missing_values_cols[missing_values_cols > 0]
missing_values_cols = missing_values_cols.index.tolist()
missing_values_cols
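A quick way to see how much of each of those columns is affected:
(auto[missing_values_cols].isnull().mean() * 100).round(2)  # percent missing per column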
# A possible imputation pattern (generic column names, kept for reference):
# auto.groupby(['category', 'name'])['value'].transform(lambda x: x.fillna(x.mean()))
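If we wanted to impute instead of drop, the pattern above fills numeric gaps with per-group means; since the columns with missing values here are all categorical, the analogous move is a per-group mode. A minimal sketch (grouping by `brand` is just an assumption, and it's left unapplied since we drop the rows later anyway):
def fill_with_mode(s):
    # fill NaNs in a categorical series with its group's most frequent value
    m = s.mode()
    return s.fillna(m.iloc[0]) if len(m) else s

# auto['fuelType'] = auto.groupby('brand')['fuelType'].transform(fill_with_mode)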
fig, ax = plt.subplots(2, 4, figsize=(30, 15))
axes = ax.ravel()
auto.vehicleType.value_counts().plot.barh(ax=axes[0], title='Number of vehicles by type')
auto.fuelType.value_counts().plot.barh(ax=axes[1], title='Number of vehicles by fuel type')
auto.yearOfRegistration.value_counts()[:20].plot.barh(ax=axes[2], title='Number of vehicles by year of registration (first 20)')
auto.groupby('vehicleType')[['price']].mean().plot.barh(ax=axes[3], title='Mean price of vehicles by type')
auto.groupby('fuelType')[['price']].mean().plot.barh(ax=axes[4], title='Mean price of vehicles by fuel type')
auto.groupby('vehicleType')[['kilometer']].mean().plot.barh(ax=axes[5], title='Mean kilometer of vehicles by type')
auto.groupby('fuelType')[['kilometer']].mean().plot.barh(ax=axes[6], title='Mean kilometer of vehicles by fuel type')
auto.gearbox.value_counts().plot.barh(ax=axes[7], title='Number of vehicles by gearbox')
plt.tight_layout()
tmp = auto.groupby(['brand', 'model']).agg({
    'kilometer': 'mean',
    'price': 'mean'
})
brands = tmp.index.get_level_values('brand').unique()
fig, ax = plt.subplots(len(brands) // 4 + 1, 4, figsize=(50, 50))
for i, brand in enumerate(brands):
    try:
        tmp.loc[brand].plot.barh(ax=ax.ravel()[i], title=f'Mean (kilometer, price) for {brand.replace("_", " ").capitalize()} car models')
    except Exception:
        print(f'error for {brand}')
plt.tight_layout()
XGBoost and `LabelEncoder` should do the job for us.
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from collections import defaultdict
d = defaultdict(LabelEncoder)  # one LabelEncoder per column, created on first access
d
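`defaultdict(LabelEncoder)` hands us a fresh encoder per column, created lazily on first access, which makes it easy to decode each column back later. A toy illustration (hypothetical `demo` key, not one of our real columns):
toy = pd.Series(['b', 'a', 'b'], name='demo')
codes = d['demo'].fit_transform(toy)  # array([1, 0, 1]); classes are sorted alphabetically
d['demo'].inverse_transform(codes)    # array(['b', 'a', 'b'], dtype=object)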
`x_cols` are our features and `target` is the column we want to predict.
x_cols = ['brand', 'model', 'kilometer', 'yearOfRegistration', 'monthOfRegistration',
          'fuelType', 'vehicleType', 'notRepairedDamage', 'powerPS', 'gearbox']
target = 'price'
- Copy the dataframe without the missing values
- Define the columns that need encoding
- Apply the encoding
auto_copy = auto.dropna().copy()
enc_cols = list(set(x_cols) - {'powerPS', 'yearOfRegistration', 'monthOfRegistration', 'kilometer'})  # everything except the numeric columns
auto_copy[enc_cols] = auto_copy[enc_cols].apply(lambda x: d[x.name].fit_transform(x))
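Each fitted encoder keeps its label-to-code mapping in `classes_` (the position is the integer code), so we can sanity-check any column:
dict(enumerate(d['gearbox'].classes_))  # e.g. {0: 'automatik', 1: 'manuell'}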
After dropping the missing values, we still have 70% of the data:
len(auto_copy) / len(auto)
Now, let's split our data into train/test sets.
X_train, X_test, y_train, y_test = train_test_split(auto_copy[x_cols], auto_copy[target], test_size=0.33, random_state=42)
Further on, we can use `XGBRegressor` with the gamma objective (a sensible choice for a positive, skewed target like price) and train it on the training data.
%%time
xg = xgb.XGBRegressor(objective='reg:gamma', n_estimators=1000).fit(X_train, y_train)
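If you're worried about overfitting with 1000 trees, xgboost can also stop adding trees once a validation score plateaus. A sketch (where `early_stopping_rounds` lives has moved between xgboost versions, so adjust for yours):
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
xg_es = xgb.XGBRegressor(objective='reg:gamma', n_estimators=1000,
                         early_stopping_rounds=50)  # constructor arg in recent xgboost; a fit() arg in older ones
xg_es.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)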
Make a dataframe with the true test target vs the predicted one.
preds = pd.DataFrame({
'true': y_test,
'predicted': xg.predict(X_test)
})
Use MAE (mean absolute error) as our metric:
metrics.mean_absolute_error(preds.true,preds.predicted)
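To put that number in context, compare it against a naive baseline that always predicts the training-set median price:
baseline = np.full(len(y_test), y_train.median())
metrics.mean_absolute_error(y_test, baseline)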
Let's have a look at our feature importance:
# plot_importance creates its own figure unless we hand it an axis,
# so build the axis first to make figsize take effect
fig, ax = plt.subplots(figsize=sizes.small)
xgb.plot_importance(xg, ax=ax)
plt.show()
And here we have a copy of our test dataframe with the encoded columns decoded back to their original labels, plus the original and predicted price.
xt = X_test.copy()
xt['original_price'] = y_test
xt['predicted_price'] = xg.predict(X_test)
xt[enc_cols] = xt[enc_cols].apply(lambda x: d[x.name].inverse_transform(x))
xt[:30]
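From here it's a one-liner to flag the worst misses for a closer look:
xt['abs_error'] = (xt['original_price'] - xt['predicted_price']).abs()
xt.sort_values('abs_error', ascending=False)[:10]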