Imports

import os,re,zipfile
import pandas as pd
import numpy as np
from types import SimpleNamespace
from matplotlib import pyplot as plt
plt.style.use('dark_background')
plt.style.use('seaborn')
base_size = 10
sizes=SimpleNamespace(**dict(small=(1*base_size,1*base_size),medium=(2*base_size,2*base_size),large=(3*base_size,3*base_size)))

Download and Unzip

%%time
zip_name = 'used-cars-database.zip'
if not os.path.exists(zip_name):
  os.environ['KAGGLE_USERNAME'] = "" # username from the json file
  os.environ['KAGGLE_KEY'] = "" # key from the json file
  !kaggle datasets download orgesleka/used-cars-database
Downloading used-cars-database.zip to /content
 52% 9.00M/17.3M [00:00<00:00, 29.0MB/s]
100% 17.3M/17.3M [00:00<00:00, 43.8MB/s]
CPU times: user 22.3 ms, sys: 3.06 ms, total: 25.4 ms
Wall time: 2.34 s
with zipfile.ZipFile(zip_name, 'r') as zip_ref:
    zip_ref.extractall(zip_name.split('.')[0])
os.listdir(zip_name.split('.')[0])
['autos.csv', 'cnt_km_year_powerPS_minPrice_maxPrice_avgPrice_sdPrice.csv']

Load data

auto = pd.read_csv(zip_name.split('.')[0]+'/autos.csv',encoding='ISO-8859-1')
auto
dateCrawled name seller offerType price abtest vehicleType yearOfRegistration gearbox powerPS model kilometer monthOfRegistration fuelType brand notRepairedDamage dateCreated nrOfPictures postalCode lastSeen
0 2016-03-24 11:52:17 Golf_3_1.6 privat Angebot 480 test NaN 1993 manuell 0 golf 150000 0 benzin volkswagen NaN 2016-03-24 00:00:00 0 70435 2016-04-07 03:16:57
1 2016-03-24 10:58:45 A5_Sportback_2.7_Tdi privat Angebot 18300 test coupe 2011 manuell 190 NaN 125000 5 diesel audi ja 2016-03-24 00:00:00 0 66954 2016-04-07 01:46:50
2 2016-03-14 12:52:21 Jeep_Grand_Cherokee_"Overland" privat Angebot 9800 test suv 2004 automatik 163 grand 125000 8 diesel jeep NaN 2016-03-14 00:00:00 0 90480 2016-04-05 12:47:46
3 2016-03-17 16:54:04 GOLF_4_1_4__3TÜRER privat Angebot 1500 test kleinwagen 2001 manuell 75 golf 150000 6 benzin volkswagen nein 2016-03-17 00:00:00 0 91074 2016-03-17 17:40:17
4 2016-03-31 17:25:20 Skoda_Fabia_1.4_TDI_PD_Classic privat Angebot 3600 test kleinwagen 2008 manuell 69 fabia 90000 7 diesel skoda nein 2016-03-31 00:00:00 0 60437 2016-04-06 10:17:21
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
371523 2016-03-14 17:48:27 Suche_t4___vito_ab_6_sitze privat Angebot 2200 test NaN 2005 NaN 0 NaN 20000 1 NaN sonstige_autos NaN 2016-03-14 00:00:00 0 39576 2016-04-06 00:46:52
371524 2016-03-05 19:56:21 Smart_smart_leistungssteigerung_100ps privat Angebot 1199 test cabrio 2000 automatik 101 fortwo 125000 3 benzin smart nein 2016-03-05 00:00:00 0 26135 2016-03-11 18:17:12
371525 2016-03-19 18:57:12 Volkswagen_Multivan_T4_TDI_7DC_UY2 privat Angebot 9200 test bus 1996 manuell 102 transporter 150000 3 diesel volkswagen nein 2016-03-19 00:00:00 0 87439 2016-04-07 07:15:26
371526 2016-03-20 19:41:08 VW_Golf_Kombi_1_9l_TDI privat Angebot 3400 test kombi 2002 manuell 100 golf 150000 6 diesel volkswagen NaN 2016-03-20 00:00:00 0 40764 2016-03-24 12:45:21
371527 2016-03-07 19:39:19 BMW_M135i_vollausgestattet_NP_52.720____Euro privat Angebot 28990 control limousine 2013 manuell 320 m_reihe 50000 8 benzin bmw nein 2016-03-07 00:00:00 0 73326 2016-03-22 03:17:10

371528 rows × 20 columns

missing_values_cols = auto.isnull().sum()
missing_values_cols = missing_values_cols[missing_values_cols > 0]
missing_values_cols = missing_values_cols.index.tolist()
missing_values_cols
['vehicleType', 'gearbox', 'model', 'fuelType', 'notRepairedDamage']
# auto.groupby(['category', 'name'])['value'].transform(lambda x: x.fillna(x.mean()))

Quick EDA

fig,ax = plt.subplots(2,4,figsize=(30,15))
auto.vehicleType.value_counts().plot.barh(ax=ax.ravel()[0],title='Number of vehicles by type')
auto.fuelType.value_counts().plot.barh(ax=ax.ravel()[1],title='Number of vehicles by fuel type')
auto.yearOfRegistration.value_counts()[:20].plot.barh(ax=ax.ravel()[2],title='Number of vehicles by year of registration (first 20)')
auto.groupby('vehicleType')[['price']].mean().plot.barh(ax=ax.ravel()[3],title='Mean price of vehicles by type')
auto.groupby('fuelType')[['price']].mean().plot.barh(ax=ax.ravel()[4],title='Mean price of vehicles fuel type')
auto.groupby('vehicleType')[['kilometer']].mean().plot.barh(ax=ax.ravel()[5],title='Mean kilometer of vehicles by type')
auto.groupby('fuelType')[['kilometer']].mean().plot.barh(ax=ax.ravel()[6],title='Mean kilometer of vehicles fuel type')
auto.gearbox.value_counts().plot.barh(ax=ax.ravel()[7],title='Number of vehicles by gearbox')
plt.tight_layout()

Kilometers and Prices by Brand and Model

tmp = auto.groupby(['brand','model']).agg({
    'kilometer': 'mean',
    'price': 'mean'
})
t = list(set([x[0] for x in tmp.index]))
fig, ax = plt.subplots(int(len(t) / 4)+1, 4,figsize=(50,50))

for i,brand in enumerate(t):
  try:
    tmp.loc[brand].plot.barh(ax=ax.ravel()[i], title=f'Mean (kilometer, price) for {brand.replace("_"," ").capitalize()} car models')
  except:
    print(f'error for {brand}')
    continue
plt.tight_layout()

After the quick overview let's build a model to help us set the price

XGBoost and LabelEncoder should do the job for us

from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from collections import defaultdict
d = defaultdict(LabelEncoder)
d
defaultdict(sklearn.preprocessing._label.LabelEncoder, {})

x_cols are our features and target is what we look for to predict

x_cols=['brand','model','kilometer','yearOfRegistration','monthOfRegistration','fuelType','vehicleType','notRepairedDamage','powerPS','gearbox']
target = 'price'
  • Copy the dataframe without the missing values
  • Define the columns that need encoding
  • Apply the encoding
auto_copy = auto.dropna().copy()
enc_cols = list(set(x_cols) - set(['powerPS','yearOfRegistration','monthOfRegistration','kilometer']))
auto_copy[enc_cols] = auto_copy[enc_cols].apply(lambda x: d[x.name].fit_transform(x))

After we dropped the missing values we still have 70% of the data

len(auto_copy) / len(auto)
0.7023858228720312

Now, let's split our data into train/test

X_train, X_test, y_train, y_test = train_test_split(auto_copy[x_cols], auto_copy[target], test_size=0.33, random_state=42)

Further on we can use the XGBRegressor with a gamma distribution and train it on the train data

%%time
xg = xgb.XGBRegressor(objective ='reg:gamma', n_estimators = 1000).fit(X_train,y_train)
CPU times: user 1min 23s, sys: 67.9 ms, total: 1min 23s
Wall time: 1min 23s

Make a dataframe with test target vs predicted test target

preds = pd.DataFrame({
    'true': y_test,
    'predicted': xg.predict(X_test)
})

Use MAE for our metric

metrics.mean_absolute_error(preds.true,preds.predicted)
2078.077884307823

Let's have a look into our Feature Importance

plt.figure(figsize=sizes.small)
xgb.plot_importance(xg)
plt.show()
<Figure size 720x720 with 0 Axes>

And here we have a copy of our test dataframe with decoded columns, but also with original and predicted price

xt = X_test.copy()
xt['original_price'] = y_test
xt['predicted_price'] = xg.predict(X_test)
xt[enc_cols] = xt[enc_cols].apply(lambda x: d[x.name].inverse_transform(x))
xt[:30]
brand model kilometer yearOfRegistration monthOfRegistration fuelType vehicleType notRepairedDamage powerPS gearbox original_price predicted_price
336427 volkswagen polo 150000 2003 12 benzin kleinwagen nein 75 manuell 2990 2454.927246
339513 ford fiesta 125000 1998 6 benzin kleinwagen nein 0 automatik 850 980.325439
54995 renault megane 150000 1999 11 benzin kombi nein 0 manuell 950 830.935181
297839 mercedes_benz a_klasse 150000 2000 5 diesel limousine nein 90 automatik 1500 1719.381470
353328 volkswagen andere 125000 1995 5 benzin limousine nein 90 automatik 1700 1200.977295
213296 bmw 3er 125000 1997 9 benzin limousine nein 102 manuell 2450 1498.408813
57967 opel agila 30000 2009 11 benzin kleinwagen nein 60 manuell 5850 5020.946777
114503 nissan micra 150000 2000 10 benzin kleinwagen nein 54 manuell 399 853.951660
126949 seat ibiza 80000 2010 8 diesel kleinwagen nein 143 manuell 10760 10703.540039
22843 audi a6 150000 2007 11 diesel kombi nein 179 manuell 10450 21701.361328
281191 mercedes_benz e_klasse 150000 2002 5 diesel limousine nein 150 manuell 5350 3735.010254
210491 seat leon 80000 2008 10 benzin limousine ja 125 manuell 3900 5240.312988
298231 fiat seicento 150000 2001 2 benzin kleinwagen nein 55 manuell 0 837.973206
246949 fiat punto 125000 2000 5 benzin kleinwagen nein 60 manuell 1100 935.326477
316596 volkswagen polo 150000 1997 10 benzin kleinwagen nein 60 manuell 500 750.544250
249809 bmw 5er 150000 2006 3 diesel limousine nein 163 automatik 10449 8034.567871
241669 suzuki swift 70000 2008 1 benzin kleinwagen nein 92 manuell 4500 4527.567871
288208 opel astra 50000 2000 9 benzin limousine nein 0 automatik 3100 2609.267090
56016 audi 80 50000 1994 1 benzin limousine nein 90 manuell 3500 2869.840820
274185 volkswagen golf 70000 2008 10 benzin coupe nein 161 automatik 10000 13441.850586
357311 volvo c_reihe 150000 2000 10 benzin cabrio nein 163 manuell 4500 4575.593262
279920 peugeot 3_reihe 150000 2008 5 diesel cabrio nein 136 manuell 4666 6645.180664
73242 mercedes_benz a_klasse 150000 1999 4 benzin limousine nein 82 manuell 2250 1340.504639
66915 opel astra 150000 2004 10 benzin limousine nein 105 manuell 3790 2607.811768
112509 volkswagen golf 150000 2004 5 benzin coupe nein 75 manuell 2500 3468.785645
265868 volkswagen golf 150000 1999 4 benzin limousine nein 125 manuell 1300 1867.259277
116073 mazda andere 150000 2004 9 diesel bus nein 136 manuell 1299 3971.486572
321078 kia sorento 150000 2003 7 diesel suv nein 140 manuell 4550 4021.498535
124545 opel astra 150000 2001 5 benzin cabrio nein 147 manuell 3900 2662.085449
246943 fiat punto 60000 2011 1 benzin kleinwagen nein 105 manuell 6850 6856.173828