Getting the price right remains, to this day, one of the greatest challenges plaguing businesses across industries and functions.
Matters only get more complex in the used goods market: a layer of the consumer's perceived value of the object is added to the price equation and interacts with every predictor variable.
Through this analysis, we aim to understand the levers of the used car market, from both a buying and a selling perspective.
Some of the factors we analyze are the condition, color, odometer reading, state of sale, manufacturer, cylinder configuration and type of drive. We visualize their variation with the car price and build insight into the workings of this industry.
Through this project, we hope to help consumers buy high-quality cars at reasonable prices and help the Craigslist platform properly allocate its inventory resources.
The global used car market was valued at USD 1,332.2 billion in 2019 and is expected to grow at a compound annual growth rate (CAGR) of 5.5% from 2020 to 2027. Used car shipments were recorded at 102.8 million units in 2019.
This market has shown growth potential over the past decade. The factors contributing to this growth - consumers' growing interest in used cars over new cars, price competitiveness, growing dealership networks - are also adding to the complexity of the industry.
In this climate, both consumers and sellers can gain significantly by making data-driven decisions: the right price to pay for a certain kind of vehicle, for the consumer, and the right inventory to stock to maximize profits, for the seller.
Our analysis targets this problem and looks to analyze used car prices through both a consumer's and a seller's lens.
Data preparation
Exploratory data analysis
Business analysis - based on correlation of variables and word cloud results.
# imports
from google.colab import drive
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import wordcloud
from wordcloud import WordCloud, STOPWORDS
# Mounting Google Drive locally
drive.mount('/content/gdrive', force_remount=True)
Mounted at /content/gdrive
# Import data and test
pd.read_csv("gdrive/Shared drives/BA780 Team2/datasets/vehicles.csv").head()
id | url | region | region_url | price | year | manufacturer | model | condition | cylinders | fuel | odometer | title_status | transmission | VIN | drive | size | type | paint_color | image_url | description | county | state | lat | long | posting_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7222695916 | https://prescott.craigslist.org/cto/d/prescott... | prescott | https://prescott.craigslist.org | 6000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | az | NaN | NaN | NaN |
1 | 7218891961 | https://fayar.craigslist.org/ctd/d/bentonville... | fayetteville | https://fayar.craigslist.org | 11900 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ar | NaN | NaN | NaN |
2 | 7221797935 | https://keys.craigslist.org/cto/d/summerland-k... | florida keys | https://keys.craigslist.org | 21000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | fl | NaN | NaN | NaN |
3 | 7222270760 | https://worcester.craigslist.org/cto/d/west-br... | worcester / central MA | https://worcester.craigslist.org | 1500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ma | NaN | NaN | NaN |
4 | 7210384030 | https://greensboro.craigslist.org/cto/d/trinit... | greensboro | https://greensboro.craigslist.org | 4900 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | nc | NaN | NaN | NaN |
# Select useful columns and store them in the Google shared drive.
drive.mount('/content/gdrive', force_remount=True)
vehicle_dataset = pd.read_csv("gdrive/Shared drives/BA780 Team2/datasets/vehicles.csv")
output = vehicle_dataset[['region', 'price', 'year', 'manufacturer', 'model',
'fuel', 'odometer', 'title_status', 'transmission',
'condition', 'cylinders','VIN', 'drive', 'size', 'type',
'paint_color', 'state', 'lat', 'long', 'posting_date']]
output.to_csv('gdrive/Shared drives/BA780 Team2/datasets/vehicle.csv', index=False)
Mounted at /content/gdrive
# Load the reduced dataset
data = pd.read_csv("gdrive/Shared drives/BA780 Team2/datasets/vehicle.csv")
data.head()
region | price | year | manufacturer | model | fuel | odometer | title_status | transmission | condition | cylinders | VIN | drive | size | type | paint_color | state | lat | long | posting_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | prescott | 6000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | az | NaN | NaN | NaN |
1 | fayetteville | 11900 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ar | NaN | NaN | NaN |
2 | florida keys | 21000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | fl | NaN | NaN | NaN |
3 | worcester / central MA | 1500 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ma | NaN | NaN | NaN |
4 | greensboro | 4900 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | nc | NaN | NaN | NaN |
Through data.head(), we can see a number of missing values in the dataset. We want to know the proportion of missing values for each variable, and we use a heatmap to visualize them.
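The per-column missing-value proportions can also be printed directly; a minimal sketch using the dataframe loaded above:

# Proportion of missing values per column, highest first
missing_share = data.isnull().mean().sort_values(ascending=False)
print(missing_share)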
sns.heatmap(data.isnull(),yticklabels=False,cbar=False, cmap='Greys')
According to the figure above, 'condition', 'cylinders', 'VIN', 'drive', 'size', 'type' and 'paint_color' have a large number of missing values. Since 'VIN' is the unique ID of the car, it will not be used in the subsequent analysis. As there are too many missing values in 'size', we also choose to delete this column.
# Drop columns 'size', 'VIN'
data.drop(columns=['VIN', 'size'], inplace= True)
sns.heatmap(data.isnull(),yticklabels=False,cbar=False, cmap='Greys')
After deleting these two columns, there are still a large number of missing values in the remaining columns. However, considering that our data volume is large and the columns are meaningful, we choose to retain them; otherwise we would lose some key factors affecting the price. After deleting all rows with missing values, we still have 115,435 rows of data, which is sufficient for our subsequent analysis.
print(data.shape)
data.dropna(inplace= True)
print(data.shape)
# Save dataset to Google Drive
data.to_csv('gdrive/Shared drives/BA780 Team2/datasets/vehicle_cleaned.csv', index=False)
(426880, 18)
(115435, 18)
# Read cleaned dataset
data = pd.read_csv("gdrive/Shared drives/BA780 Team2/datasets/vehicle_cleaned.csv")
data.head()
region | price | year | manufacturer | model | fuel | odometer | title_status | transmission | condition | cylinders | drive | type | paint_color | state | lat | long | posting_date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | auburn | 15000 | 2013.0 | ford | f-150 xlt | gas | 128000.0 | clean | automatic | excellent | 6 cylinders | rwd | truck | black | al | 32.5920 | -85.518900 | 2021-05-03T14:02:03-0500 |
1 | auburn | 27990 | 2012.0 | gmc | sierra 2500 hd extended cab | gas | 68696.0 | clean | other | good | 8 cylinders | 4wd | pickup | black | al | 32.5900 | -85.480000 | 2021-05-03T13:41:25-0500 |
2 | auburn | 34590 | 2016.0 | chevrolet | silverado 1500 double | gas | 29499.0 | clean | other | good | 6 cylinders | 4wd | pickup | silver | al | 32.5900 | -85.480000 | 2021-05-03T12:41:33-0500 |
3 | auburn | 35000 | 2019.0 | toyota | tacoma | gas | 43000.0 | clean | automatic | excellent | 6 cylinders | 4wd | truck | grey | al | 32.6013 | -85.443974 | 2021-05-03T12:12:59-0500 |
4 | auburn | 29990 | 2016.0 | chevrolet | colorado extended cab | gas | 17302.0 | clean | other | good | 6 cylinders | 4wd | pickup | red | al | 32.5900 | -85.480000 | 2021-05-03T11:31:14-0500 |
Identify outliers
By plotting a boxplot, we can see that Buick, Ford, GMC, Nissan and Toyota have outliers in their data.
fig, ax = plt.subplots(1,1, figsize = (12,7))
data.boxplot('price', 'manufacturer', ax = ax)
plt.suptitle('box plot for price')
plt.title(' ')
plt.ylabel('price')
plt.xticks(rotation = 90)
plt.show()
We choose a sensible price range for used cars: from $10,000 to $100,000.
data = data[(10000 < data['price']) & (data['price'] < 100000)]
Plotting the dataset again, we see that the boxplot has become much more normal, with a comprehensible price distribution.
fig, ax = plt.subplots(1,1, figsize = (12,7))
data.boxplot('price', 'manufacturer', ax = ax)
plt.suptitle('box plot for price')
plt.title(' ')
plt.ylabel('price')
plt.xticks(rotation = 90)
plt.show()
Adding manufacturer country column
# Find unique data for manufacturer and create a column with manufacturer's country
def country(row):
    if row['manufacturer'] in ['ford', 'gmc', 'chevrolet', 'jeep', 'chrysler', 'ram', 'cadillac', 'lincoln', 'saturn', 'buick', 'pontiac', 'tesla', 'harley-davidson', 'mercury']:
        val = 'American Made'
    elif row['manufacturer'] in ['toyota', 'nissan', 'honda', 'dodge', 'infiniti', 'mazda', 'lexus', 'mitsubishi', 'acura', 'subaru', 'datsun']:
        val = 'Japanese Made'
    elif row['manufacturer'] in ['mercedes-benz', 'bmw', 'volkswagen', 'porsche', 'audi']:
        val = 'German Made'
    elif row['manufacturer'] in ['ferrari', 'fiat', 'alfa-romeo']:
        val = 'Italian Made'
    elif row['manufacturer'] in ['kia', 'hyundai']:
        val = 'Korean Made'
    elif row['manufacturer'] in ['land rover', 'jaguar']:
        val = 'Indian Made'
    elif row['manufacturer'] in ['volvo']:
        val = 'Swedish Made'
    elif row['manufacturer'] in ['rover', 'mini', 'aston-martin', 'morgan']:
        val = 'British Made'
    else:
        val = 'Other'  # fallback so an unlisted manufacturer does not raise an UnboundLocalError
    return val
data['Made'] = data.apply(country, axis=1)
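As a quick sanity check that every manufacturer received a label, the distribution of the new column can be printed; a small sketch:

# Distribution of the new country-of-origin column
print(data['Made'].value_counts())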
Recode the categorical variables as custom numbers
data_copy = data.copy()
# Print unique data for categorical variables
print('condition: ', data_copy['condition'].unique())
print('cylinders: ', data_copy['cylinders'].unique())
print('fuel: ', data_copy['fuel'].unique())
print('title_status: ', data_copy['title_status'].unique())
print('transmission: ', data_copy['transmission'].unique())
print('drive: ', data_copy['drive'].unique())
print('type: ', data_copy['type'].unique())
print('paint_color: ', data_copy['paint_color'].unique())
print('Made: ', data_copy['Made'].unique())
condition: ['excellent' 'good' 'new' 'like new' 'fair' 'salvage']
cylinders: ['6 cylinders' '8 cylinders' '4 cylinders' '10 cylinders' 'other' '3 cylinders' '5 cylinders' '12 cylinders']
fuel: ['gas' 'diesel' 'other' 'hybrid' 'electric']
title_status: ['clean' 'rebuilt' 'salvage' 'lien' 'missing' 'parts only']
transmission: ['automatic' 'other' 'manual']
drive: ['rwd' '4wd' 'fwd']
type: ['truck' 'pickup' 'other' 'coupe' 'mini-van' 'SUV' 'sedan' 'offroad' 'convertible' 'wagon' 'van' 'hatchback' 'bus']
paint_color: ['black' 'silver' 'grey' 'red' 'blue' 'white' 'brown' 'yellow' 'green' 'custom' 'orange' 'purple']
Made: ['American Made' 'Japanese Made' 'German Made' 'Italian Made' 'Korean Made' 'British Made' 'Swedish Made' 'Indian Made']
condition = {'new': 6, 'like new': 5, 'excellent': 4, 'good': 3, 'fair': 2, 'salvage': 1}
data_copy['condition'] = data_copy['condition'].replace(condition)
df = pd.DataFrame(list(condition.items()),columns = ['condition','Code'])
df
condition | Code | |
---|---|---|
0 | new | 6 |
1 | like new | 5 |
2 | excellent | 4 |
3 | good | 3 |
4 | fair | 2 |
5 | salvage | 1 |
cylinders = {'8 cylinders': 8, '6 cylinders': 6, '4 cylinders': 4, '5 cylinders': 5, 'other': 0, '3 cylinders': 3, '10 cylinders': 10, '12 cylinders':12}
data_copy['cylinders'] = data_copy['cylinders'].replace(cylinders)
df = pd.DataFrame(list(cylinders.items()),columns = ['cylinders','Code'])
df
cylinders | Code | |
---|---|---|
0 | 8 cylinders | 8 |
1 | 6 cylinders | 6 |
2 | 4 cylinders | 4 |
3 | 5 cylinders | 5 |
4 | other | 0 |
5 | 3 cylinders | 3 |
6 | 10 cylinders | 10 |
7 | 12 cylinders | 12 |
fuel = {'electric': 4, 'hybrid': 3, 'gas': 2, 'diesel': 1, 'other': 0}
data_copy['fuel'] = data_copy['fuel'].replace(fuel)
df = pd.DataFrame(list(fuel.items()),columns = ['fuel','Code'])
df
fuel | Code | |
---|---|---|
0 | electric | 4 |
1 | hybrid | 3 |
2 | gas | 2 |
3 | diesel | 1 |
4 | other | 0 |
title_status = {'clean': 6, 'rebuilt': 5, 'salvage': 4, 'missing': 3, 'lien': 2, 'parts only': 1}
data_copy['title_status'] = data_copy['title_status'].replace(title_status)
df = pd.DataFrame(list(title_status.items()),columns = ['title_status','Code'])
df
title_status | Code | |
---|---|---|
0 | clean | 6 |
1 | rebuilt | 5 |
2 | salvage | 4 |
3 | missing | 3 |
4 | lien | 2 |
5 | parts only | 1 |
transmission = {'automatic': 2, 'manual': 1, 'other': 0}
data_copy['transmission'] = data_copy['transmission'].replace(transmission)
df = pd.DataFrame(list(transmission.items()),columns = ['transmission','Code'])
df
transmission | Code | |
---|---|---|
0 | automatic | 2 |
1 | manual | 1 |
2 | other | 0 |
drive = {'4wd': 3, 'rwd': 2, 'fwd': 1}
data_copy['drive'] = data_copy['drive'].replace(drive)
df = pd.DataFrame(list(drive.items()),columns = ['drive','Code'])
df
drive | Code | |
---|---|---|
0 | 4wd | 3 |
1 | rwd | 2 |
2 | fwd | 1 |
vehicle_type = {'truck': 12, 'pickup': 11, 'coupe': 10, 'mini-van': 9, 'SUV': 8, 'sedan': 7, 'offroad': 6, 'van': 5, 'convertible': 4, 'hatchback': 3, 'wagon': 2, 'bus': 1, 'other': 0}  # named vehicle_type to avoid shadowing the built-in type()
data_copy['type'] = data_copy['type'].replace(vehicle_type)
df = pd.DataFrame(list(vehicle_type.items()),columns = ['type','Code'])
df
type | Code | |
---|---|---|
0 | truck | 12 |
1 | pickup | 11 |
2 | coupe | 10 |
3 | mini-van | 9 |
4 | SUV | 8 |
5 | sedan | 7 |
6 | offroad | 6 |
7 | van | 5 |
8 | convertible | 4 |
9 | hatchback | 3 |
10 | wagon | 2 |
11 | bus | 1 |
12 | other | 0 |
paint_color = {'black': 12, 'silver': 11, 'grey': 10, 'red': 9, 'blue': 8, 'white': 7, 'brown': 6, 'yellow': 5, 'green': 4, 'custom': 3, 'purple': 2, 'orange': 1}
data_copy['paint_color'] = data_copy['paint_color'].replace(paint_color)
df = pd.DataFrame(list(paint_color.items()),columns = ['paint_color','Code'])
df
paint_color | Code | |
---|---|---|
0 | black | 12 |
1 | silver | 11 |
2 | grey | 10 |
3 | red | 9 |
4 | blue | 8 |
5 | white | 7 |
6 | brown | 6 |
7 | yellow | 5 |
8 | green | 4 |
9 | custom | 3 |
10 | purple | 2 |
11 | orange | 1 |
Made = {'American Made': 8, 'Japanese Made': 7, 'German Made': 6, 'Italian Made': 5, 'Korean Made': 4, 'British Made': 3, 'Indian Made': 2, 'Swedish Made': 1}
data_copy['Made'] = data_copy['Made'].replace(Made)
df = pd.DataFrame(list(Made.items()),columns = ['Made','Code'])
df
Made | Code | |
---|---|---|
0 | American Made | 8 |
1 | Japanese Made | 7 |
2 | German Made | 6 |
3 | Italian Made | 5 |
4 | Korean Made | 4 |
5 | British Made | 3 |
6 | Indian Made | 2 |
7 | Swedish Made | 1 |
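As an aside, the eight recoding cells above follow the same replace-and-lookup pattern and could be consolidated into one loop; a sketch that would be equivalent to (and replace, not run after) those cells:

# Apply every code mapping and print its lookup table in one pass
mappings = {'condition': condition, 'cylinders': cylinders, 'fuel': fuel,
            'title_status': title_status, 'transmission': transmission,
            'drive': drive, 'type': vehicle_type, 'paint_color': paint_color,
            'Made': Made}
for col, mapping in mappings.items():
    data_copy[col] = data_copy[col].replace(mapping)
    print(pd.DataFrame(list(mapping.items()), columns=[col, 'Code']))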
Finally, we have a clean table, and we will use it for further data analytics.
vehicle_ml.csv will be used in correlation processing and machine learning.
vehicle_cleaned.csv will be used in data visualization.
data_copy.to_csv('gdrive/Shared drives/BA780 Team2/datasets/vehicle_ml.csv', index=False)
data_copy = pd.read_csv('gdrive/Shared drives/BA780 Team2/datasets/vehicle_ml.csv')
data_copy.head()
region | price | year | manufacturer | model | fuel | odometer | title_status | transmission | condition | cylinders | drive | type | paint_color | state | lat | long | posting_date | Made | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | auburn | 15000 | 2013.0 | ford | f-150 xlt | 2 | 128000.0 | 6 | 2 | 4 | 6 | 2 | 12 | 12 | al | 32.5920 | -85.518900 | 2021-05-03T14:02:03-0500 | 8 |
1 | auburn | 27990 | 2012.0 | gmc | sierra 2500 hd extended cab | 2 | 68696.0 | 6 | 0 | 3 | 8 | 3 | 11 | 12 | al | 32.5900 | -85.480000 | 2021-05-03T13:41:25-0500 | 8 |
2 | auburn | 34590 | 2016.0 | chevrolet | silverado 1500 double | 2 | 29499.0 | 6 | 0 | 3 | 6 | 3 | 11 | 11 | al | 32.5900 | -85.480000 | 2021-05-03T12:41:33-0500 | 8 |
3 | auburn | 35000 | 2019.0 | toyota | tacoma | 2 | 43000.0 | 6 | 2 | 4 | 6 | 3 | 12 | 10 | al | 32.6013 | -85.443974 | 2021-05-03T12:12:59-0500 | 7 |
4 | auburn | 29990 | 2016.0 | chevrolet | colorado extended cab | 2 | 17302.0 | 6 | 0 | 3 | 6 | 3 | 11 | 9 | al | 32.5900 | -85.480000 | 2021-05-03T11:31:14-0500 | 8 |
data.to_csv('gdrive/Shared drives/BA780 Team2/datasets/vehicle_cleaned.csv', index=False)
data = pd.read_csv("gdrive/Shared drives/BA780 Team2/datasets/vehicle_cleaned.csv")
data.head()
region | price | year | manufacturer | model | fuel | odometer | title_status | transmission | condition | cylinders | drive | type | paint_color | state | lat | long | posting_date | Made | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | auburn | 15000 | 2013.0 | ford | f-150 xlt | gas | 128000.0 | clean | automatic | excellent | 6 cylinders | rwd | truck | black | al | 32.5920 | -85.518900 | 2021-05-03T14:02:03-0500 | American Made |
1 | auburn | 27990 | 2012.0 | gmc | sierra 2500 hd extended cab | gas | 68696.0 | clean | other | good | 8 cylinders | 4wd | pickup | black | al | 32.5900 | -85.480000 | 2021-05-03T13:41:25-0500 | American Made |
2 | auburn | 34590 | 2016.0 | chevrolet | silverado 1500 double | gas | 29499.0 | clean | other | good | 6 cylinders | 4wd | pickup | silver | al | 32.5900 | -85.480000 | 2021-05-03T12:41:33-0500 | American Made |
3 | auburn | 35000 | 2019.0 | toyota | tacoma | gas | 43000.0 | clean | automatic | excellent | 6 cylinders | 4wd | truck | grey | al | 32.6013 | -85.443974 | 2021-05-03T12:12:59-0500 | Japanese Made |
4 | auburn | 29990 | 2016.0 | chevrolet | colorado extended cab | gas | 17302.0 | clean | other | good | 6 cylinders | 4wd | pickup | red | al | 32.5900 | -85.480000 | 2021-05-03T11:31:14-0500 | American Made |
In the following two parts, we will analyze the data in different dimensions - Statistics & Business Problems.
We will first understand the statistical properties of the data so that we can decide which properties to analyze further.
In the statistical analysis section, we expect numerical data, so we use the vehicle_ml.csv file instead.
data = pd.read_csv("gdrive/Shared drives/BA780 Team2/datasets/vehicle_ml.csv")
# Print description for each variables
data.describe()
price | year | fuel | odometer | title_status | transmission | condition | cylinders | drive | type | paint_color | lat | long | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 59989.000000 | 59989.000000 | 59989.000000 | 5.998900e+04 | 59989.000000 | 59989.000000 | 59989.000000 | 59989.000000 | 59989.000000 | 59989.000000 | 59989.000000 | 59989.000000 | 59989.000000 |
mean | 24607.909083 | 2012.192585 | 1.872227 | 7.971816e+04 | 5.906099 | 1.557286 | 3.729750 | 6.369634 | 2.324660 | 8.395973 | 8.938622 | 38.486827 | -92.650564 |
std | 12004.580604 | 9.972958 | 0.438481 | 1.262387e+05 | 0.490186 | 0.794879 | 0.711955 | 1.591310 | 0.787214 | 3.120328 | 2.404889 | 5.776469 | 17.578955 |
min | 10020.000000 | 1900.000000 | 0.000000 | 0.000000e+00 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | -81.838232 | -159.719900 |
25% | 14900.000000 | 2011.000000 | 2.000000 | 3.205100e+04 | 6.000000 | 1.000000 | 3.000000 | 6.000000 | 2.000000 | 7.000000 | 7.000000 | 34.935000 | -103.717900 |
50% | 21950.000000 | 2015.000000 | 2.000000 | 7.067200e+04 | 6.000000 | 2.000000 | 4.000000 | 6.000000 | 3.000000 | 8.000000 | 9.000000 | 39.055400 | -86.634478 |
75% | 32200.000000 | 2017.000000 | 2.000000 | 1.109640e+05 | 6.000000 | 2.000000 | 4.000000 | 8.000000 | 3.000000 | 11.000000 | 11.000000 | 42.404100 | -80.278473 |
max | 99999.000000 | 2022.000000 | 4.000000 | 1.000000e+07 | 6.000000 | 2.000000 | 6.000000 | 12.000000 | 3.000000 | 12.000000 | 12.000000 | 73.325010 | 139.691700 |
The table above yields some interesting summary statistics.
Correlation analysis is very meaningful to us and can help us select appropriate variables for exploring the business problems.
In the correlation results below, we can see that 'year', 'fuel', 'odometer', 'transmission', 'cylinders', 'drive', 'type' and 'Made' are variables that influence price.
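To rank predictors by the strength of their linear association with price, the absolute correlations can be sorted; a small sketch:

# Absolute correlation of each numeric variable with price, strongest first
price_corr = data.corr()['price'].abs().sort_values(ascending=False)
print(price_corr)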
data.corr()
price | year | fuel | odometer | title_status | transmission | condition | cylinders | drive | type | paint_color | lat | long | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
price | 1.000000 | 0.189845 | -0.221488 | -0.201135 | 0.032619 | -0.331418 | -0.075689 | 0.323408 | 0.233129 | 0.151558 | -0.015687 | -0.026680 | 0.030282 |
year | 0.189845 | 1.000000 | -0.002755 | -0.132139 | -0.029916 | -0.116013 | -0.015840 | -0.209451 | 0.001557 | -0.032472 | 0.124126 | 0.003867 | 0.027889 |
fuel | -0.221488 | -0.002755 | 1.000000 | -0.051595 | -0.022239 | 0.052831 | 0.119535 | -0.184885 | -0.144181 | -0.167953 | 0.059338 | 0.013363 | -0.038496 |
odometer | -0.201135 | -0.132139 | -0.051595 | 1.000000 | 0.012987 | 0.175588 | 0.010593 | 0.071487 | 0.080813 | 0.090448 | -0.023020 | 0.004204 | -0.026391 |
title_status | 0.032619 | -0.029916 | -0.022239 | 0.012987 | 1.000000 | -0.083462 | -0.072381 | 0.054584 | 0.001417 | -0.012550 | -0.006388 | -0.017047 | 0.013671 |
transmission | -0.331418 | -0.116013 | 0.052831 | 0.175588 | -0.083462 | 1.000000 | 0.468616 | -0.115187 | -0.003642 | 0.126242 | -0.021872 | 0.093544 | -0.138929 |
condition | -0.075689 | -0.015840 | 0.119535 | 0.010593 | -0.072381 | 0.468616 | 1.000000 | -0.168673 | -0.032885 | 0.032923 | 0.026443 | 0.002702 | -0.106107 |
cylinders | 0.323408 | -0.209451 | -0.184885 | 0.071487 | 0.054584 | -0.115187 | -0.168673 | 1.000000 | 0.319786 | 0.340173 | -0.060274 | -0.039585 | 0.093029 |
drive | 0.233129 | 0.001557 | -0.144181 | 0.080813 | 0.001417 | -0.003642 | -0.032885 | 0.319786 | 1.000000 | 0.298624 | 0.014999 | 0.157868 | 0.067188 |
type | 0.151558 | -0.032472 | -0.167953 | 0.090448 | -0.012550 | 0.126242 | 0.032923 | 0.340173 | 0.298624 | 1.000000 | -0.057390 | 0.007754 | 0.010963 |
paint_color | -0.015687 | 0.124126 | 0.059338 | -0.023020 | -0.006388 | -0.021872 | 0.026443 | -0.060274 | 0.014999 | -0.057390 | 1.000000 | 0.016283 | 0.003216 |
lat | -0.026680 | 0.003867 | 0.013363 | 0.004204 | -0.017047 | 0.093544 | 0.002702 | -0.039585 | 0.157868 | 0.007754 | 0.016283 | 1.000000 | -0.080925 |
long | 0.030282 | 0.027889 | -0.038496 | -0.026391 | 0.013671 | -0.138929 | -0.106107 | 0.093029 | 0.067188 | 0.010963 | 0.003216 | -0.080925 | 1.000000 |
plt.figure(figsize=(20, 10))
corr = data.corr().abs()
ax = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, linewidths=.8, cmap="YlOrBr", annot=True)
plt.title('Correlation Between Each Variables', fontsize=25)
sns.despine();
A word cloud is an interesting graph drawn from user comments. We combine the correlation results with the word cloud results to determine the following analysis.
In the figure, the keywords users mention most often are: credit, seat, wheel, power, front, rear, miles, drive, price, auto, financing, color, etc.
Based on these keywords, we conclude that the factors users care most about are price, drive, cylinders, color and odometer.
# Please make sure you have at least 20 GB of free RAM before running this cell
#words = pd.read_csv("gdrive/Shared drives/BA780 Team2/datasets/words.csv")
#sns.set(rc={'figure.figsize':(20,10)})
#description = words['description']
#stop=['ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 'it', 'how', 'further', 'was', 'here', 'than']
#stopwords= set(STOPWORDS)
#stopwords.update(stop)
#wordcloud = WordCloud(width=600, height=300, collocations= False, stopwords=stopwords,max_font_size=50, max_words=300, background_color="white").generate(' '.join(str(t) for t in description))
#plt.imshow(wordcloud, interpolation='bilinear')
#plt.axis("off")
#plt.show()
Through this analysis, we look to provide two points of view: that of the companies that deal in used cars, and that of the consumer who looks to purchase a used car.
We have selected variables of value for each category and look to understand their individual correlation with the price and popularity of a used car.
What are we solving for?
I am company X and I'm in the business of used cars.
I just moved to a new city for college, I am budget conscious and need a used car to drive around.
CEOs of used-car companies will be most interested in which types of cars sell at the highest prices or are the most popular. In this portion, we speculate that car price varies with manufacturer, drive and color.
Overall, we see that Ferraris and Jaguars are the most expensive cars, while Volvos and Rovers have the lowest average sale price.
The average price over time has not changed significantly, but there are many more high-priced cars in the past decade than before, pointing to a growing demand for expensive used cars.
The average price is lowest for the fwd drive type.
Red and white cars seem to fetch the highest prices, but the overall price variation across colors is low. The cars sold most often are either white or black, which could also simply reflect a larger number of cars being white or black.
In this section, we analyze the variation in price and sales popularity across several predictors, with the overall goal of assessing which predictors are most impactful.
The variables we consider in this section are the color of the car, its manufacturer, time of sale and number of cylinders. We have selected these since companies can then identify and invest in cars that are most likely to sell and bring a good price.
data = pd.read_csv("gdrive/Shared drives/BA780 Team2/datasets/vehicle_cleaned.csv")
Most of the top 10 most expensive manufacturers by average price tend to have either gas or electric engines.
# average price by Manufacturer, and its fuel type
data.groupby(['manufacturer','fuel']).agg({'price' : 'mean'}).sort_values(by='price', ascending=False).head(10).reset_index('fuel')
fuel | price | |
---|---|---|
manufacturer | ||
ferrari | gas | 83346.111111 |
jaguar | electric | 65000.000000 |
aston-martin | gas | 53367.000000 |
rover | other | 52886.000000 |
land rover | gas | 51497.500000 |
bmw | other | 50995.000000 |
tesla | electric | 49134.358974 |
cadillac | other | 43914.162162 |
volvo | electric | 43470.000000 |
rover | diesel | 42212.631579 |
Average price by manufacturer
This plot shows the top 10 manufacturers in terms of average price. From this, we can see which manufacturers have used cars that sell at the highest value.
# line plot of average price for the top 10 manufacturers
top10_man = data.groupby('manufacturer').agg({'price': 'mean'}).sort_values(by='price', ascending=False).head(10).reset_index()
man_df = pd.DataFrame(top10_man)
top_man = sns.lineplot(x='manufacturer', y='price', data=man_df).set(title='Average Price by Manufacturer')
plt.xticks(rotation=45);
Next, we want to know whether prices differ across regions. This graph surprised us: the average price of used cars on the west coast is generally lower than on the east coast. This finding is of great significance to used-car companies, because knowing the average price on each coast allows them to formulate appropriate pricing strategies.
# Aggregate to the state level first so the map truly shows per-state averages
state_avg = data.groupby('state', as_index=False)['price'].mean()
fig = go.Figure(data = go.Choropleth(
    locations=state_avg['state'].str.upper(),
    z=state_avg['price'],
    locationmode='USA-states',
    colorscale='YlOrBr',
    colorbar_title="Avg Price",
    hovertext=state_avg['state'],
), layout=dict(title = 'Average Price by States'))
fig.update_layout(
geo_scope = 'usa'
)
fig.show()
Next, we suspect that inventory is a factor affecting prices. Mapping the number of vehicles in the database shows that vehicle inventory is slightly higher on the west coast than on the east coast, suggesting that inventory is indeed a factor influencing prices.
mapcount = data.groupby('state', dropna=False)['state'].count().reset_index(name="count")
fig = go.Figure(data = go.Choropleth(
locations=mapcount['state'].str.upper(),
z=mapcount['count'],
locationmode='USA-states',
colorscale='YlOrBr',
colorbar_title="Total",
    hovertext=mapcount['state'],
), layout=dict(title = 'Total Vehicles by States'))
fig.update_layout(
geo_scope = 'usa'
)
fig.show()
In addition, we looked up incomes and price levels for the west coast and the east coast on Google: both prices and incomes on the east coast are generally higher than on the west coast.
While gasoline remains the most favoured fuel, diesel usage has significantly declined and electric has increased.
z = plt.subplots(figsize=(20,10))
z = sns.countplot(x=data.year.astype(int), hue="fuel", data=data, orient = 'v')
z.set_title('Fuel usage over time', pad=12)
z.set_ylabel('# of cars sold')
plt.legend(loc='upper left')
plt.xticks(rotation=90);
Prices were higher at the beginning of the dataset. They then decreased slightly and remained consistent until the last decade, when they began to increase.
f, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Price vs Year', pad=12)
fig = sns.boxplot(x=data.year.astype(int), y='price', data=data)
plt.xticks(rotation=90);
Comparing the GDP of the 1990s with today's, this price increase appears closely related to changes in GDP.
A majority of used cars have prices ranging from $10K to $40K, and prices between $10K and $20K are most prevalent.
The price limits were set to remove outliers, as the 75th percentile of the data lies within the chosen interval.
# The variation of price of used cars
# The price limits were set to remove outliers, as the 75th percentile of the data lies within the chosen interval
l = pd.to_numeric(data['price']).hist(range=[1000,100000], histtype='barstacked')
plt.title('Variation in price')
l.set_xlabel('Price ($)')
l.set_ylabel('Count');
At lower odometer readings, drive type and price do not seem to be correlated. However, RWD is the only drive type prevalent at high odometer readings.
m = sns.relplot(x='odometer',y = 'price' , kind='scatter', data=data, hue = 'drive')
Our analysis shows that cars with FWD have the lowest median price, whereas cars with 4WD have the highest. Some interesting results can be seen in the box plot: for example, outlier values for FWD are generally small and scattered, while outlier values for RWD and 4WD are generally concentrated and high.
This means that used car companies carry fewer luxury vehicles with FWD and more with RWD and 4WD. If customers want a luxury vehicle, companies can recommend 4WD and RWD first; if customers are more price conscious, they can recommend FWD vehicles first.
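The median claim above can be verified directly; a quick check:

# Median price by drive type, supporting the FWD-lowest / 4WD-highest claim
print(data.groupby('drive')['price'].median().sort_values())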
avg_price_drive = data.groupby("drive")['price'].mean()
sns.catplot(x = 'drive', y = 'price', kind = 'box', data = data).set(title='Average price of cars by type of drive');
Adding to the plot above, this graph shows that there is the most variation in price for cars with FWD. The violin plot confirms our view by showing that FWD has a large inventory in the low price range, while RWD and 4WD inventories sit at relatively higher prices.
sns.set(rc={'figure.figsize':(12,8)})
sns.violinplot(y = 'price',data=data, x='drive')
plt.ylabel('Price [$]')
plt.xlabel('Drive')
Our analysis showed that there doesn't seem to be a correlation between a car's price and its color - meaning that the color of the car does not suggest a higher or lower price.
However, we did find that the most popular color for cars sold are white, black, silver, and red.
While white is the most popular colour, Ford and Chevrolet sell the most white cars in the market.
The graph below is interactive; please feel free to filter colors or manufacturers.
x = data.manufacturer
y = data.paint_color
fig = go.Figure(go.Histogram2d(
x=x,
y=y
))
fig.show()
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59989 entries, 0 to 59988
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   region        59989 non-null  object
 1   price         59989 non-null  int64
 2   year          59989 non-null  float64
 3   manufacturer  59989 non-null  object
 4   model         59989 non-null  object
 5   fuel          59989 non-null  object
 6   odometer      59989 non-null  float64
 7   title_status  59989 non-null  object
 8   transmission  59989 non-null  object
 9   condition     59989 non-null  object
 10  cylinders     59989 non-null  object
 11  drive         59989 non-null  object
 12  type          59989 non-null  object
 13  paint_color   59989 non-null  object
 14  state         59989 non-null  object
 15  lat           59989 non-null  float64
 16  long          59989 non-null  float64
 17  posting_date  59989 non-null  object
 18  Made          59989 non-null  object
dtypes: float64(4), int64(1), object(14)
memory usage: 8.7+ MB
n = data.groupby(['state','manufacturer']).agg(['count']).rename(columns={'count': 'total_sales'})
n.reset_index(inplace=True)
n = n[['state','manufacturer','price']]
n1 = n.groupby(['state']).apply(lambda x: x.sort_values([('price', 'total_sales')], ascending = False)).reset_index(drop=True)
n1
state | manufacturer | price | |
---|---|---|---|
total_sales | |||
0 | ak | ford | 105 |
1 | ak | chevrolet | 83 |
2 | ak | toyota | 67 |
3 | ak | ram | 47 |
4 | ak | bmw | 45 |
... | ... | ... | ... |
1579 | wy | mitsubishi | 1 |
1580 | wy | nissan | 1 |
1581 | wy | porsche | 1 |
1582 | wy | subaru | 1 |
1583 | wy | bmw | 1 |
1584 rows × 3 columns
plot_order = data.groupby('paint_color')['price'].mean().sort_values(ascending=False).index.values
plot_order
array(['red', 'white', 'black', 'yellow', 'orange', 'purple', 'green', 'blue', 'silver', 'brown', 'custom', 'grey'], dtype=object)
colors = {'white': '#FFFFFF', 'black':'#000000' , 'silver':'#C0C0C0', 'blue': '#0000FF', 'red':'#FF0000', 'grey': '#808080', 'green': '#00FF00', 'brown':'#964B00', 'custom':'#FFFF33' , 'yellow':'#FFE633', 'orange':'#FF9C33', 'purple':'#A020F0'}
custom_palette = ['white', 'black', 'silver', 'blue', 'red', 'grey', 'green', 'brown', 'custom', 'yellow', 'orange', 'purple']
plot_order = data.groupby('paint_color')['price'].mean().sort_values(ascending=False).index.values
sns.catplot(x = 'paint_color', y = 'price', kind = 'bar', data=data, height=6, aspect=2, palette=colors, ci=None, legend_out=False, order=plot_order).set(title='Average price of cars by color')
plt.xticks(rotation=45)
sns.set(font_scale=1.5)
plt.xlabel("Color")
plt.ylabel("Average Price of Cars Sold")
sns.set_style("darkgrid");
# plotting the number of cars sold in each color
colors = {'white': '#FFFFFF', 'black':'#000000' , 'silver':'#C0C0C0', 'red':'#FF0000', 'blue': '#0000FF', 'grey': '#808080', 'green': '#00FF00', 'brown':'#964B00', 'custom':'#FFFF33' , 'yellow':'#FFE633', 'orange':'#FF9C33', 'purple':'#A020F0'}
custom_palette = ['white', 'black', 'silver', 'red', 'blue', 'grey', 'green', 'brown', 'custom', 'yellow', 'orange', 'purple']
sns.catplot(x='paint_color', data=data, kind='count', height=6, aspect=4, order=custom_palette, palette=colors).set(title='Number of Cars Sold in Each Color')
sns.set(font_scale=1.5)
plt.xlabel("Color")
plt.ylabel("Number of Cars Sold")
sns.set_style("darkgrid");
Our data shows that the higher the number of cylinders in a car, the greater its price: there is a positive correlation between price and number of cylinders.
In the first graph, the price was computed as the average by cylinder count, with 12-cylinder cars having the highest average price. This is most likely because cars with more power sell at higher prices.
plot_order = data.groupby('cylinders')['price'].mean().sort_values(ascending=False).index.values
sns.catplot(data=data, x='cylinders', y='price',kind='bar',ci=None, legend_out=False, order=plot_order)
plt.ylabel('Price')
plt.xlabel('cylinder')
plt.xticks(rotation=45)
sns.set(font_scale=0.8)
The second graph shows the number of cylinders vs. price, with each dot representing an observation. The difference in results between the two graphs is most likely caused by outliers in the data.
sns.set(rc={'figure.figsize':(10,15)})
sns.catplot(data=data, y='price',x='cylinders',alpha=0.5, aspect=1.5)
plt.ylabel('Price')
plt.xlabel('cylinder')
plt.xticks(rotation=45)
sns.set(font_scale=1.5)
Customers typically want to buy a car with less distance travelled on it at the lowest possible price. We therefore compare average price against average distance travelled.
In the radar chart, we can see that Italian made cars might be the best choice if you are looking for a car at the lowest price with the fewest miles travelled.
avg_price = data.groupby("Made")['price'].mean()
avg_odometer = data.groupby("Made")['odometer'].mean()
price_odometer = pd.merge(avg_price, avg_odometer, on = 'Made')
price_odometer.reset_index(level = 0, inplace = True)
import numpy as np
import matplotlib.pyplot as plt
price_odometer
made = price_odometer['Made']
price = price_odometer['price']
odometer = price_odometer['odometer']
price = [*price, price[0]]
odometer = [*odometer, odometer[0]]
label_loc = np.linspace(start=0, stop=2 * np.pi, num=len(price))
# angles=np.linspace(0,2*np.pi,len(price), endpoint=False)
plt.figure(figsize=(8, 8))
plt.subplot(polar=True)
plt.plot(label_loc, price, label='price')
plt.plot(label_loc, odometer, label='distance travelled')
plt.title('Difference between distance travelled and price', size=20)
lines, labels = plt.thetagrids(np.degrees(label_loc), labels=made)
# Fill area
# ax.plot(angles,price, 'o--', color='g', label='price')
# ax.fill(label_loc, price, alpha=0.1, color='g')
plt.legend()
plt.show()
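One caveat: price and odometer live on very different scales, so the larger series dominates the radar chart. Max-scaling both series first would let the chart compare shapes rather than magnitudes; a minimal sketch reusing the variables above:

# Rescale each series to [0, 1] before drawing the radar chart
price_scaled = [p / max(price) for p in price]
odometer_scaled = [o / max(odometer) for o in odometer]
plt.figure(figsize=(8, 8))
plt.subplot(polar=True)
plt.plot(label_loc, price_scaled, label='price (scaled)')
plt.plot(label_loc, odometer_scaled, label='distance travelled (scaled)')
plt.thetagrids(np.degrees(label_loc), labels=made)
plt.legend()
plt.show()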
Diving deeper into Italian made cars, we see that if we want the Italian made car with the lowest average price, we should buy a Fiat.
Italian_made = data[data['Made'] == 'Italian Made']
Italian_made[['price', 'manufacturer']]
Italian_manufacturers = Italian_made.groupby('manufacturer')['price'].mean()
Italian_manufacturers = pd.DataFrame(Italian_manufacturers)
Italian_manufacturers.reset_index(level = 0, inplace = True)
g = sns.barplot(x = Italian_manufacturers['manufacturer'], y = Italian_manufacturers['price'], data = Italian_manufacturers, palette="Blues_d")
g.set_title('Price for Italian made cars')
plt.xticks(rotation = 45)
sns.set(rc={'figure.figsize':(2,1)})
sns.set(font_scale=1.2)
plt.show()
In this radar chart, we see that cars with 5 cylinders are the best choice, as they have, on average, the lowest price with the fewest miles travelled.
avg_price = data.groupby("cylinders")['price'].mean()
avg_odometer = data.groupby("cylinders")['odometer'].mean()
price_odometer = pd.merge(avg_price, avg_odometer, on = 'cylinders')
price_odometer.reset_index(level = 0, inplace = True)
price_odometer
made = price_odometer['cylinders']
price = price_odometer['price']
odometer = price_odometer['odometer']
price = [*price, price[0]]
odometer = [*odometer, odometer[0]]
label_loc = np.linspace(start=0, stop=2 * np.pi, num=len(price))
# angles=np.linspace(0,2*np.pi,len(price), endpoint=False)
plt.figure(figsize=(8, 8))
plt.subplot(polar=True)
plt.plot(label_loc, price, label='price')
plt.plot(label_loc, odometer, label='distance travelled')
plt.title('Difference between distance travelled and price by cylinders', size=20)
lines, labels = plt.thetagrids(np.degrees(label_loc), labels=made)
# Fill area
# ax.plot(angles,price, 'o--', color='g', label='price')
# ax.fill(label_loc, price, alpha=0.1, color='g')
plt.legend()
plt.show()
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                             mean_squared_error, mean_absolute_error, r2_score,
                             roc_auc_score)
# Mounting Google Drive locally
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In this part we focus on the seller's point of view: we want to make suggestions that help the seller stock the right inventory to maximize profits. To be exact: if the company takes in a car with a given brand, number of cylinders, drive, type, and color, at what price should it sell?
The price prediction models include: linear regression, decision tree, random forest, and a grid-searched decision tree with cross-validation.
The model would make more accurate predictions if we included all the variables or did some feature engineering first, but we are first building a prototype that follows the seller's business sense. The chosen features are "manufacturer", "cylinders", "drive", "type", and "paint_color".
We preprocessed the categorical data using both one-hot encoding and label encoding. Even though we mainly use the label-encoded dataset, we keep the one-hot-encoded dataset for future machine learning models.
All models are built using the scikit-learn library.
For regression prediction, we chose RMSE and MAE as our comparison metrics.
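For reference, writing $y_i$ for the true price, $\hat{y}_i$ for the predicted price, and $n$ for the number of test observations, the two metrics are:

$$\mathrm{RMSE}=\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i-\hat{y}_i\right)^{2}},\qquad\mathrm{MAE}=\frac{1}{n}\sum_{i=1}^{n}\left|y_i-\hat{y}_i\right|$$

RMSE penalizes large errors more heavily than MAE, which is why we report both.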
data = pd.read_csv('/content/drive/Shareddrives/BA780 Team2/datasets/vehicle_cleaned.csv')
data_regression = data.loc[:, ['manufacturer', 'cylinders', 'drive', 'type', 'paint_color', 'price']]
# Copy into two versions, one for one-hot encoding and one for label encoding
# (.copy() keeps the three names from aliasing the same DataFrame)
ohc_reg = data_regression.copy()
label_reg = data_regression.copy()
data_regression_original = data_regression.copy()
data_regression.head()
manufacturer | cylinders | drive | type | paint_color | price | |
---|---|---|---|---|---|---|
0 | ford | 6 cylinders | rwd | truck | black | 15000 |
1 | gmc | 8 cylinders | 4wd | pickup | black | 27990 |
2 | chevrolet | 6 cylinders | 4wd | pickup | silver | 34590 |
3 | toyota | 6 cylinders | 4wd | truck | grey | 35000 |
4 | chevrolet | 6 cylinders | 4wd | pickup | red | 29990 |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59989 entries, 0 to 59988
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   region        59989 non-null  object
 1   price         59989 non-null  int64
 2   year          59989 non-null  float64
 3   manufacturer  59989 non-null  object
 4   model         59989 non-null  object
 5   fuel          59989 non-null  object
 6   odometer      59989 non-null  float64
 7   title_status  59989 non-null  object
 8   transmission  59989 non-null  object
 9   condition     59989 non-null  object
 10  cylinders     59989 non-null  object
 11  drive         59989 non-null  object
 12  type          59989 non-null  object
 13  paint_color   59989 non-null  object
 14  state         59989 non-null  object
 15  lat           59989 non-null  float64
 16  long          59989 non-null  float64
 17  posting_date  59989 non-null  object
 18  Made          59989 non-null  object
dtypes: float64(4), int64(1), object(14)
memory usage: 8.7+ MB
data['price'].sort_values()
2556     10020
53327    10050
22505    10082
54792    10088
4012     10088
         ...
5021     99700
8495     99888
7491     99888
17446    99990
35265    99999
Name: price, Length: 59989, dtype: int64
ohc_reg = pd.get_dummies(ohc_reg)
le_manufacturer = LabelEncoder()
le_type = LabelEncoder()
le_cylinders = LabelEncoder()
le_paint_colors = LabelEncoder()
le_drive = LabelEncoder()
label_reg["type"] = le_type.fit_transform(label_reg['type'])
label_reg["manufacturer"] = le_manufacturer.fit_transform(label_reg['manufacturer'])
label_reg['cylinders'] = le_cylinders.fit_transform(label_reg['cylinders'])
label_reg['paint_color'] = le_paint_colors.fit_transform(label_reg['paint_color'])
label_reg['drive'] = le_drive.fit_transform(label_reg['drive'])
label_reg.head()
manufacturer | cylinders | drive | type | paint_color | price | |
---|---|---|---|---|---|---|
0 | 13 | 5 | 2 | 10 | 0 | 15000 |
1 | 14 | 6 | 0 | 8 | 0 | 27990 |
2 | 7 | 5 | 0 | 8 | 9 | 34590 |
3 | 38 | 5 | 0 | 10 | 5 | 35000 |
4 | 7 | 5 | 0 | 8 | 8 | 29990 |
X_regla = label_reg.drop('price', axis = 1)
y_regla = label_reg['price']
X_train_regla, X_test_regla, y_train_regla, y_test_regla = train_test_split(X_regla, y_regla, test_size = 0.2, random_state = 780)
LRmodel = LinearRegression()
LRmodel.fit(X_train_regla, y_train_regla)
# Make predictions
ypred_linear = LRmodel.predict(X_test_regla)
ypred_linear
array([10712.86934246, 17979.69833771, 27646.4619262 , ..., 23915.36627515, 23975.00121577, 26183.46597909])
LR = X_test_regla.join(y_test_regla).reset_index()
LR.join(pd.Series(ypred_linear, name='predicted')).head()
index | manufacturer | cylinders | drive | type | paint_color | price | predicted | |
---|---|---|---|---|---|---|---|---|
0 | 39746 | 13 | 0 | 0 | 10 | 4 | 49990 | 10712.869342 |
1 | 55683 | 17 | 3 | 0 | 0 | 1 | 21500 | 17979.698338 |
2 | 35570 | 13 | 6 | 2 | 10 | 10 | 18995 | 27646.461926 |
3 | 35190 | 38 | 3 | 1 | 9 | 5 | 12850 | 19119.009477 |
4 | 6390 | 7 | 6 | 0 | 10 | 10 | 10902 | 29523.646679 |
print("Model coefficients: ",LRmodel.coef_)
print("Model intercept:", LRmodel.intercept_)
Model coefficients: [ 9.6855616 3086.12987719 -967.64906097 185.42471614 58.68524042] Model intercept: 8497.968918481138
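To make the raw coefficient array easier to read, it can be paired with the feature names; a small sketch:

# Pair each linear-regression coefficient with its feature name
coef_table = pd.Series(LRmodel.coef_, index=X_train_regla.columns)
print(coef_table)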
# Evaluation
MSE_LR = mean_squared_error(y_test_regla, ypred_linear)
RMSE_LR = np.sqrt(MSE_LR)
r2_LR = r2_score(y_test_regla, ypred_linear)
MAE_LR = mean_absolute_error(y_test_regla, ypred_linear)
print('The root mean squared error for linear regression is:',RMSE_LR)
print('The mean absolute error for linear regression is:', MAE_LR)
print('The R squared for linear regression is:', r2_LR)
The root mean squared error for linear regression is: 11349.936810583622 The mean absolute error for linear regression is: 8840.065391043321 The R squared for linear regression is: 0.10780806250497865
Explanation:
The linear regression model has an RMSE of about 11,350, which means that on average the price prediction will be off by roughly $11,350.
An R-squared of about 11% means that only 11% of the variability in price is explained by the model. The R-squared is quite low, so the association is not terribly strong: predictions will be better than just guessing the mean, but not entirely reliable.
from sklearn.tree import DecisionTreeRegressor
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X_train_regla, y_train_regla.values)
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=0, splitter='best')
y_pred_tree = dec_tree_reg.predict(X_test_regla)
# Evaluation
MSE_tree = mean_squared_error(y_test_regla,y_pred_tree)
RMSE_tree = np.sqrt(MSE_tree)
MAE_tree = mean_absolute_error(y_test_regla, y_pred_tree)
print('The root mean squared error is:',RMSE_tree)
print('The mean absolute error for decision tree is:', MAE_tree)
The root mean squared error is: 9966.943632550378 The mean absolute error for decision tree is: 6739.6300011659505
#Combine multiple decision trees into a forest
from sklearn.ensemble import RandomForestRegressor
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X_train_regla, y_train_regla.values)
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=None, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None, oob_score=False, random_state=0, verbose=0, warm_start=False)
y_pred_forest = random_forest_reg.predict(X_test_regla)
# Evaluation
MSE_forest = mean_squared_error(y_test_regla,y_pred_forest)
RMSE_forest = np.sqrt(MSE_forest)
MAE_forest = mean_absolute_error(y_test_regla, y_pred_forest)
print('The root mean squared error for random forest is:',RMSE_forest)
print('The mean absolute error for decision tree is:', MAE_forest)
The root mean squared error for random forest is: 9785.433838837054 The mean absolute error for decision tree is: 6701.098888177917
Although the random forest's RMSE is already lower than the decision tree's, we will try a grid-searched, cross-validated decision tree to see if tuning helps.
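Note that the grid search below tunes a decision tree rather than the forest itself; a comparable randomized search over the random forest might look like the following sketch (the hyperparameter ranges are illustrative assumptions, not tuned values):

# Hypothetical randomized search over the random forest itself
rf_params = {'n_estimators': [100, 200, 400],
             'max_depth': [None, 8, 12, 16],
             'min_samples_leaf': [1, 2, 4]}
rf_search = RandomizedSearchCV(RandomForestRegressor(random_state=0),
                               rf_params, n_iter=10,
                               scoring='neg_mean_squared_error', random_state=0)
# rf_search.fit(X_train_regla, y_train_regla.values)  # uncomment to run; training is slow
# print(rf_search.best_params_)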
# max_depth = [None, 2,4,6,8,10,12]
parameters = {
'max_depth': [*range(5, 15, 1)],
'min_samples_leaf': [1,2,3],
'max_features':['auto', 'sqrt', 'log2'],
'random_state':[0,1]}
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')
#negated MSE
gs.fit(X_train_regla, y_train_regla.values)
GridSearchCV(cv=None, error_score=nan, estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=0, splitter='best'), iid='deprecated', n_jobs=None, param_grid={'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_leaf': [1, 2, 3], 'random_state': [0, 1]}, pre_dispatch='2*n_jobs', refit=True, return_train_score=False, scoring='neg_mean_squared_error', verbose=0)
regressor = gs.best_estimator_
regressor.fit(X_train_regla, y_train_regla.values)
y_pred_forest2 = regressor.predict(X_test_regla)
RMSE_forest2 = np.sqrt(mean_squared_error(y_test_regla,y_pred_forest2))
MAE_forest2 = mean_absolute_error(y_test_regla, y_pred_forest2)
print('The root mean squared error for the grid-searched decision tree is:',RMSE_forest2)
print('The mean absolute error for the grid-searched decision tree is:',MAE_forest2)
The root mean squared error for the grid-searched decision tree is: 9999.675769540172 The mean absolute error for the grid-searched decision tree is: 6923.777227403449
The RMSE did not improve after the grid search, so the untuned random forest remains our best model. If we have time, we will come back and tune it further.
comparison = {'ML Models': ['Linear Regression', 'Decision Tree', 'Random Forest', 'Decision Tree w/ CV'],
'RMSE score': [RMSE_LR, RMSE_tree, RMSE_forest, RMSE_forest2],
'MAE score': [MAE_LR, MAE_tree, MAE_forest, MAE_forest2]}
comparison = pd.DataFrame(data=comparison)
comparison
ML Models | RMSE score | MAE score | |
---|---|---|---|
0 | Linear Regression | 11349.936811 | 8840.065391 |
1 | Decision Tree | 9966.943633 | 6739.630001 |
2 | Random Forest | 9785.433839 | 6701.098888 |
3 | Decision Tree w/ CV | 9999.675770 | 6923.777227 |
Looking at the RMSE and MAE scores, we can see that the random forest performed best on this regression problem.
We can infer from the RMSE that, on average, each predicted price will be off by around $10,000.
Now let's suppose the company has a used Ford truck with 8 cylinders, 4wd, and black paint; we will apply the random forest model to predict its selling price.
Running it through the model, we predict a selling price of about $33,808.
# manufacturer, cylinders, drive, type, paint_color
# These are the strings the user would put in
X = np.array([["ford", '8 cylinders', '4wd', 'truck', 'black' ]])
X
array([['ford', '8 cylinders', '4wd', 'truck', 'black']], dtype='<U11')
X[:, 0] = le_manufacturer.transform(X[:,0])
X[:, 1] = le_cylinders.transform(X[:,1])
X[:, 2] = le_drive.transform(X[:,2])
X[:, 3] = le_type.transform(X[:,3])
X[:, 4] = le_paint_colors.transform(X[:,4])
X = X.astype(float)
X
array([[13., 6., 0., 10., 0.]])
y_pred = random_forest_reg.predict(X)
y_pred
array([33808.70998939])
For used car companies
The car attributes listed above would lead to higher selling value and volume, and therefore higher profits.
For used car buyers
The car attributes listed above would give the customer the best deal on a car for the paid price.
For price prediction machine learning
For the machine learning part, our team has laid out the steps we took to approach the business problem of price prediction.
We will have opportunities to keep trying new models to get better results. It is also worth noting that this is a crude machine learning prototype intended to lay out the steps for building models. There are still unexplored areas, such as feature engineering, cross-validation, and other ML models, which will be areas for our team to explore next.
We also focused on pushing our deliverables to an app built with Streamlit, which will go live on Heroku. This app will have a business summary page, a description page (for our EDA) and a prediction page (for our machine learning models).
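A minimal sketch of what the prediction page could look like (the file name, model path, and the joblib persistence step are assumptions, not part of this notebook):

# app.py -- hypothetical Streamlit prediction page (a sketch, not the deployed app)
import joblib
import streamlit as st

# Assumes the fitted model and label encoders were saved with joblib beforehand
model = joblib.load('random_forest_reg.joblib')
encoders = joblib.load('label_encoders.joblib')  # dict mapping column name -> fitted LabelEncoder

st.title('Used Car Price Prediction')
inputs = {}
for col in ['manufacturer', 'cylinders', 'drive', 'type', 'paint_color']:
    inputs[col] = st.selectbox(col, list(encoders[col].classes_))

if st.button('Predict price'):
    row = [[encoders[c].transform([inputs[c]])[0] for c in inputs]]
    st.write(f'Estimated selling price: ${model.predict(row)[0]:,.0f}')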