Loading Data

GitHub Repository: HarshKapadia2/car-details

Load Data

data <- read.csv("https://raw.githubusercontent.com/HarshKapadia2/car-details/main/data/car_details_v3.csv")
head(data)
##                            name year selling_price km_driven   fuel seller_type
## 1        Maruti Swift Dzire VDI 2014        450000    145500 Diesel  Individual
## 2  Skoda Rapid 1.5 TDI Ambition 2014        370000    120000 Diesel  Individual
## 3      Honda City 2017-2020 EXi 2006        158000    140000 Petrol  Individual
## 4     Hyundai i20 Sportz Diesel 2010        225000    127000 Diesel  Individual
## 5        Maruti Swift VXI BSIII 2007        130000    120000 Petrol  Individual
## 6 Hyundai Xcent 1.2 VTVT E Plus 2017        440000     45000 Petrol  Individual
##   transmission        owner    mileage  engine  max_power
## 1       Manual  First Owner  23.4 kmpl 1248 CC     74 bhp
## 2       Manual Second Owner 21.14 kmpl 1498 CC 103.52 bhp
## 3       Manual  Third Owner  17.7 kmpl 1497 CC     78 bhp
## 4       Manual  First Owner  23.0 kmpl 1396 CC     90 bhp
## 5       Manual  First Owner  16.1 kmpl 1298 CC   88.2 bhp
## 6       Manual  First Owner 20.14 kmpl 1197 CC  81.86 bhp
##                     torque seats
## 1           190Nm@ 2000rpm     5
## 2      250Nm@ 1500-2500rpm     5
## 3    12.7@ 2,700(kgm@ rpm)     5
## 4 22.4 kgm at 1750-2750rpm     5
## 5    11.5@ 4,500(kgm@ rpm)     5
## 6        113.75nm@ 4000rpm     5

Structure of Data

str(data)
## 'data.frame':    8128 obs. of  13 variables:
##  $ name         : chr  "Maruti Swift Dzire VDI" "Skoda Rapid 1.5 TDI Ambition" "Honda City 2017-2020 EXi" "Hyundai i20 Sportz Diesel" ...
##  $ year         : int  2014 2014 2006 2010 2007 2017 2007 2001 2011 2013 ...
##  $ selling_price: int  450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
##  $ km_driven    : int  145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
##  $ fuel         : chr  "Diesel" "Diesel" "Petrol" "Diesel" ...
##  $ seller_type  : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ transmission : chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ owner        : chr  "First Owner" "Second Owner" "Third Owner" "First Owner" ...
##  $ mileage      : chr  "23.4 kmpl" "21.14 kmpl" "17.7 kmpl" "23.0 kmpl" ...
##  $ engine       : chr  "1248 CC" "1498 CC" "1497 CC" "1396 CC" ...
##  $ max_power    : chr  "74 bhp" "103.52 bhp" "78 bhp" "90 bhp" ...
##  $ torque       : chr  "190Nm@ 2000rpm" "250Nm@ 1500-2500rpm" "12.7@ 2,700(kgm@ rpm)" "22.4 kgm at 1750-2750rpm" ...
##  $ seats        : int  5 5 5 5 5 5 5 4 5 5 ...

Installing Packages

# install.packages("tidyverse", repos = "http://cran.us.r-project.org") # For `ggplot2` and `dplyr`
install.packages("ggplot2", repos = "http://cran.us.r-project.org")
## Installing package into '/home/runner/work/_temp/Library'
## (as 'lib' is unspecified)
## also installing the dependencies 'colorspace', 'cli', 'crayon', 'utf8', 'farver', 'labeling', 'lifecycle', 'munsell', 'RColorBrewer', 'viridisLite', 'ellipsis', 'fansi', 'pillar', 'pkgconfig', 'vctrs', 'gtable', 'isoband', 'scales', 'tibble', 'withr'
install.packages("dplyr", repos = "http://cran.us.r-project.org")
## Installing package into '/home/runner/work/_temp/Library'
## (as 'lib' is unspecified)
## also installing the dependencies 'purrr', 'generics', 'tidyselect'
# install.packages("fansi", repos = "http://cran.us.r-project.org") # To solve `tidyverse` package error
install.packages("plotrix", repos = "http://cran.us.r-project.org") # For 3d Pie Chart
## Installing package into '/home/runner/work/_temp/Library'
## (as 'lib' is unspecified)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotrix)
# library(tidyverse)

Data Cleaning and Processing

Remove Incomplete Rows

count(data)
##      n
## 1 8128
sum(is.na(data))
## [1] 221
row_status <- complete.cases(data)
data <- data[row_status,] # [row, column]
sum(is.na(data))
## [1] 0
count(data)
##      n
## 1 7907

Convert Engine Column from Character to Integer Vector

str(data$engine)
##  chr [1:7907] "1248 CC" "1498 CC" "1497 CC" "1396 CC" "1298 CC" "1197 CC" ...
engine_chr_values <- strsplit(data$engine, split = " ", fixed = TRUE)
engine_count <- length(engine_chr_values)
new_engine_values <- vector()

for (i in 1:engine_count) {
  new_engine_values[i] <- strtoi(engine_chr_values[[i]][1])
}

data$engine <- new_engine_values
str(data$engine)
##  int [1:7907] 1248 1498 1497 1396 1298 1197 1061 796 1364 1399 ...

Convert Mileage Column from Character to Numeric Vector

str(data$mileage)
##  chr [1:7907] "23.4 kmpl" "21.14 kmpl" "17.7 kmpl" "23.0 kmpl" "16.1 kmpl" ...
mileage_chr_values <- strsplit(data$mileage, split = " ", fixed = TRUE)
mileage_count <- length(mileage_chr_values)
new_mileage_values <- vector()

for (i in 1:mileage_count) {
  new_mileage_values[i] <- as.numeric(mileage_chr_values[[i]][1])
}

data$mileage <- new_mileage_values
str(data$mileage)
##  num [1:7907] 23.4 21.1 17.7 23 16.1 ...

Convert Max Power Column from Character to Numeric Vector

str(data$max_power)
##  chr [1:7907] "74 bhp" "103.52 bhp" "78 bhp" "90 bhp" "88.2 bhp" ...
max_power_chr_values <- strsplit(data$max_power, split = " ", fixed = TRUE)
max_power_count <- length(max_power_chr_values)
new_max_power_values <- vector()

for (i in 1:max_power_count) {
  new_max_power_values[i] <- as.numeric(max_power_chr_values[[i]][1])
}

data$max_power <- new_max_power_values
str(data$max_power)
##  num [1:7907] 74 103.5 78 90 88.2 ...

Data for Prediction

Select Numeric Data

pred_data <- data %>% select(selling_price, km_driven, owner, year, fuel, seller_type, transmission, mileage, engine)

Calculate Age

year_count <- length(pred_data$year)
age <- vector()

for (i in 1:year_count) {
  age[i] <- 2020 - pred_data$year[i]
}

pred_data <- cbind(pred_data, age)
str(pred_data$age)
##  num [1:7907] 6 6 14 10 13 3 13 19 9 7 ...
pred_data <- subset(pred_data, select = -year)
str(pred_data)
## 'data.frame':    7907 obs. of  9 variables:
##  $ selling_price: int  450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
##  $ km_driven    : int  145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
##  $ owner        : chr  "First Owner" "Second Owner" "Third Owner" "First Owner" ...
##  $ fuel         : chr  "Diesel" "Diesel" "Petrol" "Diesel" ...
##  $ seller_type  : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ transmission : chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ mileage      : num  23.4 21.1 17.7 23 16.1 ...
##  $ engine       : int  1248 1498 1497 1396 1298 1197 1061 796 1364 1399 ...
##  $ age          : num  6 6 14 10 13 3 13 19 9 7 ...

Convert Fuel Types to Binary

1 = Petrol

0 = Diesel

pred_data %>% count(fuel)
##     fuel    n
## 1    CNG   53
## 2 Diesel 4299
## 3    LPG   35
## 4 Petrol 3520
pred_fuel_count <- length(pred_data$fuel)

for (i in 1:pred_fuel_count) {
  temp_val <- trimws(pred_data$fuel[i])

  if(temp_val %in% "CNG") {
    pred_data <- pred_data[-c(i),]
  }
  else if(temp_val %in% "LPG") {
    pred_data <- pred_data[-c(i),]
  }
}

pred_data %>% count(fuel)
##     fuel    n
## 1    CNG    2
## 2 Diesel 4299
## 3    LPG    1
## 4 Petrol 3520
fuel_type <- vector() # 1 = Petrol, 0 = Diesel
pred_fuel_count <- length(pred_data$fuel)

for(i in 1:pred_fuel_count) {
  if(pred_data$fuel[i] %in% "Petrol") {
    fuel_type[i] <- 1
  }
  else {
    fuel_type[i] <- 0
  }
}

pred_data <- cbind(pred_data, fuel_type)
pred_data <- subset(pred_data, select = -fuel)

str(pred_data)
## 'data.frame':    7822 obs. of  9 variables:
##  $ selling_price: int  450000 370000 158000 225000 130000 440000 45000 350000 200000 500000 ...
##  $ km_driven    : int  145500 120000 140000 127000 120000 45000 5000 90000 169000 68000 ...
##  $ owner        : chr  "First Owner" "Second Owner" "Third Owner" "First Owner" ...
##  $ seller_type  : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ transmission : chr  "Manual" "Manual" "Manual" "Manual" ...
##  $ mileage      : num  23.4 21.1 17.7 23 16.1 ...
##  $ engine       : int  1248 1498 1497 1396 1298 1197 796 1364 1399 1461 ...
##  $ age          : num  6 6 14 10 13 3 19 9 7 6 ...
##  $ fuel_type    : num  0 0 1 0 1 1 1 0 0 0 ...

Convert Transmission Types to Binary

1 = Manual

0 = Automatic

pred_data %>% count(transmission)
##   transmission    n
## 1    Automatic 1041
## 2       Manual 6781
transmission_type <- vector() # 1 = Manual, 0 = Automatic
transmission_count <- length(pred_data$transmission)

for(i in 1:transmission_count) {
  if(pred_data$transmission[i] %in% "Manual") {
    transmission_type[i] <- 1
  }
  else {
    transmission_type[i] <- 0
  }
}

pred_data <- cbind(pred_data, transmission_type)
pred_data <- subset(pred_data, select = -transmission)

str(pred_data)
## 'data.frame':    7822 obs. of  9 variables:
##  $ selling_price    : int  450000 370000 158000 225000 130000 440000 45000 350000 200000 500000 ...
##  $ km_driven        : int  145500 120000 140000 127000 120000 45000 5000 90000 169000 68000 ...
##  $ owner            : chr  "First Owner" "Second Owner" "Third Owner" "First Owner" ...
##  $ seller_type      : chr  "Individual" "Individual" "Individual" "Individual" ...
##  $ mileage          : num  23.4 21.1 17.7 23 16.1 ...
##  $ engine           : int  1248 1498 1497 1396 1298 1197 796 1364 1399 1461 ...
##  $ age              : num  6 6 14 10 13 3 19 9 7 6 ...
##  $ fuel_type        : num  0 0 1 0 1 1 1 0 0 0 ...
##  $ transmission_type: num  1 1 1 1 1 1 1 1 1 1 ...

Convert Seller Types to Binary

1 = Individual

0 = Dealer

pred_data %>% count(seller_type)
##        seller_type    n
## 1           Dealer 1106
## 2       Individual 6480
## 3 Trustmark Dealer  236
seller_count <- length(pred_data$seller_type)

for (i in 1:seller_count) {
  temp_val <- trimws(pred_data$seller_type[i])

  if(temp_val %in% "Trustmark Dealer") {
    pred_data <- pred_data[-c(i),]
  }
}

pred_data %>% count(seller_type)
##        seller_type    n
## 1           Dealer 1106
## 2       Individual 6480
## 3 Trustmark Dealer  100
seller_type <- vector() # 1 = Individual, 0 = Dealer
seller_count <- length(pred_data$seller_type)

for(i in 1:seller_count) {
  if(pred_data$seller_type[i] %in% "Individual") {
    seller_type[i] <- 1
  }
  else {
    seller_type[i] <- 0
  }
}

pred_data <- subset(pred_data, select = -seller_type)
pred_data <- cbind(pred_data, seller_type)

str(pred_data)
## 'data.frame':    7686 obs. of  9 variables:
##  $ selling_price    : int  450000 370000 158000 225000 130000 440000 45000 350000 200000 500000 ...
##  $ km_driven        : int  145500 120000 140000 127000 120000 45000 5000 90000 169000 68000 ...
##  $ owner            : chr  "First Owner" "Second Owner" "Third Owner" "First Owner" ...
##  $ mileage          : num  23.4 21.1 17.7 23 16.1 ...
##  $ engine           : int  1248 1498 1497 1396 1298 1197 796 1364 1399 1461 ...
##  $ age              : num  6 6 14 10 13 3 19 9 7 6 ...
##  $ fuel_type        : num  0 0 1 0 1 1 1 0 0 0 ...
##  $ transmission_type: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ seller_type      : num  1 1 1 1 1 1 1 1 1 1 ...

Convert Owner Types to Binary

1 = First Owner

0 = Second Owner

pred_data %>% count(owner)
##                  owner    n
## 1          First Owner 5034
## 2 Fourth & Above Owner  158
## 3         Second Owner 1984
## 4       Test Drive Car    5
## 5          Third Owner  505
owner_count <- length(pred_data$owner)

for (i in 1:owner_count) {
  temp_val <- trimws(pred_data$owner[i])

  if(temp_val %in% "Fourth & Above Owner") {
    pred_data <- pred_data[-c(i),]
  }
  else if(temp_val %in% "Third Owner") {
    pred_data <- pred_data[-c(i),]
  }
  else if(temp_val %in% "Test Drive Car") {
    pred_data <- pred_data[-c(i),]
  }
}

pred_data %>% count(owner)
##                  owner    n
## 1          First Owner 5034
## 2 Fourth & Above Owner   22
## 3         Second Owner 1984
## 4       Test Drive Car    1
## 5          Third Owner   54
owner_type <- vector() # 1 = First Owner, 0 = Second Owner
owner_count <- length(pred_data$owner)

for(i in 1:owner_count) {
  if(pred_data$owner[i] %in% "First Owner") {
    owner_type[i] <- 1
  }
  else {
    owner_type[i] <- 0
  }
}

pred_data <- cbind(pred_data, owner_type)
pred_data <- subset(pred_data, select = -owner)

str(pred_data)
## 'data.frame':    7095 obs. of  9 variables:
##  $ selling_price    : int  450000 370000 225000 130000 440000 45000 350000 200000 500000 92000 ...
##  $ km_driven        : int  145500 120000 127000 120000 45000 5000 90000 169000 68000 100000 ...
##  $ mileage          : num  23.4 21.1 23 16.1 20.1 ...
##  $ engine           : int  1248 1498 1396 1298 1197 796 1364 1399 1461 993 ...
##  $ age              : num  6 6 10 13 3 19 9 7 6 15 ...
##  $ fuel_type        : num  0 0 0 1 1 1 0 0 0 1 ...
##  $ transmission_type: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ seller_type      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ owner_type       : num  1 0 1 1 1 0 1 1 0 0 ...

Visualising Data

Histogram

hist(data$seats, main = "No. of Seats vs No. of Cars", xlab = "No. of Seats", ylab = "No. of Cars")

Box Plot

temp_data <- data %>%
  select(selling_price, seats) %>%
  filter(selling_price < 200000)

boxplot(temp_data$selling_price ~ temp_data$seats, main = "No. of Seats vs Selling Price", xlab = "No. of Seats", ylab = "Selling Price (Rs)")

Pie Chart

fuel_count = data %>% count(fuel)
pie(fuel_count$n, labels = fuel_count$fuel, radius = 1, col = c("orange", "blue", "yellow", "dark green"), main = "Fuel Type")

3D Pie Chart

transmission_count = data %>% count(transmission)
pie3D(transmission_count$n, labels = transmission_count$transmission, radius = 1, col = c("yellow", "blue"), explode = 0.4, main = "Transmission Type")

Bar Plot

barplot(table(data$seller_type), main = "Seller Type vs No. of Cars")

Scatter Plot

graph_data_1 <- data %>%
  select(selling_price, km_driven) %>%
  filter(selling_price < 5000000, km_driven < 400000)

ggplot(graph_data_1, aes(x = selling_price, y = km_driven)) + geom_point() + geom_smooth() + labs(title = "Selling Price vs Distance Driven", x = "Selling Price (Rs)", y = "Distance Driven (km)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Line Plot

graph_data_2 <- data %>%
  select(selling_price, km_driven) %>%
  filter(selling_price > 2000000, km_driven < 400000)

ggplot(graph_data_2, aes(x = selling_price, y = km_driven)) +
  stat_summary(fun.y = "mean", geom = "line", size = 1, linetype = "solid") +
  labs(title = "Selling Price vs Distance Driven", x = "Selling Price (Rs)", y = "Distance Driven (km)")
## Warning: `fun.y` is deprecated. Use `fun` instead.

Bar Chart

ggplot(data, aes(x = fuel, fill = owner)) + geom_bar() + labs(title = "Fuel Type vs No. of Cars vs Owner", x = "Fuel Type", y = "No. of Cars", fill = "Owner")

Dot Plot

ggplot(data, aes(x = seats, y = fuel)) + geom_point(col = "red", size = 5) +
  geom_segment(aes(x = min(seats), xend = max(seats), y = fuel, yend = fuel), linetype = "dashed", size = 0.1) +
  labs(title = "No. of Seats vs Fuel Type", x = "No. of Seats", y = "Fuel Type")

Smooth Plot

graph_data <- data %>%
  select(mileage, engine)

ggplot(graph_data, aes(x = engine, y = mileage)) + geom_smooth() + labs(title = "Engine Capacity vs Car Mileage", x = "Engine Capacity (cc)", y = "Car Mileage (km/L)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Density Plot

ggplot(data, aes(mileage)) + geom_density(aes(fill = fuel), alpha = 0.3) + labs(title = "Car Mileage vs Density vs Fuel Type", x = "Car Mileage (km/L)", y = "Density", fill = "Fuel Type")

Prediction

Simple Linear Regression

plot(pred_data$engine, pred_data$mileage, xlab = "Engine Capacity (cc)", ylab = "Mileage (km/L)")
abline(lm(mileage ~ engine, data = pred_data), col = "red", lwd = 3)
title(main = "Engine Capacity vs Mileage")

plot(pred_data$age, pred_data$mileage, xlab = "Age (yr)", ylab = "Mileage (km/L)")
abline(lm(mileage ~ age, data = pred_data), col = "red", lwd = 3)
title(main = "Age vs Mileage")

Multiple Linear Regression

mlr <- lm(selling_price ~ km_driven + owner_type + age + fuel_type + seller_type + transmission_type + mileage + engine, data = pred_data)
summary(mlr)
## 
## Call:
## lm(formula = selling_price ~ km_driven + owner_type + age + fuel_type + 
##     seller_type + transmission_type + mileage + engine, data = pred_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1649833  -182047     6554   146683  7708354 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        9.551e+05  9.759e+04   9.787  < 2e-16 ***
## km_driven         -1.424e+00  1.358e-01 -10.485  < 2e-16 ***
## owner_type         3.294e+04  1.627e+04   2.024  0.04296 *  
## age               -4.802e+04  2.362e+03 -20.325  < 2e-16 ***
## fuel_type         -5.672e+04  1.925e+04  -2.946  0.00323 ** 
## seller_type       -2.952e+05  1.905e+04 -15.499  < 2e-16 ***
## transmission_type -9.839e+05  2.246e+04 -43.802  < 2e-16 ***
## mileage            1.483e+04  2.667e+03   5.562 2.76e-08 ***
## engine             6.193e+02  2.426e+01  25.523  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 553200 on 7086 degrees of freedom
## Multiple R-squared:  0.5647, Adjusted R-squared:  0.5642 
## F-statistic:  1149 on 8 and 7086 DF,  p-value: < 2.2e-16