GitHub Repository: HarshKapadia2/car-details
data <- read.csv("https://raw.githubusercontent.com/HarshKapadia2/car-details/main/data/car_details_v3.csv")
head(data)
## name year selling_price km_driven fuel seller_type
## 1 Maruti Swift Dzire VDI 2014 450000 145500 Diesel Individual
## 2 Skoda Rapid 1.5 TDI Ambition 2014 370000 120000 Diesel Individual
## 3 Honda City 2017-2020 EXi 2006 158000 140000 Petrol Individual
## 4 Hyundai i20 Sportz Diesel 2010 225000 127000 Diesel Individual
## 5 Maruti Swift VXI BSIII 2007 130000 120000 Petrol Individual
## 6 Hyundai Xcent 1.2 VTVT E Plus 2017 440000 45000 Petrol Individual
## transmission owner mileage engine max_power
## 1 Manual First Owner 23.4 kmpl 1248 CC 74 bhp
## 2 Manual Second Owner 21.14 kmpl 1498 CC 103.52 bhp
## 3 Manual Third Owner 17.7 kmpl 1497 CC 78 bhp
## 4 Manual First Owner 23.0 kmpl 1396 CC 90 bhp
## 5 Manual First Owner 16.1 kmpl 1298 CC 88.2 bhp
## 6 Manual First Owner 20.14 kmpl 1197 CC 81.86 bhp
## torque seats
## 1 190Nm@ 2000rpm 5
## 2 250Nm@ 1500-2500rpm 5
## 3 12.7@ 2,700(kgm@ rpm) 5
## 4 22.4 kgm at 1750-2750rpm 5
## 5 11.5@ 4,500(kgm@ rpm) 5
## 6 113.75nm@ 4000rpm 5
str(data)
## 'data.frame': 8128 obs. of 13 variables:
## $ name : chr "Maruti Swift Dzire VDI" "Skoda Rapid 1.5 TDI Ambition" "Honda City 2017-2020 EXi" "Hyundai i20 Sportz Diesel" ...
## $ year : int 2014 2014 2006 2010 2007 2017 2007 2001 2011 2013 ...
## $ selling_price: int 450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
## $ km_driven : int 145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
## $ fuel : chr "Diesel" "Diesel" "Petrol" "Diesel" ...
## $ seller_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ owner : chr "First Owner" "Second Owner" "Third Owner" "First Owner" ...
## $ mileage : chr "23.4 kmpl" "21.14 kmpl" "17.7 kmpl" "23.0 kmpl" ...
## $ engine : chr "1248 CC" "1498 CC" "1497 CC" "1396 CC" ...
## $ max_power : chr "74 bhp" "103.52 bhp" "78 bhp" "90 bhp" ...
## $ torque : chr "190Nm@ 2000rpm" "250Nm@ 1500-2500rpm" "12.7@ 2,700(kgm@ rpm)" "22.4 kgm at 1750-2750rpm" ...
## $ seats : int 5 5 5 5 5 5 5 4 5 5 ...
# install.packages("tidyverse", repos = "http://cran.us.r-project.org") # For `ggplot2` and `dplyr`
install.packages("ggplot2", repos = "http://cran.us.r-project.org")
## Installing package into '/home/runner/work/_temp/Library'
## (as 'lib' is unspecified)
## also installing the dependencies 'colorspace', 'cli', 'crayon', 'utf8', 'farver', 'labeling', 'lifecycle', 'munsell', 'RColorBrewer', 'viridisLite', 'ellipsis', 'fansi', 'pillar', 'pkgconfig', 'vctrs', 'gtable', 'isoband', 'scales', 'tibble', 'withr'
install.packages("dplyr", repos = "http://cran.us.r-project.org")
## Installing package into '/home/runner/work/_temp/Library'
## (as 'lib' is unspecified)
## also installing the dependencies 'purrr', 'generics', 'tidyselect'
# install.packages("fansi", repos = "http://cran.us.r-project.org") # To solve `tidyverse` package error
install.packages("plotrix", repos = "http://cran.us.r-project.org") # For 3d Pie Chart
## Installing package into '/home/runner/work/_temp/Library'
## (as 'lib' is unspecified)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotrix)
# library(tidyverse)
count(data)
## n
## 1 8128
sum(is.na(data))
## [1] 221
row_status <- complete.cases(data)
data <- data[row_status,] # [row, column]
sum(is.na(data))
## [1] 0
count(data)
## n
## 1 7907
str(data$engine)
## chr [1:7907] "1248 CC" "1498 CC" "1497 CC" "1396 CC" "1298 CC" "1197 CC" ...
engine_chr_values <- strsplit(data$engine, split = " ", fixed = TRUE)
engine_count <- length(engine_chr_values)
new_engine_values <- vector()
for (i in 1:engine_count) {
new_engine_values[i] <- strtoi(engine_chr_values[[i]][1])
}
data$engine <- new_engine_values
str(data$engine)
## int [1:7907] 1248 1498 1497 1396 1298 1197 1061 796 1364 1399 ...
str(data$mileage)
## chr [1:7907] "23.4 kmpl" "21.14 kmpl" "17.7 kmpl" "23.0 kmpl" "16.1 kmpl" ...
mileage_chr_values <- strsplit(data$mileage, split = " ", fixed = TRUE)
mileage_count <- length(mileage_chr_values)
new_mileage_values <- vector()
for (i in 1:mileage_count) {
new_mileage_values[i] <- as.numeric(mileage_chr_values[[i]][1])
}
data$mileage <- new_mileage_values
str(data$mileage)
## num [1:7907] 23.4 21.1 17.7 23 16.1 ...
str(data$max_power)
## chr [1:7907] "74 bhp" "103.52 bhp" "78 bhp" "90 bhp" "88.2 bhp" ...
max_power_chr_values <- strsplit(data$max_power, split = " ", fixed = TRUE)
max_power_count <- length(max_power_chr_values)
new_max_power_values <- vector()
for (i in 1:max_power_count) {
new_max_power_values[i] <- as.numeric(max_power_chr_values[[i]][1])
}
data$max_power <- new_max_power_values
str(data$max_power)
## num [1:7907] 74 103.5 78 90 88.2 ...
pred_data <- data %>% select(selling_price, km_driven, owner, year, fuel, seller_type, transmission, mileage, engine)
year_count <- length(pred_data$year)
age <- vector()
for (i in 1:year_count) {
age[i] <- 2020 - pred_data$year[i]
}
pred_data <- cbind(pred_data, age)
str(pred_data$age)
## num [1:7907] 6 6 14 10 13 3 13 19 9 7 ...
pred_data <- subset(pred_data, select = -year)
str(pred_data)
## 'data.frame': 7907 obs. of 9 variables:
## $ selling_price: int 450000 370000 158000 225000 130000 440000 96000 45000 350000 200000 ...
## $ km_driven : int 145500 120000 140000 127000 120000 45000 175000 5000 90000 169000 ...
## $ owner : chr "First Owner" "Second Owner" "Third Owner" "First Owner" ...
## $ fuel : chr "Diesel" "Diesel" "Petrol" "Diesel" ...
## $ seller_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ mileage : num 23.4 21.1 17.7 23 16.1 ...
## $ engine : int 1248 1498 1497 1396 1298 1197 1061 796 1364 1399 ...
## $ age : num 6 6 14 10 13 3 13 19 9 7 ...
1
= Petrol
0
= Diesel
pred_data %>% count(fuel)
## fuel n
## 1 CNG 53
## 2 Diesel 4299
## 3 LPG 35
## 4 Petrol 3520
pred_fuel_count <- length(pred_data$fuel)
for (i in 1:pred_fuel_count) {
temp_val <- trimws(pred_data$fuel[i])
if(temp_val %in% "CNG") {
pred_data <- pred_data[-c(i),]
}
else if(temp_val %in% "LPG") {
pred_data <- pred_data[-c(i),]
}
}
pred_data %>% count(fuel)
## fuel n
## 1 CNG 2
## 2 Diesel 4299
## 3 LPG 1
## 4 Petrol 3520
fuel_type <- vector() # 1 = Petrol, 0 = Diesel
pred_fuel_count <- length(pred_data$fuel)
for(i in 1:pred_fuel_count) {
if(pred_data$fuel[i] %in% "Petrol") {
fuel_type[i] <- 1
}
else {
fuel_type[i] <- 0
}
}
pred_data <- cbind(pred_data, fuel_type)
pred_data <- subset(pred_data, select = -fuel)
str(pred_data)
## 'data.frame': 7822 obs. of 9 variables:
## $ selling_price: int 450000 370000 158000 225000 130000 440000 45000 350000 200000 500000 ...
## $ km_driven : int 145500 120000 140000 127000 120000 45000 5000 90000 169000 68000 ...
## $ owner : chr "First Owner" "Second Owner" "Third Owner" "First Owner" ...
## $ seller_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ mileage : num 23.4 21.1 17.7 23 16.1 ...
## $ engine : int 1248 1498 1497 1396 1298 1197 796 1364 1399 1461 ...
## $ age : num 6 6 14 10 13 3 19 9 7 6 ...
## $ fuel_type : num 0 0 1 0 1 1 1 0 0 0 ...
1
= Manual
0
= Automatic
pred_data %>% count(transmission)
## transmission n
## 1 Automatic 1041
## 2 Manual 6781
transmission_type <- vector() # 1 = Manual, 0 = Automatic
transmission_count <- length(pred_data$transmission)
for(i in 1:transmission_count) {
if(pred_data$transmission[i] %in% "Manual") {
transmission_type[i] <- 1
}
else {
transmission_type[i] <- 0
}
}
pred_data <- cbind(pred_data, transmission_type)
pred_data <- subset(pred_data, select = -transmission)
str(pred_data)
## 'data.frame': 7822 obs. of 9 variables:
## $ selling_price : int 450000 370000 158000 225000 130000 440000 45000 350000 200000 500000 ...
## $ km_driven : int 145500 120000 140000 127000 120000 45000 5000 90000 169000 68000 ...
## $ owner : chr "First Owner" "Second Owner" "Third Owner" "First Owner" ...
## $ seller_type : chr "Individual" "Individual" "Individual" "Individual" ...
## $ mileage : num 23.4 21.1 17.7 23 16.1 ...
## $ engine : int 1248 1498 1497 1396 1298 1197 796 1364 1399 1461 ...
## $ age : num 6 6 14 10 13 3 19 9 7 6 ...
## $ fuel_type : num 0 0 1 0 1 1 1 0 0 0 ...
## $ transmission_type: num 1 1 1 1 1 1 1 1 1 1 ...
1
= Individual
0
= Dealer
pred_data %>% count(seller_type)
## seller_type n
## 1 Dealer 1106
## 2 Individual 6480
## 3 Trustmark Dealer 236
seller_count <- length(pred_data$seller_type)
for (i in 1:seller_count) {
temp_val <- trimws(pred_data$seller_type[i])
if(temp_val %in% "Trustmark Dealer") {
pred_data <- pred_data[-c(i),]
}
}
pred_data %>% count(seller_type)
## seller_type n
## 1 Dealer 1106
## 2 Individual 6480
## 3 Trustmark Dealer 100
seller_type <- vector() # 1 = Individual, 0 = Dealer
seller_count <- length(pred_data$seller_type)
for(i in 1:seller_count) {
if(pred_data$seller_type[i] %in% "Individual") {
seller_type[i] <- 1
}
else {
seller_type[i] <- 0
}
}
pred_data <- subset(pred_data, select = -seller_type)
pred_data <- cbind(pred_data, seller_type)
str(pred_data)
## 'data.frame': 7686 obs. of 9 variables:
## $ selling_price : int 450000 370000 158000 225000 130000 440000 45000 350000 200000 500000 ...
## $ km_driven : int 145500 120000 140000 127000 120000 45000 5000 90000 169000 68000 ...
## $ owner : chr "First Owner" "Second Owner" "Third Owner" "First Owner" ...
## $ mileage : num 23.4 21.1 17.7 23 16.1 ...
## $ engine : int 1248 1498 1497 1396 1298 1197 796 1364 1399 1461 ...
## $ age : num 6 6 14 10 13 3 19 9 7 6 ...
## $ fuel_type : num 0 0 1 0 1 1 1 0 0 0 ...
## $ transmission_type: num 1 1 1 1 1 1 1 1 1 1 ...
## $ seller_type : num 1 1 1 1 1 1 1 1 1 1 ...
1
= First Owner
0
= Second Owner
pred_data %>% count(owner)
## owner n
## 1 First Owner 5034
## 2 Fourth & Above Owner 158
## 3 Second Owner 1984
## 4 Test Drive Car 5
## 5 Third Owner 505
owner_count <- length(pred_data$owner)
for (i in 1:owner_count) {
temp_val <- trimws(pred_data$owner[i])
if(temp_val %in% "Fourth & Above Owner") {
pred_data <- pred_data[-c(i),]
}
else if(temp_val %in% "Third Owner") {
pred_data <- pred_data[-c(i),]
}
else if(temp_val %in% "Test Drive Car") {
pred_data <- pred_data[-c(i),]
}
}
pred_data %>% count(owner)
## owner n
## 1 First Owner 5034
## 2 Fourth & Above Owner 22
## 3 Second Owner 1984
## 4 Test Drive Car 1
## 5 Third Owner 54
owner_type <- vector() # 1 = First Owner, 0 = Second Owner
owner_count <- length(pred_data$owner)
for(i in 1:owner_count) {
if(pred_data$owner[i] %in% "First Owner") {
owner_type[i] <- 1
}
else {
owner_type[i] <- 0
}
}
pred_data <- cbind(pred_data, owner_type)
pred_data <- subset(pred_data, select = -owner)
str(pred_data)
## 'data.frame': 7095 obs. of 9 variables:
## $ selling_price : int 450000 370000 225000 130000 440000 45000 350000 200000 500000 92000 ...
## $ km_driven : int 145500 120000 127000 120000 45000 5000 90000 169000 68000 100000 ...
## $ mileage : num 23.4 21.1 23 16.1 20.1 ...
## $ engine : int 1248 1498 1396 1298 1197 796 1364 1399 1461 993 ...
## $ age : num 6 6 10 13 3 19 9 7 6 15 ...
## $ fuel_type : num 0 0 0 1 1 1 0 0 0 1 ...
## $ transmission_type: num 1 1 1 1 1 1 1 1 1 1 ...
## $ seller_type : num 1 1 1 1 1 1 1 1 1 1 ...
## $ owner_type : num 1 0 1 1 1 0 1 1 0 0 ...
hist(data$seats, main = "No. of Seats vs No. of Cars", xlab = "No. of Seats", ylab = "No. of Cars")
temp_data <- data %>%
select(selling_price, seats) %>%
filter(selling_price < 200000)
boxplot(temp_data$selling_price ~ temp_data$seats, main = "No. of Seats vs Selling Price", xlab = "No. of Seats", ylab = "Selling Price (Rs)")
fuel_count = data %>% count(fuel)
pie(fuel_count$n, labels = fuel_count$fuel, radius = 1, col = c("orange", "blue", "yellow", "dark green"), main = "Fuel Type")
transmission_count = data %>% count(transmission)
pie3D(transmission_count$n, labels = transmission_count$transmission, radius = 1, col = c("yellow", "blue"), explode = 0.4, main = "Transmission Type")
barplot(table(data$seller_type), main = "Seller Type vs No. of Cars")
graph_data_1 <- data %>%
select(selling_price, km_driven) %>%
filter(selling_price < 5000000, km_driven < 400000)
ggplot(graph_data_1, aes(x = selling_price, y = km_driven)) + geom_point() + geom_smooth() + labs(title = "Selling Price vs Distance Driven", x = "Selling Price (Rs)", y = "Distance Driven (km)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
graph_data_2 <- data %>%
select(selling_price, km_driven) %>%
filter(selling_price > 2000000, km_driven < 400000)
ggplot(graph_data_2, aes(x = selling_price, y = km_driven)) +
stat_summary(fun.y = "mean", geom = "line", size = 1, linetype = "solid") +
labs(title = "Selling Price vs Distance Driven", x = "Selling Price (Rs)", y = "Distance Driven (km)")
## Warning: `fun.y` is deprecated. Use `fun` instead.
ggplot(data, aes(x = fuel, fill = owner)) + geom_bar() + labs(title = "Fuel Type vs No. of Cars vs Owner", x = "Fuel Type", y = "No. of Cars", fill = "Owner")
ggplot(data, aes(x = seats, y = fuel)) + geom_point(col = "red", size = 5) +
geom_segment(aes(x = min(seats), xend = max(seats), y = fuel, yend = fuel), linetype = "dashed", size = 0.1) +
labs(title = "No. of Seats vs Fuel Type", x = "No. of Seats", y = "Fuel Type")
graph_data <- data %>%
select(mileage, engine)
ggplot(graph_data, aes(x = engine, y = mileage)) + geom_smooth() + labs(title = "Engine Capacity vs Car Mileage", x = "Engine Capacity (cc)", y = "Car Mileage (km/L)")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data, aes(mileage)) + geom_density(aes(fill = fuel), alpha = 0.3) + labs(title = "Car Mileage vs Density vs Fuel Type", x = "Car Mileage (km/L)", y = "Density", fill = "Fuel Type")
plot(pred_data$engine, pred_data$mileage, xlab = "Engine Capacity (cc)", ylab = "Mileage (km/L)")
abline(lm(mileage ~ engine, data = pred_data), col = "red", lwd = 3)
title(main = "Engine Capacity vs Mileage")
plot(pred_data$age, pred_data$mileage, xlab = "Age (yr)", ylab = "Mileage (km/L)")
abline(lm(mileage ~ age, data = pred_data), col = "red", lwd = 3)
title(main = "Age vs Mileage")
mlr <- lm(selling_price ~ km_driven + owner_type + age + fuel_type + seller_type + transmission_type + mileage + engine, data = pred_data)
summary(mlr)
##
## Call:
## lm(formula = selling_price ~ km_driven + owner_type + age + fuel_type +
## seller_type + transmission_type + mileage + engine, data = pred_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1649833 -182047 6554 146683 7708354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.551e+05 9.759e+04 9.787 < 2e-16 ***
## km_driven -1.424e+00 1.358e-01 -10.485 < 2e-16 ***
## owner_type 3.294e+04 1.627e+04 2.024 0.04296 *
## age -4.802e+04 2.362e+03 -20.325 < 2e-16 ***
## fuel_type -5.672e+04 1.925e+04 -2.946 0.00323 **
## seller_type -2.952e+05 1.905e+04 -15.499 < 2e-16 ***
## transmission_type -9.839e+05 2.246e+04 -43.802 < 2e-16 ***
## mileage 1.483e+04 2.667e+03 5.562 2.76e-08 ***
## engine 6.193e+02 2.426e+01 25.523 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 553200 on 7086 degrees of freedom
## Multiple R-squared: 0.5647, Adjusted R-squared: 0.5642
## F-statistic: 1149 on 8 and 7086 DF, p-value: < 2.2e-16