5 Data imputation
5.1 General data manipulation
We will move on with the data downloaded in the previous chapter. To get an idea of the structure of our data, we use the glimpse
function.
glimpse(cars_data_df)
#> Rows: 239
#> Columns: 63
#> $ url_to_cars <chr> "https://www.hasznaltauto.hu/szemely~
#> $ ad_title <chr> "SSANGYONG KORANDO 1.5 Turbo e-XGI S~
#> $ vetelar <chr> "6 999 000 Ft", "7 780 000 Ft", NA, ~
#> $ vetelar_eur <chr> "€ 19 204", "€ 21 347", "€ 13 283", ~
#> $ finanszirozas_kalkulator_hirdetes <chr> "Finanszírozás kalkulátor ~
#> $ altalanos_adatok <chr> "Általános adatok", "Általános adato~
#> $ atveheto <chr> "2021.", "2021.", "2021.", "2021.", ~
#> $ evjarat <chr> "2021/10", "2020/1", "2020", "2021",~
#> $ allapot <chr> "Újszerű", "Kitűnő", "Normál", "Kitű~
#> $ kivitel <chr> "Terepjáró", "Egyterű", "Ferdehátú",~
#> $ finanszirozas <chr> "Finanszírozás", NA, NA, "Finanszíro~
#> $ finanszirozas_2 <chr> "35%-tól elvihető", NA, NA, "20%-tól~
#> $ finanszirozas_tipusa_casco_val <chr> "Lízing", NA, NA, "Lízing", NA, NA, ~
#> $ finanszirozas_tipusa_casco_nelkul <chr> "Lízing", NA, NA, NA, NA, NA, NA, NA~
#> $ kezdoreszlet_casco_nelkul <chr> "2 449 650 Ft", NA, NA, NA, NA, NA, ~
#> $ havi_reszlet_casco_nelkul <chr> "83 735 Ft", NA, NA, NA, NA, NA, NA,~
#> $ futamido_casco_nelkul <chr> "60 hónap", NA, NA, NA, NA, NA, NA, ~
#> $ garancia <chr> "Garancia", NA, NA, "Garancia", NA, ~
#> $ garancia_2 <chr> "5 év", NA, NA, "5 év", NA, NA, NA, ~
#> $ atrozsdasodasi_garancia <chr> "10 év", NA, NA, "12 év", NA, NA, NA~
#> $ szavatossagi_garancia <chr> "5 év", NA, NA, NA, NA, NA, NA, "7 é~
#> $ jarmu_adatok <chr> "Jármű adatok", "Jármű adatok", "Jár~
#> $ kilometerora_allasa <chr> "13 km", "Nincs megadva", "Nincs meg~
#> $ szallithato_szem_szama <chr> "5 fő", "5 fő", "5 fő", "5 fő", "7 f~
#> $ ajtok_szama <chr> "5", "5", "5", "5", "5", "5", "4", "~
#> $ sajat_tomeg <chr> "1 405 kg", "1 395 kg", "865 kg", "1~
#> $ teljes_tomeg <chr> "2 010 kg", "1 935 kg", "1 365 kg", ~
#> $ csomagtarto <chr> "551 liter", "645 liter", "265 liter~
#> $ klima_fajtaja <chr> "Manuális klíma", "Automata klíma", ~
#> $ motor_adatok <chr> "Motor adatok", "Motor adatok", "Mot~
#> $ uzemanyag <chr> "Benzin", "Benzin", "Hibrid", "Benzi~
#> $ hengerurtartalom <chr> "1 497 cm3", "1 199 cm3", "1 197 cm3~
#> $ teljesitmeny <chr> "120 kW, 163 LE", "96 kW, 131 LE", "~
#> $ henger_elrendezes <chr> "Soros", "Soros", "Soros", "Soros", ~
#> $ hajtas <chr> "Első kerék", "Első kerék", "Első ke~
#> $ sebessegvalto_fajtaja <chr> "Manuális (6 fokozatú)", "Manuális (~
#> $ okmanyok <chr> "Okmányok", "Okmányok", "Okmányok", ~
#> $ okmanyok_jellege <chr> "Érvényes magyar okmányokkal", "Érvé~
#> $ muszaki_vizsga_ervenyes <chr> "2025/10", NA, "2024", NA, NA, NA, N~
#> $ abroncs <chr> "Abroncs", "Abroncs", "Abroncs", "Ab~
#> $ nyari_gumi_meret <chr> "235/55 R 18", "205/60 R 16", "185/5~
#> $ szin <chr> NA, "Fehér", "Bíborvörös (metál)", "~
#> $ karpit_szine_1 <chr> NA, "Fekete", "Fekete", "Fekete", NA~
#> $ alaptipus_ara <chr> NA, NA, "4 440 000 Ft", "5 800 000 F~
#> $ extrakkal_novelt_ar <chr> NA, NA, "5 220 000 Ft", "6 095 000 F~
#> $ akcios_ar <chr> NA, NA, "4 890 000 Ft", "5 785 000 F~
#> $ akcio_feltetelei <chr> NA, NA, "Hűség program regisztráció,~
#> $ karpit_szine_2 <chr> NA, NA, NA, "Sötétszürke", NA, NA, N~
#> $ kezdoreszlet <chr> NA, NA, NA, NA, NA, NA, NA, "1 200 0~
#> $ havi_reszlet <chr> NA, NA, NA, NA, NA, NA, NA, "69 990 ~
#> $ futamido <chr> NA, NA, NA, NA, NA, NA, NA, "61 hóna~
#> $ teto <chr> NA, NA, NA, NA, NA, NA, NA, "Lemezte~
#> $ teli_gumi_meret <chr> NA, NA, NA, NA, NA, NA, NA, "205/60 ~
#> $ kotelezo_es_casco_dijak_hirdetes <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ alvazszam <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ ellenorzom <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ futasido <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ garancia_adatai <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ garancia_megnevezese <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ garancia_lejarata <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ hatso_nyari_gumi_meret <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ hatso_teli_gumi_meret <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ berlesi_lehetoseg <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
Currently, all variable characters are type, so be sure to manipulate the data. Probably most columns are not needed. Lets create some new variables!
cars_data_df %>%
transmute(
ad_title,
url_to_cars,
price = vetelar_eur, # translate the column names :)
year = evjarat,
weight = teljes_tomeg,
n_doors = ajtok_szama,
fuel = uzemanyag,
gear = sebessegvalto_fajtaja,
mileage = kilometerora_allasa
)
#> # A tibble: 239 x 9
#> ad_title url_to_cars price year weight n_doors fuel gear mileage
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 SSANGYONG KO~ https://www.ha~ € 19 ~ 2021~ 2 010~ 5 Benz~ Manu~ 13 km
#> 2 CITROEN GRAN~ https://www.ha~ € 21 ~ 2020~ 1 935~ 5 Benz~ Manu~ Nincs ~
#> 3 SUZUKI SWIFT~ https://www.ha~ € 13 ~ 2020 1 365~ 5 Hibr~ Manu~ Nincs ~
#> 4 PEUGEOT 208 ~ https://www.ha~ € 15 ~ 2021 1 595~ 5 Benz~ Manu~ 2 km
#> 5 OPEL COMBO L~ https://www.ha~ € 11 ~ 2021 2 280~ 5 Benz~ Manu~ 2 km
#> 6 OPEL INSIGNI~ https://www.ha~ € 18 ~ 2021 2 160~ 5 Benz~ Manu~ Nincs ~
#> 7 OPEL ZAFIRA ~ https://www.ha~ € 22 ~ 2021 2 635~ 4 Dízel Manu~ Nincs ~
#> 8 SUZUKI VITAR~ https://www.ha~ € 16 ~ 2021 1 770~ 5 Benz~ Manu~ 1 km
#> 9 SUZUKI IGNIS~ https://www.ha~ € 13 ~ 2021 1 330~ 5 Benz~ Manu~ Nincs ~
#> 10 CITROEN C5 A~ https://www.ha~ € 23 ~ 2021~ 1 940~ 5 Benz~ Manu~ 2 km
#> # ... with 229 more rows
The transmute
function is like applying theselect
and mutate
functions at the same time. We can create new variables with it, but only those that are defined in it will remain. However, it is worth cleaning the data in one step here.
cars_data_df %>%
transmute(
ad_title,
price = str_remove_all(vetelar_eur, "\\D"), # remove non-digit chr
year = gsub("/.*", "", evjarat), # remove after / sign
weight = str_remove_all(teljes_tomeg, "\\D"), # remove non-digit chr
n_doors = str_remove_all(ajtok_szama, "\\D"), # remove non-digit chr
fuel = uzemanyag, # fuel type
gear = str_detect(sebessegvalto_fajtaja, "Manuális"), # is manual mentioned?
mileage = str_remove_all(kilometerora_allasa, "\\D") # remove non-digit chr
)
#> # A tibble: 239 x 8
#> ad_title price year weight n_doors fuel gear mileage
#> <chr> <chr> <chr> <chr> <chr> <chr> <lgl> <chr>
#> 1 SSANGYONG KORANDO 1.5 Turbo e~ 19204 2021 2010 5 Benz~ TRUE "13"
#> 2 CITROEN GRAND C4 SPACETOURER ~ 21347 2020 1935 5 Benz~ TRUE ""
#> 3 SUZUKI SWIFT 1.2 Dualjet Hybr~ 13283 2020 1365 5 Hibr~ TRUE ""
#> 4 PEUGEOT 208 1.2 PureTech Acti~ 15873 2021 1595 5 Benz~ TRUE "2"
#> 5 OPEL COMBO Life 1.2 T Enjoy 2~ 11483 2021 2280 5 Benz~ TRUE "2"
#> 6 OPEL INSIGNIA Grand Sport 1.4~ 18271 2021 2160 5 Benz~ TRUE ""
#> 7 OPEL ZAFIRA LIFE 1.5 D Busine~ 22879 2021 2635 4 Dízel TRUE ""
#> 8 SUZUKI VITARA 1.4 Hybrid GL+ 16271 2021 1770 5 Benz~ TRUE "1"
#> 9 SUZUKI IGNIS 1.2 Hybrid GL+ 13307 2021 1330 5 Benz~ TRUE ""
#> 10 CITROEN C5 AIRCROSS 1.2 PureT~ 23597 2021 1940 5 Benz~ TRUE "2"
#> # ... with 229 more rows
But our variables are still character type…
df <- cars_data_df %>%
transmute(
ad_title,
price = str_remove_all(vetelar_eur, "\\D"), # remove non-digit chr
year = gsub("/.*", "", evjarat), # remove after / sign
weight = str_remove_all(teljes_tomeg, "\\D"), # remove non-digit chr
n_doors = str_remove_all(ajtok_szama, "\\D"), # remove non-digit chr
fuel = uzemanyag, # fuel type
gear = str_detect(sebessegvalto_fajtaja, "Manuális"), # is manual mentioned?
mileage = str_remove_all(kilometerora_allasa, "\\D") # remove non-digit chr
) %>%
mutate_at(vars("price", "year", "weight", "n_doors", "mileage"), as.numeric)
df
#> # A tibble: 239 x 8
#> ad_title price year weight n_doors fuel gear mileage
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <lgl> <dbl>
#> 1 SSANGYONG KORANDO 1.5 Turbo e~ 19204 2021 2010 5 Benz~ TRUE 13
#> 2 CITROEN GRAND C4 SPACETOURER ~ 21347 2020 1935 5 Benz~ TRUE NA
#> 3 SUZUKI SWIFT 1.2 Dualjet Hybr~ 13283 2020 1365 5 Hibr~ TRUE NA
#> 4 PEUGEOT 208 1.2 PureTech Acti~ 15873 2021 1595 5 Benz~ TRUE 2
#> 5 OPEL COMBO Life 1.2 T Enjoy 2~ 11483 2021 2280 5 Benz~ TRUE 2
#> 6 OPEL INSIGNIA Grand Sport 1.4~ 18271 2021 2160 5 Benz~ TRUE NA
#> 7 OPEL ZAFIRA LIFE 1.5 D Busine~ 22879 2021 2635 4 Dízel TRUE NA
#> 8 SUZUKI VITARA 1.4 Hybrid GL+ 16271 2021 1770 5 Benz~ TRUE 1
#> 9 SUZUKI IGNIS 1.2 Hybrid GL+ 13307 2021 1330 5 Benz~ TRUE NA
#> 10 CITROEN C5 AIRCROSS 1.2 PureT~ 23597 2021 1940 5 Benz~ TRUE 2
#> # ... with 229 more rows
5.2 Simple methods
Now the types are right. How much data is missing? Check with the visdat package.
visdat::vis_miss(df)
I suggest that we arrange the table according to the existence of the deficit at this time.
df %>%
arrange(!is.na(mileage)) # rows w missing mileage 1st
#> # A tibble: 239 x 8
#> ad_title price year weight n_doors fuel gear mileage
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <lgl> <dbl>
#> 1 CITROEN GRAND C4 SPACETOURER~ 21347 2020 1935 5 Benzin TRUE NA
#> 2 SUZUKI SWIFT 1.2 Dualjet Hyb~ 13283 2020 1365 5 Hibrid TRUE NA
#> 3 OPEL INSIGNIA Grand Sport 1.~ 18271 2021 2160 5 Benzin TRUE NA
#> 4 OPEL ZAFIRA LIFE 1.5 D Busin~ 22879 2021 2635 4 Dízel TRUE NA
#> 5 SUZUKI IGNIS 1.2 Hybrid GL+ 13307 2021 1330 5 Benzin TRUE NA
#> 6 OPEL CROSSLAND 1.2 Edition 13234 2021 1690 5 Benzin TRUE NA
#> 7 BMW I3 120Ah (Automata) ZÖLD~ 38207 2021 1710 5 Elekt~ FALSE NA
#> 8 OPEL MOKKA 1.2 T Edition CÉG~ 14792 2021 1700 5 Benzin TRUE NA
#> 9 BMW 520d xDrive (Automata) M~ 62394 2021 2405 4 Dízel FALSE NA
#> 10 OPEL CORSA F 1.2 Edition CÉG~ 10739 2021 1550 5 Benzin TRUE NA
#> # ... with 229 more rows
These are very new cars so our best assumption is that here the mileage
is 0.
df %>%
arrange(!is.na(mileage)) %>% # rows w missing mileage 1st
replace_na(list(mileage = 0))
#> # A tibble: 239 x 8
#> ad_title price year weight n_doors fuel gear mileage
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <lgl> <dbl>
#> 1 CITROEN GRAND C4 SPACETOURER~ 21347 2020 1935 5 Benzin TRUE 0
#> 2 SUZUKI SWIFT 1.2 Dualjet Hyb~ 13283 2020 1365 5 Hibrid TRUE 0
#> 3 OPEL INSIGNIA Grand Sport 1.~ 18271 2021 2160 5 Benzin TRUE 0
#> 4 OPEL ZAFIRA LIFE 1.5 D Busin~ 22879 2021 2635 4 Dízel TRUE 0
#> 5 SUZUKI IGNIS 1.2 Hybrid GL+ 13307 2021 1330 5 Benzin TRUE 0
#> 6 OPEL CROSSLAND 1.2 Edition 13234 2021 1690 5 Benzin TRUE 0
#> 7 BMW I3 120Ah (Automata) ZÖLD~ 38207 2021 1710 5 Elekt~ FALSE 0
#> 8 OPEL MOKKA 1.2 T Edition CÉG~ 14792 2021 1700 5 Benzin TRUE 0
#> 9 BMW 520d xDrive (Automata) M~ 62394 2021 2405 4 Dízel FALSE 0
#> 10 OPEL CORSA F 1.2 Edition CÉG~ 10739 2021 1550 5 Benzin TRUE 0
#> # ... with 229 more rows
In the case of weight
it would be more reasonable to replace missing observations with the average.
df %>%
arrange(!is.na(weight)) %>% # rows w missing weight 1st
mutate(weight = ifelse(is.na(weight), mean(weight, na.rm = TRUE), weight))
#> # A tibble: 239 x 8
#> ad_title price year weight n_doors fuel gear mileage
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <lgl> <dbl>
#> 1 MERCEDES-BENZ E 63 AMG 4 M S.~ 90517 2019 2099. 4 Benz~ FALSE 57000
#> 2 SSANGYONG KORANDO 1.5 Turbo e~ 19204 2021 2010 5 Benz~ TRUE 13
#> 3 CITROEN GRAND C4 SPACETOURER ~ 21347 2020 1935 5 Benz~ TRUE NA
#> 4 SUZUKI SWIFT 1.2 Dualjet Hybr~ 13283 2020 1365 5 Hibr~ TRUE NA
#> 5 PEUGEOT 208 1.2 PureTech Acti~ 15873 2021 1595 5 Benz~ TRUE 2
#> 6 OPEL COMBO Life 1.2 T Enjoy 2~ 11483 2021 2280 5 Benz~ TRUE 2
#> 7 OPEL INSIGNIA Grand Sport 1.4~ 18271 2021 2160 5 Benz~ TRUE NA
#> 8 OPEL ZAFIRA LIFE 1.5 D Busine~ 22879 2021 2635 4 Dízel TRUE NA
#> 9 SUZUKI VITARA 1.4 Hybrid GL+ 16271 2021 1770 5 Benz~ TRUE 1
#> 10 SUZUKI IGNIS 1.2 Hybrid GL+ 13307 2021 1330 5 Benz~ TRUE NA
#> # ... with 229 more rows
The syntax used above works as follows: if NA
, its value should be equal to the average value, else, it should be equal to itself. Although we could have switched to the median
.
df %>%
arrange(!is.na(weight)) %>% # rows w missing weight 1st
mutate(weight = ifelse(is.na(weight), median(weight, na.rm = TRUE), weight))
#> # A tibble: 239 x 8
#> ad_title price year weight n_doors fuel gear mileage
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <lgl> <dbl>
#> 1 MERCEDES-BENZ E 63 AMG 4 M S.~ 90517 2019 2068. 4 Benz~ FALSE 57000
#> 2 SSANGYONG KORANDO 1.5 Turbo e~ 19204 2021 2010 5 Benz~ TRUE 13
#> 3 CITROEN GRAND C4 SPACETOURER ~ 21347 2020 1935 5 Benz~ TRUE NA
#> 4 SUZUKI SWIFT 1.2 Dualjet Hybr~ 13283 2020 1365 5 Hibr~ TRUE NA
#> 5 PEUGEOT 208 1.2 PureTech Acti~ 15873 2021 1595 5 Benz~ TRUE 2
#> 6 OPEL COMBO Life 1.2 T Enjoy 2~ 11483 2021 2280 5 Benz~ TRUE 2
#> 7 OPEL INSIGNIA Grand Sport 1.4~ 18271 2021 2160 5 Benz~ TRUE NA
#> 8 OPEL ZAFIRA LIFE 1.5 D Busine~ 22879 2021 2635 4 Dízel TRUE NA
#> 9 SUZUKI VITARA 1.4 Hybrid GL+ 16271 2021 1770 5 Benz~ TRUE 1
#> 10 SUZUKI IGNIS 1.2 Hybrid GL+ 13307 2021 1330 5 Benz~ TRUE NA
#> # ... with 229 more rows
5.3 Advanced methods
Is this kind of imputation correct for the other numerical variables as well? Then perform the operation on all numeric columns.
df %>%
arrange(!is.na(weight)) %>% # rows w missing weight 1st
mutate_if(is.numeric, ~ ifelse(is.na(.), median(., na.rm = TRUE), .))
#> # A tibble: 239 x 8
#> ad_title price year weight n_doors fuel gear mileage
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <lgl> <dbl>
#> 1 MERCEDES-BENZ E 63 AMG 4 M S.~ 90517 2019 2068. 4 Benz~ FALSE 57000
#> 2 SSANGYONG KORANDO 1.5 Turbo e~ 19204 2021 2010 5 Benz~ TRUE 13
#> 3 CITROEN GRAND C4 SPACETOURER ~ 21347 2020 1935 5 Benz~ TRUE 31431
#> 4 SUZUKI SWIFT 1.2 Dualjet Hybr~ 13283 2020 1365 5 Hibr~ TRUE 31431
#> 5 PEUGEOT 208 1.2 PureTech Acti~ 15873 2021 1595 5 Benz~ TRUE 2
#> 6 OPEL COMBO Life 1.2 T Enjoy 2~ 11483 2021 2280 5 Benz~ TRUE 2
#> 7 OPEL INSIGNIA Grand Sport 1.4~ 18271 2021 2160 5 Benz~ TRUE 31431
#> 8 OPEL ZAFIRA LIFE 1.5 D Busine~ 22879 2021 2635 4 Dízel TRUE 31431
#> 9 SUZUKI VITARA 1.4 Hybrid GL+ 16271 2021 1770 5 Benz~ TRUE 1
#> 10 SUZUKI IGNIS 1.2 Hybrid GL+ 13307 2021 1330 5 Benz~ TRUE 31431
#> # ... with 229 more rows
It is a completely different approach to try to estimate the value of the missing variable from the values of the other variables, but this can often be more advantageous.
The R mice package serves this purpose, the methodology of which is already based on serious academic research. With the help of the package, we can easily use machine-learning-based tools to estimate our missing values. But beware: the proportion of our missing data should not exceed 10% as much as possible, since then we would estimate the unknown from the unknown.
Of the engines on which the estimate is based, I recommend the random forest, but find out which one is more useful to you in a given case when it comes to an important publication.
library(mice)
df %>%
mice %>%
complete() %>% # return the estimated values
tibble() # convert it to a nice looking data frame
#>
#> iter imp variable
#> 1 1 year weight gear mileage
#> 1 2 year weight gear mileage
#> 1 3 year weight gear mileage
#> 1 4 year weight gear mileage
#> 1 5 year weight gear mileage
#> 2 1 year weight gear mileage
#> 2 2 year weight gear mileage
#> 2 3 year weight gear mileage
#> 2 4 year weight gear mileage
#> 2 5 year weight gear mileage
#> 3 1 year weight gear mileage
#> 3 2 year weight gear mileage
#> 3 3 year weight gear mileage
#> 3 4 year weight gear mileage
#> 3 5 year weight gear mileage
#> 4 1 year weight gear mileage
#> 4 2 year weight gear mileage
#> 4 3 year weight gear mileage
#> 4 4 year weight gear mileage
#> 4 5 year weight gear mileage
#> 5 1 year weight gear mileage
#> 5 2 year weight gear mileage
#> 5 3 year weight gear mileage
#> 5 4 year weight gear mileage
#> 5 5 year weight gear mileage
#> # A tibble: 239 x 8
#> ad_title price year weight n_doors fuel gear mileage
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 SSANGYONG KORANDO 1.5 Turbo e~ 19204 2021 2010 5 Benz~ 1 13
#> 2 CITROEN GRAND C4 SPACETOURER ~ 21347 2020 1935 5 Benz~ 1 13
#> 3 SUZUKI SWIFT 1.2 Dualjet Hybr~ 13283 2020 1365 5 Hibr~ 1 6050
#> 4 PEUGEOT 208 1.2 PureTech Acti~ 15873 2021 1595 5 Benz~ 1 2
#> 5 OPEL COMBO Life 1.2 T Enjoy 2~ 11483 2021 2280 5 Benz~ 1 2
#> 6 OPEL INSIGNIA Grand Sport 1.4~ 18271 2021 2160 5 Benz~ 1 43380
#> 7 OPEL ZAFIRA LIFE 1.5 D Busine~ 22879 2021 2635 4 Dízel 1 89576
#> 8 SUZUKI VITARA 1.4 Hybrid GL+ 16271 2021 1770 5 Benz~ 1 1
#> 9 SUZUKI IGNIS 1.2 Hybrid GL+ 13307 2021 1330 5 Benz~ 1 17000
#> 10 CITROEN C5 AIRCROSS 1.2 PureT~ 23597 2021 1940 5 Benz~ 1 2
#> # ... with 229 more rows
Now we have a clean data frame, without any missing value!