data manipulation with python.
Cleaning & Filtering cars dataset for all car models is Asia, Europe & USA
View On Github
Code
# first let's import the nessary libraries
import pandas as pd
import numpy as np
# let's get our data
df = pd.read_csv("Cars Data1.csv")
df
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
0 Acura MDX SUV Asia All $36,945 $33,337 3.5 6.0 265.0 17.0 23.0 4451.0 106.0 189.0
1 Acura RSX Type S 2dr Sedan Asia Front $23,820 $21,761 2.0 4.0 200.0 24.0 31.0 2778.0 101.0 172.0
2 Acura TSX 4dr Sedan Asia Front $26,990 $24,647 2.4 4.0 200.0 22.0 29.0 3230.0 105.0 183.0
3 Acura TL 4dr Sedan Asia Front $33,195 $30,299 3.2 6.0 270.0 20.0 28.0 3575.0 108.0 186.0
4 Acura 3.5 RL 4dr Sedan Asia Front $43,755 $39,014 3.5 6.0 225.0 18.0 24.0 3880.0 115.0 197.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
427 Volvo C70 LPT convertible 2dr Sedan Europe Front $40,565 $38,203 2.4 5.0 197.0 21.0 28.0 3450.0 105.0 186.0
428 Volvo C70 HPT convertible 2dr Sedan Europe Front $42,565 $40,083 2.3 5.0 242.0 20.0 26.0 3450.0 105.0 186.0
429 Volvo S80 T6 4dr Sedan Europe Front $45,210 $42,573 2.9 6.0 268.0 19.0 26.0 3653.0 110.0 190.0
430 Volvo V40 Wagon Europe Front $26,135 $24,641 1.9 4.0 170.0 22.0 29.0 2822.0 101.0 180.0
431 Volvo XC70 Wagon Europe All $35,145 $33,112 2.5 5.0 208.0 20.0 27.0 3823.0 109.0 186.0
432 rows × 15 columns
# let's see if there is any missing values
pd.set_option('display.max_row', 5)
for col in df.columns:
p = np.mean(df[col].isnull())
print("{} - {}%".format(df[col], round(p*100, 2)))
0 Acura
1 Acura
...
430 Volvo
431 Volvo
Name: Make, Length: 432, dtype: object - 0.93%
0 MDX
1 RSX Type S 2dr
...
430 V40
431 XC70
Name: Model, Length: 432, dtype: object - 0.93%
0 SUV
1 Sedan
...
430 Wagon
431 Wagon
Name: Type, Length: 432, dtype: object - 0.93%
0 Asia
1 Asia
...
430 Europe
431 Europe
Name: Origin, Length: 432, dtype: object - 0.93%
0 All
1 Front
...
430 Front
431 All
Name: DriveTrain, Length: 432, dtype: object - 0.93%
0 $36,945
1 $23,820
...
430 $26,135
431 $35,145
Name: MSRP, Length: 432, dtype: object - 0.93%
0 $33,337
1 $21,761
...
430 $24,641
431 $33,112
Name: Invoice, Length: 432, dtype: object - 0.93%
0 3.5
1 2.0
...
430 1.9
431 2.5
Name: EngineSize, Length: 432, dtype: float64 - 0.93%
0 6.0
1 4.0
...
430 4.0
431 5.0
Name: Cylinders, Length: 432, dtype: float64 - 1.39%
0 265.0
1 200.0
...
430 170.0
431 208.0
Name: Horsepower, Length: 432, dtype: float64 - 0.93%
0 17.0
1 24.0
...
430 22.0
431 20.0
Name: MPG_City, Length: 432, dtype: float64 - 0.93%
0 23.0
1 31.0
...
430 29.0
431 27.0
Name: MPG_Highway, Length: 432, dtype: float64 - 0.93%
0 4451.0
1 2778.0
...
430 2822.0
431 3823.0
Name: Weight, Length: 432, dtype: float64 - 0.93%
0 106.0
1 101.0
...
430 101.0
431 109.0
Name: Wheelbase, Length: 432, dtype: float64 - 0.93%
0 189.0
1 172.0
...
430 180.0
431 186.0
Name: Length, Length: 432, dtype: float64 - 0.93%
# all of them have null value so let's count them by simply
df.isnull().sum()
Make 4
Model 4
Type 4
Origin 4
DriveTrain 4
MSRP 4
Invoice 4
EngineSize 4
Cylinders 6
Horsepower 4
MPG_City 4
MPG_Highway 4
Weight 4
Wheelbase 4
Length 4
dtype: int64
df.head(1)
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
0 Acura MDX SUV Asia All $36,945 $33,337 3.5 6.0 265.0 17.0 23.0 4451.0 106.0 189.0
df[df["Make"].isnull()]
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
30 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
39 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
161 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
173 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
df["Cylinders"] = df["Cylinders"].fillna(df["Cylinders"].mean())
df[df["Make"].isnull()]
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
30 NaN NaN NaN NaN NaN NaN NaN NaN 5.807512 NaN NaN NaN NaN NaN NaN
39 NaN NaN NaN NaN NaN NaN NaN NaN 5.807512 NaN NaN NaN NaN NaN NaN
161 NaN NaN NaN NaN NaN NaN NaN NaN 5.807512 NaN NaN NaN NaN NaN NaN
173 NaN NaN NaN NaN NaN NaN NaN NaN 5.807512 NaN NaN NaN NaN NaN NaN
df = df.dropna()
df
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
0 Acura MDX SUV Asia All $36,945 $33,337 3.5 6.0 265.0 17.0 23.0 4451.0 106.0 189.0
1 Acura RSX Type S 2dr Sedan Asia Front $23,820 $21,761 2.0 4.0 200.0 24.0 31.0 2778.0 101.0 172.0
2 Acura TSX 4dr Sedan Asia Front $26,990 $24,647 2.4 4.0 200.0 22.0 29.0 3230.0 105.0 183.0
3 Acura TL 4dr Sedan Asia Front $33,195 $30,299 3.2 6.0 270.0 20.0 28.0 3575.0 108.0 186.0
4 Acura 3.5 RL 4dr Sedan Asia Front $43,755 $39,014 3.5 6.0 225.0 18.0 24.0 3880.0 115.0 197.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
427 Volvo C70 LPT convertible 2dr Sedan Europe Front $40,565 $38,203 2.4 5.0 197.0 21.0 28.0 3450.0 105.0 186.0
428 Volvo C70 HPT convertible 2dr Sedan Europe Front $42,565 $40,083 2.3 5.0 242.0 20.0 26.0 3450.0 105.0 186.0
429 Volvo S80 T6 4dr Sedan Europe Front $45,210 $42,573 2.9 6.0 268.0 19.0 26.0 3653.0 110.0 190.0
430 Volvo V40 Wagon Europe Front $26,135 $24,641 1.9 4.0 170.0 22.0 29.0 2822.0 101.0 180.0
431 Volvo XC70 Wagon Europe All $35,145 $33,112 2.5 5.0 208.0 20.0 27.0 3823.0 109.0 186.0
428 rows × 15 columns
# four empty rows removed and two empty Cylinders coulmns filled with the average
pd.set_option('display.max_row', 38)
df.isnull().sum()
Make 0
Model 0
Type 0
Origin 0
DriveTrain 0
MSRP 0
Invoice 0
EngineSize 0
Cylinders 0
Horsepower 0
MPG_City 0
MPG_Highway 0
Weight 0
Wheelbase 0
Length 0
dtype: int64
df["Make"].value_counts()
Toyota 28
Chevrolet 27
Mercedes-Benz 26
Ford 23
BMW 20
Audi 19
Honda 17
Nissan 17
Volkswagen 15
Chrysler 15
Dodge 13
Mitsubishi 13
Volvo 12
Jaguar 12
Hyundai 12
Subaru 11
Pontiac 11
Mazda 11
Lexus 11
Kia 11
Buick 9
Mercury 9
Lincoln 9
Saturn 8
Cadillac 8
Suzuki 8
Infiniti 8
GMC 8
Acura 7
Porsche 7
Saab 7
Land Rover 3
Oldsmobile 3
Jeep 3
Scion 2
Isuzu 2
MINI 2
Hummer 1
Name: Make, dtype: int64
pd.set_option('display.max_row', 15)
df.head(1)
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
0 Acura MDX SUV Asia All $36,945 $33,337 3.5 6.0 265.0 17.0 23.0 4451.0 106.0 189.0
df[["Cylinders", "Horsepower", "MPG_City", "MPG_Highway", "Weight", "Wheelbase", "Length"]] = df[["Cylinders", "Horsepower", "MPG_City", "MPG_Highway", "Weight", "Wheelbase", "Length"]].astype("int64")
df.head()
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
0 Acura MDX SUV Asia All $36,945 $33,337 3.5 6 265 17 23.0 4451 106 189
1 Acura RSX Type S 2dr Sedan Asia Front $23,820 $21,761 2.0 4 200 24 31.0 2778 101 172
2 Acura TSX 4dr Sedan Asia Front $26,990 $24,647 2.4 4 200 22 29.0 3230 105 183
3 Acura TL 4dr Sedan Asia Front $33,195 $30,299 3.2 6 270 20 28.0 3575 108 186
4 Acura 3.5 RL 4dr Sedan Asia Front $43,755 $39,014 3.5 6 225 18 24.0 3880 115 197
df["Origin"].value_counts()
Asia 158
USA 147
Europe 123
Name: Origin, dtype: int64
# filtering ; show all the data where the origin is asia or europe
new_data = df[df["Origin"].isin(["Asia" or "Europe"])]
new_data
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
0 Acura MDX SUV Asia All $36,945 $33,337 3.5 6 265 17 23 4451 106 189
1 Acura RSX Type S 2dr Sedan Asia Front $23,820 $21,761 2.0 4 200 24 31 2778 101 172
2 Acura TSX 4dr Sedan Asia Front $26,990 $24,647 2.4 4 200 22 29 3230 105 183
3 Acura TL 4dr Sedan Asia Front $33,195 $30,299 3.2 6 270 20 28 3575 108 186
4 Acura 3.5 RL 4dr Sedan Asia Front $43,755 $39,014 3.5 6 225 18 24 3880 115 197
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
400 Toyota MR2 Spyder convertible 2dr Sports Asia Rear $25,130 $22,787 1.8 4 138 26 32 2195 97 153
401 Toyota Tacoma Truck Asia Rear $12,800 $11,879 2.4 4 142 22 27 2750 103 191
402 Toyota Tundra Regular Cab V6 Truck Asia Rear $16,495 $14,978 3.4 6 190 16 18 3925 128 218
403 Toyota Tundra Access Cab V6 SR5 Truck Asia All $25,935 $23,520 3.4 6 190 14 17 4435 128 218
404 Toyota Matrix XR Wagon Asia Front $16,695 $15,156 1.8 4 130 29 36 2679 102 171
158 rows × 15 columns
df[(df["Weight"]> 4000)]
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
0 Acura MDX SUV Asia All $36,945 $33,337 3.5 6 265 17 23 4451 106 189
15 Audi A4 3.0 Quattro convertible 2dr Sedan Europe All $44,240 $40,075 3.0 6 220 18 25 4013 105 180
17 Audi A6 4.2 Quattro 4dr Sedan Europe All $49,690 $44,936 4.2 8 300 17 24 4024 109 193
18 Audi A8 L Quattro 4dr Sedan Europe All $69,190 $64,740 4.2 8 330 17 24 4399 121 204
20 Audi RS 6 4dr Sports Europe Front $84,600 $76,417 4.2 8 450 15 22 4024 109 191
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
405 Volkswagen Touareg V6 SUV Europe All $35,515 $32,243 3.2 6 220 15 20 5086 112 187
415 Volkswagen Phaeton 4dr Sedan Europe Front $65,000 $59,912 4.2 8 335 16 22 5194 118 204
416 Volkswagen Phaeton W12 4dr Sedan Europe Front $75,000 $69,130 6.0 12 420 12 19 5399 118 204
419 Volkswagen Passat W8 Wagon Europe Front $40,235 $36,956 4.0 8 270 18 25 4067 106 184
420 Volvo XC90 T6 SUV Europe All $41,250 $38,851 2.9 6 268 15 20 4638 113 189
103 rows × 15 columns
df[~(df["Weight"]> 4000)]
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
1 Acura RSX Type S 2dr Sedan Asia Front $23,820 $21,761 2.0 4 200 24 31 2778 101 172
2 Acura TSX 4dr Sedan Asia Front $26,990 $24,647 2.4 4 200 22 29 3230 105 183
3 Acura TL 4dr Sedan Asia Front $33,195 $30,299 3.2 6 270 20 28 3575 108 186
4 Acura 3.5 RL 4dr Sedan Asia Front $43,755 $39,014 3.5 6 225 18 24 3880 115 197
5 Acura 3.5 RL w/Navigation 4dr Sedan Asia Front $46,100 $41,100 3.5 6 225 18 24 3893 115 197
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
427 Volvo C70 LPT convertible 2dr Sedan Europe Front $40,565 $38,203 2.4 5 197 21 28 3450 105 186
428 Volvo C70 HPT convertible 2dr Sedan Europe Front $42,565 $40,083 2.3 5 242 20 26 3450 105 186
429 Volvo S80 T6 4dr Sedan Europe Front $45,210 $42,573 2.9 6 268 19 26 3653 110 190
430 Volvo V40 Wagon Europe Front $26,135 $24,641 1.9 4 170 22 29 2822 101 180
431 Volvo XC70 Wagon Europe All $35,145 $33,112 2.5 5 208 20 27 3823 109 186
325 rows × 15 columns
# let's increase the values of the coulmn "MPG_Highway" by 3
df["MPG_City"] = df["MPG_City"].apply(lambda x:x+3)
df
Make Model Type Origin DriveTrain MSRP Invoice EngineSize Cylinders Horsepower MPG_City MPG_Highway Weight Wheelbase Length
0 Acura MDX SUV Asia All $36,945 $33,337 3.5 6 265 20 23 4451 106 189
1 Acura RSX Type S 2dr Sedan Asia Front $23,820 $21,761 2.0 4 200 27 31 2778 101 172
2 Acura TSX 4dr Sedan Asia Front $26,990 $24,647 2.4 4 200 25 29 3230 105 183
3 Acura TL 4dr Sedan Asia Front $33,195 $30,299 3.2 6 270 23 28 3575 108 186
4 Acura 3.5 RL 4dr Sedan Asia Front $43,755 $39,014 3.5 6 225 21 24 3880 115 197
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
427 Volvo C70 LPT convertible 2dr Sedan Europe Front $40,565 $38,203 2.4 5 197 24 28 3450 105 186
428 Volvo C70 HPT convertible 2dr Sedan Europe Front $42,565 $40,083 2.3 5 242 23 26 3450 105 186
429 Volvo S80 T6 4dr Sedan Europe Front $45,210 $42,573 2.9 6 268 22 26 3653 110 190
430 Volvo V40 Wagon Europe Front $26,135 $24,641 1.9 4 170 25 29 2822 101 180
431 Volvo XC70 Wagon Europe All $35,145 $33,112 2.5 5 208 23 27 3823 109 186
428 rows × 15 columns