Curso - Data Analysis With Python - IBM Cognitive Class

Formação: Applied Data Science With Python

Victor Hugo Negrisoli - Desenvolvedor de Software Full-Stack & Analista de Dados

Módulo 1 - Importando dados com Pandas e analisando o DataFrame

In [329]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
In [227]:
# Importando os dados de veículos de 1985 como dataset de estudo

file = "dados/imports-85.data"

df = pd.read_csv(file, header = None)
df.head(5)
Out[227]:
0 1 2 3 4 5 6 7 8 9 ... 16 17 18 19 20 21 22 23 24 25
0 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 ? alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450

5 rows × 26 columns

In [228]:
# O CSV está sem os valores de coluna, ou seja, sem cabeçalhos (headers), estes serão definidos abaixo pela seguinte lista:

headers = [
    "symboling",
    "normalized-losses",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price"
]

# Definindo a lista de headers como a lista de cabeçalho do DataFrame

df.columns = headers
In [229]:
df.head(5)
Out[229]:
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base ... engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price
0 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 ? alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450

5 rows × 26 columns

In [230]:
df.describe()
Out[230]:
symboling wheel-base length width height curb-weight engine-size compression-ratio city-mpg highway-mpg
count 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000
mean 0.834146 98.756585 174.049268 65.907805 53.724878 2555.565854 126.907317 10.142537 25.219512 30.751220
std 1.245307 6.021776 12.337289 2.145204 2.443522 520.680204 41.642693 3.972040 6.542142 6.886443
min -2.000000 86.600000 141.100000 60.300000 47.800000 1488.000000 61.000000 7.000000 13.000000 16.000000
25% 0.000000 94.500000 166.300000 64.100000 52.000000 2145.000000 97.000000 8.600000 19.000000 25.000000
50% 1.000000 97.000000 173.200000 65.500000 54.100000 2414.000000 120.000000 9.000000 24.000000 30.000000
75% 2.000000 102.400000 183.100000 66.900000 55.500000 2935.000000 141.000000 9.400000 30.000000 34.000000
max 3.000000 120.900000 208.100000 72.300000 59.800000 4066.000000 326.000000 23.000000 49.000000 54.000000
In [231]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    205 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 205 non-null object
stroke               205 non-null object
compression-ratio    205 non-null float64
horsepower           205 non-null object
peak-rpm             205 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                205 non-null object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.8+ KB
In [232]:
df.dtypes
Out[232]:
symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object
In [233]:
df.describe(include = "all")
Out[233]:
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base ... engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price
count 205.000000 205 205 205 205 205 205 205 205 205.000000 ... 205.000000 205 205 205 205.000000 205 205 205.000000 205.000000 205
unique NaN 52 22 2 2 3 5 3 2 NaN ... NaN 8 39 37 NaN 60 24 NaN NaN 187
top NaN ? toyota gas std four sedan fwd front NaN ... NaN mpfi 3.62 3.40 NaN 68 5500 NaN NaN ?
freq NaN 41 32 185 168 114 96 120 202 NaN ... NaN 94 23 20 NaN 19 37 NaN NaN 4
mean 0.834146 NaN NaN NaN NaN NaN NaN NaN NaN 98.756585 ... 126.907317 NaN NaN NaN 10.142537 NaN NaN 25.219512 30.751220 NaN
std 1.245307 NaN NaN NaN NaN NaN NaN NaN NaN 6.021776 ... 41.642693 NaN NaN NaN 3.972040 NaN NaN 6.542142 6.886443 NaN
min -2.000000 NaN NaN NaN NaN NaN NaN NaN NaN 86.600000 ... 61.000000 NaN NaN NaN 7.000000 NaN NaN 13.000000 16.000000 NaN
25% 0.000000 NaN NaN NaN NaN NaN NaN NaN NaN 94.500000 ... 97.000000 NaN NaN NaN 8.600000 NaN NaN 19.000000 25.000000 NaN
50% 1.000000 NaN NaN NaN NaN NaN NaN NaN NaN 97.000000 ... 120.000000 NaN NaN NaN 9.000000 NaN NaN 24.000000 30.000000 NaN
75% 2.000000 NaN NaN NaN NaN NaN NaN NaN NaN 102.400000 ... 141.000000 NaN NaN NaN 9.400000 NaN NaN 30.000000 34.000000 NaN
max 3.000000 NaN NaN NaN NaN NaN NaN NaN NaN 120.900000 ... 326.000000 NaN NaN NaN 23.000000 NaN NaN 49.000000 54.000000 NaN

11 rows × 26 columns

Módulo 2 - Técnicas de Data Wrangling para tratamento de dados com Pandas

In [234]:
# Trabalhando com missing data, ou valores nulos, NaN, entre outros

df = df.replace('NaN', '')
df = df.replace("?", "0")
df = df.fillna(0)
In [235]:
# Criando uma nova coluna para converter a coluna city-mpg para city-L/100km

df["city-L/100km"] = 235 / df["city-mpg"]
df.head()
Out[235]:
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base ... fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price city-L/100km
0 3 0 alfa-romero gas std two convertible rwd front 88.6 ... mpfi 3.47 2.68 9.0 111 5000 21 27 13495 11.190476
1 3 0 alfa-romero gas std two convertible rwd front 88.6 ... mpfi 3.47 2.68 9.0 111 5000 21 27 16500 11.190476
2 1 0 alfa-romero gas std two hatchback rwd front 94.5 ... mpfi 2.68 3.47 9.0 154 5000 19 26 16500 12.368421
3 2 164 audi gas std four sedan fwd front 99.8 ... mpfi 3.19 3.40 10.0 102 5500 24 30 13950 9.791667
4 2 164 audi gas std four sedan 4wd front 99.4 ... mpfi 3.19 3.40 8.0 115 5500 18 22 17450 13.055556

5 rows × 27 columns

In [236]:
# Convertendo o tipo de uma coluna em outro

df["price"] = df["price"].astype("int")
df
Out[236]:
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base ... fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price city-L/100km
0 3 0 alfa-romero gas std two convertible rwd front 88.6 ... mpfi 3.47 2.68 9.0 111 5000 21 27 13495 11.190476
1 3 0 alfa-romero gas std two convertible rwd front 88.6 ... mpfi 3.47 2.68 9.0 111 5000 21 27 16500 11.190476
2 1 0 alfa-romero gas std two hatchback rwd front 94.5 ... mpfi 2.68 3.47 9.0 154 5000 19 26 16500 12.368421
3 2 164 audi gas std four sedan fwd front 99.8 ... mpfi 3.19 3.40 10.0 102 5500 24 30 13950 9.791667
4 2 164 audi gas std four sedan 4wd front 99.4 ... mpfi 3.19 3.40 8.0 115 5500 18 22 17450 13.055556
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 -1 95 volvo gas std four sedan rwd front 109.1 ... mpfi 3.78 3.15 9.5 114 5400 23 28 16845 10.217391
201 -1 95 volvo gas turbo four sedan rwd front 109.1 ... mpfi 3.78 3.15 8.7 160 5300 19 25 19045 12.368421
202 -1 95 volvo gas std four sedan rwd front 109.1 ... mpfi 3.58 2.87 8.8 134 5500 18 23 21485 13.055556
203 -1 95 volvo diesel turbo four sedan rwd front 109.1 ... idi 3.01 3.40 23.0 106 4800 26 27 22470 9.038462
204 -1 95 volvo gas turbo four sedan rwd front 109.1 ... mpfi 3.78 3.15 9.5 114 5400 19 25 22625 12.368421

205 rows × 27 columns

In [237]:
# Aplicando o método de simplificação no campo length (data / data.max)

df["length"] = df["length"] / df["length"].max()
df["length"].head(5)
Out[237]:
0    0.811148
1    0.811148
2    0.822681
3    0.848630
4    0.848630
Name: length, dtype: float64
In [238]:
# Aplicando o método de min-max na coluna length (data - data.min / data.max - data.min)

df["length"] = (df["length"] - df["length"].min()) / (df["length"].max() - df["length"].min())
df["length"].head(5)
Out[238]:
0    0.413433
1    0.413433
2    0.449254
3    0.529851
4    0.529851
Name: length, dtype: float64
In [239]:
# Aplicando o método de Z-Score na coluna length (data - data.media / data.std)

df["length"] = (df["length"] - df["length"].mean()) / (df["length"].std())
df["length"].head(5)
Out[239]:
0   -0.425480
1   -0.425480
2   -0.230948
3    0.206750
4    0.206750
Name: length, dtype: float64
In [240]:
# Trabalhando com Data Binning (categorização de dados)]

groups = ["Low", "Medium", "High"]
bins = np.linspace(min(df["price"]), max(df["price"]), 4)
print(bins)
for bin_item in bins:
    print('Bin: {}'.format(bin_item))
[    0.         15133.33333333 30266.66666667 45400.        ]
Bin: 0.0
Bin: 15133.333333333334
Bin: 30266.666666666668
Bin: 45400.0
In [241]:
# Convertendo dados em dados categóricos usando dummy data
# O método get_dummies recebe uma coluna e cria um DataFrame contendo valores 0 (false) e 1 (true) para dados
# da coluna com objetivo em categorizá-los

categorical = pd.get_dummies(df["fuel-type"])
categorical.tail(15)
Out[241]:
diesel gas
190 0 1
191 0 1
192 1 0
193 0 1
194 0 1
195 0 1
196 0 1
197 0 1
198 0 1
199 0 1
200 0 1
201 0 1
202 0 1
203 1 0
204 0 1

Módulo 3 - E.D.A - Análise Exploratória de Dado, Estatística Descritiva, Correlação e ANOVA (Pandas, Seaborn, Matplotlib)

In [242]:
drive_wheels_count = df["drive-wheels"].value_counts()
drive_wheels_count.head(5)
Out[242]:
fwd    120
rwd     76
4wd      9
Name: drive-wheels, dtype: int64
In [243]:
# Plotando a distribuição da coluna drive-wheels pela coluna price em um boxplot usando a biblioteca Seaborn

x = "drive-wheels"
y = "price"
sns.boxplot(x = x, y = y, data = df)
Out[243]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ce7749e348>
In [244]:
# Plotando a distribuição da coluna engine-size pela coluna price em um scatterplot usando a biblioteca Matplotlib

x = df["engine-size"]
y = df["price"]

# Configurações de exibições do gráfico

plt.xlabel("Engine Size")
plt.ylabel("Price")
plt.title("Distribution of Engine Size by Pricing")

plt.scatter(x, y)
Out[244]:
<matplotlib.collections.PathCollection at 0x1ce7750b648>
In [245]:
# Agrupando dados utilizando a função groupby

group = df[["drive-wheels", "body-style", "price"]]

group = group.groupby(["drive-wheels", "body-style"], as_index = False).mean()
group
Out[245]:
drive-wheels body-style price
0 4wd hatchback 3801.500000
1 4wd sedan 12647.333333
2 4wd wagon 9095.750000
3 fwd convertible 11595.000000
4 fwd hardtop 8249.000000
5 fwd hatchback 8396.387755
6 fwd sedan 9467.526316
7 fwd wagon 9997.333333
8 rwd convertible 23949.600000
9 rwd hardtop 24202.714286
10 rwd hatchback 13583.157895
11 rwd sedan 21711.833333
12 rwd wagon 16994.222222
In [246]:
drive_wheels = group["drive-wheels"]
price = group["price"]

plt.xlabel("Drive Wheels")
plt.title("Drive Wheels by Pricing")

plt.bar(drive_wheels, price)
Out[246]:
<BarContainer object of 13 artists>
In [247]:
body_style = group["body-style"]

plt.xlabel("Body Style")
plt.ylabel("Price")
plt.title("Body Style by Pricing")

plt.bar(body_style, price)
Out[247]:
<BarContainer object of 13 artists>
In [248]:
# Criando uma tabela pivô (semelhante à tabela dinâmica do Excel)

pivot_group = group.pivot(index = "drive-wheels", columns = "body-style")
In [249]:
# Criando um mapa de calor (HeatMap) com os dados do DataFrame pivô criado anteriormente

plt.pcolor(pivot_group, cmap = "RdBu")
plt.colorbar()
plt.show()
In [250]:
# Utilizando análise ANOVA (Análise de Variância)

anova_df = df[["make", "price"]]


anova_df.head()
Out[250]:
make price
0 alfa-romero 13495
1 alfa-romero 16500
2 alfa-romero 16500
3 audi 13950
4 audi 17450
In [251]:
# Teste ANOVA entre os preços de Honda e Subaru no DataFrame agrupando marca e preço 

group_anova = anova_df.groupby(["make"])
anova_resultados = stats.f_oneway(group_anova.get_group("honda")["price"], group_anova.get_group("subaru")["price"])
anova_resultados

# É possível verificar que não há tanta diferença na variância, levando em consideração que F-Score < 1 e p > 0.05
Out[251]:
F_onewayResult(statistic=0.19744030127462606, pvalue=0.6609478240622193)
In [252]:
# Teste ANOVA entre os preços de Honda e Jaguar no DataFrame agrupando marca e preço 

group_anova = anova_df.groupby(["make"])
anova_resultados = stats.f_oneway(group_anova.get_group("honda")["price"], group_anova.get_group("jaguar")["price"])
anova_resultados

# É possível verificar que há diferença na variância, levando em consideração que F-Score > 1 e p > 0.05
Out[252]:
F_onewayResult(statistic=400.925870564337, pvalue=1.0586193512077862e-11)
In [253]:
# Trabalhando com correlação entre variáveis

# O gráfico regplot irá traçar uma linha em um scatter plot para indicar a correlação entre as variáveis

x = df["engine-size"]
y = df["price"]

sns.regplot(x, y, df)
plt.ylim(0, )

# É possível analisar que há uma correlação positiva entre engine-size e price, sendo uma correlação forte,
# pois, aumentando o engine-size, aumenta-se o preço
Out[253]:
(0, 52494.60255948399)
In [254]:
# Verificando agora a correlação existente entre highway-mpg e price

x = df["highway-mpg"]

sns.regplot(x, y, df)
plt.ylim(0, ) # Essa propriedade irá setar um limite de 0 para o eixo Y

# É possível analisar que há uma correlação positiva entre engine-size e price, sendo uma correlação forte, 
# pois, reduzindo highway-mpg, reduz-se o preço
Out[254]:
(0, 48154.85169390302)
In [255]:
# Verificando agora a correlação existente entre peak-rpm e price

x = df["peak-rpm"].astype("int")

sns.regplot(x, y, df)
plt.ylim(0, )

# É possível analisar que há uma correlação negativa entre peak-rpm e price, sendo uma correlação fraca, 
# pois, há valores altos e baixos em peak-rpm, e o preço mantém-se quase sempre constante
Out[255]:
(0, 47670.0133554084)
In [256]:
# Utilizando o método de correlação de Pearson

# Coeficiente de correlação
# Valor p

# Valores de Referência:

# Coeficiente de correlação:

# +1: correlação forte
# -1: correlação fraca
#  0: não há correlação

# Valores de p:

# p < 0.001 - Resultado altamente confiável
# p < 0.05  - Resultado moderadamente confiável
# p < 0.1   - Resultado baixamente confiável
# p > 0.1   - Resultado nada confiável

# Verificando a correlação entre as variáveis horsepower e price

df["horsepower"] = df["horsepower"].astype("int")

correlacao_df = df[["horsepower", "price"]]

coeficiente, p = stats.pearsonr(correlacao_df["horsepower"], correlacao_df["price"])
print("Coeficiente: {}".format(str(coeficiente)))
print("Valor de p: {}".format(str(p)))

# É possível verificar que p é muito menor que 0.001 e F é próximo de 1, então há uma forte correlação
Coeficiente: 0.6912878787942788
Valor de p: 1.8175735366187956e-30

Módulo 4 - Regressão Linear Simples (SLR) e Múltipla (MLR)

In [257]:
# Regressão Linear Simples (SLR)

# Utilizando a classe LinearRegression da biblioteca Scikit-Learn para determinar os arrays de b0 e b1
# e criar um modelo de previsã utilizando regressão linear simples

lm = LinearRegression()

x = df[["highway-mpg"]]
y = df["price"]

lm.fit(x, y)

previsao = lm.predict(x)

print(previsao)
[15975.79026479 15975.79026479 16782.55750053 13555.48855758
 20009.62644348 17589.32473626 17589.32473626 17589.32473626
 21623.16091495 20009.62644348 14362.25579332 14362.25579332
 15169.02302905 15169.02302905 17589.32473626 20009.62644348
 20009.62644348 21623.16091495 -5000.15786438  3067.51449299
  3067.51449299  4681.04896447  7101.35067168 13555.48855758
  7101.35067168  7101.35067168  7101.35067168 13555.48855758
 13555.48855758 18396.091972   -5806.92510011  7101.35067168
  3874.28172873 10328.41961463 10328.41961463 10328.41961463
 10328.41961463 11135.18685037 11135.18685037 11135.18685037
 11135.18685037 15169.02302905 12748.72132184 14362.25579332
  3067.51449299  3067.51449299 14362.25579332 22429.92815069
 22429.92815069 24043.46262216 12748.72132184  7101.35067168
  7101.35067168  7101.35067168  7101.35067168 19202.85920774
 19202.85920774 19202.85920774 19202.85920774 11941.9540861
 11941.9540861  11941.9540861  11941.9540861   3874.28172873
 11941.9540861  15975.79026479  6294.58343594 17589.32473626
 17589.32473626 17589.32473626 17589.32473626 23236.69538642
 23236.69538642 24850.2298579  24850.2298579  18396.091972
  4681.04896447  7101.35067168  7101.35067168 13555.48855758
 13555.48855758 11941.9540861  18396.091972   18396.091972
 18396.091972   11941.9540861  11941.9540861  13555.48855758
 13555.48855758  7908.11790742 -2579.85615717  7908.11790742
  7908.11790742  7908.11790742  7908.11790742  7908.11790742
  7908.11790742  7908.11790742  7908.11790742 10328.41961463
 10328.41961463 20009.62644348 20009.62644348 17589.32473626
 17589.32473626 19202.85920774 17589.32473626 18396.091972
 11135.18685037 18396.091972   17589.32473626 18396.091972
 11135.18685037 18396.091972   17589.32473626 18396.091972
 11135.18685037 18396.091972    4681.04896447 13555.48855758
  7101.35067168  7101.35067168  7101.35067168 13555.48855758
 18396.091972   15975.79026479 17589.32473626 17589.32473626
 17589.32473626 15169.02302905 12748.72132184 12748.72132184
 15169.02302905 15169.02302905 15169.02302905 15169.02302905
 16782.55750053 16782.55750053  8714.88514316 12748.72132184
 12748.72132184  7908.11790742 11135.18685037 11941.9540861
 17589.32473626 14362.25579332 11941.9540861  12748.72132184
 14362.25579332 19202.85920774  6294.58343594  7101.35067168
  7101.35067168  7908.11790742 11941.9540861  11941.9540861
  7908.11790742  7908.11790742  8714.88514316  -159.55444995
  -159.55444995 10328.41961463 10328.41961463 10328.41961463
 10328.41961463 14362.25579332 14362.25579332 13555.48855758
 13555.48855758 13555.48855758 13555.48855758 13555.48855758
 13555.48855758 10328.41961463 11135.18685037 11941.9540861
 11941.9540861  11941.9540861  18396.091972   18396.091972
 18396.091972   18396.091972     647.21278578 10328.41961463
   647.21278578 10328.41961463 10328.41961463  3874.28172873
 11941.9540861  14362.25579332 14362.25579332 18396.091972
  7101.35067168 12748.72132184 15169.02302905 15169.02302905
 15169.02302905 15169.02302905 20009.62644348 20009.62644348
 15169.02302905 17589.32473626 19202.85920774 15975.79026479
 17589.32473626]
In [258]:
# Interceptando o valor de b0
b0 = lm.intercept_

print(b0)
37758.50562969372
In [259]:
# Interceptando o valor de b1
b1 = lm.coef_

print(b1)

# A relação entre o preço será: preco = 37758.50562969372 - 806.76723574 * highway-mpg

preco_previsao = b0 - b1 * x

print(preco_previsao)
[-806.76723574]
      highway-mpg
0    59541.220995
1    59541.220995
2    58734.453759
3    61961.522702
4    55507.384816
..            ...
200  60347.988230
201  57927.686523
202  56314.152052
203  59541.220995
204  57927.686523

[205 rows x 1 columns]
In [260]:
# Regressão Linear Múltipla (MLR)
# Podemos ter b0, o alvo, b1, b2, b3... bn

horsepower = df["horsepower"]
curb_weight = df["curb-weight"]
engine_size = df["engine-size"]
highway_mpg = df["highway-mpg"]

z = df[["horsepower", "curb-weight", "engine-size", "highway-mpg"]]

lm.fit(z, y)

previsao = lm.predict(z)
previsao
Out[260]:
array([13896.49032606, 13896.49032606, 17227.23244169, 10375.21137424,
       16290.42510874, 14803.20232557, 15851.4869507 , 16193.65700638,
       16774.35165898, 16232.01250439, 10621.32269212, 10621.32269212,
       18036.9536937 , 18208.03872154, 19633.90021086, 25530.04045633,
       25996.6359868 , 26734.65012691, -1481.28889727,  4655.05455499,
        4763.92684543,  5019.51072075,  5543.28751782,  8477.43453608,
        5826.35547297,  5894.78948411,  5894.78948411,  8673.40465888,
       12528.96986454, 18033.50821987,  2514.87539344,  5556.62590679,
        3512.15132738,  6631.38203079,  6681.15222071,  6849.12661168,
        6892.67552786,  9723.08620511,  9887.94995921,  9934.60951226,
       10146.1328194 , 11240.50567186, 10186.22451351, 10885.2541009 ,
        4655.05455499,  4763.92684543, 12973.25930694, 34239.19102087,
       34239.19102087, 41551.14469471,  6922.40536039,  5731.36586927,
        5746.91905362,  5871.34452841,  5886.89771276,  7312.14020322,
        7312.14020322,  7327.69338757,  8665.79116232, 11731.29052399,
       11809.05644574, 11731.29052399, 11809.05644574, 10256.28841207,
       11855.71599879, 15369.49206704, 12904.37424102, 23210.78107532,
       23941.78073972, 23148.56833792, 24003.99347712, 30772.59085594,
       30601.5058281 , 39881.55398387, 38852.39389664, 16390.93679496,
        5377.00360244,  5981.65695812,  6168.29517031,  8530.31536287,
       10527.93293297, 11535.88350111, 18101.942231  , 18375.67827555,
       18391.2314599 , 11650.9770653 , 11775.40254009, 10630.58394967,
       10630.58394967,  6547.75434854,  5420.10733796,  6637.96281776,
        6700.17555516,  6967.69032596,  6740.61383447,  6980.13287344,
        6802.82657186,  7008.12860527,  6917.92013605, 10906.68366617,
       10838.24965504, 22070.01402446, 22695.2520353 , 21437.36493695,
       21435.38049992, 21815.08112543, 21646.90380707, 14817.60958445,
       17435.45041038, 15470.84332711, 19556.96692656, 14997.74497294,
       17606.53543822, 15650.9787156 , 19728.0519544 , 14988.69461229,
       17606.53543822, 16544.06945759,  5150.15746928,  8477.43453608,
        5826.35547297,  5894.78948411,  7430.52266444, 12528.96986454,
       18055.28267796, 16849.01543386, 21717.34627615, 21717.34627615,
       21854.21429843, 23745.32596253, 14023.69215555, 13653.52636804,
       13047.78569717, 13162.87926135, 13200.20690379, 13358.84938415,
       13637.30674275, 13758.62158067,  7223.15915027,  9543.41807066,
        9916.69449504,  8532.90377535,  9371.25149724,  9958.1371295 ,
       11374.56381236, 10933.7941289 ,  9856.90744991, 10490.45263522,
       10785.06704005, 12416.8368848 ,  5961.75188605,  6307.42917958,
        6229.66325783,  7228.57429402,  8132.64199116, 10683.36422441,
        7253.8945138 ,  7340.99234615,  9456.37965573,  7535.86473316,
        5548.41013622,  7905.20742252,  7961.19888618,  8051.40735541,
        8160.27964585,  9032.93224968,  9141.80454013, 15139.97159767,
       15127.52905019, 15174.18860324, 15572.35012257, 15681.22241302,
       16493.09863604, 11162.3769747 , 10540.90894558, 11785.29755062,
       11785.29755062, 11922.16557289, 20175.70641686, 20300.13189166,
       20680.48103331, 19608.46310492,  6210.50893845,  9355.60885768,
        6219.84084906,  9364.94076829,  9560.91089109,  7016.89205444,
        9919.98363933, 10345.92294364, 10243.27192693, 15456.83266921,
        8524.02670335, 10966.97556569, 16088.24807238, 16467.74577049,
       16159.79272038, 16492.63086545, 16084.65398221, 16433.04531163,
       16212.67354717, 16830.02382564, 20811.30761181, 17701.47829228,
       17078.62039991])
In [261]:
b0 = lm.intercept_
bn = lm.coef_

print(b0)
print(bn)
-3558.1248837676467
[  -4.52518033    3.11063687  113.42306658 -174.59226569]
In [262]:
preco_previsao_multipla = b0 + bn[0] * horsepower + bn[1] * curb_weight + bn[2] * engine_size + bn[3] * highway_mpg
preco_previsao_multipla
Out[262]:
0      13896.490326
1      13896.490326
2      17227.232442
3      10375.211374
4      16290.425109
           ...     
200    16212.673547
201    16830.023826
202    20811.307612
203    17701.478292
204    17078.620400
Length: 205, dtype: float64
In [263]:
# Plotando um gráfico de regressão com a biblioteca Seabonr

x = "highway-mpg" # Variável dependente
y = "price"       # Variável target

sns.regplot(x, y, df)
plt.ylim(0,)
Out[263]:
(0, 48156.5089568308)
In [264]:
# Criando um gráfico residual

sns.residplot(x, y, df)
Out[264]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ce78568288>
In [265]:
# Gráficos de distribuição apenas de price por highway-mpg

y = df["price"]
grafico_1 = sns.distplot(y, hist = False, color = "r", label = "Valores Atuais")
In [266]:
# Agora, vamos comprar os valores previstos com os valores atuais

grafico_1 = sns.distplot(y, hist = False, color = "r", label = "Valores Atuais")
sns.distplot(preco_previsao_multipla, hist = False, color = "b", label = "Valores de Previsão", ax = grafico_1)
Out[266]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ce78645448>
In [267]:
# Regressões polinomiais

x = df["highway-mpg"]
y = df["price"]

# Calculando polinômio de 3º grau com os campos highway-mpg e price

f = np.polyfit(x, y, 3)
f
Out[267]:
array([-1.49872547e+00,  1.96887677e+02, -8.63947169e+03,  1.33515385e+05])
In [268]:
p = np.poly1d(f)
p
Out[268]:
poly1d([-1.49872547e+00,  1.96887677e+02, -8.63947169e+03,  1.33515385e+05])
In [269]:
# Criando polinômios multidimensionais com ScikitLearn

pr = PolynomialFeatures(degree = 2, include_bias = False)
x_polly = pr.fit_transform(df[["horsepower", "curb-weight"]])
x_polly
Out[269]:
array([[1.1100000e+02, 2.5480000e+03, 1.2321000e+04, 2.8282800e+05,
        6.4923040e+06],
       [1.1100000e+02, 2.5480000e+03, 1.2321000e+04, 2.8282800e+05,
        6.4923040e+06],
       [1.5400000e+02, 2.8230000e+03, 2.3716000e+04, 4.3474200e+05,
        7.9693290e+06],
       ...,
       [1.3400000e+02, 3.0120000e+03, 1.7956000e+04, 4.0360800e+05,
        9.0721440e+06],
       [1.0600000e+02, 3.2170000e+03, 1.1236000e+04, 3.4100200e+05,
        1.0349089e+07],
       [1.1400000e+02, 3.0620000e+03, 1.2996000e+04, 3.4906800e+05,
        9.3758440e+06]])
In [270]:
# Normalizar cada feature

scale = StandardScaler()
scale.fit(df[["horsepower", "curb-weight"]])
scale.transform(df[["horsepower", "curb-weight"]])
Out[270]:
array([[ 0.19053401, -0.01456628],
       [ 0.19053401, -0.01456628],
       [ 1.24619543,  0.51488192],
       [-0.03041838, -0.42079745],
       [ 0.28873507,  0.51680718],
       [ 0.16598375, -0.0935022 ],
       [ 0.16598375,  0.55531251],
       [ 0.16598375,  0.76709179],
       [ 0.90249171,  1.02122692],
       [ 1.39349702,  0.95769314],
       [-0.05496864, -0.30913201],
       [-0.05496864, -0.30913201],
       [ 0.43603667,  0.29732684],
       [ 0.43603667,  0.40321648],
       [ 0.43603667,  0.96154367],
       [ 1.93360286,  1.29846525],
       [ 1.93360286,  1.58725518],
       [ 1.93360286,  1.82791345],
       [-1.35613271, -2.05534843],
       [-0.81602687, -1.31219568],
       [-0.81602687, -1.24481137],
       [-0.8651274 , -1.30834515],
       [-0.8651274 , -1.30834515],
       [-0.03041838, -0.82317808],
       [-0.8651274 , -1.13314593],
       [-0.8651274 , -1.09079007],
       [-0.8651274 , -1.09079007],
       [-0.03041838, -0.70188631],
       [-0.37412209, -0.03959474],
       [ 1.02524304,  0.49177872],
       [-1.11063006, -1.62216354],
       [-0.66872528, -1.41808532],
       [-1.06152953, -1.38343053],
       [-0.66872528, -1.18512812],
       [-0.66872528, -1.15432386],
       [-0.66872528, -1.05035948],
       [-0.66872528, -1.02340576],
       [-0.42322263, -0.61524933],
       [-0.42322263, -0.51321022],
       [-0.42322263, -0.48433123],
       [-0.42322263, -0.35341313],
       [-0.05496864, -0.17436337],
       [-0.07951891, -0.50550916],
       [-0.61962475, -0.42079745],
       [-0.81602687, -1.31219568],
       [-0.81602687, -1.24481137],
       [-0.32502156,  0.34353323],
       [ 1.78630127,  2.90798777],
       [ 1.78630127,  2.90798777],
       [ 3.8976241 ,  2.6846569 ],
       [-0.8651274 , -1.28139143],
       [-0.8651274 , -1.26213876],
       [-0.8651274 , -1.25251243],
       [-0.8651274 , -1.17550179],
       [-0.8651274 , -1.16587546],
       [-0.05496864, -0.338011  ],
       [-0.05496864, -0.338011  ],
       [-0.05496864, -0.32838467],
       [ 0.77974038, -0.10697906],
       [-0.47232316, -0.32838467],
       [-0.47232316, -0.28025301],
       [-0.47232316, -0.32838467],
       [-0.47232316, -0.28025301],
       [-0.96332847, -0.21671923],
       [-0.47232316, -0.25137402],
       [ 0.4114864 ,  0.22031619],
       [-0.76692634,  0.27807418],
       [ 0.4851372 ,  1.84716611],
       [ 0.4851372 ,  2.29960366],
       [ 0.4851372 ,  1.80866079],
       [ 0.4851372 ,  2.33810898],
       [ 1.27074569,  2.280351  ],
       [ 1.27074569,  2.17446136],
       [ 1.98270339,  2.58839359],
       [ 1.98270339,  2.23221934],
       [ 1.761751  ,  0.68238007],
       [-0.8651274 , -1.22748397],
       [-0.8651274 , -1.17742705],
       [-0.8651274 , -1.06191108],
       [-0.03041838, -0.79044855],
       [ 0.31328534, -0.35726366],
       [-0.37412209, -0.43812484],
       [ 1.02524304,  0.53413458],
       [ 1.02524304,  0.703558  ],
       [ 1.02524304,  0.71318433],
       [-0.37412209, -0.36688999],
       [-0.37412209, -0.28987935],
       [ 0.31328534, -0.29372988],
       [ 0.31328534, -0.29372988],
       [-0.84057714, -1.28331669],
       [-1.18428085, -1.03688262],
       [-0.84057714, -1.22748397],
       [-0.84057714, -1.18897865],
       [-0.84057714, -1.02340576],
       [-0.84057714, -1.16395019],
       [-0.84057714, -1.01570469],
       [-0.84057714, -1.12544487],
       [-0.84057714, -0.9983773 ],
       [-0.84057714, -1.05421002],
       [-0.15316971, -0.44582591],
       [-0.15316971, -0.48818176],
       [ 1.1970949 ,  1.03855432],
       [ 1.1970949 ,  1.42553282],
       [ 1.1970949 ,  0.97117   ],
       [ 1.39349702,  0.99234793],
       [ 2.37550764,  1.12326603],
       [ 1.39349702,  1.12326603],
       [-0.15316971,  0.89415935],
       [-0.20227024,  1.23493147],
       [-0.15316971,  1.29846525],
       [-0.20227024,  1.68351848],
       [-0.20227024,  1.00004899],
       [-0.20227024,  1.34082111],
       [-0.20227024,  1.40435489],
       [-0.20227024,  1.78940812],
       [-0.15316971,  1.00004899],
       [-0.20227024,  1.34082111],
       [ 0.95159224,  1.10593863],
       [-0.8651274 , -1.22748397],
       [-0.03041838, -0.82317808],
       [-0.8651274 , -1.13314593],
       [-0.8651274 , -1.09079007],
       [-0.8651274 , -0.70188631],
       [-0.37412209, -0.03959474],
       [ 1.02524304,  0.50525559],
       [ 0.97614251,  0.42824494],
       [ 2.5473595 ,  0.38588908],
       [ 2.5473595 ,  0.38588908],
       [ 2.5473595 ,  0.47060079],
       [ 4.535931  ,  1.56030145],
       [-2.53454545,  0.04511697],
       [-2.53454545, -0.18398971],
       [ 0.16598375,  0.197213  ],
       [ 0.16598375,  0.26844785],
       [ 0.16598375,  0.29155104],
       [ 0.16598375,  0.38973961],
       [ 1.39349702,  0.48600292],
       [ 1.39349702,  0.5610883 ],
       [-0.84057714, -0.97334884],
       [-0.74237608, -0.83858021],
       [-0.74237608, -0.60754826],
       [-0.52142369, -0.79044855],
       [-0.52142369, -0.70381157],
       [-0.2268205 , -0.41502165],
       [-0.52142369, -0.32838467],
       [ 0.19053401, -0.0877264 ],
       [-0.52142369, -0.51128496],
       [-0.2268205 , -0.19361604],
       [-0.52142369, -0.26100035],
       [ 0.19053401,  0.18181087],
       [-1.012429  , -1.09849114],
       [-1.012429  , -0.9926015 ],
       [-1.012429  , -1.04073315],
       [-1.012429  , -0.53053762],
       [-1.012429  , -0.51128496],
       [-1.012429  ,  1.06743331],
       [-0.81602687, -0.91366559],
       [-0.81602687, -0.85975813],
       [-1.15973059, -0.54016395],
       [-1.15973059, -0.54016395],
       [-0.81602687, -0.88863713],
       [-0.81602687, -0.83472967],
       [-0.81602687, -0.80007488],
       [-0.81602687, -0.74424216],
       [-0.81602687, -0.67685785],
       [ 0.21508428, -0.55941661],
       [ 0.21508428, -0.49203229],
       [ 0.31328534, -0.02996841],
       [ 0.31328534, -0.03766948],
       [ 0.31328534, -0.00879048],
       [ 0.31328534,  0.23764359],
       [ 0.31328534,  0.3050279 ],
       [ 0.31328534,  0.80752237],
       [-0.27592103, -0.44197537],
       [-0.74237608, -0.14548438],
       [-0.27592103, -0.27255195],
       [-0.27592103, -0.27255195],
       [-0.27592103, -0.18784024],
       [ 1.41804728,  0.80944764],
       [ 1.41804728,  0.88645829],
       [ 1.29529596,  1.1078639 ],
       [ 1.29529596,  1.14636922],
       [-1.25793165, -0.56711767],
       [-0.44777289, -0.66723152],
       [-1.25793165, -0.56134188],
       [-0.44777289, -0.66145572],
       [-0.44777289, -0.54016395],
       [-0.8651274 , -0.45545224],
       [-0.07951891, -0.49203229],
       [-0.32502156, -0.58059454],
       [-0.32502156, -0.64412832],
       [ 0.16598375,  0.2029888 ],
       [-0.8651274 ,  0.04511697],
       [-0.37412209,  0.01431271],
       [ 0.26418481,  0.68623061],
       [ 0.26418481,  0.92111308],
       [ 0.26418481,  0.73051173],
       [ 0.26418481,  0.93651521],
       [ 1.44259755,  0.94229101],
       [ 1.44259755,  1.15792082],
       [ 0.26418481,  0.76324125],
       [ 1.39349702,  0.94999207],
       [ 0.75519012,  0.87875722],
       [ 0.06778268,  1.27343679],
       [ 0.26418481,  0.97502053]])
In [271]:
# Utilizando Pipeline para fazer todo o processo de regressão linear, features polinomiais e normalização

pipeline = [
    ('scale', StandardScaler()),
    ('polynomial', PolynomialFeatures(degree = 2)),
    ('model', LinearRegression())
]

pipe = Pipeline(pipeline)
print(pipe)

# É possível perceber que, no campo steps, o Pipeline irá executar cada um dos passos definidos para conseguir realizar
# operações de previsões, treinamentos, entre outras operações
Pipeline(memory=None,
         steps=[('scale',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('polynomial',
                 PolynomialFeatures(degree=2, include_bias=True,
                                    interaction_only=False, order='C')),
                ('model',
                 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                                  normalize=False))],
         verbose=False)
In [272]:
# Medidas amostrais de avaliação e Erro Quadrático Médio (MSE)

# O método mean_squared_error recebe a variável target e a variável que foi prevista para
# encontrar o erro quadrático médio
# O fator R^2 (Coeficiente de Determinação) determina o quão próximo o dado está da linha de regressão 

mean_squared_error(df["price"], preco_previsao_multipla)
Out[272]:
17092937.937706746
In [273]:
# Geralmente, o MSE fica entre 0 e 1, é possível calcular o valor do MSE da seguinte maneira:

x = df[["highway-mpg"]]
y = df["price"]

lm.fit(x, y)

lm.score(x, y)
Out[273]:
0.47289700751877217
In [274]:
# Tomada de Decisão: verificando se seu modelo é aceitável

nova_entrada = np.arange(1, 101, 1).reshape(-1, 1)
nova_entrada
Out[274]:
array([[  1],
       [  2],
       [  3],
       [  4],
       [  5],
       [  6],
       [  7],
       [  8],
       [  9],
       [ 10],
       [ 11],
       [ 12],
       [ 13],
       [ 14],
       [ 15],
       [ 16],
       [ 17],
       [ 18],
       [ 19],
       [ 20],
       [ 21],
       [ 22],
       [ 23],
       [ 24],
       [ 25],
       [ 26],
       [ 27],
       [ 28],
       [ 29],
       [ 30],
       [ 31],
       [ 32],
       [ 33],
       [ 34],
       [ 35],
       [ 36],
       [ 37],
       [ 38],
       [ 39],
       [ 40],
       [ 41],
       [ 42],
       [ 43],
       [ 44],
       [ 45],
       [ 46],
       [ 47],
       [ 48],
       [ 49],
       [ 50],
       [ 51],
       [ 52],
       [ 53],
       [ 54],
       [ 55],
       [ 56],
       [ 57],
       [ 58],
       [ 59],
       [ 60],
       [ 61],
       [ 62],
       [ 63],
       [ 64],
       [ 65],
       [ 66],
       [ 67],
       [ 68],
       [ 69],
       [ 70],
       [ 71],
       [ 72],
       [ 73],
       [ 74],
       [ 75],
       [ 76],
       [ 77],
       [ 78],
       [ 79],
       [ 80],
       [ 81],
       [ 82],
       [ 83],
       [ 84],
       [ 85],
       [ 86],
       [ 87],
       [ 88],
       [ 89],
       [ 90],
       [ 91],
       [ 92],
       [ 93],
       [ 94],
       [ 95],
       [ 96],
       [ 97],
       [ 98],
       [ 99],
       [100]])
In [275]:
nova_entrada_previsao = lm.predict(nova_entrada)
nova_entrada_previsao
Out[275]:
array([ 36951.73839396,  36144.97115822,  35338.20392248,  34531.43668674,
        33724.66945101,  32917.90221527,  32111.13497953,  31304.3677438 ,
        30497.60050806,  29690.83327232,  28884.06603658,  28077.29880085,
        27270.53156511,  26463.76432937,  25656.99709364,  24850.2298579 ,
        24043.46262216,  23236.69538642,  22429.92815069,  21623.16091495,
        20816.39367921,  20009.62644348,  19202.85920774,  18396.091972  ,
        17589.32473626,  16782.55750053,  15975.79026479,  15169.02302905,
        14362.25579332,  13555.48855758,  12748.72132184,  11941.9540861 ,
        11135.18685037,  10328.41961463,   9521.65237889,   8714.88514316,
         7908.11790742,   7101.35067168,   6294.58343594,   5487.81620021,
         4681.04896447,   3874.28172873,   3067.51449299,   2260.74725726,
         1453.98002152,    647.21278578,   -159.55444995,   -966.32168569,
        -1773.08892143,  -2579.85615717,  -3386.6233929 ,  -4193.39062864,
        -5000.15786438,  -5806.92510011,  -6613.69233585,  -7420.45957159,
        -8227.22680733,  -9033.99404306,  -9840.7612788 , -10647.52851454,
       -11454.29575027, -12261.06298601, -13067.83022175, -13874.59745749,
       -14681.36469322, -15488.13192896, -16294.8991647 , -17101.66640043,
       -17908.43363617, -18715.20087191, -19521.96810765, -20328.73534338,
       -21135.50257912, -21942.26981486, -22749.0370506 , -23555.80428633,
       -24362.57152207, -25169.33875781, -25976.10599354, -26782.87322928,
       -27589.64046502, -28396.40770076, -29203.17493649, -30009.94217223,
       -30816.70940797, -31623.4766437 , -32430.24387944, -33237.01111518,
       -34043.77835092, -34850.54558665, -35657.31282239, -36464.08005813,
       -37270.84729386, -38077.6145296 , -38884.38176534, -39691.14900108,
       -40497.91623681, -41304.68347255, -42111.45070829, -42918.21794402])
In [276]:
sns.regplot(nova_entrada, nova_entrada_previsao)
Out[276]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ce788d9748>

Módulo 5 - Avaliação de Modelos, Sobreajuste e Sub-Ajuste, Regressão de Ridge e Busca em Grid

In [277]:
# Criando um modelo de testes e treinamentos com train_test_split da biblioteca Scikit-Learn

x = df[["highway-mpg", "curb-weight"]]
y = df["price"]

# Define-se que 30% dos dados serão para testes e 70% para os modelos de teste

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

print("Treinamento das variáveis independentes: {}".format(x_train))
print("Teste das variáveis independentes: {}".format(x_test))
print("Treinamento das variáveis target: {}".format(y_train))
print("Teste das variáveis target: {}".format(y_test))
Treinamento das variáveis independentes:      highway-mpg  curb-weight
40            33         2372
60            32         2410
56            23         2380
101           22         3095
86            32         2405
..           ...          ...
67            25         3515
192           38         2579
117           24         3130
47            19         4066
172           30         2975

[143 rows x 2 columns]
Teste das variáveis independentes:      highway-mpg  curb-weight
52            38         1905
181           24         3151
5             25         2507
18            53         1488
188           32         2300
..           ...          ...
198           22         3045
143           32         2340
24            38         1967
30            54         1713
135           28         2758

[62 rows x 2 columns]
Treinamento das variáveis target: 40     10295
60      8495
56     11845
101    13499
86      8189
       ...  
67     25552
192    13845
117    18150
47     32250
172    17669
Name: price, Length: 143, dtype: int32
Teste das variáveis target: 52      6795
181    15750
5      15250
18      5151
188     9995
       ...  
198    18420
143     9960
24      6229
30      6479
135    15510
Name: price, Length: 62, dtype: int32
In [278]:
# Realizando a distribuição comparando os valores atuais de price com os valores treinados de price (70% dos dados)

grafico_1 = sns.distplot(y, hist = False, color = "r", label = "Valores Atuais")
sns.distplot(y_train, hist = False, color = "b", label = "Valores de Previsão", ax = grafico_1)
Out[278]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ce7894e648>
In [279]:
# Realizando a distribuição comparando os valores atuais de price com os valores de teste de price (30% dos dados)

grafico_1 = sns.distplot(y, hist = False, color = "r", label = "Valores Atuais")
sns.distplot(y_test, hist = False, color = "b", label = "Valores de Previsão", ax = grafico_1)
Out[279]:
<matplotlib.axes._subplots.AxesSubplot at 0x1ce789b0bc8>
In [284]:
# Utilizando modelo Cross Value Score - Particionar o DataFrame em 4 para dividir o modelo em modelos de
# testes e de treinamento, 3 para treinamento e 1 para testes

lm = LinearRegression()

scores = cross_val_score(lm, x, y, cv = 3)
scores
Out[284]:
array([0.69469371, 0.50982474, 0.60191592])
In [286]:
# Verificando a média dos scores definidos

np.mean(scores)
Out[286]:
0.6021447875609813
In [288]:
# O método cross_val_predict é bastante parecido com cross_val_scores, a diferença é
# que o resultado é um conjunto de valores de predição

lm_2 = LinearRegression()

previsoes = cross_val_predict(lm, x, y, cv = 3)
previsoes
Out[288]:
array([13257.30936368, 13257.30936368, 15988.07194542, 10568.95983131,
       17059.61071548, 13421.06989556, 16441.96106451, 17428.00862708,
       19939.4840788 , 19112.38245937, 11354.51949418, 11354.51949418,
       14443.84482594, 14936.86860723, 18333.37957089, 20699.02262823,
       22043.63294082, 23695.4288853 , -3151.3390705 ,  2965.22822052,
        3278.97062679,  3514.44370867,  4311.37473466,  8695.46946244,
        5127.10499096,  5324.31450348,  5324.31450348,  9260.20579373,
       12343.84544393, 16411.79047107, -1400.06727694,  3800.42281587,
        2899.20135207,  5947.64983601,  6091.07493602,  6575.13464855,
        6700.63161106,  8866.65786152,  9341.7535053 ,  9476.21453656,
       10085.7712116 , 12247.64798205,  9908.89713096, 10834.60350664,
        2965.22822052,  3278.97062679, 14393.33880063, 28989.91512972,
       28989.91512972, 28481.3705053 ,  6296.37742447,  4526.51238467,
        4571.33272843,  4929.89547845,  4974.7158222 , 12813.92051489,
       12813.92051489, 12858.74085864, 13889.60876496, 10467.94778069,
       10692.04949945, 10467.94778069, 10692.04949945,  8331.42701494,
       10826.51053071, 14350.92575125, 11432.12370983, 22456.85119616,
       24563.40735256, 25047.4537528 , 28472.49906886, 28387.30385225,
       27702.29478903, 30462.47065942, 28158.34926498, 17802.65484663,
        4747.07035402,  5194.51257338,  5941.7951878 ,  8027.56200663,
       10829.87181068, 10224.36081185, 16843.64215813, 17939.65665927,
       18001.93021048, 10685.18509074, 11183.37350035, 11240.87724861,
       11240.87724861,  4550.71009454,  5609.22740847,  4911.89669151,
        5160.99089631,  6232.09597697,  5322.90212943,  6281.91481793,
        5571.99633424,  6394.00721009,  6032.82061313, 10092.12880215,
        9818.12517687, 20189.18940982, 22692.5861681 , 19629.6547983 ,
       19766.65661094, 20695.99007601, 20613.57690727, 19172.67297306,
       21006.29742624, 21788.16212351, 24237.89758718, 19857.68203627,
       21691.30648945, 22473.17118672, 24922.90665039, 19857.68203627,
       21691.30648945, 20542.69109948,  4747.07035402,  7815.83193255,
        5480.97090891,  5754.97453419,  8270.82600272, 12884.89900032,
       16656.82150453, 16035.01334181, 15843.42288527, 15843.42288527,
       16391.43013584, 23317.17637869, 13391.69966652, 11909.58914793,
       14499.24152861, 14960.0658075 , 15109.52233038, 15744.71255263,
       16449.86123339, 17499.03711069,  7116.77612823,  8802.8260717 ,
       10083.22079235,  7942.59134412,  9174.06178284, 10962.38578826,
       12757.34804083, 13339.77012303, 10428.88798799, 12377.2613335 ,
       12379.47408255, 15960.54759149,  5859.73717403,  6634.41535894,
        6367.66645881,  9383.03540485, 10428.88798799, 19178.25191239,
        7259.71415978,  7558.47292793,  9517.51622944,  7451.37957864,
        5520.11754167,  8260.67416986,  8452.73337795,  8762.16210211,
        9135.6105623 , 10725.63090172, 11099.0793619 , 13472.03819857,
       13429.35837455, 13589.40771463, 14955.16208332, 15328.61054351,
       18113.46906091, 10437.34519495, 12268.3490244 , 11751.96253266,
       11751.96253266, 12221.4405969 , 19251.12264462, 19677.92088484,
       20904.96582545, 21118.36494556,  7489.83079918,  9188.96034232,
        7521.8406672 ,  9220.97021034,  9893.17743868,  8860.01066596,
       10535.58754805, 10608.26138566, 10256.15283748, 15890.08650293,
       12385.52164583, 13529.61658208, 17816.9230418 , 19118.65767446,
       18062.33202993, 19204.0173225 , 20363.01081823, 21558.04589083,
       18243.72128202, 19842.1988284 , 19823.07166543, 21259.09022806,
       19980.90825646])
In [290]:
# Valor médio das previsões

np.mean(previsoes)
Out[290]:
13036.892636649161
In [321]:
# Sobreajuste e Sub-Ajuste e Seleção de Modelo
# Primeiro, iremos definir os modelos x e y de treino e teste

x_train, x_test, y_train, y_test = train_test_split(df[["horsepower"]], df[["price"]], test_size = 0.3, random_state = 0)

# Será necessário instanciar uma regressão linear e uma lista vazia, que conterá os valores R Quadráticos de teste

lm_3 = LinearRegression()
r_quad_test = []

# Será necessário  criar uma lista de 4 posições para definirmos os 4 graus de teste

order = [1, 2, 3, 4]

# Primeiramente, iremos instanciar uma PolynomialFeature, e então, iremos realizar um fit_transform com
# os valores de treino e teste de x
# Por fim, iremos utilizar a regressão linear para realizar um fit do valor de treino de X e de Y
# Ao realizar o score entre x_test com PolynomialFeature e o teste y, iremos incrementar este valor à lista de R Quadrático
# para obter os valroes para cada grau

for n in order:
    pr = PolynomialFeatures(degree = n)
    x_train_pr = pr.fit_transform(x_train)
    x_test_pr = pr.fit_transform(x_test)
    lm_3.fit(x_train_pr, y_train)
    r_quad_test.append(lm_3.score(x_test_pr, y_test))
    
r_quad_test
Out[321]:
[0.09424588101755615,
 0.06166640058217798,
 0.2276227056867567,
 0.06640983644054366]
In [328]:
# Regressão de Ridge - previne sobreajuste
# Utiliza um valor alpha para controlar os coeficientes dos graus no polinômio

x = df[["horsepower"]]
y = df[["price"]]

ridge_model = Ridge(alpha = 0.1)
ridge_model.fit(x, y)

previsao = ridge_model.predict(x)
previsao
Out[328]:
array([[14010.95228047],
       [14010.95228047],
       [19892.36356418],
       [12779.95922109],
       [14558.06030686],
       [13874.17527388],
       [13874.17527388],
       [13874.17527388],
       [17977.48547181],
       [20713.02560376],
       [12643.1822145 ],
       [12643.1822145 ],
       [15378.72234645],
       [15378.72234645],
       [15378.72234645],
       [23722.11974891],
       [23722.11974891],
       [23722.11974891],
       [ 5394.00086482],
       [ 8403.09500997],
       [ 8403.09500997],
       [ 8129.54099677],
       [ 8129.54099677],
       [12779.95922109],
       [ 8129.54099677],
       [ 8129.54099677],
       [ 8129.54099677],
       [12779.95922109],
       [10865.08112873],
       [18661.3705048 ],
       [ 6761.77093079],
       [ 9223.75704955],
       [ 7035.32494399],
       [ 9223.75704955],
       [ 9223.75704955],
       [ 9223.75704955],
       [ 9223.75704955],
       [10591.52711553],
       [10591.52711553],
       [10591.52711553],
       [10591.52711553],
       [12643.1822145 ],
       [12506.4052079 ],
       [ 9497.31106275],
       [ 8403.09500997],
       [ 8403.09500997],
       [11138.63514192],
       [22901.45770932],
       [22901.45770932],
       [34664.28027673],
       [ 8129.54099677],
       [ 8129.54099677],
       [ 8129.54099677],
       [ 8129.54099677],
       [ 8129.54099677],
       [12643.1822145 ],
       [12643.1822145 ],
       [12643.1822145 ],
       [17293.60043882],
       [10317.97310233],
       [10317.97310233],
       [10317.97310233],
       [10317.97310233],
       [ 7582.43297038],
       [10317.97310233],
       [15241.94533985],
       [ 8676.64902316],
       [15652.27635965],
       [15652.27635965],
       [15652.27635965],
       [15652.27635965],
       [20029.14057077],
       [20029.14057077],
       [23995.67376211],
       [23995.67376211],
       [22764.68070273],
       [ 8129.54099677],
       [ 8129.54099677],
       [ 8129.54099677],
       [12779.95922109],
       [14694.83731346],
       [10865.08112873],
       [18661.3705048 ],
       [18661.3705048 ],
       [18661.3705048 ],
       [10865.08112873],
       [10865.08112873],
       [14694.83731346],
       [14694.83731346],
       [ 8266.31800337],
       [ 6351.439911  ],
       [ 8266.31800337],
       [ 8266.31800337],
       [ 8266.31800337],
       [ 8266.31800337],
       [ 8266.31800337],
       [ 8266.31800337],
       [ 8266.31800337],
       [ 8266.31800337],
       [12096.0741881 ],
       [12096.0741881 ],
       [19618.80955098],
       [19618.80955098],
       [19618.80955098],
       [20713.02560376],
       [26184.10586767],
       [20713.02560376],
       [12096.0741881 ],
       [11822.52017491],
       [12096.0741881 ],
       [11822.52017491],
       [11822.52017491],
       [11822.52017491],
       [11822.52017491],
       [11822.52017491],
       [12096.0741881 ],
       [11822.52017491],
       [18251.039485  ],
       [ 8129.54099677],
       [12779.95922109],
       [ 8129.54099677],
       [ 8129.54099677],
       [ 8129.54099677],
       [10865.08112873],
       [18661.3705048 ],
       [18387.8164916 ],
       [27141.54491385],
       [27141.54491385],
       [27141.54491385],
       [38220.48244827],
       [-1171.29545187],
       [-1171.29545187],
       [13874.17527388],
       [13874.17527388],
       [13874.17527388],
       [13874.17527388],
       [20713.02560376],
       [20713.02560376],
       [ 8266.31800337],
       [ 8813.42602976],
       [ 8813.42602976],
       [10044.41908914],
       [10044.41908914],
       [11685.74316831],
       [10044.41908914],
       [14010.95228047],
       [10044.41908914],
       [11685.74316831],
       [10044.41908914],
       [14010.95228047],
       [ 7308.87895718],
       [ 7308.87895718],
       [ 7308.87895718],
       [ 7308.87895718],
       [ 7308.87895718],
       [ 7308.87895718],
       [ 8403.09500997],
       [ 8403.09500997],
       [ 6488.2169176 ],
       [ 6488.2169176 ],
       [ 8403.09500997],
       [ 8403.09500997],
       [ 8403.09500997],
       [ 8403.09500997],
       [ 8403.09500997],
       [14147.72928707],
       [14147.72928707],
       [14694.83731346],
       [14694.83731346],
       [14694.83731346],
       [14694.83731346],
       [14694.83731346],
       [14694.83731346],
       [11412.18915512],
       [ 8813.42602976],
       [11412.18915512],
       [11412.18915512],
       [11412.18915512],
       [20849.80261036],
       [20849.80261036],
       [20165.91757737],
       [20165.91757737],
       [ 5941.10889121],
       [10454.75010893],
       [ 5941.10889121],
       [10454.75010893],
       [10454.75010893],
       [ 8129.54099677],
       [12506.4052079 ],
       [11138.63514192],
       [11138.63514192],
       [13874.17527388],
       [ 8129.54099677],
       [10865.08112873],
       [14421.28330027],
       [14421.28330027],
       [14421.28330027],
       [14421.28330027],
       [20986.57961696],
       [20986.57961696],
       [14421.28330027],
       [20713.02560376],
       [17156.82343222],
       [13327.06724748],
       [14421.28330027]])
In [338]:
# Utilizando método de Busca por Grid - valores como o alpha na regressão de Ridge são Hiperparâmetros, e o
# método de Busca por Grid nos permite iterar sobre esses hiperparâmetros 

parametros = [
    {
        'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000]
    }
]

x = df[["horsepower", "curb-weight", "engine-size", "highway-mpg"]]
y = df[["price"]]

ridge = Ridge()
grid = GridSearchCV(ridge, parametros, cv = 4)
grid.fit(x, y)

melhor_estimador = grid.best_estimator_
resultados = grid.cv_results_

print("Melhor estimador: {}\n".format(melhor_estimador))
print("Teste médio dos scores: {}\n".format(resultados["mean_test_score"]))
Melhor estimador: Ridge(alpha=10000, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

Teste médio dos scores: [0.62546088 0.62546237 0.62547584 0.62560951 0.62684632 0.63342604
 0.63677298 0.60677025 0.55261827]

In [341]:
# Utilizando os parametros passando atributo de normalização

parametros = [
    {
        'alpha': [1, 10, 100, 1000],
        'normalize': [True, False]
    }
]

ridge = Ridge()
grid = GridSearchCV(ridge, parametros, cv = 4)
grid.fit(x, y)

melhor_estimador = grid.best_estimator_
resultados = grid.cv_results_

print("Melhor estimador: {}\n".format(melhor_estimador))
print("Scores: {}\n".format(resultados["params"]))
Melhor estimador: Ridge(alpha=1000, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

Scores: [{'alpha': 1, 'normalize': True}, {'alpha': 1, 'normalize': False}, {'alpha': 10, 'normalize': True}, {'alpha': 10, 'normalize': False}, {'alpha': 100, 'normalize': True}, {'alpha': 100, 'normalize': False}, {'alpha': 1000, 'normalize': True}, {'alpha': 1000, 'normalize': False}]