10 Regresión lineal

pacman::p_load(tidyverse, performance, ggeffects)

performance

ggeffects

10.1 Simple

# Fit a simple linear regression of y on x, restricted to Anscombe set 1.
lm1 <- lm(
  formula = y ~ x,
  data = anscombe_long %>% filter(set == 1)
)

# Visual model diagnostics, then an explicit outlier check on the same fit.
check_model(lm1)
check_outliers(lm1)

# ANOVA table first, full coefficient summary second.
anova(lm1)
summary(lm1)

Agregar la fórmula y el R² al gráfico

library(ggpmisc)

# Single model formula shared by the smoother and the equation label, so the
# drawn line and the printed equation always describe the same fit.
formu <- y ~ x

# Scatter plot of Anscombe set 1 with the fitted line, annotated with the
# fitted equation and R^2.
anscombe_long %>%
  filter(set == 1) %>%
  ggplot() +
  aes(x, y) +
  geom_point() +
  # Passing the formula explicitly avoids the "using formula" message.
  geom_smooth(method = "lm", formula = formu) +
  stat_poly_eq(
    formula = formu,
    # after_stat() replaces the deprecated `..var..` notation; "~~~" is
    # plotmath spacing between the two labels, hence parse = TRUE.
    aes(
      label = paste(
        after_stat(eq.label),
        after_stat(rr.label),
        sep = "~~~"
      )
    ),
    parse = TRUE
  )

10.2 Regresión polinomial

Es un caso especial de la regresión lineal: enriquece el modelo lineal al agregar predictores adicionales, obtenidos al elevar cada uno de los predictores originales a una potencia.

La regresión ajustada parece correcta para el set 1, pero veamos para el set 2

# Straight-line fit of y on x for Anscombe set 2, followed by diagnostics.
lm2 <- lm(
  formula = y ~ x,
  data = anscombe_long %>% filter(set == 2)
)
check_model(lm2)
check_outliers(lm2)

# Same data with an added quadratic term; I() keeps x^2 as arithmetic
# instead of formula syntax.
lm2.sq <- lm(
  formula = y ~ x + I(x^2),
  data = anscombe_long %>% filter(set == 2)
)

# Side-by-side performance indices of the linear and quadratic fits.
compare_performance(lm2, lm2.sq)

Regresiones lineales seriales

library(broom)

# Fit y ~ x separately within each `set` and stack the coefficient tables.
# reframe() + pick() replace the superseded do(); pick(everything()) supplies
# the current group's rows as the model data. The result is ungrouped.
lm_ans_coef <- anscombe_long %>%
  group_by(set) %>%
  reframe(tidy(lm(y ~ x, data = pick(everything()))))

lm_ans_coef

# Same per-set fits, summarised with one row of model-level statistics each.
lm_ans_stats <- anscombe_long %>%
  group_by(set) %>%
  reframe(glance(lm(y ~ x, data = pick(everything()))))

lm_ans_stats

Ahora veamos un caso del dataset cars

# Scatter plot of dist against speed for the built-in cars data.
ggplot(cars, aes(speed, dist)) +
  geom_point()

# Same scatter plot with two lm-based smoothers overlaid: the default
# straight line and, in red, a second-degree polynomial.
# Alternatives left disabled by the author:
#   geom_smooth()
#   geom_smooth(method = lm, formula = y ~ poly(x, 1))
ggplot(cars, aes(speed, dist)) +
  geom_point() +
  geom_smooth(method = lm) +
  geom_smooth(method = lm, formula = y ~ poly(x, 2), colour = "red")

Ajustamos las dos alternativas

# Alternative 1: straight-line fit of dist on speed, with diagnostics.
fit1 <- lm(formula = dist ~ speed, data = cars)
check_model(fit1)

# Alternative 2: the same model plus a quadratic speed term.
fit2 <- lm(formula = dist ~ speed + I(speed^2), data = cars)

# Compare both fits, then inspect the quadratic one in detail.
compare_performance(fit1, fit2)
summary(fit2)

# Predictions of the quadratic model along speed, plotted over the raw data.
pred_fit2 <- ggpredict(fit2, terms = "speed")

plot(pred_fit2, add.data = TRUE, limit.range = TRUE)

10.3 Regresión múltiple

La regresión lineal múltiple permite generar un modelo lineal en el que el valor de la variable dependiente o respuesta (Y) se determina a partir de un conjunto de variables independientes llamadas predictores (X1, X2, X3…)

Volvamos a iris para tratar de modelar Sepal.Width ~ Sepal.Length

# Four candidate models for Sepal.Width that differ in how Species enters.
# model1: Sepal.Length only.
model1 <- lm(formula = Sepal.Width ~ Sepal.Length, data = iris)
# model2: `*` expands to both main effects plus their interaction, i.e.
# Sepal.Length + Species + Sepal.Length:Species.
model2 <- lm(formula = Sepal.Width ~ Sepal.Length * Species, data = iris)
# model3: additive main effects, no interaction term.
model3 <- lm(formula = Sepal.Width ~ Sepal.Length + Species, data = iris)
# model4: interaction term only, without separate main effects.
model4 <- lm(formula = Sepal.Width ~ Sepal.Length:Species, data = iris)

compare_performance(model1, model2, model3, model4)

# Closer look at the full interaction model.
anova(model2)
check_model(model2)

# Predictions along Sepal.Length for each Species, drawn over the raw data.
pred_mod <- ggpredict(model2, terms = c("Sepal.Length", "Species"))

plot(pred_mod, add.data = TRUE, limit.range = TRUE)

¿Cómo se corrige la paradoja de Simpson? Atribuyendo un efecto aleatorio a los grupos:

library(lme4)

# Pooled fit that ignores Group.
simpson_lm <- lm(formula = V2 ~ V1, data = simpson)
# Mixed model with a random intercept per Group.
simpson_mixed <- lmer(formula = V2 ~ V1 + (1 | Group), data = simpson)

# Predictions from each model plotted over the raw data, for comparison.
plot(ggpredict(simpson_lm), add.data = TRUE)
plot(ggpredict(simpson_mixed), add.data = TRUE)