Skip to content

Instantly share code, notes, and snippets.

@jbkunst
Created November 30, 2017 21:25

Revisions

  1. jbkunst created this gist Nov 30, 2017.
    167 changes: 167 additions & 0 deletions logistic-regression.r
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,167 @@
    # packages ----------------------------------------------------------------
    rm(list = ls())
    library(tidyverse)
    theme_set(theme_minimal())


    # data --------------------------------------------------------------------
    n <- 500
    df <- data_frame(
    x1 = rnorm(n) + 1,
    x2 = 5 + 2 * rnorm(n)
    )

    ggplot(df) +
    geom_point(aes(x1, x2))

    df <- df %>%
    mutate(y = as.numeric(x1 > (x2 - 5)/2 + rnorm(n)))

    ggplot(df) +
    geom_point(aes(x1, x2, color = y))

    ggplot(df) +
    geom_point(aes(x1, y, color = y), alpha = 0.25)

    df

    df %>%
    gather(key, value, -y) %>%
    ggplot() +
    geom_point(aes(value, y, color = y), alpha = 0.25) +
    facet_wrap(~ key, scales = "free")


    ggplot(df) +
    geom_density(aes(x1, fill = y, group = y), alpha = 0.75)





    # ajuste visual -----------------------------------------------------------
    ggplot(df) +
    geom_point(aes(x1, y, color = y), alpha = 0.25) +
    geom_smooth(aes(x1, y), method = "lm")


    ggplot(df) +
    geom_point(aes(x1, y, color = y), alpha = 0.25) +
    geom_smooth(aes(x1, y), method = "loess")



    # logit -------------------------------------------------------------------
    df2 <- data_frame(
    s = seq(-10, 10, length.out = 101)
    )

    ggplot(df2) +
    geom_line(aes(s, sin(s)))

    ggplot(df2) +
    geom_line(aes(s, 1 / (1 + exp(s))))

    ggplot(df2) +
    geom_line(aes(s, 1 / (1 + exp(-s))))

    ggplot(df2) +
    geom_line(aes(s, 1 / (1 + exp(-(2 + - 2 * s)))))

    s <- seq(-10, 10, length.out = 101)
    s2 <- seq(-5, 5, length.out = 6)

    p <- expand.grid(x = s, b0 = s2, b1 = s2) %>%
    tbl_df() %>%
    mutate(y = 1 / (1 + exp(-(b0 + b1 * x)))) %>%
    ggplot() +
    geom_line(aes(x, y)) +
    facet_grid(b0 ~ b1) +
    theme(axis.title.x = element_blank(), axis.text = element_blank(), axis.ticks = element_blank())

    p

    p +
    geom_point(data = df, aes(x1, y, color = y), alpha = 0.25) +
    theme(legend.position = "none")


    # ajuste parametros -------------------------------------------------------
    m1 <- glm(y ~ x1, family = binomial(link = "logit"), data = df)
    m1
    c <- m1$coefficients
    c

    ggplot() +
    geom_jitter(data = df, aes(x1, y, color = y), alpha = 0.2, height = 0.01) +
    geom_line(data = df2, aes(s, 1 / (1 + exp(-(c[1] + c[2] * s))))) +
    xlim(-2, 3)

    m2 <- glm(y ~ x1 + x2, family = binomial(link = "logit"), data = df)
    m2
    c <- m2$coefficients
    c

    df_sur <- expand.grid(
    x1 = seq(min(df$x1), max(df$x1), length.out = 100),
    x2 = seq(min(df$x2), max(df$x2), length.out = 100)
    ) %>%
    tbl_df() %>%
    mutate(
    y = predict(m2, newdata = ., type = "response")
    )
    df_sur

    df_surm <- df_sur %>%
    spread(x1, y) %>%
    select(-x2) %>%
    as.matrix()

    library(plotly)
    plot_ly() %>%
    add_surface(x = unique(sort(df_sur$x1)), y = unique(sort(df_sur$x2)), z = df_surm) %>%
    add_markers(data = df, x = ~x1, y = ~x2, z = ~y, color = ~y)

    # evaluar -----------------------------------------------------------------
    df <- df %>%
    mutate(
    p1 = predict(m1, newdata = ., type = "response"),
    p2 = predict(m2, newdata = ., type = "response"),
    l2 = predict(m2, newdata = ., type = "link")
    )

    ggplot(df) +
    geom_density(aes(p1))

    df %>%
    select(y, starts_with("p")) %>%
    gather(key, value, -y) %>%
    ggplot() +
    geom_density(aes(value)) +
    facet_wrap(~ key)

    df %>%
    select(y, starts_with("p")) %>%
    gather(key, value, -y) %>%
    ggplot() +
    geom_density(aes(value, group = y, fill = y), alpha = 0.75) +
    facet_wrap(~ key)



    # acumulada ---------------------------------------------------------------
    x <- 1:10
    ecdf_x <- ecdf(x)

    plot(ecdf_x)

    n <- 5000
    plot(ecdf(runif(n)))

    plot(ecdf(rnorm(n)))
    plot(ecdf(rexp(n)), add = TRUE)
    plot(ecdf(runif(n)), add = TRUE)

    df <- df %>%
    mutate()