Sebastian Jose Teran Hidalgo
10/25/2019
This talk will cover:
In this talk:
Other ways without these tools:
In this talk:
Other reasons:
In this talk:
Others:
data.frame(r_sample = rnorm(n = 10000, mean = 1000, sd = 250)) %>% ggplot(aes(x = r_sample)) + geom_histogram(bins = 300) + geom_vline(xintercept = 1000, color = 'red') + theme_linedraw()
data.frame(r_sample = rpois(n = 10000, lambda = 3)) %>% ggplot(aes(x = r_sample)) + geom_histogram() + geom_vline(xintercept = 3, color = 'red') + theme_linedraw()
data.frame(r_sample = rbinom(n = 10000, size = 1, prob = 0.5)) %>% ggplot(aes(x = r_sample)) + geom_histogram() + theme_linedraw()
Def: Choose rows by their ordinal position in the tbl.
slice(df, c(1, 2, 3)) # rows 1, 2 ,3 slice(df, rep(c(1, 2), each = 3)) # rows 1, 1, 1, 2, 2, 2 slice(df, rep(c(1, 2), c(2, 3))) # rows 1, 1, 2, 2, 2
Simulate data on 3 cars for a A/B test twice.
n_simu <- 2; n_obs <- 3 data.frame(obs_id = 1:n_obs)
## obs_id ## 1 1 ## 2 2 ## 3 3
n_simu <- 2; n_obs <- 3 data.frame(obs_id = 1:n_obs) %>% dplyr::slice(rep(obs_id, each = n_simu)) # each n_simu times
## obs_id ## 1 1 ## 2 1 ## 3 2 ## 4 2 ## 5 3 ## 6 3
n_simu <- 2; n_obs <- 3 data.frame(obs_id = 1:n_obs) %>% dplyr::slice(rep(obs_id, each = n_simu)) %>% # each n_simu times mutate(simu_id = rep(1:n_simu, n_obs)) %>% # create simu_id arrange(simu_id, obs_id) %>% # obs_id are nested within simu_id select(simu_id, obs_id)
## simu_id obs_id ## 1 1 1 ## 2 1 2 ## 3 1 3 ## 4 2 1 ## 5 2 2 ## 6 2 3
data.frame(obs_id = 1:n_obs) %>% dplyr::slice(rep(obs_id, each = n_simu)) %>% mutate( simu_id = rep(1:n_simu, n_obs), pois_var = rpois(dplyr::n(), 10) # simulated online_views ) %>% arrange(simu_id, obs_id) %>% select(simu_id, obs_id, pois_var) %>% rename(car_id = obs_id, online_views = pois_var)
## simu_id car_id online_views ## 1 1 1 10 ## 2 1 2 10 ## 3 1 3 6 ## 4 2 1 8 ## 5 2 2 5 ## 6 2 3 7
To simulate a linear model you need to:
We will:
Simulate the number of online views.
n_simu <- 2 n_obs <- 3 data.frame(car_id = seq(1, n_obs, 1)) %>% dplyr::slice(rep(car_id, each = n_simu)) %>% mutate( simu_id = rep(1:n_simu, n_obs), price = rpois(n(), 15), # independent variable miles = rpois(n(), 25), # independent variable mean_views = 200 - 2*price - 2*miles, # mean(price, miles) num_views = rnorm(n(), mean_views, 20) # random var with mean ) %>% arrange(simu_id, car_id) %>% select(simu_id, car_id, price, miles, mean_views, num_views) %>% head(6)
## simu_id car_id price miles mean_views num_views ## 1 1 1 16 25 18 29.574479 ## 2 1 2 11 21 36 21.138754 ## 3 1 3 16 26 16 16.308039 ## 4 2 1 9 23 36 53.505714 ## 5 2 2 12 29 18 3.479131 ## 6 2 3 14 18 36 37.145661
tidyr::nest(dt, simu_id)
data.frame(car_id = seq(1, n_obs, 1)) %>% dplyr::slice(rep(car_id, each = n_simu)) %>% mutate( simu_id = rep(1:n_simu, n_obs), price = rpois(n(), 15), miles = rpois(n(), 25), mean_views = 200 - 2*price - 2*miles, num_views = rnorm(n(), mean_views, 20) ) %>% arrange(simu_id, car_id) %>% select(simu_id, car_id, price, miles, mean_views, num_views) %>% group_by(simu_id) %>% # needed for nest() below tidyr::nest() # nest the data by each simu_id
simu_nested <- data.frame(car_id = seq(1, n_obs, 1)) %>% dplyr::slice(rep(car_id, each = n_simu)) %>% mutate( simu_id = rep(1:n_simu, n_obs), price = rpois(n(), 15), miles = rpois(n(), 25), mean_views = 200 - 2*price - 2*miles, num_views = rnorm(n(), mean_views, 20) ) %>% group_by(simu_id) %>% nest()
names(simu_nested)
## [1] "simu_id" "data"
simu_nested$data[[1]] %>% head(3)
## # A tibble: 3 x 5 ## car_id price miles mean_views num_views ## <dbl> <int> <int> <dbl> <dbl> ## 1 1 14 19 134 135. ## 2 2 16 24 120 115. ## 3 3 10 26 128 126.
Defining the linear model function:
num_views_model <- function(df) { lm(num_views ~ price + miles, data = df) }
Defining the function to get the coefficient of price:
num_views_price_coef <- function(model) { as.numeric(model$coefficients[2]) }
The map functions transform their input by applying a function to each element and returning a vector the same length as the input.
purrr::map(nested_df, num_views_model)
Run one model on each simu_id.
simu_nested <- simu_nested %>% mutate( # Run the num_views_model on each simu_id model = map(data, num_views_model), # After the model ran get the price coefficient price_coef = map(model, num_views_price_coef) )
simu_nested$model[[1]]
## ## Call: ## lm(formula = num_views ~ price + miles, data = df) ## ## Coefficients: ## (Intercept) price miles ## 198.029 -1.168 -2.219
simu_nested$price_coef[[1]]
## [1] -1.167983
Nesting creates a list-column of data frames; unnesting flattens it back out into regular columns.
tidyr::unnest(nested_df, summary_df)
simu_price_coef <- unnest(simu_nested, price_coef) %>% # need help with this select(simu_id, price_coef) head(simu_price_coef, 3)
## # A tibble: 3 x 2 ## # Groups: simu_id [3] ## simu_id price_coef ## <int> <dbl> ## 1 1 -1.17 ## 2 2 -1.08 ## 3 3 -1.51
simu_price_coef %>% ggplot(aes(x = price_coef)) + geom_histogram() + theme_linedraw()
simu_price_coef %>% ungroup() %>% summarise( mean_price_coef = mean(price_coef), sd_price_coef = sd(price_coef) )
## # A tibble: 1 x 2 ## mean_price_coef sd_price_coef ## <dbl> <dbl> ## 1 -2.00 0.761
n_simu <- 2 n_obs <- 3 data.frame(car_id = seq(1, n_obs, 1)) %>% dplyr::slice(rep(car_id, each = n_simu)) %>% mutate( simu_id = rep(1:n_simu, n_obs), price = rpois(n(), 15), miles = rpois(n(), 25), treatment = rbinom(n(), 1, 0.50), # randomized treatment mean_views = 200 - 2* price - 2* miles + 10*treatment, num_views = rnorm(n(), mean_views, 20) )
## simu_id car_id price miles treatment mean_views num_views ## 1 1 1 15 24 1 132 123.60111 ## 2 1 2 9 29 0 124 113.45211 ## 3 1 3 17 28 0 110 136.85678 ## 4 2 1 14 35 1 112 99.09604 ## 5 2 2 17 20 1 136 167.00747 ## 6 2 3 17 24 0 118 141.42582
num_views_model_without <- function(df) { lm(num_views ~ treatment, data = df) } num_views_model_with <- function(df) { lm(num_views ~ treatment + price + miles, data = df) } treatment_pvalue <- function(model) { summary(model)$coefficients[2, 4] }
simu_ab <- data.frame(car_id = seq(1, n_obs, 1)) %>% dplyr::slice(rep(car_id, each = n_simu)) %>% mutate( simu_id = rep(1:n_simu, n_obs), price = rpois(n(), 15), miles = rpois(n(), 25), treatment = rbinom(n(), 1, 0.50), mean_views = 200 - 2*price - 2*miles + 10*treatment, num_views = rnorm(n(), mean_views, 20) )
simu_ab_nested <- simu_ab %>% group_by(simu_id) %>% nest() %>% mutate( model_without = map(data, num_views_model_without), model_with = map(data, num_views_model_with), pvalues_without = map(model_without, treatment_pvalue), pvalues_with = map(model_with, treatment_pvalue) ) names(simu_ab_nested)
## [1] "simu_id" "data" "model_without" "model_with" ## [5] "pvalues_without" "pvalues_with"
simu_ab_nested$model_without[[1]]$coefficients
## (Intercept) treatment ## 117.34793 14.26929
simu_ab_nested$model_with[[1]]$coefficients
## (Intercept) treatment price miles ## 181.371428 16.948224 -1.395408 -1.738635
simu_ab_nested$pvalues_without[[1]]
## [1] 0.003314815
simu_ab_nested$pvalues_with[[1]]
## [1] 0.0001615344
simu_ab_pvalues_without <- simu_ab_nested %>% unnest(pvalues_without) %>% select(simu_id, pvalues_without) simu_ab_pvalues_with <- simu_ab_nested %>% unnest(pvalues_with) %>% select(simu_id, pvalues_with)
simu_ab_pvalues_with %>% ungroup() %>% summarise(power_with = mean(pvalues_with < 0.05))
## # A tibble: 1 x 1 ## power_with ## <dbl> ## 1 0.675
simu_ab_pvalues_without %>% ungroup() %>% summarise(power_without = mean(pvalues_without < 0.05))
## # A tibble: 1 x 1 ## power_without ## <dbl> ## 1 0.56
n_steps_x <- 10 n_steps_y <- 5 dots_df <- data.frame(n_lines = rpois(1, 10) + 1) %>% # number of lines dplyr::slice(rep(1, n_lines)) %>% mutate( line_id = 1:n(), # assign an id to each line line_x_start = sample(2:(n_steps_x - 1), n(), replace = TRUE), line_y_start = sample(1:n_steps_y, n(), replace = TRUE), line_length = rdunif(n(), 1, 3), line_thick = rdunif(n(), 1, 5) ) %>% dplyr::slice(rep(line_id, line_length)) %>% select(-n_lines)
dots_df %>% head(4)
## line_id line_x_start line_y_start line_length line_thick ## 1 1 6 2 1 1 ## 2 2 9 3 2 2 ## 3 2 9 3 2 2 ## 4 3 8 4 2 2
lines_df <- dots_df %>% group_by(line_id) %>% mutate( x_axis_line = line_x_start + 0:(n() - 1), y_axis_steps = sample(c(-1, 1), n(), replace = TRUE), y_axis_line = line_y_start + y_axis_steps ) %>% ungroup()
lines_df %>% ggplot(aes( x = x_axis_line, y = y_axis_line, group = as.factor(line_id), size = line_thick ), color = 'black') + geom_line() + geom_hline(yintercept = 1, size = 0.1) + geom_hline(yintercept = 2, size = 0.1) + geom_hline(yintercept = 3, size = 0.1) + geom_hline(yintercept = 4, size = 0.1) + geom_hline(yintercept = 5, size = 0.1) + ylim(1, 5) + theme_void() + theme(legend.position = 'none')
n_squares <- 20^2 n_steps_x <- 10 n_steps_y <- 5 data.frame(squares_id = seq(1:n_squares)) %>% mutate(n_lines = rpois(1, 6) + 1) %>% # number of lines dplyr::slice(rep(1:n(), n_lines)) %>% mutate( line_id = 1:n(), # assign an id to each line line_x_start = sample(2:(n_steps_x - 1), n(), replace = TRUE), line_y_start = sample(1:n_steps_y, n(), replace = TRUE), line_length = rdunif(n(), 2, 4), line_thick = rdunif(n(), 1, 3) ) %>% dplyr::slice(rep(line_id, line_length)) %>% select(-n_lines) %>% group_by(line_id) %>% # continues
You can all be aRtists!
Special Thanks to: