我正在尝试编写一个名为 grouped_lm, 的函数,它基本上为多个标准/依赖( grouping.vars )和预测变量/独立变量( crit.vars )的分组变量( pred.vars )组合的每个级别运行线性回归模型。

这样做的方式是将第一次进入 crit.vars 回归到 pred.vars 上。
例如,如果我输入 grouping.vars = amcrit.vars = c(mpg, drat)crit.vars = c(wt, disp) (在 mtcars 数据集的上下文中),该函数将为分组变量 mpg ~ wt ( drat ~ dispam )的每个级别运行两个回归模型( am = 0am = 1 )。

我设法从输入的变量创建了一个数据框,编写了一个运行线性回归模型的自定义函数,但似乎无法弄清楚如何使用 rlang 将输入的变量输入到将输入到 purrr::pmap 的列表元素中。

对一个冗长的问题深表歉意,并提前感谢您提供的任何帮助。

# libraries needed
library(tidyverse)
library(plyr)

# function definition
grouped_lm <- function(data,
                       grouping.vars,
                       crit.vars,
                       pred.vars) {

  #================== preparing dataframe ==================
  #
  # check how many variables were entered for criterion variables vector
  crit.vars <-
    as.list(rlang::quo_squash(rlang::enquo(crit.vars)))
  crit.vars <-
    if (length(crit.vars) == 1) {
      crit.vars
    } else {
      crit.vars[-1]
    }

  # check how many variables were entered for predictor variables vector
  pred.vars <-
    as.list(rlang::quo_squash(rlang::enquo(pred.vars)))
  pred.vars <-
    if (length(pred.vars) == 1) {
      pred.vars
    } else {
      pred.vars[-1]
    }

  # check how many variables were entered for grouping variable vector
  grouping.vars <-
    as.list(rlang::quo_squash(rlang::enquo(grouping.vars)))
  grouping.vars <-
    if (length(grouping.vars) == 1) {
      grouping.vars
    } else {
      grouping.vars[-1]
    }

  # getting the dataframe ready
  df <- dplyr::select(.data = data,
                      !!!grouping.vars,
                      !!!crit.vars,
                      !!!pred.vars) %>%
    dplyr::group_by(.data = ., !!!grouping.vars) %>%
    tidyr::nest(data = .)

  # checking if the nested dataframe looks okay
  cat(paste("the entire nested dataframe: \n"))
  print(df)             # the entire nested dataframe
  cat(paste("first element of the list column from nested dataframe: \n"))
  print(df$data[[1]])   # first element of the list column

  #============== custom function ================

  # custom function to run linear regression for every element of a list for two variables
  lm_listed <- function(list.col, x_name, y_name) {
    fx <- glue::glue("scale({y_name}) ~ scale({x_name})")

    # this tags any names that are not predictor variables (used to remove intercept terms)
    filter_name  <- glue::glue("scale({x_name})")

    # dataframe with results from lm
    results_df <-
      list.col %>% # running linear regression on each individual group with purrr
      purrr::map(.x = .,
                 .f = ~ stats::lm(formula = as.formula(fx),
                                  data = (.))) %>% # tidying up the output with broom
      purrr::map_dfr(.x = .,
                     .f = ~ broom::tidy(x = .),
                     .id = "group") %>% # remove intercept terms
      dplyr::filter(.data = ., term == !!filter_name) %>% # add formula as a character
      dplyr::mutate(.data = ., formula = as.character(fx)) %>% # rearrange the dataframe
      dplyr::select(
        .data = .,
        group,
        formula,
        term,
        estimate,
        std.error,
        t = statistic,
        p.value
      ) %>% # convert to a tibble dataframe
      tibble::as_data_frame(x = .)

    # return the dataframe
    return(results_df)
  }

  # check if the function works
  group_mtcars <- split(mtcars, mtcars$am)
  fn_results <- purrr::pmap(.l = list(
    l = list(group_mtcars),
    x_name = list('wt', 'disp'),
    y_name = list('mpg', 'drat')
  ),
  .f = lm_listed) %>%
    dplyr::bind_rows()

  # seems to be working!
  cat(paste("the custom function seems to be working!: \n"))
  print(fn_results)

  #========= using  custom function on entered dataframe =================
  cat(paste("running the custom function on the entered dataframe: \n"))
  # running custom function for each element of the created list column
  df_lm <- purrr::pmap(.l = list(
    l = list(df$data),
    x_name = list(!!!pred.vars),
    y_name = list(!!!crit.vars)
  ),
  .f = lm_listed) %>%
    dplyr::bind_rows()

  #============================== output ========================

  print(df_lm)
  # return the final dataframe with results
  return(df_lm)
}

# example usage of the function
grouped_lm(
  data = iris,
  crit.vars = c(Sepal.Length, Petal.Length),
  pred.vars = c(Sepal.Width, Petal.Width),
  grouping.vars = Species
)
#> the entire nested dataframe:
#> # A tibble: 3 x 2
#>   Species    data
#>   <fct>      <list>
#> 1 setosa     <tibble [50 x 4]>
#> 2 versicolor <tibble [50 x 4]>
#> 3 virginica  <tibble [50 x 4]>
#> first element of the list column from nested dataframe:
#> # A tibble: 50 x 4
#>    Sepal.Length Petal.Length Sepal.Width Petal.Width
#>           <dbl>        <dbl>       <dbl>       <dbl>
#>  1         5.10         1.40        3.50       0.200
#>  2         4.90         1.40        3.00       0.200
#>  3         4.70         1.30        3.20       0.200
#>  4         4.60         1.50        3.10       0.200
#>  5         5.00         1.40        3.60       0.200
#>  6         5.40         1.70        3.90       0.400
#>  7         4.60         1.40        3.40       0.300
#>  8         5.00         1.50        3.40       0.200
#>  9         4.40         1.40        2.90       0.200
#> 10         4.90         1.50        3.10       0.100
#> # ... with 40 more rows
#> the custom function seems to be working!:
#> # A tibble: 4 x 7
#>   group formula                   term    estimate std.error     t p.value
#>   <chr> <chr>                     <chr>      <dbl>     <dbl> <dbl>   <dbl>
#> 1 0     scale(mpg) ~ scale(wt)    scale(~   -0.768     0.155 -4.94 1.25e-4
#> 2 1     scale(mpg) ~ scale(wt)    scale(~   -0.909     0.126 -7.23 1.69e-5
#> 3 0     scale(drat) ~ scale(disp) scale(~   -0.614     0.192 -3.20 5.20e-3
#> 4 1     scale(drat) ~ scale(disp) scale(~   -0.305     0.287 -1.06 3.12e-1
#> running the custom function on the entered dataframe:
#> Error in !pred.vars: invalid argument type

reprex package (v0.2.0) 于 2018 年 3 月 23 日创建。

提供答案后的编辑

我还想知道如何在输出中为每个分组变量获取单独的列。因此,根据下面提供的答案,如果我运行-

grouped_lm(
  data = mtcars,
  crit.vars = c(wt, mpg),
  pred.vars = c(drat, disp),
  grouping.vars = c(am, cyl)
)

它有效,但输出看起来像这样:

r - 使用 rlang 列出 purrr::pmap 中的变量-LMLPHP

从图中可以看出,值 1 到 6 代表什么完全不清楚。因此,最好为每个提供的分组变量设置一个单独的列,这样在本例中,每个 am 模型将有两列 cyllm 以及它们各自的级别。

(我手动创建了这个数据框。这不是分组的方式,但这只是为了显示所需的输出是什么样的。)

r - 使用 rlang 列出 purrr::pmap 中的变量-LMLPHP

最佳答案

如果我们需要复制与 'mtcars' 的示例用法相同的行为,其中 x_namey_name 是字符串而不是 symbols(这是 'pred.vars' 和 'crit.vars' 的情况),请将它们转换为字符串使用 quo_name

df_lm <- purrr::pmap(.l = list(
    l = list(df$data),
    x_name = map(pred.vars, quo_name),
    y_name = map(crit.vars, quo_name)
  ),
  .f = lm_listed) %>%
    dplyr::bind_rows()

  #============================== output ========================

  print(df_lm)
  # return the final dataframe with results
  return(df_lm)

}

或者作为 symbol 传递而不进行任何评估,即使用 !!
df_lm <- purrr::pmap(.l = list(
    l = list(df$data),
    x_name = pred.vars,  ###
    y_name = crit.vars   ###
  ),
  .f = lm_listed) %>%
    dplyr::bind_rows()

  #============================== output ========================

  print(df_lm)
  # return the final dataframe with results
  return(df_lm)

}

这与 lm_listed 函数如何接受参数有关。将对象视为字符串
sl <- "Sepal.Length"
sw <- "Sepal.Width"

并且 glue 正确返回它
glue::glue("scale({sl}) ~ scale({sw})")
#scale(Sepal.Length) ~ scale(Sepal.Width)

现在,我们将其更改为 symbol ,它也可以工作
sl <- rlang::sym("Sepal.Length")
sw <- rlang::sym("Sepal.Width")
glue::glue("scale({sl}) ~ scale({sw})")
#scale(Sepal.Length) ~ scale(Sepal.Width)

但是,问题在于使用 !! 来评估作为输入参数传递的
sl <- !!rlang::sym("Sepal.Length")


!!tidyverse 函数的环境之外进行评估,这会导致错误

-完整代码
grouped_lm <- function(data,
                       grouping.vars,
                       crit.vars,
                       pred.vars) {

  #================== preparing dataframe ==================
  #
  # check how many variables were entered for criterion variables vector
  crit.vars <-
    as.list(rlang::quo_squash(rlang::enquo(crit.vars)))
  crit.vars <-
    if (length(crit.vars) == 1) {
      crit.vars
    } else {
      crit.vars[-1]
    }

  # check how many variables were entered for predictor variables vector
  pred.vars <-
    as.list(rlang::quo_squash(rlang::enquo(pred.vars)))
  pred.vars <-
    if (length(pred.vars) == 1) {
      pred.vars
    } else {
      pred.vars[-1]
    }

  # check how many variables were entered for grouping variable vector
  grouping.vars <-
    as.list(rlang::quo_squash(rlang::enquo(grouping.vars)))
  grouping.vars <-
    if (length(grouping.vars) == 1) {
      grouping.vars
    } else {
      grouping.vars[-1]
    }

  # getting the dataframe ready
  df <- dplyr::select(.data = data,
                      !!!grouping.vars,
                      !!!crit.vars,
                      !!!pred.vars) %>%
    dplyr::group_by(.data = ., !!!grouping.vars) %>%
    tidyr::nest(data = .)

  # checking if the nested dataframe looks okay
  cat(paste("the entire nested dataframe: \n"))
  print(df)             # the entire nested dataframe
  cat(paste("first element of the list column from nested dataframe: \n"))
  print(df$data[[1]])   # first element of the list column

  #============== custom function ================

  # custom function to run linear regression for every element of a list for two variables
  lm_listed <- function(list.col, x_name, y_name) {
    fx <- glue::glue("scale({y_name}) ~ scale({x_name})")

    # this tags any names that are not predictor variables (used to remove intercept terms)
    filter_name  <- glue::glue("scale({x_name})")

    # dataframe with results from lm
    results_df <-
      list.col %>% # running linear regression on each individual group with purrr
      purrr::map(.x = .,
                 .f = ~ stats::lm(formula = as.formula(fx),
                                  data = (.))) %>% # tidying up the output with broom
      purrr::map_dfr(.x = .,
                     .f = ~ broom::tidy(x = .),
                     .id = "group") %>% # remove intercept terms
      dplyr::filter(.data = ., term == !!filter_name) %>% # add formula as a character
      dplyr::mutate(.data = ., formula = as.character(fx)) %>% # rearrange the dataframe
      dplyr::select(
        .data = .,
        group,
        formula,
        term,
        estimate,
        std.error,
        t = statistic,
        p.value
      ) %>% # convert to a tibble dataframe
      tibble::as_data_frame(x = .)

    # return the dataframe
    return(results_df)
  }

  # check if the function works
  group_mtcars <- split(mtcars, mtcars$am)
  fn_results <- purrr::pmap(.l = list(
    l = list(group_mtcars),
    x_name = list('wt', 'disp'),
    y_name = list('mpg', 'drat')
  ),
  .f = lm_listed) %>%
    dplyr::bind_rows()

  # seems to be working!
  cat(paste("the custom function seems to be working!: \n"))
  print(fn_results)

  #========= using  custom function on entered dataframe =================
  cat(paste("running the custom function on the entered dataframe: \n"))
  # running custom function for each element of the created list column

  df_lm <- purrr::pmap(.l = list(
    l = list(df$data),
    x_name = pred.vars,
    y_name = crit.vars
  ),
  .f = lm_listed) %>%
    dplyr::bind_rows()

  #============================== output ========================

  print(df_lm)
  # return the final dataframe with results
  return(df_lm)

}

- 运行函数
res <- grouped_lm(
  data = iris,
  crit.vars = c(Sepal.Length, Petal.Length),
  pred.vars = c(Sepal.Width, Petal.Width),
  grouping.vars = Species
)

-输出打印
#the entire nested dataframe:
# A tibble: 3 x 2
#  Species    data
#  <fctr>     <list>
#1 setosa     <tibble [50 x 4]>
#2 versicolor <tibble [50 x 4]>
#3 virginica  <tibble [50 x 4]>
#first element of the list column from nested dataframe:
# A tibble: 50 x 4
#   Sepal.Length Petal.Length Sepal.Width Petal.Width
#          <dbl>        <dbl>       <dbl>       <dbl>
# 1         5.10         1.40        3.50       0.200
# 2         4.90         1.40        3.00       0.200
# 3         4.70         1.30        3.20       0.200
# 4         4.60         1.50        3.10       0.200
# 5         5.00         1.40        3.60       0.200
# 6         5.40         1.70        3.90       0.400
# 7         4.60         1.40        3.40       0.300
# 8         5.00         1.50        3.40       0.200
# 9         4.40         1.40        2.90       0.200
#10         4.90         1.50        3.10       0.100
# ... with 40 more rows
#the custom function seems to be working!:
# A tibble: 4 x 7
#  group formula                   term        estimate std.error     t   p.value
#  <chr> <chr>                     <chr>          <dbl>     <dbl> <dbl>     <dbl>
#1 0     scale(mpg) ~ scale(wt)    scale(wt)     -0.768     0.155 -4.94 0.000125
#2 1     scale(mpg) ~ scale(wt)    scale(wt)     -0.909     0.126 -7.23 0.0000169
#3 0     scale(drat) ~ scale(disp) scale(disp)   -0.614     0.192 -3.20 0.00520
#4 1     scale(drat) ~ scale(disp) scale(disp)   -0.305     0.287 -1.06 0.312
#running the custom function on the entered dataframe:
# A tibble: 6 x 7
#  group formula                                  term               estimate std.error     t         p.value
#  <chr> <chr>                                    <chr>                 <dbl>     <dbl> <dbl>           <dbl>
#1 1     scale(Sepal.Length) ~ scale(Sepal.Width) scale(Sepal.Width)    0.743    0.0967  7.68 0.000000000671
#2 2     scale(Sepal.Length) ~ scale(Sepal.Width) scale(Sepal.Width)    0.526    0.123   4.28 0.0000877
#3 3     scale(Sepal.Length) ~ scale(Sepal.Width) scale(Sepal.Width)    0.457    0.128   3.56 0.000843
#4 1     scale(Petal.Length) ~ scale(Petal.Width) scale(Petal.Width)    0.332    0.136   2.44 0.0186
#5 2     scale(Petal.Length) ~ scale(Petal.Width) scale(Petal.Width)    0.787    0.0891  8.83 0.0000000000127
#6 3     scale(Petal.Length) ~ scale(Petal.Width) scale(Petal.Width)    0.322    0.137   2.36 0.0225

-结果输出
res
# A tibble: 6 x 7
#  group formula                                  term               estimate std.error     t         p.value
#  <chr> <chr>                                    <chr>                 <dbl>     <dbl> <dbl>           <dbl>
#1 1     scale(Sepal.Length) ~ scale(Sepal.Width) scale(Sepal.Width)    0.743    0.0967  7.68 0.000000000671
#2 2     scale(Sepal.Length) ~ scale(Sepal.Width) scale(Sepal.Width)    0.526    0.123   4.28 0.0000877
#3 3     scale(Sepal.Length) ~ scale(Sepal.Width) scale(Sepal.Width)    0.457    0.128   3.56 0.000843
#4 1     scale(Petal.Length) ~ scale(Petal.Width) scale(Petal.Width)    0.332    0.136   2.44 0.0186
#5 2     scale(Petal.Length) ~ scale(Petal.Width) scale(Petal.Width)    0.787    0.0891  8.83 0.0000000000127
#6 3     scale(Petal.Length) ~ scale(Petal.Width) scale(Petal.Width)    0.322    0.137   2.36 0.0225

如果我们需要在输出中也有“grouping.vars”
grouped_lm <- function(data,
                       grouping.vars,
                       crit.vars,
                       pred.vars) {

  #================== preparing dataframe ==================
  #
  # check how many variables were entered for criterion variables vector
  crit.vars <-
    as.list(rlang::quo_squash(rlang::enquo(crit.vars)))
  crit.vars <-
    if (length(crit.vars) == 1) {
      crit.vars
    } else {
      crit.vars[-1]
    }

  # check how many variables were entered for predictor variables vector
  pred.vars <-
    as.list(rlang::quo_squash(rlang::enquo(pred.vars)))
  pred.vars <-
    if (length(pred.vars) == 1) {
      pred.vars
    } else {
      pred.vars[-1]
    }

  # check how many variables were entered for grouping variable vector
  grouping.vars <-
    as.list(rlang::quo_squash(rlang::enquo(grouping.vars)))
  grouping.vars <-
    if (length(grouping.vars) == 1) {
      grouping.vars
    } else {
      grouping.vars[-1]
    }

  # getting the dataframe ready
  df <- dplyr::select(.data = data,
                      !!!grouping.vars,
                      !!!crit.vars,
                      !!!pred.vars) %>%
    dplyr::group_by(.data = ., !!!grouping.vars) %>%
    tidyr::nest(data = .)

  # checking if the nested dataframe looks okay
  cat(paste("the entire nested dataframe: \n"))
  print(df)             # the entire nested dataframe
  cat(paste("first element of the list column from nested dataframe: \n"))
  print(df$data[[1]])   # first element of the list column

  #============== custom function ================

  # custom function to run linear regression for every element of a list for two variables
  lm_listed <- function(list.col, x_name, y_name) {
    fx <- glue::glue("scale({y_name}) ~ scale({x_name})")

    # this tags any names that are not predictor variables (used to remove intercept terms)
    filter_name  <- glue::glue("scale({x_name})")

    # dataframe with results from lm
    results_df <-
      list.col %>% # running linear regression on each individual group with purrr
      purrr::map(.x = .,
                 .f = ~ stats::lm(formula = as.formula(fx),
                                  data = (.))) %>% # tidying up the output with broom
      purrr::map_dfr(.x = .,
                     .f = ~ broom::tidy(x = .),
                     .id = "group") %>% # remove intercept terms
      dplyr::filter(.data = ., term == !!filter_name) %>% # add formula as a character
      dplyr::mutate(.data = ., formula = as.character(fx)) %>% # rearrange the dataframe
      dplyr::select(
        .data = .,
        group,
        formula,
        term,
        estimate,
        std.error,
        t = statistic,
        p.value
      ) %>% # convert to a tibble dataframe
      tibble::as_data_frame(x = .)

    # return the dataframe
    return(results_df)
  }

  # check if the function works
  group_mtcars <- split(mtcars, mtcars$am)
  fn_results <- purrr::pmap(.l = list(
    l = list(group_mtcars),
    x_name = list('wt', 'disp'),
    y_name = list('mpg', 'drat')
  ),
  .f = lm_listed) %>%
    dplyr::bind_rows()

  # seems to be working!
  cat(paste("the custom function seems to be working!: \n"))
  print(fn_results)

  #========= using  custom function on entered dataframe =================
  cat(paste("running the custom function on the entered dataframe: \n"))
  # running custom function for each element of the created list column

  df <- df %>%
          tibble::rownames_to_column('group')
  df_lm <- purrr::pmap(.l = list(
    l = list(df$data),
    x_name = pred.vars,
    y_name = crit.vars
  ),
  .f = lm_listed) %>%
    dplyr::bind_rows() %>%
    left_join(df) %>%
    select(!!!grouping.vars, everything()) %>%
    select(-group, -data)



  #============================== output ========================

  print(df_lm)
  # return the final dataframe with results
  return(df_lm)

}

-运行功能
r1 <- grouped_lm(
   data = mtcars,
   crit.vars = c(wt, mpg),
   pred.vars = c(drat, disp),
   grouping.vars = c(am, cyl)
 )

-输出
r1
# A tibble: 12 x 8
#      am   cyl formula                  term        estimate std.error        t   p.value
#   <dbl> <dbl> <chr>                    <chr>          <dbl>     <dbl>    <dbl>     <dbl>
# 1  1.00  6.00 scale(wt) ~ scale(drat)  scale(drat)   -0.101     0.995 -  0.102   0.935
# 2  1.00  4.00 scale(wt) ~ scale(drat)  scale(drat)   -0.226     0.398 -  0.568   0.591
# 3  0     6.00 scale(wt) ~ scale(drat)  scale(drat)    0.307     0.673    0.456   0.693
# 4  0     8.00 scale(wt) ~ scale(drat)  scale(drat)   -0.119     0.314 -  0.379   0.713
# 5  0     4.00 scale(wt) ~ scale(drat)  scale(drat)    0.422     0.906    0.466   0.722
# 6  1.00  8.00 scale(wt) ~ scale(drat)  scale(drat)   -1.00    NaN      NaN     NaN
# 7  1.00  6.00 scale(mpg) ~ scale(disp) scale(disp)    1.00      0      Inf       0
# 8  1.00  4.00 scale(mpg) ~ scale(disp) scale(disp)   -0.835     0.225 -  3.72    0.00991
# 9  0     6.00 scale(mpg) ~ scale(disp) scale(disp)    0.670     0.525    1.28    0.330
#10  0     8.00 scale(mpg) ~ scale(disp) scale(disp)   -0.535     0.267 -  2.00    0.0729
#11  0     4.00 scale(mpg) ~ scale(disp) scale(disp)    0.932     0.362    2.57    0.236
#12  1.00  8.00 scale(mpg) ~ scale(disp) scale(disp)    1.00    NaN      NaN     NaN

关于r - 使用 rlang 列出 purrr::pmap 中的变量,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/49460721/

10-12 17:34