Building an efficient loop for a user-defined function: data.table

Problem description
I'm trying to build an efficient for loop for this function proposed by minem here: (Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table)

My data are:

```r
library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)

adherence <- cbind.data.frame(
  c("1", "2", "3", "1", "2", "3"),
  c("2013-01-01", "2013-01-01", "2013-01-01", "2013-02-01", "2013-02-01", "2013-02-01"))
names(adherence)[1] <- "ID"
names(adherence)[2] <- "year"
adherence$year <- ymd(adherence$year)

lsr <- cbind.data.frame(
  c("1", "1", "1", "2", "2", "2", "3", "3"),  # ID
  c("2012-03-01", "2012-08-02", "2013-01-06", "2012-08-25",
    "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05"),  # eksd
  c("60", "90", "90", "60", "120", "60", "30", "90")  # DDD
)
names(lsr)[1] <- "ID"
names(lsr)[2] <- "eksd"
names(lsr)[3] <- "DDD"
lsr$eksd <- as.Date((lsr$eksd))
lsr$DDD <- as.numeric(as.character(lsr$DDD))
lsr$ENDDATE <- lsr$eksd + lsr$DDD
lsr <- as.data.table(lsr)
adherence <- as.data.table(adherence)
```

The function proposed by minem is:

```r
by_minem2 <- function(dt = lsr2) {
  d <- as.numeric(as.Date("2013-02-01"))
  dt[, ENDDATE2 := as.numeric(ENDDATE)]
  x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
  uid <- unique(dt$ID)
  id2 <- setdiff(uid, x$ID)
  id2 <- uid[!(uid %in% x$ID)]
  x2 <- data.table(ID = id2, V1 = 0)
  x <- rbind(x, x2)
  setkey(x, ID)
  x
}
```

This returns:

```r
> by_minem2(lsr)
   ID V1
1:  1 64
2:  2  0
3:  3 63
```

For the loop I need to include information about which time I evaluated at, so the ideal repeated output looks like this:

```r
cbind(as.Date("2013-02-01"), by_minem2(lsr))
```

I then want to repeat this for different dates a few hundred times, putting everything into the same data.table:

```r
time.months <- as.Date("2013-02-01") + (365.25/12) * (0:192)  # dates to evaluate at
```

I'm trying to do this with a for loop like this:
```r
for (d in min(time.months):max(time.months)) {
  by_minem <- function(dt = lsr2) {
    d <- as.numeric(d)
    dt[, ENDDATE2 := as.numeric(ENDDATE)]
    x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
    uid <- unique(dt$ID)
    id2 <- setdiff(uid, x$ID)
    id2 <- uid[!(uid %in% x$ID)]
    x2 <- data.table(ID = id2, V1 = 0)
    x <- rbind(x, x2)
    setkey(x, ID)
    xtot <- append(xtot, x)
    xtot <- cbind(d, xtot)  # I need to know the time of evaluation
    xtot
  }
}
```

Solution

As indicated in the answer to the related question Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table, this can be solved by updating in a non-equi join, which is possible with data.table. The difference to the linked question is that here we need to create the cross join CJ() of all unique IDs with the vector of dates ourselves before joining with lsr.

The OP has provided a series of dates time.months whose definition

```r
time.months <- as.Date("2013-02-01") + (365.25/12) * (0:192)  # dates to evaluate at
```

leads to "crooked" dates, which is only visible if they are coerced to numeric or POSIXct:

```r
head(lubridate::as_datetime(time.months))
[1] "2013-02-01 00:00:00 UTC" "2013-03-03 10:30:00 UTC" "2013-04-02 21:00:00 UTC"
[4] "2013-05-03 07:30:00 UTC" "2013-06-02 18:00:00 UTC" "2013-07-03 04:30:00 UTC"
```

The issue is that these "dates" are not aligned with midnight but start somewhere during the day. To avoid these ambiguities, the seq() function can be used:

```r
dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month")
```

which creates a series of dates starting on the first day of each month.

In addition, data.table's IDate class is used, which stores dates as integers (4 bytes) instead of doubles (8 bytes). This saves memory as well as processing time because the usually faster integer arithmetic can be used.

```r
# coerce Date to IDate
idates <- as.IDate(dates)
setDT(lsr)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)]

# cross join unique IDs with dates
CJ(ID = lsr$ID, date = idates, unique = TRUE)[
  # initialize result column
  , AH := 0L][
  # non-equi join and ...
  lsr, on = .(ID, date >= eksd, date < ENDDATE),
  # ... update only matching rows
  AH := as.integer(ENDDATE - x.date)][
  # reshape from long to wide format
  , dcast(.SD, ID ~ date)]
```

```r
   ID 2013-02-01 2013-03-01 2013-04-01 2013-05-01 2013-06-01 2013-07-01 2013-08-01 [...]
1:  1         64         36          5          0          0          0          0
2:  2          0          0        110         80         49         19          0
3:  3         63         35          4          0          0          0          0
```

Caveat

Note that the above code assumes that the intervals [eksd, ENDDATE) for each ID do not overlap. This can be verified by

```r
lsr[order(eksd), all(eksd - shift(ENDDATE, fill = 0) > 0), keyby = ID]
   ID   V1
1:  1 TRUE
2:  2 TRUE
3:  3 TRUE
```

In case there are overlaps, the above code can be modified to aggregate within the non-equi join using by = .EACHI, as in the sketch below.
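A minimal, untested sketch of that variant follows; it assumes that the contributions of overlapping intervals should simply be summed per ID and date, and reuses lsr and idates from above (res is a hypothetical name). Here lsr is joined onto the cross join so that by = .EACHI groups by each (ID, date) pair and sums over all matching intervals:

```r
# Sketch (assumption: overlapping contributions are summed per ID/date).
# Unmatched dates produce a single NA match, which na.rm = TRUE turns
# into a sum of 0, preserving the zero-filling of the original code.
res <- lsr[CJ(ID = lsr$ID, date = idates, unique = TRUE),
           on = .(ID, eksd <= date, ENDDATE > date),
           .(AH = sum(as.integer(x.ENDDATE - i.date), na.rm = TRUE)),
           by = .EACHI]
setnames(res, "eksd", "date")  # the join column now carries the date values
dcast(res, ID ~ date, value.var = "AH")
```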
Benchmark

In another related question, data.table by = xx How do i keep the groups of length 0 when i returns no match, the OP has pointed out that performance is crucial due to the size of his production data. According to the OP's comments, lsr has 20 mio rows and 12 columns, and the adherence dataset, which I'm trying not to use, has 1.5 mio rows of 2 columns. In another question, the OP mentions that lsr is a few hundred mio rows.

@minem has responded to this by providing a benchmark in his answer. We can use this benchmark data to compare the different answers.

```r
# create benchmark data
lsr <- data.frame(
  ID = c("1", "1", "1", "2", "2", "2", "3", "3"),
  eksd = as.Date(c("2012-03-01", "2012-08-02", "2013-01-06", "2012-08-25",
                   "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05")),
  DDD = as.integer(c("60", "90", "90", "60", "120", "60", "30", "90")),
  stringsAsFactors = FALSE)
lsr$ENDDATE <- lsr$eksd + lsr$DDD
n <- 5e4
lsr2 <- lapply(1:n, function(x) lsr)
lsr2 <- rbindlist(lsr2, use.names = T, fill = T, idcol = T)
lsr2[, ID := as.integer(paste0(.id, ID))]
```

Thus, the benchmark dataset consists of 400 k rows and 150 k unique IDs:

```r
lsr2[, .(.N, uniqueN(ID))]
        N     V2
1: 400000 150000
```

```r
# pull data preparation out of the benchmark
lsr2i <- copy(lsr2)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)]
lsr2[, ENDDATE2 := as.numeric(ENDDATE)]

# define date series
dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month")
idates <- seq(as.IDate("2013-02-01"), length.out = 193, by = "month")

# run benchmark
library(microbenchmark)
bm <- microbenchmark(
  minem = {
    dt <- copy(lsr2)
    xtot <- lapply(dates, function(d) {
      d <- as.numeric(d)
      x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
      uid <- unique(dt$ID)
      id2 <- setdiff(uid, x$ID)
      id2 <- uid[!(uid %in% x$ID)]
      if (length(id2) > 0) {
        x2 <- data.table(ID = id2, V1 = 0)
        x <- rbind(x, x2)
      }
      setkey(x, ID)
      x
    })
    for (x in seq_along(xtot)) {
      setnames(xtot[[x]], c("ID", paste0("V", x)))
    }
    xtot <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot)
    xtot
  },
  uwe = {
    dt <- copy(lsr2i)
    CJ(ID = dt$ID, date = idates, unique = TRUE)[, AH := 0L][
      dt, on = .(ID, date >= eksd, date < ENDDATE),
      AH := as.integer(ENDDATE - x.date)][, dcast(.SD, ID ~ date)]
  },
  times = 1L
)
print(bm)
```

The result for one run shows that the non-equi join is more than 4 times faster than the lapply() approach:

```r
Unit: seconds
  expr       min        lq      mean    median        uq       max neval
 minem 27.654703 27.654703 27.654703 27.654703 27.654703 27.654703     1
   uwe  5.958907  5.958907  5.958907  5.958907  5.958907  5.958907     1
```
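Before relying on the timings, it may be worth checking that both approaches return the same numbers. A sketch of such a check (assumed object names res_minem and res_uwe; not part of the original answer): the two results use different column names (V1...V193 versus the dates), so only the values are compared.

```r
# Capture both results once, using the same code as the benchmark branches.
dt <- copy(lsr2)
xtot <- lapply(dates, function(d) {
  d <- as.numeric(d)
  x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
  id2 <- setdiff(unique(dt$ID), x$ID)
  if (length(id2) > 0) x <- rbind(x, data.table(ID = id2, V1 = 0))
  setkey(x, ID)
  x
})
for (i in seq_along(xtot)) setnames(xtot[[i]], c("ID", paste0("V", i)))
res_minem <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot)

dt <- copy(lsr2i)
res_uwe <- CJ(ID = dt$ID, date = idates, unique = TRUE)[, AH := 0L][
  dt, on = .(ID, date >= eksd, date < ENDDATE),
  AH := as.integer(ENDDATE - x.date)][, dcast(.SD, ID ~ date)]

# Both results are sorted by ID; drop the ID column and the differing
# column names, then compare the remaining values.
all.equal(unname(as.matrix(res_minem[, -1L])),
          unname(as.matrix(res_uwe[, -1L])))
```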