有两种方法,
滚动连接
和
在非等距联接中更新
. 对于给定的示例数据集,这两种方法都比
minem's solution
.
非等距连接
它需要创造
start
-
end
最好以长格式完成的间隔
# create intervals in long format
long <- setDT(melt(inter1, "IBD", variable.name = "var"))
long <- rbind(long,
long[, CJ(IBD = IBD, var = var,
value = c(-.Machine$integer.max, .Machine$integer.max),
unique = TRUE)])[
order(IBD, var, value)]
long <- long[, .(start = head(value, -1L),
end = tail(value, -1L),
INT = 1:(.N - 1L)),
by = .(IBD, var)]
long
IBD var start end INT
1: 1 var1a -2147483647 2 1
2: 1 var1a 2 4 2
3: 1 var1a 4 8 3
4: 1 var1a 8 9 4
5: 1 var1a 9 2147483647 5
---
246: 10 var5a -2147483647 29 1
247: 10 var5a 29 44 2
248: 10 var5a 44 45 3
249: 10 var5a 45 80 4
250: 10 var5a 80 2147483647 5
注意,使用了最大整数而不是
Inf
以避免从整数到双精度的协同作用。
现在,我们在指定的列上循环并对每一列执行非等距联接。每次迭代都会添加一个新的结果列:
v0int <- c("var1a","var2a","var3a","var4a","var5a")
setDT(smpl)
for (col in v0int) {
smpl[long[var == col],
on = c("IBD", paste0(col, ">start"), paste0(col, "<=end")),
paste0(col, "_INT") := i.INT]
}
smpl[]
ID IBD CELL var1a var2a var3a var4a var5a var6a var7a var8a var9a loc bill var1a_INT var2a_INT var3a_INT var4a_INT var5a_INT
1: 1 7 6849 93 38 151 203 63 70 35 8 7 17 2 5 1 5 5 4
2: 2 9 2517 109 130 97 296 15 97 79 267 422 4 2 5 5 1 5 1
3: 3 10 9322 65 18 160 156 80 132 33 41 387 8 1 5 1 5 4 4
4: 4 10 7377 105 8 87 263 101 110 207 224 331 11 2 5 1 1 5 5
5: 5 4 6991 72 144 187 144 117 125 123 84 60 3 1 5 5 5 5 5
---
4996: 4996 6 5129 56 188 21 74 105 133 192 45 284 5 1 5 5 1 3 5
4997: 4997 2 2657 8 50 127 6 119 81 60 250 209 3 2 2 5 5 1 5
4998: 4998 2 1473 128 90 156 74 203 5 198 63 10 17 1 5 5 5 5 5
4999: 4999 9 2120 66 141 170 256 151 68 205 97 8 9 2 5 5 5 5 5
5000: 5000 2 4555 109 102 92 98 11 107 104 210 266 14 2 5 5 5 5 1
注意连接条件(
on =
)作为字符串动态创建。
滚动连接
Frank has pointed out
那个
滚动连接
也适用于此处,因为间隔中没有间隙。
操作程序已指定
右关闭
间隔,例如。,
if smpl$var1a > 11 & smpl$var1a <= 18 then var1a_INT = 2
因此,我们需要
向后的
使用
结束
间隔值。
在常规联接中,联接参数必须完全匹配。在反向滚动联接中,如果没有完全匹配,则该值落在两个联接之间的间隙中
结束
值,然后
下一次观测是向后进行的
(国家奥委会)。
long <- setDT(melt(inter1, "IBD", variable.name = "var", value.name = "end"))
long <- rbind(long,
long[, CJ(IBD = IBD, var = var, end = .Machine$integer.max,
unique = TRUE)])
setorder(long, IBD, var, end)
long[, INT := rowid(IBD, var)]
v0int <- c("var1a","var2a","var3a","var4a","var5a")
setDT(smpl)
for (col in v0int) {
smpl[, paste0(col, "_INT") := long[var == col][
smpl, on = c("IBD", paste0("end==", col)),
roll = -Inf, x.INT]]
}
基准
将非等速连接和滚动连接与
minem's answer
,更新的
smpl
通过引用避免重复调用
rbind()
.
结果是相等的,只是行的顺序不同。
随着所有解决方案的更新
脱脂奶粉
通过引用,所有基准测试运行都以新的
copy()
原始数据集的。
library(bench)
my_check <- function(x, y) {
all.equal(x[order(ID)], y[order(ID)])
}
v0int <- c("var1a","var2a","var3a","var4a","var5a")
bm <- mark(
rj = {
smpl <- copy(smpl0)
long <- setDT(melt(inter1, "IBD", variable.name = "var", value.name = "end"))
long <- rbind(long,
long[, CJ(IBD = IBD, var = var, end = .Machine$integer.max,
unique = TRUE)])
setorder(long, IBD, var, end)
long[, INT := rowid(IBD, var)]
setDT(smpl)
for (col in v0int) {
smpl[, paste0(col, "_INT") := long[var == col][
smpl, on = c("IBD", paste0("end==", col)),
roll = -Inf, x.INT]]
}
smpl[]
},
nej = {
smpl <- copy(smpl0)
long <- setDT(melt(inter1, "IBD", variable.name = "var"))
long <- rbind(long,
long[, CJ(IBD = IBD, var = var,
value = c(-.Machine$integer.max, .Machine$integer.max),
unique = TRUE)])[
order(IBD, var, value)]
long <- long[, .(start = head(value, -1L),
end = tail(value, -1L),
INT = 1:(.N - 1L)),
by = .(IBD, var)]
setDT(smpl)
for (col in v0int) {
smpl[long[var == col],
on = c("IBD", paste0(col, ">start"), paste0(col, "<=end")),
paste0(col, "_INT") := i.INT]
}
smpl[]
},
minem1 = {
smpl <- copy(smpl0)
setDT(smpl) # convert smpl to data.table
setkey(smpl, IBD) # setkey on IBD for faster `IBD == l` operation
for (l in (unique(inter1$IBD))) {
for (k in 1:length(v0int)) {
q0 <- v0int[k]
q1 <- sort(inter1[inter1$IBD == l, q0])
smpl[IBD == l, paste0(q0, "_INT") := as.integer(cut(get(q0), c(0, q1, Inf)))]
}
}
smpl[]
},
minem2 = {
smpl <- copy(smpl0)
setDT(smpl) # convert smpl to data.table
setkey(smpl, IBD) # setkey on IBD for faster `IBD == l` operation
for (l in (unique(inter1$IBD))) {
for (k in 1:length(v0int)) {
q0 <- v0int[k]
q1 <- sort(inter1[inter1$IBD == l, q0])
smpl[IBD == l, paste0(q0, "_INT") := cut(get(q0), c(0, q1, Inf), labels = FALSE)]
}
}
smpl[]
},
check = my_check,
min_time = 1
)
bm
# A tibble: 4 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:t> <bch:t> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <lis> <lis>
1 rj 20ms 22.5ms 22ms 28ms 44.5 1.73MB 3 41 921ms <data~ <Rpro~ <bch~ <tib~
2 nej 25.3ms 28.3ms 28.6ms 30.9ms 35.4 2.31MB 2 20 566ms <data~ <Rpro~ <bch~ <tib~
3 minem1 106.2ms 113.8ms 110.3ms 129.9ms 8.79 6.4MB 2 7 797ms <data~ <Rpro~ <bch~ <tib~
4 minem2 98.8ms 101.8ms 101.6ms 106.3ms 9.83 5.66MB 3 7 712ms <data~ <Rpro~ <bch~ <tib~
滚动连接的速度大约是
minem's solutions
,非等速连接的速度要快四倍。此外,分配的内存要少两到四倍。
ggplot2::autoplot(bm)