我的数据框架如下:
## Please copy following text in your clipboard (do not copy this line)
hid ,mid ,aprps,astart ,aend ,ax ,ay ,exph
10001,1000101,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000101,3 ,2012-01-01 00:00:00,2012-01-01 08:00:00,475465.6,1272272,41.55607
10001,1000101,4 ,2012-01-01 08:00:00,2012-01-01 08:15:00,475465.6,1272272,41.55607
10001,1000101,3 ,2012-01-01 08:15:00,2012-01-01 09:15:00,475465.6,1272272,41.55607
10001,1000101,4 ,2012-01-01 09:15:00,2012-01-01 09:30:00,475465.6,1272272,41.55607
10001,1000101,3 ,2012-01-01 09:30:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 00:00:00,2012-01-01 07:30:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 07:30:00,2012-01-01 07:50:00,475465.6,1272272,41.55607
10001,1000102,1 ,2012-01-01 07:50:00,2012-01-01 11:00:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 11:00:00,2012-01-01 11:20:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 11:20:00,2012-01-01 14:00:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 14:00:00,2012-01-01 14:20:00,475465.6,1272272,41.55607
10001,1000102,1 ,2012-01-01 14:20:00,2012-01-01 17:00:00,475465.6,1272272,41.55607
10001,1000102,4 ,2012-01-01 17:00:00,2012-01-01 17:20:00,475465.6,1272272,41.55607
10001,1000102,3 ,2012-01-01 17:20:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 00:00:00,2012-01-01 08:00:00,475465.6,1272272,41.55607
10001,1000103,4 ,2012-01-01 08:00:00,2012-01-01 12:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 12:00:00,2012-01-01 13:00:00,475465.6,1272272,41.55607
10001,1000103,4 ,2012-01-01 13:00:00,2012-01-01 19:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 19:00:00,2012-01-01 20:00:00,475465.6,1272272,41.55607
10001,1000103,4 ,2012-01-01 20:00:00,2012-01-01 23:00:00,475465.6,1272272,41.55607
10001,1000103,3 ,2012-01-01 23:00:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 00:00:00,2012-01-01 00:00:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 00:00:00,2012-01-01 07:00:00,475465.6,1272272,41.55607
10001,1000104,4 ,2012-01-01 07:00:00,2012-01-01 07:30:00,473548.0,1279171,41.55607
10001,1000104,2 ,2012-01-01 07:30:00,2012-01-01 10:00:00,473548.0,1279171,41.55607
10001,1000104,4 ,2012-01-01 10:00:00,2012-01-01 10:30:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 10:30:00,2012-01-01 17:30:00,475465.6,1272272,41.55607
10001,1000104,4 ,2012-01-01 17:30:00,2012-01-01 17:45:00,484869.7,1270558,41.55607
10001,1000104,2 ,2012-01-01 17:45:00,2012-01-01 21:30:00,484869.7,1270558,41.55607
10001,1000104,4 ,2012-01-01 21:30:00,2012-01-01 21:45:00,475465.6,1272272,41.55607
10001,1000104,3 ,2012-01-01 21:45:00,2012-01-02 00:00:00,475465.6,1272272,41.55607
## Do not copy this line
您可以复制上面的文本并导入为
df
使用
{psych}
包裹:
install.packages("psych")
library(psych)
# Please copy above text and run following
df <- read.clipboard(header=TRUE, sep=",")
我需要从
东风
是:
-
提取和
exph
在两对行中提取
aprps==4
和上一行
-
如果有多行
APRPS=4
,按组重复
mid
-
存储总和
扩张
和通信
hid
在列表或数据框中
为了解决这个问题,我目前正在使用基于两个循环的以下脚本:
library(tidyverse)
calc <- function(i) {
## Extract records by "mid" excluding the first records
temp <<- df %>% filter(mid==i) %>% filter(row_number()>1)
## Extract row number of "aprps==4"
r.aprps <- which(temp$aprps==4)
## Repeat operation by two pairs of rows based on "r.aprps"
for (j in 1:length(r.aprps)) {
## Extract movement
temp2 <<- temp[c((r.aprps[j]-1):r.aprps[j]),]
## Other operations in actual data set (jsut put example)
exp <- data.frame(mid=unique(temp2$mid),expsum=sum(temp2$exph))
## Store PPA in list
if (lp==1 & j==1) {
df.exp <<- exp
} else {
df.exp <<- rbind(df.exp,exp)
}
}
}
## Set loop conditions
list.mid <- unique(df$mid)
nloop <- length(list.mid)
## Initialize df.exp
df.exp <- data.frame(matrix(vector(),0,2,
dimnames=list(c(),c("mid","expsum"))),
stringsAsFactors=F)
## Loop to store PPA in list
for (lp in 1:nloop) {
calc(list.mid[lp])
}
但是,作为实际数据帧
东风
包含约40000条记录,实际操作包含更复杂的计算,需要30多小时。我试着找到缩短手术时间的方法,现在试着申请
map
函数从
purrr
在嵌套的数据帧中存储每个操作,而不是每次在循环操作中替换变量。
下面的脚本是我正在尝试构建的脚本,但是它无法达到所需的输出。
## Store df by mid into list
nest <- df %>% group_by(mid) %>% nest()
## Extract row number with "aprps==4"
nest2 <- nest %>% mutate(row.aprps4=map(data,~which(.$aprps==4)))
## Obtain row numbers to extract by movement
nest3 <- nest2 %>% mutate(row.aprps4_1=map(data,~data.frame(rm1=which(.$aprps==4)-1)),
row.aprps4_2=map(data,~data.frame(rm1=which(.$aprps==4))))
## How to extract two pairs of records based on row.aprps4_1 and row.aprps4_1 and store sum of exph?
Some trials:
# It works but cannot extract records using two variables (row.aprps4_1 and .._2)
nest3 %>% mutate(move=map2(data,row.aprps4_1,~filter(.x,seq_len(nrow(.x))%in%.y)))
# Using pmap to specify range of filtering by two variables but does not work
nest4 %>% pmap(data,row.move1,row.move2,~filter(..1,seq_len(nrow(..1))%in%..2))
# Using double map function instead of double loop but does not work
pmap(nest4$data,nest4$row.move1,nest4$row.move2,~filter(..1,seq_len(nrow(..1))%in%c(..2:..3)))
你有什么建议要把操作紧固吗?
我更喜欢使用
地图
了解它的功能,但也欢迎其他替代方案。
我也发现了
this post
与此类似,但无法解决如何基于动态变量提取两行的问题
r.aprpr4_1
和
_2
.
=====更新:问题已解决====
我可以通过以下脚本来解决问题:
## Convert df into nested data frame by `mid`
nest <- df %>% group_by(mid) %>% nest()
## Obtain row numbers to extract aprps==4
nest2 <- nest %>% mutate(r=map(data,~which(.$aprps==4)))
## Split r and expand record
nest3 <- nest2 %>% unnest(r,.drop=FALSE)
## Extract pairs of movement
nest4 <- nest3 %>% mutate(pair=map2(data,r,~filter(.x,seq_len(nrow(.x))%in%c((.y-1):.y)))) %>% dplyr::select(mid,pair)
要点是:
-
需要
unnest()
通过从中提取向量展开每个记录
APRPS=4
(不能申请)
.x%in%.y
哪里
.y
长度超过两个)
-
mutate
必须申请
map2
(如
nest3 %>% map2(a,b,~f(.x,.y...))
不接受)
非常感谢您通过以下帖子获得此解决方案:
Split delimited strings in a column and insert as new rows
map2() function in pipe