代码之家  ›  专栏  ›  技术社区  ›  jakes

查找长度等于或大于n的最新Trues序列

  •  5
  • jakes  · 技术社区  · 7 年前

    我有如下数据:

    library(dplyr)
    ex <- data.frame(bool = c(rep(FALSE, 2), rep(TRUE, 3), rep(FALSE, 2), rep(TRUE, 5), 
                          FALSE, FALSE, rep(TRUE, 6), FALSE, FALSE, FALSE)) %>%
      mutate(seq = data.table::rleid(bool)) %>% 
      group_by(seq) %>% 
      mutate(n = n()) %>% 
      ungroup() %>% 
      mutate(expected_output = c(4, 4, NA, NA, NA, 4, 4, rep(NA,5), 4, 4, rep(NA, 6), rep(6, 3)))
    

    为每一个 FALSE 我需要找到一个最新的序列 TRUE 长度至少为 4 . 但是如果之前没有这样的序列(比如行 1:2 6:7 ,我们应该向前检查,即找到观察后出现的长度为4或更多的第一个序列。

    的最后一列 ex 包含预期的输出。我怎样才能做到(最好是 tidyverse ?

    编辑

    使用的解决方案 泰迪弗斯 我还是很感激。

    3 回复  |  直到 7 年前
        1
  •  1
  •   Aaron Hayman    7 年前

    下面的代码应该使用BASE R。

    function(col,min_seq =4)
    {
        end = c(which(c(col[-1],NA)!=col),length(col))   
        num = diff(c(0,end))     
        start = end-num+1 
        seq_n = seq_along(start) 
        v=col[end]
    
        accept = num >= min_seq & v
        st = start[accept]
        sn = seq_n[accept]
        en = end[accept]
        en_ = en
        en_[1]=1
        place = rep(sn, diff(c(en_,length(col) + 1 )))      # If row with start of sequence is wanted instead of sequence number sn can be replaced with st
        place[col]=NA
    
        return(place)
    }
    
        2
  •  1
  •   Andre Elrico    7 年前

    你可以这样做:

    定义函数: (强大且错误处理)

    fun1<-
    function(vec, min_rep = 4) {
    
        stopifnot(length(vec)>0, all(vec %in% 0:1))
    
        runL <- do.call(rbind,rle(vec))
        lngth<- ncol(runL)
        runL <- rbind(runL, seq = 1:lngth, seq2 = NA^runL[2,])
    
        runL[3,] <- ifelse(!runL[2,]|runL[1,]<min_rep, NA, runL[3,]) 
    
        cases <- na.omit(runL[3,])
    
        if(length(cases)>0) {
    
            for(i in rev(cases)) {
                runL[4,1:i][!is.na(runL[4,1:i])] <- i
            }
    
            for(i in cases) {
                runL[4,i:lngth][!is.na(runL[4,i:lngth])] <- i
            }
        } else { runL[4,] <- NA }
    
        return(rep(runL[4,],runL[1,]))
    }
    

    调用函数:

    vec = c(rep(FALSE, 2), rep(TRUE, 3), rep(FALSE, 2), rep(TRUE, 5), 
            FALSE, FALSE, rep(TRUE, 6), FALSE, FALSE, FALSE)
    cbind(vec,fun1(vec))
    
    vec = rep(T,5)
    cbind(vec,fun1(vec))
    
    vec = rep(F,5)
    cbind(vec,fun1(vec))
    
    vec = c(rep(F,5),T)
    cbind(vec,fun1(vec))
    
    vec = c()
    cbind(vec,fun1(vec))
    
    vec = 1:3
    cbind(vec,fun1(vec))
    
        3
  •  0
  •   chinsoon12    7 年前

    如果OP严格不需要 data.table 解决方案,我可以取下这个职位。

    这是一个可能的 数据.表 方法:

    #aggregate the dataset by bool and rleid
    agg <- DT[, .(rn=.GRP, N=.N), by=.(bool, seq=rleid(bool))]
    
    #extract all the TRUE sequences with length >= 4
    true4s <- agg[(bool) & N >= 4L]
    
        #for rows that are FALSE
    agg[(!bool), expOut := {
            prev <- NA
    
            #find the previous sequence of TRUEs by using data.table non-equi join 
            #(a rolling join will work too here)
            #in addition, do the match in reverse so that we can fill NA with prev value
            ans <- true4s[.SD[order(-rn)], {
                if (.N > 0L) {
                    prev <- seq[.N] 
                } 
                prev
                  #for each row in i (see ?data.table for i argument and also ?.EACHI)
                              #non equi join where earlier row in x to be join with later row in i 
            }, by=.EACHI, on=.(rn<rn)]$V1
    
            #for the rolling version
            #}, by=.EACHI, on=.(rn), roll=Inf]$V1
    
            rev(ans)
        }]
    
    #add expected output to original dataset
    DT[, expected_output := inverse.rle(list(values=agg$expOut, lengths=agg$N))]
    

    输出:

         bool expected_output
     1: FALSE               4
     2: FALSE               4
     3:  TRUE              NA
     4:  TRUE              NA
     5:  TRUE              NA
     6: FALSE               4
     7: FALSE               4
     8:  TRUE              NA
     9:  TRUE              NA
    10:  TRUE              NA
    11:  TRUE              NA
    12:  TRUE              NA
    13: FALSE               4
    14: FALSE               4
    15:  TRUE              NA
    16:  TRUE              NA
    17:  TRUE              NA
    18:  TRUE              NA
    19:  TRUE              NA
    20:  TRUE              NA
    21: FALSE               6
    22: FALSE               6
    23: FALSE               6
         bool expected_output
    

    数据:

    library(data.table)
    DT <- data.table(bool = c(rep(FALSE, 2), rep(TRUE, 3), rep(FALSE, 2), rep(TRUE, 5), 
        FALSE, FALSE, rep(TRUE, 6), FALSE, FALSE, FALSE)) 
    
    推荐文章