代码之家  ›  专栏  ›  技术社区  ›  NelsonGon phoxis

在同一工作表中提取多个数据文件

csv r
  •  0
  • NelsonGon phoxis  · 技术社区  · 7 年前

    : 手工制作,剪切粘贴成多张纸。如果能找到一个同样的解决方法就太好了。

    : 给定以下虚拟数据集:

    structure(list(V1 = structure(c(8L, 6L, 2L, 4L, 1L, 1L, 1L, 1L, 
    9L, 5L, 2L, 1L, 1L, 1L, 1L, 10L, 7L, 3L), .Label = c("", "1", 
    "12", "5", "Age", "Class A", "Height", "Number of Boys", "More Boys", 
    "More Girls"), class = "factor"), V2 = structure(c(1L, 5L, 3L, 
    4L, 1L, 1L, 1L, 1L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 7L, 2L), .Label = c("", 
    "12", "2", "6", "Class B", "Time", "Weight"), class = "factor"), 
        V3 = structure(c(1L, 5L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 6L, 
        3L, 1L, 1L, 1L, 1L, 1L, 7L, 2L), .Label = c("", "13", "3", 
        "7", "Class C", "Next", "Time"), class = "factor"), V4 = structure(c(1L, 
        5L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 
        6L, 2L), .Label = c("", "14", "4", "8", "Class D", "Day"), class = "factor"), 
        V5 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
        NA, NA, NA, NA, NA), V6 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V7 = c(NA, NA, NA, 
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
        ), V8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
        NA, NA, NA, NA, NA, NA), V9 = c(NA, NA, NA, NA, NA, NA, NA, 
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V10 = structure(c(5L, 
        4L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L), .Label = c("", "1", "8", "Class E", "Number of Girls"
        ), class = "factor"), V11 = structure(c(1L, 4L, 3L, 2L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
        "2", "8", "Class F"), class = "factor"), V12 = structure(c(1L, 
        4L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L), .Label = c("", "3", "9", "Class G"), class = "factor"), 
        V13 = structure(c(1L, 4L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "0", "4", 
        "Class Q"), class = "factor"), V14 = c(NA, NA, NA, NA, NA, 
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, 
    -18L))
    

                   V1      V2      V3      V4 V5 V6 V7 V8 V9             V10     V11     V12     V13 V14
    1  Number of Boys                         NA NA NA NA NA Number of Girls                          NA
    2            Class A Class B Class C Class D NA NA NA NA NA         Class E Class F Class G Class Q  NA
    3                  1       2       3       4 NA NA NA NA NA               8       8       9       0  NA
    4                  5       6       7       8 NA NA NA NA NA               1       2       3       4  NA
    5                                            NA NA NA NA NA                                          NA
    6                                            NA NA NA NA NA                                          NA
    7                                            NA NA NA NA NA                                          NA
    8                                            NA NA NA NA NA                                          NA
    9          More Boys                         NA NA NA NA NA                                          NA
    10               Age    Time    Next     Day NA NA NA NA NA                                          NA
    11                 1       2       3       4 NA NA NA NA NA                                          NA
    12                                           NA NA NA NA NA                                          NA
    

    希望可以看到,这些是同一工作表上的独立“文件”。我一直在寻找一种快速的方法来挖掘不同的数据集,但还没有。附近有工作吗?

    预期输出(及其各自的行)

    Three data sets: 
    A: Number of Boys
    B: Number of Girls
    C: More Boys
    

    提前谢谢。

    0 回复  |  直到 7 年前
        1
  •  1
  •   Aurèle    7 年前

    使用@alexis\u laz的解决方案 https://stackoverflow.com/a/42120347/6197649

    library(Matrix)
    
    x <- !is.na(df) & df != ""
    m <- Matrix(x)
    
    #> 18 x 14 sparse Matrix of class "lgCMatrix"
    #>    [[ suppressing 14 column names 'V1', 'V2', 'V3' ... ]]
    #>                                  
    #>  [1,] | . . . . . . . . | . . . .
    #>  [2,] | | | | . . . . . | | | | .
    #>  [3,] | | | | . . . . . | | | | .
    #>  [4,] | | | | . . . . . | | | | .
    #>  [5,] . . . . . . . . . . . . . .
    #>  [6,] . . . . . . . . . . . . . .
    #>  [7,] . . . . . . . . . . . . . .
    #>  [8,] . . . . . . . . . . . . . .
    #>  [9,] | . . . . . . . . . . . . .
    #> [10,] | | | | . . . . . . . . . .
    #> [11,] | | | | . . . . . . . . . .
    #> [12,] . . . . . . . . . . . . . .
    #> [13,] . . . . . . . . . . . . . .
    #> [14,] . . . . . . . . . . . . . .
    #> [15,] . . . . . . . . . . . . . .
    #> [16,] | . . . . . . . . . . . . .
    #> [17,] | | | | . . . . . . . . . .
    #> [18,] | | | | . . . . . . . . . .
    
    sm = as.matrix(summary(m))
    
    d = dist(sm, "manhattan")
    
    gr = cutree(hclust(d, "single"), h = 1)
    
    res <- sparseMatrix(i = sm[, "i"], j = sm[, "j"], x = gr)
    
    #> 18 x 13 sparse Matrix of class "dgCMatrix"
    #>                                
    #>  [1,] 1 . . . . . . . . 4 . . .
    #>  [2,] 1 1 1 1 . . . . . 4 4 4 4
    #>  [3,] 1 1 1 1 . . . . . 4 4 4 4
    #>  [4,] 1 1 1 1 . . . . . 4 4 4 4
    #>  [5,] . . . . . . . . . . . . .
    #>  [6,] . . . . . . . . . . . . .
    #>  [7,] . . . . . . . . . . . . .
    #>  [8,] . . . . . . . . . . . . .
    #>  [9,] 2 . . . . . . . . . . . .
    #> [10,] 2 2 2 2 . . . . . . . . .
    #> [11,] 2 2 2 2 . . . . . . . . .
    #> [12,] . . . . . . . . . . . . .
    #> [13,] . . . . . . . . . . . . .
    #> [14,] . . . . . . . . . . . . .
    #> [15,] . . . . . . . . . . . . .
    #> [16,] 3 . . . . . . . . . . . .
    #> [17,] 3 3 3 3 . . . . . . . . .
    #> [18,] 3 3 3 3 . . . . . . . . .
    
    res2 <- summary(res)
    
    lapply(
      split(res2[, c("i", "j")], res2$x),
      function(area) {
        df[min(area$i):max(area$i), min(area$j):max(area$j), drop = FALSE]
      }
    )
    
    #> $`1`
    #>                  V1      V2      V3      V4
    #> 1 Number of Boys                        
    #> 2           Class A Class B Class C Class D
    #> 3                 1       2       3       4
    #> 4                 5       6       7       8
    #> 
    #> $`2`
    #>           V1   V2   V3  V4
    #> 9  More Boys              
    #> 10       Age Time Next Day
    #> 11         1    2    3   4
    #> 
    #> $`3`
    #>            V1     V2   V3  V4
    #> 16 More Girls                
    #> 17     Height Weight Time Day
    #> 18         12     12   13  14
    #> 
    #> $`4`
    #>               V10     V11     V12     V13
    #> 1 Number of Girls                        
    #> 2         Class E Class F Class G Class Q
    #> 3               8       8       9       0
    #> 4               1       2       3       4
    

    创建日期:2019-04-10 reprex package

        2
  •  1
  •   Simon    7 年前

    下面的代码创建一个数据帧列名设置正确。然而,它依赖于这样一个事实:在您的工作表中,“表的列”至少由一个空列分隔。。。

    df <- apply(df, 2, function(x) gsub("^$|^ $", NA, x))
    empty_cols <- sapply(1:ncol(df), function(i){length(which(is.na(df[, i])))==nrow(df)})
    start_cols <- c(1, which(diff(empty_cols)==-1)+1)
    if (is.na(df[1, 1])) start_cols <- start_cols[-1]
    start_rows <- lapply(start_cols, function(i){
      start_rows <- c(1, which(diff(is.na(df[, i]))==-1)+1)
      if (is.na(df[1, i])) start_rows <- start_rows[-1]
      start_rows})
    
    end_rows <- lapply(start_cols, function(i){
      end_rows <- c(1, which(diff(is.na(df[, i]))==1))
      if (!is.na(df[nrow(df), i])) end_rows <- c(end_rows, nrow(df))
      end_rows[-1]})
    
    data.sets <- list()
    for (i in 1:length(start_cols)) {
        for (j in 1:length(start_rows[[i]])){
    
          col <- start_cols[i]
          row <- start_rows[[i]][j]
          start_row <- row+1
          end_row <- end_rows[[i]][j]
          name <- df[row, col]
          ncol <- which(diff(is.na(df[row+1, col:ncol(df)]))==1)[1]
          end_col <- col+ncol-1
          column_names <- df[start_row, col:end_col]
          data <- df[(start_row+1):end_row, col:end_col]
          data <- matrix(data, ncol = length(col:end_col))
          data <- as.data.frame(data)
          names(data) <- column_names
          data.sets[[name]] <- data
        }
    }
    
    
    > data.sets
    $`Number of Boys`
      Class A Class B Class C Class D
    1       1       2       3       4
    2       5       6       7       8
    
    $`More Boys`
      Age Time Next Day
    1   1    2    3   4
    
    $`More Girls`
      Height Weight Time Day
    1     12     12   13  14
    
    $`Number of Girls`
      Class E Class F Class G Class Q
    1       8       8       9       0
    2       1       2       3       4