代码之家  ›  专栏  ›  技术社区  ›  CodeNoob

R:根据匹配的行和列名称从相关矩阵中移除项

  •  0
  • CodeNoob  · 技术社区  · 6 年前

    让我们看一个示例矩阵并计算相关性:

    some.data <- data.frame(
      A1.1 = c(1,3,4,5,6),
      A1.2 = c(4,5,6,2,3),
      A1.3 = c(3,3,4,2,1),
      A2.1 = c(3,4,5,2,4),
      A2.2 = c(4,5,5,4,2),
      A2.3 = c(1,1,2,2,3),
      A3.1 = c(1,3,4,5,6),
      A3.2 = c(1,4,3,3,4),
      A3.3 = c(4,4,4,4,5)
    )
    cor.mat <- cor(some.data)
    

    它给出:

                A1.1       A1.2       A1.3       A2.1       A2.2       A2.3        A3.1       A3.2       A3.3
    A1.1  1.00000000 -0.4109975 -0.6155470 0.06839411 -0.5305954  0.9009862  1.00000000  0.7428336  0.6393620
    A1.2 -0.41099747  1.0000000  0.8320503 0.83205029  0.6454972 -0.3779645 -0.41099747  0.0000000 -0.3535534
    A1.3 -0.61554702  0.8320503  1.0000000 0.42307692  0.8951436 -0.6289709 -0.61554702 -0.3580574 -0.7844645
    A2.1  0.06839411  0.8320503  0.4230769 1.00000000  0.1790287  0.1572427  0.06839411  0.3580574  0.1961161
    A2.2 -0.53059545  0.6454972  0.8951436 0.17902872  1.0000000 -0.7319251 -0.53059545 -0.1666667 -0.9128709
    A2.3  0.90098616 -0.3779645 -0.6289709 0.15724273 -0.7319251  1.0000000  0.90098616  0.4879500  0.8017837
    A3.1  1.00000000 -0.4109975 -0.6155470 0.06839411 -0.5305954  0.9009862  1.00000000  0.7428336  0.6393620
    A3.2  0.74283363  0.0000000 -0.3580574 0.35805744 -0.1666667  0.4879500  0.74283363  1.0000000  0.4564355
    A3.3  0.63936201 -0.3535534 -0.7844645 0.19611614 -0.9128709  0.8017837  0.63936201  0.4564355  1.0000000
    

    在我的原始数据中,有些列是相关的,这里用前缀(a1、a2、a3)表示。由于我对这些不感兴趣,我想将具有相同前缀的相关性设置为零,如下所示:

               A1.1       A1.2       A1.3       A2.1       A2.2       A2.3        A3.1       A3.2       A3.3
    A1.1          0         0          0   0.06839411 -0.5305954  0.9009862  1.00000000  0.7428336  0.6393620
    A1.2          0         0          0   0.83205029  0.6454972 -0.3779645 -0.41099747  0.0000000 -0.3535534
    A1.3          0         0          0   0.42307692  0.8951436 -0.6289709 -0.61554702 -0.3580574 -0.7844645
    A2.1  0.06839411  0.8320503  0.4230769          0          0          0  0.06839411  0.3580574  0.1961161
    A2.2 -0.53059545  0.6454972  0.8951436          0          0          0 -0.53059545 -0.1666667 -0.9128709
    A2.3  0.90098616 -0.3779645 -0.6289709          0          0          0  0.90098616  0.4879500  0.8017837
    A3.1  1.00000000 -0.4109975 -0.6155470 0.06839411 -0.5305954  0.9009862           0          0          0
    A3.2  0.74283363  0.0000000 -0.3580574 0.35805744 -0.1666667  0.4879500           0          0          0 
    A3.3  0.63936201 -0.3535534 -0.7844645 0.19611614 -0.9128709  0.8017837           0          0          0
    

    我可以使用for循环来完成这项工作,但我想这样做要容易得多?

    2 回复  |  直到 6 年前
        1
  •  3
  •   markus    6 年前

    一种选择是将数据从宽改为长,使其包含三列

    cor.mat_long <- reshape2::melt(cor.mat)
    cor.mat_long
    #   Var1 Var2       value
    #1  A1.1 A1.1  1.00000000
    #2  A1.2 A1.1 -0.41099747
    #3  A1.3 A1.1 -0.61554702
    #4  A2.1 A1.1  0.06839411
    #5  A2.2 A1.1 -0.53059545
    #6  A2.3 A1.1  0.90098616
    #...
    

    基于前缀创建逻辑向量 Var1 Var2 指示这些前缀何时相同。使用此矢量替换 cor.mat_long$value 具有 0 它的评估结果 TRUE

    cor.mat_long$value[with(cor.mat_long, sub("\\.\\d+$", "", Var1) == sub("\\.\\d+$", "", Var2))] <- 0
    

    最后重新整形为宽格式。

    cor.mat2 <- reshape2::dcast(cor.mat_long, Var1 ~ Var2)
    cor.mat2
    #  Var1        A1.1       A1.2       A1.3       A2.1       A2.2       A2.3        A3.1       A3.2       A3.3
    #1 A1.1  0.00000000  0.0000000  0.0000000 0.06839411 -0.5305954  0.9009862  1.00000000  0.7428336  0.6393620
    #2 A1.2  0.00000000  0.0000000  0.0000000 0.83205029  0.6454972 -0.3779645 -0.41099747  0.0000000 -0.3535534
    #3 A1.3  0.00000000  0.0000000  0.0000000 0.42307692  0.8951436 -0.6289709 -0.61554702 -0.3580574 -0.7844645
    #4 A2.1  0.06839411  0.8320503  0.4230769 0.00000000  0.0000000  0.0000000  0.06839411  0.3580574  0.1961161
    #5 A2.2 -0.53059545  0.6454972  0.8951436 0.00000000  0.0000000  0.0000000 -0.53059545 -0.1666667 -0.9128709
    #6 A2.3  0.90098616 -0.3779645 -0.6289709 0.00000000  0.0000000  0.0000000  0.90098616  0.4879500  0.8017837
    #7 A3.1  1.00000000 -0.4109975 -0.6155470 0.06839411 -0.5305954  0.9009862  0.00000000  0.0000000  0.0000000
    #8 A3.2  0.74283363  0.0000000 -0.3580574 0.35805744 -0.1666667  0.4879500  0.00000000  0.0000000  0.0000000
    #9 A3.3  0.63936201 -0.3535534 -0.7844645 0.19611614 -0.9128709  0.8017837  0.00000000  0.0000000  0.0000000
    

    如果你不想 VAR1 作为显式列,do

    rownames(cor.mat2) <- cor.mat2$Var1
    cor.mat2 <- cor.mat2[-1] 
    

    不过,不知道这是否比循环简单得多。

        2
  •  2
  •   akrun    6 年前

    我们可以用1的块对角矩阵相乘。

    library(Matrix)
    as.matrix(cor.mat * !bdiag(replicate(3, matrix(1, 3, 3), simplify = FALSE)))
    #        A1.1       A1.2       A1.3       A2.1       A2.2       A2.3        A3.1       A3.2       A3.3
    #A1.1  0.00000000  0.0000000  0.0000000 0.06839411 -0.5305954  0.9009862  1.00000000  0.7428336  0.6393620
    #A1.2  0.00000000  0.0000000  0.0000000 0.83205029  0.6454972 -0.3779645 -0.41099747  0.0000000 -0.3535534
    #A1.3  0.00000000  0.0000000  0.0000000 0.42307692  0.8951436 -0.6289709 -0.61554702 -0.3580574 -0.7844645
    #A2.1  0.06839411  0.8320503  0.4230769 0.00000000  0.0000000  0.0000000  0.06839411  0.3580574  0.1961161
    #A2.2 -0.53059545  0.6454972  0.8951436 0.00000000  0.0000000  0.0000000 -0.53059545 -0.1666667 -0.9128709
    #A2.3  0.90098616 -0.3779645 -0.6289709 0.00000000  0.0000000  0.0000000  0.90098616  0.4879500  0.8017837
    #A3.1  1.00000000 -0.4109975 -0.6155470 0.06839411 -0.5305954  0.9009862  0.00000000  0.0000000  0.0000000
    #A3.2  0.74283363  0.0000000 -0.3580574 0.35805744 -0.1666667  0.4879500  0.00000000  0.0000000  0.0000000
    #A3.3  0.63936201 -0.3535534 -0.7844645 0.19611614 -0.9128709  0.8017837  0.00000000  0.0000000  0.0000000
    

    或者另一个选择是使用 row/column 指数

    replace(cor.mat, cbind(rep(1:9, each = 3),
           c(sapply(list(1:3, 4:6, 7:9), rep, 3))), 0)
    

    或使用 outer 构造逻辑矩阵并用 cor.mat

    nm1 <- sub("\\.\\d+$", "", colnames(cor.mat))
    cor.mat * outer(nm1, nm1, `!=`)