代码之家  ›  专栏  ›  技术社区  ›  Robert Cruz

如何使用AWK透视数据

  •  -2
  • Robert Cruz  · 技术社区  · 7 年前

    发件人:

    DT  X   Y   Z
    10  75  0   3
    20  100 1   6
    30  125 2   9
    

    收件人:

    DT  ID  VALUE
    10  X   75
    20  Y   0
    30  Z   3
    10  X   100
    20  Y   1
    30  Z   6
    10  X   125
    20  Y   2
    30  Z   9
    
    3 回复  |  直到 7 年前
        1
  •  1
  •   Robert Cruz    7 年前

    完成了

    #my original dataset is separated by "," and have 280 cols 
    
            tempfile=dataset.csv;
            col_count=`head -n1 $tempfile | tr -cd "," | wc -c`;
            col_count=`expr $col_count + 1`;
    
    
            for i in `seq 4 $col_count`; do 
                                        echo $i;
                            pt="{print \$"$i"}";
                        col_name=`head -n 1 $tempfile | sed s'/ //'g | awk -F"," "$pt"`;
                        awk -F"," -v header="DT,ID,$col_name" -f st.awk $tempfile | awk 'NR>1 {print substr($0,index($0,$1))",'"$col_name"'"}' | sed 's/ //g' >> New$tempfile;
    
                 done;
    
      # file st.awk:
    # the code below was found on some stackoverflow page, with some minor changes
        BEGIN {
            # Parse headers into an assoc array h
            split(header, a, ",")
            for(i in a) {
                h[a[i]]=2
            }   
    
        }
    
        # Find the column numbers in the first line of a file
        FNR==1{
            split("", cols) # This will re-init cols
            for(i=1;i<=NF;i++) {
                if($i in h) {
                    cols[i]=1
                }
            }   
            next
        }
    
        # Print those columns on all other lines
        {
            res = ""
            for(i=1;i<=NF;i++) {
                if(i in cols) {
                    s = res ? OFS : ""
                    res = res "," $i
                }
            }   
            if (res) {
                print res 
            }   
        }
    
        2
  •  0
  •   ctac_    7 年前

    您可以尝试此awk(MAWK版本1.2)

    您的数据可以是5x5或更多

    mawk -v OFS='\t' '
    NR==1 {
    nbfield=(NF-1)
    for(i=1;i<NF;i++)
        ID[i]=$(i+1)
    print $1 OFS "ID" OFS "VALUE"
    next
    }
    {
    numrecord=((NR-1)%nbfield)
    numrecord = numrecord ? numrecord : nbfield
    for(i=0;i<=nbfield;i++)
        val[ID[i],numrecord]=$(i+1)
    }
    numrecord==nbfield {
    for(i=1;i<=nbfield;i++)
        for(j=1;j<=nbfield;j++) 
            print val[ID[0],j] OFS ID[j] OFS val[ID[j],i]
    }
    ' infile
    
        3
  •  0
  •   Bilbo    2 年前

    输入:

    --     ColOne ColTwo ColThr
    RowOne    A      B      C      D      E
    RowTwo    F      G      H      I      J
    RowThr    K      L      M      N      O
    RowFor    P      Q      R      S      T
    RowFiv    U      V      W      X      Y
    

    输出:

    RowNbr | ColNbr | RowColVal
    ------ | ------ | ---------
    RowOne | ColOne | A        
    RowOne | ColTwo | B        
    RowOne | ColThr | C        
    
    RowTwo | ColOne | F        
    RowTwo | ColTwo | G        
    RowTwo | ColThr | H        
    
    RowThr | ColOne | K        
    RowThr | ColTwo | L        
    RowThr | ColThr | M        
    

    透视脚本:

    # pivot a table
    
    BEGIN                                 {                          # before processing innput lines, emit output header
      OFS = " | "                                                    # set the output field-separator
      fmtOutDtl = "%6s | %-6s | %-9s"          "\n"                  # set the output format for all detail  lines: InpRowHdr, InpColHdr, InpVal
      fmtOutHdr = "%6s | ColNbr | RowColVal"   "\n"                  # set the output format for the header  line
      strOutDiv =  "------ | ------ | ---------"                     # set                       the divider line
      print ""                                                       # emit blank line before output
    }                                                                # done with output header
    
    NR == 1                               {                          # when we are on the innput header line /                                  the first row
      FldCnt = (                       NF - 1 )                      # number of columns to process is number of fields on this row, except for the first val
      for(      idxCol = 1;   idxCol < NF;            idxCol++   )   # scan col numbers after the first, ignoring                               the first val
                      ColHds[ idxCol ]           = $( idxCol + 1 )   # store the next col-val as this ColHdr
    
      printf( fmtOutHdr, "RowNbr"  )                                 # emit header line: RowNbr-header, innput column headers
      print   strOutDiv                                              # emit divider row after header line and before data lines
      next                                                           # skip to the next innput row
    }                                                                # done with first  innput row
    
    {                                                                # for each body innput row
      RecNbr = ( ( NR - 1 ) %           FldCnt )                     # get RecNum for this  row: ( RecNum - 1 ) Mod [number of fields]: zero-based / 0..[number_of_cols-1]
      RecNbr = RecNbr ? RecNbr :        FldCnt                       # promote from zero-based to one-based: 0 =>   [number of fields]: one -based / 1..[number_of_cols  ]
      for(      idxCol = 0;   idxCol <= FldCnt;       idxCol++   )   # scan col numbers including the first
                 Rws[ ColHds[ idxCol ], RecNbr ] = $( idxCol + 1 )   # store this row+col val in this Row position under this ColHdr
    }                                                                # done with this body innput row
    
    RecNbr ==                           FldCnt {                     # when we are on the last innput row that we are processing (lines beyond FldCnt are not emitted)
      for(      idxCol = 1;   idxCol <= FldCnt;       idxCol++   ) { # scan col numbers after the first
        for(    idxRow = 1;   idxRow <= FldCnt;       idxRow++   ) { # scan row numbers after the first, up to number of cols
          printf( fmtOutDtl                                          \
                ,Rws[ ColHds[ 0      ]   ,            idxCol ]       \
                ,     ColHds[ idxRow ]                               \
                ,Rws[ ColHds[ idxRow ]   ,            idxCol ]   )   # emit innput rowHdr, colHdr, row+col val
        }                                                            # done scanning row numbers
        print ""                                                     # emit a blank line after each innput row
      }                                                              # done scanning col numbers
    }                                                                # done with the last innput row
    
    END                                   {                          # after processing innput lines
    }                                                                # do nothing