代码之家  ›  专栏  ›  技术社区  ›  MAPK

使用perl脚本提取基本更改

  •  1
  • MAPK  · 技术社区  · 6 年前

    我需要一些关于这个perl脚本的帮助,因为我是perl的绝对初学者。我有我下面的代码,这让我在特定的位置不匹配从bowtie.out 对于给定的序列($sequence)长度(这里我在sequence of length 22中搜索)。ARGV[0]是 bowtie.out 文件,ARGV[1]是最大序列长度。 这将提供字母数或基数更改(例如,A>T) 在每个变更类型的特定位置。但是,我还想为每个更改类型设置$position\u ref,并得到如下所示的结果。我需要对这段代码做什么才能得到如下所示的结果? 提前谢谢你的帮助。

    # usage ./script.pl bowtie.out 22 
    
    my $sread = "ACGT-read";
    my $strand ="-";
    my $name = "bill";
    my $position = 1;
    my $sequence = "ACGT";
    my $quality = "good";
    my $d2 = "d2";
    my $d3 = "d3";
    my $class= $ARGV[1];
    my @mray;
    my @lines;
    my $min = 10;
    
    for (my $i=0; $i < $class; $i++) {
        $mray[$i][0]=0;
        $mray[$i][1]=0;
        $mray[$i][2]=0;
        $mray[$i][3]=0;
        $mray[$i][4]=0;
        $mray[$i][5]=0;
        $mray[$i][6]=0;
        $mray[$i][7]=0;
        $mray[$i][8]=0;
        $mray[$i][9]=0;
        $mray[$i][10]=0;
        $mray[$i][11]=0;
    }
    
    open (INFILE, "<$ARGV[0]") || die "couldn't open the 1 infile!";
    
    while ($rln = <INFILE>){
        chomp $rln;
        ($sread, $strand, $name, $position, $sequence, $quality, $d2,$d3) = split("\t",$rln);
    #   print "what is d3 \n $d3\n";
        $seq_len = length ($sequence);
        if ($seq_len == $class && $d3){
            $position_ref = $position; 
    # This is where I need help. I want to paste these values (from $position_ref) to the corresponding columns as shown in the result below.
            print "what is position_ref \n $position_ref\n";
    #   print "what is d3 \n $d3\n";
           ($position, $d2) = split(":",$d3);
    #   print "what is d2 \n $d2\n";
           $var = substr $d2, -1;
           $var2 = substr $d2, 0;
    
           if ($var2 eq "A>T" ) { $mray[$position][0]++ }
           if ($var2 eq "A>G" ) { $mray[$position][1]++ }
           if ($var2 eq "A>C" ) { $mray[$position][2]++ }
           if ($var2 eq "C>T" ) { $mray[$position][3]++ }
           if ($var2 eq "C>G" ) { $mray[$position][4]++ }
           if ($var2 eq "C>A" ) { $mray[$position][5]++ }
           if ($var2 eq "G>T" ) { $mray[$position][6]++ }
           if ($var2 eq "G>A" ) { $mray[$position][7]++ }
           if ($var2 eq "G>C" ) { $mray[$position][8]++ }
           if ($var2 eq "T>A" ) { $mray[$position][9]++ }
           if ($var2 eq "T>G" ) { $mray[$position][10]++ }
           if ($var2 eq "T>C" ) { $mray[$position][11]++ }
    
    
    #   print "what is mray \n $mray[$position][3]\n";
           if ($position == ($class-1)) {
               $read = substr $sequence, 0, $class-1;
               push (@lines, $read);
    #   print "what is read \n $read\n";
    #   print "what is position \n $position\n";
           }
        }
    }
    close (INFILE);
    print "Pos\tA>T\tA>G\tA>C\tC>T\tC>G\tC>A\tG>T\tG>A\tG>C\tT>A\tT>G\tT>C\n";
    for (my $i=0; $i < $class; $i++) {
        $pnum = $i + 1;
        print "$pnum\t$mray[$i][0]\t$mray[$i][1]\t$mray[$i][2]\t$mray[$i][3]\t$mray[$i][4]\t$mray[$i][5]\t$mray[$i][6]\t$mray[$i][7]\t$mray[$i][8]\t$mray[$i][9]\t$mray[$i][10]\t$mray[$i][11]\n";
    }
    
    @lines = sort(@lines); # sort the list
    $count = 0;
    foreach my $line(@lines) # loop thru list
     {
        if ($line eq $oldline)
            {
              $count++;
            }  
            else 
            { 
              if ($count >= $min) {print "$oldline\t$count\n";}
    #          print "$count\n";
              $count=1;
              $oldline=$line;
            }
     }
    
    exit;   
    

    bowtie.out

        K00363:128:HV3CJBBXX:3:1101:9668:1894 1:N:0:TAATCG  -   reverseKF898354_1_14561 12006   TCACCAGGAAGATAAAACACGA  JJJJJJJJJJJJJJJJJFFFAA  0   12:T>A
        K00363:128:HV3CJBBXX:3:1101:3363:1894 1:N:0:TAATCG  -   reverseKF898354_1_14561 1108    GCACCAGGAAGATAAAACACGT  JJJJJJJJJJJJJJJJJFFFAA  0   12:T>A
        K00363:128:HV3CJBBXX:3:1101:8521:1912 1:N:0:TAATCG  -   reverseKF898354_1_14561 13807   CACACAAATCATGGACGAAGATGA    JJJJJJJJJJJJJJJJJJJFFFAA    0   23:G>C
        K00363:128:HV3CJBBXX:3:1101:11343:1912 1:N:0:TAATCG -   reverseKF898354_1_14561 11823   TTTACAATCGTTTGCAGTCTATCA    JJJJJJJJJJJJJJJJJJJFFFAA    0   
        K00363:128:HV3CJBBXX:3:1101:17056:1930 1:N:0:TAATCG +   reverseKF898354_1_14561 1970    TGCGCAGGACAAGAACTGAATG  AAFFFJJJJJJJJJJJJJJJJJ  0   19:C>A
        K00363:128:HV3CJBBXX:3:1101:11515:1965 1:N:0:TAATCG -   reverseKF898354_1_14561 9030    TGTCTGAAAATAACACGTCCAA  JJJJJJJJJJJJJJJJJFFFAA  0   5:A>G
    

    我得到的结果是:

    Pos A>T A>G A>C C>T C>G C>A G>T G>A G>C T>A T>G T>C
    1   0   0   0   0   0   0   0   0   0   0   0   0
    2   0   0   0   0   0   0   0   0   0   0   0   0
    3   0   0   0   0   0   0   0   0   0   0   0   0
    4   0   0   0   0   0   0   0   0   0   0   0   0
    5   0   0   0   0   0   0   0   0   0   0   0   0
    6   0   1   0   0   0   0   0   0   0   0   0   0
    7   0   0   0   0   0   0   0   0   0   0   0   0
    8   0   0   0   0   0   0   0   0   0   0   0   0
    9   0   0   0   0   0   0   0   0   0   0   0   0
    10  0   0   0   0   0   0   0   0   0   0   0   0
    11  0   0   0   0   0   0   0   0   0   0   0   0
    12  0   0   0   0   0   0   0   0   0   0   0   0
    13  0   0   0   0   0   0   0   0   0   2   0   0
    14  0   0   0   0   0   0   0   0   0   0   0   0
    15  0   0   0   0   0   0   0   0   0   0   0   0
    16  0   0   0   0   0   0   0   0   0   0   0   0
    17  0   0   0   0   0   0   0   0   0   0   0   0
    18  0   0   0   0   0   0   0   0   0   0   0   0
    19  0   0   0   0   0   0   0   0   0   0   0   0
    20  0   0   0   0   0   1   0   0   0   0   0   0
    21  0   0   0   0   0   0   0   0   0   0   0   0
    22  0   0   0   0   0   0   0   0   0   0   0   0
    

    我想要的结果(制表符分隔):

    Pos A>T position_ref    A>G position_ref    A>C position_ref    C>T position_ref    C>G position_ref    C>A position_ref    G>T position_ref    G>A position_ref    G>C position_ref    T>A position_ref    T>G position_ref    T>C position_ref
    1   0       0       0       0       0       0       0       0       0       0               0   
    2   0       0       0       0       0       0       0       0       0       0               0   
    3   0       0       0       0       0       0       0       0       0       0               0   
    4   0       0       0       0       0       0       0       0       0       0               0   
    5   0       0       0       0       0       0       0       0       0       0               0   
    6   0       1   9030    0       0       0       0       0       0       0       0               0   
    7   0       0       0       0       0       0       0       0       0       0               0   
    8   0       0       0       0       0       0       0       0       0       0               0   
    9   0       0       0       0       0       0       0       0       0       0               0   
    10  0       0       0       0       0       0       0       0       0       0               0   
    11  0       0       0       0       0       0       0       0       0       0               0   
    12  0       0       0       0       0       0       0       0       0       0               0   
    13  0       0       0       0       0       0       0       0       0       2   12006   1108        0   
    14  0       0       0       0       0       0       0       0       0       0               0   
    15  0       0       0       0       0       0       0       0       0       0               0   
    16  0       0       0       0       0       0       0       0       0       0               0   
    17  0       0       0       0       0       0       0       0       0       0               0   
    18  0       0       0       0       0       0       0       0       0       0               0   
    19  0       0       0       0       0       0       0       0       0       0               0   
    20  0       0       0       0       0       1   11823   0       0       0       0               0   
    21  0       0       0       0       0       0       0       0       0       0               0   
    22  0       0       0       0       0       0       0       0       0       0               0   
    
    2 回复  |  直到 6 年前
        1
  •  1
  •   MAPK    6 年前

    代码按制表符拆分输入数据。但是示例文件不包含制表符(因为复制并粘贴到网页时制表符会丢失)。我试着把所有的空白都转换成制表符,现在我认为输入数据和您将其拆分成的变量之间不匹配。

    例如,第一行数据如下所示:

    K00363:128:HV3CJBBXX:3:1101:9668:1894 1:N:0:TAATCG  -   reverseKF898354_1_14561 12006   TCACCAGGAAGATAAAACACGA  JJJJJJJJJJJJJJJJJFFFAA  0   12:T>A
    

    如果我们假设所有的空白实际上都是一个制表符,那么我们将得到以下变量:

    • $sread
    • $strand : -
    • $name :reverseKF898354\u 1\u 14561
    • $position
    • $sequence :tcacaggagataacacga
    • $quality
    • $d2
    • $d3

    我想那是错的。我想很多都是一列出来的。我怀疑问题出在第三列——那列包含 + - . 要么它不应该是一个单独的列,要么它需要自己的变量。

    如果我在 split() 呼叫存储 + / ,那么您的代码实际上做了一些事情。但这仍然不是你所期望的结果,我对你的数据了解不够,无法确定下一步是什么。

        2
  •  0
  •   Alessandro Testori    6 年前

    如果我从您所需的输出中正确理解,那么您需要的是引用位置,而不是特定基更改的计数:请尝试在您的脚本中用“$position\u ref”替换哈希中的“++”,如下所示:

    # usage ./script.pl bowtie.out 22 
    
    my $sread = "ACGT-read";
    my $strand ="-";
    my $name = "bill";
    my $position = 1;
    my $sequence = "ACGT";
    my $quality = "good";
    my $d2 = "d2";
    my $d3 = "d3";
    my $class= $ARGV[1];
    my @mray;
    my @lines;
    my $min = 10;
    
    for (my $i=0; $i < $class; $i++) {
        $mray[$i][0]=0;
        $mray[$i][1]=0;
        $mray[$i][2]=0;
        $mray[$i][3]=0;
        $mray[$i][4]=0;
        $mray[$i][5]=0;
        $mray[$i][6]=0;
        $mray[$i][7]=0;
        $mray[$i][8]=0;
        $mray[$i][9]=0;
        $mray[$i][10]=0;
        $mray[$i][11]=0;
    }
    
    open (INFILE, "<$ARGV[0]") || die "couldn't open the 1 infile!";
    
    while ($rln = <INFILE>){
        chomp $rln;
        ($sread, $strand, $name, $position, $sequence, $quality, $d2,$d3) = split("\t",$rln);
    #   print "what is d3 \n $d3\n";
        $seq_len = length ($sequence);
        if ($seq_len == $class && $d3){
            $position_ref = $position; 
    # This is where I need help. I want to paste these values (from $position_ref) to the corresponding columns as shown in the result below.
            print "what is position_ref \n $position_ref\n";
    #   print "what is d3 \n $d3\n";
           ($position, $d2) = split(":",$d3);
    #   print "what is d2 \n $d2\n";
           $var = substr $d2, -1;
           $var2 = substr $d2, 0;
    
           if ($var2 eq "A>T" ) { $mray[$position][0]=$position_ref}
           if ($var2 eq "A>G" ) { $mray[$position][1]=$position_ref }
           if ($var2 eq "A>C" ) { $mray[$position][2]=$position_ref }
           if ($var2 eq "C>T" ) { $mray[$position][3]=$position_ref }
           if ($var2 eq "C>G" ) { $mray[$position][4]=$position_ref }
           if ($var2 eq "C>A" ) { $mray[$position][5]=$position_ref }
           if ($var2 eq "G>T" ) { $mray[$position][6]=$position_ref }
           if ($var2 eq "G>A" ) { $mray[$position][7]=$position_ref }
           if ($var2 eq "G>C" ) { $mray[$position][8]=$position_ref }
           if ($var2 eq "T>A" ) { $mray[$position][9]=$position_ref }
           if ($var2 eq "T>G" ) { $mray[$position][10]=$position_ref }
           if ($var2 eq "T>C" ) { $mray[$position][11]=$position_ref}
    
    
    #   print "what is mray \n $mray[$position][3]\n";
           if ($position == ($class-1)) {
               $read = substr $sequence, 0, $class-1;
               push (@lines, $read);
    #   print "what is read \n $read\n";
    #   print "what is position \n $position\n";
           }
        }
    }
    close (INFILE);
    print "Pos\tA>T\tA>G\tA>C\tC>T\tC>G\tC>A\tG>T\tG>A\tG>C\tT>A\tT>G\tT>C\n";
    for (my $i=0; $i < $class; $i++) {
        $pnum = $i + 1;
        print "$pnum\t$mray[$i][0]\t$mray[$i][1]\t$mray[$i][2]\t$mray[$i][3]\t$mray[$i][4]\t$mray[$i][5]\t$mray[$i][6]\t$mray[$i][7]\t$mray[$i][8]\t$mray[$i][9]\t$mray[$i][10]\t$mray[$i][11]\n";
    }
    
    @lines = sort(@lines); # sort the list
    $count = 0;
    foreach my $line(@lines) # loop thru list
     {
        if ($line eq $oldline)
            {
              $count++;
            }  
            else 
            { 
              if ($count >= $min) {print "$oldline\t$count\n";}
    #          print "$count\n";
              $count=1;
              $oldline=$line;
            }
     }
    exit;