输出每行中最大值所在列的名称

Output name of column with max value per line

我有

chr pos C T A G
NC_044998.1     3732    21 0 0 0
NC_044998.1     3733    22 0 2 0
NC_044998.1     3734    22 0 5 0
NC_044998.1     3735    22 0 0 0
NC_044998.1     3736    0 0 7 0
NC_044998.1     3737    0 0 0 22
NC_044998.1     3738    20 0 0 0
NC_044998.1     3739    1 0 22 0
NC_044998.1     3740    0 22 0 0
NC_044998.1     3741    22 0 0 0

我需要输出每行 $3 到 $7 的最大值以及与之关联的列名。

所以我希望得到

chr pos max ref
NC_044998.1     3732 21 C
NC_044998.1     3733 22 C
NC_044998.1     3734 22 C
NC_044998.1     3735 22 C
NC_044998.1     3736 7 A
NC_044998.1     3737 22 G
NC_044998.1     3738 20 C
NC_044998.1     3739 22 A
NC_044998.1     3740 22 T
NC_044998.1     3741 22 C 

我正在尝试调整这个:

# First attempt from the question. The header loop tests c but increments i,
# so c is never advanced inside that loop.
awk 'NR == 1 {for (c = 3; c <= NF; i++) headers[c] = $c; next} {maxc=3;for(c=4;c<=NF;c++)if($c>$maxc){maxc=c} printf "max:%s, %s\n", $maxc, headers[maxc]}'

但它只输出这个最大值

也试过

# Second attempt from the question. This program has no rule that fills
# headers[], and `$maxc = headers[c]` assigns into the input field itself.
awk '{maxc=3;for(c=4;c<=NF;c++)if($c>$maxc){maxc=c; $maxc = headers[c]} printf "max:%s, column:%s, column:%s\n",$maxc, maxc, headers[maxc]}'

我想解决的另一个问题是两列或多列并列最大值(平局)的情况。在这种情况下,我想打印最大值以及所有并列列的名称。

对于您最初想要的结果:

# Print "max:<value>, <column name>" for the largest of fields 3-6 on each data row.
# The header row is stored by field number; a is seeded from $3, and because the
# comparison is >=, the right-most column wins when values tie.
awk 'NR==1 {for (i=0; i<=NF; i++) header_array[i] = $i; } NR>1{a=$3; for (i=3;i<=6;i++) if($i >= a) {a=$i; column=header_array[i]} {printf "max:%s, %s\n", a, column}}' file

# Output
max:21, C
max:22, C
max:22, C
max:22, C
max:7, A
max:22, G
max:20, C
max:22, A
max:22, T
max:22, C

更新后的期望结果:

# Print a "chr pos max ref" table: for each data row, fields 1 and 2 followed by
# the largest of fields 3-6 and the header name of that column.
# a is seeded from $3; >= means the right-most column wins when values tie.
awk 'NR==1 {for (i=0; i<=NF; i++) header_array[i] = $i; print "chr pos max ref"} NR>1{char=$1; pos=$2; a=$3; for (i=3;i<=6;i++) if($i >= a) {a=$i; column=header_array[i]} {printf "%s %s %s %s\n", char, pos, a, column}}' file

# Output
chr pos max ref
NC_044998.1 3732 21 C
NC_044998.1 3733 22 C
NC_044998.1 3734 22 C
NC_044998.1 3735 22 C
NC_044998.1 3736 7 A
NC_044998.1 3737 22 G
NC_044998.1 3738 20 C
NC_044998.1 3739 22 A
NC_044998.1 3740 22 T
NC_044998.1 3741 22 C

格式化:

# Print a "chr pos max ref" table: for each data row, fields 1 and 2 followed by
# the largest of fields 3-6 and the header name of that column.
awk 'NR==1 {
    # Remember each header name under its field number (index 0 holds the whole line).
    for (i = 0; i <= NF; i++) {
        header_array[i] = $i
    }
    print "chr pos max ref"
}
NR>1 {
    char = $1
    pos = $2
    # Seed the running maximum with the first count column.
    a = $3
    for (i = 3; i <= 6; i++) {
        # >= lets a later column replace an earlier one when values tie.
        if ($i >= a) {
            a = $i
            column = header_array[i]
        }
    }
    printf "%s %s %s %s\n", char, pos, a, column
}' file

你可以试试

# Tab-separated report of the largest of fields 3-6 per data row.
# Strict > keeps the left-most column when values tie.
# NOTE: data rows are emitted as <base name>, <max value>, while the header line
# labels those two columns "max", "ref".
awk 'BEGIN{OFS="\t";FS="[[:space:]]+"}
     NR==1{print $1,$2,"max","ref";
           # Store the base names found in header fields 3-6.
           for(i=3;i<=6;++i)BASES[i]=$(i);
     }
     NR>1{
        # Start from field 3 and scan fields 4-6 for a strictly larger value.
        basemax=3; max=$3;
        for(i=4;i<=6;++i){if($i>max){basemax=i;max=$i;}}
        print $1,$2,BASES[basemax],max
     }
' inputfile

你会得到:

chr pos max ref
NC_044998.1 3732    C   21
NC_044998.1 3733    C   22
NC_044998.1 3734    C   22
NC_044998.1 3735    C   22
NC_044998.1 3736    A   7
NC_044998.1 3737    G   22
NC_044998.1 3738    C   20
NC_044998.1 3739    A   22
NC_044998.1 3740    T   22
NC_044998.1 3741    C   22

注意:针对存在两个或多个碱基同为最大值的情况,修正如下

对于输入,

chr pos C T A G
NC_044998.1     3732    21 0 21 0
NC_044998.1     3733    22 0 2 0
NC_044998.1     3734    22 0 5 0
NC_044998.1     3735    22 0 0 0
NC_044998.1     3736    0 0 7 7
NC_044998.1     3737    0 0 0 22
NC_044998.1     3738    20 0 0 0
NC_044998.1     3739    1 0 22 0
NC_044998.1     3740    0 22 0 0
NC_044998.1     3741    22 0 0 0

你可以试试

# Tab-separated report of the largest of fields 3-6 per data row, listing every
# base that shares the maximum as a comma-separated list.
# NOTE: data rows are emitted as <base names>, <max value>, while the header line
# labels those two columns "max", "ref".
awk 'BEGIN{OFS="\t";FS="[[:space:]]+"}
     NR==1{print $1,$2,"max","ref";
         # Store the base names found in header fields 3-6.
         for(i=3;i<=6;++i)BASES[i]=$(i);
     }
     NR>1{basemax=BASES[3]; max=$3;
         for(i=4;i<=6;++i){
             # A strictly larger value restarts the name list; an equal value is appended to it.
             if($i>max){basemax=BASES[i];max=$i;}
             else if($i==max){basemax=basemax","BASES[i];}
         }
         print $1,$2,basemax,max
     }
' fileinput

你会得到:

chr pos max ref
NC_044998.1 3732    C,A 21
NC_044998.1 3733    C   22
NC_044998.1 3734    C   22
NC_044998.1 3735    C   22
NC_044998.1 3736    A,G 7
NC_044998.1 3737    G   22
NC_044998.1 3738    C   20
NC_044998.1 3739    A   22
NC_044998.1 3740    T   22
NC_044998.1 3741    C   22

针对您给出的示例,请尝试下面的 awk 代码(在 GNU awk 中编写并测试)。

# For every data row, print the fields before startField followed by the header
# name of the column (startField..endField) holding the largest value.
# The header row itself is only recorded, not printed.
awk -v startField="3" -v endField="6" '
FNR==1{
  # Record the header name of each column in the inspected range.
  for(i=startField;i<=endField;i++){
    heading[i]=$i
  }
  next
}
{
  max=maxInd=""
  for(i=startField;i<=endField;i++){
    # maxInd must be updated before max, because both tests compare against the old max.
    maxInd=(max<$i?i:maxInd)
    max=(max<$i?$i:max)
  }
  # Truncate the record so that $0 keeps only the fields before startField.
  NF=(startField-1)
  print $0,heading[maxInd]
}
'  Input_file

此代码方法的优点:

  • 用户可以通过名为 startField 和 endField 的两个变量来指定起始字段编号和结束字段编号,因此无需修改 awk 主体代码中的任何内容。
  • 第二个优点是这里没有任何硬编码:假设用户以后想要检查第 10 个字段到第 20 个字段中的最大值,也无需逐一打印或显式列出前 9 个字段,因为代码本身(通过 NF=startField-1)已经处理了这一点。

详细解释:下面为上述代码添加逐行说明。

awk -v startField="3" -v endField="6" '   ##Starting awk program and setting startField and endField to the field range in which the user wants to look for the maximum value.
FNR==1{                                   ##Checking condition if this is first line of Input_file.
  for(i=startField;i<=endField;i++){      ##Traversing through only those fields which user needs to get max value.
    heading[i]=$i                         ##Creating array heading whose index is i and value is current field value.
  }
  next                                    ##next will skip all further statements from here.
}
{
  max=maxInd=""                           ##Nullifying max and maxInd variables here.
  for(i=startField;i<=endField;i++){      ##Traversing through only those fields which user needs to get max value.
    maxInd=(max<$i?i:maxInd)              ##Setting maxInd to current field number if current field value is greater than max, else keeping maxInd as it is.
    max=(max<$i?$i:max)                   ##Setting max to current field value if current field value is greater than max, else keeping max as it is.
  }
  NF=(startField-1)                       ##Setting NF(number of fields of current line) to startField-1 here.
  print $0,heading[maxInd]                ##Printing the current line (now holding only the fields before startField) followed by heading array value whose index is maxInd.
}
'  Input_file                             ##Mentioning Input_file name here. 

假设您有这个输入文件:

cat file

chr pos C T A G
NC_044998.1     3732    21 0 0 0
NC_044998.1     3733    22 0 2 0
NC_044998.1     3734    22 0 5 0
NC_044998.1     3735    22 0 0 0
NC_044998.1     3736    0 0 7 7
NC_044998.1     3737    0 22 0 22
NC_044998.1     3738    20 0 0 0
NC_044998.1     3739    1 0 22 0
NC_044998.1     3740    0 22 0 0
NC_044998.1     3741    22 0 0 0

要获得所需的输出,请使用此 awk:

# Print "chr pos max ref" rows: the largest value among fields b..NF and the
# ";"-joined header names of every column holding that value.
# The first pass finds the maximum, the second collects the matching column names.
awk -v b=3 'NR==1{for (i=b; i<=NF; ++i) hdr[i]=$i; print $1, $2, "max", "ref"; next} {for (i=b; i<=NF; ++i) max=($i>max?$i:max); for (i=b; i<=NF; ++i) if ($i == max) c4=(c4?c4 ";":"") hdr[i]; print $1, $2, max, c4; max=c4=""}' file

chr pos max ref
NC_044998.1 3732 21 C
NC_044998.1 3733 22 C
NC_044998.1 3734 22 C
NC_044998.1 3735 22 C
NC_044998.1 3736 7 A;G
NC_044998.1 3737 22 T;G
NC_044998.1 3738 20 C
NC_044998.1 3739 22 A
NC_044998.1 3740 22 T
NC_044998.1 3741 22 C

这是一个更具可读性的版本:

# Print "chr pos max ref" rows: the largest value among fields b..NF and the
# ";"-joined header names of every column holding that value.
awk -v b=3 '
NR == 1 {
   # Record the header names from field b onward and emit the output header.
   for (i=b; i<=NF; ++i)
      hdr[i] = $i
   print $1, $2, "max", "ref"
   next
}
{
   # First pass: find the largest value among fields b..NF.
   for (i=b; i<=NF; ++i)
      max = ($i > max ? $i : max)
   # Second pass: join the names of all columns equal to that maximum.
   for (i=b; i<=NF; ++i)
      if ($i == max)
         c4 = (c4 ? c4 ";" : "") hdr[i]
   print $1, $2, max, c4
   # Reset the per-row state before the next record.
   max=c4=""
}' file

鉴于此输入文件在第 1 条和第 6 条数据行上具有重复的最大值:

$ cat file
chr pos C T A G
NC_044998.1     3732    21 0 21 0
NC_044998.1     3733    22 0 2 0
NC_044998.1     3734    22 0 5 0
NC_044998.1     3735    22 0 0 0
NC_044998.1     3736    0 0 7 0
NC_044998.1     3737    0 22 0 22
NC_044998.1     3738    20 0 0 0
NC_044998.1     3739    1 0 22 0
NC_044998.1     3740    0 22 0 0
NC_044998.1     3741    22 0 0 0

使用 GNU awk sorted_in:

$ cat tst.awk
# tst.awk (GNU awk): for each row print fields 1 and 2, then the largest of the
# remaining values and the comma-joined header names of every column holding it.
BEGIN {
    # Make "for (i in array)" visit elements by value, numerically, largest first.
    PROCINFO["sorted_in"] = "@val_num_desc"
}
{
    # Emit the first two fields without a newline, then blank them so that
    # $0 holds only the remaining columns.
    printf "%s%s%s%s", $1, OFS, $2, OFS
    $1 = $2 = ""
}
NR==1 {
    # Header row: remember the remaining column names by position.
    split($0,refs)
    print "max", "ref"
    next
}
{
    n = split($0,vals)

    # With the traversal order set above, the first element visited is the maximum.
    for ( i in vals ) {
        maxVal = vals[i]
        break
    }

    # Collect, in column order, the name of every column equal to the maximum.
    ref = sep = ""
    for ( i=1; i<=n; i++ ) {
        if ( vals[i] == maxVal ) {
            ref = ref sep refs[i]
            sep = ","
        }
    }

    print maxVal, ref
}

$ awk -f tst.awk file | column -t
chr          pos   max  ref
NC_044998.1  3732  21   C,A
NC_044998.1  3733  22   C
NC_044998.1  3734  22   C
NC_044998.1  3735  22   C
NC_044998.1  3736  7    A
NC_044998.1  3737  22   T,G
NC_044998.1  3738  20   C
NC_044998.1  3739  22   A
NC_044998.1  3740  22   T
NC_044998.1  3741  22   C

我发布上述代码的主要原因,是为了展示如何借助 "sorted_in" 和 for ( i in ... ) 来获取最小值或最大值:只需在访问第一个索引后立即 break,就能按照 PROCINFO["sorted_in"]=... 所设定的排序标准得到最大值或最小值,而无需自己编写任何比较代码。