计算每行不同字符串的出现次数并将计数附加为列

Question

我有

NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref

我想计算每行输出中“homorefref”、“homodevdev”、“hetrefdev”、“hetdevref”、“hetdevdev”的出现次数，如

NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 0 1
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref 8 0 3 0 0
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 9 0 2 0 0
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev 0 7 0 4 0
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref 9 0 2 0 0
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref 10 0 1 0 0

我知道如何使用 grep 对循环中的每个字符串执行操作，然后粘贴为新列

# For each line of file, print how many times "homorefref" occurs on it
# (grep -o emits one output line per match, wc -l counts those lines).
while read i; do echo $i |grep -o "homorefref"| wc -l;  done < file

或使用 awk 的单个字符串

# Print the first two fields followed by the number of "homorefref" matches
# on the line (gsub returns how many substitutions it made).
awk '{print $1,$2,gsub(/homorefref/,"")}' file

但我找不到同时处理所有字符串的方法。

我也试过这个

# Comma-separated list of the strings to be counted.
var="homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev"
char=","
# Splits each record on "${char}" and prints the number of separators found (NF-1).
# NOTE(review): ${string} is never assigned in this snippet (only var is) -- confirm intent.
awk -F"${char}" '{print NF-1}' file <<< "${string}"

Answer 1

您可以使用这个 awk:

# Append to every line of file the per-line count of each listed string, in list order.
awk 'BEGIN {s="homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev"; n=split(s, a, ","); for (i=1; i<=n; ++i) col[a[i]]} {for (i=1; i<=NF; ++i) if ($i in col) ++fq[$i]; printf "%s", $0; for (i=1; i<=n; ++i) printf "%s%s", OFS, fq[a[i]]+0; print ""; delete fq}' file

NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 0 1
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref 8 0 3 0 0
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 9 0 2 0 0
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev 0 7 0 4 0
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref 9 0 2 0 0
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref 10 0 1 0 0

更易读的格式：

awk '
# Build the ordered list of strings to count (a[1..n]) and a lookup set (col).
BEGIN {
   s = "homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev"
   n=split(s, a, ",")
   for (i=1; i<=n; ++i)
      col[a[i]]                        # referencing the element is enough to create the key
}
{
   # Tally every field of this line that is one of the listed strings.
   for (i=1; i<=NF; ++i)
      if ($i in col)
         ++fq[$i]
   # Print the original line unchanged, then one count per listed string, in list order.
   printf "%s", $0
   for (i=1; i<=n; ++i)
      printf "%s%s", OFS, fq[a[i]]+0   # +0 turns a missing tally into 0
   print ""
   delete fq                           # reset the tallies before the next line
}' file

Answer 2

$ cat tst.awk
# Append, to every input line, the per-line count of each tag listed below.
BEGIN {
    # tags[1..numTags] holds the strings to count, in output order.
    numTags = split("homorefref homodevdev hetrefdev hetdevref hetdevdev",tags)
}
{
    delete cnt                         # start each line with empty tallies
    for (i=3; i<=NF; i++) {            # fields 1 and 2 are skipped; counting starts at field 3
        tag = $i
        cnt[tag]++
    }
    printf "%s", $0                    # the original line, unchanged
    for (tagNr=1; tagNr<=numTags; tagNr++) {
        tag = tags[tagNr]
        printf "%s%d", OFS, cnt[tag]   # %d prints 0 for a tag not seen on this line
    }
    print ""
}

$ awk -f tst.awk file
NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 0 1
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref 8 0 3 0 0
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 9 0 2 0 0
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev 0 7 0 4 0
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref 9 0 2 0 0
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref 10 0 1 0 0

Answer 3

另一个 awk:

使用两个输入文件的方法：第一个是通过 echo "homorefref,homodevdev..." 生成的内联文件，用 RS="," 分割记录；第二个是实际的数据文件，用 RS="\n" 读取。

$ awk ' NR==FNR {sub(/\n$/,""); a[NR]=$0; next} { delete b; for(i=2;i<=NF;i++) b[$i]++; printf "%s ",$0; for(i=1;i<=5;i++) printf "%d ", b[a[i]] ; print "" } ' RS="," <(echo "homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev") RS="\n" data.txt
NC_044998.1  3789  hetdevdev   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 0 1
NC_044998.1  3803  homorefref  hetrefdev   homorefref  homorefref  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref 8 0 3 0 0
NC_044998.1  3806  homorefref  hetrefdev   hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 9 0 2 0 0
NC_044998.1  3856  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4008  homodevdev  hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev  homodevdev 0 10 0 1 0
NC_044998.1  4012  homodevdev  hetdevref   hetdevref   homodevdev  homodevdev  homodevdev  homodevdev  hetdevref   homodevdev  hetdevref   homodevdev 0 7 0 4 0
NC_044998.1  4020  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5353  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref  homorefref  homorefref 10 0 1 0 0
NC_044998.1  5364  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  hetrefdev   homorefref  homorefref 9 0 2 0 0
NC_044998.1  5435  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  homorefref  hetrefdev   homorefref  homorefref  homorefref 10 0 1 0 0

为代码可读性插入换行符

awk ' 
# First input (the comma-separated list): remember each string in list order.
# echo ends its output with a newline, which would otherwise stay attached to
# the last string and keep it from ever matching a field, so strip it first.
NR==FNR {sub(/\n$/,""); a[NR]=$0; next} 
# Second input (the data): tally the fields of the line, then print the line
# followed by the count of each of the five listed strings.
{ 
  delete b; for(i=2;i<=NF;i++) b[$i]++; 
  printf "%s ",$0; 
  for(i=1;i<=5;i++) printf "%d ", b[a[i]] ; 
  print "" 
} ' RS="," <(echo "homorefref,homodevdev,hetrefdev,hetdevref,hetdevdev") RS="\n" data.txt

计算每行不同字符串的出现次数并将计数附加为列

Counting number of occurrence of different strings per line and appending counts as columns

awk