按分块(以第一列分组)计算所有列的 avg 和 sd

Calculating avg and sd for all columns in chunks for all columns

我正在尝试用下面的脚本,按第一列的值把文件分成若干块,并对每一块计算所有列的 avg 和 sd:

# Asker's first attempt, shown as posted (this version does NOT work).
# Only the `$1` field references that had been lost from the listing are
# restored; the logic problems are left in place because they are what the
# question is about:
#   * `c1` is reassigned inside the per-column loop, so `if (c1)` already
#     sees the new label from the second column onwards;
#   * `d` is assigned as a scalar here but indexed as an array (`d[i]`) in
#     the END rule;
#   * the unconditional rule below has no loop of its own, so it only ever
#     touches the index `i` left over from an earlier loop.
BEGIN { FS = OFS = "\t" }

$1 != c1 {
    for(i=1; i<=NF; i++) {
    if (c1) print s[i]/q[i]
    d = $1
    s[i] = q[i] = 0
    c1 = $1
}
}
{
        s[i]+=$i
        if($i!="-")
            q[i]++
}
END {
    for(i=1; i<=NF; i++)
        print d[i]"\t"s[i]/q[i]
}

在如下所示的文件中:

chr x y z 
1 1 2 3
1 2 - 2
1 3 3 3
5a 2 2 3
5a 2 2 2
5a 3 3 3

我期望的输出如下:

chr x y z
1 2 2.5 2.7
5a 2.3 2.3 2.7

这是我用真实数据运行时得到的错误:

awk: avgchunks.awk:12: (FILENAME=wgcodnoncod_fst.table FNR=1) fatal: attempt to use scalar `s' as an array

在本问题已采纳答案的帮助下,我对该答案做了扩展,使其同时计算 SD:

# Per-group mean and population standard deviation of every column after the
# first. Input is tab-separated with a header on line 1; column 1 holds the
# group label (rows of one group must be adjacent) and "-" marks a missing
# value. Output: the header, then one line per group holding the label
# followed by "mean<TAB>sd" for each column.

# Print the summary line for group `label` from the running totals
# s (sum of x), sq (sum of x^2) and q (number of values), columns 2..nf.
function report(label, nf,    i, mean, var) {
   printf "%s", label
   for(i=2; i<=nf; i++) {
      if (!q[i]) {                 # column had no numeric value in this group
         printf "%s-%s-", OFS, OFS
         continue
      }
      mean = s[i]/q[i]
      var = sq[i]/q[i] - mean^2
      if (var < 0) var = 0         # round-off can push a zero variance below 0
      printf "%s%.2f%s%.2f", OFS, mean, OFS, sqrt(var)
   }
   print ""
}
BEGIN { FS = OFS = "\t" }
NR == 1 {
   print
   next
}
c1 != "" && $1 != c1 {             # label changed: flush the finished group
   report(c1, NF)
   delete s
   delete q
   delete sq
}
{
   for(i=2; i<=NF; i++) {
      # Braces are required here: without them only the first statement is
      # guarded and "-" rows would still be counted in q[i].
      if($i != "-") {
         s[i] += $i
         sq[i] += $i * $i
         q[i]++
      }
   }
   c1 = $1
}
END {
   if (c1 != "") report(c1, NF)    # flush the last group, if any data was read
}

您可以使用此脚本进行计算:

cat avg.awk

# Per-group average of every column after the first.
# Input is tab-separated with a header on line 1; column 1 holds the group
# label (rows of one group must be adjacent) and "-" marks a missing value.
BEGIN { FS = OFS = "\t" }
NR == 1 {                          # pass the header through unchanged
   print
   next
}
c1 != "" && $1 != c1 {             # label changed: report the finished group
   printf "%s", c1
   for(i=2; i<=NF; i++) {
      # q[i]?q[i]:1 avoids dividing by zero when a column had no values
      printf "%s%.1f", OFS, s[i]/(q[i]?q[i]:1)
   }
   print ""
   delete s
   delete q
}
{
   for(i=2; i<=NF; i++) {
      s[i] += $i                   # "-" converts to 0, so it adds nothing
      if($i != "-")
         q[i]++                    # count only real values
   }
   c1 = $1                         # remember the label of the current group
}
END {                              # report the last group
   printf "%s", c1
   for(i=2; i<=NF; i++) {
      printf "%s%.1f", OFS, s[i]/(q[i]?q[i]:1)
   }
   print ""
}

然后将其用作:

awk -f avg.awk file

chr x   y   z
1   2.0 2.5 2.7
5a  2.3 2.3 2.7

对于标准差,使用这个 awk 脚本:

cat avg.awk

# Per-group average and population standard deviation of every column after
# the first. Input is tab-separated with a header on line 1; column 1 holds
# the group label (rows of one group must be adjacent) and "-" marks a
# missing value. Each output line is the label followed by "avg<TAB>sd" for
# each column.

# Print the summary line for group `label` from the running totals
# s (sum of x), sq (sum of x^2) and q (number of values), columns 2..nf.
function report(label, nf,    i, cnt, mean, var) {
   printf "%s", label
   for(i=2; i<=nf; i++) {
      cnt = q[i] ? q[i] : 1        # avoid dividing by zero for empty columns
      mean = s[i]/cnt
      var = sq[i]/cnt - mean^2     # E[x^2] - (E[x])^2
      if (var < 0) var = 0         # round-off can push a zero variance below 0
      printf "%s%.1f%s%.1f", OFS, mean, OFS, sqrt(var)
   }
   print ""
}
BEGIN { FS = OFS = "\t" }
NR == 1 {                          # pass the header through unchanged
   print
   next
}
c1 != "" && $1 != c1 {             # label changed: report the finished group
   report(c1, NF)
   delete s
   delete q
   delete sq
}
{
   for(i=2; i<=NF; i++) {
      if($i != "-") {
         s[i] += $i
         # The sum of squares must be accumulated row by row; squaring the
         # finished sum (s[i] * s[i]) gives a different, wrong quantity.
         sq[i] += $i * $i
         q[i]++
      }
   }
   c1 = $1                         # remember the label of the current group
}
END {
   report(c1, NF)                  # report the last group
}

由于平均值和标准差的计算有共同的步骤,我把两者合并到了同一个脚本中。请在 BEGIN 块中把变量 type 设置为您要计算的内容("ave" 或 "sd")。

# calc(c1, sx, sxx, n, nf): print one report line for the data set labelled c1.
#   sx[i], sxx[i], n[i] - sum of x, sum of x*x and sample count of column i
#   nf                  - index of the last column; columns 2..nf are reported
# Reads the global `type`: "ave" prints averages (%.1f), any other value
# prints population standard deviations (%.2f). A column without samples is
# reported as "-". The names after the wide gap in the parameter list are
# locals, not arguments; callers pass only the first five.
# Spelled `function` because the short form `func` is a gawk-only extension.
function calc(c1, sx, sxx, n, nf,    i, ave, var, sd)
{
    printf("%s", c1)                    # print label
    for (i = 2; i <= nf; i++) {
        if (n[i] == 0) {                # in case data is empty
            printf("%s-", OFS)          # print "-" as text, not via %.1f
            continue
        }
        ave = sx[i] / n[i]              # calculate based on formula
        var = sxx[i] / n[i] - ave * ave
        sd = (var < 0) ? 0 : sqrt(var)  # round-off may make variance negative
        if (type == "ave")              # report depending on the type
            printf("%s%.1f", OFS, ave)
        else
            printf("%s%.2f", OFS, sd)
    }
    printf("\n")                        # always terminate the line
}
# Driver rules: read a tab-separated table with a header on line 1, group
# adjacent rows by the label in column 1 and call calc() once per group.
BEGIN {
    FS = OFS = "\t"
    type = "ave"                        # assign to "ave" or "sd"
}
NR == 1 {                               # pass the header through unchanged
    print
    next
}
c1 != "" && $1 != c1 {                  # data set have changed
     calc(c1, sx, sxx, n, NF)           # then report with current data
     delete sx                          # initialize variables for next data
     delete sxx
     delete n
}
{
    c1 = $1                             # data set label
    for (i = 2; i <= NF; i++) {
        if ($i != "-") {                # "-" marks a missing value
            sx[i] += $i                 # sum of x
            sxx[i] += $i * $i           # sum of x * x
            n[i]++                      # count of samples
        }
    }
}
END {
    calc(c1, sx, sxx, n, NF)            # report the last data set
}

平均结果:

chr     x       y       z
1       2.0     2.5     2.7
5a      2.3     2.3     2.7

标准差的结果:

chr     x       y       z
1       0.82    0.50    0.47
5a      0.47    0.47    0.47

旁注:

  • 方差(即 sd 的平方)理论上是非负值。但是,由于舍入误差(round-off error),计算机算出的方差可能为负数。因此在对方差求平方根时需要注意这一点。
  • 当样本数 (n) 较小(例如小于 10)时,最好把 sd 计算中的分母改为 n - 1,以便更好地估计标准差。这取决于您的需求或应用场景。