AWK:请帮忙转换数据表

AWK: Help on transforming data table

我有以下名为 in.txt 的文件:

2020-01-01  fruit   banana  3.4
2020-03-02  alcohol smirnov 26.99
2020-03-10  fruit   orange  4.20
2020-04-03  fruit   orange  4.20
2021-09-01  alcohol beer    6.00
2021-08-03  fruit   mango   6.99
2022-01-01  fruit   orange  4.30
2022-03-04  alcohol beer    6.00
2022-03-03  alcohol beer    6.00
2022-04-01  fruit   mango   7.20

我想转换文件,使其内容如下所示:

                2020-01-01      2021-01-01      2022-01-01
                -2020-12-31     -2021-12-31     -2022-12-31
fruit   banana  3.40            0.00            0.00
        orange  8.40            0.00            4.30
        mango   0.00            6.99            7.20

Subt            11.80           6.99            11.50


alcohol beer    0.00            6.00            12.00
        smirnov 26.99           0.00            0.00

Subt            26.99           6.00            12.00

Total           38.79           12.99           23.50

我已经开始编写以下脚本,但不知道该如何继续:如何才能并排显示各个年份的合计列?另一个问题是,这里给出的只是示例数据。除了 fruit 和 alcohol 之外,我还有许多其他类别,为每个类别分别编写 if 语句和 for 循环显然不是正确的做法。另外,如何才能让 fruit 和 alcohol 各只打印一次(而不是在遍历第 3 列时每次都打印),并把日期范围放在顶部?非常感谢您的帮助。

#!/usr/bin/env bash
#
# Work-in-progress attempt: sum the 2020 prices of every "fruit" item.
# Input : tab-separated records "date<TAB>category<TAB>item<TAB>price",
#         read from the files given as arguments, or from stdin when no
#         argument is given.
# Output: one tab-separated line per item: fruit<TAB>item<TAB>sum.

awk '
        BEGIN {
                FS = OFS = "\t"
        }

        # $1 = date (YYYY-MM-DD), $2 = category, $3 = item, $4 = price.
        # The category is compared as a string: an unquoted word such as
        # fruit would be an empty awk variable and match every record.
        # ISO dates sort lexically, so string comparison selects the range.
        $2 == "fruit" && $1 >= "2020-01-01" && $1 <= "2020-12-31" {
                a[$3] += $4     # per-item sum
                sa += $4        # category subtotal (accumulated, not printed yet)
        }

        END {
                # gawk-only: visit the items in ascending string order.
                PROCINFO["sorted_in"] = "@ind_str_asc"
                for (i in a) {
                        print "fruit", i, a[i]
                }
        }
' "${@:--}"

我相信下面这段 awk 代码是一个好的开始。剩下要做的只是一些清理和一些额外的求和代码。

# Pivot "date category item price" records into one output row per
# (category, item) with one column per time slot, covering every slot
# between the earliest and the latest one seen in the input.
BEGIN{
   # how many divisions per year (1 = whole year, 2 = halves, 4 = quarters)
   n=1
   # initialisation of the min/max trackers
   tmax=0;tmin=999999; ymax=qmax=0;ymin=9999;qmin=99
}
# $1 = date (YYYY-MM-DD), $2 = category, $3 = item, $4 = price
# y : the year; "2020-03-10"+0 is 2020 because awk uses the numeric prefix
# q : 0-based division of the year the month falls into (always 0 when n=1)
{ y=$1+0; q=int((substr($1,6,2)-1)*n/12) }
# compute min max time (slots are encoded as year*100+division)
(y*100+q < tmin) { ymin=y;qmin=q;tmin=y*100+q }
(y*100+q > tmax) { ymax=y;qmax=q;tmax=y*100+q }
# Create arrays that keep track of everything
# a : prices by year,q,category and element
# b : just a list of categories, eg fruit
# c : just a list of elements and the category it belongs to.
# NOTE: a[] uses plain assignment, so a later record for the same cell
# replaces the earlier one; switch to += to accumulate sums instead.
{ a[y,q,$2,$3]=$4; b[$2]; c[$3]=$2 }
END{
   # loop over categories (eg fruit)
   for(i in b) {
     # loop over elements
     for(j in c) {
        # exclude elements that do not belong to category
        if (i!=c[j]) continue
        s=i OFS j;
        # loop over the time
        for (y=ymin;y<=ymax;y++) {
          for (q=0;q<n;++q) {
             # skip slots before the first or after the last one seen
             if (y*100+q < tmin) continue
             if (y*100+q > tmax) continue
             # +0 turns a missing cell into 0
             s=s OFS (a[y,q,i,j]+0)
          }
        }
        print s
     }
   }
}

当前输出:

alcohol beer 0 6 6
alcohol smirnov 26.99 0 0
fruit orange 4.2 0 4.3
fruit mango 0 6.99 7.2
fruit banana 3.4 0 0

请您尝试以下操作:

#!/bin/bash
#
# Print a yearly pivot report for in.txt.
# Input : whitespace-separated records "date category item amount",
#         where date is YYYY-MM-DD.
# Output: a two-line header with one date range per year, then for every
#         category its items with the per-year sums, a "Subt" row per
#         category and a final "Total" row.

awk '
    {
        # $1 = date, $2 = category, $3 = item, $4 = amount
        year = substr($1, 1, 4)                         # extract year
        if (from == "" || from > year) from = year      # first (smallest) year
        if (to == "" || to < year) to = year            # last (largest) year

        # Items are keyed by category AND item so that the same item name
        # in two categories is reported separately under each of them.
        if (!(($2, $3) in seen)) {
            seen[$2, $3]                                # remember the pair
            list[$2] = list[$2] fs[$2] $3               # csv of items per category
            fs[$2] = ","                                # delimiter for csv
        }
        sum[$2, $3, year] += $4                         # sum of the item in the year
        subt[$2, year] += $4                            # sum of the category in the year
        ttl[year] += $4                                 # sum in the year
    }
    END {
        if (from == "") exit                            # no input: nothing to report

        format1 = "%-10s%-10s"                          # format for the left cells
        format2 = "%-16s"                               # format for the header
        format3 = "%-16.2f"                             # format for the amounts

        # print upper header
        printf(format1, "", "")
        for (y = from; y <= to; y++) {
            printf(format2, y "-01-01")
        }
        print ""

        # print second header
        printf(format1, "", "")
        for (y = from; y <= to; y++) {
            printf(format2, "-" y "-12-31")
        }
        print ""

        for (cat in list) {                             # loop over the categories ("fruit" and "alcohol")
            n = split(list[cat], item, ",")             # split into items
            for (i = 1; i <= n; i++) {                  # loop over the items
                # the category name is shown on its first row only
                printf(format1, i == 1 ? cat : "", item[i])
                for (y = from; y <= to; y++) {          # loop over years
                    printf(format3, sum[cat, item[i], y])   # append the sum of the year
                }
                print ""                                # finally break the line
            }
            print ""                                    # insert blank line
            printf(format1, "Subt", "")
            for (y = from; y <= to; y++) {
                printf(format3, subt[cat, y])           # append the subtotal
            }
            print "\n"                                  # end the row and add a blank line
        }
        printf(format1, "Total", "")
        for (y = from; y <= to; y++) {
            printf(format3, ttl[y])                     # append the total amount
        }
        print ""
    }
' in.txt

使用所提供的输入,输出如下:

                    2020-01-01      2021-01-01      2022-01-01      
                    -2020-12-31     -2021-12-31     -2022-12-31     
alcohol   smirnov   26.99           0.00            0.00            
          beer      0.00            6.00            12.00           

Subt                26.99           6.00            12.00           

fruit     banana    3.40            0.00            0.00            
          orange    8.40            0.00            4.30            
          mango     0.00            6.99            7.20            

Subt                11.80           6.99            11.50           

Total               38.79           12.99           23.50           

请注意:商品的顺序与 OP 期望的输出不一致,敬请见谅。

使用 GNU awk 的“数组的数组”(arrays of arrays)功能:

$ cat tst.awk
# Yearly pivot of "date type item amount" records.
# Requires GNU awk: it uses true multidimensional arrays (arrays of arrays).
# Output is tab-separated: a header row of years, then for every type its
# items with the per-year sums, a "Subt" row per type and a "Total" row.
BEGIN { OFS="\t" }
{
    # $1 = date (YYYY-MM-DD), $2 = type, $3 = item, $4 = amount
    sub(/-.*/,"",$1)                       # reduce the date to its year
    minYear = ( NR==1 || $1 < minYear ? $1 : minYear )
    maxYear = ( NR==1 || $1 > maxYear ? $1 : maxYear )
    items[$2][$3]                          # set of items per type
    vals[$1][$2][$3] += $4                 # sum per year, type and item
    typeTots[$1][$2] += $4                 # sum per year and type
    yearTots[$1] += $4                     # sum per year
}
END {
    # header: two empty leading cells (type, item), then one cell per year
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%s", OFS, year
    }
    print ""

    for ( type in items ) {
        itemCnt = 0
        for ( item in items[type] ) {
            # the type name is printed on its first item row only
            printf "%s%s%s", (itemCnt++ ? "" : type), OFS, item
            for ( year=minYear; year<=maxYear; year++ ) {
                printf "%s%0.2f", OFS, vals[year][type][item]
            }
            print ""
        }
        printf "Subt%s", OFS
        for ( year=minYear; year<=maxYear; year++ ) {
            printf "%s%0.2f", OFS, typeTots[year][type]
        }
        print ORS                          # end the row and add a blank line
    }

    printf "Total%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%0.2f", OFS, yearTots[year]
    }
    print ""
}

$ awk -f tst.awk in.txt
                2020    2021    2022
alcohol beer    0.00    6.00    12.00
        smirnov 26.99   0.00    0.00
Subt            26.99   6.00    12.00

fruit   orange  8.40    0.00    4.30
        mango   0.00    6.99    7.20
        banana  3.40    0.00    0.00
Subt            11.80   6.99    11.50

Total           38.79   12.99   23.50

或者,如果您确实希望表头显示具体的日期范围,而不仅仅是年份:

$ cat tst.awk
# Yearly pivot of "date type item amount" records with a two-line header
# showing the date range of every year.
# Requires GNU awk: it uses true multidimensional arrays (arrays of arrays).
# Output is tab-separated; pipe it through column -t to align the cells.
BEGIN { OFS="\t" }
{
    # $1 = date (YYYY-MM-DD), $2 = type, $3 = item, $4 = amount
    sub(/-.*/,"",$1)                       # reduce the date to its year
    minYear = ( NR==1 || $1 < minYear ? $1 : minYear )
    maxYear = ( NR==1 || $1 > maxYear ? $1 : maxYear )
    items[$2][$3]                          # set of items per type
    vals[$1][$2][$3] += $4                 # sum per year, type and item
    typeTots[$1][$2] += $4                 # sum per year and type
    yearTots[$1] += $4                     # sum per year
}
END {
    # first header row: start date of every year
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%s-01-01", OFS, year
    }
    print ""

    # second header row: end date of every year
    printf "%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s-%s-12-31", OFS, year
    }
    print ""

    for ( type in items ) {
        itemCnt = 0
        for ( item in items[type] ) {
            # the type name is printed on its first item row only
            printf "%s%s%s", (itemCnt++ ? "" : type), OFS, item
            for ( year=minYear; year<=maxYear; year++ ) {
                printf "%s%0.2f", OFS, vals[year][type][item]
            }
            print ""
        }
        printf "Subt%s", OFS
        for ( year=minYear; year<=maxYear; year++ ) {
            printf "%s%0.2f", OFS, typeTots[year][type]
        }
        print ORS                          # end the row and add a blank line
    }

    printf "Total%s", OFS
    for ( year=minYear; year<=maxYear; year++ ) {
        printf "%s%0.2f", OFS, yearTots[year]
    }
    print ""
}

$ awk -f tst.awk in.txt | column -s$'\t' -t
                  2020-01-01   2021-01-01   2022-01-01
                  -2020-12-31  -2021-12-31  -2022-12-31
alcohol  beer     0.00         6.00         12.00
         smirnov  26.99        0.00         0.00
Subt              26.99        6.00         12.00
fruit    orange   8.40         0.00         4.30
         mango    0.00         6.99         7.20
         banana   3.40         0.00         0.00
Subt              11.80        6.99         11.50
Total             38.79        12.99        23.50