AWK:在 awk table 转换脚本中求和有问题

AWK: sums problematic in awk table transform script

我有以下文件:

cat st_in.txt 
2015-01-01  2   A   FI
2015-02-03  4   B   VI
2015-03-01  6   A   FI
2015-01-08  -4  C   VE
2016-01-05  -3  B   VE
2016-02-03  -1  D   FE
2016-04-01  -2  B   FE
2016-06-13  -5  D   VE
2017-01-01  2   A   VI
2017-02-03  3   A   VI
2017-02-04  8   C   FI
2017-01-05  -1  B   FE

并想这样改造:

        2015    2016    2017

A       8.00    0.00    0.00
C       0.00    0.00    8.00
sumFI   8.00    0.00    8.00

A       0.00    0.00    5.00
B       4.00    0.00    0.00
sumVI   4.00    0.00    5.00

sumI    12.00   0.00    13.00

B       0.00    -2.00   -1.00
D       0.00    -1.00   0.00
sumFE   0.00    -3.00   -1.00

B       0.00    -3.00   0.00
C      -4.00    0.00    0.00
D       0.00    -5.00   0.00
sumVE   -4.00   -8.00   0.00

sumE    -4.00   -11.00  -1.00

net     8.00    -11.00  12.00

为此,我编写了以下脚本:

#!/usr/bin/env bash
#
# trans1 (the script as posted in the question).
# Pivots whitespace-separated "date amount item type" rows into a table with
# one column per year: one row per item within each type, a "sum<type>" row
# per type, and a final "net" row. Needs GNU awk (uses arrays of arrays).
# Usage: trans1 [file ...]    (reads standard input when no file is given)
#
# KNOWN ISSUE, and the subject of the question: the "sumI" and "sumE" rows are
# wrong. They print W[yr][cT][c] with whatever value c was left holding by the
# item loop, i.e. a single item row of VI/VE rather than sumFI + sumVI or
# sumFE + sumVE.

awk '
BEGIN {
        OFS = "\t"
}

# Input fields: $1 = date (YYYY-MM-DD), $2 = amount, $3 = item, $4 = type.
{
        yr = substr($1, 1, 4)
        sub(/-.*/, "", $1)      # strip everything after the year, so $1 == yr
        minYr = (NR == 1 || $1 < minYr ? $1 : minYr)
        maxYr = (NR == 1 || $1 > maxYr ? $1 : maxYr)
        H[$4][$3]               # remember which items occur under each type
        W[yr][$4][$3] += $2     # amount per year, type and item
        yT[$1] += $2            # amount per year (the "net" row)
        val[$1][$4] += $2       # amount per year and type ("sum<type>" rows)
}

END {
        # Header: one column per year between the smallest and largest year.
        for (yr = minYr; yr <= maxYr; yr++) {
                printf "%s%s", OFS, yr
        }
        print ""
        print ""
        for (cT in H) {
                for (c in H[cT]) {
                        # Print the label through "%s" so that its text is
                        # never interpreted as a printf format.
                        printf "%s", c
                        for (yr = minYr; yr <= maxYr; yr++) {
                                printf "%s%0.2f", OFS, W[yr][cT][c]
                        }
                        print ""
                }
                printf "sum%s", cT
                for (yr = minYr; yr <= maxYr; yr++) {
                        printf "%s%0.2f", OFS, val[yr][cT]
                }
                print ""
                print ""
                if (cT == "VI") {
                        printf "sumI"
                        for (yr = minYr; yr <= maxYr; yr++) {
                                # KNOWN ISSUE: c is stale here (see header).
                                printf "%s%0.2f", OFS, W[yr][cT][c]
                        }
                } else if (cT == "VE") {
                        printf "sumE"
                        for (yr = minYr; yr <= maxYr; yr++) {
                                # KNOWN ISSUE: c is stale here (see header).
                                printf "%s%0.2f", OFS, W[yr][cT][c]
                        }
                }
                # print appends ORS itself, so this emits two newlines.
                print ORS
        }
        printf "net"
        for (yr = minYr; yr <= maxYr; yr++) {
                printf "%s%0.2f", OFS, yT[yr]
        }
        print ""
}
' "${@:--}"

打印以下内容:

./trans1 st_in.txt 
    2015    2016    2017

A   8.00    0.00    0.00
C   0.00    0.00    8.00
sumFI   8.00    0.00    8.00



A   0.00    0.00    5.00
B   4.00    0.00    0.00
sumVI   4.00    0.00    5.00

sumI    4.00    0.00    0.00

B   0.00    -2.00   -1.00
D   0.00    -1.00   0.00
sumFE   0.00    -3.00   -1.00



B   0.00    -3.00   0.00
C   -4.00   0.00    0.00
D   0.00    -5.00   0.00
sumVE   -4.00   -8.00   0.00

sumE    0.00    -5.00   0.00

net 8.00    -11.00  12.00

我并不担心这里的格式,控制台的输出已经和目标大致相似。我一直解决不了的是:如何得到 sumI(sumFI + sumVI)和 sumE(sumFE + sumVE)的正确总和。有人可以帮忙吗?


Ed Morton 编辑:为了使用有意义的数据类型和变量名称,这里假设 OP 当前数据的各列依次为 Date、Amount、Item、Type,如下所示:
$ cat trans1
#!/usr/bin/env bash
#
# trans1 (the question script with descriptive variable names).
# Pivots whitespace-separated "date amount item type" rows into a table with
# one column per year: one row per item within each type, a "sum<type>" row
# per type, and a final "net" row. Needs GNU awk (uses arrays of arrays).
# Usage: trans1 [file ...]    (reads standard input when no file is given)
#
# The logic is that of the question script, including its defect: the lines
# marked NOTE below still print the wrong values for "sumI" and "sumE".

awk '
BEGIN {
    OFS = "\t"
}

{
    date   = $1
    amount = $2
    item   = $3
    type   = $4

    year = substr(date, 1, 4)
    minYear = (NR == 1 || year < minYear ? year : minYear)
    maxYear = (NR == 1 || year > maxYear ? year : maxYear)

    types_items[type][item]     # remember which items occur under each type
    yearsTypesItems2amounts[year][type][item] += amount
    years2amounts[year]                       += amount
    yearsTypes2amounts[year][type]            += amount
}

END {
    # Header: one column per year between the smallest and largest year.
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%s", OFS, year
    }
    print ""
    print ""
    for (type in types_items) {
        for (item in types_items[type]) {
            # Print the label through "%s" so that its text is never
            # interpreted as a printf format.
            printf "%s", item
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, yearsTypesItems2amounts[year][type][item]
            }
            print ""
        }
        printf "sum%s", type
        for (year = minYear; year <= maxYear; year++) {
            printf "%s%0.2f", OFS, yearsTypes2amounts[year][type]
        }
        print ""
        print ""
        if (type == "VI") {
            printf "sumI"
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, yearsTypesItems2amounts[year][type][item] # <--- NOTE: item is not set by a loop here (stale value from the item loop above)
            }
        } else if (type == "VE") {
            printf "sumE"
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, yearsTypesItems2amounts[year][type][item] # <--- NOTE: item is not set by a loop here (stale value from the item loop above)
            }
        }
        # print appends ORS itself, so this emits two newlines.
        print ORS
    }
    printf "net"
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%0.2f", OFS, years2amounts[year]
    }
    print ""
}
' "${@:--}"

请参阅上面代码中的 “NOTE” 注释:经过重命名之后,可以明显看出部分错误所在(看起来您用错了数组,应该使用 yearsTypes2amounts[year][type],而不是 yearsTypesItems2amounts[year][type][item])。

下面继续使用 GNU awk 的“数组的数组”(arrays of arrays),并同样假设 OP 当前数据的各列依次为 Date、Amount、Item、Type,如下所示:
$ cat trans1
#!/usr/bin/env bash
#
# trans1: pivot whitespace-separated "date amount item type" rows into a
# per-year table.
#
# Output, tab-separated, one column per year from the smallest to the largest
# year seen:
#   - for each category (the type without its first character, e.g. "I", "E"):
#       - for each type in it (e.g. "FI", "VI"): one row per item, followed by
#         a "sum<type>" row;
#       - a "sum<category>" row adding up the type sums;
#   - a final "net" row adding up the category sums.
# Nothing is printed when the input contains no records.
#
# Needs GNU awk (arrays of arrays, PROCINFO["sorted_in"]).
# Usage: trans1 [file ...]    (reads standard input when no file is given)

awk '
BEGIN {
    OFS = "\t"
    # Visit array indices in ascending string order so that the order of the
    # sections and rows is deterministic; the order of a for-in loop is
    # otherwise unspecified.
    PROCINFO["sorted_in"] = "@ind_str_asc"
}

# Ignore empty lines so that they cannot contribute a bogus year.
!NF { next }

{
    date   = $1
    amount = $2
    item   = $3
    type   = $4

    # "+ 0" makes year a number, so the comparisons below and in the END
    # loops are numeric rather than string comparisons.
    year = substr(date, 1, 4) + 0
    if (minYear == "" || year < minYear) minYear = year
    if (maxYear == "" || year > maxYear) maxYear = year

    cat = substr(type, 2)
    ctiys2amounts[cat][type][item][year] += amount
}

END {
    # No records were read: there are no year columns to print.
    if (minYear == "") {
        exit
    }

    for (year = minYear; year <= maxYear; year++) {
        printf "%s%s", OFS, year
    }
    # print appends ORS itself, so "print ORS" ends the current line and adds
    # one empty line.
    print ORS
    for (cat in ctiys2amounts) {
        delete catSum
        for (type in ctiys2amounts[cat]) {
            delete typeSum
            for (item in ctiys2amounts[cat][type]) {
                printf "%s", item
                for (year = minYear; year <= maxYear; year++) {
                    # A year without data for this item yields "", printed
                    # as 0.00.
                    amount = ctiys2amounts[cat][type][item][year]
                    printf "%s%0.2f", OFS, amount
                    typeSum[year] += amount
                }
                print ""
            }
            printf "sum%s", type
            for (year = minYear; year <= maxYear; year++) {
                printf "%s%0.2f", OFS, typeSum[year]
                catSum[year] += typeSum[year]
            }
            print ORS
        }
        printf "sum%s", cat
        for (year = minYear; year <= maxYear; year++) {
            printf "%s%0.2f", OFS, catSum[year]
            yearSum[year] += catSum[year]
        }
        print ORS
    }
    printf "net"
    for (year = minYear; year <= maxYear; year++) {
        printf "%s%0.2f", OFS, yearSum[year]
    }
    print ""
}
' "${@:--}"

$ ./trans1 st_in.txt
        2015    2016    2017

B       0.00    -2.00   -1.00
D       0.00    -1.00   0.00
sumFE   0.00    -3.00   -1.00

B       0.00    -3.00   0.00
C       -4.00   0.00    0.00
D       0.00    -5.00   0.00
sumVE   -4.00   -8.00   0.00

sumE    -4.00   -11.00  -1.00

A       8.00    0.00    0.00
C       0.00    0.00    8.00
sumFI   8.00    0.00    8.00

A       0.00    0.00    5.00
B       4.00    0.00    0.00
sumVI   4.00    0.00    5.00

sumI    12.00   0.00    13.00

net     8.00    -11.00  12.00