根据搜索模式添加两列的 awk 脚本
Awk script to add two columns depending on search pattern
我有以下 awk 脚本:
#! /usr/bin/awk -f
# Tag negative-amount records whose column 4 matches a category pattern by
# appending the category name and the expense type, then print the record.
# The field references ($3, $4, $6, $7, $0) had been stripped from the two
# rule lines, which made them unparseable; they are restored here.
# NOTE(review): the column numbers are reconstructed from the sample data
# (amount in column 3, matched text in column 4, five input columns) - confirm.
BEGIN{
    FS=OFS="\t";
    # arr1[n] is the regex that selects the expense type stored in arr2[n].
    split("Mtm|^Dept|^Im|^TT|ge[mn]$,^Mini", arr1, ",")
    split("Variable Expenses,Fixed Expenses", arr2, ",")
    itm="Accounting & Legal";
}
$3<0 && $4~arr1[1] {$6=itm; $7=arr2[1]; print $0}
$3<0 && $4~arr1[2] {$6=itm; $7=arr2[2]; print $0}
产生以下输出:
c_wp_e_2025 2021-05-10 -120.00 8430Ken Jebsen BILL PAYMENT 12:08-11662 Accounting & Legal Variable Expenses
c_wp_e_2148 2021-07-27 -90.85 6450Barrie L. Jorgen BILL PAYMENT 09:31-35651 Accounting & Legal Variable Expenses
e_wp_e_789 2018-06-12 -190.00 Immigration NZ 11 DEBIT Accounting & Legal Variable Expenses
e_wp_e_818 2018-06-29 -153.43 Dept of Home Affa 26 DEBIT AUD 141.37 Accounting & Legal Variable Expenses
e_wp_e_839 2018-07-18 -67.47 TT Services New Z 16 DEBIT Accounting & Legal Variable Expenses
e_wp_e_2512 2021-07-27 -470.20 Dept Internal Aff 23 DEBIT Accounting & Legal Variable Expenses
c_az_262 2021-10-01 -210.45 Mtm Accounting Ltd. Mlm Accounting Accounting & Legal Variable Expenses
c_az_421 2021-07-19 -561.20 Paypal *Mtm Paypal *Mtm Accounting & Legal Variable Expenses
c_az_1082 2020-11-25 -52.20 Ministryofbusiness 285515Dpsa2A Mbie Accounting & Legal Fixed Expenses
c_az_1110 2020-11-16 -2228.70 Mtm Accounting Ltd. Mlm Accounting Accounting & Legal Variable Expenses
这可以正常工作。然而,除了 itm="Accounting & Legal" 之外,我还有更多类别(例如 itm="Bank_charges")。有些类别同时包含固定费用和可变费用;另一些类别只有其中一种,此时 split 生成的数组只有一个元素,主体中的第二条规则也就用不上了。每个新类别都有各自不同的搜索模式。我是编写 awk 程序的新手,一直在研究该如何处理这个问题。怎样才能高效地编写脚本来适应上述各种情况?
输入(摘录)将是:
c_wp_e_2025 2021-05-10 -120.00 8430Ken Jebsen BILL PAYMENT 12:08-11662
c_wp_e_2148 2021-07-27 -90.85 6450Barrie L. Jorgen BILL PAYMENT 09:31-35651
e_wp_e_789 2018-06-12 -190.00 Immigration NZ 11 DEBIT
e_wp_e_818 2018-06-29 -153.43 Dept of Home Affa 26 DEBIT AUD 141.37
e_wp_e_839 2018-07-18 -67.47 TT Services New Z 16 DEBIT
e_wp_e_2512 2021-07-27 -470.20 Dept Internal Aff 23 DEBIT
c_az_262 2021-10-01 -210.45 Mtm Accounting Ltd. Mlm Accounting
c_az_421 2021-07-19 -561.20 Paypal *Mtm Paypal *Mtm
c_az_1082 2020-11-25 -52.20 Ministryofbusiness 285515Dpsa2A Mbie
c_az_1110 2020-11-16 -2228.70 Mtm Accounting Ltd. Mlm Accounting
Bank_charges 的标准,例如将是:
# Bank_charges criteria: arr1[1] selects "Variable Expenses" and arr1[2]
# selects "Fixed Expenses".
# The pattern string is kept on one line because a backslash-newline inside a
# string constant is not portable across awk implementations; the space that
# preceded the continuation is preserved (" wtown W").
# "[*]M" matches a literal "*M"; a bare "*" directly after "|" is undefined
# in POSIX extended regular expressions.
split("^Cle|Forei|2 WBC|irnie W| wtown W|[*]M|^Repl|elex|^Unar|lert$,enance$|ebit|A/C|rice$", arr1, ",")
split("Variable Expenses,Fixed Expenses", arr2, ",")
itm="Bank_charges"
此条件的一些输出将导致:
c_az_1668 2020-03-06 -10.00 Visa Debit Card Fee 4825561****** 4823 Bank_charges Fixed Expenses
c_az_1687 2020-02-28 -8.50 Monthly A/C Fee Bank_charges Fixed Expenses
c_az_1688 2020-02-28 -2.50 Clearance Fee Bank_charges Variable Expenses
c_az_1785 2020-01-31 -8.50 Monthly A/C Fee Bank_charges Fixed Expenses
另一个只有可变费用的类别是:
split("^318|^74 |ASB|^City S|^Fix", arr1,",")
split("Variable Expenses", arr2,",")
itm="Bank_withdrawals"
然而,上述做法对这个类别完全不起作用:输入文件中的所有数据都被归入 Bank_withdrawals,并被标记为可变费用。
如果你是 awk 新手,最好先看看不用数组、完整展开的写法:
#! /usr/bin/awk -f
# Long-hand version without arrays: one rule per (pattern, expense type) pair.
# Each rule appends the category and expense type to a negative-amount record
# whose column 4 matches the pattern, then prints the record.
# The stripped field references are restored, and the stray ";" that split
# the first category assignment in two has been removed.
# NOTE(review): the column numbers are reconstructed from the sample data
# (amount in column 3, matched text in column 4, five input columns) - confirm.
BEGIN{ FS=OFS="\t"; }
$3<0 && $4~"Mtm|^Dept|^Im|^TT|ge[mn]$" {$6="Accounting & Legal"; $7="Variable Expenses"; print $0}
$3<0 && $4~"^Mini" {$6="Accounting & Legal"; $7="Fixed Expenses"; print $0}
(如有笔误请见谅,我是在手机上写的。)
不过我认为这并不是你真正想要的,所以我建议像下面这样,循环遍历在 BEGIN 块中创建的数组元素:
#! /usr/bin/awk -f
# Loop over parallel arrays built in BEGIN: arr1[i] is the regex, arr2[i] the
# expense type and arr3[i] the category written for a match on arr1[i].
# Fixes relative to the earlier form of this script:
#   - the category list is now ONE comma-separated string, so split() gets
#     its (string, array, separator) arguments in the right positions;
#   - the for loop lives inside an action, because awk does not allow
#     statements or pattern-action rules at the top level of a loop;
#   - the stripped field references are restored.
# NOTE(review): the column numbers are reconstructed from the sample data
# (amount in column 3, matched text in column 4, five input columns) - confirm.
BEGIN{
    FS=OFS="\t";
    els=split("Mtm|^Dept|^Im|^TT|ge[mn]$,^Mini", arr1, ",");
    split("Variable Expenses,Fixed Expenses", arr2, ",");
    split("Accounting & Legal,Accounting & Legal", arr3, ",");
}
$3<0 {
    # A record is printed once for every pattern it matches.
    for (i=1;i<=els;i++){
        if ($4~arr1[i]) {$6=arr3[i]; $7=arr2[i]; print $0}
    }
}
请您尝试以下操作:
#!/usr/bin/awk -f
# Classify negative-amount records by matching column 4 against per-category
# regexes, appending the category (column 6) and expense type (column 7).
# A record is printed once for every pattern it matches.
# The stripped field references ($3, $4, $6, $7, $0) are restored, and a
# category may now list any number of comma-separated patterns instead of
# at most two.
# NOTE(review): the column numbers are reconstructed from the sample data
# (amount in column 3, matched text in column 4, five input columns) - confirm.
BEGIN {
    FS=OFS="\t";
    # enumerate the item, keyword and the statement in order
    itm = "Accounting & Legal"
    key[itm] = "Mtm|^Dept|^Im|^TT|ge[mn]$,^Mini"
    stmnt[itm] = "Variable Expenses,Fixed Expenses"
    itm = "Bank_charges"
    # "[*]M" matches a literal "*M"; a bare "*" directly after "|" is
    # undefined in POSIX extended regular expressions.
    key[itm] = "^Cle|Forei|2 WBC|irnie W|wtown W|[*]M|^Repl|elex|^Unar|lert$,enance$|ebit|A/C|rice$"
    stmnt[itm] = "Variable Expenses,Fixed Expenses"
    itm="Bank_withdrawals"
    key[itm] = "^318|^74 |ASB|^City S|^Fix"
    stmnt[itm] = "Variable Expenses"
    # initialize key2 and stmnt2 using itm, key and stmnt defined above;
    # cnt[itm] records how many patterns the category has
    for (itm in key) {
        cnt[itm] = split(key[itm], a, ",")
        for (i = 1; i <= cnt[itm]; i++) {
            key2[itm,i] = a[i]
        }
        split(stmnt[itm], a, ",")
        for (i = 1; i <= cnt[itm]; i++) {
            stmnt2[itm,i] = a[i]
        }
    }
}
# main loop with the input lines
$3 < 0 {
    for (itm in key) {
        for (i = 1; i <= cnt[itm]; i++) {
            # an empty pattern would match every record, so skip it
            if (key2[itm,i] != "" && $4 ~ key2[itm,i]) {
                $6 = itm; $7 = stmnt2[itm,i]
                print $0
            }
        }
    }
}
如果还有更多类别,请参照现有代码的写法,
在 “Bank_charges” 相关几行之后追加对应的 key 和 stmnt。
如果某个类别只有 "Variable Expenses" 或 "Fixed Expenses" 其中一种,
只需填写一个元素,不要加逗号。
我认为这就是你想要的,但它显然未经测试,因为你的问题中没有可供测试的具体输入样本和预期输出:
$ cat tst.sh
#!/usr/bin/env bash
#
# Usage: tst.sh [file ...]      (reads standard input when no file is given)
#
# Appends a category (column 6) and an expense type (column 7) to every
# tab-separated record whose amount (column 3) is negative and whose column 4
# matches one of the registered regexes. Records that match nothing, or whose
# amount is not negative, are not printed.
#
# NOTE(review): the awk field references had been stripped from this script;
# the column numbers are reconstructed from the sample data (amount in
# column 3, matched text in column 4, five input columns) - confirm.

#######################################
# Classify expense records.
# Arguments: input files; "-" (stdin) is used when none are given
# Outputs:   the classified records on stdout
#######################################
categorize() {
  awk '
    BEGIN {
      FS = OFS = "\t"
      mkMap("Mtm|^Dept|^Im|^TT|ge[mn]$", "Variable Expenses", "Accounting & Legal")
      mkMap("^Mini", "Fixed Expenses", "Accounting & Legal")
      mkMap("^Cle|Forei|2 WBC|irnie W| wtown W|[*]M|^Repl|elex|^Unar|lert$", "Variable Expenses", "Bank_charges")
      mkMap("enance$|ebit|A/C|rice$", "Fixed Expenses", "Bank_charges")
      mkMap("^318|^74 |ASB|^City S|^Fix", "Variable Expenses", "Bank_withdrawals")
    }

    $3 < 0 {
      # Rules are tried in registration order and the first match wins, so a
      # record that matches several regexes is always classified the same
      # way (the traversal order of "for (re in array)" is unspecified).
      for ( i = 1; i <= numRes; i++ ) {
        if ( $4 ~ res[i] ) {
          $6 = re2item[res[i]]
          $7 = re2type[res[i]]
          print
          next
        }
      }
    }

    # Register regex "re" as selecting expense type "type" in category "item".
    function mkMap(re, type, item) {
      if ( !(re in re2type) ) {
        res[++numRes] = re
      }
      re2type[re] = type
      re2item[re] = item
    }
  ' "${@:--}"
}

categorize "$@"
我有以下 awk 脚本:
#! /usr/bin/awk -f
# Tag negative-amount records whose column 4 matches a category pattern by
# appending the category name and the expense type, then print the record.
# The field references ($3, $4, $6, $7, $0) had been stripped from the two
# rule lines, which made them unparseable; they are restored here.
# NOTE(review): the column numbers are reconstructed from the sample data
# (amount in column 3, matched text in column 4, five input columns) - confirm.
BEGIN{
    FS=OFS="\t";
    # arr1[n] is the regex that selects the expense type stored in arr2[n].
    split("Mtm|^Dept|^Im|^TT|ge[mn]$,^Mini", arr1, ",")
    split("Variable Expenses,Fixed Expenses", arr2, ",")
    itm="Accounting & Legal";
}
$3<0 && $4~arr1[1] {$6=itm; $7=arr2[1]; print $0}
$3<0 && $4~arr1[2] {$6=itm; $7=arr2[2]; print $0}
产生以下输出:
c_wp_e_2025 2021-05-10 -120.00 8430Ken Jebsen BILL PAYMENT 12:08-11662 Accounting & Legal Variable Expenses
c_wp_e_2148 2021-07-27 -90.85 6450Barrie L. Jorgen BILL PAYMENT 09:31-35651 Accounting & Legal Variable Expenses
e_wp_e_789 2018-06-12 -190.00 Immigration NZ 11 DEBIT Accounting & Legal Variable Expenses
e_wp_e_818 2018-06-29 -153.43 Dept of Home Affa 26 DEBIT AUD 141.37 Accounting & Legal Variable Expenses
e_wp_e_839 2018-07-18 -67.47 TT Services New Z 16 DEBIT Accounting & Legal Variable Expenses
e_wp_e_2512 2021-07-27 -470.20 Dept Internal Aff 23 DEBIT Accounting & Legal Variable Expenses
c_az_262 2021-10-01 -210.45 Mtm Accounting Ltd. Mlm Accounting Accounting & Legal Variable Expenses
c_az_421 2021-07-19 -561.20 Paypal *Mtm Paypal *Mtm Accounting & Legal Variable Expenses
c_az_1082 2020-11-25 -52.20 Ministryofbusiness 285515Dpsa2A Mbie Accounting & Legal Fixed Expenses
c_az_1110 2020-11-16 -2228.70 Mtm Accounting Ltd. Mlm Accounting Accounting & Legal Variable Expenses
这可以正常工作。然而,除了 itm="Accounting & Legal" 之外,我还有更多类别(例如 itm="Bank_charges")。有些类别同时包含固定费用和可变费用;另一些类别只有其中一种,此时 split 生成的数组只有一个元素,主体中的第二条规则也就用不上了。每个新类别都有各自不同的搜索模式。我是编写 awk 程序的新手,一直在研究该如何处理这个问题。怎样才能高效地编写脚本来适应上述各种情况?
输入(摘录)将是:
c_wp_e_2025 2021-05-10 -120.00 8430Ken Jebsen BILL PAYMENT 12:08-11662
c_wp_e_2148 2021-07-27 -90.85 6450Barrie L. Jorgen BILL PAYMENT 09:31-35651
e_wp_e_789 2018-06-12 -190.00 Immigration NZ 11 DEBIT
e_wp_e_818 2018-06-29 -153.43 Dept of Home Affa 26 DEBIT AUD 141.37
e_wp_e_839 2018-07-18 -67.47 TT Services New Z 16 DEBIT
e_wp_e_2512 2021-07-27 -470.20 Dept Internal Aff 23 DEBIT
c_az_262 2021-10-01 -210.45 Mtm Accounting Ltd. Mlm Accounting
c_az_421 2021-07-19 -561.20 Paypal *Mtm Paypal *Mtm
c_az_1082 2020-11-25 -52.20 Ministryofbusiness 285515Dpsa2A Mbie
c_az_1110 2020-11-16 -2228.70 Mtm Accounting Ltd. Mlm Accounting
Bank_charges 的标准,例如将是:
# Bank_charges criteria: arr1[1] selects "Variable Expenses" and arr1[2]
# selects "Fixed Expenses".
# The pattern string is kept on one line because a backslash-newline inside a
# string constant is not portable across awk implementations; the space that
# preceded the continuation is preserved (" wtown W").
# "[*]M" matches a literal "*M"; a bare "*" directly after "|" is undefined
# in POSIX extended regular expressions.
split("^Cle|Forei|2 WBC|irnie W| wtown W|[*]M|^Repl|elex|^Unar|lert$,enance$|ebit|A/C|rice$", arr1, ",")
split("Variable Expenses,Fixed Expenses", arr2, ",")
itm="Bank_charges"
此条件的一些输出将导致:
c_az_1668 2020-03-06 -10.00 Visa Debit Card Fee 4825561****** 4823 Bank_charges Fixed Expenses
c_az_1687 2020-02-28 -8.50 Monthly A/C Fee Bank_charges Fixed Expenses
c_az_1688 2020-02-28 -2.50 Clearance Fee Bank_charges Variable Expenses
c_az_1785 2020-01-31 -8.50 Monthly A/C Fee Bank_charges Fixed Expenses
另一个只有可变费用的类别是:
split("^318|^74 |ASB|^City S|^Fix", arr1,",")
split("Variable Expenses", arr2,",")
itm="Bank_withdrawals"
然而,上述做法对这个类别完全不起作用:输入文件中的所有数据都被归入 Bank_withdrawals,并被标记为可变费用。
如果你是 awk 新手,最好先看看不用数组、完整展开的写法:
#! /usr/bin/awk -f
# Long-hand version without arrays: one rule per (pattern, expense type) pair.
# Each rule appends the category and expense type to a negative-amount record
# whose column 4 matches the pattern, then prints the record.
# The stripped field references are restored, and the stray ";" that split
# the first category assignment in two has been removed.
# NOTE(review): the column numbers are reconstructed from the sample data
# (amount in column 3, matched text in column 4, five input columns) - confirm.
BEGIN{ FS=OFS="\t"; }
$3<0 && $4~"Mtm|^Dept|^Im|^TT|ge[mn]$" {$6="Accounting & Legal"; $7="Variable Expenses"; print $0}
$3<0 && $4~"^Mini" {$6="Accounting & Legal"; $7="Fixed Expenses"; print $0}
(如有笔误请见谅,我是在手机上写的。)
不过我认为这并不是你真正想要的,所以我建议像下面这样,循环遍历在 BEGIN 块中创建的数组元素:
#! /usr/bin/awk -f
# Loop over parallel arrays built in BEGIN: arr1[i] is the regex, arr2[i] the
# expense type and arr3[i] the category written for a match on arr1[i].
# Fixes relative to the earlier form of this script:
#   - the category list is now ONE comma-separated string, so split() gets
#     its (string, array, separator) arguments in the right positions;
#   - the for loop lives inside an action, because awk does not allow
#     statements or pattern-action rules at the top level of a loop;
#   - the stripped field references are restored.
# NOTE(review): the column numbers are reconstructed from the sample data
# (amount in column 3, matched text in column 4, five input columns) - confirm.
BEGIN{
    FS=OFS="\t";
    els=split("Mtm|^Dept|^Im|^TT|ge[mn]$,^Mini", arr1, ",");
    split("Variable Expenses,Fixed Expenses", arr2, ",");
    split("Accounting & Legal,Accounting & Legal", arr3, ",");
}
$3<0 {
    # A record is printed once for every pattern it matches.
    for (i=1;i<=els;i++){
        if ($4~arr1[i]) {$6=arr3[i]; $7=arr2[i]; print $0}
    }
}
请您尝试以下操作:
#!/usr/bin/awk -f
# Classify negative-amount records by matching column 4 against per-category
# regexes, appending the category (column 6) and expense type (column 7).
# A record is printed once for every pattern it matches.
# The stripped field references ($3, $4, $6, $7, $0) are restored, and a
# category may now list any number of comma-separated patterns instead of
# at most two.
# NOTE(review): the column numbers are reconstructed from the sample data
# (amount in column 3, matched text in column 4, five input columns) - confirm.
BEGIN {
    FS=OFS="\t";
    # enumerate the item, keyword and the statement in order
    itm = "Accounting & Legal"
    key[itm] = "Mtm|^Dept|^Im|^TT|ge[mn]$,^Mini"
    stmnt[itm] = "Variable Expenses,Fixed Expenses"
    itm = "Bank_charges"
    # "[*]M" matches a literal "*M"; a bare "*" directly after "|" is
    # undefined in POSIX extended regular expressions.
    key[itm] = "^Cle|Forei|2 WBC|irnie W|wtown W|[*]M|^Repl|elex|^Unar|lert$,enance$|ebit|A/C|rice$"
    stmnt[itm] = "Variable Expenses,Fixed Expenses"
    itm="Bank_withdrawals"
    key[itm] = "^318|^74 |ASB|^City S|^Fix"
    stmnt[itm] = "Variable Expenses"
    # initialize key2 and stmnt2 using itm, key and stmnt defined above;
    # cnt[itm] records how many patterns the category has
    for (itm in key) {
        cnt[itm] = split(key[itm], a, ",")
        for (i = 1; i <= cnt[itm]; i++) {
            key2[itm,i] = a[i]
        }
        split(stmnt[itm], a, ",")
        for (i = 1; i <= cnt[itm]; i++) {
            stmnt2[itm,i] = a[i]
        }
    }
}
# main loop with the input lines
$3 < 0 {
    for (itm in key) {
        for (i = 1; i <= cnt[itm]; i++) {
            # an empty pattern would match every record, so skip it
            if (key2[itm,i] != "" && $4 ~ key2[itm,i]) {
                $6 = itm; $7 = stmnt2[itm,i]
                print $0
            }
        }
    }
}
如果还有更多类别,请参照现有代码的写法,在 “Bank_charges” 相关几行之后追加对应的 key 和 stmnt。如果某个类别只有 "Variable Expenses" 或 "Fixed Expenses" 其中一种,只需填写一个元素,不要加逗号。
我认为这就是你想要的,但它显然未经测试,因为你的问题中没有可供测试的具体输入样本和预期输出:
$ cat tst.sh
#!/usr/bin/env bash
#
# Usage: tst.sh [file ...]      (reads standard input when no file is given)
#
# Appends a category (column 6) and an expense type (column 7) to every
# tab-separated record whose amount (column 3) is negative and whose column 4
# matches one of the registered regexes. Records that match nothing, or whose
# amount is not negative, are not printed.
#
# NOTE(review): the awk field references had been stripped from this script;
# the column numbers are reconstructed from the sample data (amount in
# column 3, matched text in column 4, five input columns) - confirm.

#######################################
# Classify expense records.
# Arguments: input files; "-" (stdin) is used when none are given
# Outputs:   the classified records on stdout
#######################################
categorize() {
  awk '
    BEGIN {
      FS = OFS = "\t"
      mkMap("Mtm|^Dept|^Im|^TT|ge[mn]$", "Variable Expenses", "Accounting & Legal")
      mkMap("^Mini", "Fixed Expenses", "Accounting & Legal")
      mkMap("^Cle|Forei|2 WBC|irnie W| wtown W|[*]M|^Repl|elex|^Unar|lert$", "Variable Expenses", "Bank_charges")
      mkMap("enance$|ebit|A/C|rice$", "Fixed Expenses", "Bank_charges")
      mkMap("^318|^74 |ASB|^City S|^Fix", "Variable Expenses", "Bank_withdrawals")
    }

    $3 < 0 {
      # Rules are tried in registration order and the first match wins, so a
      # record that matches several regexes is always classified the same
      # way (the traversal order of "for (re in array)" is unspecified).
      for ( i = 1; i <= numRes; i++ ) {
        if ( $4 ~ res[i] ) {
          $6 = re2item[res[i]]
          $7 = re2type[res[i]]
          print
          next
        }
      }
    }

    # Register regex "re" as selecting expense type "type" in category "item".
    function mkMap(re, type, item) {
      if ( !(re in re2type) ) {
        res[++numRes] = re
      }
      re2type[re] = type
      re2item[re] = item
    }
  ' "${@:--}"
}

categorize "$@"