awk 比较两个文件中的两列并将调整后的不匹配追加到输出中

awk compare two columns in two files and append adjusted non matched into output

你好,我正在尝试比较文件 1 和文件 2,

  1. 比较文件 1 的列 $1,$2 和文件 2 的 $1,$3
  2. 如果有匹配项,就把文件 1 和文件 2 的内容写入输出文件:比较时从 file1 复制列 $1、$2、$7、$9,从 file2 复制列 $1、$3、$6、$7、$8 到结果输出文件
  3. 如果没有匹配项,它也会将 file2 中剩余的不匹配列添加到输出文件中
  4. 最后,它会在输出的第 5 列($5)中添加一个递增的结果值
  5. 列中的文件

我用下面的命令实现了其中一部分:

# Partial attempt from the question: join file2 lines to file1 lines on the
# pair (site, service), i.e. file1 $1,$2 against file2 $1,$3.
#   - file1 pass (NR==FNR): store each whole line under the key ($1,$2); a later
#     file1 line with the same key replaces the earlier one.
#   - file2 pass: on a key hit print the stored file1 line followed by the file2
#     line and drop the key; otherwise print the file2 line unchanged.
#   - END: print the file1 lines whose key never showed up in file2.
awk 'NR==FNR {a[$1,$2]=$0; next}
             {if(($1,$3) in a)
             {print a[$1,$3],$0; delete a[$1,$3]}
             else print $0}
     END    {for(k in a) print a[k]}' file1 file2

文件 1

SITE-A SERV-A AA 1.00 PPA IP 98a7df9asd7f FEX 98a7df9asd7f_a     
SITE-A SERV-A AA 1.00 PPA IP 98a7df9asd7g FEX 98a7df9asd7f_b     
SITE-A SERV-A AA 1.00 PPA IP 98a7df9asd7h FEX 98a7df9asd7f_c     
SITE-B SERV-A BB 1.00 DF IP a7sdf9899hhh FEX a7sdf9899hhh_a     
SITE-B SERV-A BB 1.00 DF IP a7sdf9899hhf FEX a7sdf9899hhh_b     
SITE-B SERV-A BB 1.00 AF IP a7sdf9899hhm FEX a7sdf9899hhh_c 

文件 2

SITE-A 17 SERV-A 0 39 idx a7sdf9899778 0 0 out_fan pri
SITE-A 17 SERV-A 1 1 test a7sdf9899779 1 0 out_fan pri
SITE-A 17 SERV-A 2 32 dummy_host a7sdf9899770 2 0 out_fan pri
SITE-C 22 SERV-A 2 519 dummy_host a7sdf9899772 2 2 out_fan pri  
SITE-C 22 SERV-A 3 520 prod a7sdf9899775 3 out_fan pri  
SITE-C 22 SERV-A 4 521 dev a7sdf9899774 4 out_fan pri 

期望的输出:

SITE-A SERV-A idx a7sdf9899778 0
SITE-A SERV-A test a7sdf9899779 1
SITE-A SERV-A dummy_host a7sdf9899770 2
SITE-A SERV-A 98a7df9asd7f_a 98a7df9asd7f 3
SITE-A SERV-A 98a7df9asd7f_b 98a7df9asd7g 4
SITE-A SERV-A 98a7df9asd7f_c 98a7df9asd7h 5
SITE-B SERV-A a7sdf9899hhh_a a7sdf9899hhh 0
SITE-B SERV-A a7sdf9899hhh_b a7sdf9899hhf 1
SITE-B SERV-A a7sdf9899hhh_c a7sdf9899hhm 2
SITE-C SERV-A dummy_host a7sdf9899772 2
SITE-C SERV-A prod a7sdf9899775 3
SITE-C SERV-A dev a7sdf9899774 4 
$ cat tst.awk
# Merge two files on a two-field key.  Run as:  awk -f tst.awk file2 file1
#
# First file on the command line (file2): the key is $1 and $3.  For every
# line, a ready-to-print "key $6 $7 $8" record is appended to a[key] and the
# number of lines per key is counted in cnt[key].
# Second file (file1): the key is $1 and $2.  Whenever the key differs from the
# previous line's key, the buffered file2 records for it are flushed first.
# Each file1 line is then printed as "key $6 $7 $8 counter", where the counter
# continues from the number of file2 lines seen for that key (0 if none).
# END: buffered file2 records whose key never occurred in file1 are printed.
NR==FNR {
    key = $1 FS $3
    a[key] = a[key] key OFS $6 OFS $7 OFS $8 ORS    # one complete output line per input line
    cnt[key]++      # or cnt[key] = $8 + 1
    next
}
{
    key = $1 FS $2
    if ( key != prev ) {
        printf "%s", a[key]     # file2 records for this key come first
        delete a[key]           # so the END block does not print them again
        prev = key
    }
    print key, $6, $7, $8, cnt[key]++
}
END {
    for ( key in a ) {          # traversal order of "in" is unspecified
        printf "%s", a[key]
    }
}

.

$ awk -f tst.awk file2 file1
SITE-A SERV-A idx a7sdf9899778 0
SITE-A SERV-A test a7sdf9899779 1
SITE-A SERV-A dummy_host a7sdf9899770 2
SITE-A SERV-A IP 98a7df9asd7f FEX 3
SITE-A SERV-A IP 98a7df9asd7g FEX 4
SITE-A SERV-A IP 98a7df9asd7h FEX 5
SITE-B SERV-A IP a7sdf9899hhh FEX 0
SITE-B SERV-A IP a7sdf9899hhf FEX 1
SITE-B SERV-A IP a7sdf9899hhm FEX 2
SITE-C SERV-A dummy_host a7sdf9899772 2
SITE-C SERV-A prod a7sdf9899775 3
SITE-C SERV-A dev a7sdf9899774 4

不清楚您是希望 file1 行的第 5 个输出字段从 file2 中给定键的行数开始,还是从 file2 中的 $8 值开始,所以我包括了这两个选项,一个作为注释。

END 中的 for ( key in a ) 会以“随机”顺序打印 file2 中剩余的行块(参见 https://www.gnu.org/software/gawk/manual/gawk.html#Controlling-Array-Traversal)。如果这是个问题,您只需在读取 file2 时另外维护一个以递增数字为索引的键数组(例如在第一个块的开头加上 if (!(key in a)) keys[++numKeys]=key),然后在 END 部分用它按顺序取出各个键(for (keyNr=1; keyNr<=numKeys; keyNr++) { key=keys[keyNr] ...),即:

$ cat tst.awk
# Merge two files on a two-field key, keeping the leftover file2 blocks in
# their original order.  Run as:  awk -f tst.awk file2 file1
#
# First file on the command line (file2): the key is $1 and $3.  The first time
# a key is seen it is recorded in keys[1..numKeys]; every line appends a
# ready-to-print "key $6 $7 $8" record to a[key] and bumps cnt[key].
# Second file (file1): the key is $1 and $2.  Whenever the key differs from the
# previous line's key, the buffered file2 records for it are flushed first.
# Each file1 line is then printed as "key $6 $7 $8 counter", where the counter
# continues from the number of file2 lines seen for that key (0 if none).
# END: walk the keys in first-seen order and print whatever is still buffered.
NR==FNR {
    key = $1 FS $3
    if ( !(key in a) ) {
        keys[++numKeys] = key   # remember first-seen order of file2 keys
    }
    a[key] = a[key] key OFS $6 OFS $7 OFS $8 ORS    # one complete output line per input line
    cnt[key]++
    next
}
{
    key = $1 FS $2
    if ( key != prev ) {
        printf "%s", a[key]     # file2 records for this key come first
        delete a[key]           # already printed; END will find nothing for it
        prev = key
    }
    print key, $6, $7, $8, cnt[key]++
}
END {
    for ( keyNr=1; keyNr<=numKeys; keyNr++ ) {
        key = keys[keyNr]
        printf "%s", a[key]     # empty string for keys flushed above
    }
}

.

$ awk -f tst.awk file2 file1
SITE-A SERV-A idx a7sdf9899778 0
SITE-A SERV-A test a7sdf9899779 1
SITE-A SERV-A dummy_host a7sdf9899770 2
SITE-A SERV-A IP 98a7df9asd7f FEX 3
SITE-A SERV-A IP 98a7df9asd7g FEX 4
SITE-A SERV-A IP 98a7df9asd7h FEX 5
SITE-B SERV-A IP a7sdf9899hhh FEX 0
SITE-B SERV-A IP a7sdf9899hhf FEX 1
SITE-B SERV-A IP a7sdf9899hhm FEX 2
SITE-C SERV-A dummy_host a7sdf9899772 2
SITE-C SERV-A prod a7sdf9899775 3
SITE-C SERV-A dev a7sdf9899774 4