awk 比较两个文件中的两列并将调整后的不匹配追加到输出中
awk compare two columns in two files and append adjusted non matched into output
你好,我正在尝试比较文件 1 和文件 2,
- 比较文件 1 的列 $1,$2 和文件 2 的 $1,$3
- 如果有匹配项,则把 file1 和 file2 的内容都写入输出文件:
从 file1 复制列 $1、$2、$7、$9,
从 file2 复制列 $1、$3、$6、$7、$8
- 如果没有匹配项,它也会将 file2 中剩余的不匹配列添加到输出文件中
- 最后,在输出文件的第 5 列($5)中添加一个递增的计数值
我用下面的命令实现了其中一部分:
# Partial attempt from the question: index file1 by ($1,$2), then look up
# each file2 line by ($1,$3). Field references ($1, $2, $3, $0) restored.
awk 'NR==FNR {a[$1,$2]=$0; next}   # first file: store line under its $1,$2 key (a later line with the same key overwrites)
{if(($1,$3) in a)                   # second file: was this $1,$3 key stored from the first file?
{print a[$1,$3],$0; delete a[$1,$3]}   # match: print stored line plus current line, then drop the entry
else print $0}                      # no match: print the current line unchanged
END {for(k in a) print a[k]}' file1 file2   # finally print stored lines that were never matched
文件 1
SITE-A SERV-A AA 1.00 PPA IP 98a7df9asd7f FEX 98a7df9asd7f_a
SITE-A SERV-A AA 1.00 PPA IP 98a7df9asd7g FEX 98a7df9asd7f_b
SITE-A SERV-A AA 1.00 PPA IP 98a7df9asd7h FEX 98a7df9asd7f_c
SITE-B SERV-A BB 1.00 DF IP a7sdf9899hhh FEX a7sdf9899hhh_a
SITE-B SERV-A BB 1.00 DF IP a7sdf9899hhf FEX a7sdf9899hhh_b
SITE-B SERV-A BB 1.00 AF IP a7sdf9899hhm FEX a7sdf9899hhh_c
文件 2
SITE-A 17 SERV-A 0 39 idx a7sdf9899778 0 0 out_fan pri
SITE-A 17 SERV-A 1 1 test a7sdf9899779 1 0 out_fan pri
SITE-A 17 SERV-A 2 32 dummy_host a7sdf9899770 2 0 out_fan pri
SITE-C 22 SERV-A 2 519 dummy_host a7sdf9899772 2 2 out_fan pri
SITE-C 22 SERV-A 3 520 prod a7sdf9899775 3 out_fan pri
SITE-C 22 SERV-A 4 521 dev a7sdf9899774 4 out_fan pri
期望的输出:
SITE-A SERV-A idx a7sdf9899778 0
SITE-A SERV-A test a7sdf9899779 1
SITE-A SERV-A dummy_host a7sdf9899770 2
SITE-A SERV-A 98a7df9asd7f_a 98a7df9asd7f 3
SITE-A SERV-A 98a7df9asd7f_b 98a7df9asd7g 4
SITE-A SERV-A 98a7df9asd7f_c 98a7df9asd7h 5
SITE-B SERV-A a7sdf9899hhh_a a7sdf9899hhh 0
SITE-B SERV-A a7sdf9899hhh_b a7sdf9899hhf 1
SITE-B SERV-A a7sdf9899hhh_c a7sdf9899hhm 2
SITE-C SERV-A dummy_host a7sdf9899772 2
SITE-C SERV-A prod a7sdf9899775 3
SITE-C SERV-A dev a7sdf9899774 4
$ cat tst.awk
# tst.awk -- invoked as: awk -f tst.awk file2 file1
#
# Pass 1 (first file on the command line, i.e. file2):
#   group the output rows by key ($1 and $3) and count rows per key.
# Pass 2 (second file, i.e. file1):
#   when the key ($1 and $2) changes, flush the stored block for that key,
#   then print the current row with a counter that continues from the
#   number of rows seen for that key in pass 1.
# END: print the blocks whose key never appeared in the second file.

NR==FNR {
    key = $1 FS $3
    # Append one preformatted output record (terminated by ORS) to the
    # block of text kept for this key.
    a[key] = a[key] key OFS $6 OFS $7 OFS $8 ORS
    cnt[key]++      # or cnt[key] = $8 + 1
    next
}
{
    key = $1 FS $2
    if ( key != prev ) {
        # First row of a new key group: emit the stored block (empty if
        # the key was not seen in pass 1) and remove it so END skips it.
        printf "%s", a[key]
        delete a[key]
        prev = key
    }
    # Post-increment: print the current count, then advance it.
    print key, $6, $7, $8, cnt[key]++
}
END {
    # Remaining blocks; "for (key in a)" visits keys in unspecified order.
    for ( key in a ) {
        printf "%s", a[key]
    }
}
.
$ awk -f tst.awk file2 file1
SITE-A SERV-A idx a7sdf9899778 0
SITE-A SERV-A test a7sdf9899779 1
SITE-A SERV-A dummy_host a7sdf9899770 2
SITE-A SERV-A IP 98a7df9asd7f FEX 3
SITE-A SERV-A IP 98a7df9asd7g FEX 4
SITE-A SERV-A IP 98a7df9asd7h FEX 5
SITE-B SERV-A IP a7sdf9899hhh FEX 0
SITE-B SERV-A IP a7sdf9899hhf FEX 1
SITE-B SERV-A IP a7sdf9899hhm FEX 2
SITE-C SERV-A dummy_host a7sdf9899772 2
SITE-C SERV-A prod a7sdf9899775 3
SITE-C SERV-A dev a7sdf9899774 4
不清楚您是希望 file1 行的第 5 个输出字段从 file2 中给定键的行数开始,还是从 file2 中的 $8 值开始,所以我包括了这两个选项,一个作为注释。
END 中的 for ( key in a ) 循环会以 "random" 顺序打印 file2 中剩余的行块(参见 https://www.gnu.org/software/gawk/manual/gawk.html#Controlling-Array-Traversal)。如果这是个问题,只需在读取 file2 时另外维护一个以递增下标保存键的数组(例如在开头加上 if (!(key in a)) keys[++numKeys]=key),
然后在 END 部分用它按顺序取出键(for (keyNr=1; keyNr<=numKeys; keyNr++) { key=keys[keyNr] ... }),
即:
$ cat tst.awk
# tst.awk -- invoked as: awk -f tst.awk file2 file1
#
# Same join as the simpler version, but the keys of the first file are
# also recorded in arrival order (keys[1..numKeys]) so that END can print
# the leftover blocks in input order instead of "for (key in a)" order.

NR==FNR {
    key = $1 FS $3
    if ( !(key in a) ) {
        # First time this key is seen: remember its position.
        keys[++numKeys] = key
    }
    # Append one preformatted output record (terminated by ORS) to the
    # block of text kept for this key.
    a[key] = a[key] key OFS $6 OFS $7 OFS $8 ORS
    cnt[key]++
    next
}
{
    key = $1 FS $2
    if ( key != prev ) {
        # First row of a new key group: emit the stored block (empty if
        # the key was not seen in the first file) and remove it.
        printf "%s", a[key]
        delete a[key]
        prev = key
    }
    # Post-increment: print the current count, then advance it.
    print key, $6, $7, $8, cnt[key]++
}
END {
    # Walk the keys in the order they were first read. Keys already
    # flushed above were deleted, so a[key] is empty and prints nothing.
    for ( keyNr=1; keyNr<=numKeys; keyNr++ ) {
        key = keys[keyNr]
        printf "%s", a[key]
    }
}
.
$ awk -f tst.awk file2 file1
SITE-A SERV-A idx a7sdf9899778 0
SITE-A SERV-A test a7sdf9899779 1
SITE-A SERV-A dummy_host a7sdf9899770 2
SITE-A SERV-A IP 98a7df9asd7f FEX 3
SITE-A SERV-A IP 98a7df9asd7g FEX 4
SITE-A SERV-A IP 98a7df9asd7h FEX 5
SITE-B SERV-A IP a7sdf9899hhh FEX 0
SITE-B SERV-A IP a7sdf9899hhf FEX 1
SITE-B SERV-A IP a7sdf9899hhm FEX 2
SITE-C SERV-A dummy_host a7sdf9899772 2
SITE-C SERV-A prod a7sdf9899775 3
SITE-C SERV-A dev a7sdf9899774 4
你好,我正在尝试比较文件 1 和文件 2,
- 比较文件 1 的列 $1,$2 和文件 2 的 $1,$3
- 如果有匹配项,则把 file1 和 file2 的内容都写入输出文件:从 file1 复制列 $1、$2、$7、$9,从 file2 复制列 $1、$3、$6、$7、$8
- 如果没有匹配项,它也会将 file2 中剩余的不匹配列添加到输出文件中
- 最后,在输出文件的第 5 列($5)中添加一个递增的计数值
我用下面的命令实现了其中一部分:
# Partial attempt from the question: index file1 by ($1,$2), then look up
# each file2 line by ($1,$3). Field references ($1, $2, $3, $0) restored.
awk 'NR==FNR {a[$1,$2]=$0; next}   # first file: store line under its $1,$2 key (a later line with the same key overwrites)
{if(($1,$3) in a)                   # second file: was this $1,$3 key stored from the first file?
{print a[$1,$3],$0; delete a[$1,$3]}   # match: print stored line plus current line, then drop the entry
else print $0}                      # no match: print the current line unchanged
END {for(k in a) print a[k]}' file1 file2   # finally print stored lines that were never matched
文件 1
SITE-A SERV-A AA 1.00 PPA IP 98a7df9asd7f FEX 98a7df9asd7f_a
SITE-A SERV-A AA 1.00 PPA IP 98a7df9asd7g FEX 98a7df9asd7f_b
SITE-A SERV-A AA 1.00 PPA IP 98a7df9asd7h FEX 98a7df9asd7f_c
SITE-B SERV-A BB 1.00 DF IP a7sdf9899hhh FEX a7sdf9899hhh_a
SITE-B SERV-A BB 1.00 DF IP a7sdf9899hhf FEX a7sdf9899hhh_b
SITE-B SERV-A BB 1.00 AF IP a7sdf9899hhm FEX a7sdf9899hhh_c
文件 2
SITE-A 17 SERV-A 0 39 idx a7sdf9899778 0 0 out_fan pri
SITE-A 17 SERV-A 1 1 test a7sdf9899779 1 0 out_fan pri
SITE-A 17 SERV-A 2 32 dummy_host a7sdf9899770 2 0 out_fan pri
SITE-C 22 SERV-A 2 519 dummy_host a7sdf9899772 2 2 out_fan pri
SITE-C 22 SERV-A 3 520 prod a7sdf9899775 3 out_fan pri
SITE-C 22 SERV-A 4 521 dev a7sdf9899774 4 out_fan pri
期望的输出:
SITE-A SERV-A idx a7sdf9899778 0
SITE-A SERV-A test a7sdf9899779 1
SITE-A SERV-A dummy_host a7sdf9899770 2
SITE-A SERV-A 98a7df9asd7f_a 98a7df9asd7f 3
SITE-A SERV-A 98a7df9asd7f_b 98a7df9asd7g 4
SITE-A SERV-A 98a7df9asd7f_c 98a7df9asd7h 5
SITE-B SERV-A a7sdf9899hhh_a a7sdf9899hhh 0
SITE-B SERV-A a7sdf9899hhh_b a7sdf9899hhf 1
SITE-B SERV-A a7sdf9899hhh_c a7sdf9899hhm 2
SITE-C SERV-A dummy_host a7sdf9899772 2
SITE-C SERV-A prod a7sdf9899775 3
SITE-C SERV-A dev a7sdf9899774 4
$ cat tst.awk
# tst.awk -- invoked as: awk -f tst.awk file2 file1
#
# Pass 1 (first file on the command line, i.e. file2):
#   group the output rows by key ($1 and $3) and count rows per key.
# Pass 2 (second file, i.e. file1):
#   when the key ($1 and $2) changes, flush the stored block for that key,
#   then print the current row with a counter that continues from the
#   number of rows seen for that key in pass 1.
# END: print the blocks whose key never appeared in the second file.

NR==FNR {
    key = $1 FS $3
    # Append one preformatted output record (terminated by ORS) to the
    # block of text kept for this key.
    a[key] = a[key] key OFS $6 OFS $7 OFS $8 ORS
    cnt[key]++      # or cnt[key] = $8 + 1
    next
}
{
    key = $1 FS $2
    if ( key != prev ) {
        # First row of a new key group: emit the stored block (empty if
        # the key was not seen in pass 1) and remove it so END skips it.
        printf "%s", a[key]
        delete a[key]
        prev = key
    }
    # Post-increment: print the current count, then advance it.
    print key, $6, $7, $8, cnt[key]++
}
END {
    # Remaining blocks; "for (key in a)" visits keys in unspecified order.
    for ( key in a ) {
        printf "%s", a[key]
    }
}
.
$ awk -f tst.awk file2 file1
SITE-A SERV-A idx a7sdf9899778 0
SITE-A SERV-A test a7sdf9899779 1
SITE-A SERV-A dummy_host a7sdf9899770 2
SITE-A SERV-A IP 98a7df9asd7f FEX 3
SITE-A SERV-A IP 98a7df9asd7g FEX 4
SITE-A SERV-A IP 98a7df9asd7h FEX 5
SITE-B SERV-A IP a7sdf9899hhh FEX 0
SITE-B SERV-A IP a7sdf9899hhf FEX 1
SITE-B SERV-A IP a7sdf9899hhm FEX 2
SITE-C SERV-A dummy_host a7sdf9899772 2
SITE-C SERV-A prod a7sdf9899775 3
SITE-C SERV-A dev a7sdf9899774 4
不清楚您是希望 file1 行的第 5 个输出字段从 file2 中给定键的行数开始,还是从 file2 中的 $8 值开始,所以我包括了这两个选项,一个作为注释。
END 中的 for ( key in a ) 循环会以 "random" 顺序打印 file2 中剩余的行块(参见 https://www.gnu.org/software/gawk/manual/gawk.html#Controlling-Array-Traversal)。如果这是个问题,只需在读取 file2 时另外维护一个以递增下标保存键的数组(例如在开头加上 if (!(key in a)) keys[++numKeys]=key),
然后在 END 部分用它按顺序取出键(for (keyNr=1; keyNr<=numKeys; keyNr++) { key=keys[keyNr] ... }),
即:
$ cat tst.awk
# tst.awk -- invoked as: awk -f tst.awk file2 file1
#
# Same join as the simpler version, but the keys of the first file are
# also recorded in arrival order (keys[1..numKeys]) so that END can print
# the leftover blocks in input order instead of "for (key in a)" order.

NR==FNR {
    key = $1 FS $3
    if ( !(key in a) ) {
        # First time this key is seen: remember its position.
        keys[++numKeys] = key
    }
    # Append one preformatted output record (terminated by ORS) to the
    # block of text kept for this key.
    a[key] = a[key] key OFS $6 OFS $7 OFS $8 ORS
    cnt[key]++
    next
}
{
    key = $1 FS $2
    if ( key != prev ) {
        # First row of a new key group: emit the stored block (empty if
        # the key was not seen in the first file) and remove it.
        printf "%s", a[key]
        delete a[key]
        prev = key
    }
    # Post-increment: print the current count, then advance it.
    print key, $6, $7, $8, cnt[key]++
}
END {
    # Walk the keys in the order they were first read. Keys already
    # flushed above were deleted, so a[key] is empty and prints nothing.
    for ( keyNr=1; keyNr<=numKeys; keyNr++ ) {
        key = keys[keyNr]
        printf "%s", a[key]
    }
}
.
$ awk -f tst.awk file2 file1
SITE-A SERV-A idx a7sdf9899778 0
SITE-A SERV-A test a7sdf9899779 1
SITE-A SERV-A dummy_host a7sdf9899770 2
SITE-A SERV-A IP 98a7df9asd7f FEX 3
SITE-A SERV-A IP 98a7df9asd7g FEX 4
SITE-A SERV-A IP 98a7df9asd7h FEX 5
SITE-B SERV-A IP a7sdf9899hhh FEX 0
SITE-B SERV-A IP a7sdf9899hhf FEX 1
SITE-B SERV-A IP a7sdf9899hhm FEX 2
SITE-C SERV-A dummy_host a7sdf9899772 2
SITE-C SERV-A prod a7sdf9899775 3
SITE-C SERV-A dev a7sdf9899774 4