awk:gsub /pattern1/,但不是 /pattern1pattern2/
awk: gsub /pattern1/, but not /pattern1pattern2/
在我的工作中,我必须解决这样一个简单的问题:将 pattern1 更改为 newpattern,但仅当它后面没有紧跟 pattern2 或 pattern3 时才替换:
"pattern1 pattern1pattern2 pattern1pattern3 pattern1pattern4" → "newpattern pattern1pattern2 pattern1pattern3 newpatternpattern4"
这是我的解决方案,但我不喜欢它,我想应该有更优雅、更简单的方法来做到这一点?
$ echo 'pattern1 pattern1pattern2 pattern1pattern3 pattern1pattern4' | awk '
{gsub(/pattern1pattern2/, "###", $0)
gsub(/pattern1pattern3/, "%%%", $0)
gsub(/pattern1/, "newpattern", $0)
gsub(/###/, "pattern1pattern2", $0)
gsub(/%%%/, "pattern1pattern3", $0)
print}'
newpattern pattern1pattern2 pattern1pattern3 newpatternpattern4
因此,示例输入文件:
pattern1 pattern1pattern2 aaa_pattern1pattern3 pattern1pattern4 pattern1pattern2pattern1
样本输出文件应该是:
newpattern pattern1pattern2 aaa_pattern1pattern3 newpatternpattern4 pattern1pattern2newpattern
这在 perl 中是微不足道的,使用否定的前瞻:
perl -pe 's/pattern1(?!pattern[23])/newpattern/g' file
它会替换所有后面没有紧跟 pattern2 或 pattern3 的 pattern1 匹配项。
如果出于某种原因您需要在 awk 中执行此操作,那么您可以采用以下一种方法:
{
out = ""
replacement = "newpattern"
while (match($0, /pattern1/)) {
if (substr($0, RSTART + RLENGTH) ~ /^pattern[23]/) {
out = out substr($0, 1, RSTART + RLENGTH - 1)
}
else {
out = out substr($0, 1, RSTART - 1) replacement
}
$0 = substr($0, RSTART + RLENGTH)
}
print out $0
}
只要还能匹配到 pattern1,就逐段消耗输入并构建字符串 out:当匹配之后紧跟的部分不是 pattern2 或 pattern3 时,插入替换内容;否则原样保留该匹配。一旦没有更多的匹配项,就打印到目前为止已经构建的字符串,后面再接上输入中剩下的任何内容。
awk解决方法。好问题。基本上它在做 2 个 gensubs:
$ cat tst.awk
{ for (i=1; i<=NF; i++){
s=gensub(/pattern1/, "newpattern", "g", $i);
t=gensub(/(newpattern)(pattern(2|3))/, "pattern1\\2", "g", s);
$i=t
}
}1
测试:
echo "pattern1 pattern1pattern2 aaa_pattern1pattern3 pattern1pattern4 pattern1pattern2pattern1" | awk -f tst.awk
newpattern pattern1pattern2 aaa_pattern1pattern3 newpatternpattern4 pattern1pattern2newpattern
但是,只要您的输入中本来就含有类似 newpatternpattern2 的内容,这种方法就会失败。不过从 OP 给出的输入示例来看,我猜不会出现这种情况。
使用 GNU awk,借助 split() 的第 4 个参数:
$ cat tst.awk
{
split($0,flds,/pattern1(pattern2|pattern3)/,seps)
for (i=1; i in flds; i++) {
printf "%s%s", gensub(/pattern1/,"newpattern","g",flds[i]), seps[i]
}
print ""
}
$ awk -f tst.awk file
newpattern pattern1pattern2 aaa_pattern1pattern3 newpatternpattern4 pattern1pattern2newpattern
对于其他 awk,您可以使用 while(match()) 循环执行相同的操作:
$ cat tst.awk
{
while ( match($0,/pattern1(pattern2|pattern3)/) ) {
tgt = substr($0,1,RSTART-1)
gsub(/pattern1/,"newpattern",tgt)
printf "%s%s", tgt, substr($0,RSTART,RLENGTH)
$0 = substr($0,RSTART+RLENGTH)
}
gsub(/pattern1/,"newpattern",$0)
print
}
$ awk -f tst.awk file
newpattern pattern1pattern2 aaa_pattern1pattern3 newpatternpattern4 pattern1pattern2newpattern
但显然 gawk 解决方案更简单、更简洁,所以,一如既往地使用 gawk!
在我的工作中,我必须解决这样一个简单的问题:将 pattern1 更改为 newpattern,但仅当它后面没有紧跟 pattern2 或 pattern3 时才替换:
"pattern1 pattern1pattern2 pattern1pattern3 pattern1pattern4" → "newpattern pattern1pattern2 pattern1pattern3 newpatternpattern4"
这是我的解决方案,但我不喜欢它,我想应该有更优雅、更简单的方法来做到这一点?
$ echo 'pattern1 pattern1pattern2 pattern1pattern3 pattern1pattern4' | awk '
{gsub(/pattern1pattern2/, "###", $0)
gsub(/pattern1pattern3/, "%%%", $0)
gsub(/pattern1/, "newpattern", $0)
gsub(/###/, "pattern1pattern2", $0)
gsub(/%%%/, "pattern1pattern3", $0)
print}'
newpattern pattern1pattern2 pattern1pattern3 newpatternpattern4
因此,示例输入文件:
pattern1 pattern1pattern2 aaa_pattern1pattern3 pattern1pattern4 pattern1pattern2pattern1
样本输出文件应该是:
newpattern pattern1pattern2 aaa_pattern1pattern3 newpatternpattern4 pattern1pattern2newpattern
这在 perl 中是微不足道的,使用否定的前瞻:
perl -pe 's/pattern1(?!pattern[23])/newpattern/g' file
它会替换所有后面没有紧跟 pattern2 或 pattern3 的 pattern1 匹配项。
如果出于某种原因您需要在 awk 中执行此操作,那么您可以采用以下一种方法:
{
out = ""
replacement = "newpattern"
while (match($0, /pattern1/)) {
if (substr($0, RSTART + RLENGTH) ~ /^pattern[23]/) {
out = out substr($0, 1, RSTART + RLENGTH - 1)
}
else {
out = out substr($0, 1, RSTART - 1) replacement
}
$0 = substr($0, RSTART + RLENGTH)
}
print out $0
}
只要还能匹配到 pattern1,就逐段消耗输入并构建字符串 out:当匹配之后紧跟的部分不是 pattern2 或 pattern3 时,插入替换内容;否则原样保留该匹配。一旦没有更多的匹配项,就打印到目前为止已经构建的字符串,后面再接上输入中剩下的任何内容。
awk解决方法。好问题。基本上它在做 2 个 gensubs:
$ cat tst.awk
{ for (i=1; i<=NF; i++){
s=gensub(/pattern1/, "newpattern", "g", $i);
t=gensub(/(newpattern)(pattern(2|3))/, "pattern1\\2", "g", s);
$i=t
}
}1
测试:
echo "pattern1 pattern1pattern2 aaa_pattern1pattern3 pattern1pattern4 pattern1pattern2pattern1" | awk -f tst.awk
newpattern pattern1pattern2 aaa_pattern1pattern3 newpatternpattern4 pattern1pattern2newpattern
但是,只要您的输入中本来就含有类似 newpatternpattern2 的内容,这种方法就会失败。不过从 OP 给出的输入示例来看,我猜不会出现这种情况。
使用 GNU awk,借助 split() 的第 4 个参数:
$ cat tst.awk
{
split($0,flds,/pattern1(pattern2|pattern3)/,seps)
for (i=1; i in flds; i++) {
printf "%s%s", gensub(/pattern1/,"newpattern","g",flds[i]), seps[i]
}
print ""
}
$ awk -f tst.awk file
newpattern pattern1pattern2 aaa_pattern1pattern3 newpatternpattern4 pattern1pattern2newpattern
对于其他 awk,您可以使用 while(match()) 循环执行相同的操作:
$ cat tst.awk
{
while ( match($0,/pattern1(pattern2|pattern3)/) ) {
tgt = substr($0,1,RSTART-1)
gsub(/pattern1/,"newpattern",tgt)
printf "%s%s", tgt, substr($0,RSTART,RLENGTH)
$0 = substr($0,RSTART+RLENGTH)
}
gsub(/pattern1/,"newpattern",$0)
print
}
$ awk -f tst.awk file
newpattern pattern1pattern2 aaa_pattern1pattern3 newpatternpattern4 pattern1pattern2newpattern
但显然 gawk 解决方案更简单、更简洁,所以,一如既往地使用 gawk!