grep:提取前缀和后缀之间的内容
grep: extract content between prefix and suffix
我有这样一个文件内容:
Listening for transport dt_socket at address: 8000
------------------------------------------------------------
^[[1m HAPI FHIR^[[22m 5.4.0 - Command Line Tool
------------------------------------------------------------
Process ID : 21719@psgd
Max configured JVM memory (Xmx) : 3.2GB
Detected Java version : 11.0.7
------------------------------------------------------------
^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:40.79^[[0;39m ^[[37m[main]^[[0;39m ^[[37mWARN ^[[0;39m ^[[1;34mo.f.c.i.s.c.ClassPathScanner^[[0;39m ^[[1;37mUnable to resolve location classpath:db/migration. Note this warning will become an error in Flyway 7.
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:42.641^[[0;39m ^[[37m[main]^[[0;39m ^[[37mWARN ^[[0;39m ^[[1;34mo.f.c.i.s.c.ClassPathScanner^[[0;39m ^[[1;37mUnable to resolve location classpath:db/migration. Note this warning will become an error in Flyway 7.
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:44.693^[[0;39m ^[[37m[main]^[[0;39m ^[[37mINFO ^[[0;39m ^[[1;34mc.u.f.j.m.t.InitializeSchemaTask^[[0;39m ^[[1;37m3_3_0.20180115.0: Initializing ORACLE_12C schema for HAPI FHIR
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:44.848^[[0;39m ^[[37m[main]^[[0;39m ^[[37mINFO ^[[0;39m ^[[1;34mc.u.f.j.m.t.BaseTask^[[0;39m ^[[1;37m3_3_0.20180115.0: SQL "create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50" returned 0
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:44.918^[[0;39m ^[[37m[main]^[[0;39m ^[[37mINFO ^[[0;39m ^[[1;34mc.u.f.j.m.t.BaseTask^[[0;39m ^[[1;37m3_3_0.20180115.0: SQL "
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50" returned 0
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:47.573^[[0;39m ^[[37m[main]^[[0;39m ^[[37mINFO ^[[0;39m ^[[1;34mc.u.f.j.m.t.BaseTask^[[0;39m ^[[1;37m3_3_0.20180115.0: SQL "
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
)" returned 0
我只需要提取 SQL "
和 " returned 0
之间的内容,修剪所有空格。
有什么想法吗?
我尝试用以下方法来简化问题:
$ echo 'sdf SQL" sdf sdf" returned 0' | grep 's/SQL"\(.*\)" returned 0//' -
但是输出为空。
我的预期输出是:
create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50;
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50;
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
);
我试过执行:
cat test.log | sed -E 's/.* SQL"(.*)" returned 0//'
它返回了文件的全部内容...
使用 awk
时,返回结果为空:
$ awk -v RS='SQL "[[:space:]]+?\n\n+.*returned 0' '
RT{
gsub(/^SQL "\n+|\n+$/,"",RT)
sub(/" returned 0[[:space:]]+?\n*$/,"",RT)
print RT";"
}
' test.log
这可以使用 gnu-awk
中的自定义 RS
来完成,该自定义 RS
按 SQL "..."
文本块拆分数据,然后在动作块中提取引号之间的文本,并去掉前导空格。
awk -v RS=' SQL "[^"]+"' 'RT {
gsub(/^[^"]*"[[:space:]]*|"[^"]*$/, "", RT); print RT ";"}' file.sql
create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50;
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50;
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
);
使用 GNU awk
,针对您给出的示例,请尝试以下代码。
awk -v RS='SQL "[[:space:]]*\n\n+.*returned 0' '
RT{
gsub(/^SQL "\n+|\n+$/,"",RT)
sub(/" returned 0[[:space:]]*\n*$/,"",RT)
print RT";"
}
' Input_file
解释: 简单来说,就是把 awk
程序的 RS 设置为 SQL "[[:space:]]*\n\n+.*returned 0
,然后去掉不需要的字符串,例如带换行符的 SQL "
以及末尾的 " returned 0,最后再打印。
正则表达式的解释如下: 匹配 SQL,后跟一个空格和 "
,接着是零个或多个空白字符,然后是至少两个换行符(\n\n+),一直匹配到 returned 0
为止。
如果你有 gnu grep
那么你可以使用这个 PCRE 正则表达式:
grep -oPz ' SQL "\K[^"]+' file.sql
create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
)
解释:
' SQL "
:搜索 SQL "
文本
\K
: 重置匹配信息
[^"]+
:匹配 1+ 个非 "
的字符
如需按(评论中提出的)要求调整输出格式,请使用这个 grep + sed (gnu)
解决方案:
grep -oPzZ ' SQL "\K[^"]+' file.sql |
sed -E '$s/$/\n/; s/\x0/;/; s/^[[:blank:]]+//'
create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50;
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50;
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
);
我有这样一个文件内容:
Listening for transport dt_socket at address: 8000
------------------------------------------------------------
^[[1m HAPI FHIR^[[22m 5.4.0 - Command Line Tool
------------------------------------------------------------
Process ID : 21719@psgd
Max configured JVM memory (Xmx) : 3.2GB
Detected Java version : 11.0.7
------------------------------------------------------------
^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:40.79^[[0;39m ^[[37m[main]^[[0;39m ^[[37mWARN ^[[0;39m ^[[1;34mo.f.c.i.s.c.ClassPathScanner^[[0;39m ^[[1;37mUnable to resolve location classpath:db/migration. Note this warning will become an error in Flyway 7.
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:42.641^[[0;39m ^[[37m[main]^[[0;39m ^[[37mWARN ^[[0;39m ^[[1;34mo.f.c.i.s.c.ClassPathScanner^[[0;39m ^[[1;37mUnable to resolve location classpath:db/migration. Note this warning will become an error in Flyway 7.
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:44.693^[[0;39m ^[[37m[main]^[[0;39m ^[[37mINFO ^[[0;39m ^[[1;34mc.u.f.j.m.t.InitializeSchemaTask^[[0;39m ^[[1;37m3_3_0.20180115.0: Initializing ORACLE_12C schema for HAPI FHIR
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:44.848^[[0;39m ^[[37m[main]^[[0;39m ^[[37mINFO ^[[0;39m ^[[1;34mc.u.f.j.m.t.BaseTask^[[0;39m ^[[1;37m3_3_0.20180115.0: SQL "create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50" returned 0
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:44.918^[[0;39m ^[[37m[main]^[[0;39m ^[[37mINFO ^[[0;39m ^[[1;34mc.u.f.j.m.t.BaseTask^[[0;39m ^[[1;37m3_3_0.20180115.0: SQL "
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50" returned 0
^[[0;39m^[[32m2021-07-01^[[0;39m ^[[1;32m12:27:47.573^[[0;39m ^[[37m[main]^[[0;39m ^[[37mINFO ^[[0;39m ^[[1;34mc.u.f.j.m.t.BaseTask^[[0;39m ^[[1;37m3_3_0.20180115.0: SQL "
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
)" returned 0
我只需要提取 SQL "
和 " returned 0
之间的内容,修剪所有空格。
有什么想法吗?
我尝试用以下方法来简化问题:
$ echo 'sdf SQL" sdf sdf" returned 0' | grep 's/SQL"\(.*\)" returned 0//' -
但是输出为空。
我的预期输出是:
create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50;
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50;
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
);
我试过执行:
cat test.log | sed -E 's/.* SQL"(.*)" returned 0//'
它返回了文件的全部内容...
使用 awk
时,返回结果为空:
$ awk -v RS='SQL "[[:space:]]+?\n\n+.*returned 0' '
RT{
gsub(/^SQL "\n+|\n+$/,"",RT)
sub(/" returned 0[[:space:]]+?\n*$/,"",RT)
print RT";"
}
' test.log
这可以使用 gnu-awk
中的自定义 RS
来完成,该自定义 RS
按 SQL "..."
文本块拆分数据,然后在动作块中提取引号之间的文本,并去掉前导空格。
awk -v RS=' SQL "[^"]+"' 'RT {
gsub(/^[^"]*"[[:space:]]*|"[^"]*$/, "", RT); print RT ";"}' file.sql
create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50;
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50;
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
);
使用 GNU awk
,针对您给出的示例,请尝试以下代码。
awk -v RS='SQL "[[:space:]]*\n\n+.*returned 0' '
RT{
gsub(/^SQL "\n+|\n+$/,"",RT)
sub(/" returned 0[[:space:]]*\n*$/,"",RT)
print RT";"
}
' Input_file
解释: 简单来说,就是把 awk
程序的 RS 设置为 SQL "[[:space:]]*\n\n+.*returned 0
,然后去掉不需要的字符串,例如带换行符的 SQL "
以及末尾的 " returned 0,最后再打印。
正则表达式的解释如下: 匹配 SQL,后跟一个空格和 "
,接着是零个或多个空白字符,然后是至少两个换行符(\n\n+),一直匹配到 returned 0
为止。
如果你有 gnu grep
那么你可以使用这个 PCRE 正则表达式:
grep -oPz ' SQL "\K[^"]+' file.sql
create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
)
解释:
' SQL "
:搜索 SQL "
文本
\K
: 重置匹配信息
[^"]+
:匹配 1+ 个非 "
的字符
如需按(评论中提出的)要求调整输出格式,请使用这个 grep + sed (gnu)
解决方案:
grep -oPzZ ' SQL "\K[^"]+' file.sql |
sed -E '$s/$/\n/; s/\x0/;/; s/^[[:blank:]]+//'
create sequence SEQ_BLKEXCOL_PID start with 1 increment by 50;
create sequence SEQ_BLKEXCOLFILE_PID start with 1 increment by 50;
create table HFJ_BINARY_STORAGE_BLOB (
BLOB_ID varchar2(200 char) not null,
BLOB_DATA blob not null,
CONTENT_TYPE varchar2(100 char) not null,
BLOB_HASH varchar2(128 char),
PUBLISHED_DATE timestamp not null,
RESOURCE_ID varchar2(100 char) not null,
BLOB_SIZE number(10,0),
primary key (BLOB_ID)
);