将文件名添加到统计数据中
Adding file name to the counted data
假设我有类似下面的文件。
文件 1
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,7330235,7330235,G,T
17,10222478,10222478,C,T
文件 2
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,10222478,10222478,C,T
文件 3
17,10222478,10222478,C,T
我想找出每个文件中存在多少次重复值,因此理想情况下,输出如下:
输出
2 1,144931087,144931087,T,C
2 16,89017167,89017167,C,G
3 17,10222478,10222478,C,T
1 17,7330235,7330235,G,T
我使用了以下命令来计算重复值。
sort Test1.csv Test2.csv Test3.csv | uniq --count
现在我想为统计输出添加文件名。
我想要的输出应该是这样的:
Test1 Test2 2 1,144931087,144931087,T,C
Test1 Test2 2 16,89017167,89017167,C,G
Test1 Test2 Test3 3 17,10222478,10222478,C,T
Test1 1 17,7330235,7330235,G,T
任何人都可以帮助我获得所需的输出,或者任何人都可以建议我获得所需输出的更好方法吗?
使用 awk。抱歉我聪明的文件命名方案:
$ awk '{
a[$0]++                     # count hits
b[$0]=b[$0] FILENAME " "    # store filenames
}
END {
for(i in a)
print b[i] a[i],i # output them
}' foo bar baz
foo bar 2 1,144931087,144931087,T,C
foo bar 2 16,89017167,89017167,C,G
foo bar baz 3 17,10222478,10222478,C,T
foo 1 17,7330235,7330235,G,T
已更新 每条评论:
$ awk 'BEGIN {
FS=OFS=","
}
{
a[$1 OFS $2 OFS $3 OFS $4]++
b[$1 OFS $2 OFS $3 OFS $4]=b[$1 OFS $2 OFS $3 OFS $4] FILENAME "|"
c[$1 OFS $2 OFS $3 OFS $4]=$0 # keep the last record with
} # specific key combination
END {
for(i in a)
print b[i] "," a[i],c[i]
}' foo bar baz
foo|bar|,2,16,89017167,89017167,C
foo|,1,17,7330235,7330235,G
foo|bar|,2,1,144931087,144931087,T
foo|bar|baz|,3,17,10222478,10222478,C
输入:
more Test*.csv
::::::::::::::
Test1.csv
::::::::::::::
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,7330235,7330235,G,T
17,10222478,10222478,C,T
::::::::::::::
Test2.csv
::::::::::::::
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,10222478,10222478,C,T
::::::::::::::
Test3.csv
::::::::::::::
17,10222478,10222478,C,T
命令:
awk '{tmp[$0]++;if(length(tmp2[$0])==0){tmp2[$0]=FILENAME;next}tmp2[$0]=tmp2[$0] OFS FILENAME}END{for(elem in tmp){print tmp2[elem] OFS tmp[elem] OFS elem}}' Test*.csv
输出:
Test1.csv Test2.csv 2 1,144931087,144931087,T,C
Test1.csv Test2.csv 2 16,89017167,89017167,C,G
Test1.csv Test2.csv Test3.csv 3 17,10222478,10222478,C,T
Test1.csv 1 17,7330235,7330235,G,T
解释:
# gawk profile, created Mon Dec 17 14:46:47 2018
# Rule(s)
{
tmp[$0]++ #associative array to count the occurrences freq
if (length(tmp2[$0]) == 0) { #when you add the first occurrence filename you do not need to add a space
tmp2[$0] = FILENAME
next
}
#append to variable with a space
tmp2[$0] = tmp2[$0] OFS FILENAME
}
# END rule(s)
END {
# loop on each element of the associative arrays and print them
for (elem in tmp) {
print tmp2[elem] OFS tmp[elem] OFS elem
}
}
可以将if...next...
替换为(length(tmp2[$0]) == 0 ? tmp2[$0] = FILENAME : tmp2[$0] = tmp2[$0] OFS FILENAME)
,将awk
脚本简化为:
{
tmp[$0]++
(length(tmp2[$0]) == 0 ? tmp2[$0] = FILENAME : tmp2[$0] = tmp2[$0] OFS FILENAME)
}
END {
for (elem in tmp) {
print tmp2[elem] OFS tmp[elem] OFS elem
}
}
能否请您尝试以下操作?这应该会按输入行在 Input_file 中出现的顺序输出结果。我使用 gsub(/[[:space:]]+$/,"")
是因为你的 Input_file 在行尾有空格,所以在这里将其删除;如果你的文件不是这种情况,可以去掉这一语句。
awk '
{
gsub(/[[:space:]]+$/,"")
}
!a[$0]++{
b[++count]=$0
}
{
c[$0]++
d[$0]=d[$0]?d[$0] OFS FILENAME:FILENAME
}
END{
for(i=1;i<=count;i++){
print d[b[i]]"|"c[b[i]],b[i]
}
}' test1 test2 test3
输出如下。
test1 test2|2 1,144931087,144931087,T,C
test1 test2|2 16,89017167,89017167,C,G
test1|1 17,7330235,7330235,G,T
test1 test2 test3|3 17,10222478,10222478,C,T
使用 Perl 的另一个答案。
> cat file1m.csv
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,7330235,7330235,G,T
17,10222478,10222478,C,T
> cat file2m.csv
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,10222478,10222478,C,T
> cat file3m.csv
17,10222478,10222478,C,T
> cat uniq_perl.ksh
perl -lne '
@t=@{ $kvf{$_} };
if( not $ARGV ~~ @t ) { push(@t,$ARGV); $kvf{$_}=[ @t ] ; }
close(ARGV) if eof;
END { for(keys %kvf) { @x=@{$kvf{$_}}; print join(" ",@x)." ".scalar(@x)." ".$_ } }
' file*m*csv
> ./uniq_perl.ksh
file1m.csv file2m.csv file3m.csv 3 17,10222478,10222478,C,T
file1m.csv 1 17,7330235,7330235,G,T
file1m.csv file2m.csv 2 1,144931087,144931087,T,C
file1m.csv file2m.csv 2 16,89017167,89017167,C,G
>
假设我有类似下面的文件。
文件 1
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,7330235,7330235,G,T
17,10222478,10222478,C,T
文件 2
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,10222478,10222478,C,T
文件 3
17,10222478,10222478,C,T
我想找出每个文件中存在多少次重复值,因此理想情况下,输出如下:
输出
2 1,144931087,144931087,T,C
2 16,89017167,89017167,C,G
3 17,10222478,10222478,C,T
1 17,7330235,7330235,G,T
我使用了以下命令来计算重复值。
sort Test1.csv Test2.csv Test3.csv | uniq --count
现在我想为统计输出添加文件名。 我想要的输出应该是这样的:
Test1 Test2 2 1,144931087,144931087,T,C
Test1 Test2 2 16,89017167,89017167,C,G
Test1 Test2 Test3 3 17,10222478,10222478,C,T
Test1 1 17,7330235,7330235,G,T
任何人都可以帮助我获得所需的输出,或者任何人都可以建议我获得所需输出的更好方法吗?
使用 awk。抱歉我聪明的文件命名方案:
$ awk '{
a[$0]++                     # count hits
b[$0]=b[$0] FILENAME " "    # store filenames
}
END {
for(i in a)
print b[i] a[i],i # output them
}' foo bar baz
foo bar 2 1,144931087,144931087,T,C
foo bar 2 16,89017167,89017167,C,G
foo bar baz 3 17,10222478,10222478,C,T
foo 1 17,7330235,7330235,G,T
已更新 每条评论:
$ awk 'BEGIN {
FS=OFS=","
}
{
a[$1 OFS $2 OFS $3 OFS $4]++
b[$1 OFS $2 OFS $3 OFS $4]=b[$1 OFS $2 OFS $3 OFS $4] FILENAME "|"
c[$1 OFS $2 OFS $3 OFS $4]=$0 # keep the last record with
} # specific key combination
END {
for(i in a)
print b[i] "," a[i],c[i]
}' foo bar baz
foo|bar|,2,16,89017167,89017167,C
foo|,1,17,7330235,7330235,G
foo|bar|,2,1,144931087,144931087,T
foo|bar|baz|,3,17,10222478,10222478,C
输入:
more Test*.csv
::::::::::::::
Test1.csv
::::::::::::::
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,7330235,7330235,G,T
17,10222478,10222478,C,T
::::::::::::::
Test2.csv
::::::::::::::
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,10222478,10222478,C,T
::::::::::::::
Test3.csv
::::::::::::::
17,10222478,10222478,C,T
命令:
awk '{tmp[$0]++;if(length(tmp2[$0])==0){tmp2[$0]=FILENAME;next}tmp2[$0]=tmp2[$0] OFS FILENAME}END{for(elem in tmp){print tmp2[elem] OFS tmp[elem] OFS elem}}' Test*.csv
输出:
Test1.csv Test2.csv 2 1,144931087,144931087,T,C
Test1.csv Test2.csv 2 16,89017167,89017167,C,G
Test1.csv Test2.csv Test3.csv 3 17,10222478,10222478,C,T
Test1.csv 1 17,7330235,7330235,G,T
解释:
# gawk profile, created Mon Dec 17 14:46:47 2018
# Rule(s)
{
tmp[$0]++ #associative array to count the occurrences freq
if (length(tmp2[$0]) == 0) { #when you add the first occurrence filename you do not need to add a space
tmp2[$0] = FILENAME
next
}
#append to variable with a space
tmp2[$0] = tmp2[$0] OFS FILENAME
}
# END rule(s)
END {
# loop on each element of the associative arrays and print them
for (elem in tmp) {
print tmp2[elem] OFS tmp[elem] OFS elem
}
}
可以将if...next...
替换为(length(tmp2[$0]) == 0 ? tmp2[$0] = FILENAME : tmp2[$0] = tmp2[$0] OFS FILENAME)
,将awk
脚本简化为:
{
tmp[$0]++
(length(tmp2[$0]) == 0 ? tmp2[$0] = FILENAME : tmp2[$0] = tmp2[$0] OFS FILENAME)
}
END {
for (elem in tmp) {
print tmp2[elem] OFS tmp[elem] OFS elem
}
}
能否请您尝试以下操作?这应该会按输入行在 Input_file 中出现的顺序输出结果。我使用 gsub(/[[:space:]]+$/,"")
是因为你的 Input_file 在行尾有空格,所以在这里将其删除;如果你的文件不是这种情况,可以去掉这一语句。
awk '
{
gsub(/[[:space:]]+$/,"")
}
!a[$0]++{
b[++count]=$0
}
{
c[$0]++
d[$0]=d[$0]?d[$0] OFS FILENAME:FILENAME
}
END{
for(i=1;i<=count;i++){
print d[b[i]]"|"c[b[i]],b[i]
}
}' test1 test2 test3
输出如下。
test1 test2|2 1,144931087,144931087,T,C
test1 test2|2 16,89017167,89017167,C,G
test1|1 17,7330235,7330235,G,T
test1 test2 test3|3 17,10222478,10222478,C,T
使用 Perl 的另一个答案。
> cat file1m.csv
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,7330235,7330235,G,T
17,10222478,10222478,C,T
> cat file2m.csv
1,144931087,144931087,T,C
16,89017167,89017167,C,G
17,10222478,10222478,C,T
> cat file3m.csv
17,10222478,10222478,C,T
> cat uniq_perl.ksh
perl -lne '
@t=@{ $kvf{$_} };
if( not $ARGV ~~ @t ) { push(@t,$ARGV); $kvf{$_}=[ @t ] ; }
close(ARGV) if eof;
END { for(keys %kvf) { @x=@{$kvf{$_}}; print join(" ",@x)." ".scalar(@x)." ".$_ } }
' file*m*csv
> ./uniq_perl.ksh
file1m.csv file2m.csv file3m.csv 3 17,10222478,10222478,C,T
file1m.csv 1 17,7330235,7330235,G,T
file1m.csv file2m.csv 2 1,144931087,144931087,T,C
file1m.csv file2m.csv 2 16,89017167,89017167,C,G
>