如何使用 Bash 生成文件中所有可能的行组合?
How to generate all possible combinations of lines in a file using Bash?
我有文件“test.txt”(任意行数):
$ cat test.txt
A
B
C
我想找到一个 bash 代码来生成具有 n 元素的所有可能组合,其中 n >= 2、从所有元素开始(即行数,X),使得n=X,n=X-1,n=X-2,n=X-3,。 .., n = 2,在上面的例子中是:
A,B,C
A,B
A,C
B,C
有什么建议吗?
非常感谢!
我们可以使用以下 awk 来获取所有行的每种可能组合:
awk 'NR==FNR { a[[=10=]]; next } { for (i in a) print i, [=10=] }' test.txt test.txt
然后,我们可以使用split
将每一行写入一个单独的文件:
awk 'NR==FNR { a[[=11=]]; next } { for (i in a) print i, [=11=] }' test.txt test.txt > tmp.txt
split -l 1 -d tmp.txt "test-"
rm tmp.txt
我本地机器上的示例:
$
$ cat test.txt
A
B
C
$
$ awk 'NR==FNR { a[[=12=]]; next } { for (i in a) print i, [=12=] }' test.txt test.txt > tmp.txt
$ split -l 1 -d tmp.txt "test-"
$ rm tmp.txt
$
$ tail -n +1 *
==> test-00 <==
A A
==> test-01 <==
B A
==> test-02 <==
C A
==> test-03 <==
A B
==> test-04 <==
B B
==> test-05 <==
C B
==> test-06 <==
A C
==> test-07 <==
B C
==> test-08 <==
C C
==> test.txt <==
A
B
C
$
一个bash递归函数:这不是最快的解决方案
all_combinations() {
(($# == 0)) && return
(IFS=,; echo "$*")
local i x
for ((i=0; i<$#; i++)); do
x=("$@")
unset 'x[i]'
"${FUNCNAME[0]}" "${x[@]}"
done
}
combinations() {
all_combinations "$@" |
grep , | # at least 2 element
sort -u | # remove duplicates
while IFS= read -r line; do # print number of elements
printf '%d\t%s\n' \
$(commas=${line//[^,]/}; echo ${#commas}) \
"$line"
done |
sort -k1,1nr -k2 | # sort by num + line
cut -f2- # remove num
}
mapfile -t lines < test.txt
combinations "${lines[@]}"
如果 test.txt 包含 4 行,则生成
A,B,C,D
A,B,C
A,B,D
A,C,D
B,C,D
A,B
A,C
A,D
B,C
B,D
C,D
假设:
- 字符串
A,B,C
被认为等同于 A,C,B
、B,A,C
、B,C,A
、C,A,B
和 C,B,A
(即,我们只需要生成这 6 种组合中的一种)
生成组合列表的一个想法...
- 我们会将行加载到数组中
- 对于数组中的每一项,我们将开始一组新的输出字符串
- 进行递归调用以将下一个数组项附加到我们的输出字符串
- 当我们将数组项附加到输出时,我们将继续并打印由 2 个或更多数组项组成的每个字符串
- 这更像是一种尾递归方法,应该消除重复项的生成(
A,B,C
与其他 5 个等效模式相比)and/or 或需要回滚“已经看到的”组合
实现此逻辑的一个 awk
想法:
awk '
# input params are current output string, current arr[] index, and current output length (ie, number of fields)
# parameter "i" will be treated as a local variable
function combo(output, j, combo_length, i) {
if ( combo_length >= 2) # print any combination with a length >= 2
print output
for (i=j+1; i<=n; i++) # loop through "rest" of array entries for next field in output
combo(output "," arr[i], i, combo_length+1 )
}
{ arr[NR]= } # load fields into array "arr[]"
END { n=length(arr)
for (i=1; i<=n; i++) # for each arr[i] start a new set of combos starting with arr[i]
combo(arr[i],i,1)
}
' test.txt
这会生成:
A,B
A,B,C
A,B,C,D
A,B,D
A,C
A,C,D
A,D
B,C
B,C,D
B,D
C,D
如果我们想根据字段数排序,然后输出字符串,我们可以进行以下更改:
- 将
print output
更改为 print combo_length, output
然后...
- 通过
sort | cut
管道输出 awk
(我们将在这里借用 glenn 的代码)
这会生成:
$ awk ' ... print combo_length,output ...' test.txt | sort -k1,1nr -k2 | cut -d" " -f2-
A,B,C,D
A,B,C
A,B,D
A,C,D
B,C,D
A,B
A,C
A,D
B,C
B,D
C,D
对于 20 行 test.txt
(字母 A
到 T
),输出转储到文件 test.out
:
$ time awk '...' test.txt > test.out
real 0m1.420s
user 0m1.279s
sys 0m0.139s
$ wc -l test.out
1048555 2097110 23685256 test.out
$ time awk '...' test.txt | sort ... | cut ... > test.out
real 0m3.456s
user 0m3.493s
sys 0m0.185s
$ wc test.out
1048555 1048555 20971480 test.out
使用二进制计数器技巧迭代所有子集...
$ awk '{a[NR]=}
END {for(i=0;i<2^NR;i++)
{printf "{";
for(j=0;j<NR;j++) printf "%s", and(i,2^j)?FS a[j+1]:"";
print " }"}}' file
{ }
{ A }
{ B }
{ A B }
{ C }
{ A C }
{ B C }
{ A B C }
{ D }
{ A D }
{ B D }
{ A B D }
{ C D }
{ A C D }
{ B C D }
{ A B C D }
以所需的格式,通过管道传输到另一个 awk 以过滤 n<2 个元素
awk -v FS=, '{a[NR]=}
END {for(i=0;i<2^NR;i++)
{s="";
for(j=0;j<NR;j++)
{e=and(i,2^j);
printf "%s", e?s a[j+1]:""; if(e)s=FS}
print comb}}' file |
awk -F, 'NF>1'
A,B
A,C
B,C
A,B,C
A,D
B,D
A,B,D
C,D
A,C,D
B,C,D
A,B,C,D
它是如何工作的?
所有组合都等同于给定元素的所有子集。这又可以用 0..2^n-1 枚举(标记)。
如果我们用二进制表示枚举计数器,则每个位置位都可以映射到完整集合的一个元素。因此,当 运行 所有子集的枚举时,我们可以创建一个特定的子集,其中包含为给定标签设置相应位的元素。
例如,对于 3 元素初始集 {A,B,C}。我们有枚举
0 0 0 -> no elements, empty subset -> { }
0 0 1 -> A bit is set -> { A }
0 1 0 -> B bit is set -> { B }
0 1 1 -> Both A and B bits are set -> { A B }
... etc
剩下的只是格式化。
此方法适用于生成所有组合,对于会减少选择的各种限制(例如恰好 3 个元素),这不是很有效。此外,由于 2^N,N 有一个上限。
重用 中的 get_combs()
函数:
$ cat tst.awk
###################
# Calculate all combinations of a set of strings, see
# https://rosettacode.org/wiki/Combinations#AWK
###################
function get_combs(A,B, i,n,comb) {
## Default value for r is to choose 2 from pool of all elements in A.
## Can alternatively be set on the command line:-
## awk -v r=<number of items being chosen> -f <scriptname>
n = length(A)
if (r=="") r = 2
comb = ""
for (i=1; i <= r; i++) { ## First combination of items:
indices[i] = i
comb = (i>1 ? comb OFS : "") A[indices[i]]
}
B[comb]
## While 1st item is less than its maximum permitted value...
while (indices[1] < n - r + 1) {
## loop backwards through all items in the previous
## combination of items until an item is found that is
## less than its maximum permitted value:
for (i = r; i >= 1; i--) {
## If the equivalently positioned item in the
## previous combination of items is less than its
## maximum permitted value...
if (indices[i] < n - r + i) {
## increment the current item by 1:
indices[i]++
## Save the current position-index for use
## outside this "for" loop:
p = i
break}}
## Put consecutive numbers in the remainder of the array,
## counting up from position-index p.
for (i = p + 1; i <= r; i++) indices[i] = indices[i - 1] + 1
## Print the current combination of items:
comb = ""
for (i=1; i <= r; i++) {
comb = (i>1 ? comb OFS : "") A[indices[i]]
}
B[comb]
}
}
# Input should be a list of strings
{ A[NR] = [=10=] }
END {
OFS = ","
for (r=NR; r>=2; r--) {
delete B
get_combs(A,B)
PROCINFO["sorted_in"] = "@ind_str_asc"
for (comb in B) {
print comb
}
}
}
$ awk -f tst.awk test.txt
A,B,C
A,B
A,C
B,C
我有文件“test.txt”(任意行数):
$ cat test.txt
A
B
C
我想找到一个 bash 代码来生成具有 n 元素的所有可能组合,其中 n >= 2、从所有元素开始(即行数,X),使得n=X,n=X-1,n=X-2,n=X-3,。 .., n = 2,在上面的例子中是:
A,B,C
A,B
A,C
B,C
有什么建议吗? 非常感谢!
我们可以使用以下 awk 来获取所有行的每种可能组合:
awk 'NR==FNR { a[[=10=]]; next } { for (i in a) print i, [=10=] }' test.txt test.txt
然后,我们可以使用split
将每一行写入一个单独的文件:
awk 'NR==FNR { a[[=11=]]; next } { for (i in a) print i, [=11=] }' test.txt test.txt > tmp.txt
split -l 1 -d tmp.txt "test-"
rm tmp.txt
我本地机器上的示例:
$
$ cat test.txt
A
B
C
$
$ awk 'NR==FNR { a[[=12=]]; next } { for (i in a) print i, [=12=] }' test.txt test.txt > tmp.txt
$ split -l 1 -d tmp.txt "test-"
$ rm tmp.txt
$
$ tail -n +1 *
==> test-00 <==
A A
==> test-01 <==
B A
==> test-02 <==
C A
==> test-03 <==
A B
==> test-04 <==
B B
==> test-05 <==
C B
==> test-06 <==
A C
==> test-07 <==
B C
==> test-08 <==
C C
==> test.txt <==
A
B
C
$
一个bash递归函数:这不是最快的解决方案
all_combinations() {
(($# == 0)) && return
(IFS=,; echo "$*")
local i x
for ((i=0; i<$#; i++)); do
x=("$@")
unset 'x[i]'
"${FUNCNAME[0]}" "${x[@]}"
done
}
combinations() {
all_combinations "$@" |
grep , | # at least 2 element
sort -u | # remove duplicates
while IFS= read -r line; do # print number of elements
printf '%d\t%s\n' \
$(commas=${line//[^,]/}; echo ${#commas}) \
"$line"
done |
sort -k1,1nr -k2 | # sort by num + line
cut -f2- # remove num
}
mapfile -t lines < test.txt
combinations "${lines[@]}"
如果 test.txt 包含 4 行,则生成
A,B,C,D
A,B,C
A,B,D
A,C,D
B,C,D
A,B
A,C
A,D
B,C
B,D
C,D
假设:
- 字符串
A,B,C
被认为等同于A,C,B
、B,A,C
、B,C,A
、C,A,B
和C,B,A
(即,我们只需要生成这 6 种组合中的一种)
生成组合列表的一个想法...
- 我们会将行加载到数组中
- 对于数组中的每一项,我们将开始一组新的输出字符串
- 进行递归调用以将下一个数组项附加到我们的输出字符串
- 当我们将数组项附加到输出时,我们将继续并打印由 2 个或更多数组项组成的每个字符串
- 这更像是一种尾递归方法,应该消除重复项的生成(
A,B,C
与其他 5 个等效模式相比)and/or 或需要回滚“已经看到的”组合
实现此逻辑的一个 awk
想法:
awk '
# input params are current output string, current arr[] index, and current output length (ie, number of fields)
# parameter "i" will be treated as a local variable
function combo(output, j, combo_length, i) {
if ( combo_length >= 2) # print any combination with a length >= 2
print output
for (i=j+1; i<=n; i++) # loop through "rest" of array entries for next field in output
combo(output "," arr[i], i, combo_length+1 )
}
{ arr[NR]= } # load fields into array "arr[]"
END { n=length(arr)
for (i=1; i<=n; i++) # for each arr[i] start a new set of combos starting with arr[i]
combo(arr[i],i,1)
}
' test.txt
这会生成:
A,B
A,B,C
A,B,C,D
A,B,D
A,C
A,C,D
A,D
B,C
B,C,D
B,D
C,D
如果我们想根据字段数排序,然后输出字符串,我们可以进行以下更改:
- 将
print output
更改为print combo_length, output
然后... - 通过
sort | cut
管道输出awk
(我们将在这里借用 glenn 的代码)
这会生成:
$ awk ' ... print combo_length,output ...' test.txt | sort -k1,1nr -k2 | cut -d" " -f2-
A,B,C,D
A,B,C
A,B,D
A,C,D
B,C,D
A,B
A,C
A,D
B,C
B,D
C,D
对于 20 行 test.txt
(字母 A
到 T
),输出转储到文件 test.out
:
$ time awk '...' test.txt > test.out
real 0m1.420s
user 0m1.279s
sys 0m0.139s
$ wc -l test.out
1048555 2097110 23685256 test.out
$ time awk '...' test.txt | sort ... | cut ... > test.out
real 0m3.456s
user 0m3.493s
sys 0m0.185s
$ wc test.out
1048555 1048555 20971480 test.out
使用二进制计数器技巧迭代所有子集...
$ awk '{a[NR]=}
END {for(i=0;i<2^NR;i++)
{printf "{";
for(j=0;j<NR;j++) printf "%s", and(i,2^j)?FS a[j+1]:"";
print " }"}}' file
{ }
{ A }
{ B }
{ A B }
{ C }
{ A C }
{ B C }
{ A B C }
{ D }
{ A D }
{ B D }
{ A B D }
{ C D }
{ A C D }
{ B C D }
{ A B C D }
以所需的格式,通过管道传输到另一个 awk 以过滤 n<2 个元素
awk -v FS=, '{a[NR]=}
END {for(i=0;i<2^NR;i++)
{s="";
for(j=0;j<NR;j++)
{e=and(i,2^j);
printf "%s", e?s a[j+1]:""; if(e)s=FS}
print comb}}' file |
awk -F, 'NF>1'
A,B
A,C
B,C
A,B,C
A,D
B,D
A,B,D
C,D
A,C,D
B,C,D
A,B,C,D
它是如何工作的?
所有组合都等同于给定元素的所有子集。这又可以用 0..2^n-1 枚举(标记)。 如果我们用二进制表示枚举计数器,则每个位置位都可以映射到完整集合的一个元素。因此,当 运行 所有子集的枚举时,我们可以创建一个特定的子集,其中包含为给定标签设置相应位的元素。
例如,对于 3 元素初始集 {A,B,C}。我们有枚举
0 0 0 -> no elements, empty subset -> { }
0 0 1 -> A bit is set -> { A }
0 1 0 -> B bit is set -> { B }
0 1 1 -> Both A and B bits are set -> { A B }
... etc
剩下的只是格式化。
此方法适用于生成所有组合,对于会减少选择的各种限制(例如恰好 3 个元素),这不是很有效。此外,由于 2^N,N 有一个上限。
重用 get_combs()
函数:
$ cat tst.awk
###################
# Calculate all combinations of a set of strings, see
# https://rosettacode.org/wiki/Combinations#AWK
###################
function get_combs(A,B, i,n,comb) {
## Default value for r is to choose 2 from pool of all elements in A.
## Can alternatively be set on the command line:-
## awk -v r=<number of items being chosen> -f <scriptname>
n = length(A)
if (r=="") r = 2
comb = ""
for (i=1; i <= r; i++) { ## First combination of items:
indices[i] = i
comb = (i>1 ? comb OFS : "") A[indices[i]]
}
B[comb]
## While 1st item is less than its maximum permitted value...
while (indices[1] < n - r + 1) {
## loop backwards through all items in the previous
## combination of items until an item is found that is
## less than its maximum permitted value:
for (i = r; i >= 1; i--) {
## If the equivalently positioned item in the
## previous combination of items is less than its
## maximum permitted value...
if (indices[i] < n - r + i) {
## increment the current item by 1:
indices[i]++
## Save the current position-index for use
## outside this "for" loop:
p = i
break}}
## Put consecutive numbers in the remainder of the array,
## counting up from position-index p.
for (i = p + 1; i <= r; i++) indices[i] = indices[i - 1] + 1
## Print the current combination of items:
comb = ""
for (i=1; i <= r; i++) {
comb = (i>1 ? comb OFS : "") A[indices[i]]
}
B[comb]
}
}
# Input should be a list of strings
{ A[NR] = [=10=] }
END {
OFS = ","
for (r=NR; r>=2; r--) {
delete B
get_combs(A,B)
PROCINFO["sorted_in"] = "@ind_str_asc"
for (comb in B) {
print comb
}
}
}
$ awk -f tst.awk test.txt
A,B,C
A,B
A,C
B,C