如何使用 Bash 生成文件中所有可能的行组合？

Question

我有文件“test.txt”（任意行数）：

$ cat test.txt
A
B
C

我想找到一个 bash 代码来生成具有 n 元素的所有可能组合，其中 n >= 2、从所有元素开始（即行数，X），使得n=X，n=X-1，n=X-2，n=X-3，。 .., n = 2，在上面的例子中是：

A,B,C
A,B
A,C
B,C

有什么建议吗？非常感谢！

Answer 1

我们可以使用以下 awk 来获取所有行的每种可能组合：

awk 'NR==FNR { a[[=10=]]; next } { for (i in a) print i, [=10=] }' test.txt test.txt

然后，我们可以使用split将每一行写入一个单独的文件：

awk 'NR==FNR { a[[=11=]]; next } { for (i in a) print i, [=11=] }' test.txt test.txt > tmp.txt
split -l 1 -d tmp.txt "test-"
rm tmp.txt

我本地机器上的示例：

$
$ cat test.txt
A
B
C
$
$ awk 'NR==FNR { a[[=12=]]; next } { for (i in a) print i, [=12=] }' test.txt test.txt > tmp.txt
$ split -l 1 -d tmp.txt "test-"
$ rm tmp.txt
$
$ tail -n +1 *
==> test-00 <==
A A

==> test-01 <==
B A

==> test-02 <==
C A

==> test-03 <==
A B

==> test-04 <==
B B

==> test-05 <==
C B

==> test-06 <==
A C

==> test-07 <==
B C

==> test-08 <==
C C

==> test.txt <==
A
B
C
$

Answer 2

一个bash递归函数：这不是最快的解决方案

all_combinations() {
    (($# == 0)) && return
    (IFS=,; echo "$*")
    local i x
    for ((i=0; i<$#; i++)); do 
        x=("$@")
        unset 'x[i]'
        "${FUNCNAME[0]}" "${x[@]}"
    done
}

combinations() {
    all_combinations "$@" |
      grep , |                      # at least 2 element
      sort -u |                     # remove duplicates
      while IFS= read -r line; do   # print number of elements
          printf '%d\t%s\n' \
            $(commas=${line//[^,]/}; echo ${#commas}) \
            "$line"
      done |
      sort -k1,1nr -k2 |            # sort by num + line
      cut -f2-                      # remove num
}

mapfile -t lines < test.txt
combinations "${lines[@]}"

如果 test.txt 包含 4 行，则生成

A,B,C,D
A,B,C
A,B,D
A,C,D
B,C,D
A,B
A,C
A,D
B,C
B,D
C,D

Answer 3

假设：

字符串 A,B,C 被认为等同于 A,C,B、B,A,C、B,C,A、C,A,B 和 C,B,A（即，我们只需要生成这 6 种组合中的一种）

生成组合列表的一个想法...

我们会将行加载到数组中
对于数组中的每一项，我们将开始一组新的输出字符串
进行递归调用以将下一个数组项附加到我们的输出字符串
当我们将数组项附加到输出时，我们将继续并打印由 2 个或更多数组项组成的每个字符串
这更像是一种尾递归方法，应该消除重复项的生成（A,B,C 与其他 5 个等效模式相比）and/or 或需要回滚“已经看到的”组合

实现此逻辑的一个 awk 想法：

awk '

# input params are current output string, current arr[] index, and current output length (ie, number of fields)
# parameter "i" will be treated as a local variable

function combo(output, j, combo_length,  i) {

    if ( combo_length >= 2)           # print any combination with a length >= 2
       print output

    for (i=j+1; i<=n; i++)            # loop through "rest" of array entries for next field in output
        combo(output "," arr[i], i, combo_length+1 )
}

    { arr[NR]= }                     # load fields into array "arr[]"

END { n=length(arr)

      for (i=1; i<=n; i++)             # for each arr[i] start a new set of combos starting with arr[i]
          combo(arr[i],i,1)
    }
' test.txt

这会生成：

A,B
A,B,C
A,B,C,D
A,B,D
A,C
A,C,D
A,D
B,C
B,C,D
B,D
C,D

如果我们想根据字段数排序，然后输出字符串，我们可以进行以下更改：

将 print output 更改为 print combo_length, output 然后...
通过 sort | cut 管道输出 awk（我们将在这里借用 glenn 的代码）

这会生成：

$ awk ' ... print combo_length,output ...' test.txt |  sort -k1,1nr -k2 | cut -d" " -f2-

A,B,C,D
A,B,C
A,B,D
A,C,D
B,C,D
A,B
A,C
A,D
B,C
B,D
C,D

对于 20 行 test.txt（字母 A 到 T），输出转储到文件 test.out:

$ time awk '...' test.txt > test.out

real    0m1.420s
user    0m1.279s
sys     0m0.139s

$ wc -l test.out
 1048555  2097110 23685256 test.out

$ time awk '...' test.txt | sort ... | cut ... > test.out

real    0m3.456s
user    0m3.493s
sys     0m0.185s

$ wc test.out
 1048555  1048555 20971480 test.out

Answer 4

使用二进制计数器技巧迭代所有子集...

$ awk '{a[NR]=}
   END {for(i=0;i<2^NR;i++)
          {printf "{";
           for(j=0;j<NR;j++) printf "%s", and(i,2^j)?FS a[j+1]:"";
           print " }"}}' file


{ }
{ A }
{ B }
{ A B }
{ C }
{ A C }
{ B C }
{ A B C }
{ D }
{ A D }
{ B D }
{ A B D }
{ C D }
{ A C D }
{ B C D }
{ A B C D }

以所需的格式，通过管道传输到另一个 awk 以过滤 n<2 个元素

awk -v FS=, '{a[NR]=}
         END {for(i=0;i<2^NR;i++)
             {s=""; 
              for(j=0;j<NR;j++) 
                {e=and(i,2^j); 
                 printf "%s", e?s a[j+1]:""; if(e)s=FS}
              print comb}}' file | 
awk -F, 'NF>1'


A,B
A,C
B,C
A,B,C
A,D
B,D
A,B,D
C,D
A,C,D
B,C,D
A,B,C,D

它是如何工作的？

所有组合都等同于给定元素的所有子集。这又可以用 0..2^n-1 枚举（标记）。如果我们用二进制表示枚举计数器，则每个位置位都可以映射到完整集合的一个元素。因此，当运行所有子集的枚举时，我们可以创建一个特定的子集，其中包含为给定标签设置相应位的元素。

例如，对于 3 元素初始集 {A,B,C}。我们有枚举

0 0 0   -> no elements, empty subset  -> { }
0 0 1   -> A bit is set               -> { A }
0 1 0   -> B bit is set               -> { B }
0 1 1   -> Both A and B bits are set  -> { A B }
... etc

剩下的只是格式化。

此方法适用于生成所有组合，对于会减少选择的各种限制（例如恰好 3 个元素），这不是很有效。此外，由于 2^N，N 有一个上限。

Answer 5

重用中的 get_combs() 函数：

$ cat tst.awk
###################
# Calculate all combinations of a set of strings, see
# https://rosettacode.org/wiki/Combinations#AWK
###################

function get_combs(A,B, i,n,comb) {
    ## Default value for r is to choose 2 from pool of all elements in A.
    ## Can alternatively be set on the command line:-
    ##    awk -v r=<number of items being chosen> -f <scriptname>
    n = length(A)
    if (r=="") r = 2

    comb = ""
    for (i=1; i <= r; i++) { ## First combination of items:
        indices[i] = i
        comb = (i>1 ? comb OFS : "") A[indices[i]]
    }
    B[comb]

    ## While 1st item is less than its maximum permitted value...
    while (indices[1] < n - r + 1) {
        ## loop backwards through all items in the previous
        ## combination of items until an item is found that is
        ## less than its maximum permitted value:
        for (i = r; i >= 1; i--) {
            ## If the equivalently positioned item in the
            ## previous combination of items is less than its
            ## maximum permitted value...
            if (indices[i] < n - r + i) {
                ## increment the current item by 1:
                indices[i]++
                ## Save the current position-index for use
                ## outside this "for" loop:
                p = i
                break}}
        ## Put consecutive numbers in the remainder of the array,
        ## counting up from position-index p.
        for (i = p + 1; i <= r; i++) indices[i] = indices[i - 1] + 1

        ## Print the current combination of items:
        comb = ""
        for (i=1; i <= r; i++) {
            comb = (i>1 ? comb OFS : "") A[indices[i]]
        }
        B[comb]
    }
}

# Input should be a list of strings
{ A[NR] = [=10=] }
END {
    OFS = ","
    for (r=NR; r>=2; r--) {
        delete B
        get_combs(A,B)
        PROCINFO["sorted_in"] = "@ind_str_asc"
        for (comb in B) {
            print comb
        }
    }
}

$ awk -f tst.awk test.txt
A,B,C
A,B
A,C
B,C

如何使用 Bash 生成文件中所有可能的行组合？

How to generate all possible combinations of lines in a file using Bash?

bash

recursion

awk

combinations

split

它是如何工作的？