如何使用 Bash 生成文件中所有可能的行组合?

How to generate all possible combinations of lines in a file using Bash?

我有文件“test.txt”(任意行数):

$ cat test.txt
A
B
C

我想找到一个 bash 代码来生成具有 n 元素的所有可能组合,其中 n >= 2、从所有元素开始(即行数,X),使得n=X,n=X-1,n=X-2,n=X-3,。 .., n = 2,在上面的例子中是:

A,B,C
A,B
A,C
B,C

有什么建议吗? 非常感谢!

我们可以使用以下 来获取所有行的每种可能组合:

awk 'NR==FNR { a[[=10=]]; next } { for (i in a) print i, [=10=] }' test.txt test.txt

然后,我们可以使用split将每一行写入一个单独的文件:

awk 'NR==FNR { a[[=11=]]; next } { for (i in a) print i, [=11=] }' test.txt test.txt > tmp.txt
split -l 1 -d tmp.txt "test-"
rm tmp.txt

我本地机器上的示例:

$
$ cat test.txt
A
B
C
$
$ awk 'NR==FNR { a[[=12=]]; next } { for (i in a) print i, [=12=] }' test.txt test.txt > tmp.txt
$ split -l 1 -d tmp.txt "test-"
$ rm tmp.txt
$
$ tail -n +1 *
==> test-00 <==
A A

==> test-01 <==
B A

==> test-02 <==
C A

==> test-03 <==
A B

==> test-04 <==
B B

==> test-05 <==
C B

==> test-06 <==
A C

==> test-07 <==
B C

==> test-08 <==
C C

==> test.txt <==
A
B
C
$

一个bash递归函数:这不是最快的解决方案

all_combinations() {
    (($# == 0)) && return
    (IFS=,; echo "$*")
    local i x
    for ((i=0; i<$#; i++)); do 
        x=("$@")
        unset 'x[i]'
        "${FUNCNAME[0]}" "${x[@]}"
    done
}

combinations() {
    all_combinations "$@" |
      grep , |                      # at least 2 element
      sort -u |                     # remove duplicates
      while IFS= read -r line; do   # print number of elements
          printf '%d\t%s\n' \
            $(commas=${line//[^,]/}; echo ${#commas}) \
            "$line"
      done |
      sort -k1,1nr -k2 |            # sort by num + line
      cut -f2-                      # remove num
}

mapfile -t lines < test.txt
combinations "${lines[@]}"

如果 test.txt 包含 4 行,则生成

A,B,C,D
A,B,C
A,B,D
A,C,D
B,C,D
A,B
A,C
A,D
B,C
B,D
C,D

假设:

  • 字符串 A,B,C 被认为等同于 A,C,BB,A,CB,C,AC,A,BC,B,A(即,我们只需要生成这 6 种组合中的一种)

生成组合列表的一个想法...

  • 我们会将行加载到数组中
  • 对于数组中的每一项,我们将开始一组新的输出字符串
  • 进行递归调用以将下一个数组项附加到我们的输出字符串
  • 当我们将数组项附加到输出时,我们将继续并打印由 2 个或更多数组项组成的每个字符串
  • 这更像是一种尾递归方法,应该消除重复项的生成(A,B,C 与其他 5 个等效模式相比)and/or 或需要回滚“已经看到的”组合

实现此逻辑的一个 awk 想法:

awk '

# input params are current output string, current arr[] index, and current output length (ie, number of fields)
# parameter "i" will be treated as a local variable

function combo(output, j, combo_length,  i) {

    if ( combo_length >= 2)           # print any combination with a length >= 2
       print output

    for (i=j+1; i<=n; i++)            # loop through "rest" of array entries for next field in output
        combo(output "," arr[i], i, combo_length+1 )
}

    { arr[NR]= }                     # load fields into array "arr[]"

END { n=length(arr)

      for (i=1; i<=n; i++)             # for each arr[i] start a new set of combos starting with arr[i]
          combo(arr[i],i,1)
    }
' test.txt

这会生成:

A,B
A,B,C
A,B,C,D
A,B,D
A,C
A,C,D
A,D
B,C
B,C,D
B,D
C,D

如果我们想根据字段数排序,然后输出字符串,我们可以进行以下更改:

  • print output 更改为 print combo_length, output 然后...
  • 通过 sort | cut 管道输出 awk(我们将在这里借用 glenn 的代码)

这会生成:

$ awk ' ... print combo_length,output ...' test.txt |  sort -k1,1nr -k2 | cut -d" " -f2-

A,B,C,D
A,B,C
A,B,D
A,C,D
B,C,D
A,B
A,C
A,D
B,C
B,D
C,D


对于 20 行 test.txt(字母 AT),输出转储到文件 test.out:

$ time awk '...' test.txt > test.out

real    0m1.420s
user    0m1.279s
sys     0m0.139s

$ wc -l test.out
 1048555  2097110 23685256 test.out

$ time awk '...' test.txt | sort ... | cut ... > test.out

real    0m3.456s
user    0m3.493s
sys     0m0.185s

$ wc test.out
 1048555  1048555 20971480 test.out

使用二进制计数器技巧迭代所有子集...

$ awk '{a[NR]=}
   END {for(i=0;i<2^NR;i++)
          {printf "{";
           for(j=0;j<NR;j++) printf "%s", and(i,2^j)?FS a[j+1]:"";
           print " }"}}' file


{ }
{ A }
{ B }
{ A B }
{ C }
{ A C }
{ B C }
{ A B C }
{ D }
{ A D }
{ B D }
{ A B D }
{ C D }
{ A C D }
{ B C D }
{ A B C D }

以所需的格式,通过管道传输到另一个 awk 以过滤 n<2 个元素

awk -v FS=, '{a[NR]=}
         END {for(i=0;i<2^NR;i++)
             {s=""; 
              for(j=0;j<NR;j++) 
                {e=and(i,2^j); 
                 printf "%s", e?s a[j+1]:""; if(e)s=FS}
              print comb}}' file | 
awk -F, 'NF>1'


A,B
A,C
B,C
A,B,C
A,D
B,D
A,B,D
C,D
A,C,D
B,C,D
A,B,C,D

它是如何工作的?

所有组合都等同于给定元素的所有子集。这又可以用 0..2^n-1 枚举(标记)。 如果我们用二进制表示枚举计数器,则每个位置位都可以映射到完整集合的一个元素。因此,当 运行 所有子集的枚举时,我们可以创建一个特定的子集,其中包含为给定标签设置相应位的元素。

例如,对于 3 元素初始集 {A,B,C}。我们有枚举

0 0 0   -> no elements, empty subset  -> { }
0 0 1   -> A bit is set               -> { A }
0 1 0   -> B bit is set               -> { B }
0 1 1   -> Both A and B bits are set  -> { A B }
... etc

剩下的只是格式化。

此方法适用于生成所有组合,对于会减少选择的各种限制(例如恰好 3 个元素),这不是很有效。此外,由于 2^N,N 有一个上限。

重用 中的 get_combs() 函数:

$ cat tst.awk
###################
# Calculate all combinations of a set of strings, see
# https://rosettacode.org/wiki/Combinations#AWK
###################

function get_combs(A,B, i,n,comb) {
    ## Default value for r is to choose 2 from pool of all elements in A.
    ## Can alternatively be set on the command line:-
    ##    awk -v r=<number of items being chosen> -f <scriptname>
    n = length(A)
    if (r=="") r = 2

    comb = ""
    for (i=1; i <= r; i++) { ## First combination of items:
        indices[i] = i
        comb = (i>1 ? comb OFS : "") A[indices[i]]
    }
    B[comb]

    ## While 1st item is less than its maximum permitted value...
    while (indices[1] < n - r + 1) {
        ## loop backwards through all items in the previous
        ## combination of items until an item is found that is
        ## less than its maximum permitted value:
        for (i = r; i >= 1; i--) {
            ## If the equivalently positioned item in the
            ## previous combination of items is less than its
            ## maximum permitted value...
            if (indices[i] < n - r + i) {
                ## increment the current item by 1:
                indices[i]++
                ## Save the current position-index for use
                ## outside this "for" loop:
                p = i
                break}}
        ## Put consecutive numbers in the remainder of the array,
        ## counting up from position-index p.
        for (i = p + 1; i <= r; i++) indices[i] = indices[i - 1] + 1

        ## Print the current combination of items:
        comb = ""
        for (i=1; i <= r; i++) {
            comb = (i>1 ? comb OFS : "") A[indices[i]]
        }
        B[comb]
    }
}

# Input should be a list of strings
{ A[NR] = [=10=] }
END {
    OFS = ","
    for (r=NR; r>=2; r--) {
        delete B
        get_combs(A,B)
        PROCINFO["sorted_in"] = "@ind_str_asc"
        for (comb in B) {
            print comb
        }
    }
}

$ awk -f tst.awk test.txt
A,B,C
A,B
A,C
B,C