Perl:用每个选项替换不明确的字母

Perl: replace ambiguous letters with each option

我正在尝试执行以下应该很容易的任务,但我无法理解它:

我有两个序列的字母(核苷酸碱基)可能不明确。我想重写每个序列的所有可能性...

例如第一个序列是:

 CAGCMGCCGCGGTAAYWC

其中包含M、Y、W,分别可以是[A,C]、[C,T]、[A,T]。因此,上面的序列应该改写为以下可能性:

 CAGCAGCCGCGGTAACAC
 CAGCAGCCGCGGTAACTC
 CAGCAGCCGCGGTAATAC
 CAGCAGCCGCGGTAATTC
 CAGCCGCCGCGGTAACAC
 CAGCCGCCGCGGTAACTC
 CAGCCGCCGCGGTAATAC
 CAGCCGCCGCGGTAATTC

到目前为止我有 MWE:

#!/usr/bin/perl -w
use warnings;
use strict;

my %seqs = (
    "PrefixPE/1"  => "CAGCMGCCGCGGTAAYWC",
    "PrefixPE/2" => "BSCCCGYCAATTYMTKTRAGT"
);

my %ops;
$ops{"R"}{"A"}="";
$ops{"R"}{"G"}="";
$ops{"Y"}{"C"}="";
$ops{"Y"}{"T"}="";
$ops{"M"}{"A"}="";
$ops{"M"}{"C"}="";
$ops{"K"}{"G"}="";
$ops{"K"}{"T"}="";
$ops{"W"}{"A"}="";
$ops{"W"}{"T"}="";
$ops{"B"}{"C"}="";
$ops{"B"}{"G"}="";
$ops{"B"}{"T"}="";
$ops{"S"}{"C"}="";
$ops{"S"}{"G"}="";

foreach my $id(keys %seqs){
    my $seq=$seqs{$id};
    my @nts=(split '', $seq);
    my $i=0;
    foreach my $n(@nts){
        $i++;
        if (exists $ops{$n}){
            my $j=0;
            foreach my $o(keys %{$ops{$n}}){
                $j++;
                print "$id, pos $i\.$j = <$o>\n";
            }
        }
        else{
            print "$id, pos $i = <$n>\n";
        }
    }
}

对于每个字母,扩展可能的序列集。

#!/usr/bin/perl
use warnings;
use strict;
use feature qw{ say };
use Syntax::Construct qw{ /r };

my %ambiguous = ( M => [qw[ A C ]],
                  Y => [qw[ C T ]],
                  W => [qw[ A T ]],
                );

my $string = 'CAGCMGCCGCGGTAAYWC';
my $is_ambiguous = '[' . (join q(), keys %ambiguous) . ']';

my @strings = $string;
while ($strings[0] =~ $is_ambiguous) {
    my ($letter) = $strings[0] =~ /($is_ambiguous)/;
    @strings = map {
        my $s = $_;
        map $s =~ s/$letter/$_/r, @{ $ambiguous{$letter} }
        # map { (my $x = $s) =~ s/$letter/$_/; $x } @{ $ambiguous{$letter} }
    } @strings;
}

say for @strings;

在 5.14 之前的 Perl 上,使用注释行而不是上面的行,并删除 Syntax::Construct.

这是使用递归的解决方案:

use feature qw(say);
use strict;
use warnings;

my %seqs = (
    "PrefixPE/1"  => "CAGCMGCCGCGGTAAYWC",
    "PrefixPE/2" => "BSCCCGYCAATTYMTKTRAGT"
);

my %ops = (
    R => 'AG',
    Y => 'CT',
    M => 'AC',
    K => 'GT',
    W => 'AT',
    B => 'CGT',
    S => 'CG',
);

$ops{$_} = [ split //, $ops{$_} ] for keys %ops;

my $perm = GenPermutations->new( \%ops  );
for my $id (keys %seqs) {
    my $seq = $seqs{$id};
    $perm->gen( $seq );
    $perm->print_result();
}
exit;

package GenPermutations;

sub new {
    my ( $class, $ops ) = @_;
    my ($pat) = map qr/$_/, '[' . (join '', keys %$ops) . ']';
    my $info = { ops => $ops, pat => $pat, pos => [], result => [], seq => undef }; 
    return bless $info, $class;
}

sub _init {
    my ( $self, $seq ) = @_;
    @{ $self->{pos} } = ();
    @{ $self->{result} } = ();
    $self->{seq} = $seq;
    while ( $seq =~ /($self->{pat})/g ) {
        push @{ $self->{pos} }, [$-[1], ];
    }
}

sub print_result {
    my ( $self ) = @_;

    say $self->{seq} . ' : found ' . (scalar @{ $self->{result} }) . ' permutations : ';
    say "  $_" for @{ $self->{result} };
    say "";
}

sub gen {
    my ( $self, $seq ) = @_;
    $self->_init( $seq );
    $self->_gen( $seq, 0 );
}

sub _gen {
    my ( $self, $str, $pos_index ) = @_;

    if ( $pos_index > $#{$self->{pos}} ) {
        push @{ $self->{result} }, $str;
        return;
    }

    my $info = $self->{pos}[$pos_index];
    my ( $index, $letter) = @$info;
    $pos_index++;
    for my $replace ( @{ $self->{ops}{$letter} } ) {
        my $temp = $str;
        substr $temp, $index, 1, $replace; 
        $self->_gen( $temp, $pos_index );
    }
}

1;