我想将 file1 列(1、2、4、5)映射到 file2 列(1、2、4、5)。第 5 列可能包含不同顺序的逗号分隔字符(A、T、G、C)

I want to map file1 column(1,2,4,5) to file2 column(1,2,4,5). 5th columns may contain comma separated characters (A,T,G,C) with different orders

文件 1

chr1 123896 rs0987522 A T
chr5 678452 rs8733521 G C,A

文件 2

chr1 123896 rs0987522 A T,C,G
chr5 678452 rs8733521 G A,T

输出

chr1 123896 rs0987522 A T*,C,G
chr5 678452 rs8733521 G C!,A*,T

如果 file1 的第 4 列与 file2 的第 4 列匹配的任何字符 (A==>> T,A,G,C),则 file2 中的行应按上面给出的输出和来自的匹配字符打印file1(此处为第 1 行 column5 中的 T)应由“”(T)表示,file1 中的不匹配字符应由“!”表示(C!).

# Print the rows of file2 whose columns 1, 2 and 4 also occur together in File1.
awk 'NR==FNR{firstfile[$1,$2,$4];next} ($1,$2,$4) in firstfile' File1 file2

我使用过 awk,但仅用于 (1,2,4) 列。请帮助进行第 5 次匹配。 第 5 列逗号分隔的字符在文件 1 和文件 2 中的顺序可能不同。

您可以使用此 awk,它存储 </code>,密钥为 <code>(,,)。在处理 file2 时,它用逗号拆分存储的值,并通过附加 * 来替换每个逗号分隔的值。如果未找到元素(即 sub returns 0),则我们通过添加 !,:

来为每个值添加前缀
awk 'NR==FNR {
   # First file (file1): remember column 5, keyed by columns 1, 2 and 4.
   map[$1,$2,$4] = $5
   next
}
# Second file (file2): only rows whose key was seen in file1 are rewritten.
($1,$2,$4) in map {
    # Break the stored file1 column 5 into its comma separated values.
    n = split(map[$1,$2,$4], a, /,/)
    for (i=1; i<=n; ++i)
       # sub() returns the number of replacements: 1 when the value occurs
       # in column 5 of this row (it is then suffixed with "*"), 0 when not.
       if (sub(a[i], "&*", $5) == 0)
           # Value exists only in file1: put it in front, flagged with "!".
           $5 = a[i] "!," $5
} 1' file1 file2

chr1 123896 rs0987522 A T*,C,G
chr5 678452 rs8733521 G C!,A*,T

使用您展示的示例,请尝试执行以下 awk 程序。

awk '
# First file on the command line (Input_file2): remember column 5,
# keyed by columns 1, 2 and 4.
FNR==NR{
  arr1[$1,$2,$4]=$5
  next
}
# Second file (Input_file1): only rows whose key was stored are rewritten.
# arr1[...] is looked up inside this guarded block on purpose: referencing a
# missing array element creates it, which would make the "in" test below
# succeed for every row and flag all values of unmatched rows with "!".
(($1,$2,$4) in arr1){
  val=""
  delete arr2;delete arr3;delete arr4;delete arr5
  # arr2: values stored from the first file, arr4: the same values as a set.
  num1=split(arr1[$1,$2,$4],arr2,",")
  for(i=1;i<=num1;i++){ arr4[arr2[i]] }
  # arr3: values of the last field of the current row.
  num2=split($NF,arr3,",")
  # Current row values come first: "*" when also stored, "!" otherwise.
  for(i=1;i<=num2;i++){
    val=(val?val ",":"")(arr3[i] in arr4?arr3[i]"*":arr3[i]"!")
    if(arr3[i] in arr4){ arr5[arr3[i]] }
  }
  # Then append the stored values that were not matched above, unmarked.
  for(i=1;i<=num1;i++){
    if(!(arr2[i] in arr5)){
      val=val "," arr2[i]
    }
  }
  $5=val
}
# Print every row, rewritten or not.
1
' Input_file2  Input_file1

说明: 为以上添加详细说明。

awk '                                               ##Starting awk program from here.
FNR==NR{                                            ##Checking FNR==NR which will be true when the first file (Input_file2) is being read.
  arr1[$1,$2,$4]=$5                                 ##Creating arr1 with index of 1st, 2nd and 4th field and value of 5th field.
  next                                              ##next will skip all further statements from here.
}
(($1,$2,$4) in arr1){                               ##Checking if $1,$2,$4 of current line are present in arr1 then do following.
  val=""                                            ##Nullifying val here.
  delete arr2;delete arr3;delete arr4;delete arr5   ##Deleting arrays here.
  num1=split(arr1[$1,$2,$4],arr2,",")               ##Splitting arr1 value into arr2; done only after the in test, since referencing a missing element would create it.
  for(i=1;i<=num1;i++){ arr4[arr2[i]] }             ##Running loop till num1, creating arr4 with value of arr2 index of i here.
  num2=split($NF,arr3,",")                          ##Splitting current line last field to arr3 with separator of comma here.
  for(i=1;i<=num2;i++){                             ##Running for loop till num2 here.
    val=(val?val ",":"")(arr3[i] in arr4?arr3[i]"*":arr3[i]"!") ##Creating val: each value of current line gets * if it is also in the stored values, else it gets !.
    if(arr3[i] in arr4){ arr5[arr3[i]] }            ##If arr3 value is present in arr4 then create arr5 with index of value of arr3 with index of i.
  }
  for(i=1;i<=num1;i++){                             ##Running loop till value of num1 here.
    if(!(arr2[i] in arr5)){                         ##If value of arr2 is NOT present in arr5(which holds the common values already added to val) then do following.
      val=val "," arr2[i]                           ##Append arr2 value to val.
    }
  }
  $5=val                                            ##Assign val to 5th field here.
}
1                                                   ##Printing edited/non-edited line here.
' Input_file2  Input_file1                          ##Mentioning Input_file names here.