awk:将具有唯一内容的行合并到具有相同 id 的每一行
awk: Merge rows with unique content to each row of same id
我有一个条目列表,每个条目都用换行符分隔,其中每个条目都是一组以逗号分隔的值。逗号分隔类似于每列的值。一些条目映射到相同的 id(每个条目中的第一个值)但具有不同的国家和地区。 (列表如下)
我正在尝试将共享相同 ID、但国家和地区可能不同的条目合并为单行条目:把关联的多个国家和/或地区追加到相应的列中,并用“%%%”序列彼此分隔。
到目前为止,我已经设法通过 awk 在某种程度上实现了这一点,但我的方法的副作用是我现在有重复的区域列。
因此,我不确定我是应该继续使用当前基于 awk 的方法还是分支到一个更具扩展性的解决方案来处理这项工作。
awk -F ',' -v OFS=',' '{x=$1;$1="";ctry[x]=ctry[x]$10"%%% ";$10="";a[x]=$0; str=a[x]; }END{for(x in a){outputString=("\n" x a[x] ctry[x]"\n"); gsub (/%%% \n/,"",outputString); print outputString}}' list.csv > final.csv
list.csv 的内容:
123123, Shelf Life Test,f,Other,066900,Germany,809900,Chem CMI,066900,Europe
123123, Shelf Life Test,f,Other,066900,Poland,810000,Chem CMI,066900,APAC
123123, Shelf Life Test,f,Other,066900,Spain,810100,Chem CMI,066900,APAC
123123, Shelf Life Test,f,Other,066900,France,2810200,Chem CMI,066900,North America
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000, Nigeria,814600,Chem Sensory,129475,MEA (Middle East and Africa)
789789,Archive 9 BASES II,f,HydroCare,066900,Belgium,211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
当前输出:
789789,Archive 9 BASES II,f,HydroCare,066900,Belgium,211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000, Nigeria,814600,Chem Sensory,129475,MEA (Middle East and Africa)
123123, Shelf Life Test,f,Other,066900,France,2810200,Chem CMI,066900,Europe%%% APAC%%% APAC%%% North America
预期输出:
789789,Archive 9 BASES II,f,HydroCare,066900,Belgium,211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000, Nigeria,814600,Chem Sensory,129475,MEA (Middle East and Africa)
123123, Shelf Life Test,f,Other,066900,Germany%%% Poland%%% Spain%%% France,2810200,Chem CMI,066900,Europe%%% APAC%%% North America
一种方式:
awk -F, '($1 in a){x=a[$1];len=split(x,arr,",");arr[6]=arr[6]"%%% "$6;y=arr[1];arr[10]=arr[10]"%%% "$10;for(i=2;i<=len;i++){y=y","arr[i];a[$1]=y;}next;}{a[$1]=$0;}END{for(i in a){print a[i];}}' file
试试这个 Perl 解决方案
$ cat ginzburg.txt
123123, Shelf Life Test,f,Other,066900,Germany,809900,Chem CMI,066900,Europe
123123, Shelf Life Test,f,Other,066900,Poland,810000,Chem CMI,066900,APAC
123123, Shelf Life Test,f,Other,066900,Spain,810100,Chem CMI,066900,APAC
123123, Shelf Life Test,f,Other,066900,France,2810200,Chem CMI,066900,North America
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000, Nigeria,814600,Chem Sensory,129475,MEA (Middle East and Africa)
789789,Archive 9 BASES II,f,HydroCare,066900,Belgium,211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
$ perl -F, -lanE ' $x=join(",",@F[0..4]);$kv{$x}=join(",",@F[6..8]);@t=@{$kv2{$x}};push(@t,$F[5]);$kv2{$x}=[@t]; @p=@{$kv3{$x}};push(@p,$F[-1]);$kv3{$x}=[@p]; END { for(keys %kv) { %tv=(); %tv=map{$_=>1} @{$kv3{$_}};print "$_ ",join("%%",@{$kv2{$_}})," ",$kv{$_},",",join("%%",keys %tv) } } ' ginzburg.txt
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000 Nigeria 814600,Chem Sensory,129475,MEA (Middle East and Africa)
123123, Shelf Life Test,f,Other,066900 Germany%%Poland%%Spain%%France 2810200,Chem CMI,066900,Europe%%North America%%APAC
789789,Archive 9 BASES II,f,HydroCare,066900 Belgium 211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
$
请尝试类似的操作:
awk -F, '{
    x=$1;
    if (!shown[x,$10]++)
        ctry[x]=ctry[x]$10"%%% ";
    if (!shown[x,$6]++)
        nation[x]=nation[x]$6"%%% ";
    a[x]=$1","$2","$3","$4","$5
    b[x]=$7","$8","$9
}
END{
    for(x in a){
        gsub (/%%% $/,"",nation[x]);
        gsub (/%%% $/,"",ctry[x]);
        print a[x]","nation[x]","b[x]","ctry[x]"\n";
    }
}' list.csv > final.csv
您会看到上面的脚本基于 OP 的脚本,并进行了一些重构和修改。重点是 if (!shown[x,$10]++) 这一行
以及它下面类似的那一行,它们避免了重复。
我有一个条目列表,每个条目都用换行符分隔,其中每个条目都是一组以逗号分隔的值。逗号分隔类似于每列的值。一些条目映射到相同的 id(每个条目中的第一个值)但具有不同的国家和地区。 (列表如下)
我正在尝试将共享相同 ID、但国家和地区可能不同的条目合并为单行条目:把关联的多个国家和/或地区追加到相应的列中,并用“%%%”序列彼此分隔。
到目前为止,我已经设法通过 awk 在某种程度上实现了这一点,但我的方法的副作用是我现在有重复的区域列。
因此,我不确定我是应该继续使用当前基于 awk 的方法还是分支到一个更具扩展性的解决方案来处理这项工作。
awk -F ',' -v OFS=',' '{x=$1;$1="";ctry[x]=ctry[x]$10"%%% ";$10="";a[x]=$0; str=a[x]; }END{for(x in a){outputString=("\n" x a[x] ctry[x]"\n"); gsub (/%%% \n/,"",outputString); print outputString}}' list.csv > final.csv
list.csv 的内容:
123123, Shelf Life Test,f,Other,066900,Germany,809900,Chem CMI,066900,Europe
123123, Shelf Life Test,f,Other,066900,Poland,810000,Chem CMI,066900,APAC
123123, Shelf Life Test,f,Other,066900,Spain,810100,Chem CMI,066900,APAC
123123, Shelf Life Test,f,Other,066900,France,2810200,Chem CMI,066900,North America
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000, Nigeria,814600,Chem Sensory,129475,MEA (Middle East and Africa)
789789,Archive 9 BASES II,f,HydroCare,066900,Belgium,211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
当前输出:
789789,Archive 9 BASES II,f,HydroCare,066900,Belgium,211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000, Nigeria,814600,Chem Sensory,129475,MEA (Middle East and Africa)
123123, Shelf Life Test,f,Other,066900,France,2810200,Chem CMI,066900,Europe%%% APAC%%% APAC%%% North America
预期输出:
789789,Archive 9 BASES II,f,HydroCare,066900,Belgium,211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000, Nigeria,814600,Chem Sensory,129475,MEA (Middle East and Africa)
123123, Shelf Life Test,f,Other,066900,Germany%%% Poland%%% Spain%%% France,2810200,Chem CMI,066900,Europe%%% APAC%%% North America
一种方式:
awk -F, '($1 in a){x=a[$1];len=split(x,arr,",");arr[6]=arr[6]"%%% "$6;y=arr[1];arr[10]=arr[10]"%%% "$10;for(i=2;i<=len;i++){y=y","arr[i];a[$1]=y;}next;}{a[$1]=$0;}END{for(i in a){print a[i];}}' file
试试这个 Perl 解决方案
$ cat ginzburg.txt
123123, Shelf Life Test,f,Other,066900,Germany,809900,Chem CMI,066900,Europe
123123, Shelf Life Test,f,Other,066900,Poland,810000,Chem CMI,066900,APAC
123123, Shelf Life Test,f,Other,066900,Spain,810100,Chem CMI,066900,APAC
123123, Shelf Life Test,f,Other,066900,France,2810200,Chem CMI,066900,North America
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000, Nigeria,814600,Chem Sensory,129475,MEA (Middle East and Africa)
789789,Archive 9 BASES II,f,HydroCare,066900,Belgium,211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
$ perl -F, -lanE ' $x=join(",",@F[0..4]);$kv{$x}=join(",",@F[6..8]);@t=@{$kv2{$x}};push(@t,$F[5]);$kv2{$x}=[@t]; @p=@{$kv3{$x}};push(@p,$F[-1]);$kv3{$x}=[@p]; END { for(keys %kv) { %tv=(); %tv=map{$_=>1} @{$kv3{$_}};print "$_ ",join("%%",@{$kv2{$_}})," ",$kv{$_},",",join("%%",keys %tv) } } ' ginzburg.txt
456456,Ammonium Citrus Esther,f,SupraTex Chem Analysis, 475000 Nigeria 814600,Chem Sensory,129475,MEA (Middle East and Africa)
123123, Shelf Life Test,f,Other,066900 Germany%%Poland%%Spain%%France 2810200,Chem CMI,066900,Europe%%North America%%APAC
789789,Archive 9 BASES II,f,HydroCare,066900 Belgium 211500,Chem CMI,066900,CIS (Commonwealth of Independent States)
$
请尝试类似的操作:
awk -F, '{
    x=$1;
    if (!shown[x,$10]++)
        ctry[x]=ctry[x]$10"%%% ";
    if (!shown[x,$6]++)
        nation[x]=nation[x]$6"%%% ";
    a[x]=$1","$2","$3","$4","$5
    b[x]=$7","$8","$9
}
END{
    for(x in a){
        gsub (/%%% $/,"",nation[x]);
        gsub (/%%% $/,"",ctry[x]);
        print a[x]","nation[x]","b[x]","ctry[x]"\n";
    }
}' list.csv > final.csv
您会看到上面的脚本基于 OP 的脚本,并进行了一些重构和修改。重点是 if (!shown[x,$10]++) 这一行
以及它下面类似的那一行,它们避免了重复。