删除 R 中的重复子字符串
Remove duplicated substrings in R
我在 R 中有一个数据框如下
bacteria sample
1 A HM_001
2 B HM_001_HM_001
3 C A2_HM_001
4 D A2_HM_001_HM_001
5 E HM_002
6 F HM_002_HM_002
7 G A2_HM_002
8 H A2_HM_002_HM_002
并希望删除 `sample` 列中的重复子字符串,以便最终输出如下:
bacteria sample
1 A HM_001
2 B HM_001
3 C A2_HM_001
4 D A2_HM_001
5 E HM_002
6 F HM_002
7 G A2_HM_002
8 H A2_HM_002
将正则表达式与 `gsub` 结合使用:
# Collapse an immediately repeated "<UPPERCASE>_<digits>" token (optionally
# joined by "_") into a single occurrence, e.g. "HM_001_HM_001" -> "HM_001".
# Backslashes must be doubled inside an R string literal: "\d" is not a valid
# string escape (it is a parse error) and "\1" would be the control character
# U+0001 rather than a back-reference to the first capture group.
df1$sample_new <- with(df1, gsub("([A-Z]+_\\d+)_?\\1+", "\\1", sample))
-输出
df1
# bacteria sample sample_new
#1 A HM_001 HM_001
#2 B HM_001_HM_001 HM_001
#3 C A2_HM_001 A2_HM_001
#4 D A2_HM_001_HM_001 A2_HM_001
#5 E HM_002 HM_002
#6 F HM_002_HM_002 HM_002
#7 G A2_HM_002 A2_HM_002
#8 H A2_HM_002_HM_002 A2_HM_002
数据
# Reproducible example data: eight bacteria labels paired with sample IDs,
# some of which contain an immediately repeated "HM_<digits>" token.
df1 <- structure(
  list(
    bacteria = c("A", "B", "C", "D", "E", "F", "G", "H"),
    sample = c(
      "HM_001", "HM_001_HM_001", "A2_HM_001", "A2_HM_001_HM_001",
      "HM_002", "HM_002_HM_002", "A2_HM_002", "A2_HM_002_HM_002"
    )
  ),
  class = "data.frame",
  # Row names are given explicitly as the character strings "1" to "8".
  row.names = c("1", "2", "3", "4", "5", "6", "7", "8")
)
我在 R 中有一个数据框如下
bacteria sample
1 A HM_001
2 B HM_001_HM_001
3 C A2_HM_001
4 D A2_HM_001_HM_001
5 E HM_002
6 F HM_002_HM_002
7 G A2_HM_002
8 H A2_HM_002_HM_002
并希望删除 `sample` 列中的重复子字符串,以便最终输出如下:
bacteria sample
1 A HM_001
2 B HM_001
3 C A2_HM_001
4 D A2_HM_001
5 E HM_002
6 F HM_002
7 G A2_HM_002
8 H A2_HM_002
将正则表达式与 `gsub` 结合使用:
# Collapse an immediately repeated "<UPPERCASE>_<digits>" token (optionally
# joined by "_") into a single occurrence, e.g. "HM_001_HM_001" -> "HM_001".
# Backslashes must be doubled inside an R string literal: "\d" is not a valid
# string escape (it is a parse error) and "\1" would be the control character
# U+0001 rather than a back-reference to the first capture group.
df1$sample_new <- with(df1, gsub("([A-Z]+_\\d+)_?\\1+", "\\1", sample))
-输出
df1
# bacteria sample sample_new
#1 A HM_001 HM_001
#2 B HM_001_HM_001 HM_001
#3 C A2_HM_001 A2_HM_001
#4 D A2_HM_001_HM_001 A2_HM_001
#5 E HM_002 HM_002
#6 F HM_002_HM_002 HM_002
#7 G A2_HM_002 A2_HM_002
#8 H A2_HM_002_HM_002 A2_HM_002
数据
# Reproducible example data: eight bacteria labels paired with sample IDs,
# some of which contain an immediately repeated "HM_<digits>" token.
df1 <- structure(
  list(
    bacteria = c("A", "B", "C", "D", "E", "F", "G", "H"),
    sample = c(
      "HM_001", "HM_001_HM_001", "A2_HM_001", "A2_HM_001_HM_001",
      "HM_002", "HM_002_HM_002", "A2_HM_002", "A2_HM_002_HM_002"
    )
  ),
  class = "data.frame",
  # Row names are given explicitly as the character strings "1" to "8".
  row.names = c("1", "2", "3", "4", "5", "6", "7", "8")
)