如何在 R 中使用 conditional/for 循环将单列数据转换为两列矩阵
How to convert single column data into two-column matrix using conditional/for loop in R
我有一个单列数据框 - 示例数据:
1 >PROKKA_00002 Alpha-ketoglutarate permease
2 MTESSITERGAPELADTRRRIWAIVGASSGNLVEWFDFYVYSFCSLYFAHIFFPSGNTTT
3 QLLQTAGVFAAGFLMRPIGGWLFGRIADRRGRKTSMLISVCMMCFGSLVIACLPGYAVIG
4 >PROKKA_00003 lipoprotein
5 MRTIIVIASLLLTGCSHMANDAWSGQDKAQHFLASAMLSAAGNEYAQHQGYSRDRSAAIG
每个字母序列都与其上方的“>”行相关联。我需要一个双列数据框,第一列中的行以“>”开头,第二列中的各个字母行连接为一个序列。这是我到目前为止尝试过的:
y <- matrix(0,5836,2) #empty matrix with 5836 rows and two columns
z <- 0
for(i in 1:nrow(df)){
if((grepl(pattern = "^>", x = df)) == TRUE){ #tried to set the conditional "if a line starts with ">", execute code"
z <- z + 1
y[z,1] <- paste(df[i])
} else{
y[z,2] <- paste(df[i], collapse = "")
}
}
我最终会使用 as.data.frame 将矩阵 y 转换回 data.frame,但我的循环不断出现错误:“}”中出现意外的“}”。我也不确定我的条件是否正确。谁能帮忙?将不胜感激!
尝试为目标符号和列 headers 的行创建索引。然后拆分该索引上的数据。调用 cumsum(ind1)[!ind1]
首先通过将逻辑向量强制转换为数字来创建一个 id 行,然后消除具有列 headers 的行。
ind1 <- grepl(">", mydf$x)
#split data on the index created
newdf <- data.frame(mydf$x[ind1][cumsum(ind1)], mydf$x)[!ind1,]
#Add names
names(newdf) <- c("Name", "Value")
newdf
# Name Value
# 2 >PROKKA_00002 Alpha-ketoglutarate
# 3 >PROKKA_00002 MTESSITERGAPEL
# 5 >PROKKA_00003 lipoprotein
# 6 >PROKKA_00003 MRTIIVIASLLLT
数据
mydf <- data.frame(x=c(">PROKKA_00002","Alpha-ketoglutarate","MTESSITERGAPEL", ">PROKKA_00003", "lipoprotein" ,"MRTIIVIASLLLT"))
如果您能够适当地为行分配节号,则可以使用 plyr 来完成此操作:
library(plyr)
df <- data.frame(v1=c(">PROKKA_00002 Alpha-ketoglutarate permease",
"MTESSITERGAPELADTRRRIWAIVGASSGNLVEWFDFYVYSFCSLYFAHIFFPSGNTTT",
"QLLQTAGVFAAGFLMRPIGGWLFGRIADRRGRKTSMLISVCMMCFGSLVIACLPGYAVIG",
">PROKKA_00003 lipoprotein",
"MRTIIVIASLLLTGCSHMANDAWSGQDKAQHFLASAMLSAAGNEYAQHQGYSRDRSAAIG"))
df$hasMark <- ifelse(grepl(">",df$v1,fixed=TRUE),1, 0)
df$section <- cumsum(df$hasMark)
t <- ddply(df, "section", function(x){
data.frame(v2=head(x,1),v3=paste(x$v1[2:nrow(x)], collapse=''))
})
t <- subset(t, select=-c(section,v2.hasMark,v2.section)) #drop the extra columns
如果您随后查看 't' 我相信这就是您在原始 post
中寻找的内容
尽管我会坚持使用软件包,但这里有一个解决方案
初始化数据
mydf <- data.frame(x=c(">PROKKA_00002 Alpha-ketoglutarate","MTESSITERGAPEL", "MTESSITERGAPEL",">PROKKA_00003 lipoprotein", "MTESSITERGAPEL" ,"MRTIIVIASLLLT"), stringsAsFactors = F)
进程
ind <- grep(">", mydf$x)
temp<-data.frame(ind=ind, from=ind+1, to=c((ind-1)[-1], nrow(mydf)))
seqs<-rep(NA, length(ind))
for(i in 1:length(ind)) {
seqs[i]<-paste(mydf$x[temp$from[i]:temp$to[i]], collapse="")
}
fastatable<-data.frame(name=gsub(">", "", mydf[ind,1]), sequence=seqs)
> fastatable
name sequence
1 PROKKA_00002 Alpha-ketoglutarate MTESSITERGAPELMTESSITERGAPEL
2 PROKKA_00003 lipoprotein MTESSITERGAPELMRTIIVIASLLLT
我有一个单列数据框 - 示例数据:
1 >PROKKA_00002 Alpha-ketoglutarate permease
2 MTESSITERGAPELADTRRRIWAIVGASSGNLVEWFDFYVYSFCSLYFAHIFFPSGNTTT
3 QLLQTAGVFAAGFLMRPIGGWLFGRIADRRGRKTSMLISVCMMCFGSLVIACLPGYAVIG
4 >PROKKA_00003 lipoprotein
5 MRTIIVIASLLLTGCSHMANDAWSGQDKAQHFLASAMLSAAGNEYAQHQGYSRDRSAAIG
每个字母序列都与其上方的“>”行相关联。我需要一个双列数据框,第一列中的行以“>”开头,第二列中的各个字母行连接为一个序列。这是我到目前为止尝试过的:
y <- matrix(0,5836,2) #empty matrix with 5836 rows and two columns
z <- 0
for(i in 1:nrow(df)){
if((grepl(pattern = "^>", x = df)) == TRUE){ #tried to set the conditional "if a line starts with ">", execute code"
z <- z + 1
y[z,1] <- paste(df[i])
} else{
y[z,2] <- paste(df[i], collapse = "")
}
}
我最终会使用 as.data.frame 将矩阵 y 转换回 data.frame,但我的循环不断出现错误:“}”中出现意外的“}”。我也不确定我的条件是否正确。谁能帮忙?将不胜感激!
尝试为目标符号和列 headers 的行创建索引。然后拆分该索引上的数据。调用 cumsum(ind1)[!ind1]
首先通过将逻辑向量强制转换为数字来创建一个 id 行,然后消除具有列 headers 的行。
ind1 <- grepl(">", mydf$x)
#split data on the index created
newdf <- data.frame(mydf$x[ind1][cumsum(ind1)], mydf$x)[!ind1,]
#Add names
names(newdf) <- c("Name", "Value")
newdf
# Name Value
# 2 >PROKKA_00002 Alpha-ketoglutarate
# 3 >PROKKA_00002 MTESSITERGAPEL
# 5 >PROKKA_00003 lipoprotein
# 6 >PROKKA_00003 MRTIIVIASLLLT
数据
mydf <- data.frame(x=c(">PROKKA_00002","Alpha-ketoglutarate","MTESSITERGAPEL", ">PROKKA_00003", "lipoprotein" ,"MRTIIVIASLLLT"))
如果您能够适当地为行分配节号,则可以使用 plyr 来完成此操作:
library(plyr)
df <- data.frame(v1=c(">PROKKA_00002 Alpha-ketoglutarate permease",
"MTESSITERGAPELADTRRRIWAIVGASSGNLVEWFDFYVYSFCSLYFAHIFFPSGNTTT",
"QLLQTAGVFAAGFLMRPIGGWLFGRIADRRGRKTSMLISVCMMCFGSLVIACLPGYAVIG",
">PROKKA_00003 lipoprotein",
"MRTIIVIASLLLTGCSHMANDAWSGQDKAQHFLASAMLSAAGNEYAQHQGYSRDRSAAIG"))
df$hasMark <- ifelse(grepl(">",df$v1,fixed=TRUE),1, 0)
df$section <- cumsum(df$hasMark)
t <- ddply(df, "section", function(x){
data.frame(v2=head(x,1),v3=paste(x$v1[2:nrow(x)], collapse=''))
})
t <- subset(t, select=-c(section,v2.hasMark,v2.section)) #drop the extra columns
如果您随后查看 't' 我相信这就是您在原始 post
中寻找的内容尽管我会坚持使用软件包,但这里有一个解决方案
初始化数据
mydf <- data.frame(x=c(">PROKKA_00002 Alpha-ketoglutarate","MTESSITERGAPEL", "MTESSITERGAPEL",">PROKKA_00003 lipoprotein", "MTESSITERGAPEL" ,"MRTIIVIASLLLT"), stringsAsFactors = F)
进程
ind <- grep(">", mydf$x)
temp<-data.frame(ind=ind, from=ind+1, to=c((ind-1)[-1], nrow(mydf)))
seqs<-rep(NA, length(ind))
for(i in 1:length(ind)) {
seqs[i]<-paste(mydf$x[temp$from[i]:temp$to[i]], collapse="")
}
fastatable<-data.frame(name=gsub(">", "", mydf[ind,1]), sequence=seqs)
> fastatable
name sequence
1 PROKKA_00002 Alpha-ketoglutarate MTESSITERGAPELMTESSITERGAPEL
2 PROKKA_00003 lipoprotein MTESSITERGAPELMRTIIVIASLLLT