计算tsv文件不同section中分隔符分隔的字符串出现的频率
Calculate the frequency of strings separated by delimiter in different section of tsv file
我有一个数据框 mydf
,其中左右基因由“:”分隔。我需要计算这些基因在每个文件的 LeftGene
和 RightGene
列中出现的次数,并得到类似的结果。在 R 中最好的方法是什么?
sample LeftGene RightGene
file1
ATT:TAA
ATT:ATT ATT
file2
TTP:TTG TTP:TTP
结果
file1
LeftGene RightGene
ATT=3 ATT=1
TAA=1
file2
LeftGene RightGene
TTP=1 TTP=2
TTG=1
致 akrun:
下面是实际数据的 dput 输出,其中包含 files_name 列。需要统计每个文件中 Left.Gene.Symbols 和 Right.Gene.Symbols 两列里各基因出现的频率。我也希望看到这些基因在所有文件中的累计频率。感谢您的帮助。
## Question data: a 12 x 3 character matrix, written out one column at a time.
## files_name holds real NA values; the two gene columns hold the literal
## string "NA" in their empty cells.
mydf <- cbind(
  files_name = c(
    "AMLM12001KP", NA, "1114002", NA, NA, NA, NA, NA, "1121501", NA, NA, NA
  ),
  Left.Gene.Symbols = c(
    "NA", "NA", "NA", "NA", "CR1L", "GIGYF2:GIGYF2:GIGYF2:ENPP3",
    "NA", "NA", "NA", "NA", "NTNG1:NTNG1:ENPP3", "NA"
  ),
  Right.Gene.Symbols = c(
    "NA", "NA", "NA", "NA", "CDC27:CDC27", "NA",
    "ENPP3", "NA", "NA", "NA", "NA", "NA"
  )
)
预期输出:
AMLM12001KP
Left.Gene.Symbols Right.Gene.Symbols
1114002
Left.Gene.Symbols Right.Gene.Symbols
CR1L=1 CDC27=2
GIGYF2=3 ENPP3=1
ENPP3=1
1121501
Left.Gene.Symbols Right.Gene.Symbols
NTNG1=2
ENPP3=1
All files
Left.Gene.Symbols Right.Gene.Symbols
CR1L=1 CDC27=2
GIGYF2=3 ENPP3=1
NTNG1=2
ENPP3=2
我们使用 splitstackshape 包中的 cSplit,按分隔符 “:” 拆分 'df' 的第 2 列和第 3 列,并转换为 'long' 格式,输出是一个 data.table。接着用 melt 以 'sample' 作为 id 变量再次重塑数据,同时删除 NA 值。然后按 'sample'、'variable' 和 'value' 分组得到行数(.N),用 paste 将 'value' 与 'N' 拼接成新变量('valueN'),并在每个 'sample'、'variable' 组内创建一个序号变量('ind')。最后用 dcast 把数据从 'long' 格式转换为 'wide' 格式。
library(splitstackshape)
library(data.table)
## Split columns 2:3 of `df` on ":" into long format, stack the two gene
## columns into variable/value pairs (dropping NA values), and count the rows
## for every sample / column / gene combination.
## `id.vars` is spelled out in full instead of relying on partial argument
## matching of `id.var`.
dM <- melt(
  cSplit(df, 2:3, ":", "long"),
  id.vars = "sample", na.rm = TRUE
)[, .N, by = .(sample, variable, value)]
## label every gene as "gene=count"
dM[, valueN := paste(value, N, sep = "=")]
## running index inside each sample/column group; it becomes part of the row
## key in the wide table below
dM[, ind := seq_len(.N), by = .(sample, variable)]
## back to wide format: one column of "gene=count" labels per gene column
dcast(dM, ind + sample ~ variable, value.var = "valueN")
# ind sample LeftGene RightGene
#1: 1 file1 ATT=2 ATT=1
#2: 1 file2 TTP=1 TTP=2
#3: 2 file2 TTG=1 NA
数据
## Example input for the answer: one row per file, genes joined by ":".
df <- data.frame(
  sample = c("file1", "file2"),
  LeftGene = c("ATT:ATT", "TTP:TTG"),
  RightGene = c("ATT", "TTP:TTP"),
  stringsAsFactors = FALSE
)
编辑
## Raw input: a 12 x 3 character matrix. files_name is filled only on some
## rows (real NA elsewhere), and empty gene cells hold the literal string "NA".
dd2 <- cbind(
  files_name = c(
    "AMLM12001KP", NA, "1114002", NA, NA, NA, NA, NA, "1121501", NA, NA, NA
  ),
  Left.Gene.Symbols = c(
    "NA", "NA", "NA", "NA", "CR1L", "GIGYF2:GIGYF2:GIGYF2:ENPP3",
    "NA", "NA", "NA", "NA", "NTNG1:NTNG1:ENPP3", "NA"
  ),
  Right.Gene.Symbols = c(
    "NA", "NA", "NA", "NA", "CDC27:CDC27", "NA",
    "ENPP3", "NA", "NA", "NA", "NA", "NA"
  )
)

## Replace every NA in `x` with the closest non-NA value before it.
## Indexing the non-NA values themselves (not unique() of them) keeps this
## correct when a value occurs more than once. The NA placed in slot 1 is what
## leading NA entries pick up, so the result always has the length of `x`.
carry_forward <- function(x) {
  seen <- !is.na(x)
  # x[NA_integer_] is a single NA of the same type as x
  c(x[NA_integer_], x[seen])[cumsum(seen) + 1L]
}

## change character NAs to <NA> and carry-forward the file column
## (%in% never yields NA, so the mask is a plain TRUE/FALSE vector)
dd2[dd2 %in% "NA"] <- NA
dd2[, 1] <- carry_forward(dd2[, 1])

## split based on file name
sp <- split(data.frame(dd2, stringsAsFactors = FALSE), dd2[, 1])

## split each string by `:` and make a table
## The column label is a factor with both gene columns as levels, so every
## table has both columns even when one of them has no genes for that file.
(l <- lapply(sp, function(x) {
  x <- x[, -1, drop = FALSE]
  f <- function(x) na.omit(unlist(strsplit(x, ":", fixed = TRUE)))
  left <- f(x[, 1])
  right <- f(x[, 2])
  side <- rep(names(x), c(length(left), length(right)))
  table(c(left, right), factor(side, levels = names(x)))
}))
# $`1114002`
#
#          Left.Gene.Symbols Right.Gene.Symbols
#   CDC27                  0                  2
#   CR1L                   1                  0
#   ENPP3                  1                  1
#   GIGYF2                 3                  0
#
# $`1121501`
#
#         Left.Gene.Symbols Right.Gene.Symbols
#   ENPP3                 1                  0
#   NTNG1                 2                  0
#
# $AMLM12001KP
# < table of extent 0 x 2 >
由于列表中的每个元素都是一个 table,因此可以像处理普通 table 一样处理它们,例如转换为数据框:
## Pull the table for file 1114002 out of `l` by its exact name and convert
## it to a data frame.
data.frame(l[["1114002"]])
# Var1 Var2 Freq
# 1 CDC27 Left.Gene.Symbols 0
# 2 CR1L Left.Gene.Symbols 1
# 3 ENPP3 Left.Gene.Symbols 1
# 4 GIGYF2 Left.Gene.Symbols 3
# 5 CDC27 Right.Gene.Symbols 2
# 6 CR1L Right.Gene.Symbols 0
# 7 ENPP3 Right.Gene.Symbols 1
# 8 GIGYF2 Right.Gene.Symbols 0
下面是另一种以列表形式给出结果的方法:
## Raw text in the layout shown in the question: a header line, then a file
## name line followed by a line with that file's two gene strings.
txt <- "
sample LeftGene RightGene
file1
ATT:ATT ATT
file2
TTP:TTG TTP:TTP
"
## Keep a handle on the text connection so it can be closed explicitly
## instead of being left open.
con <- textConnection(txt)
rl <- readLines(con)
close(con)

## The data line is the one right after each line containing "file".
dd <- setNames(
  read.table(text = rl[grep("file", rl) + 1], stringsAsFactors = FALSE),
  c("LeftGene", "RightGene")
)
rownames(dd) <- paste0("File", seq_len(nrow(dd)))

## One gene-by-column count table per row of `dd`.
## seq_len() rather than 1:nrow(dd), so an empty `dd` yields an empty list
## instead of iterating over c(1, 0).
setNames(lapply(seq_len(nrow(dd)), function(i) {
  pieces <- strsplit(unlist(dd[i, ]), ":")
  table(unlist(pieces), rep(names(pieces), lengths(pieces)))
}), rownames(dd))
# $File1
#
# LeftGene RightGene
# ATT 2 1
#
# $File2
#
# LeftGene RightGene
# TTG 1 0
# TTP 1 2
或
## For every row of `dd`, split each column's string on ":" and build one
## data frame of counts per column, giving a list of lists named by the row
## names of `dd`.
## seq_len() rather than 1:nrow(dd), so an empty `dd` yields an empty list
## instead of iterating over c(1, 0).
setNames(lapply(seq_len(nrow(dd)), function(i) {
  pieces <- strsplit(unlist(dd[i, ]), ":")
  # the argument must stay named `y`: table(y) uses it as the column name
  lapply(pieces, function(y) data.frame(table(y)))
}), rownames(dd))
# $File1
# $File1$LeftGene
# y Freq
# 1 ATT 2
#
# $File1$RightGene
# y Freq
# 1 ATT 1
#
#
# $File2
# $File2$LeftGene
# y Freq
# 1 TTG 1
# 2 TTP 1
#
# $File2$RightGene
# y Freq
# 1 TTP 2
我有一个数据框 mydf
,其中左右基因由“:”分隔。我需要计算这些基因在每个文件的 LeftGene
和 RightGene
列中出现的次数,并得到类似的结果。在 R 中最好的方法是什么?
sample LeftGene RightGene
file1
ATT:TAA
ATT:ATT ATT
file2
TTP:TTG TTP:TTP
结果
file1
LeftGene RightGene
ATT=3 ATT=1
TAA=1
file2
LeftGene RightGene
TTP=1 TTP=2
TTG=1
致 akrun:
下面是实际数据的 dput 输出,其中包含 files_name 列。需要统计每个文件中 Left.Gene.Symbols 和 Right.Gene.Symbols 两列里各基因出现的频率。我也希望看到这些基因在所有文件中的累计频率。感谢您的帮助。
## Question data: a 12 x 3 character matrix, written out one column at a time.
## files_name holds real NA values; the two gene columns hold the literal
## string "NA" in their empty cells.
mydf <- cbind(
  files_name = c(
    "AMLM12001KP", NA, "1114002", NA, NA, NA, NA, NA, "1121501", NA, NA, NA
  ),
  Left.Gene.Symbols = c(
    "NA", "NA", "NA", "NA", "CR1L", "GIGYF2:GIGYF2:GIGYF2:ENPP3",
    "NA", "NA", "NA", "NA", "NTNG1:NTNG1:ENPP3", "NA"
  ),
  Right.Gene.Symbols = c(
    "NA", "NA", "NA", "NA", "CDC27:CDC27", "NA",
    "ENPP3", "NA", "NA", "NA", "NA", "NA"
  )
)
预期输出:
AMLM12001KP
Left.Gene.Symbols Right.Gene.Symbols
1114002
Left.Gene.Symbols Right.Gene.Symbols
CR1L=1 CDC27=2
GIGYF2=3 ENPP3=1
ENPP3=1
1121501
Left.Gene.Symbols Right.Gene.Symbols
NTNG1=2
ENPP3=1
All files
Left.Gene.Symbols Right.Gene.Symbols
CR1L=1 CDC27=2
GIGYF2=3 ENPP3=1
NTNG1=2
ENPP3=2
我们使用 splitstackshape 包中的 cSplit,按分隔符 “:” 拆分 'df' 的第 2 列和第 3 列,并转换为 'long' 格式,输出是一个 data.table。接着用 melt 以 'sample' 作为 id 变量再次重塑数据,同时删除 NA 值。然后按 'sample'、'variable' 和 'value' 分组得到行数(.N),用 paste 将 'value' 与 'N' 拼接成新变量('valueN'),并在每个 'sample'、'variable' 组内创建一个序号变量('ind')。最后用 dcast 把数据从 'long' 格式转换为 'wide' 格式。
library(splitstackshape)
library(data.table)
## Split columns 2:3 of `df` on ":" into long format, stack the two gene
## columns into variable/value pairs (dropping NA values), and count the rows
## for every sample / column / gene combination.
## `id.vars` is spelled out in full instead of relying on partial argument
## matching of `id.var`.
dM <- melt(
  cSplit(df, 2:3, ":", "long"),
  id.vars = "sample", na.rm = TRUE
)[, .N, by = .(sample, variable, value)]
## label every gene as "gene=count"
dM[, valueN := paste(value, N, sep = "=")]
## running index inside each sample/column group; it becomes part of the row
## key in the wide table below
dM[, ind := seq_len(.N), by = .(sample, variable)]
## back to wide format: one column of "gene=count" labels per gene column
dcast(dM, ind + sample ~ variable, value.var = "valueN")
# ind sample LeftGene RightGene
#1: 1 file1 ATT=2 ATT=1
#2: 1 file2 TTP=1 TTP=2
#3: 2 file2 TTG=1 NA
数据
## Example input for the answer: one row per file, genes joined by ":".
df <- data.frame(
  sample = c("file1", "file2"),
  LeftGene = c("ATT:ATT", "TTP:TTG"),
  RightGene = c("ATT", "TTP:TTP"),
  stringsAsFactors = FALSE
)
编辑
## Raw input: a 12 x 3 character matrix. files_name is filled only on some
## rows (real NA elsewhere), and empty gene cells hold the literal string "NA".
dd2 <- cbind(
  files_name = c(
    "AMLM12001KP", NA, "1114002", NA, NA, NA, NA, NA, "1121501", NA, NA, NA
  ),
  Left.Gene.Symbols = c(
    "NA", "NA", "NA", "NA", "CR1L", "GIGYF2:GIGYF2:GIGYF2:ENPP3",
    "NA", "NA", "NA", "NA", "NTNG1:NTNG1:ENPP3", "NA"
  ),
  Right.Gene.Symbols = c(
    "NA", "NA", "NA", "NA", "CDC27:CDC27", "NA",
    "ENPP3", "NA", "NA", "NA", "NA", "NA"
  )
)

## Replace every NA in `x` with the closest non-NA value before it.
## Indexing the non-NA values themselves (not unique() of them) keeps this
## correct when a value occurs more than once. The NA placed in slot 1 is what
## leading NA entries pick up, so the result always has the length of `x`.
carry_forward <- function(x) {
  seen <- !is.na(x)
  # x[NA_integer_] is a single NA of the same type as x
  c(x[NA_integer_], x[seen])[cumsum(seen) + 1L]
}

## change character NAs to <NA> and carry-forward the file column
## (%in% never yields NA, so the mask is a plain TRUE/FALSE vector)
dd2[dd2 %in% "NA"] <- NA
dd2[, 1] <- carry_forward(dd2[, 1])

## split based on file name
sp <- split(data.frame(dd2, stringsAsFactors = FALSE), dd2[, 1])

## split each string by `:` and make a table
## The column label is a factor with both gene columns as levels, so every
## table has both columns even when one of them has no genes for that file.
(l <- lapply(sp, function(x) {
  x <- x[, -1, drop = FALSE]
  f <- function(x) na.omit(unlist(strsplit(x, ":", fixed = TRUE)))
  left <- f(x[, 1])
  right <- f(x[, 2])
  side <- rep(names(x), c(length(left), length(right)))
  table(c(left, right), factor(side, levels = names(x)))
}))
# $`1114002`
#
#          Left.Gene.Symbols Right.Gene.Symbols
#   CDC27                  0                  2
#   CR1L                   1                  0
#   ENPP3                  1                  1
#   GIGYF2                 3                  0
#
# $`1121501`
#
#         Left.Gene.Symbols Right.Gene.Symbols
#   ENPP3                 1                  0
#   NTNG1                 2                  0
#
# $AMLM12001KP
# < table of extent 0 x 2 >
由于列表中的每个元素都是一个 table,因此可以像处理普通 table 一样处理它们,例如转换为数据框:
## Pull the table for file 1114002 out of `l` by its exact name and convert
## it to a data frame.
data.frame(l[["1114002"]])
# Var1 Var2 Freq
# 1 CDC27 Left.Gene.Symbols 0
# 2 CR1L Left.Gene.Symbols 1
# 3 ENPP3 Left.Gene.Symbols 1
# 4 GIGYF2 Left.Gene.Symbols 3
# 5 CDC27 Right.Gene.Symbols 2
# 6 CR1L Right.Gene.Symbols 0
# 7 ENPP3 Right.Gene.Symbols 1
# 8 GIGYF2 Right.Gene.Symbols 0
下面是另一种以列表形式给出结果的方法:
## Raw text in the layout shown in the question: a header line, then a file
## name line followed by a line with that file's two gene strings.
txt <- "
sample LeftGene RightGene
file1
ATT:ATT ATT
file2
TTP:TTG TTP:TTP
"
## Keep a handle on the text connection so it can be closed explicitly
## instead of being left open.
con <- textConnection(txt)
rl <- readLines(con)
close(con)

## The data line is the one right after each line containing "file".
dd <- setNames(
  read.table(text = rl[grep("file", rl) + 1], stringsAsFactors = FALSE),
  c("LeftGene", "RightGene")
)
rownames(dd) <- paste0("File", seq_len(nrow(dd)))

## One gene-by-column count table per row of `dd`.
## seq_len() rather than 1:nrow(dd), so an empty `dd` yields an empty list
## instead of iterating over c(1, 0).
setNames(lapply(seq_len(nrow(dd)), function(i) {
  pieces <- strsplit(unlist(dd[i, ]), ":")
  table(unlist(pieces), rep(names(pieces), lengths(pieces)))
}), rownames(dd))
# $File1
#
# LeftGene RightGene
# ATT 2 1
#
# $File2
#
# LeftGene RightGene
# TTG 1 0
# TTP 1 2
或
## For every row of `dd`, split each column's string on ":" and build one
## data frame of counts per column, giving a list of lists named by the row
## names of `dd`.
## seq_len() rather than 1:nrow(dd), so an empty `dd` yields an empty list
## instead of iterating over c(1, 0).
setNames(lapply(seq_len(nrow(dd)), function(i) {
  pieces <- strsplit(unlist(dd[i, ]), ":")
  # the argument must stay named `y`: table(y) uses it as the column name
  lapply(pieces, function(y) data.frame(table(y)))
}), rownames(dd))
# $File1
# $File1$LeftGene
# y Freq
# 1 ATT 2
#
# $File1$RightGene
# y Freq
# 1 ATT 1
#
#
# $File2
# $File2$LeftGene
# y Freq
# 1 TTG 1
# 2 TTP 1
#
# $File2$RightGene
# y Freq
# 1 TTP 2