如何在 R 中匹配、替换和求和来自另一个数据集的 header 行?

How to match, replace and sum header rows from another dataset in R?

我有两个data-sets:

a。一个看起来像这样的数据框:

        SpeciesA  SpeciesB  SpeciesC  SpeciesD  SpeciesE  SpeciesF
Site1     1          0        4        6          2        5
Site2     1          0        4        6          2        5
Site3     1          0        4        6          2        5
Site4     1          0        4        6          2        5

(注意:行值不相同,这里只是为了表示)

b。另一个 data-set 看起来像这样:

Family          Species
Family1         SpeciesA
Family1         SpeciesB
Family1         SpeciesC
Family2         SpeciesD
Family3         SpeciesE
Family4         SpeciesF

我想将data-set(2)中的Family列与data-frame(1)中的相应Species相匹配,并将同一个Family下的值(如果有多个物种)相加.我知道我可以使用 merge 函数,但我不知道如何使用它,或者如何在 header 行中调用它然后将其全部求和。

Pre-Final输出

         Family1    Family1   Family1  Family2  Family3  Family4
Site1     1          0        4        6          2        5 
Site2     1          0        4        6          2        5 
Site3     1          0        4        6          2        5 
Site4     1          0        4        6          2        5 

最终输出

         Family1      Family2    Family3   Family4
Site1     5             6          2        5           
Site2     5             6          2        5             
Site3     5             6          2        5             
Site4     5             6          2        5     

如果我理解正确,您可以先将第一个 data.frame 从 "wide"(宽)格式重塑为 "long"(长)格式,再与第二个 data.frame 进行 merge,最后使用适当的聚合函数将结果重新转换为宽格式:

# Reshape the site-by-species table from wide to long format. The site names
# live in the row names, so they are copied into an explicit `id` column;
# building a new data frame for this leaves `dfa` itself unmodified.
mdfa <- reshape2::melt(
  data.frame(id = row.names(dfa), dfa, stringsAsFactors = FALSE),
  id.vars = "id",
  variable.name = "Species"
)

# Attach the family of every species, then cast back to wide format with one
# column per family, summing the values of all species in the same family.
# `value.var` is named explicitly so dcast() does not have to guess it.
reshape2::dcast(
  merge(dfb, mdfa, by = "Species"),
  id ~ Family,
  fun.aggregate = sum,
  value.var = "value"
)
#      id Family1 Family2 Family3 Family4
# 1 Site1       5       6       2       5
# 2 Site2       5       6       2       5
# 3 Site3       5       6       2       5
# 4 Site4       5       6       2       5

数据:

# Site-by-species table; the site names are stored as row names.
dfa <- data.frame(
  SpeciesA = rep(1L, 4),
  SpeciesB = rep(0L, 4),
  SpeciesC = rep(4L, 4),
  SpeciesD = rep(6L, 4),
  SpeciesE = rep(2L, 4),
  SpeciesF = rep(5L, 4),
  row.names = paste0("Site", 1:4)
)

# Lookup table assigning each species to its family.
dfb <- data.frame(
  Family = paste0("Family", c(1, 1, 1, 2, 3, 4)),
  Species = paste0("Species", LETTERS[1:6]),
  stringsAsFactors = FALSE
)

你可以这样做:

# Add one column per family by summing its species by hand, then keep only
# the family columns. They are selected by name rather than by dropping the
# first six positions, so the result does not depend on how many columns
# `dfa` happens to have.
transform(
  dfa,
  Family1 = SpeciesA + SpeciesB + SpeciesC,
  Family2 = SpeciesD,
  Family3 = SpeciesE,
  Family4 = SpeciesF
)[paste0("Family", 1:4)]

结果如下:

# Example input: four sites (row names) by six species (columns), with the
# same integer values in every row.
dfa <- as.data.frame(
  matrix(
    rep(c(1L, 0L, 4L, 6L, 2L, 5L), times = 4L),
    nrow = 4L,
    byrow = TRUE,
    dimnames = list(
      paste0("Site", 1:4),
      paste0("Species", LETTERS[1:6])
    )
  )
)
# > transform(dfa, Family1=SpeciesA+SpeciesB+SpeciesC, Family2=SpeciesD, Family3=SpeciesE, Family4=SpeciesF)[-(1:6)]
#       Family1 Family2 Family3 Family4
# Site1       5       6       2       5
# Site2       5       6       2       5
# Site3       5       6       2       5
# Site4       5       6       2       5

或者你可以做一个矩阵乘法:

# Multiply the site-by-species matrix by a species-by-family indicator matrix
# (a 1 marks that the species in that row belongs to the family in that
# column). Each entry of the product is the sum over one family's species.
# The dimnames label the result columns with the family names.
as.matrix(dfa) %*% matrix(
  c(1, 1, 1, 0, 0, 0,
    0, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 1, 0,
    0, 0, 0, 0, 0, 1),
  nrow = 6,
  dimnames = list(names(dfa), paste0("Family", 1:4))
)
#       Family1 Family2 Family3 Family4
# Site1       5       6       2       5
# Site2       5       6       2       5
# Site3       5       6       2       5
# Site4       5       6       2       5

# Family/species lookup table used to build the indicator matrix.
dat2 <- data.frame(
  Family = rep(paste0("Family", 1:4), times = c(3, 1, 1, 1)),
  Species = paste0("Species", LETTERS[1:6])
)

您可以将矩阵乘法的代码缩短为

# table(dat2) cross-tabulates Family (rows) against Species (columns) and
# orders the species alphabetically. Indexing its columns by names(dfa)
# lines the species up with the columns of dfa, so the product stays correct
# even when dfa's columns are not in alphabetical order.
as.matrix(dfa) %*% t(table(dat2)[, names(dfa), drop = FALSE]) # or
tcrossprod(as.matrix(dfa), as.matrix(table(dat2)[, names(dfa), drop = FALSE]))

(感谢 @alexis_laz 的评论)
另一个 base R 的解决方案:

# Family1 is the row-wise sum of the first three species columns; the
# remaining columns each form a family of their own and are carried over.
d <- setNames(
  data.frame(rowSums(dfa[1:3]), dfa[-(1:3)]),
  paste0("Family", 1:4)
)
d

我的两个答案(使用 data.table 或 dplyr):

# Site-by-species table with the site names kept in a regular `sites` column.
data <- data.frame(
  sites = paste0("Site", 1:4),
  SpeciesA = rep(1L, 4),
  SpeciesB = rep(0L, 4),
  SpeciesC = rep(4L, 4),
  SpeciesD = rep(6L, 4),
  SpeciesE = rep(2L, 4),
  SpeciesF = rep(5L, 4),
  stringsAsFactors = FALSE
)

# Lookup table assigning each species to its family.
famdf <- data.frame(
  Family = paste0("Family", c(1, 1, 1, 2, 3, 4)),
  Species = paste0("Species", LETTERS[1:6]),
  stringsAsFactors = FALSE
)

#My answer 1 with data.table:
# data.table's melt()/dcast() methods operate on data.tables, so the plain
# data frames are converted explicitly. Every call is namespace-qualified
# because data.table has not been attached at this point.
melted <- data.table::melt(
  data.table::as.data.table(data),
  id.vars = "sites",
  variable.name = "Species"
)

# Join each species to its family, then spread to one column per family,
# summing the values of all species that share a family.
data.table::dcast(
  merge(melted, data.table::as.data.table(famdf), by = "Species"),
  sites ~ Family,
  fun.aggregate = sum,
  value.var = "value"
)
#end
#My answer 2 with dplyr or data.table:
# Transpose a data frame whose first column holds labels.
#
# The values of the first column become the column names of the result, the
# remaining columns become its rows, and the former names of those columns
# are stored as a factor in an extra `id` column.
#
# df:      a data frame; column 1 supplies the new column names.
# returns: a data frame with one row per original non-label column.
transpose <- function(df) {
  labels <- df[[1]]
  # drop = FALSE keeps a data frame even when a single column remains, so
  # t() still produces one row named after that column.
  flipped <- as.data.frame(t(df[, -1, drop = FALSE]))
  colnames(flipped) <- labels
  flipped$id <- factor(row.names(flipped))
  flipped
}
# Flip the table so that every row is a species and every column a site, then
# tag each species row with its family from the lookup table `famdf`.
data <- transpose(data)
data$fam <- famdf$Family[match(data$id, famdf$Species)]
data <- subset(data, select = -id)

#Sum option 1 data.table
library(data.table)
# as.data.table() works on a copy, so `data` stays a plain data frame for
# option 2. Sum every site column within each family, then flip back to
# sites in rows and families in columns.
transpose(as.data.frame(
  as.data.table(data)[, lapply(.SD, sum), by = .(fam)]
))
#Sum option 2 dplyr
library(dplyr)
# across(everything(), sum) sums all non-grouping columns per family; it
# replaces the deprecated summarise_each(funs(sum)).
result <- data %>%
  group_by(fam) %>%
  summarise(across(everything(), sum)) %>%
  as.data.frame()
transpose(result)

这是另一个使用查找表(命名向量)和 rowSums 的 base R 解决方案:

# Lookup table: the species names, each named by the family it belongs to
lookup <- setNames(dfb$Species, dfb$Family)
# Family of each column of dfa, found with match (NA if a species is not in dfb)
colPos <- names(lookup)[match(names(dfa), lookup)]

# For every family, sum across the columns assigned to it and return a
# data.frame with one named column per family. %in% never yields NA, so a
# column without a family is left out instead of breaking the column index.
setNames(data.frame(lapply(unique(names(lookup)),
                           function(i) rowSums(dfa[colPos %in% i]))),
         unique(names(lookup)))

返回结果如下:

      Family1 Family2 Family3 Family4
Site1       5       6       2       5
Site2       5       6       2       5
Site3       5       6       2       5
Site4       5       6       2       5

第二行中的 match 用于查找对应的列位置。在第三行中,lapply 遍历唯一的科(Family)名称,并对与这些名称对应的列应用 rowSums。这会返回一个列表,该列表随后被转换成 data.frame,并用 setNames 命名。