在 R 中的 for 循环中嵌入脚本

Question

我在 R 中有一个数据框，看起来像这样：

 library(tibble)
 sample <- tribble(~subj, ~session,
            "A", 1,
            "A", 2, 
            "A", 3,
            "B", 1,
            "B", 2,
            "C", 1,
            "C", 2,
            "C", 3,
            "C", 4)

从这个例子可以看出，每个科目都有多个会话，但科目的会话数并不相同。我的真实数据集中有 94 行（5 个主题，每个主题有 15 到 20 个不同的会话）。

我有另一个脚本获取我的主要数据集（一组语言数据，每个会话中的每个主题都有详细的语音特征，有近 200,000 行）并按主题和会话过滤以创建显示欧几里得距离的距离矩阵不同的词之间。出于实际原因，我不能在这里复制它，但在这里创建了一个示例脚本：

 library(tibble)
 data <- tribble(~subj, ~session, ~Target, ~S1C1_target,           # S1C1 = syllable 1, consonant 1
                 ~S1C1_T.Sonorant, ~S1C1_T.Consonantal,            # _T. = target consonant of S1C1
                 ~S1C1_T.Voice, ~S1C1_T.Nasal, ~S1C1_T.Degree,     # .Voice/.Nasal/etc are phonetic 
                                                                   # properties of the target word
            "A", 1, "electricity", "i", 0, 0, 0, 0, 0,
            "A", 1, "hectic", "h", 0.8, 0, 1, 0, 0,
            "A", 1, "pillow", "p", -1, 1, -1, 0, 0,
            "A", 2, "hello", "h", -0.5, 1, 0, -1, 0,
            "A", 2, "cup", "k", 0.8, 0, 1, 0, 0,
            "A", 2, "exam", "e", 0, 0, 0, 0, 0,
            "B", 1, "wug", "w", 0.8, 0, 1, 0, 0,
            "B", 1, "wug", "w", 0.8, 0, 1, 0, 0,
            "B", 1, "hug", "h", 0.8, 0, 1, 0, 0,
            "B", 2, "wug", "w", -0.5, 1, 0, -1, 0,
            "B", 2, "well", "w", 0.8, 0, 1, 0, 0,
            "B", 2, "what", "w", 0.8, 0, 1, 0, 0)

我想首先为每个会话中的每个主题创建一个数据子集。有时一个参与者在 Target 中有多个相同单词的标记，所以我在这里也为重复迭代创建一个平均值：

 matrixA1 <- data %>%                          # name the data after the subj and session name/number
   filter(subj == "A" & session == 1) %>%  
   dplyr::select(-subj, -session) %>%          # leave only the numeric values + `Target`
   group_by(Target) %>% 
   summarize_all(.funs = list(mean))           # Average across targets with more than one token



 ##### Calculate Euclidean distance between each phonetic property of each S1C1 target consonant

 ones <- rep(1,nrow(matrixA1))  # count repeated rows

 Son.mat.S1C1_T <- matrixA1$S1C1_T.Sonorant %*% t(ones) - ones %*% t(matrixA1$S1C1_T.Sonorant)
 rownames(Son.mat.S1C1_T) <- matrixA1$Target
 colnames(Son.mat.S1C1_T) <- matrixA1$Target
 colnames(Son.mat.S1C1_T) <- paste(colnames(Son.mat.S1C1_T), "Son.S1C1_T", sep = "_")

 Son.mat.S1C1_T <- Son.mat.S1C1_T^2

 Con.mat.S1C1_T <- matrixA1$S1C1_T.Consonantal %*% t(ones) - ones %*% t(matrixA1$S1C1_T.Consonantal)
 rownames(Con.mat.S1C1_T) <- matrixA1$Target
 colnames(Con.mat.S1C1_T) <- matrixA1$Target
 colnames(Con.mat.S1C1_T) <- paste(colnames(Con.mat.S1C1_T), "Con.S1C1_T", sep = "_")

 Con.mat.S1C1_T <- Con.mat.S1C1_T^2

 Voice.mat.S1C1_T <- matrixA1$S1C1_T.Voice %*% t(ones) - ones %*% t(matrixA1$S1C1_T.Voice)
 rownames(Voice.mat.S1C1_T) <- matrixA1$Target
 colnames(Voice.mat.S1C1_T) <- matrixA1$Target
 colnames(Voice.mat.S1C1_T) <- paste(colnames(Voice.mat.S1C1_T), "Voice.S1C1_T", sep = "_")

 Voice.mat.S1C1_T <- Voice.mat.S1C1_T^2

 Nasal.mat.S1C1_T <- matrixA1$S1C1_T.Nasal %*% t(ones) - ones %*% t(matrixA1$S1C1_T.Nasal)
 rownames(Nasal.mat.S1C1_T) <- matrixA1$Target
 colnames(Nasal.mat.S1C1_T) <- matrixA1$Target
 colnames(Nasal.mat.S1C1_T) <- paste(colnames(Nasal.mat.S1C1_T), "Nasal.S1C1_T", sep = "_")

 S1C1.1A <- Son.mat.S1C1_T +
   Con.mat.S1C1_T +
   Voice.mat.S1C1_T +
   Nasal.mat.S1C1_T

 colnames(S1C1.1A) = gsub("_Son.S1C1_T", "", colnames(S1C1.1A))

这将创建一个如下所示的矩阵：

             electricity hectic pillow
 electricity    0.00      1.64   3.00
 hectic         1.64      0.00   8.24
 pillow         3.00      8.24   0.00

如你所见，这段代码已经很长了，真正的代码要长很多。我知道某种循环将是处理它的最佳方法，但我不知道如何运行它。我希望它做的是：

对于 sample 中的每一行，创建一个数据框，在名称

subj

session

对于这些数据帧中的每一个，运行上面的脚本，来自 #####，为每个主题和每个会话创建一个矩阵，如上所示。

要做到这一点，我认为最好的方法是将脚本嵌入到一个for循环中，并指定sample.[=20=中的每一行应该是运行 ]

Answer 1

为了回答您的第一个问题（在 for 循环中嵌入脚本），我建议使用 source() 命令。然后你只需要 assign() 命令。

感觉你的工作流程是这样的：

你的前两个代码块。
第三个代码块（matrixA1 <- 及以下）中的所有内容。
for(i in 1:nrow(sample)){ source(your_script.R) assign(x = paste0("df_", sample$subj[i], sample$session[i]), value = S1C1.1A) }

您需要更改过滤条件。 filter(subj == sample$subj[i] & session == sample$session[i]) 应该可以。

你真的不需要一个单独的脚本，有一个长循环就可以了。如果您也愿意，也可以将其设为函数。但是循环很棒，source() 也是一个很棒的命令！这里的关键是assign()命令。

Answer 2

在我看来你不需要参考你的 sample 数据框，因为关于 subj 和 session 组合的信息都在你的 data.如果不是这样，请告诉我。否则，这是我的方法。

首先，不是手动筛选 subj 和 session 的每个组合的数据，而是 summarize 您的数据，根据主题对数据进行分组后，一次性完成-会话组合。在此之前，给每个组合一个 id 和 group_indices:

data_summ <- data %>%                          
  mutate(id = group_indices(., subj, session)) %>%
  group_by(subj, session, Target) %>% 
  summarize_all(.funs = list(mean))

现在，您可以使用列表方法来提高透明度。首先将汇总数据拆分为数据框列表，每个主题会话一个 id:

data_list <- data_summ %>%
  split(., f = .$id)

现在您可以通过 data_list[[1]] 获取第一个数据帧，通过 data_list[[2]] 获取第二个数据帧，依此类推。这允许您遍历列表并为每个列表元素计算矩阵。我已经简化了您的一些代码 - 例如，您不需要重新命名四个矩阵中的每一个（基于 S1C1_T.Consonantal、S1C1_T.Consonantal...）。我建议您将所有结果存储在一个名为 mat_list 的单独列表中。

mat_list = list()

for (i in 1:length(data_list)) {

  element <- data_list[[i]]

  ones <- rep(1, nrow(element))

  sonorant_vec <- element$S1C1_T.Sonorant
  sonorant_mat <- (sonorant_vec %*% t(ones) - ones %*% t(sonorant_vec))^2

  consonantal_vec <- element$S1C1_T.Consonantal
  consonantal_mat <- (consonantal_vec %*% t(ones) - ones %*% t(consonantal_vec))^2

  voice_vec <- element$S1C1_T.Voice
  voice_mat <- (voice_vec %*% t(ones) - ones %*% t(voice_vec))^2

  nasal_vec <- element$S1C1_T.Nasal
  nasal_mat <- (nasal_vec %*% t(ones) - ones %*% t(nasal_vec))^2

  all_mat <- sonorant_mat + consonantal_mat + voice_mat + nasal_mat
  rownames(all_mat) <- element$Target
  colnames(all_mat) <- element$Target

  mat_list[[i]] <- all_mat
}

瞧瞧：

[[1]]
            electricity hectic pillow
electricity        0.00   1.64   3.00
hectic             1.64   0.00   8.24
pillow             3.00   8.24   0.00

[[2]]
       cup exam hello
cup   0.00 1.64  4.69
exam  1.64 0.00  2.25
hello 4.69 2.25  0.00

[[3]]
    hug wug
hug   0   0
wug   0   0

[[4]]
     well what  wug
well 0.00 0.00 4.69
what 0.00 0.00 4.69
wug  4.69 4.69 0.00

编辑：如果你想避免 for 循环，你可以将循环内的块放入一个函数中，然后 lapply 到 data_list:

lapply(data_list, FUN = function(element) {

  ones <- rep(1, nrow(element))

  sonorant_vec <- element$S1C1_T.Sonorant
  sonorant_mat <- (sonorant_vec %*% t(ones) - ones %*% t(sonorant_vec))^2

  consonantal_vec <- element$S1C1_T.Consonantal
  consonantal_mat <- (consonantal_vec %*% t(ones) - ones %*% t(consonantal_vec))^2

  voice_vec <- element$S1C1_T.Voice
  voice_mat <- (voice_vec %*% t(ones) - ones %*% t(voice_vec))^2

  nasal_vec <- element$S1C1_T.Nasal
  nasal_mat <- (nasal_vec %*% t(ones) - ones %*% t(nasal_vec))^2

  all_mat <- sonorant_mat + consonantal_mat + voice_mat + nasal_mat
  rownames(all_mat) <- element$Target
  colnames(all_mat) <- element$Target

  return(all_mat)
})

编辑 2

要根据主题-会话组合名称命名列表元素，您可以这样做：

data_summ <- data %>%                          
  group_by(subj, session, Target) %>% 
  summarize_all(.funs = list(mean)) %>%
  mutate(subj_session = paste(subj, session))

然后根据这个新的subj_session标识符拆分数据：

data_list <- data_summ %>%
  split(., f = .$subj_session)

Answer 3

这是一种使用 base R 的方法。基本上，您在按 subj 和 session 拆分时对每一列执行相同的操作。

agg_data <-  aggregate(x = data[grep('Sonorant|Consonantal|Voice|Nasal', names(data))],
                       by =  data[c('subj', 'session', 'Target')],
                       FUN = mean)

by(data = agg_data[-which(names(agg_data) %in% c('subj', 'session'))],
   INDICES = agg_data[c('subj', 'session')],
   FUN = function (DF) {
     ones = rep(1, nrow(DF))
     mat = Reduce('+',
            lapply(DF[grep('Sonorant|Consonantal|Voice|Nasal', names(DF))],
                   function (x) (x %*% t(ones) - ones %*% t(x))^2)
            )
     colnames(mat) <- rownames(mat) <- DF[['Target']]
     mat
   }
)

结果 - by 对象：

subj: A
session: 1
            electricity hectic pillow
electricity        0.00   1.64   3.00
hectic             1.64   0.00   8.24
pillow             3.00   8.24   0.00
--------------------------------------------------------------------------------------------------------------------------- 
subj: B
session: 1
    hug wug
hug   0   0
wug   0   0
--------------------------------------------------------------------------------------------------------------------------- 
subj: A
session: 2
       cup exam hello
cup   0.00 1.64  4.69
exam  1.64 0.00  2.25
hello 4.69 2.25  0.00
--------------------------------------------------------------------------------------------------------------------------- 
subj: B
session: 2
     well what  wug
well 0.00 0.00 4.69
what 0.00 0.00 4.69
wug  4.69 4.69 0.00

最后，这是一个使用data.table的方法。由于您正在创建距离矩阵，因此此版本使用 dist(...) 并且我们将距离矩阵包装到一个列表中：

library(data.table)
dt = as.data.table(data)

done_dt = dt[, {tmp = .SD[, lapply(.SD, mean),
                by = Target,
                .SDcols = patterns('Sonorant|Consonantal|Voice|Nasal')]

      list(euc_dist = list(Reduce('+',
                                  lapply(tmp[, -1L, with = FALSE],
                                         function(x) dist(setNames(x, tmp[[1L]]))^2))))
      }
   , by = .(subj, session)]

输出：

done_dt
     subj session       euc_dist
   <char>   <num>         <list>
1:      A       1 1.64,3.00,8.24
2:      A       2 4.69,2.25,1.64
3:      B       1              0
4:      B       2 4.69,4.69,0.00


done_dt[, euc_dist]
[[1]]
       electricity hectic
hectic        1.64       
pillow        3.00   8.24

[[2]]
     hello  cup
cup   4.69     
exam  2.25 1.64

[[3]]
    wug
hug   0

[[4]]
      wug well
well 4.69     
what 4.69 0.00

Answer 4

这是一个使用 dplyr 和 for 循环的解决方案：

# Step1: summarization of data at Subject, session & Target level
masterDt <- data %>%                          
  group_by(subj, session, Target) %>% 
  summarize_all(.funs = list(mean)) %>%
  mutate(subj_session = paste(subj, session)) %>% 
  ungroup() 

#List of variables to be used in 
varList<- c("S1C1_T.Sonorant","S1C1_T.Consonantal","S1C1_T.Voice")

# Step2: Function to calculate distance

EquiDist = function (ds,varList,rowNameVar) {
  # ds: dataframe
  # varList: list of variables which 
  # rowNameVar : row names on which for the matrix

  ones = rep(1, nrow(ds))
  mat = Reduce('+',
               lapply(ds %>% dplyr::select(varList),
                      function (x) (x %*% t(ones) - ones %*% t(x))^2)
  )
  colnames(mat) <-ds[[rowNameVar]]
  rownames(mat) <- ds[[rowNameVar]]
  mat
}

#calculating distnace for all at one go
overallMat<- EquiDist(masterDt,varList = varList,rowNameVar = "Target")

# Step3: creating an Identifier for unique subject, session & Target level,

NamesGrp<- masterDt %>% 
  dplyr::select("subj_session","Target") %>% 
  dplyr::distinct() %>%
  dplyr::group_by(subj_session) %>%
  mutate(Identifier=paste0(Target, collapse = ",")) %>%
  dplyr::select(-Target) %>%
  dplyr::distinct()  %>% 
  dplyr::ungroup()

# matrix for each subject and each session

l=list()
temp<- matrix()
for (i in 1:nrow(NamesGrp)){
  List_Names=NamesGrp[["subj_session"]][i]
  listIdentifier=c(unlist(strsplit(NamesGrp[["Identifier"]][i],",")))
  temp= overallMat[listIdentifier,listIdentifier]
  l[[List_Names]]<-temp

}

#output can be accessed by names/ index of list l 
l$`A 1`
            electricity hectic pillow
electricity        0.00   1.64   3.00
hectic             1.64   0.00   8.24
pillow             3.00   8.24   0.00

在 R 中的 for 循环中嵌入脚本

Embedding a script within a for-loop in R

for-loop

r

tibble