Time varying network in R

I have data on every interaction that could have, and did, take place at a university club's weekly social hour.

A sample of my data is below:

structure(list(from = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", 
"B", "C"), class = "factor"), to = structure(c(2L, 3L, 2L, 3L, 
2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A", 
"B", "C"), class = "factor"), timestalked = c(0L, 1L, 0L, 4L, 
1L, 2L, 0L, 1L, 0L, 2L, 1L, 0L, 1L, 2L, 1L, 0L, 0L, 0L), week = structure(c(1L, 
1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 
2L), .Label = c("1/1/2010", "1/15/2010", "1/8/2010"), class = "factor")), .Names = c("from", 
"to", "timestalked", "week"), class = "data.frame", row.names = c(NA, 
-18L))

I am trying to compute network statistics, such as the centrality of A, B, and C for each week, for the last two weeks, and year to date. The only way I have gotten this to work is by manually splitting the file into the time units I want to analyze, but there must be a less labor-intensive way.

When timestalked is 0, it should be treated as no edge.

The output would produce a .csv with the following:

actor  cent_week1  cent_week2  cent_week3  cent_last2weeks  cent_yeartodate
A
B
C

where cent_week1 is the 1/1/2010 centrality; cent_last2weeks considers only 1/8/2010 and 1/15/2010; and cent_yeartodate considers all of the data at once. This is being applied to a much larger dataset with millions of observations.

I can't comment, so I'm writing an "answer". If you want to do some math over timestalked and get the values by from (I didn't find any variable called actor in your example), here is a data.table approach that might help:

library(data.table)

dat <- as.data.table(dat) # or add 'data.table' to the class parameter
dat$week <- as.Date(dat$week, format = "%m/%d/%Y")
# mean timestalked per actor (from) and week of year
dat[, .(cent = mean(timestalked)), by = list(from, weeknum = week(week))]

This gives the following output:

dat[, .(cent = mean(timestalked)), by = list(from, weeknum = week(week))]

   from weeknum cent
1:    A       1  0.5
2:    A       2  2.0
3:    A       3  1.5
4:    B       1  0.5
5:    B       2  1.0
6:    B       3  0.5
7:    C       1  1.5
8:    C       2  0.5
9:    C       3  0.0

Assign this to new_dat. You can subset by week simply with new_dat[weeknum %in% 2:3], or any other variant you want, or sum over the whole year. You can also sort/order as needed.
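For example, the window subsets could look like this (a minimal sketch, assuming the summary above was assigned to new_dat):

new_dat <- dat[, .(cent = mean(timestalked)), by = list(from, weeknum = week(week))]

# last two weeks only (weeks 2 and 3 of the year)
new_dat[weeknum %in% 2:3]

# year to date: use all rows of the original data
dat[, .(cent_yeartodate = mean(timestalked)), by = from]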

Hope this helps!

How about this:

library(dplyr)
centralities <- tmp       %>%   # tmp holds the question's data frame
  group_by(week)          %>% 
  filter(timestalked > 0) %>%   # timestalked == 0 means no edge
  do(
    # cbind() turns the factors into their integer codes, so vertices are 1:3 (A, B, C)
    week_graph=igraph::graph_from_edgelist(as.matrix(cbind(.$from, .$to)))
  )                       %>% 
  do(
    ecs = igraph::eigen_centrality(.$week_graph)$vector
  )                       %>% 
  summarise(ecs_A = ecs[[1]], ecs_B = ecs[[2]], ecs_C = ecs[[3]])

If you have a lot of actors, you can use summarise_all. Turning it into long format is left as an exercise (one possible sketch below).
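For instance, a long-format version could be obtained with tidyr (a sketch, assuming centralities keeps one row per week):

library(tidyr)

centralities %>% 
  pivot_longer(starts_with("ecs_"),
               names_to = "actor", names_prefix = "ecs_",
               values_to = "centrality")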

This can be done by setting up your windows in another table, then performing grouped operations on each window:

Data preparation:

# Load Data
DT <- structure(list(from = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", 
"B", "C"), class = "factor"), to = structure(c(2L, 3L, 2L, 3L, 
2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A", 
"B", "C"), class = "factor"), timestalked = c(0L, 1L, 0L, 4L, 
1L, 2L, 0L, 1L, 0L, 2L, 1L, 0L, 1L, 2L, 1L, 0L, 0L, 0L), week = structure(c(1L, 
1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 
2L), .Label = c("1/1/2010", "1/15/2010", "1/8/2010"), class = "factor")), .Names = c("from", 
"to", "timestalked", "week"), class = "data.frame", row.names = c(NA, 
-18L))

# Code
library(igraph)
library(data.table)

setDT(DT)

# setup events
DT <- DT[timestalked > 0]
DT[, week := as.Date(week, format = "%m/%d/%Y")]

# setup windows, edit as needed
date_ranges <- data.table(label = c("cent_week_1","cent_week_2","cent_last2weeks","cent_yeartodate"),
                          week_from = as.Date(c("2010-01-01","2010-01-08","2010-01-08","2010-01-01")),
                          week_to = as.Date(c("2010-01-01","2010-01-08","2010-01-15","2010-01-15"))
)

# find all events within windows
DT[, JA := 1]
date_ranges[, JA := 1]
graph_base <- merge(DT, date_ranges, by = "JA", allow.cartesian = TRUE)[week >= week_from & week <= week_to]
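As an aside, the same windowing can be expressed as a non-equi join, which avoids the dummy JA column (a sketch with the same two tables, untested on the larger data):

graph_base <- date_ranges[DT,
                          .(label, from, to, timestalked),
                          on = .(week_from <= week, week_to >= week),
                          nomatch = 0L, allow.cartesian = TRUE]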

Now for the by-group code. The second line is a bit crude; ideas on how to avoid the repeated calls are welcome (one option is sketched after the code):

graph_base <- graph_base[, .(graphs = list(graph_from_data_frame(.SD))), by = label, .SDcols = c("from", "to", "timestalked")] # create graphs
graph_base <- graph_base[, .(vertex = names(eigen_centrality(graphs[[1]])$vector), ec = eigen_centrality(graphs[[1]])$vector), by = label] # calculate centrality
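One way to avoid calling eigen_centrality() twice is to compute it once inside a j block (a sketch of a drop-in replacement for the second line):

graph_base <- graph_base[, {
  ec <- eigen_centrality(graphs[[1]])$vector # computed once per window
  list(vertex = names(ec), ec = ec)
}, by = label]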

dcast for the final formatting:

dcast(graph_base, vertex ~ label, value.var = "ec")
   vertex cent_last2weeks cent_week_1 cent_week_2 cent_yeartodate
1:      A       1.0000000   0.7071068   0.8944272       0.9397362
2:      B       0.7052723   0.7071068   0.4472136       0.7134685
3:      C       0.9008487   1.0000000   1.0000000       1.0000000
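And to get the .csv the question asks for (a minimal sketch; pick any file name):

fwrite(dcast(graph_base, vertex ~ label, value.var = "ec"), "centralities.csv")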

This analysis follows a general split-apply-combine approach, where the data is split by week, graph functions are applied, and then the results are combined. There are several tools that can do this, but below uses base R and data.table.

Base R

First set up your data with a date class, so that the term "last two weeks" is meaningful.

# Set date class and order
d$week <- as.Date(d$week, format="%m/%d/%Y")
d <- d[order(d$week), ]
d <- d[d$timestalked > 0, ] # remove edges // don't need to do this if using weights

Then split the data and apply the graph functions:

library(igraph)

# split data and form graph for each week
g1 <- lapply(split(seq(nrow(d)), d$week), function(i) 
                                                  graph_from_data_frame(d[i,]))
# you can then run graph functions to extract specific measures
(grps <- sapply(g1, function(x) eigen_centrality(x,
                                            weights = E(x)$timestalked)$vector))

#   2010-01-01 2010-01-08 2010-01-15
# A  0.5547002  0.9284767  1.0000000
# B  0.8320503  0.3713907  0.7071068
# C  1.0000000  1.0000000  0.7071068

# Aside: If you only have one function to run on the graphs, 
# you could do this in one step
# 
# sapply(split(seq(nrow(d)), d$week), function(i) {
#             x = graph_from_data_frame(d[i,])
#             eigen_centrality(x, weights = E(x)$timestalked)$vector
#           })

You then need to combine this with the analysis on all of the data; as you only need to build two more graphs, this is not the time-consuming part.

fun1 <- function(i, name) {
            x = graph_from_data_frame(i)
            d = data.frame(eigen_centrality(x, weights = E(x)$timestalked)$vector)
            setNames(d, name)
    }


a = fun1(d, "alldata")
lt = fun1(d[d$week %in% tail(unique(d$week), 2), ], "lasttwo")

# Combine: could use `cbind` in this example, but perhaps `merge` is 
# safer if there are different levels between dates
data.frame(grps, lt, a) # or
Reduce(merge, lapply(list(grps, a, lt), function(x) data.frame(x, nms = row.names(x))))

#   nms X2010.01.01 X2010.01.08 X2010.01.15  alldata lasttwo
# 1   A   0.5547002   0.9284767   1.0000000 0.909899     1.0
# 2   B   0.8320503   0.3713907   0.7071068 0.607475     0.5
# 3   C   1.0000000   1.0000000   0.7071068 1.000000     1.0

data.table

The time-consuming step will most likely be explicitly splitting the data and applying the graph functions. data.table should offer some benefit here, especially as the data gets bigger and/or there are more groups.

# function to apply to graph
fun <- function(d) {
  x = graph_from_data_frame(d)
  e = eigen_centrality(x, weights = E(x)$timestalked)$vector
  list(e, names(e))
}

library(data.table)
dcast(
  setDT(d)[, fun(.SD), by=week], # apply function - returns data in  long format
  V2 ~ week, value.var = "V1")   # convert to wide format

#    V2 2010-01-01 2010-01-08 2010-01-15
# 1:  A  0.5547002  0.9284767  1.0000000
# 2:  B  0.8320503  0.3713907  0.7071068
# 3:  C  1.0000000  1.0000000  0.7071068

Then run the function on the full data, and on the last two weeks, as before.
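A sketch of those two calls, reusing fun from above (d is already a data.table after the setDT() call):

# year to date: one graph over all rows
d[, fun(.SD)]

# last two weeks: subset to the two most recent dates first
d[week %in% tail(sort(unique(week)), 2), fun(.SD)]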

There are differences between the answers, which come down to how we use the weights argument when calculating the centrality; the other answers do not use weights.
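Concretely, the difference is just this (x being one of the igraph graphs built above):

eigen_centrality(x)$vector                              # unweighted: every edge counts equally
eigen_centrality(x, weights = E(x)$timestalked)$vector  # weighted by how often they talked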


For reference, d is the example data from the question:

d = structure(list(from = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A", 
"B", "C"), class = "factor"), to = structure(c(2L, 3L, 2L, 3L, 
2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A", 
"B", "C"), class = "factor"), timestalked = c(0L, 1L, 0L, 4L, 
1L, 2L, 0L, 1L, 0L, 2L, 1L, 0L, 1L, 2L, 1L, 0L, 0L, 0L), week = structure(c(1L, 
1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 
2L), .Label = c("1/1/2010", "1/15/2010", "1/8/2010"), class = "factor")), .Names = c("from", 
"to", "timestalked", "week"), class = "data.frame", row.names = c(NA, 
-18L))