从全局环境中名称符合特定模式的对象获取 colnames,然后只返回新增的列名
Get colnames from objects in global environment (With specific pattern), then just return what's new
我有一堆中间数据框对象,它们在全局环境中按创建顺序编号,即 IRIS1_St、IRIS2_Db、IRIS3_Sum、IRIS4_Change、IRIS10_bananas
我已经弄清楚如何提取这些对象并返回它们的行数和列数(不保留列表列,见下文),而且从技术上讲,我也已经提取了列名。但我怎么也想不出如何把这个 colnames 列变成非列表的形式,以便与滞后值进行比较,并返回一个只显示新增内容的更简单的列。我尝试了 data.table()、data.frame()、as.character(),然后用 str_replace_all() 将其转换为向量,但似乎都不起作用,大概是因为我不擅长使用列表!
library(dplyr)
library(purrr)
library(stringr)
# Build a chain of example data frames. Each step derives from the previous
# object and adds exactly one new column.
IRIS1_St <- iris

# Adds Petal.Length2 (twice the petal length).
IRIS2_Db <- mutate(IRIS1_St, Petal.Length2 = Petal.Length * 2)

# Adds Sepal.sum.
# NOTE(review): sum() over both columns yields a single grand total that is
# recycled to every row; confirm that a per-row sum was not intended.
IRIS3_Sum <- mutate(IRIS2_Db, Sepal.sum = sum(Sepal.Length, Sepal.Width))

# Adds SL.Change (difference from the previous row's Sepal.Length), then keeps
# only the rows whose Petal.Length is at least 4.
IRIS4_Change <- filter(
  mutate(IRIS3_Sum, SL.Change = Sepal.Length - lag(Sepal.Length)),
  Petal.Length >= 4
)

# Adds bananas: a label where Sepal.Length exceeds 6, NA for all other rows.
IRIS10_bananas <- mutate(
  IRIS4_Change,
  bananas = case_when(Sepal.Length > 6 ~ "BANANAS!!")
)
# Named list of every global object whose name starts with "IRIS".
# OS is kept as a top-level object because later snippets reuse it.
OS <- mget(grep("^IRIS", names(.GlobalEnv), value = TRUE), envir = globalenv())

# One row per object: its name, row count, column count and a list column
# (`colnames`) holding its column names.
Obj_Size <- OS %>%
  map_df(nrow) %>%
  tidyr::pivot_longer(everything(), names_to = "Obj_name", values_to = "nrow") %>%
  left_join(
    OS %>%
      map_df(ncol) %>%
      tidyr::pivot_longer(everything(), names_to = "Obj_name", values_to = "ncol"),
    by = "Obj_name"
  ) %>%
  # Bound by position: the rows above and this list column are both derived
  # from OS, so they share the same order.
  data.frame(OS %>% lapply(colnames) %>% data.table::data.table()) %>%
  # Extract the number that follows "IRIS" so that the objects sort numerically
  # (10 after 4, not after 1). Backslashes must be doubled inside an R string,
  # and the NA replacement must be a character value to match str_extract().
  mutate(
    number = as.numeric(tidyr::replace_na(
      str_extract(Obj_name, "(?i)(?<=IRIS\\D{0,1})\\d+"),
      "0"
    ))
  ) %>%
  arrange(number, Obj_name) %>%
  select(-number) %>%
  data.frame() %>%
  rename(colnames = ".")
# Just to separate out the colname extraction done so far: a one-column table
# whose list column holds each object's column names.
# data.table is namespace-qualified because the package is never attached.
OST <- OS %>% lapply(colnames) %>% data.table::data.table()
为了提取新增的内容,我尝试了以下方法,但由于该列是一个列表列,结果出错了。
# Attempt to derive the newly added column names for each object.
# NOTE(review): `colnames` is a list column, and as.character() / setdiff() are
# applied here to the column as a whole rather than to each row's vector of
# names, so this does not yield a per-row difference.
Obj_Size_New <- Obj_Size %>%
mutate(lag_col = as.character(lag(colnames)),
new_col = setdiff(as.character(colnames), lag_col))
预期输出如下;
Obj_name nrow ncol new_col
1 IRIS1_St 150 5 Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
2 IRIS2_Db 150 6 Petal.Length2
3 IRIS3_Sum 150 7 Sepal.sum
4 IRIS4_Change 89 8 SL.Change
5 IRIS10_bananas 89 9 bananas
我根据下面 akrun 的建议重写了上面的内容;
library(dplyr)
library(purrr)
library(stringr)
library(tibble)
# Named list of every global object whose name starts with "IRIS"; kept as a
# top-level object so that each summary below is derived from the same list.
OS <- mget(grep("^IRIS", names(.GlobalEnv), value = TRUE), envir = globalenv())

# One row per object: name, row count, column count, its column names (list
# column) and `new_col`, the names that the previous object did not have.
Obj_Size <- OS %>%
  map_df(nrow) %>% # row count per object
  tidyr::pivot_longer(everything(), names_to = "Obj_name", values_to = "nrow") %>%
  left_join( # repeat with ncol and join back by object name
    OS %>%
      map_df(ncol) %>%
      tidyr::pivot_longer(everything(), names_to = "Obj_name", values_to = "ncol"),
    by = "Obj_name"
  ) %>%
  left_join( # repeat with colnames as a two-column data frame
    OS %>%
      map(colnames) %>%
      enframe(name = "Obj_name", value = "colnames"),
    by = "Obj_name"
  ) %>%
  # Extract the number that follows "IRIS" so that the objects sort numerically
  # (10 after 4, not after 1). Backslashes must be doubled inside an R string,
  # and the NA replacement must be a character value to match str_extract().
  mutate(
    number = as.numeric(tidyr::replace_na(
      str_extract(Obj_name, "(?i)(?<=IRIS\\D{0,1})\\d+"),
      "0"
    ))
  ) %>%
  arrange(number, Obj_name) %>%
  select(-number) %>%
  data.frame() %>%
  # Compare each row's column names with the previous row's and keep only the
  # new ones, collapsed into one comma-separated string.
  mutate(new_col = map2_chr(colnames, lag(colnames), ~ toString(setdiff(.x, .y))))
我们可以使用 map2 对 list 列及其经过 lag 的 list 列逐行执行 setdiff,从而返回一个 list 列:
library(dplyr)
library(purrr)
# For each row, keep the column names that are absent from the previous row's
# names (as a list column), then drop the original `colnames` list column.
Obj_Size %>%
  mutate(new_col = map2(colnames, lag(colnames), ~ setdiff(.x, .y))) %>%
  select(-colnames)
-输出
# Obj_name nrow ncol new_col
#1 IRIS1_St 150 5 Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
#2 IRIS2_Db 150 6 Petal.Length2
#3 IRIS3_Sum 150 7 Sepal.sum
#4 IRIS4_Change 89 8 SL.Change
#5 IRIS10_bananas 89 9 bananas
如果需要的是 character 列,可以使用:
library(stringr)
# Same comparison, but each row's new names are collapsed into one
# comma-separated string so that new_col is a plain character column; the
# `colnames` list column is dropped afterwards.
Obj_Size %>%
  mutate(
    new_col = map2_chr(
      colnames,
      lag(colnames),
      function(current, previous) str_c(setdiff(current, previous), collapse = ", ")
    )
  ) %>%
  select(-colnames)
-输出
# Obj_name nrow ncol new_col
#1 IRIS1_St 150 5 Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
#2 IRIS2_Db 150 6 Petal.Length2
#3 IRIS3_Sum 150 7 Sepal.sum
#4 IRIS4_Change 89 8 SL.Change
#5 IRIS10_bananas 89 9 bananas
或者使用 base R 中的 Map:
# Base R variant: pair each row's column names with the previous row's (NA for
# the first row) and keep the names that are new; the result is a list column.
Obj_Size[["new_col"]] <- mapply(
  setdiff,
  Obj_Size[["colnames"]],
  c(NA, head(Obj_Size[["colnames"]], -1)),
  SIMPLIFY = FALSE
)
由于 colnames 是一个列表列,您可以使用 purrr 的 map 系列函数:
library(dplyr)
library(purrr)
# For each row, list the column names that the previous row did not have, as a
# single comma-separated string.
Obj_Size %>%
  mutate(
    new_col = map2_chr(
      colnames,
      lag(colnames),
      function(current, previous) toString(setdiff(current, previous))
    )
  )
其中 new_col 如下所示:
# new_col
#1 Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
#2 Petal.Length2
#3 Sepal.sum
#4 SL.Change
#5 bananas
在 base R 中可以使用 mapply:
# Base R variant: compare each row's column names with the previous row's (NA
# for the first row) and store the new names as one comma-separated string.
Obj_Size$new_col <- mapply(
  function(current, previous) paste(setdiff(current, previous), collapse = ", "),
  Obj_Size$colnames,
  c(NA, head(Obj_Size$colnames, -1))
)
我有一堆中间数据框对象,它们在全局环境中按创建顺序编号,即 IRIS1_St、IRIS2_Db、IRIS3_Sum、IRIS4_Change、IRIS10_bananas
我已经弄清楚如何提取这些对象并返回它们的行数和列数(不保留列表列,见下文),而且从技术上讲,我也已经提取了列名。但我怎么也想不出如何把这个 colnames 列变成非列表的形式,以便与滞后值进行比较,并返回一个只显示新增内容的更简单的列。我尝试了 data.table()、data.frame()、as.character(),然后用 str_replace_all() 将其转换为向量,但似乎都不起作用,大概是因为我不擅长使用列表!
library(dplyr)
library(purrr)
library(stringr)
# Build a chain of example data frames. Each step derives from the previous
# object and adds exactly one new column.
IRIS1_St <- iris

# Adds Petal.Length2 (twice the petal length).
IRIS2_Db <- mutate(IRIS1_St, Petal.Length2 = Petal.Length * 2)

# Adds Sepal.sum.
# NOTE(review): sum() over both columns yields a single grand total that is
# recycled to every row; confirm that a per-row sum was not intended.
IRIS3_Sum <- mutate(IRIS2_Db, Sepal.sum = sum(Sepal.Length, Sepal.Width))

# Adds SL.Change (difference from the previous row's Sepal.Length), then keeps
# only the rows whose Petal.Length is at least 4.
IRIS4_Change <- filter(
  mutate(IRIS3_Sum, SL.Change = Sepal.Length - lag(Sepal.Length)),
  Petal.Length >= 4
)

# Adds bananas: a label where Sepal.Length exceeds 6, NA for all other rows.
IRIS10_bananas <- mutate(
  IRIS4_Change,
  bananas = case_when(Sepal.Length > 6 ~ "BANANAS!!")
)
# Named list of every global object whose name starts with "IRIS".
# OS is kept as a top-level object because later snippets reuse it.
OS <- mget(grep("^IRIS", names(.GlobalEnv), value = TRUE), envir = globalenv())

# One row per object: its name, row count, column count and a list column
# (`colnames`) holding its column names.
Obj_Size <- OS %>%
  map_df(nrow) %>%
  tidyr::pivot_longer(everything(), names_to = "Obj_name", values_to = "nrow") %>%
  left_join(
    OS %>%
      map_df(ncol) %>%
      tidyr::pivot_longer(everything(), names_to = "Obj_name", values_to = "ncol"),
    by = "Obj_name"
  ) %>%
  # Bound by position: the rows above and this list column are both derived
  # from OS, so they share the same order.
  data.frame(OS %>% lapply(colnames) %>% data.table::data.table()) %>%
  # Extract the number that follows "IRIS" so that the objects sort numerically
  # (10 after 4, not after 1). Backslashes must be doubled inside an R string,
  # and the NA replacement must be a character value to match str_extract().
  mutate(
    number = as.numeric(tidyr::replace_na(
      str_extract(Obj_name, "(?i)(?<=IRIS\\D{0,1})\\d+"),
      "0"
    ))
  ) %>%
  arrange(number, Obj_name) %>%
  select(-number) %>%
  data.frame() %>%
  rename(colnames = ".")
# Just to separate out the colname extraction done so far: a one-column table
# whose list column holds each object's column names.
# data.table is namespace-qualified because the package is never attached.
OST <- OS %>% lapply(colnames) %>% data.table::data.table()
为了提取新增的内容,我尝试了以下方法,但由于该列是一个列表列,结果出错了。
# Attempt to derive the newly added column names for each object.
# NOTE(review): `colnames` is a list column, and as.character() / setdiff() are
# applied here to the column as a whole rather than to each row's vector of
# names, so this does not yield a per-row difference.
Obj_Size_New <- Obj_Size %>%
mutate(lag_col = as.character(lag(colnames)),
new_col = setdiff(as.character(colnames), lag_col))
预期输出如下;
Obj_name nrow ncol new_col
1 IRIS1_St 150 5 Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
2 IRIS2_Db 150 6 Petal.Length2
3 IRIS3_Sum 150 7 Sepal.sum
4 IRIS4_Change 89 8 SL.Change
5 IRIS10_bananas 89 9 bananas
我根据下面 akrun 的建议重写了上面的内容;
library(dplyr)
library(purrr)
library(stringr)
library(tibble)
# Named list of every global object whose name starts with "IRIS"; kept as a
# top-level object so that each summary below is derived from the same list.
OS <- mget(grep("^IRIS", names(.GlobalEnv), value = TRUE), envir = globalenv())

# One row per object: name, row count, column count, its column names (list
# column) and `new_col`, the names that the previous object did not have.
Obj_Size <- OS %>%
  map_df(nrow) %>% # row count per object
  tidyr::pivot_longer(everything(), names_to = "Obj_name", values_to = "nrow") %>%
  left_join( # repeat with ncol and join back by object name
    OS %>%
      map_df(ncol) %>%
      tidyr::pivot_longer(everything(), names_to = "Obj_name", values_to = "ncol"),
    by = "Obj_name"
  ) %>%
  left_join( # repeat with colnames as a two-column data frame
    OS %>%
      map(colnames) %>%
      enframe(name = "Obj_name", value = "colnames"),
    by = "Obj_name"
  ) %>%
  # Extract the number that follows "IRIS" so that the objects sort numerically
  # (10 after 4, not after 1). Backslashes must be doubled inside an R string,
  # and the NA replacement must be a character value to match str_extract().
  mutate(
    number = as.numeric(tidyr::replace_na(
      str_extract(Obj_name, "(?i)(?<=IRIS\\D{0,1})\\d+"),
      "0"
    ))
  ) %>%
  arrange(number, Obj_name) %>%
  select(-number) %>%
  data.frame() %>%
  # Compare each row's column names with the previous row's and keep only the
  # new ones, collapsed into one comma-separated string.
  mutate(new_col = map2_chr(colnames, lag(colnames), ~ toString(setdiff(.x, .y))))
我们可以使用 map2 对 list 列及其经过 lag 的 list 列逐行执行 setdiff,从而返回一个 list 列:
library(dplyr)
library(purrr)
# For each row, keep the column names that are absent from the previous row's
# names (as a list column), then drop the original `colnames` list column.
Obj_Size %>%
  mutate(new_col = map2(colnames, lag(colnames), ~ setdiff(.x, .y))) %>%
  select(-colnames)
-输出
# Obj_name nrow ncol new_col
#1 IRIS1_St 150 5 Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
#2 IRIS2_Db 150 6 Petal.Length2
#3 IRIS3_Sum 150 7 Sepal.sum
#4 IRIS4_Change 89 8 SL.Change
#5 IRIS10_bananas 89 9 bananas
如果需要的是 character 列,可以使用:
library(stringr)
# Same comparison, but each row's new names are collapsed into one
# comma-separated string so that new_col is a plain character column; the
# `colnames` list column is dropped afterwards.
Obj_Size %>%
  mutate(
    new_col = map2_chr(
      colnames,
      lag(colnames),
      function(current, previous) str_c(setdiff(current, previous), collapse = ", ")
    )
  ) %>%
  select(-colnames)
-输出
# Obj_name nrow ncol new_col
#1 IRIS1_St 150 5 Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
#2 IRIS2_Db 150 6 Petal.Length2
#3 IRIS3_Sum 150 7 Sepal.sum
#4 IRIS4_Change 89 8 SL.Change
#5 IRIS10_bananas 89 9 bananas
或者使用 base R 中的 Map:
# Base R variant: pair each row's column names with the previous row's (NA for
# the first row) and keep the names that are new; the result is a list column.
Obj_Size[["new_col"]] <- mapply(
  setdiff,
  Obj_Size[["colnames"]],
  c(NA, head(Obj_Size[["colnames"]], -1)),
  SIMPLIFY = FALSE
)
由于 colnames 是一个列表列,您可以使用 purrr 的 map 系列函数:
library(dplyr)
library(purrr)
# For each row, list the column names that the previous row did not have, as a
# single comma-separated string.
Obj_Size %>%
  mutate(
    new_col = map2_chr(
      colnames,
      lag(colnames),
      function(current, previous) toString(setdiff(current, previous))
    )
  )
其中 new_col 如下所示:
# new_col
#1 Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species
#2 Petal.Length2
#3 Sepal.sum
#4 SL.Change
#5 bananas
在 base R 中可以使用 mapply:
# Base R variant: compare each row's column names with the previous row's (NA
# for the first row) and store the new names as one comma-separated string.
Obj_Size$new_col <- mapply(
  function(current, previous) paste(setdiff(current, previous), collapse = ", "),
  Obj_Size$colnames,
  c(NA, head(Obj_Size$colnames, -1))
)