使用其他数据填充缺失值?
Fill up missing values using the other data?
# Sample input A: key 00EF has one known name, key 00FR has none.
A <- data.frame(
  Item_A = rep(c("00EF", "00FR"), times = c(5, 2)),
  Item_B = c(rep(NA_character_, 4), "JAMES RIVER", rep(NA_character_, 2))
)
# Sample input B: names for both keys, with one gap under 00EF.
B <- data.frame(
  Item_A = rep(c("00EF", "00FR"), times = c(3, 2)),
  Item_B = c("JAMES RIVER", NA, "JAMES RIVER", rep("RICE MIDSTREAM", 2))
)
预期结果:
# Desired result for A: every row carries the name that belongs to its key.
A <- data.frame(
  Item_A = rep(c("00EF", "00FR"), times = c(5, 2)),
  Item_B = rep(c("JAMES RIVER", "RICE MIDSTREAM"), times = c(5, 2))
)
# Desired result for B: the single gap under 00EF is filled in.
B <- data.frame(
  Item_A = rep(c("00EF", "00FR"), times = c(3, 2)),
  Item_B = rep(c("JAMES RIVER", "RICE MIDSTREAM"), times = c(3, 2))
)
我需要根据 Item_A 相同的其他行中的 Item_B 值,来填补缺失的 Item_B。例如,数据集 A 中 Item_B 的第 1 到第 4 个观测值都应变为 "JAMES RIVER"。
能否请您提出一种方法来填充 R 中的缺失值?我尝试了很多技术,但无法得到我想要的。
您可以试试 tidyr 包中的辅助函数 fill:
library(tidyr)
# Fill missing Item_B from the nearest value above, then from below for any
# rows still missing at the top ("downup" = down first, then up).
# NOTE(review): the fill is not grouped by Item_A, so a name can be carried
# across different keys; the printed result shows the 00FR rows receiving
# JAMES RIVER.
A %>%
  tidyr::fill(Item_B, .direction = "downup")
Item_A Item_B
1 00FF JAMES RIVER
2 00FF JAMES RIVER
3 00FF JAMES RIVER
4 00FF JAMES RIVER
5 00FF JAMES RIVER
6 00FR JAMES RIVER
7 00FR JAMES RIVER
据我对这个问题的理解,这不只是在每个 data.frame 的某一列中简单地填充缺失值。我认为需要借助一个查找表(映射表),来填写每个 Item_A 所对应的 Item_B 的值:
library(data.table)
# Build the lookup table: stack A and B, drop duplicate rows, and keep only
# the rows whose Item_B is not missing.
map <- unique(rbindlist(list(A, B)))[!is.na(Item_B)]
# Alternative when the inputs carry columns besides Item_A and Item_B: filter
# out missing Item_B first, keep just the two columns, then de-duplicate.
map <- unique(rbindlist(list(A, B))[!is.na(Item_B), .(Item_A, Item_B)])
map
Item_A Item_B
1: 00FF JAMES RIVER
2: 00EF JAMES RIVER
3: 00FR RICE MIDSTREAM
# join and replace
# Update join: convert A to a data.table by reference, then for every key
# found in `map` assign map's Item_B (the i. prefix) to the matching rows.
setDT(A)
A[map, on = "Item_A", Item_B := i.Item_B]
A[]
Item_A Item_B
1: 00FF JAMES RIVER
2: 00FF JAMES RIVER
3: 00FF JAMES RIVER
4: 00FF JAMES RIVER
5: 00FF JAMES RIVER
6: 00FR RICE MIDSTREAM
7: 00FR RICE MIDSTREAM
# Update join: convert B to a data.table by reference, then for every key
# found in `map` assign map's Item_B (the i. prefix) to the matching rows.
setDT(B)
B[map, on = "Item_A", Item_B := i.Item_B]
B[]
Item_A Item_B
1: 00EF JAMES RIVER
2: 00EF JAMES RIVER
3: 00EF JAMES RIVER
4: 00FR RICE MIDSTREAM
5: 00FR RICE MIDSTREAM
在连接过程中,有两列都叫 Item_B:一列来自第一个 data.table,即 A(或 B);另一列来自第二个 data.table,即 map。为了区分它们,前缀 i. 表示 i.Item_B 应取自 map。
您可以尝试创建一个字典数据框。
library(dplyr)
# Dictionary of known pairs: stack A and B, drop the rows without a name,
# and keep each remaining (Item_A, Item_B) combination once.
dictionnary <- A %>%
  bind_rows(B) %>%
  filter(!is.na(Item_B)) %>%
  distinct()
# Look up the Item_B name stored in the global `dictionnary` for an Item_A id.
#
# id: a single Item_A key.
# Returns the Item_B of the first row whose Item_A equals `id`, or NA when the
# id is not in the dictionary. The result therefore always has length one, so
# sapply()/vapply() callers get a plain vector instead of a list.
find_name <- function(id) {
  keys <- dictionnary[["Item_A"]]
  # match() gives the position of the first hit, or NA when there is none.
  dictionnary[["Item_B"]][match(id, keys)]
}
# Example lookup: resolve a vector of ids to names, one find_name() call per id.
test_id <- c("00EF", "00EF", "00EF", "00FR", "00FR")
# vapply() requires exactly one character value per id; sapply() would
# silently return a list if any lookup yielded zero or several values.
new_names <- vapply(test_id, find_name, character(1))
然后您可以声明您的数据框:
# Rebuild both data frames with Item_B resolved through find_name().
# The ids are taken from A and B themselves instead of retyped literals, so
# each id is one that actually occurs in the data the dictionary was built from.
New_A <- data.frame(
  Item_A = A[["Item_A"]],
  Item_B = vapply(
    as.character(A[["Item_A"]]), find_name, character(1),
    USE.NAMES = FALSE
  )
)
New_B <- data.frame(
  Item_A = B[["Item_A"]],
  Item_B = vapply(
    as.character(B[["Item_A"]]), find_name, character(1),
    USE.NAMES = FALSE
  )
)
@YXCHEN 根据您的输入更新
# Lookup of the distinct (Item_A, Item_B) rows that have a known name.
lookup_df <- unique(rbindlist(list(A, B)))[!is.na(Item_B)]
# Keep only the key column of each input and pull the name in from the lookup.
A %>%
  select(Item_A) %>%
  left_join(lookup_df)
B %>%
  select(Item_A) %>%
  left_join(lookup_df)
# Sample input A: key 00EF has one known name, key 00FR has none.
A <- data.frame(
  Item_A = rep(c("00EF", "00FR"), times = c(5, 2)),
  Item_B = c(rep(NA_character_, 4), "JAMES RIVER", rep(NA_character_, 2))
)
# Sample input B: names for both keys, with one gap under 00EF.
B <- data.frame(
  Item_A = rep(c("00EF", "00FR"), times = c(3, 2)),
  Item_B = c("JAMES RIVER", NA, "JAMES RIVER", rep("RICE MIDSTREAM", 2))
)
预期结果:
# Desired result for A: every row carries the name that belongs to its key.
A <- data.frame(
  Item_A = rep(c("00EF", "00FR"), times = c(5, 2)),
  Item_B = rep(c("JAMES RIVER", "RICE MIDSTREAM"), times = c(5, 2))
)
# Desired result for B: the single gap under 00EF is filled in.
B <- data.frame(
  Item_A = rep(c("00EF", "00FR"), times = c(3, 2)),
  Item_B = rep(c("JAMES RIVER", "RICE MIDSTREAM"), times = c(3, 2))
)
我需要根据 Item_A 相同的其他行中的 Item_B 值,来填补缺失的 Item_B。例如,数据集 A 中 Item_B 的第 1 到第 4 个观测值都应变为 "JAMES RIVER"。
能否请您提出一种方法来填充 R 中的缺失值?我尝试了很多技术,但无法得到我想要的。
您可以试试 tidyr 包中的辅助函数 fill:
library(tidyr)
# Fill missing Item_B from the nearest value above, then from below for any
# rows still missing at the top ("downup" = down first, then up).
# NOTE(review): the fill is not grouped by Item_A, so a name can be carried
# across different keys; the printed result shows the 00FR rows receiving
# JAMES RIVER.
A %>%
  tidyr::fill(Item_B, .direction = "downup")
Item_A Item_B
1 00FF JAMES RIVER
2 00FF JAMES RIVER
3 00FF JAMES RIVER
4 00FF JAMES RIVER
5 00FF JAMES RIVER
6 00FR JAMES RIVER
7 00FR JAMES RIVER
据我对这个问题的理解,这不只是在每个 data.frame 的某一列中简单地填充缺失值。我认为需要借助一个查找表(映射表),来填写每个 Item_A 所对应的 Item_B 的值:
library(data.table)
# Build the lookup table: stack A and B, drop duplicate rows, and keep only
# the rows whose Item_B is not missing.
map <- unique(rbindlist(list(A, B)))[!is.na(Item_B)]
# Alternative when the inputs carry columns besides Item_A and Item_B: filter
# out missing Item_B first, keep just the two columns, then de-duplicate.
map <- unique(rbindlist(list(A, B))[!is.na(Item_B), .(Item_A, Item_B)])
map
Item_A Item_B
1: 00FF JAMES RIVER
2: 00EF JAMES RIVER
3: 00FR RICE MIDSTREAM
# join and replace
# Update join: convert A to a data.table by reference, then for every key
# found in `map` assign map's Item_B (the i. prefix) to the matching rows.
setDT(A)
A[map, on = "Item_A", Item_B := i.Item_B]
A[]
Item_A Item_B
1: 00FF JAMES RIVER
2: 00FF JAMES RIVER
3: 00FF JAMES RIVER
4: 00FF JAMES RIVER
5: 00FF JAMES RIVER
6: 00FR RICE MIDSTREAM
7: 00FR RICE MIDSTREAM
# Update join: convert B to a data.table by reference, then for every key
# found in `map` assign map's Item_B (the i. prefix) to the matching rows.
setDT(B)
B[map, on = "Item_A", Item_B := i.Item_B]
B[]
Item_A Item_B
1: 00EF JAMES RIVER
2: 00EF JAMES RIVER
3: 00EF JAMES RIVER
4: 00FR RICE MIDSTREAM
5: 00FR RICE MIDSTREAM
在连接过程中,有两列都叫 Item_B:一列来自第一个 data.table,即 A(或 B);另一列来自第二个 data.table,即 map。为了区分它们,前缀 i. 表示 i.Item_B 应取自 map。
您可以尝试创建一个字典数据框。
library(dplyr)
# Dictionary of known pairs: stack A and B, drop the rows without a name,
# and keep each remaining (Item_A, Item_B) combination once.
dictionnary <- A %>%
  bind_rows(B) %>%
  filter(!is.na(Item_B)) %>%
  distinct()
# Look up the Item_B name stored in the global `dictionnary` for an Item_A id.
#
# id: a single Item_A key.
# Returns the Item_B of the first row whose Item_A equals `id`, or NA when the
# id is not in the dictionary. The result therefore always has length one, so
# sapply()/vapply() callers get a plain vector instead of a list.
find_name <- function(id) {
  keys <- dictionnary[["Item_A"]]
  # match() gives the position of the first hit, or NA when there is none.
  dictionnary[["Item_B"]][match(id, keys)]
}
# Example lookup: resolve a vector of ids to names, one find_name() call per id.
test_id <- c("00EF", "00EF", "00EF", "00FR", "00FR")
# vapply() requires exactly one character value per id; sapply() would
# silently return a list if any lookup yielded zero or several values.
new_names <- vapply(test_id, find_name, character(1))
然后您可以声明您的数据框:
# Rebuild both data frames with Item_B resolved through find_name().
# The ids are taken from A and B themselves instead of retyped literals, so
# each id is one that actually occurs in the data the dictionary was built from.
New_A <- data.frame(
  Item_A = A[["Item_A"]],
  Item_B = vapply(
    as.character(A[["Item_A"]]), find_name, character(1),
    USE.NAMES = FALSE
  )
)
New_B <- data.frame(
  Item_A = B[["Item_A"]],
  Item_B = vapply(
    as.character(B[["Item_A"]]), find_name, character(1),
    USE.NAMES = FALSE
  )
)
@YXCHEN 根据您的输入更新
# Lookup of the distinct (Item_A, Item_B) rows that have a known name.
lookup_df <- unique(rbindlist(list(A, B)))[!is.na(Item_B)]
# Keep only the key column of each input and pull the name in from the lookup.
A %>%
  select(Item_A) %>%
  left_join(lookup_df)
B %>%
  select(Item_A) %>%
  left_join(lookup_df)