一次用多个列中的值替换字符串
Replace strings with values across multiple columns at once
我需要用跨多列的数字替换字符串。下面是一个示例数据集:
x <- c("Low Outlier", "High Outlier", "Novice", "Novice", "Emerging", NA, "Proficient", "Approaching")
y <- c("Novice", "Approaching", "Proficient", "Approaching", "High Outlier", "Proficient",NA, "Emerging")
z <- c("High Outlier", "Proficient", "Approaching", "Emerging", "Low Outlier", "Approaching", "Approaching", "Emerging")
sam <- cbind(x,y,z)
我需要将 "High Outlier" 和 "Low Outlier" 转换为 0,NA 保留为 NA,"Novice" 转换为 1,"Emerging" 转换为 2,"Approaching" 转换为 3,"Proficient" 转换为 4。
我尝试用 recode 转换单个变量:
sam$x.r <- recode(sam$x.r,'Low Outlier'=0,'High Outlier'=0,'Novice'=1,'Emerging'=2,'Approaching'=3, 'Proficient'=4)
我收到了“警告信息:
在 recode.numeric(Dat17_18.1$I.E.ScoreStat, Low Outlier
= 0, High Outlier
= 0, :
强制引入的 NAs"
我不确定如何一次重新编码所有变量。
对于这种情况,我们可以使用 dplyr
中的 case_when
library(dplyr)
# Recode every column of `sam` in a single pass. The replacement values are
# kept as strings ("0" to "4"), so the resulting columns are still character;
# any value that matches none of the labels (including NA) becomes NA.
sam %>%
  mutate_all(function(level) {
    case_when(
      level %in% c("Low Outlier", "High Outlier") ~ "0",
      level == "Novice" ~ "1",
      level == "Emerging" ~ "2",
      level == "Approaching" ~ "3",
      level == "Proficient" ~ "4",
      TRUE ~ NA_character_
    )
  })
# x y z
#1 0 1 0
#2 0 3 4
#3 1 4 3
#4 1 3 2
#5 2 0 0
#6 <NA> 4 3
#7 4 <NA> 3
#8 3 2 2
但是,最终输出包含字符列,因为我们的原始列也是字符。如果需要,我们可以添加 mutate_all(as.numeric)
将它们转换为数字。
数据
x <- c("Low Outlier", "High Outlier", "Novice", "Novice", "Emerging", NA,
"Proficient", "Approaching")
y <- c("Novice", "Approaching", "Proficient", "Approaching", "High Outlier",
"Proficient",NA, "Emerging")
z <- c("High Outlier", "Proficient", "Approaching", "Emerging", "Low Outlier",
"Approaching", "Approaching", "Emerging")
sam <- data.frame(x,y,z, stringsAsFactors = FALSE)
这样写很快就会变得重复。这是一个简单的函数:
#' Replace regex matches in every column of a data frame
#'
#' @param df A data frame, or an object that `as.data.frame()` can convert
#'   (for example a character matrix).
#' @param y Regular expression describing what to replace; passed to `gsub()`
#'   as the pattern.
#' @param z Replacement text; passed to `gsub()` as the replacement.
#' @return A data frame with the same rows and columns as `df`. Every column
#'   is returned as a character vector in which all matches of `y` have been
#'   replaced by `z`; `NA` values stay `NA`.
my_replacer <- function(df, y, z) {
  df <- as.data.frame(df, stringsAsFactors = FALSE)
  # Work column by column instead of using apply(): apply() first converts the
  # data frame to a matrix (which pads numeric columns with spaces) and turns
  # a one-row input into a plain vector, so the result would lose its shape.
  df[] <- lapply(df, function(column) gsub(y, z, column))
  df
}
my_replacer(sam,"Emerging.*","2")
以下是我的使用方法:
library(dplyr)#can use ifelse. Still repetitive
# Make sure `sam` is a data frame and turn any factor columns back into plain
# character vectors, then replace every value starting with "Emerging" by "2".
sam <- mutate_if(as.data.frame(sam), is.factor, as.character)
my_replacer(sam, "Emerging.*", "2")
结果:
x y z
1 Low Outlier Novice High Outlier
2 High Outlier Approaching Proficient
3 Novice Proficient Approaching
4 Novice Approaching 2
5 2 High Outlier Low Outlier
6 <NA> Proficient Approaching
7 Proficient <NA> Approaching
8 Approaching 2 2
替换其他:
my_replacer(sam,"Novi.*","1")
x y z
1 Low Outlier 1 High Outlier
2 High Outlier Approaching Proficient
3 1 Proficient Approaching
4 1 Approaching Emerging
5 Emerging High Outlier Low Outlier
6 <NA> Proficient Approaching
7 Proficient <NA> Approaching
8 Approaching Emerging Emerging
就这么干-
# Recode every cell of `sam` in one call: each label on the left is replaced
# by the number on the right (both outlier labels map to 0). Assigning into
# `sam[]` replaces the contents of the existing object instead of rebinding
# the name.
# NOTE(review): this appears to rely on `sam` being the character matrix built
# with cbind(); confirm before applying it to a data frame.
sam[] <- recode(sam,'Low Outlier'=0,
'High Outlier'=0,
'Novice'=1,
'Emerging'=2,
'Approaching'=3,
'Proficient'=4)
> sam
x y z
[1,] "0" "1" "0"
[2,] "0" "3" "4"
[3,] "1" "4" "3"
[4,] "1" "3" "2"
[5,] "2" "0" "0"
[6,] NA "4" "3"
[7,] "4" NA "3"
[8,] "3" "2" "2"
我会使用命名向量作为映射
library(dplyr)
# Lookup table: the names are the labels, the values are their numeric codes.
mapping <- c(
  "High Outlier" = 0, "Low Outlier" = 0, "Novice" = 1,
  "Emerging" = 2, "Approaching" = 3, "Proficient" = 4
)
sam %>%
  as.data.frame() %>%
  # Look values up by label text: as.character() prevents factor columns from
  # being indexed by their integer codes, and unname() drops the label names
  # from the result. Labels missing from `mapping` (including NA) become NA.
  mutate_all(function(i) unname(mapping[as.character(i)]))
另一个解决方案:使用 factor 重新编码,并用 approxfun 赋值:
# Convert the labels to factor codes 1..6 (in the level order listed below) and
# let approxfun() translate each code into its score. The knots must cover all
# six levels: a code beyond the last knot is outside the interpolation range
# and would come back as NA.
# NOTE: relies on `sam` being the character matrix built with cbind(); the
# scores are written back as strings because the matrix is character.
sam[] <- approxfun(1:6, c(0:4, 0))(
  as.numeric(factor(sam,
                    c("Low Outlier", "Novice",
                      "Emerging", "Approaching",
                      "Proficient", "High Outlier"))))
#      x   y   z
# [1,] "0" "1" "0"
# [2,] "0" "3" "4"
# [3,] "1" "4" "3"
# [4,] "1" "3" "2"
# [5,] "2" "0" "0"
# [6,] NA  "4" "3"
# [7,] "4" NA  "3"
# [8,] "3" "2" "2"
我需要用跨多列的数字替换字符串。下面是一个示例数据集:
x <- c("Low Outlier", "High Outlier", "Novice", "Novice", "Emerging", NA, "Proficient", "Approaching")
y <- c("Novice", "Approaching", "Proficient", "Approaching", "High Outlier", "Proficient",NA, "Emerging")
z <- c("High Outlier", "Proficient", "Approaching", "Emerging", "Low Outlier", "Approaching", "Approaching", "Emerging")
sam <- cbind(x,y,z)
我需要将 "High Outlier" 和 "Low Outlier" 转换为 0,NA 保留为 NA,"Novice" 转换为 1,"Emerging" 转换为 2,"Approaching" 转换为 3,"Proficient" 转换为 4。
我尝试用 recode 转换单个变量:
sam$x.r <- recode(sam$x.r,'Low Outlier'=0,'High Outlier'=0,'Novice'=1,'Emerging'=2,'Approaching'=3, 'Proficient'=4)
我收到了“警告信息:
在 recode.numeric(Dat17_18.1$I.E.ScoreStat, Low Outlier
= 0, High Outlier
= 0, :
强制引入的 NAs"
我不确定如何一次重新编码所有变量。
对于这种情况,我们可以使用 dplyr
中的 case_when
library(dplyr)
# Recode every column of `sam` in a single pass. The replacement values are
# kept as strings ("0" to "4"), so the resulting columns are still character;
# any value that matches none of the labels (including NA) becomes NA.
sam %>%
  mutate_all(function(level) {
    case_when(
      level %in% c("Low Outlier", "High Outlier") ~ "0",
      level == "Novice" ~ "1",
      level == "Emerging" ~ "2",
      level == "Approaching" ~ "3",
      level == "Proficient" ~ "4",
      TRUE ~ NA_character_
    )
  })
# x y z
#1 0 1 0
#2 0 3 4
#3 1 4 3
#4 1 3 2
#5 2 0 0
#6 <NA> 4 3
#7 4 <NA> 3
#8 3 2 2
但是,最终输出包含字符列,因为我们的原始列也是字符。如果需要,我们可以添加 mutate_all(as.numeric)
将它们转换为数字。
数据
x <- c("Low Outlier", "High Outlier", "Novice", "Novice", "Emerging", NA,
"Proficient", "Approaching")
y <- c("Novice", "Approaching", "Proficient", "Approaching", "High Outlier",
"Proficient",NA, "Emerging")
z <- c("High Outlier", "Proficient", "Approaching", "Emerging", "Low Outlier",
"Approaching", "Approaching", "Emerging")
sam <- data.frame(x,y,z, stringsAsFactors = FALSE)
这样写很快就会变得重复。这是一个简单的函数:
#' Replace regex matches in every column of a data frame
#'
#' @param df A data frame, or an object that `as.data.frame()` can convert
#'   (for example a character matrix).
#' @param y Regular expression describing what to replace; passed to `gsub()`
#'   as the pattern.
#' @param z Replacement text; passed to `gsub()` as the replacement.
#' @return A data frame with the same rows and columns as `df`. Every column
#'   is returned as a character vector in which all matches of `y` have been
#'   replaced by `z`; `NA` values stay `NA`.
my_replacer <- function(df, y, z) {
  df <- as.data.frame(df, stringsAsFactors = FALSE)
  # Work column by column instead of using apply(): apply() first converts the
  # data frame to a matrix (which pads numeric columns with spaces) and turns
  # a one-row input into a plain vector, so the result would lose its shape.
  df[] <- lapply(df, function(column) gsub(y, z, column))
  df
}
my_replacer(sam,"Emerging.*","2")
以下是我的使用方法:
library(dplyr)#can use ifelse. Still repetitive
# Make sure `sam` is a data frame and turn any factor columns back into plain
# character vectors, then replace every value starting with "Emerging" by "2".
sam <- mutate_if(as.data.frame(sam), is.factor, as.character)
my_replacer(sam, "Emerging.*", "2")
结果:
x y z
1 Low Outlier Novice High Outlier
2 High Outlier Approaching Proficient
3 Novice Proficient Approaching
4 Novice Approaching 2
5 2 High Outlier Low Outlier
6 <NA> Proficient Approaching
7 Proficient <NA> Approaching
8 Approaching 2 2
替换其他:
my_replacer(sam,"Novi.*","1")
x y z
1 Low Outlier 1 High Outlier
2 High Outlier Approaching Proficient
3 1 Proficient Approaching
4 1 Approaching Emerging
5 Emerging High Outlier Low Outlier
6 <NA> Proficient Approaching
7 Proficient <NA> Approaching
8 Approaching Emerging Emerging
就这么干-
# Recode every cell of `sam` in one call: each label on the left is replaced
# by the number on the right (both outlier labels map to 0). Assigning into
# `sam[]` replaces the contents of the existing object instead of rebinding
# the name.
# NOTE(review): this appears to rely on `sam` being the character matrix built
# with cbind(); confirm before applying it to a data frame.
sam[] <- recode(sam,'Low Outlier'=0,
'High Outlier'=0,
'Novice'=1,
'Emerging'=2,
'Approaching'=3,
'Proficient'=4)
> sam
x y z
[1,] "0" "1" "0"
[2,] "0" "3" "4"
[3,] "1" "4" "3"
[4,] "1" "3" "2"
[5,] "2" "0" "0"
[6,] NA "4" "3"
[7,] "4" NA "3"
[8,] "3" "2" "2"
我会使用命名向量作为映射
library(dplyr)
# Lookup table: the names are the labels, the values are their numeric codes.
mapping <- c(
  "High Outlier" = 0, "Low Outlier" = 0, "Novice" = 1,
  "Emerging" = 2, "Approaching" = 3, "Proficient" = 4
)
sam %>%
  as.data.frame() %>%
  # Look values up by label text: as.character() prevents factor columns from
  # being indexed by their integer codes, and unname() drops the label names
  # from the result. Labels missing from `mapping` (including NA) become NA.
  mutate_all(function(i) unname(mapping[as.character(i)]))
另一个解决方案:使用 factor 重新编码,并用 approxfun 赋值:
# Convert the labels to factor codes 1..6 (in the level order listed below) and
# let approxfun() translate each code into its score. The knots must cover all
# six levels: a code beyond the last knot is outside the interpolation range
# and would come back as NA.
# NOTE: relies on `sam` being the character matrix built with cbind(); the
# scores are written back as strings because the matrix is character.
sam[] <- approxfun(1:6, c(0:4, 0))(
  as.numeric(factor(sam,
                    c("Low Outlier", "Novice",
                      "Emerging", "Approaching",
                      "Proficient", "High Outlier"))))
#      x   y   z
# [1,] "0" "1" "0"
# [2,] "0" "3" "4"
# [3,] "1" "4" "3"
# [4,] "1" "3" "2"
# [5,] "2" "0" "0"
# [6,] NA  "4" "3"
# [7,] "4" NA  "3"
# [8,] "3" "2" "2"