R:如何使用 ifelse() 函数或其他方法修复特定数据集行中的错误
R: How to fix errors in specific dataset rows by using ifelse() function or other methods
我的数据集包含以下拼写错误
unique(d$gender)
[1] "k" "kobieta" "M" "K" "m─Ö┼╝czyzna" "21" "m" "M─Ö┼╝czyzna"
> unique(d$age)
[1] 19 NA 21 20 30 32 22 25 29
实际上,性别为 21 的行和年龄为 NA 的行,其取值被调换了;此外,性别变量使用了多种不同的写法(事实上,所有以 'k' 开头的取值都对应女性 'F',以 'm' 开头的取值对应男性 'M')。我写了下面这条命令来修复性别变量:
> d$gender = ifelse(d$gender == 'K', 'F',
+ ifelse(d$gender =='kobieta', 'F', ifelse(d$gender == 'k', 'F',
+ ifelse(d$gender == "m-Ö++czyzna", 'M',ifelse(d$gender == '21', 'M',
+ ifelse(d$gender == 'm', 'M', ifelse(d$gender == 'M-Ö++czyzna', 'M',
+ ifelse(d$gender == 'M', 'M', 'M'))))))))
>
> unique(d$gender)
[1] "F" "M"
但我不知道如何对年龄变量做同样的处理,也不确定这种方法是否正确。有人有什么建议吗?
这是 dput() 结果:
dput(head(d,50))
structure(list(ID = c("P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323"), gender = c("F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F"), age = c(19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19), fixation_time = c(60,
60, 60, 60, 60, 70, 50, 50, 50, 70, 70, 60, 50, 60, 70, 70, 50,
70, 70, 60, 70, 50, 50, 50, 60, 70, 60, 50, 60, 70, 60, 70, 50,
60, 70, 50, 50, 70, 70, 70, 70, 50, 60, 50, 60, 60, 70, 50, 60,
60), block = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
t1.key = c("None", "None", "None", "space", "None", "space",
"None", "None", "None", "space", "None", "None", "space",
"None", "None", "space", "None", "None", "space", "None",
"space", "space", "space", "None", "None", "None", "space",
"space", "None", "None", "space", "None", "None", "None",
"None", "None", "None", "space", "space", "None", "None",
"None", "None", "space", "None", "None", "space", "None",
"space", "None"), T1.response = structure(c(1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
1L), .Label = c("0", "1"), class = "factor"), COND = c("NR",
"NR", "NR", "R", "NR", "R", "NR", "NR", "NR", "R", "NR",
"NR", "R", "NR", "NR", "R", "NR", "NR", "R", "NR", "R", "R",
"R", "NR", "NR", "NR", "R", "R", "NR", "NR", "R", "NR", "NR",
"NR", "NR", "NR", "NR", "R", "R", "NR", "NR", "NR", "NR",
"R", "NR", "NR", "R", "NR", "R", "NR"), T1.rt = c(NA, NA,
NA, 0.812299799988978, NA, 0.72336569998879, NA, NA, NA,
0.772733500052709, NA, NA, 0.606754800013732, NA, NA, 0.601030899968464,
NA, NA, 0.838272600027267, NA, 0.305548300035298, 0.849945599969942,
0.748269900039304, NA, NA, NA, 0.859215400007088, 0.95704890001798,
NA, NA, 0.874362500035204, NA, NA, NA, NA, NA, NA, 0.270455699996091,
0.75726039998699, NA, NA, NA, NA, 0.762694000033662, NA,
NA, 0.789715700026136, NA, 0.90579859999707, NA), CR.key = c("p",
"p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p",
"p", "o", "p", "i", "i", "h", "u", "i", "u", "o", "o", "p",
"p", "p", "o", "p", "i", "o", "p", "p", "p", "o", "o", "o",
"p", "i", "p", "p", "o", "o", "i", "i", "o", "o", "i", "i",
"u"), CR.rt = c(0.651771800010465, 0.585048799985088, 0.652350199990906,
0.69888829998672, 1.01917029998731, 0.550036200031173, 0.0361186999944039,
0.568817299965303, 0.452191599993966, 0.514980700041633,
0.619590600021184, 0.719264700019266, 0.466181399999186,
0.45217840000987, 0.668881699966732, 0.914478300022893, 1.01910460001091,
1.40315000002738, 1.69993370003067, 1.71914210001705, 1.29938790004235,
0.698139799991623, 0.848338100011461, 0.651829700043891,
0.486136299965438, 0.703567499993369, 0.76673849998042, 0.54929809999885,
0.718664799991529, 0.768383099988569, 0.898415500007104,
0.819344500021543, 0.61898209998617, 0.737225699995179, 1.03654629999073,
0.971092400024645, 1.4362695000018, 0.999490200018045, 0.932840399967972,
0.586312200000975, 0.786785800009966, 1.01987839996582, 0.93673920002766,
0.715710600023158, 0.819960499997251, 0.75370900001144, 0.818668299994897,
0.903600800025742, 1.1176545000053, 1.10352450003847), trial_num = c(0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51,
52, 53), ldots = c(48, 48, 52, 55, 51, 51, 52, 49, 45, 55,
49, 49, 51, 49, 48, 52, 45, 49, 45, 55, 51, 48, 55, 51, 45,
45, 52, 48, 48, 48, 55, 51, 49, 48, 49, 51, 51, 55, 51, 49,
45, 55, 51, 55, 55, 52, 52, 48, 49, 52), rdots = c(52, 52,
48, 45, 49, 49, 48, 51, 55, 45, 51, 51, 49, 51, 52, 48, 55,
51, 55, 45, 49, 52, 45, 49, 55, 55, 48, 52, 52, 52, 45, 49,
51, 52, 51, 49, 49, 45, 49, 51, 55, 45, 49, 45, 45, 48, 48,
52, 51, 48), TASK = c("left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left"), T1.correct = structure(c(1L,
1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L), .Label = c("0", "1"), class = "factor"),
Go.Nogo..whether.a.person.should.respond. = c("NR", "NR",
"R", "R", "R", "R", "R", "NR", "NR", "R", "NR", "NR", "R",
"NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "R", "R",
"NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "NR",
"NR", "R", "R", "R", "R", "NR", "NR", "R", "R", "R", "R",
"R", "R", "NR", "NR", "R"), T1.ACC = structure(c(2L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 1L), .Label = c("0", "1"), class = "factor"), CR = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L,
2L, 2L, 5L, 1L, 2L, 1L, 3L, 3L, 4L, 4L, 4L, 3L, 4L, 2L, 3L,
4L, 4L, 4L, 3L, 3L, 3L, 4L, 2L, 4L, 4L, 3L, 3L, 2L, 2L, 3L,
3L, 2L, 2L, 1L), .Label = c("1", "2", "3", "4", "9"), class = "factor"),
difficulty = c("medium", "medium", "medium", "easy", "hard",
"hard", "medium", "hard", "easy", "easy", "hard", "hard",
"hard", "hard", "medium", "medium", "easy", "hard", "easy",
"easy", "hard", "medium", "easy", "hard", "easy", "easy",
"medium", "medium", "medium", "medium", "easy", "hard", "hard",
"medium", "hard", "hard", "hard", "easy", "hard", "hard",
"easy", "easy", "hard", "easy", "easy", "medium", "medium",
"medium", "hard", "medium")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
我不清楚年龄(age)变量具体出了什么问题。但是 ifelse 语句可以改写成下面的形式:
如果d$gender
字段没有异常:
# No anomalies expected: a first letter of "k"/"K" means female, anything else male.
d$gender2 <- ifelse(tolower(substr(d$gender, 1, 1)) == "k", "F", "M")
如果d$gender
字段有异常:
# Recode gender when the column may contain anomalies:
#   first letter "k"/"K"                     -> "F"
#   first letter "m"/"M" or the stray "21"   -> "M"
#   anything else                            -> "Other"
# The original snippet was missing the closing parenthesis of the outer
# ifelse() call and therefore did not parse.
d$gender2 <- ifelse(tolower(substr(d$gender, 1, 1)) == "k", "F",
  ifelse(tolower(substr(d$gender, 1, 1)) == "m" | d$gender == "21", "M", "Other"))
我认为这种写法更方便。你可以使用类似的变体。
至于 age 变量,我不清楚你想怎么处理。
我想向您介绍两件事:%in%
运算符和 datastep()
函数。
%in%
运算符可让您将可能的匹配项合并到一个向量中。所以你可以写 ifelse(x %in% c(y, z), a, x)
而不是 ifelse(x == y, a, ifelse(x == z, a, x))
。可以大大减少嵌套条件的数量。
其次,我编写了一个名为 libr 的程序包,其中包含一个名为 datastep()
的函数,该函数专门设计用于像您尝试做的那样清理数据。适用于条件嵌套较多,逻辑复杂的情况。它逐行遍历数据,让您检查每一行的值,并根据特定行的值创建新值。最好的是,您可以根据需要嵌套条件语句,并且仍然可以阅读它们。
这是一个同时使用 %in%
和 datastep()
函数的示例:
library(libr)
# Sample data: the gender spellings and ages listed in the question,
# plus one already-clean "F" value.
d <- data.frame(gender = c("k", "kobieta", "M", "K", "m─Ö┼╝czyzna", "21", "m", "M─Ö┼╝czyzna", "F"),
age = c(19, NA, 21, 20, 30, 32, 22, 25, 29))
# Recode with libr's datastep(). Per the accompanying description the block is
# evaluated row by row; the variables assigned inside it (gender_corrected,
# age_corrected) show up as new columns in the printed result d2.
d2 <- datastep(d, {
if (gender %in% c('K', 'k', 'kobieta', 'f')) {
# Known female spellings -> 'F'.
gender_corrected <- 'F'
} else if (gender %in% c('m', 'm─Ö┼╝czyzna', 'M─Ö┼╝czyzna')) {
# Known male spellings (including the mis-encoded ones) -> 'M'.
gender_corrected <- 'M'
} else if (gender == 21) {
# The stray value "21" is mapped to 'M', mirroring the question's own
# recoding; == coerces the number 21 to the string "21" for the comparison.
gender_corrected <- 'M'
} else {
# Anything not listed above (e.g. an already-correct 'M' or 'F') is kept as is.
gender_corrected <- gender
}
if (is.na(age)) {
# Placeholder: decide how to handle a missing age; nothing is assigned in this branch.
} else {
if (age < 10) {
# Placeholder: decide how to handle ages below 10; nothing is assigned in this branch.
} else {
# Age is present and at least 10: carry it over unchanged.
age_corrected <- age
}
}
})
所以结果 d2 看起来像这样:
> d2
gender age age_corrected gender_corrected
1 k 19 19 F
2 kobieta NA NA F
3 M 21 21 M
4 K 20 20 F
5 m-Ö++czyzna 30 30 M
6 21 32 32 M
7 m 22 22 M
8 M-Ö++czyzna 25 25 M
9 F 29 29 F
我的数据集包含以下拼写错误
unique(d$gender)
[1] "k" "kobieta" "M" "K" "m─Ö┼╝czyzna" "21" "m" "M─Ö┼╝czyzna"
> unique(d$age)
[1] 19 NA 21 20 30 32 22 25 29
实际上,性别为 21 的行和年龄为 NA 的行,其取值被调换了;此外,性别变量使用了多种不同的写法(事实上,所有以 'k' 开头的取值都对应女性 'F',以 'm' 开头的取值对应男性 'M')。我写了下面这条命令来修复性别变量:
> d$gender = ifelse(d$gender == 'K', 'F',
+ ifelse(d$gender =='kobieta', 'F', ifelse(d$gender == 'k', 'F',
+ ifelse(d$gender == "m-Ö++czyzna", 'M',ifelse(d$gender == '21', 'M',
+ ifelse(d$gender == 'm', 'M', ifelse(d$gender == 'M-Ö++czyzna', 'M',
+ ifelse(d$gender == 'M', 'M', 'M'))))))))
>
> unique(d$gender)
[1] "F" "M"
但我不知道如何对年龄变量做同样的处理,也不确定这种方法是否正确。有人有什么建议吗?
这是 dput() 结果:
dput(head(d,50))
structure(list(ID = c("P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323",
"P1323", "P1323", "P1323"), gender = c("F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F"), age = c(19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19), fixation_time = c(60,
60, 60, 60, 60, 70, 50, 50, 50, 70, 70, 60, 50, 60, 70, 70, 50,
70, 70, 60, 70, 50, 50, 50, 60, 70, 60, 50, 60, 70, 60, 70, 50,
60, 70, 50, 50, 70, 70, 70, 70, 50, 60, 50, 60, 60, 70, 50, 60,
60), block = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
t1.key = c("None", "None", "None", "space", "None", "space",
"None", "None", "None", "space", "None", "None", "space",
"None", "None", "space", "None", "None", "space", "None",
"space", "space", "space", "None", "None", "None", "space",
"space", "None", "None", "space", "None", "None", "None",
"None", "None", "None", "space", "space", "None", "None",
"None", "None", "space", "None", "None", "space", "None",
"space", "None"), T1.response = structure(c(1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
1L), .Label = c("0", "1"), class = "factor"), COND = c("NR",
"NR", "NR", "R", "NR", "R", "NR", "NR", "NR", "R", "NR",
"NR", "R", "NR", "NR", "R", "NR", "NR", "R", "NR", "R", "R",
"R", "NR", "NR", "NR", "R", "R", "NR", "NR", "R", "NR", "NR",
"NR", "NR", "NR", "NR", "R", "R", "NR", "NR", "NR", "NR",
"R", "NR", "NR", "R", "NR", "R", "NR"), T1.rt = c(NA, NA,
NA, 0.812299799988978, NA, 0.72336569998879, NA, NA, NA,
0.772733500052709, NA, NA, 0.606754800013732, NA, NA, 0.601030899968464,
NA, NA, 0.838272600027267, NA, 0.305548300035298, 0.849945599969942,
0.748269900039304, NA, NA, NA, 0.859215400007088, 0.95704890001798,
NA, NA, 0.874362500035204, NA, NA, NA, NA, NA, NA, 0.270455699996091,
0.75726039998699, NA, NA, NA, NA, 0.762694000033662, NA,
NA, 0.789715700026136, NA, 0.90579859999707, NA), CR.key = c("p",
"p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p",
"p", "o", "p", "i", "i", "h", "u", "i", "u", "o", "o", "p",
"p", "p", "o", "p", "i", "o", "p", "p", "p", "o", "o", "o",
"p", "i", "p", "p", "o", "o", "i", "i", "o", "o", "i", "i",
"u"), CR.rt = c(0.651771800010465, 0.585048799985088, 0.652350199990906,
0.69888829998672, 1.01917029998731, 0.550036200031173, 0.0361186999944039,
0.568817299965303, 0.452191599993966, 0.514980700041633,
0.619590600021184, 0.719264700019266, 0.466181399999186,
0.45217840000987, 0.668881699966732, 0.914478300022893, 1.01910460001091,
1.40315000002738, 1.69993370003067, 1.71914210001705, 1.29938790004235,
0.698139799991623, 0.848338100011461, 0.651829700043891,
0.486136299965438, 0.703567499993369, 0.76673849998042, 0.54929809999885,
0.718664799991529, 0.768383099988569, 0.898415500007104,
0.819344500021543, 0.61898209998617, 0.737225699995179, 1.03654629999073,
0.971092400024645, 1.4362695000018, 0.999490200018045, 0.932840399967972,
0.586312200000975, 0.786785800009966, 1.01987839996582, 0.93673920002766,
0.715710600023158, 0.819960499997251, 0.75370900001144, 0.818668299994897,
0.903600800025742, 1.1176545000053, 1.10352450003847), trial_num = c(0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51,
52, 53), ldots = c(48, 48, 52, 55, 51, 51, 52, 49, 45, 55,
49, 49, 51, 49, 48, 52, 45, 49, 45, 55, 51, 48, 55, 51, 45,
45, 52, 48, 48, 48, 55, 51, 49, 48, 49, 51, 51, 55, 51, 49,
45, 55, 51, 55, 55, 52, 52, 48, 49, 52), rdots = c(52, 52,
48, 45, 49, 49, 48, 51, 55, 45, 51, 51, 49, 51, 52, 48, 55,
51, 55, 45, 49, 52, 45, 49, 55, 55, 48, 52, 52, 52, 45, 49,
51, 52, 51, 49, 49, 45, 49, 51, 55, 45, 49, 45, 45, 48, 48,
52, 51, 48), TASK = c("left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left", "left", "left", "left",
"left", "left", "left", "left", "left"), T1.correct = structure(c(1L,
1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L), .Label = c("0", "1"), class = "factor"),
Go.Nogo..whether.a.person.should.respond. = c("NR", "NR",
"R", "R", "R", "R", "R", "NR", "NR", "R", "NR", "NR", "R",
"NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "R", "R",
"NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "NR",
"NR", "R", "R", "R", "R", "NR", "NR", "R", "R", "R", "R",
"R", "R", "NR", "NR", "R"), T1.ACC = structure(c(2L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 1L), .Label = c("0", "1"), class = "factor"), CR = structure(c(4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L,
2L, 2L, 5L, 1L, 2L, 1L, 3L, 3L, 4L, 4L, 4L, 3L, 4L, 2L, 3L,
4L, 4L, 4L, 3L, 3L, 3L, 4L, 2L, 4L, 4L, 3L, 3L, 2L, 2L, 3L,
3L, 2L, 2L, 1L), .Label = c("1", "2", "3", "4", "9"), class = "factor"),
difficulty = c("medium", "medium", "medium", "easy", "hard",
"hard", "medium", "hard", "easy", "easy", "hard", "hard",
"hard", "hard", "medium", "medium", "easy", "hard", "easy",
"easy", "hard", "medium", "easy", "hard", "easy", "easy",
"medium", "medium", "medium", "medium", "easy", "hard", "hard",
"medium", "hard", "hard", "hard", "easy", "hard", "hard",
"easy", "easy", "hard", "easy", "easy", "medium", "medium",
"medium", "hard", "medium")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
我不清楚年龄(age)变量具体出了什么问题。但是 ifelse 语句可以改写成下面的形式:
如果d$gender
字段没有异常:
# No anomalies expected: a first letter of "k"/"K" means female, anything else male.
d$gender2 <- ifelse(tolower(substr(d$gender, 1, 1)) == "k", "F", "M")
如果d$gender
字段有异常:
# Recode gender when the column may contain anomalies:
#   first letter "k"/"K"                     -> "F"
#   first letter "m"/"M" or the stray "21"   -> "M"
#   anything else                            -> "Other"
# The original snippet was missing the closing parenthesis of the outer
# ifelse() call and therefore did not parse.
d$gender2 <- ifelse(tolower(substr(d$gender, 1, 1)) == "k", "F",
  ifelse(tolower(substr(d$gender, 1, 1)) == "m" | d$gender == "21", "M", "Other"))
我认为这种写法更方便。你可以使用类似的变体。
至于 age 变量,我不清楚你想怎么处理。
我想向您介绍两件事:%in%
运算符和 datastep()
函数。
%in%
运算符可让您将可能的匹配项合并到一个向量中。所以你可以写 ifelse(x %in% c(y, z), a, x)
而不是 ifelse(x == y, a, ifelse(x == z, a, x))
。可以大大减少嵌套条件的数量。
其次,我编写了一个名为 libr 的程序包,其中包含一个名为 datastep()
的函数,该函数专门设计用于像您尝试做的那样清理数据。适用于条件嵌套较多,逻辑复杂的情况。它逐行遍历数据,让您检查每一行的值,并根据特定行的值创建新值。最好的是,您可以根据需要嵌套条件语句,并且仍然可以阅读它们。
这是一个同时使用 %in%
和 datastep()
函数的示例:
library(libr)
# Sample data: the gender spellings and ages listed in the question,
# plus one already-clean "F" value.
d <- data.frame(gender = c("k", "kobieta", "M", "K", "m─Ö┼╝czyzna", "21", "m", "M─Ö┼╝czyzna", "F"),
age = c(19, NA, 21, 20, 30, 32, 22, 25, 29))
# Recode with libr's datastep(). Per the accompanying description the block is
# evaluated row by row; the variables assigned inside it (gender_corrected,
# age_corrected) show up as new columns in the printed result d2.
d2 <- datastep(d, {
if (gender %in% c('K', 'k', 'kobieta', 'f')) {
# Known female spellings -> 'F'.
gender_corrected <- 'F'
} else if (gender %in% c('m', 'm─Ö┼╝czyzna', 'M─Ö┼╝czyzna')) {
# Known male spellings (including the mis-encoded ones) -> 'M'.
gender_corrected <- 'M'
} else if (gender == 21) {
# The stray value "21" is mapped to 'M', mirroring the question's own
# recoding; == coerces the number 21 to the string "21" for the comparison.
gender_corrected <- 'M'
} else {
# Anything not listed above (e.g. an already-correct 'M' or 'F') is kept as is.
gender_corrected <- gender
}
if (is.na(age)) {
# Placeholder: decide how to handle a missing age; nothing is assigned in this branch.
} else {
if (age < 10) {
# Placeholder: decide how to handle ages below 10; nothing is assigned in this branch.
} else {
# Age is present and at least 10: carry it over unchanged.
age_corrected <- age
}
}
})
所以结果 d2 看起来像这样:
> d2
gender age age_corrected gender_corrected
1 k 19 19 F
2 kobieta NA NA F
3 M 21 21 M
4 K 20 20 F
5 m-Ö++czyzna 30 30 M
6 21 32 32 M
7 m 22 22 M
8 M-Ö++czyzna 25 25 M
9 F 29 29 F