R:如何使用 ifelse() 函数或其他方法修复特定数据集行中的错误

R: How to fix errors in specific dataset rows by using ifelse() function or other methods

我的数据集包含以下拼写错误

unique(d$gender)
[1] "k"           "kobieta"     "M"           "K"           "m─Ö┼╝czyzna" "21"          "m"           "M─Ö┼╝czyzna"

> unique(d$age)
[1] 19 NA 21 20 30 32 22 25 29

实际上,性别为 21、年龄为 NA 的记录是这两列的值被互相调换了;此外,性别变量还使用了多种不同的写法(所有以 'k' 开头的取值都对应女性 'F',所有以 'm' 开头的取值都对应男性 'M')。我写了下面这段命令来修复性别变量:

> d$gender = ifelse(d$gender == 'K', 'F', 
+            ifelse(d$gender =='kobieta', 'F', ifelse(d$gender == 'k', 'F', 
+            ifelse(d$gender == "m-Ö++czyzna", 'M',ifelse(d$gender == '21', 'M',
+            ifelse(d$gender == 'm', 'M', ifelse(d$gender == 'M-Ö++czyzna', 'M', 
+            ifelse(d$gender == 'M', 'M', 'M'))))))))
> 
> unique(d$gender)
[1] "F" "M"

但我不知道如何对年龄变量做同样的处理,也不确定这种做法是否正确。有人有什么建议吗?

这是 dput() 结果:

dput(head(d,50))
structure(list(ID = c("P1323", "P1323", "P1323", "P1323", "P1323", 
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
"P1323", "P1323", "P1323", "P1323", "P1323", "P1323", "P1323", 
"P1323", "P1323", "P1323"), gender = c("F", "F", "F", "F", "F", 
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", 
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", 
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", 
"F", "F", "F", "F", "F", "F"), age = c(19, 19, 19, 19, 19, 19, 
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 
19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19), fixation_time = c(60, 
60, 60, 60, 60, 70, 50, 50, 50, 70, 70, 60, 50, 60, 70, 70, 50, 
70, 70, 60, 70, 50, 50, 50, 60, 70, 60, 50, 60, 70, 60, 70, 50, 
60, 70, 50, 50, 70, 70, 70, 70, 50, 60, 50, 60, 60, 70, 50, 60, 
60), block = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), 
    t1.key = c("None", "None", "None", "space", "None", "space", 
    "None", "None", "None", "space", "None", "None", "space", 
    "None", "None", "space", "None", "None", "space", "None", 
    "space", "space", "space", "None", "None", "None", "space", 
    "space", "None", "None", "space", "None", "None", "None", 
    "None", "None", "None", "space", "space", "None", "None", 
    "None", "None", "space", "None", "None", "space", "None", 
    "space", "None"), T1.response = structure(c(1L, 1L, 1L, 2L, 
    1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 
    1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 
    1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 
    1L), .Label = c("0", "1"), class = "factor"), COND = c("NR", 
    "NR", "NR", "R", "NR", "R", "NR", "NR", "NR", "R", "NR", 
    "NR", "R", "NR", "NR", "R", "NR", "NR", "R", "NR", "R", "R", 
    "R", "NR", "NR", "NR", "R", "R", "NR", "NR", "R", "NR", "NR", 
    "NR", "NR", "NR", "NR", "R", "R", "NR", "NR", "NR", "NR", 
    "R", "NR", "NR", "R", "NR", "R", "NR"), T1.rt = c(NA, NA, 
    NA, 0.812299799988978, NA, 0.72336569998879, NA, NA, NA, 
    0.772733500052709, NA, NA, 0.606754800013732, NA, NA, 0.601030899968464, 
    NA, NA, 0.838272600027267, NA, 0.305548300035298, 0.849945599969942, 
    0.748269900039304, NA, NA, NA, 0.859215400007088, 0.95704890001798, 
    NA, NA, 0.874362500035204, NA, NA, NA, NA, NA, NA, 0.270455699996091, 
    0.75726039998699, NA, NA, NA, NA, 0.762694000033662, NA, 
    NA, 0.789715700026136, NA, 0.90579859999707, NA), CR.key = c("p", 
    "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", 
    "p", "o", "p", "i", "i", "h", "u", "i", "u", "o", "o", "p", 
    "p", "p", "o", "p", "i", "o", "p", "p", "p", "o", "o", "o", 
    "p", "i", "p", "p", "o", "o", "i", "i", "o", "o", "i", "i", 
    "u"), CR.rt = c(0.651771800010465, 0.585048799985088, 0.652350199990906, 
    0.69888829998672, 1.01917029998731, 0.550036200031173, 0.0361186999944039, 
    0.568817299965303, 0.452191599993966, 0.514980700041633, 
    0.619590600021184, 0.719264700019266, 0.466181399999186, 
    0.45217840000987, 0.668881699966732, 0.914478300022893, 1.01910460001091, 
    1.40315000002738, 1.69993370003067, 1.71914210001705, 1.29938790004235, 
    0.698139799991623, 0.848338100011461, 0.651829700043891, 
    0.486136299965438, 0.703567499993369, 0.76673849998042, 0.54929809999885, 
    0.718664799991529, 0.768383099988569, 0.898415500007104, 
    0.819344500021543, 0.61898209998617, 0.737225699995179, 1.03654629999073, 
    0.971092400024645, 1.4362695000018, 0.999490200018045, 0.932840399967972, 
    0.586312200000975, 0.786785800009966, 1.01987839996582, 0.93673920002766, 
    0.715710600023158, 0.819960499997251, 0.75370900001144, 0.818668299994897, 
    0.903600800025742, 1.1176545000053, 1.10352450003847), trial_num = c(0, 
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 
    20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 
    36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 
    52, 53), ldots = c(48, 48, 52, 55, 51, 51, 52, 49, 45, 55, 
    49, 49, 51, 49, 48, 52, 45, 49, 45, 55, 51, 48, 55, 51, 45, 
    45, 52, 48, 48, 48, 55, 51, 49, 48, 49, 51, 51, 55, 51, 49, 
    45, 55, 51, 55, 55, 52, 52, 48, 49, 52), rdots = c(52, 52, 
    48, 45, 49, 49, 48, 51, 55, 45, 51, 51, 49, 51, 52, 48, 55, 
    51, 55, 45, 49, 52, 45, 49, 55, 55, 48, 52, 52, 52, 45, 49, 
    51, 52, 51, 49, 49, 45, 49, 51, 55, 45, 49, 45, 45, 48, 48, 
    52, 51, 48), TASK = c("left", "left", "left", "left", "left", 
    "left", "left", "left", "left", "left", "left", "left", "left", 
    "left", "left", "left", "left", "left", "left", "left", "left", 
    "left", "left", "left", "left", "left", "left", "left", "left", 
    "left", "left", "left", "left", "left", "left", "left", "left", 
    "left", "left", "left", "left", "left", "left", "left", "left", 
    "left", "left", "left", "left", "left"), T1.correct = structure(c(1L, 
    1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 
    1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 
    2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 
    2L, 1L, 1L, 2L), .Label = c("0", "1"), class = "factor"), 
    Go.Nogo..whether.a.person.should.respond. = c("NR", "NR", 
    "R", "R", "R", "R", "R", "NR", "NR", "R", "NR", "NR", "R", 
    "NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "R", "R", 
    "NR", "NR", "R", "NR", "NR", "NR", "R", "R", "NR", "NR", 
    "NR", "R", "R", "R", "R", "NR", "NR", "R", "R", "R", "R", 
    "R", "R", "NR", "NR", "R"), T1.ACC = structure(c(2L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 
    2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 
    2L, 1L, 1L), .Label = c("0", "1"), class = "factor"), CR = structure(c(4L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 
    2L, 2L, 5L, 1L, 2L, 1L, 3L, 3L, 4L, 4L, 4L, 3L, 4L, 2L, 3L, 
    4L, 4L, 4L, 3L, 3L, 3L, 4L, 2L, 4L, 4L, 3L, 3L, 2L, 2L, 3L, 
    3L, 2L, 2L, 1L), .Label = c("1", "2", "3", "4", "9"), class = "factor"), 
    difficulty = c("medium", "medium", "medium", "easy", "hard", 
    "hard", "medium", "hard", "easy", "easy", "hard", "hard", 
    "hard", "hard", "medium", "medium", "easy", "hard", "easy", 
    "easy", "hard", "medium", "easy", "hard", "easy", "easy", 
    "medium", "medium", "medium", "medium", "easy", "hard", "hard", 
    "medium", "hard", "hard", "hard", "easy", "hard", "hard", 
    "easy", "easy", "hard", "easy", "easy", "medium", "medium", 
    "medium", "hard", "medium")), row.names = c(NA, -50L), class = c("tbl_df", 
"tbl", "data.frame"))

我不清楚年龄(age)变量具体存在什么问题。不过 ifelse 语句可以按下面的方式重写:

如果d$gender字段没有异常:

# Recode by first letter, ignoring case: "k" becomes "F", any other first
# letter becomes "M".
d$gender2 <- ifelse(
  tolower(substr(d$gender, 1, 1)) == "k",
  "F",
  "M"
)

如果d$gender字段有异常:

# Recode by first letter, ignoring case: "k" becomes "F"; "m" or the stray
# value "21" becomes "M"; anything else is labelled "Other" so unexpected
# values stay visible. The outer ifelse() call is now closed (the original
# snippet was missing its final parenthesis and did not parse).
d$gender2 <- ifelse(
  tolower(substr(d$gender, 1, 1)) == "k", "F",
  ifelse(tolower(substr(d$gender, 1, 1)) == "m" | d$gender == "21",
         "M", "Other")
)

我认为这种写法更简便。你可以根据需要使用类似的变体。

至于 age 变量,我不清楚你具体想怎么处理。

我想向您介绍两件事:%in% 运算符和 datastep() 函数。

%in% 运算符可让您把多个可能的匹配值合并到一个向量中。所以你可以写 ifelse(x %in% c(y, z), a, x),而不是 ifelse(x == y, a, ifelse(x == z, a, x))。这样可以大大减少嵌套条件的数量。

其次,我编写了一个名为 libr 的程序包,其中包含一个名为 datastep() 的函数,该函数专门设计用于像您尝试做的那样清理数据。适用于条件嵌套较多,逻辑复杂的情况。它逐行遍历数据,让您检查每一行的值,并根据特定行的值创建新值。最好的是,您可以根据需要嵌套条件语句,并且仍然可以阅读它们。

这是一个同时使用 %in% 运算符和 datastep() 函数的示例:


library(libr)

# Sample data: the gender spellings reported in the question (including the
# stray "21") plus an already-clean "F", paired with the reported ages.
d <- data.frame(
  gender = c("k", "kobieta", "M", "K", "m─Ö┼╝czyzna", "21", "m",
             "M─Ö┼╝czyzna", "F"),
  age = c(19, NA, 21, 20, 30, 32, 22, 25, 29)
)

# Define datastep: the code in braces is applied to the data row by row, and
# variables assigned inside it become new columns of the result.
d2 <- datastep(d, {

    if (gender %in% c("K", "k", "kobieta", "F", "f")) {
      # Every female spelling, including the already-clean code "F".
      gender_corrected <- "F"
    } else if (gender %in% c("M", "m", "m─Ö┼╝czyzna", "M─Ö┼╝czyzna")) {
      # Every male spelling, including the already-clean code "M".
      gender_corrected <- "M"
    } else if (gender == "21") {
      # gender is a character column, so compare against the string "21"
      # rather than relying on implicit number-to-string coercion.
      gender_corrected <- "M"
    } else {
      # Unknown value: keep it unchanged so it can be inspected later.
      gender_corrected <- gender
    }

    if (is.na(age)) {
      # Do something
    } else {
      if (age < 10) {
        # Do something else
      } else {
        age_corrected <- age
      }
    }

  })

所以结果 d2 看起来像这样:

> d2
       gender age age_corrected gender_corrected
1           k  19            19                F
2     kobieta  NA            NA                F
3           M  21            21                M
4           K  20            20                F
5 m-Ö++czyzna  30            30                M
6          21  32            32                M
7           m  22            22                M
8 M-Ö++czyzna  25            25                M
9           F  29            29                F