R分配答案的概率

R assigning probability of an answer

我有 100 个任务和 20 个人来完成这些任务。我给每个人随机分配了 7 个任务(以便稍后计算评分者间的一致性)。所以我有一个数据集,其中包含 personID、taskID。每个任务有 5 个可能的答案。我通过

模拟 "truth" 答案(来自 5 个可能的答案)

truth <- sample(answers, no.tasks, replace = FALSE, prob = c(0.2, 0.2, 0.2, 0.2, 0.2))

并将其添加到我的数据集中,该数据集现在具有三列:personID、taskID、truth;

我还说50%的人表现不好(poorperson),10%的任务是hard tasks(硬任务)。我假设每个人的概率:表现不佳、表现良好、任务艰巨、任务简单

ppoor <- 0.7
pgood <- 0.99
phard <- 0.2
peasy <- 0.8

现在,我需要根据她在任务中的优劣以及任务的难易程度来模拟此人的回答。对于我正在做的一个这样的组合:

for(i in 1:length(dataSet$taskID)) { ifelse(dataSet$personID[i] %in% poorperson && dataSet[dataSet$personID == i,]$taskID %in% hardtasks, probs[i] <- ppoor * phard, NA)}

但是,我没有得到我期望的结果,即我想为每个任务和工作人员组合获得 'probs'。我在这里做错了什么?

我在理解您的代码时遇到了困难,但我重写了它,希望能就您需要推进的内容展开讨论。如果您有任何问题,请告诉我!

# Initialize 
no.tasks <- 100
no.workers <- 20 
tasksperworker <- no.tasks/no.workers # 5 each ( why did you have 7?)

# ANSWERS for Tasks
answers <- c("liver", "blood", "lung", "brain", "heart")
(truth <- sample(answers, no.tasks, replace = TRUE, prob = c(0.2, 0.2, 0.2, 0.2, 0.2)))

# TASKS
prop_hardtasks <- .10
hardtasks <- sample(1:no.tasks, prop_hardtasks * no.tasks)
easytasks <- setdiff(1:no.tasks, hardtasks)
phard <- 0.2
peasy <- 0.8
(task_Difficulty <- ifelse(1:no.tasks %in% easytasks, peasy, phard))

# WORKERS
prop_poorworkers <- .50 
poorworkers <- sample(1:no.workers, prop_poorworkers * no.workers)
goodworkers <- setdiff(1:no.workers, poorworkers)
ppoor <- 0.7
pgood <- 0.99
(worker_Ability <- ifelse(1:no.workers %in% goodworkers, pgood, ppoor))

# The dataset
# One step data creation
dataSet <- data.frame("workerID" = rep(1:no.workers, each = tasksperworker), 
                  "taskID" = 1:no.tasks, 
                  "truth" = truth, 
                  "taskDifficulty" = factor(task_Difficulty, labels = c("hard","easy")), 
                  "workerAbility" = factor(rep(worker_Ability, each = tasksperworker), labels = c("poor","good")), 
                  "probCorrect" = task_Difficulty * worker_Ability)

# I am coding out the old method as I believe it samples twice which is not necessary
# (assignmentMatrix <- replicate(no.workers, sample(1:no.tasks, tasksperworker, replace=FALSE))) 
#(assignEach <- reshape::melt.matrix(assignmentMatrix))
#(dataSet <- cbind.data.frame("workerID" = assignEach[,2], 
#                             "taskID" = assignEach[,3], 
#                             "truth" = truth[assignEach[,2]], 
#                             "taskDifficulty" = factor(task_Difficulty, labels = c("hard",'easy')), 
#                             "workerAbility" = factor(rep(worker_Ability,each = tasksperworker), labels = c("poor","good")), 
#                             "probCorrect" = task_Difficulty * worker_Ability))

根据要求编辑

鉴于有 5 种可能性,并且每个任务(我们随机生成)都有一个事实,我们希望保存工人正确回答的概率,同时将剩余的概率分配给其他可能的答案。每个 task/worker,此向量可能会发生变化。 sample(..., prob) 参数必须始终等于 1。因此我继续执行以下操作:

# Initialize a matrix where each row contains the probability vector we will use to sample. 
truthProb <- matrix(NA, nrow = no.tasks, ncol = length(answers), dimnames = list(1:no.tasks, answers)) 

# run a for loop to populate it
for(i in 1:no.tasks){
  # Because Answer always changes, 
  # Find location of truth amongst answer vector using which
  # And place the probCorrect value into that spot
  truthProb[i, which(answers %in% dataSet$truth[i])] <- dataSet$probCorrect[i]
  # I would assign equal remaining probabilities to other incorrect answers.
  truthProb[i, -which(answers %in% dataSet$truth[i])] <- (1 - dataSet$probCorrect[i]) / (length(answers) - 1) 
}
rowSums(truthProb) # Should sum to 1

# Add their answers here by using apply to say, given a probability from each row (task), draw the single answer
dataSet$results <- apply(truthProb, 1, function(x) sample(answers, 1, replace = F, prob = x)) 

tail(truthProb) # see the probabilities for each task
tail(dataSet) # can compare the last bit

# how did we do? table showing results at each probCorrect (combination of difficulty and ability)
table(dataSet$truth, dataSet$results, dataSet$probCorrect) 

# double check again that this idea works as intended...
dataSet$truth[1]
truthProb[1,]
sum(1 * (dataSet$truth[1] == replicate(100, sample(answers, 1, replace = F, prob = truthProb[1,])))) /100

编辑全部的函数版本:

# Simulate a Function!!
sim <- function(answers, no.tasks, no.workers, prop_hardtasks, prop_poorworkers, prob_hardeasy, prob_poorgood){
  # Initialize 
  tasksperworker <- no.tasks/no.workers
  # ANSWERS for Tasks
  truth <- sample(answers, no.tasks, replace = TRUE) # assumes equal probability of each answer
  # TASKS
  hardtasks <- sample(1:no.tasks, prop_hardtasks * no.tasks)
  easytasks <- setdiff(1:no.tasks, hardtasks)
  phard <- prob_hardeasy[1]
  peasy <- prob_hardeasy[2]
  (task_Difficulty <- ifelse(1:no.tasks %in% easytasks, peasy, phard))
  # WORKERS
  poorworkers <- sample(1:no.workers, prop_poorworkers * no.workers)
  goodworkers <- setdiff(1:no.workers, poorworkers)
  ppoor <- prob_poorgood[1]
  pgood <- prob_poorgood[2]
  (worker_Ability <- ifelse(1:no.workers %in% goodworkers, pgood, ppoor))
  # One step data creation
  dataSet <- data.frame("workerID" = rep(1:no.workers, each = tasksperworker), 
                        "taskID" = 1:no.tasks, "truth" = truth, "taskDifficulty" = factor(task_Difficulty, labels = c("hard","easy")), 
                        "workerAbility" = factor(rep(worker_Ability, each = tasksperworker), labels = c("poor","good")), 
                        "probCorrect" = task_Difficulty * rep(worker_Ability, each = tasksperworker))
  # SIMULATE ANSWER
  truthProb <- matrix(NA, nrow = no.tasks, ncol = length(answers), dimnames = list(1:no.tasks, answers)) 
  for(i in 1:no.tasks){
    truthProb[i, which(answers %in% dataSet$truth[i])] <- dataSet$probCorrect[i]
    truthProb[i, -which(answers %in% dataSet$truth[i])] <- (1 - dataSet$probCorrect[i]) / (length(answers) - 1) 
  }
  dataSet$results <- apply(truthProb, 1, function(x) sample(answers, 1, replace = F, prob = x)) 
  # Return
  return(dataSet)
}

dat <- sim(answers = LETTERS[1:5],  #  c("liver", "blood", "lung", "brain", "heart")
           no.tasks = 100, 
           prop_hardtasks = 10/100, 
           prob_hardeasy = c(.2, .8),
           no.workers = 20, 
           prop_poorworkers = 10/20, 
           prob_poorgood = c(.77, .99))
head(dat)
table(dat$truth, dat$results, dat$probCorrect)