R 中的决策树预测使用 id 来影响预测
decision tree prediction in R uses the id to influence the prediction
我正在使用 C50 库来尝试预测明年毕业的出勤率,但是我的树将 ID 显示为预测的一部分!当我把它拿出来时,我的树只变成一个节点(级别).. 任何建议将不胜感激
部分数据集(JSON):
{"id":"50","name":"James Charlie","faculty":"Science","degree":"Bachelor degree","course":"Sport Science","attend":"No","year":"2016"},
完整 dataset/Student 对象:git repo
R 脚本:
# Exploratory script: read the `students` table from MySQL, convert the
# categorical columns to factors, split the rows into train/test sets, fit a
# C5.0 decision tree for `attend`, and evaluate it on the test set.
# NOTE(review): MySQL(), C5.0() and CrossTable() are not loaded in this
# snippet -- presumably library(RMySQL), library(C50) and library(gmodels)
# are attached earlier; confirm.
con <- dbConnect(
  MySQL(),
  user = "root", password = "", dbname = "students", host = "localhost"
)
dbListTables(con)
Student <- dbReadTable(con, "students")
# The connection is not used again below, so release it right away.
dbDisconnect(con)
rows <- nrow(Student)

# C5.0 requires a factor outcome; the categorical predictors become factors too.
Student$attend <- as.factor(Student$attend)
Student$year <- as.factor(Student$year)
Student$faculty <- as.factor(Student$faculty)
Student$course <- as.factor(Student$course)

Student
dim(Student)
summary(Student)
str(Student)

# Drop the second column (presumably `name`, going by the field order of the
# JSON sample -- verify against the table). `id` stays in the data.
Student <- Student[-2]
dim(Student)
str(Student)

# Shuffle every row. Using `rows` instead of a hard-coded count keeps rows
# from being silently dropped when the table size changes.
n_train <- 400
stopifnot(rows > n_train)
set.seed(1234)
Student_rand <- Student[order(runif(rows)), ] # randomize the data
Student_train <- Student_rand[seq_len(n_train), ] # training rows
Student_test <- Student_rand[(n_train + 1):rows, ] # held-out validation rows

summary(Student_train)
prop.table(table(Student_train$attend)) # class proportions in the train set
prop.table(table(Student_test$attend)) # class proportions in the test set

# Column 5 is excluded from the predictors and `attend` is passed as the
# outcome (assumes `attend` is column 5 after the drop above -- confirm).
Student_model <- C5.0(Student_train[, -5], Student_train$attend)
summary(Student_model)
Student_model
summary(Student_model)

# Save the tree plot to a JPEG file.
jpeg("tree.jpg")
plot(Student_model)
dev.off()

Student_model$predictors
Student_model$trials
Student_model$tree
summary(Student_model)

# Predict the class for the held-out rows and compare with the actual values.
Student_pred <- predict(Student_model, Student_test, type = "class")
table(Student_test$attend, Student_pred)
CrossTable(
  Student_pred, Student_test$attend,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("predicted default", "actual default")
)
最后是树:
**我尝试的第一件事是删除 ID,但出现以下错误:**
Error in partysplit(varid = as.integer(i), index = index, info = k, prob = NULL) :
  minimum of ‘index’ is not equal to 1
In addition: Warning message:
In min(index, na.rm = TRUE) : no non-missing arguments to min; returning Inf
**然后我尝试添加了一个随机列,结果预测又把该随机列用作了判断依据。**
复制该结构并将其赋值给 Student。保留了 name 和 id 对应关系的原始数据框显示,name 变量有很多重复:
str(Student[2])
#'data.frame': 724 obs. of 1 variable:
# $ name: chr "Jill Austin" "David Beckham" "Chris Evans" "Sheldon Cooper" ...
length(table(Student[2]))
#[1] 201
然后我查看了 id 小于 165 的那些行的重复程度,发现其中重复的比例很低:
length(table(Student[1:164, 2]))
[1] 163
因此定义一个标记重复的变量:
# Flag every row whose `name` occurs more than once in the data frame.
# duplicated() alone misses the first occurrence of each repeated name, so it
# is combined with a scan from the end. This replaces ave() on a character
# column, which returns the counts as strings and made `> 1` a string
# comparison.
Student$IsRepeated <- duplicated(Student$name) |
  duplicated(Student$name, fromLast = TRUE)
然后发现 "name.repeatingness" 在考虑了其他预测因素后与出勤率相关。
> with( Student, table( attend, IsRepeated ) )
IsRepeated
attend FALSE TRUE
No 50 259
Yes 59 356 # so nothing dramatic here, but try other predictors as well
首先我查看了输出:
with( Student, table(attend, year, IsRepeated , faculty) )
有点长,所以我注意到科学组和工程组有些不同:
with( Student, table(attend, year, IsRepeated , fac_EorS=faculty %in% c("Engineering", "Science") ) )
, , IsRepeated = FALSE, fac_EorS = FALSE
year
attend 2015 2016 2017 2018
No 0 0 0 10
Yes 0 0 0 16
, , IsRepeated = TRUE, fac_EorS = FALSE
year
attend 2015 2016 2017 2018
No 9 9 9 131
Yes 37 17 17 113
, , IsRepeated = FALSE, fac_EorS = TRUE
year
attend 2015 2016 2017 2018
No 0 0 1 39
Yes 1 0 0 42
, , IsRepeated = TRUE, fac_EorS = TRUE
year
attend 2015 2016 2017 2018
No 34 34 33 0 # also shows how the `date` became the 2nd split
Yes 45 32 32 63
我正在使用 C50 库来尝试预测明年毕业的出勤率,但是我的树将 ID 显示为预测的一部分!当我把它拿出来时,我的树只变成一个节点(级别).. 任何建议将不胜感激
部分数据集(JSON):
{"id":"50","name":"James Charlie","faculty":"Science","degree":"Bachelor degree","course":"Sport Science","attend":"No","year":"2016"},
完整 dataset/Student 对象:git repo
R 脚本:
# Exploratory script: read the `students` table from MySQL, convert the
# categorical columns to factors, split the rows into train/test sets, fit a
# C5.0 decision tree for `attend`, and evaluate it on the test set.
# NOTE(review): MySQL(), C5.0() and CrossTable() are not loaded in this
# snippet -- presumably library(RMySQL), library(C50) and library(gmodels)
# are attached earlier; confirm.
con <- dbConnect(
  MySQL(),
  user = "root", password = "", dbname = "students", host = "localhost"
)
dbListTables(con)
Student <- dbReadTable(con, "students")
# The connection is not used again below, so release it right away.
dbDisconnect(con)
rows <- nrow(Student)

# C5.0 requires a factor outcome; the categorical predictors become factors too.
Student$attend <- as.factor(Student$attend)
Student$year <- as.factor(Student$year)
Student$faculty <- as.factor(Student$faculty)
Student$course <- as.factor(Student$course)

Student
dim(Student)
summary(Student)
str(Student)

# Drop the second column (presumably `name`, going by the field order of the
# JSON sample -- verify against the table). `id` stays in the data.
Student <- Student[-2]
dim(Student)
str(Student)

# Shuffle every row. Using `rows` instead of a hard-coded count keeps rows
# from being silently dropped when the table size changes.
n_train <- 400
stopifnot(rows > n_train)
set.seed(1234)
Student_rand <- Student[order(runif(rows)), ] # randomize the data
Student_train <- Student_rand[seq_len(n_train), ] # training rows
Student_test <- Student_rand[(n_train + 1):rows, ] # held-out validation rows

summary(Student_train)
prop.table(table(Student_train$attend)) # class proportions in the train set
prop.table(table(Student_test$attend)) # class proportions in the test set

# Column 5 is excluded from the predictors and `attend` is passed as the
# outcome (assumes `attend` is column 5 after the drop above -- confirm).
Student_model <- C5.0(Student_train[, -5], Student_train$attend)
summary(Student_model)
Student_model
summary(Student_model)

# Save the tree plot to a JPEG file.
jpeg("tree.jpg")
plot(Student_model)
dev.off()

Student_model$predictors
Student_model$trials
Student_model$tree
summary(Student_model)

# Predict the class for the held-out rows and compare with the actual values.
Student_pred <- predict(Student_model, Student_test, type = "class")
table(Student_test$attend, Student_pred)
CrossTable(
  Student_pred, Student_test$attend,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("predicted default", "actual default")
)
最后是树:
**我尝试的第一件事是删除 ID,但出现以下错误:**
Error in partysplit(varid = as.integer(i), index = index, info = k, prob = NULL) :
  minimum of ‘index’ is not equal to 1
In addition: Warning message:
In min(index, na.rm = TRUE) : no non-missing arguments to min; returning Inf
**然后我尝试添加了一个随机列,结果预测又把该随机列用作了判断依据。**
复制该结构并将其赋值给 Student。保留了 name 和 id 对应关系的原始数据框显示,name 变量有很多重复:
str(Student[2])
#'data.frame': 724 obs. of 1 variable:
# $ name: chr "Jill Austin" "David Beckham" "Chris Evans" "Sheldon Cooper" ...
length(table(Student[2]))
#[1] 201
然后我查看了 id 小于 165 的那些行的重复程度,发现其中重复的比例很低:
length(table(Student[1:164, 2]))
[1] 163
因此定义一个标记重复的变量:
# Flag every row whose `name` occurs more than once in the data frame.
# duplicated() alone misses the first occurrence of each repeated name, so it
# is combined with a scan from the end. This replaces ave() on a character
# column, which returns the counts as strings and made `> 1` a string
# comparison.
Student$IsRepeated <- duplicated(Student$name) |
  duplicated(Student$name, fromLast = TRUE)
然后发现 "name.repeatingness" 在考虑了其他预测因素后与出勤率相关。
> with( Student, table( attend, IsRepeated ) )
IsRepeated
attend FALSE TRUE
No 50 259
Yes 59 356 # so nothing dramatic here, but try other predictors as well
首先我查看了输出:
with( Student, table(attend, year, IsRepeated , faculty) )
有点长,所以我注意到科学组和工程组有些不同:
with( Student, table(attend, year, IsRepeated , fac_EorS=faculty %in% c("Engineering", "Science") ) )
, , IsRepeated = FALSE, fac_EorS = FALSE
year
attend 2015 2016 2017 2018
No 0 0 0 10
Yes 0 0 0 16
, , IsRepeated = TRUE, fac_EorS = FALSE
year
attend 2015 2016 2017 2018
No 9 9 9 131
Yes 37 17 17 113
, , IsRepeated = FALSE, fac_EorS = TRUE
year
attend 2015 2016 2017 2018
No 0 0 1 39
Yes 1 0 0 42
, , IsRepeated = TRUE, fac_EorS = TRUE
year
attend 2015 2016 2017 2018
No 34 34 33 0 # also shows how the `date` became the 2nd split
Yes 45 32 32 63