协同过滤 - R
Collaborative Filtering - R
我需要通过使用所有其他评论家排名的加权平均值来获得为 Victoria 推荐的前 5 部电影。
我只能在 Excel 中完成,但我需要翻译成 R。
获取值的公式是:
维多利亚的平均值 + Σ[(其他评论家评分 - 该评论家的平均评分) × 皮尔逊相关系数] / (Σ|皮尔逊相关系数| - 1),其中减去的 1 是维多利亚与她自己的相关系数
为了更好地说明
对于电影 The.Matrix 这是应该的结果
约翰投票 = 4
约翰平均数 = 3,0714
约翰·皮尔逊 cor = 0,27709796
公式 = (4-3,07) * 0,27709796 / 6,9280(abs 加权值之和)
(我如何在 R 中创建这个公式???)如果我能解决这个问题,也许我可以做剩下的事情
为每个人完成此操作后,您添加 3,1538(维多利亚的平均值),这应该会产生
3,791701302
我需要为所有 Victorias 没有看过的电影做这件事,这些应该是结果
3.7917013044215, 'The Matrix'
3.50776533175371, 'Forrest Gump'
3.33118834864677, 'The Sixth Sense'
3.11491825315719, 'Shakespeare in Love'
2.9124513228665, 'Blade Runner'
这是 Excel 上的黑客帝国版本:
到目前为止,这是我的代码:
# Film ratings occupy columns 2-21 of `cr`; column 1 holds the user name.
film_cols <- 2:21
target_user <- "Victoria"

# Transpose so that every column holds one user's ratings: cor() correlates
# columns, so this layout yields user-to-user correlations.
cr2 <- t(cr[, film_cols])
colnames(cr2) <- as.character(cr[[1]])
cr2 <- as.data.frame(cr2)

# Mean rating per user over ALL film columns. (Using 2:20 here would silently
# drop the last film, "Babe", and shift the means of everyone who rated it.)
cr$Mean <- rowMeans(cr[, film_cols], na.rm = TRUE)

# Pearson correlation of every user with the target user, computed once.
# as.vector() drops the 20 x 1 matrix shape returned by cor() so the new
# columns are plain numeric vectors rather than matrix columns.
pearson <- as.vector(
  cor(cr2, cr2[[target_user]], use = "pairwise.complete.obs")
)
cr$Pearson <- pearson
cr$PearsonABS <- abs(pearson)

# Normalising constant: total absolute weight minus the target user's own
# correlation with herself (which is 1).
x <- sum(cr$PearsonABS) - 1
我用一种非常糟糕的方式(手动)
# Row of the user we are recommending for, and her own mean rating.
victoria_row <- which(cr$User == "Victoria")
g <- cr[["Mean"]][victoria_row]

# Film columns (2-21) that Victoria has not rated yet, as positions in `cr`.
unseen_cols <- which(is.na(unlist(cr[victoria_row, 2:21]))) + 1L

# Predicted rating for one film column:
#   g + sum((rating - rater's mean) * pearson / x)
# Users who did not rate the film contribute NA and are dropped by na.rm.
predict_rating <- function(film_col) {
  weighted_dev <- (cr[[film_col]] - cr[["Mean"]]) * as.vector(cr[["Pearson"]])
  sum(weighted_dev / x, na.rm = TRUE) + g
}

# One score per unseen film, labelled with the film name so the ranking
# below shows which film each score belongs to.
my_list <- vapply(unseen_cols, predict_rating, numeric(1))
names(my_list) <- names(cr)[unseen_cols]
print(my_list)

head(sort(my_list, decreasing = TRUE), 5)
这是 dput()
dput(cr)
structure(list(User = structure(c(8L, 10L, 2L, 17L, 11L, 1L,
18L, 9L, 7L, 5L, 3L, 14L, 13L, 4L, 20L, 6L, 16L, 12L, 15L, 19L
), .Label = c("Ana", "Anton", "Bernard", "Carles", "Chris", "Ivan",
"Jim", "John", "Marc", "Maria", "Martina", "Nadia", "Nerea",
"Nuria", "Oriol", "Rachel", "Roger", "Sergi", "Valery", "Victoria"
), class = "factor"), Star.Wars.IV...A.New.Hope = c(1L, 5L, NA,
NA, 4L, 2L, NA, 4L, 5L, 4L, 2L, 3L, 2L, 3L, 4L, NA, NA, 4L, 5L,
1L), Star.Wars.VI...Return.of.the.Jedi = c(5L, 3L, NA, 3L, 3L,
4L, NA, NA, 1L, 2L, 1L, 5L, 3L, NA, 4L, NA, NA, 5L, 1L, 2L),
Forrest.Gump = c(2L, NA, NA, NA, 4L, 4L, 3L, NA, NA, NA,
5L, 2L, NA, 3L, NA, 1L, NA, 1L, NA, 2L), The.Shawshank.Redemption = c(NA,
2L, 5L, NA, 1L, 4L, 1L, NA, 4L, 5L, NA, NA, 5L, NA, NA, NA,
NA, 5L, NA, 4L), The.Silence.of.the.Lambs = c(4L, 4L, 2L,
NA, 4L, NA, 1L, 3L, 2L, 3L, NA, 2L, 4L, 2L, 5L, 3L, 4L, 1L,
NA, 5L), Gladiator = c(4L, 2L, NA, 1L, 1L, NA, 4L, 2L, 4L,
NA, 5L, NA, NA, NA, 5L, 2L, NA, 1L, 4L, NA), Toy.Story = c(2L,
1L, 4L, 2L, NA, 3L, NA, 2L, 4L, 4L, 5L, 2L, 4L, 3L, 2L, NA,
2L, 4L, 2L, 2L), Saving.Private.Ryan = c(2L, NA, NA, 3L,
4L, 1L, 5L, NA, 4L, 3L, NA, NA, 5L, NA, NA, 2L, NA, NA, 1L,
3L), Pulp.Fiction = c(NA, NA, NA, 4L, NA, 4L, 2L, 3L, NA,
4L, NA, 1L, NA, NA, 3L, NA, 2L, 5L, 3L, 2L), Stand.by.Me = c(3L,
4L, 1L, NA, 1L, 4L, NA, NA, 1L, NA, NA, NA, NA, 4L, 5L, 1L,
NA, NA, 3L, 2L), Shakespeare.in.Love = c(2L, 3L, NA, NA,
5L, 5L, 1L, NA, 2L, NA, NA, 3L, NA, NA, NA, 5L, 2L, NA, 3L,
1L), Total.Recall = c(NA, 2L, 1L, 4L, 1L, 2L, NA, 2L, 3L,
NA, 3L, NA, 2L, 1L, 1L, NA, NA, NA, 1L, NA), Independence.Day = c(5L,
2L, 4L, 1L, NA, 4L, NA, 3L, 1L, 2L, 2L, 3L, 4L, 2L, 3L, NA,
NA, NA, NA, NA), Blade.Runner = c(2L, NA, 4L, 3L, 4L, NA,
3L, 2L, NA, NA, NA, NA, NA, 2L, NA, NA, NA, 4L, NA, 5L),
Groundhog.Day = c(NA, 2L, 1L, 5L, NA, 1L, NA, 4L, 5L, NA,
NA, 2L, 3L, 3L, 2L, 5L, NA, NA, NA, 5L), The.Matrix = c(4L,
NA, 1L, NA, 3L, NA, 1L, NA, NA, 2L, 1L, 5L, NA, 5L, NA, 2L,
4L, NA, 2L, 4L), Schindler.s.List = c(2L, 5L, 2L, 5L, 5L,
NA, NA, 1L, NA, 5L, NA, NA, NA, 1L, 3L, 2L, NA, 2L, NA, 3L
), The.Sixth.Sense = c(5L, 1L, 3L, 1L, 5L, 3L, NA, 3L, NA,
1L, 2L, NA, NA, NA, NA, 4L, NA, 1L, NA, 5L), Raiders.of.the.Lost.Ark = c(NA,
3L, 1L, 1L, NA, NA, 5L, 5L, NA, NA, 1L, NA, 5L, NA, 3L, 3L,
NA, 2L, NA, 3L), Babe = c(NA, NA, 3L, 2L, NA, 2L, 2L, NA,
5L, NA, 4L, 2L, NA, NA, 1L, 4L, NA, 5L, NA, NA), Mean = c(3.07142857142857,
2.78571428571429, 2.46153846153846, 2.69230769230769, 3.21428571428571,
3.07142857142857, 2.54545454545455, 2.83333333333333, 3.15384615384615,
3.18181818181818, 2.81818181818182, 2.72727272727273, 3.7,
2.63636363636364, 3.15384615384615, 2.83333333333333, 2.8,
3.07692307692308, 2.5, 3.0625), Pearson = structure(c(0.277097961607667,
0.492592183071889, -0.184664098655286, -0.306988756155365,
0.047716527859489, 0.597614304667197, 0.0363696483726654,
0.0793422835603058, -0.444514447822542, -0.344265186329548,
-0.186499664263607, 0.365148371670111, 0.205737799949456,
0.427324672683063, 1, -0.732776720760177, 0.944911182523068,
-0.587378478571482, 0.578979445733232, -0.0881134221062802
), .Dim = c(20L, 1L), .Dimnames = list(c("John", "Maria",
"Anton", "Roger", "Martina", "Ana", "Sergi", "Marc", "Jim",
"Chris", "Bernard", "Nuria", "Nerea", "Carles", "Victoria",
"Ivan", "Rachel", "Nadia", "Oriol", "Valery"), "Victoria")),
PearsonABS = structure(c(0.277097961607667, 0.492592183071889,
0.184664098655286, 0.306988756155365, 0.047716527859489,
0.597614304667197, 0.0363696483726654, 0.0793422835603058,
0.444514447822542, 0.344265186329548, 0.186499664263607,
0.365148371670111, 0.205737799949456, 0.427324672683063,
1, 0.732776720760177, 0.944911182523068, 0.587378478571482,
0.578979445733232, 0.0881134221062802), .Dim = c(20L, 1L), .Dimnames = list(
c("John", "Maria", "Anton", "Roger", "Martina", "Ana",
"Sergi", "Marc", "Jim", "Chris", "Bernard", "Nuria",
"Nerea", "Carles", "Victoria", "Ivan", "Rachel", "Nadia",
"Oriol", "Valery"), "Victoria"))), .Names = c("User",
"Star.Wars.IV...A.New.Hope", "Star.Wars.VI...Return.of.the.Jedi",
"Forrest.Gump", "The.Shawshank.Redemption", "The.Silence.of.the.Lambs",
"Gladiator", "Toy.Story", "Saving.Private.Ryan", "Pulp.Fiction",
"Stand.by.Me", "Shakespeare.in.Love", "Total.Recall", "Independence.Day",
"Blade.Runner", "Groundhog.Day", "The.Matrix", "Schindler.s.List",
"The.Sixth.Sense", "Raiders.of.the.Lost.Ark", "Babe", "Mean",
"Pearson", "PearsonABS"), row.names = c(NA, -20L), class = "data.frame")
我希望我解释清楚了。
有人可以帮助我吗?
好的,我希望你能跟上这个,我会尽量给出足够的解释。目标是编写一个函数,把您的手动计算包装起来,使其易于多次运行。
正在整理
首先我们要使数据整齐,这样更容易使用。这意味着让每一列成为一个变量,这样就不会在多个列中有电影评级。参见 more about tidy data here。我们将使用 tidyverse
包来完成此操作。
- 用
select
删除均值和皮尔逊列。我们希望能够根据我们正在查看的用户以不同方式计算它们,并且它们也以列表的形式奇怪地存储在您的 dput
中。
- 使用
gather
获取所有电影列并将电影名称放入一列,将评级放入另一列。
- 将
User
重命名为小写以保持一致性
- 使用
group_by
和 mutate
对每个用户进行平均评分。 na.rm = TRUE
表示mean
计算时会忽略所有NA
值。
看起来像这样。看到数据现在很整洁,只有四列。
library(tidyverse)
library(magrittr)
# Reshape the wide ratings table into long ("tidy") form: one row per
# user/film pair, plus each user's mean rating.
tidy_cr <- cr %>%
  # Drop the precomputed summary columns; they are recalculated per user.
  select(-c(Mean, Pearson, PearsonABS)) %>%
  rename(user = User) %>%
  # Stack every film column into a film-name / rating pair.
  gather(key = "film", value = "rating", -user) %>%
  group_by(user) %>%
  # NA ratings (films not seen) are ignored when averaging.
  mutate(mean = mean(rating, na.rm = TRUE)) %>%
  ungroup()
# A tibble: 400 x 4
user film rating mean
<fct> <chr> <int> <dbl>
1 John Star.Wars.IV...A.New.Hope 1 3.07
2 Maria Star.Wars.IV...A.New.Hope 5 2.79
3 Anton Star.Wars.IV...A.New.Hope NA 2.46
4 Roger Star.Wars.IV...A.New.Hope NA 2.69
5 Martina Star.Wars.IV...A.New.Hope 4 3.21
6 Ana Star.Wars.IV...A.New.Hope 2 3.07
7 Sergi Star.Wars.IV...A.New.Hope NA 2.55
8 Marc Star.Wars.IV...A.New.Hope 4 2.83
9 Jim Star.Wars.IV...A.New.Hope 5 3.15
10 Chris Star.Wars.IV...A.New.Hope 4 3.18
# ... with 390 more rows
维多利亚例子
这是一步一步向您展示每个阶段的输出,以维多利亚为例。首先,我们想找出维多利亚没看过哪些电影。我们通过 filter
向下搜索 user
列中有 Victoria 和 NA
评级的行,然后拉出 film
列。
# Names of the films for which Victoria has no rating.
v_films_not_seen <- tidy_cr %>%
  filter(user == "Victoria", is.na(rating)) %>%
  pull(film)
[1] "Forrest.Gump" "The.Shawshank.Redemption" "Saving.Private.Ryan"
[4] "Shakespeare.in.Love" "Blade.Runner" "The.Matrix"
[7] "The.Sixth.Sense"
然后,我们要提取 Victoria 的评分,以便计算相关系数和她的具体均值。我们再次减少到 Victoria 的行,现在拉出 rating
列。
# Victoria's ratings for every film, in table order (NA where not rated).
v_persons_ratings <- tidy_cr %>%
  filter(user == "Victoria") %>%
  pull(rating)
[1] 4 4 NA NA 5 5 2 NA 3 5 NA 1 3 NA 2 NA 3 NA 3 1
现在我们要计算相关性。我们首先 group_by
用户,因此此计算将针对每个用户进行。然后我们使用 mutate
和 cor
来计算每个用户的评分与 Victoria 的评分之间的相关性,我们在最后一步中将其保存为 v_persons_ratings
。选项 use = "complete.obs"
表示相关性仅查看两个用户对电影评分的观察结果。然后我们用abs
得到相关系数的绝对值
# Attach to every row the Pearson correlation between that row's user and
# Victoria. Within each user group the ratings are paired with Victoria's
# by position; films missing a rating on either side are left out.
v_correlations <- tidy_cr %>%
  group_by(user) %>%
  mutate(pearson = cor(rating, v_persons_ratings, use = "complete.obs")) %>%
  mutate(pearson_abs = abs(pearson)) %>%
  ungroup()
# A tibble: 400 x 6
user film rating mean pearson pearson_abs
<fct> <chr> <int> <dbl> <dbl> <dbl>
1 John Star.Wars.IV...A.New.Hope 1 3.07 0.277 0.277
2 Maria Star.Wars.IV...A.New.Hope 5 2.79 0.493 0.493
3 Anton Star.Wars.IV...A.New.Hope NA 2.46 -0.185 0.185
4 Roger Star.Wars.IV...A.New.Hope NA 2.69 -0.307 0.307
5 Martina Star.Wars.IV...A.New.Hope 4 3.21 0.0477 0.0477
6 Ana Star.Wars.IV...A.New.Hope 2 3.07 0.598 0.598
7 Sergi Star.Wars.IV...A.New.Hope NA 2.55 0.0364 0.0364
8 Marc Star.Wars.IV...A.New.Hope 4 2.83 0.0793 0.0793
9 Jim Star.Wars.IV...A.New.Hope 5 3.15 -0.445 0.445
10 Chris Star.Wars.IV...A.New.Hope 4 3.18 -0.344 0.344
# ... with 390 more rows
最后,我们使用 filter
和 %in%
运算符减少到包含 Victoria 未曾看过的电影的行。这次我们 group_by
电影,所以计算是按电影进行的。然后,我们使用 summarise
使用您在问题中列出的公式计算推荐分数,并添加 Victoria 的平均值。最后,我们使用 arrange
和 desc
按分数降序排序。
# Score each film Victoria has not seen and rank the films best-first.
v_recommendations <- v_correlations %>%
  filter(film %in% v_films_not_seen) %>%
  group_by(film) %>%
  summarise(
    # Correlation-weighted deviation of each rater from their own mean.
    weighted_dev = sum((rating - mean) * pearson, na.rm = TRUE),
    # Total absolute weight, minus Victoria's correlation with herself.
    total_weight = sum(pearson_abs) - 1
  ) %>%
  mutate(
    score = weighted_dev / total_weight +
      mean(v_persons_ratings, na.rm = TRUE)
  ) %>%
  select(film, score) %>%
  arrange(desc(score))
# A tibble: 7 x 2
film score
<chr> <dbl>
1 The.Matrix 3.79
2 Forrest.Gump 3.51
3 The.Sixth.Sense 3.33
4 Shakespeare.in.Love 3.11
5 Blade.Runner 2.91
6 Saving.Private.Ryan 2.89
7 The.Shawshank.Redemption 2.81
函数
哇!现在我们可以将上面的代码放入一个函数中,将上面所有的 v_
对象替换为通用对象,并提供 tbl
、person
和 n
作为参数。 tbl
是电影、用户、评分和用户平均评分的数据,person
是我们想要推荐的人,n
是我们想要推荐的数量。这段代码和上面基本一样,只是我在推荐表的末尾添加了
head(n)
,以便只返回前 n 行。
#' Recommend the top-n unseen films for one user.
#'
#' Every film that `person` has not rated is scored with a mean-centred,
#' Pearson-weighted average of the other users' ratings:
#'   mean(person) + sum((rating - user mean) * pearson) / (sum(|pearson|) - 1)
#'
#' @param tbl Long-format ratings with columns `user`, `film`, `rating` and
#'   `mean` (each user's mean rating).
#' @param person Name of the user to recommend for; must occur in `tbl$user`.
#' @param n Maximum number of recommendations to return.
#' @return A tibble with columns `film` and `score`, sorted by descending
#'   score and holding at most `n` rows.
top_recs <- function(tbl, person, n) {
  stopifnot(length(person) == 1)
  if (!person %in% tbl$user) {
    stop("User '", person, "' is not present in `tbl`.", call. = FALSE)
  }

  # The target user's ratings keyed by film, so they can be matched to the
  # other users' ratings by film name instead of by row position.
  persons_tbl <- tbl %>%
    filter(user == person) %>%
    select(film, persons_rating = rating)

  films_not_seen <- persons_tbl %>%
    filter(is.na(persons_rating)) %>%
    pull(film)

  persons_mean <- mean(persons_tbl$persons_rating, na.rm = TRUE)

  # Pearson correlation of every user with the target user, using only the
  # films that both of them have rated.
  correlations <- tbl %>%
    left_join(persons_tbl, by = "film") %>%
    group_by(user) %>%
    mutate(
      pearson = cor(rating, persons_rating, use = "complete.obs"),
      pearson_abs = abs(pearson)
    ) %>%
    ungroup()

  # The pipeline is the last expression (not an assignment), so the result
  # is returned visibly and prints when the function is called directly.
  correlations %>%
    filter(film %in% films_not_seen) %>%
    group_by(film) %>%
    summarise(
      # "- 1" removes the target user's correlation with themselves.
      score = sum((rating - mean) * pearson, na.rm = TRUE) /
        (sum(pearson_abs) - 1)
    ) %>%
    mutate(score = score + persons_mean) %>%
    arrange(desc(score)) %>%
    head(n)
}
我们可以再次使用 Victoria 测试该函数,看看它是否有效:
# Top 5 recommendations for Victoria; the result is piped to print() so it
# is displayed explicitly.
top_recs(tidy_cr, "Victoria", 5) %>% print()
# A tibble: 5 x 2
film score
<chr> <dbl>
1 The.Matrix 3.79
2 Forrest.Gump 3.51
3 The.Sixth.Sense 3.33
4 Shakespeare.in.Love 3.11
5 Blade.Runner 2.91
但我们现在也可以获得任何用户的推荐,例如 Bernard 的 8 条推荐:
# Top 8 recommendations for Bernard; the result is piped to print() so it
# is displayed explicitly.
top_recs(tidy_cr, "Bernard", 8) %>% print()
# A tibble: 8 x 2
film score
<chr> <dbl>
1 The.Shawshank.Redemption 3.23
2 Pulp.Fiction 3.17
3 Schindler.s.List 3.11
4 Blade.Runner 2.97
5 Saving.Private.Ryan 2.94
6 Shakespeare.in.Love 2.74
7 Groundhog.Day 2.72
8 Stand.by.Me 2.60
备注
这实际上是一个有趣的练习,可以让您了解您是否可以找出工具来执行您想要的操作并相应地构建工作流程。我认为这种方法的强大之处在于我们现在可以获得针对任何用户的推荐。也许更重要的是,此代码中的任何内容(我认为)都不依赖于这些特定用户或电影或评级。如果你有更多的数据,你可以将它们添加到 tidy_cr
的底部,该功能仍然有效。如果您更改评分公式,只需编辑几行即可更新。
我希望你能一步一步学习更多关于如何在 R 中编写你自己的函数!接下来的其他步骤可能是错误检查,比如如果您要求的推荐比现有的更多,或者如果您键入不在数据库中的用户名,则显示一些信息。这看起来可能令人生畏,但帮助运算符 ?(例如 ?cor)
是你在 R 中查看函数选项的最好朋友。我自己也绝对不是什么都知道!
我需要通过使用所有其他评论家排名的加权平均值来获得为 Victoria 推荐的前 5 部电影。 我只能在 Excel 中完成,但我需要翻译成 R。 获取值的公式是:
维多利亚的平均值 + Σ[(其他评论家评分 - 该评论家的平均评分) × 皮尔逊相关系数] / (Σ|皮尔逊相关系数| - 1),其中减去的 1 是维多利亚与她自己的相关系数
为了更好地说明 对于电影 The.Matrix 这是应该的结果
约翰投票 = 4
约翰平均数 = 3,0714
约翰·皮尔逊 cor = 0,27709796
公式 = (4-3,07) * 0,27709796 / 6,9280(abs 加权值之和)
(我如何在 R 中创建这个公式???)如果我能解决这个问题,也许我可以做剩下的事情
为每个人完成此操作后,您添加 3,1538(维多利亚的平均值),这应该会产生 3,791701302
我需要为所有 Victorias 没有看过的电影做这件事,这些应该是结果
3.7917013044215, 'The Matrix'
3.50776533175371, 'Forrest Gump'
3.33118834864677, 'The Sixth Sense'
3.11491825315719, 'Shakespeare in Love'
2.9124513228665, 'Blade Runner'
这是 Excel 上的黑客帝国版本:
到目前为止,这是我的代码:
# Film ratings occupy columns 2-21 of `cr`; column 1 holds the user name.
film_cols <- 2:21
target_user <- "Victoria"

# Transpose so that every column holds one user's ratings: cor() correlates
# columns, so this layout yields user-to-user correlations.
cr2 <- t(cr[, film_cols])
colnames(cr2) <- as.character(cr[[1]])
cr2 <- as.data.frame(cr2)

# Mean rating per user over ALL film columns. (Using 2:20 here would silently
# drop the last film, "Babe", and shift the means of everyone who rated it.)
cr$Mean <- rowMeans(cr[, film_cols], na.rm = TRUE)

# Pearson correlation of every user with the target user, computed once.
# as.vector() drops the 20 x 1 matrix shape returned by cor() so the new
# columns are plain numeric vectors rather than matrix columns.
pearson <- as.vector(
  cor(cr2, cr2[[target_user]], use = "pairwise.complete.obs")
)
cr$Pearson <- pearson
cr$PearsonABS <- abs(pearson)

# Normalising constant: total absolute weight minus the target user's own
# correlation with herself (which is 1).
x <- sum(cr$PearsonABS) - 1
我用一种非常糟糕的方式(手动)
# Row of the user we are recommending for, and her own mean rating.
victoria_row <- which(cr$User == "Victoria")
g <- cr[["Mean"]][victoria_row]

# Film columns (2-21) that Victoria has not rated yet, as positions in `cr`.
unseen_cols <- which(is.na(unlist(cr[victoria_row, 2:21]))) + 1L

# Predicted rating for one film column:
#   g + sum((rating - rater's mean) * pearson / x)
# Users who did not rate the film contribute NA and are dropped by na.rm.
predict_rating <- function(film_col) {
  weighted_dev <- (cr[[film_col]] - cr[["Mean"]]) * as.vector(cr[["Pearson"]])
  sum(weighted_dev / x, na.rm = TRUE) + g
}

# One score per unseen film, labelled with the film name so the ranking
# below shows which film each score belongs to.
my_list <- vapply(unseen_cols, predict_rating, numeric(1))
names(my_list) <- names(cr)[unseen_cols]
print(my_list)

head(sort(my_list, decreasing = TRUE), 5)
这是 dput()
dput(cr)
structure(list(User = structure(c(8L, 10L, 2L, 17L, 11L, 1L,
18L, 9L, 7L, 5L, 3L, 14L, 13L, 4L, 20L, 6L, 16L, 12L, 15L, 19L
), .Label = c("Ana", "Anton", "Bernard", "Carles", "Chris", "Ivan",
"Jim", "John", "Marc", "Maria", "Martina", "Nadia", "Nerea",
"Nuria", "Oriol", "Rachel", "Roger", "Sergi", "Valery", "Victoria"
), class = "factor"), Star.Wars.IV...A.New.Hope = c(1L, 5L, NA,
NA, 4L, 2L, NA, 4L, 5L, 4L, 2L, 3L, 2L, 3L, 4L, NA, NA, 4L, 5L,
1L), Star.Wars.VI...Return.of.the.Jedi = c(5L, 3L, NA, 3L, 3L,
4L, NA, NA, 1L, 2L, 1L, 5L, 3L, NA, 4L, NA, NA, 5L, 1L, 2L),
Forrest.Gump = c(2L, NA, NA, NA, 4L, 4L, 3L, NA, NA, NA,
5L, 2L, NA, 3L, NA, 1L, NA, 1L, NA, 2L), The.Shawshank.Redemption = c(NA,
2L, 5L, NA, 1L, 4L, 1L, NA, 4L, 5L, NA, NA, 5L, NA, NA, NA,
NA, 5L, NA, 4L), The.Silence.of.the.Lambs = c(4L, 4L, 2L,
NA, 4L, NA, 1L, 3L, 2L, 3L, NA, 2L, 4L, 2L, 5L, 3L, 4L, 1L,
NA, 5L), Gladiator = c(4L, 2L, NA, 1L, 1L, NA, 4L, 2L, 4L,
NA, 5L, NA, NA, NA, 5L, 2L, NA, 1L, 4L, NA), Toy.Story = c(2L,
1L, 4L, 2L, NA, 3L, NA, 2L, 4L, 4L, 5L, 2L, 4L, 3L, 2L, NA,
2L, 4L, 2L, 2L), Saving.Private.Ryan = c(2L, NA, NA, 3L,
4L, 1L, 5L, NA, 4L, 3L, NA, NA, 5L, NA, NA, 2L, NA, NA, 1L,
3L), Pulp.Fiction = c(NA, NA, NA, 4L, NA, 4L, 2L, 3L, NA,
4L, NA, 1L, NA, NA, 3L, NA, 2L, 5L, 3L, 2L), Stand.by.Me = c(3L,
4L, 1L, NA, 1L, 4L, NA, NA, 1L, NA, NA, NA, NA, 4L, 5L, 1L,
NA, NA, 3L, 2L), Shakespeare.in.Love = c(2L, 3L, NA, NA,
5L, 5L, 1L, NA, 2L, NA, NA, 3L, NA, NA, NA, 5L, 2L, NA, 3L,
1L), Total.Recall = c(NA, 2L, 1L, 4L, 1L, 2L, NA, 2L, 3L,
NA, 3L, NA, 2L, 1L, 1L, NA, NA, NA, 1L, NA), Independence.Day = c(5L,
2L, 4L, 1L, NA, 4L, NA, 3L, 1L, 2L, 2L, 3L, 4L, 2L, 3L, NA,
NA, NA, NA, NA), Blade.Runner = c(2L, NA, 4L, 3L, 4L, NA,
3L, 2L, NA, NA, NA, NA, NA, 2L, NA, NA, NA, 4L, NA, 5L),
Groundhog.Day = c(NA, 2L, 1L, 5L, NA, 1L, NA, 4L, 5L, NA,
NA, 2L, 3L, 3L, 2L, 5L, NA, NA, NA, 5L), The.Matrix = c(4L,
NA, 1L, NA, 3L, NA, 1L, NA, NA, 2L, 1L, 5L, NA, 5L, NA, 2L,
4L, NA, 2L, 4L), Schindler.s.List = c(2L, 5L, 2L, 5L, 5L,
NA, NA, 1L, NA, 5L, NA, NA, NA, 1L, 3L, 2L, NA, 2L, NA, 3L
), The.Sixth.Sense = c(5L, 1L, 3L, 1L, 5L, 3L, NA, 3L, NA,
1L, 2L, NA, NA, NA, NA, 4L, NA, 1L, NA, 5L), Raiders.of.the.Lost.Ark = c(NA,
3L, 1L, 1L, NA, NA, 5L, 5L, NA, NA, 1L, NA, 5L, NA, 3L, 3L,
NA, 2L, NA, 3L), Babe = c(NA, NA, 3L, 2L, NA, 2L, 2L, NA,
5L, NA, 4L, 2L, NA, NA, 1L, 4L, NA, 5L, NA, NA), Mean = c(3.07142857142857,
2.78571428571429, 2.46153846153846, 2.69230769230769, 3.21428571428571,
3.07142857142857, 2.54545454545455, 2.83333333333333, 3.15384615384615,
3.18181818181818, 2.81818181818182, 2.72727272727273, 3.7,
2.63636363636364, 3.15384615384615, 2.83333333333333, 2.8,
3.07692307692308, 2.5, 3.0625), Pearson = structure(c(0.277097961607667,
0.492592183071889, -0.184664098655286, -0.306988756155365,
0.047716527859489, 0.597614304667197, 0.0363696483726654,
0.0793422835603058, -0.444514447822542, -0.344265186329548,
-0.186499664263607, 0.365148371670111, 0.205737799949456,
0.427324672683063, 1, -0.732776720760177, 0.944911182523068,
-0.587378478571482, 0.578979445733232, -0.0881134221062802
), .Dim = c(20L, 1L), .Dimnames = list(c("John", "Maria",
"Anton", "Roger", "Martina", "Ana", "Sergi", "Marc", "Jim",
"Chris", "Bernard", "Nuria", "Nerea", "Carles", "Victoria",
"Ivan", "Rachel", "Nadia", "Oriol", "Valery"), "Victoria")),
PearsonABS = structure(c(0.277097961607667, 0.492592183071889,
0.184664098655286, 0.306988756155365, 0.047716527859489,
0.597614304667197, 0.0363696483726654, 0.0793422835603058,
0.444514447822542, 0.344265186329548, 0.186499664263607,
0.365148371670111, 0.205737799949456, 0.427324672683063,
1, 0.732776720760177, 0.944911182523068, 0.587378478571482,
0.578979445733232, 0.0881134221062802), .Dim = c(20L, 1L), .Dimnames = list(
c("John", "Maria", "Anton", "Roger", "Martina", "Ana",
"Sergi", "Marc", "Jim", "Chris", "Bernard", "Nuria",
"Nerea", "Carles", "Victoria", "Ivan", "Rachel", "Nadia",
"Oriol", "Valery"), "Victoria"))), .Names = c("User",
"Star.Wars.IV...A.New.Hope", "Star.Wars.VI...Return.of.the.Jedi",
"Forrest.Gump", "The.Shawshank.Redemption", "The.Silence.of.the.Lambs",
"Gladiator", "Toy.Story", "Saving.Private.Ryan", "Pulp.Fiction",
"Stand.by.Me", "Shakespeare.in.Love", "Total.Recall", "Independence.Day",
"Blade.Runner", "Groundhog.Day", "The.Matrix", "Schindler.s.List",
"The.Sixth.Sense", "Raiders.of.the.Lost.Ark", "Babe", "Mean",
"Pearson", "PearsonABS"), row.names = c(NA, -20L), class = "data.frame")
我希望我解释清楚了。 有人可以帮助我吗?
好的,我希望你能跟上这个,我会尽量给出足够的解释。目标是编写一个函数,把您的手动计算包装起来,使其易于多次运行。
正在整理
首先我们要使数据整齐,这样更容易使用。这意味着让每一列成为一个变量,这样就不会在多个列中有电影评级。参见 more about tidy data here。我们将使用 tidyverse
包来完成此操作。
- 用
select
删除均值和皮尔逊列。我们希望能够根据我们正在查看的用户以不同方式计算它们,并且它们也以列表的形式奇怪地存储在您的dput
中。 - 使用
gather
获取所有电影列并将电影名称放入一列,将评级放入另一列。 - 将
User
重命名为小写以保持一致性 - 使用
group_by
和mutate
对每个用户进行平均评分。na.rm = TRUE
表示mean
计算时会忽略所有NA
值。
看起来像这样。看到数据现在很整洁,只有四列。
library(tidyverse)
library(magrittr)
# Reshape the wide ratings table into long ("tidy") form: one row per
# user/film pair, plus each user's mean rating.
tidy_cr <- cr %>%
  # Drop the precomputed summary columns; they are recalculated per user.
  select(-c(Mean, Pearson, PearsonABS)) %>%
  rename(user = User) %>%
  # Stack every film column into a film-name / rating pair.
  gather(key = "film", value = "rating", -user) %>%
  group_by(user) %>%
  # NA ratings (films not seen) are ignored when averaging.
  mutate(mean = mean(rating, na.rm = TRUE)) %>%
  ungroup()
# A tibble: 400 x 4
user film rating mean
<fct> <chr> <int> <dbl>
1 John Star.Wars.IV...A.New.Hope 1 3.07
2 Maria Star.Wars.IV...A.New.Hope 5 2.79
3 Anton Star.Wars.IV...A.New.Hope NA 2.46
4 Roger Star.Wars.IV...A.New.Hope NA 2.69
5 Martina Star.Wars.IV...A.New.Hope 4 3.21
6 Ana Star.Wars.IV...A.New.Hope 2 3.07
7 Sergi Star.Wars.IV...A.New.Hope NA 2.55
8 Marc Star.Wars.IV...A.New.Hope 4 2.83
9 Jim Star.Wars.IV...A.New.Hope 5 3.15
10 Chris Star.Wars.IV...A.New.Hope 4 3.18
# ... with 390 more rows
维多利亚例子
这是一步一步向您展示每个阶段的输出,以维多利亚为例。首先,我们想找出维多利亚没看过哪些电影。我们通过 filter
向下搜索 user
列中有 Victoria 和 NA
评级的行,然后拉出 film
列。
# Names of the films for which Victoria has no rating.
v_films_not_seen <- tidy_cr %>%
  filter(user == "Victoria", is.na(rating)) %>%
  pull(film)
[1] "Forrest.Gump" "The.Shawshank.Redemption" "Saving.Private.Ryan"
[4] "Shakespeare.in.Love" "Blade.Runner" "The.Matrix"
[7] "The.Sixth.Sense"
然后,我们要提取 Victoria 的评分,以便计算相关系数和她的具体均值。我们再次减少到 Victoria 的行,现在拉出 rating
列。
# Victoria's ratings for every film, in table order (NA where not rated).
v_persons_ratings <- tidy_cr %>%
  filter(user == "Victoria") %>%
  pull(rating)
[1] 4 4 NA NA 5 5 2 NA 3 5 NA 1 3 NA 2 NA 3 NA 3 1
现在我们要计算相关性。我们首先 group_by
用户,因此此计算将针对每个用户进行。然后我们使用 mutate
和 cor
来计算每个用户的评分与 Victoria 的评分之间的相关性,我们在最后一步中将其保存为 v_persons_ratings
。选项 use = "complete.obs"
表示相关性仅查看两个用户对电影评分的观察结果。然后我们用abs
得到相关系数的绝对值
# Attach to every row the Pearson correlation between that row's user and
# Victoria. Within each user group the ratings are paired with Victoria's
# by position; films missing a rating on either side are left out.
v_correlations <- tidy_cr %>%
  group_by(user) %>%
  mutate(pearson = cor(rating, v_persons_ratings, use = "complete.obs")) %>%
  mutate(pearson_abs = abs(pearson)) %>%
  ungroup()
# A tibble: 400 x 6
user film rating mean pearson pearson_abs
<fct> <chr> <int> <dbl> <dbl> <dbl>
1 John Star.Wars.IV...A.New.Hope 1 3.07 0.277 0.277
2 Maria Star.Wars.IV...A.New.Hope 5 2.79 0.493 0.493
3 Anton Star.Wars.IV...A.New.Hope NA 2.46 -0.185 0.185
4 Roger Star.Wars.IV...A.New.Hope NA 2.69 -0.307 0.307
5 Martina Star.Wars.IV...A.New.Hope 4 3.21 0.0477 0.0477
6 Ana Star.Wars.IV...A.New.Hope 2 3.07 0.598 0.598
7 Sergi Star.Wars.IV...A.New.Hope NA 2.55 0.0364 0.0364
8 Marc Star.Wars.IV...A.New.Hope 4 2.83 0.0793 0.0793
9 Jim Star.Wars.IV...A.New.Hope 5 3.15 -0.445 0.445
10 Chris Star.Wars.IV...A.New.Hope 4 3.18 -0.344 0.344
# ... with 390 more rows
最后,我们使用 filter
和 %in%
运算符减少到包含 Victoria 未曾看过的电影的行。这次我们 group_by
电影,所以计算是按电影进行的。然后,我们使用 summarise
使用您在问题中列出的公式计算推荐分数,并添加 Victoria 的平均值。最后,我们使用 arrange
和 desc
按分数降序排序。
# Score each film Victoria has not seen and rank the films best-first.
v_recommendations <- v_correlations %>%
  filter(film %in% v_films_not_seen) %>%
  group_by(film) %>%
  summarise(
    # Correlation-weighted deviation of each rater from their own mean.
    weighted_dev = sum((rating - mean) * pearson, na.rm = TRUE),
    # Total absolute weight, minus Victoria's correlation with herself.
    total_weight = sum(pearson_abs) - 1
  ) %>%
  mutate(
    score = weighted_dev / total_weight +
      mean(v_persons_ratings, na.rm = TRUE)
  ) %>%
  select(film, score) %>%
  arrange(desc(score))
# A tibble: 7 x 2
film score
<chr> <dbl>
1 The.Matrix 3.79
2 Forrest.Gump 3.51
3 The.Sixth.Sense 3.33
4 Shakespeare.in.Love 3.11
5 Blade.Runner 2.91
6 Saving.Private.Ryan 2.89
7 The.Shawshank.Redemption 2.81
函数
哇!现在我们可以将上面的代码放入一个函数中,将上面所有的 v_
对象替换为通用对象,并提供 tbl
、person
和 n
作为参数。 tbl
是电影、用户、评分和用户平均评分的数据,person
是我们想要推荐的人,n
是我们想要推荐的数量。这段代码和上面基本一样,只是我在推荐表的末尾添加了
head(n)
,以便只返回前 n 行。
#' Recommend the top-n unseen films for one user.
#'
#' Every film that `person` has not rated is scored with a mean-centred,
#' Pearson-weighted average of the other users' ratings:
#'   mean(person) + sum((rating - user mean) * pearson) / (sum(|pearson|) - 1)
#'
#' @param tbl Long-format ratings with columns `user`, `film`, `rating` and
#'   `mean` (each user's mean rating).
#' @param person Name of the user to recommend for; must occur in `tbl$user`.
#' @param n Maximum number of recommendations to return.
#' @return A tibble with columns `film` and `score`, sorted by descending
#'   score and holding at most `n` rows.
top_recs <- function(tbl, person, n) {
  stopifnot(length(person) == 1)
  if (!person %in% tbl$user) {
    stop("User '", person, "' is not present in `tbl`.", call. = FALSE)
  }

  # The target user's ratings keyed by film, so they can be matched to the
  # other users' ratings by film name instead of by row position.
  persons_tbl <- tbl %>%
    filter(user == person) %>%
    select(film, persons_rating = rating)

  films_not_seen <- persons_tbl %>%
    filter(is.na(persons_rating)) %>%
    pull(film)

  persons_mean <- mean(persons_tbl$persons_rating, na.rm = TRUE)

  # Pearson correlation of every user with the target user, using only the
  # films that both of them have rated.
  correlations <- tbl %>%
    left_join(persons_tbl, by = "film") %>%
    group_by(user) %>%
    mutate(
      pearson = cor(rating, persons_rating, use = "complete.obs"),
      pearson_abs = abs(pearson)
    ) %>%
    ungroup()

  # The pipeline is the last expression (not an assignment), so the result
  # is returned visibly and prints when the function is called directly.
  correlations %>%
    filter(film %in% films_not_seen) %>%
    group_by(film) %>%
    summarise(
      # "- 1" removes the target user's correlation with themselves.
      score = sum((rating - mean) * pearson, na.rm = TRUE) /
        (sum(pearson_abs) - 1)
    ) %>%
    mutate(score = score + persons_mean) %>%
    arrange(desc(score)) %>%
    head(n)
}
我们可以再次使用 Victoria 测试该函数,看看它是否有效:
# Top 5 recommendations for Victoria; the result is piped to print() so it
# is displayed explicitly.
top_recs(tidy_cr, "Victoria", 5) %>% print()
# A tibble: 5 x 2
film score
<chr> <dbl>
1 The.Matrix 3.79
2 Forrest.Gump 3.51
3 The.Sixth.Sense 3.33
4 Shakespeare.in.Love 3.11
5 Blade.Runner 2.91
但我们现在也可以获得任何用户的推荐,例如 Bernard 的 8 条推荐:
# Top 8 recommendations for Bernard; the result is piped to print() so it
# is displayed explicitly.
top_recs(tidy_cr, "Bernard", 8) %>% print()
# A tibble: 8 x 2
film score
<chr> <dbl>
1 The.Shawshank.Redemption 3.23
2 Pulp.Fiction 3.17
3 Schindler.s.List 3.11
4 Blade.Runner 2.97
5 Saving.Private.Ryan 2.94
6 Shakespeare.in.Love 2.74
7 Groundhog.Day 2.72
8 Stand.by.Me 2.60
备注
这实际上是一个有趣的练习,可以让您了解您是否可以找出工具来执行您想要的操作并相应地构建工作流程。我认为这种方法的强大之处在于我们现在可以获得针对任何用户的推荐。也许更重要的是,此代码中的任何内容(我认为)都不依赖于这些特定用户或电影或评级。如果你有更多的数据,你可以将它们添加到 tidy_cr
的底部,该功能仍然有效。如果您更改评分公式,只需编辑几行即可更新。
我希望你能一步一步学习更多关于如何在 R 中编写你自己的函数!接下来的其他步骤可能是错误检查,比如如果您要求的推荐比现有的更多,或者如果您键入不在数据库中的用户名,则显示一些信息。这看起来可能令人生畏,但帮助运算符 ?(例如 ?cor)
是你在 R 中查看函数选项的最好朋友。我自己也绝对不是什么都知道!