JS 中的数据操作:找到最匹配的两个用户
Data manipulation in JS: Finding two users with the best match
我现在遇到了一个问题。我有一个包含数据的 CSV 文件。我已经使用 d3.js.
解析了它
看起来像这样:
moviesByUser = [
{"userId":"001","values":[
{"movieID":"222"},
{"movieID":"333"}
]},
{"userId":"045","values":[
{"movieID":"111"}
]},
{"userId":"294","values":[
{"movieID":"222"},
{"movieID":"333"},
{"movieID":"789"}
]},
{"userId":"246","values":[
{"movieID":"222"},
{"movieID":"111"},
{"movieID":"987"}
]},
{"userId":"642","values":[
{"movieID":"222"},
{"movieID":"111"},
{"movieID":"333"},
{"movieID":"789"},
]}
];
我一直想做的是想出一种方法来找到匹配最多的两个用户,在本例中是电影。实际数据是1000的用户和电影。
提前致谢。
基本上创建一个列表来存储每对用户的匹配数。为了更有效地做到这一点,按电影对用户进行分组,并将每个新用户与每部电影列表中的每个用户“配对”。
const moviesByUser = [
{
"userId": "001", "values": [
{ "movieID": "222" },
{ "movieID": "333" }
]
},
{
"userId": "045", "values": [
{ "movieID": "111" }
]
},
{
"userId": "294", "values": [
{ "movieID": "222" },
{ "movieID": "333" },
{ "movieID": "789" }
]
},
{
"userId": "246", "values": [
{ "movieID": "222" },
{ "movieID": "111" },
{ "movieID": "987" }
]
},
{
"userId": "642", "values": [
{ "movieID": "222" },
{ "movieID": "111" },
{ "movieID": "333" },
{ "movieID": "789" },
]
}
];
const usersByMovieID = {};
const pairs = {};
let best = 0;
// sort by number of movies DESC
moviesByUser.sort((a,b) => b.values.length - a.values.length);
for (let user of moviesByUser) {
// since the list is sorted, no further user can win anymore; not even a tie.
if(user.values.length < best) break;
for (let { movieID } of user.values) {
usersByMovieID[movieID] ??= [];
for (let other of usersByMovieID[movieID]) {
const hash = other.userId + "/" + user.userId;
pairs[hash] = (pairs[hash] || 0) + 1;
best = Math.max(best, pairs[hash]);
}
usersByMovieID[movieID].push(user);
}
}
const result = Object.entries(pairs)
.filter(([hash, matches]) => matches === best)
.map(([hash, matches]) => {
const [user1, user2] = hash.split("/");
return { matches, user1, user2 }
});
console.log(result);
.as-console-wrapper{top:0;max-height:100%!important}
我想不出任何特别有效的算法。但是我们可以把低效的写得相当干净。
// utility functions
const pairs = ([x, ...xs] = []) =>
xs .length == 0 ? [] : xs .map (y => [x, y]) .concat (pairs (xs))
const shared = (xs = [], ys = []) =>
xs .reduce ((t, x) => ys.includes (x) ? t + 1: t, 0)
// main function
const mostMatches = (moviesByUser) =>
pairs (moviesByUser .map (({userId, values}) => ({
id: userId,
ms: values .map (m => m .movieID)
}))) .reduce (({ids, matches}, [x, y, t = shared (x .ms, y .ms)]) =>
t > matches ? {matches: t, ids: [x.id, y.id]} : {matches, ids},
{matches: -1}
)
// sample data
const moviesByUser = [{userId: "001", values: [{movieID: "222"}, {movieID: "333"}]}, {userId: "045", values: [{movieID: "111"}]}, {userId: "294", values: [{movieID: "222"}, {movieID: "333"}, {movieID: "789"}]}, {userId: "246", values: [{movieID: "222"}, {movieID: "111"}, {movieID: "987"}]}, {userId: "642", values: [{movieID: "222"}, {movieID: "111"}, {movieID: "333"}, {movieID: "789"}]}]
// demo
console .log (mostMatches (moviesByUser))
我们首先将元素转换成稍微好用的格式:
{
"userId": "001", "values": [
{ "movieID": "222" },
{ "movieID": "333" }
]
}
变成
{id: '001', m: ['222', '333']}
然后我们取 pairs
这样的元素,并使用 reduce
将它们折叠成一个结果,计算有多少 shared
电影在他们的两个列表中,每当我们找到更大的电影时更新我们的最大值和用户 ID。
我们使用两个通用实用函数。
pairs
获取一个元素列表,并且 return 是该列表中的 n * (n - 1) / 2
对元素。例如,
pairs (["a", "b", "c", "d"])
//=> [["a", "b"], ["a", "c"], ["a", "d"], ["b", "c"], ["b", "d"], ["c", "d"]]
shared
获取两个值列表并计算它们共有的值的数量。例如,
shared ([2, 3, 5, 7, 11, 13, 17, 19, 23, 29], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20])
//=> 4 (the four values 11, 13, 17, and 19 are in both lists.)
我们的最终输出类似于 {matches: 3, ids: ['294', '642']}
。如果需要,只需要多做一点工作,我们就可以 return 配对中的原始用户。
性能将类似于 O (n^2 * m)
,其中 n
是用户数量,m
是他们的平均电影数量。我看不出有什么方法可以避免 n ^ 2
,但也许聪明的算法可以减少 m
因素。
我现在遇到了一个问题。我有一个包含数据的 CSV 文件。我已经使用 d3.js.
解析了它看起来像这样:
moviesByUser = [
{"userId":"001","values":[
{"movieID":"222"},
{"movieID":"333"}
]},
{"userId":"045","values":[
{"movieID":"111"}
]},
{"userId":"294","values":[
{"movieID":"222"},
{"movieID":"333"},
{"movieID":"789"}
]},
{"userId":"246","values":[
{"movieID":"222"},
{"movieID":"111"},
{"movieID":"987"}
]},
{"userId":"642","values":[
{"movieID":"222"},
{"movieID":"111"},
{"movieID":"333"},
{"movieID":"789"},
]}
];
我一直想做的是想出一种方法来找到匹配最多的两个用户,在本例中是电影。实际数据是1000的用户和电影。
提前致谢。
基本上创建一个列表来存储每对用户的匹配数。为了更有效地做到这一点,按电影对用户进行分组,并将每个新用户与每部电影列表中的每个用户“配对”。
const moviesByUser = [
{
"userId": "001", "values": [
{ "movieID": "222" },
{ "movieID": "333" }
]
},
{
"userId": "045", "values": [
{ "movieID": "111" }
]
},
{
"userId": "294", "values": [
{ "movieID": "222" },
{ "movieID": "333" },
{ "movieID": "789" }
]
},
{
"userId": "246", "values": [
{ "movieID": "222" },
{ "movieID": "111" },
{ "movieID": "987" }
]
},
{
"userId": "642", "values": [
{ "movieID": "222" },
{ "movieID": "111" },
{ "movieID": "333" },
{ "movieID": "789" },
]
}
];
const usersByMovieID = {};
const pairs = {};
let best = 0;
// sort by number of movies DESC
moviesByUser.sort((a,b) => b.values.length - a.values.length);
for (let user of moviesByUser) {
// since the list is sorted, no further user can win anymore; not even a tie.
if(user.values.length < best) break;
for (let { movieID } of user.values) {
usersByMovieID[movieID] ??= [];
for (let other of usersByMovieID[movieID]) {
const hash = other.userId + "/" + user.userId;
pairs[hash] = (pairs[hash] || 0) + 1;
best = Math.max(best, pairs[hash]);
}
usersByMovieID[movieID].push(user);
}
}
const result = Object.entries(pairs)
.filter(([hash, matches]) => matches === best)
.map(([hash, matches]) => {
const [user1, user2] = hash.split("/");
return { matches, user1, user2 }
});
console.log(result);
.as-console-wrapper{top:0;max-height:100%!important}
我想不出任何特别有效的算法。但是我们可以把低效的写得相当干净。
// utility functions
const pairs = ([x, ...xs] = []) =>
xs .length == 0 ? [] : xs .map (y => [x, y]) .concat (pairs (xs))
const shared = (xs = [], ys = []) =>
xs .reduce ((t, x) => ys.includes (x) ? t + 1: t, 0)
// main function
const mostMatches = (moviesByUser) =>
pairs (moviesByUser .map (({userId, values}) => ({
id: userId,
ms: values .map (m => m .movieID)
}))) .reduce (({ids, matches}, [x, y, t = shared (x .ms, y .ms)]) =>
t > matches ? {matches: t, ids: [x.id, y.id]} : {matches, ids},
{matches: -1}
)
// sample data
const moviesByUser = [{userId: "001", values: [{movieID: "222"}, {movieID: "333"}]}, {userId: "045", values: [{movieID: "111"}]}, {userId: "294", values: [{movieID: "222"}, {movieID: "333"}, {movieID: "789"}]}, {userId: "246", values: [{movieID: "222"}, {movieID: "111"}, {movieID: "987"}]}, {userId: "642", values: [{movieID: "222"}, {movieID: "111"}, {movieID: "333"}, {movieID: "789"}]}]
// demo
console .log (mostMatches (moviesByUser))
我们首先将元素转换成稍微好用的格式:
{
"userId": "001", "values": [
{ "movieID": "222" },
{ "movieID": "333" }
]
}
变成
{id: '001', m: ['222', '333']}
然后我们取 pairs
这样的元素,并使用 reduce
将它们折叠成一个结果,计算有多少 shared
电影在他们的两个列表中,每当我们找到更大的电影时更新我们的最大值和用户 ID。
我们使用两个通用实用函数。
pairs
获取一个元素列表,并且 return 是该列表中的n * (n - 1) / 2
对元素。例如,pairs (["a", "b", "c", "d"]) //=> [["a", "b"], ["a", "c"], ["a", "d"], ["b", "c"], ["b", "d"], ["c", "d"]]
shared
获取两个值列表并计算它们共有的值的数量。例如,shared ([2, 3, 5, 7, 11, 13, 17, 19, 23, 29], [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]) //=> 4 (the four values 11, 13, 17, and 19 are in both lists.)
我们的最终输出类似于 {matches: 3, ids: ['294', '642']}
。如果需要,只需要多做一点工作,我们就可以 return 配对中的原始用户。
性能将类似于 O (n^2 * m)
,其中 n
是用户数量,m
是他们的平均电影数量。我看不出有什么方法可以避免 n ^ 2
,但也许聪明的算法可以减少 m
因素。