用 node.js 和 horseman 从 js 中抓取 html
Scrape html from js with node.js and horseman
我正在尝试从此页面中获取包含工资信息的击球手数组:
https://www.swishanalytics.com/optimus/mlb/dfs-batter-projections
我正在使用 node.js 和 node-horseman。
这是我的代码:
// Open the projections page in a headless browser and print its markup.
// NOTE(review): written against a node-horseman release whose calls return
// their values directly (synchronous style) — confirm the installed version.
const Horseman = require('node-horseman');
const horseman = new Horseman();

horseman.open('https://www.swishanalytics.com/optimus/mlb/dfs-batter-projections');

const statusCode = horseman.status();
if (statusCode === 200) {
  console.log('[+] Successful page opening');
  horseman.screenshot('image.png');

  // html() hands back the page markup, inline <script> contents included.
  const markup = horseman.html();
  console.log(markup);
}

horseman.close();
问题是 horseman.html() 的返回值中仍然包含大量 JavaScript,无法直接用 cheerio 之类的工具提取。如何以编程方式执行这些 JavaScript?
例如,如果我查看同一链接的页面源代码,我会看到击球手所在的区域以
如下内容开头:
function Model(){ this.batterArray =
[{"team_short":"rockies","mlbam_id":"571448","player_name":"Nolan Arenado",
显然这仍然是 javascript...我假设在某些时候它必须被执行并转换为 HTML 才能由浏览器呈现?
我刚刚对此进行了测试,它似乎有效:
// Open the projections page and read the batter list from the page's own
// Model object rather than parsing the returned HTML.
// NOTE(review): relies on a node-horseman release whose calls return their
// values directly (synchronous style) — confirm the installed version.
const Horseman = require('node-horseman');
const horseman = new Horseman();

horseman.open('https://www.swishanalytics.com/optimus/mlb/dfs-batter-projections');

const statusCode = horseman.status();
if (statusCode === 200) {
  console.log('[+] Successful page opening');
  horseman.screenshot('image.png');

  // Model is not defined in this script; it is expected to come from the
  // page's own JavaScript, so this function must be evaluated in the page.
  const readBatterArray = function () {
    return (new Model()).batterArray;
  };
  const batters = horseman.evaluate(readBatterArray);
  console.log(batters);
}

horseman.close();
这将为您提供一个可以在代码中使用的击球手数组。您可以将其写入文件,或用它生成一个表格。
它应该是这样工作的。
// Open the projections page and print the batter array exposed by the page's
// Model object, using node-horseman's promise-based chain.
const Horseman = require('node-horseman');
const horseman = new Horseman();

horseman
  .open('https://www.swishanalytics.com/optimus/mlb/dfs-batter-projections')
  .status()
  .then((status) => {
    if (status !== 200) {
      console.log('no batters');
      return undefined;
    }

    console.log('[+] Successful page opening');

    // Return the inner chain so the outer chain (and the final close())
    // waits for the screenshot and the evaluation to finish. Without the
    // return, `batters` would be a pending promise and the browser could be
    // closed while these actions are still running.
    return horseman
      .screenshot('image.png')
      // Plain `function` on purpose: it is evaluated inside the page, where
      // Model is expected to be defined by the site's own script.
      .evaluate(function () {
        return (new Model()).batterArray;
      })
      .then((batters) => {
        console.log(batters);
      });
  })
  // Report failures instead of leaving an unhandled rejection; handling the
  // error here also lets the chain continue to close().
  .catch((err) => {
    console.error(err);
  })
  .close();
我正在尝试从此页面中获取包含工资信息的击球手数组: https://www.swishanalytics.com/optimus/mlb/dfs-batter-projections
我正在使用 node.js 和 node-horseman。
这是我的代码:
// Open the projections page in a headless browser and print its markup.
// NOTE(review): written against a node-horseman release whose calls return
// their values directly (synchronous style) — confirm the installed version.
const Horseman = require('node-horseman');
const horseman = new Horseman();

horseman.open('https://www.swishanalytics.com/optimus/mlb/dfs-batter-projections');

const statusCode = horseman.status();
if (statusCode === 200) {
  console.log('[+] Successful page opening');
  horseman.screenshot('image.png');

  // html() hands back the page markup, inline <script> contents included.
  const markup = horseman.html();
  console.log(markup);
}

horseman.close();
问题是 horseman.html() 的返回值中仍然包含大量 JavaScript,无法直接用 cheerio 之类的工具提取。如何以编程方式执行这些 JavaScript?
例如,如果我查看同一链接的页面源代码,我会看到击球手所在的区域以如下内容开头:
function Model(){ this.batterArray =
[{"team_short":"rockies","mlbam_id":"571448","player_name":"Nolan Arenado",
显然这仍然是 javascript...我假设在某些时候它必须被执行并转换为 HTML 才能由浏览器呈现?
我刚刚对此进行了测试,它似乎有效:
// Open the projections page and read the batter list from the page's own
// Model object rather than parsing the returned HTML.
// NOTE(review): relies on a node-horseman release whose calls return their
// values directly (synchronous style) — confirm the installed version.
const Horseman = require('node-horseman');
const horseman = new Horseman();

horseman.open('https://www.swishanalytics.com/optimus/mlb/dfs-batter-projections');

const statusCode = horseman.status();
if (statusCode === 200) {
  console.log('[+] Successful page opening');
  horseman.screenshot('image.png');

  // Model is not defined in this script; it is expected to come from the
  // page's own JavaScript, so this function must be evaluated in the page.
  const readBatterArray = function () {
    return (new Model()).batterArray;
  };
  const batters = horseman.evaluate(readBatterArray);
  console.log(batters);
}

horseman.close();
这将为您提供一个可以在代码中使用的击球手数组。您可以将其写入文件,或用它生成一个表格。
它应该是这样工作的。
// Open the projections page and print the batter array exposed by the page's
// Model object, using node-horseman's promise-based chain.
const Horseman = require('node-horseman');
const horseman = new Horseman();

horseman
  .open('https://www.swishanalytics.com/optimus/mlb/dfs-batter-projections')
  .status()
  .then((status) => {
    if (status !== 200) {
      console.log('no batters');
      return undefined;
    }

    console.log('[+] Successful page opening');

    // Return the inner chain so the outer chain (and the final close())
    // waits for the screenshot and the evaluation to finish. Without the
    // return, `batters` would be a pending promise and the browser could be
    // closed while these actions are still running.
    return horseman
      .screenshot('image.png')
      // Plain `function` on purpose: it is evaluated inside the page, where
      // Model is expected to be defined by the site's own script.
      .evaluate(function () {
        return (new Model()).batterArray;
      })
      .then((batters) => {
        console.log(batters);
      });
  })
  // Report failures instead of leaving an unhandled rejection; handling the
  // error here also lets the chain continue to close().
  .catch((err) => {
    console.error(err);
  })
  .close();