在casperjs中打开多个链接
Open multiple links in casperjs
我正在尝试从该网站 http://www.basketball-reference.com/teams/GSW/2016_games.html 中抓取所有特定类型的链接(boxscore 链接),然后逐个访问它们,并从每个访问过的链接中抓取一些信息。作为第一步,我想先抓取所有链接,逐个访问它们并获取页面的标题。问题是它总是打印相同的标题和相同的当前 URL(即初始 URL),尽管每次访问后显然应该是新的页面。在我看来,似乎是 'this' 关键字出了问题……
(请忽略对链接数量的限制:这段代码取自 CasperJS 在 GitHub 上的示例,我保留了这个限制,以免控制台输出过多。)
这是我的代码:
// Casper instance shared by every step of the script (verbose mode enabled)
var casper = require('casper').create({ verbose: true });

// List of URLs to visit, seeded with the page the crawl starts from
var links = ['http://www.basketball-reference.com/teams/GSW/2016_games.html'];

// Upper bound on the number of links visited, so the crawl cannot go on forever:
// casper.cli.get(0) coerced to a 32-bit integer, or 10 when that yields 0
var upTo = (casper.cli.get(0) | 0) || 10;

// Index into `links` of the next URL to visit
var currentLink = 0;
/**
 * Queues a step that runs searchLinks inside the current page, logs how many
 * links it returned and appends them to the shared `links` array.
 *
 * Must be invoked with `this` bound to the Casper instance, e.g.
 * `addLinks.call(casper, url)`.
 *
 * @param {string} link - URL of the page being scanned; only used in the log line.
 */
function addLinks(link) {
    this.then(function() {
        // evaluate() may hand back null (for instance when the page-side
        // function fails), so fall back to an empty list; otherwise
        // `found.length` below would throw and abort the whole run.
        var found = this.evaluate(searchLinks) || [];
        this.echo(found.length + " links found on " + link);
        links = links.concat(found);
    });
}
/**
 * Page-side function (passed to casper.evaluate): collects the URLs of every
 * <a> element matched by '#teams_games td:nth-child(5) a'.
 *
 * The resolved `href` property is returned instead of the raw attribute: the
 * attribute may hold a relative path, which cannot be opened on its own,
 * whereas the property is the absolute URL resolved against the document.
 *
 * @returns {string[]} Absolute URLs of the matching anchors, in document order.
 */
function searchLinks() {
    var anchors = document.querySelectorAll('#teams_games td:nth-child(5) a');
    var urls = Array.prototype.map.call(anchors, function(anchor) {
        return anchor.href;
    });
    // An anchor without an href yields an empty value; there is nothing to visit for it.
    return urls.filter(function(url) {
        return !!url;
    });
}
/**
 * Opens `link` and, after a 5000 ms wait, echoes the page title and the
 * current URL.
 *
 * Must be invoked with `this` bound to the Casper instance, e.g.
 * `start.call(casper, url)`.
 *
 * @param {string} link - URL to open.
 */
function start(link) {
    // Final step: report what is loaded at this point
    var report = function() {
        this.echo('Page title: ' + this.getTitle());
        this.echo('Current url: ' + this.getCurrentUrl());
    };
    // Runs once the page has been opened: wait before reporting
    var onOpened = function() {
        this.wait(5000, report);
    };
    this.start(link, onOpened);
}
/**
 * Completion callback handed to casper.run(): while there is another link to
 * visit and the `upTo` cap has not been reached, schedules the next link and
 * re-runs with itself as the callback; otherwise prints "All done." and exits.
 *
 * Expects `this` to be the Casper instance.
 */
function check() {
    var next = links[currentLink];

    // Nothing left to visit, or the cap has been reached: finish the run
    if (!next || !(currentLink < upTo)) {
        this.echo("All done.");
        this.exit();
        return;
    }

    this.echo('--- Link ' + currentLink + ' ---');
    start.call(this, next);
    addLinks.call(this, next);
    currentLink += 1;
    this.run(check);
}
// Initialise Casper without an initial URL, announce the run, then let
// check() drive the crawl as the run-completion callback.
casper.start();
casper.then(function announce() {
    this.echo("Starting");
});
casper.run(check);
如果已经有一个 URL 数组,可以用类似下面的代码遍历它,依次访问其中的每个 URL:
// Visit every entry of `urls` in turn: one thenOpen step is queued per URL
casper.each(urls, function visitEach(instance, target) {
    instance.thenOpen(target, function onOpened() {
        this.echo('Opening: ' + target);
        // Do Whatever
    });
});
显然这不会在页面上找到链接,但这是遍历一组已知 URL 的好方法。
我正在尝试从该网站 http://www.basketball-reference.com/teams/GSW/2016_games.html 中抓取所有特定类型的链接(boxscore 链接),然后逐个访问它们,并从每个访问过的链接中抓取一些信息。作为第一步,我想先抓取所有链接,逐个访问它们并获取页面的标题。问题是它总是打印相同的标题和相同的当前 URL(即初始 URL),尽管每次访问后显然应该是新的页面。在我看来,似乎是 'this' 关键字出了问题……(请忽略对链接数量的限制:这段代码取自 CasperJS 在 GitHub 上的示例,我保留了这个限制,以免控制台输出过多。)这是我的代码:
// Casper instance shared by every step of the script (verbose mode enabled)
var casper = require('casper').create({ verbose: true });

// List of URLs to visit, seeded with the page the crawl starts from
var links = ['http://www.basketball-reference.com/teams/GSW/2016_games.html'];

// Upper bound on the number of links visited, so the crawl cannot go on forever:
// casper.cli.get(0) coerced to a 32-bit integer, or 10 when that yields 0
var upTo = (casper.cli.get(0) | 0) || 10;

// Index into `links` of the next URL to visit
var currentLink = 0;
/**
 * Queues a step that runs searchLinks inside the current page, logs how many
 * links it returned and appends them to the shared `links` array.
 *
 * Must be invoked with `this` bound to the Casper instance, e.g.
 * `addLinks.call(casper, url)`.
 *
 * @param {string} link - URL of the page being scanned; only used in the log line.
 */
function addLinks(link) {
    this.then(function() {
        // evaluate() may hand back null (for instance when the page-side
        // function fails), so fall back to an empty list; otherwise
        // `found.length` below would throw and abort the whole run.
        var found = this.evaluate(searchLinks) || [];
        this.echo(found.length + " links found on " + link);
        links = links.concat(found);
    });
}
/**
 * Page-side function (passed to casper.evaluate): collects the URLs of every
 * <a> element matched by '#teams_games td:nth-child(5) a'.
 *
 * The resolved `href` property is returned instead of the raw attribute: the
 * attribute may hold a relative path, which cannot be opened on its own,
 * whereas the property is the absolute URL resolved against the document.
 *
 * @returns {string[]} Absolute URLs of the matching anchors, in document order.
 */
function searchLinks() {
    var anchors = document.querySelectorAll('#teams_games td:nth-child(5) a');
    var urls = Array.prototype.map.call(anchors, function(anchor) {
        return anchor.href;
    });
    // An anchor without an href yields an empty value; there is nothing to visit for it.
    return urls.filter(function(url) {
        return !!url;
    });
}
/**
 * Opens `link` and, after a 5000 ms wait, echoes the page title and the
 * current URL.
 *
 * Must be invoked with `this` bound to the Casper instance, e.g.
 * `start.call(casper, url)`.
 *
 * @param {string} link - URL to open.
 */
function start(link) {
    // Final step: report what is loaded at this point
    var report = function() {
        this.echo('Page title: ' + this.getTitle());
        this.echo('Current url: ' + this.getCurrentUrl());
    };
    // Runs once the page has been opened: wait before reporting
    var onOpened = function() {
        this.wait(5000, report);
    };
    this.start(link, onOpened);
}
/**
 * Completion callback handed to casper.run(): while there is another link to
 * visit and the `upTo` cap has not been reached, schedules the next link and
 * re-runs with itself as the callback; otherwise prints "All done." and exits.
 *
 * Expects `this` to be the Casper instance.
 */
function check() {
    var next = links[currentLink];

    // Nothing left to visit, or the cap has been reached: finish the run
    if (!next || !(currentLink < upTo)) {
        this.echo("All done.");
        this.exit();
        return;
    }

    this.echo('--- Link ' + currentLink + ' ---');
    start.call(this, next);
    addLinks.call(this, next);
    currentLink += 1;
    this.run(check);
}
// Initialise Casper without an initial URL, announce the run, then let
// check() drive the crawl as the run-completion callback.
casper.start();
casper.then(function announce() {
    this.echo("Starting");
});
casper.run(check);
如果已经有一个 URL 数组,可以用类似下面的代码遍历它,依次访问其中的每个 URL:
// Visit every entry of `urls` in turn: one thenOpen step is queued per URL
casper.each(urls, function visitEach(instance, target) {
    instance.thenOpen(target, function onOpened() {
        this.echo('Opening: ' + target);
        // Do Whatever
    });
});
显然这不会在页面上找到链接,但这是遍历一组已知 URL 的好方法。