变量在 thenOpen 函数中有不同的值
Variable has a different value inside the thenOpen function
我正在使用 CasperJS 构建网络抓取工具。我正在尝试抓取 3 页。每页有15个link我要抓取。
在下面的代码片段中,我要抓取的所有 link 都收集在 allLinks
数组中。它的长度是 45。当我记录数组时,我看到了我想要正确抓取的所有 links。然后我遍历每个 link 并访问该页面。当我记录 link
(包含每个 link)时,我仍然可以按预期看到所有 45 个 link。但是,当我在 thenOpen
函数中记录 link
时,我只能从第一页看到 links。所以,它只记录 15 links。它不会记录第 2 页和第 3 页的任何 links。
casper.then(function(){
this.each(allLinks,function(self,link){
console.log("Getting all the links that need to be visited");
console.log(allLinks);
console.log("Getting each link");
console.log(link);
this.thenOpen(link,function(a){
console.log("Inside function that extracts data");
console.log(link);
});
});
});
完整代码如下
var casper = require('casper').create();
var url = casper.cli.get(0);
console.log(url);
var page2 = casper.cli.get(1);
console.log(page2);
jsonObj = { data : [] };
//var url = 'http://www.houzz.com/professionals/c/Nashville--TN/p/15';
var webPage = require('webpage');
zapTitle = [];
zapContact = [];
zapServices = [];
var page = webPage.create();
var nextBtn = "a.navigation-button.next";
var allLinks = [];
casper.start(url);
casper.waitForSelector(nextBtn, processPage);
casper.run();
function processPage() {
for (var i = 1; i <= page2; i = i + 1) {
this.then(function(){
console.log(i);
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);
console.log(allLinks);
if (!this.exists(nextBtn)) {
return;
}
this.thenClick(nextBtn).then(function() {
this.echo(this.getCurrentUrl());
});
});
};
}
function getPageData(){
//return document.title;
var links = document.getElementsByClassName('pro-title');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
}
casper.then(function(){
this.each(allLinks,function(self,link){
console.log("Inside the each function");
console.log(link);
this.thenOpen(link,function(a){
console.log("Inside function that extracts data");
console.log(link);
var description = this.fetchText('div.profile-about div:nth-child(1)');
description = description.replace(/[\t\n]/g,"");
var name = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(2) div.info-list-text');
name = name.replace(/[<b>Contact</b>: ]/g,"");
jsonObj.data.push({
"title" : this.fetchText('a.profile-full-name'),
"contact" : this.fetchText('div.profile-about div:nth-child(1)'),
"services" : this.getHTML('div.info-list-text span:nth-child(2) span'),
"name" : name,
"location" : this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span'),
"description" : description,
"reviews" : this.getHTML('div.pro-rating a span.pro-review-string span')
});
casper.open('https://zapier.com/hooks/catch/29s1m6/', {
method: 'post',
data: {
"title" : this.fetchText('a.profile-full-name'),
"contact" : this.getHTML('div.pro-contact-methods span.pro-contact-text:nth-child(2)'),
"services" : this.getHTML('div.info-list-text span:nth-child(2) span'),
"name" : name,
"location" : this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span'),
"description" : description,
"reviews" : this.getHTML('div.pro-rating a span.pro-review-string span')
}
});
}).then(function() {
console.log(jsonObj.data.length);
//console.log(jsonObj);
if (jsonObj.data.length == 13) {
console.log(jsonObj.data[13].title);
}
/*for(var i = 0; i < jsonObj.data.length; i = i + 1 ) {
console.log(i);
console.log("zaptitle");
//zapTitle.push(jsonObj.data[i]);
console.log(jsonObj.data[i].title);
//}
}*/
//require('utils').dump(jsonObj.data[2].title);
//require('utils').dump(jsonObj);
//require('utils').dump(jsonObj.data[8]);
//require('utils').dump(zapTitle);
for(var i = 0; i < jsonObj.data.length; i = i + 1 ) {
zapServices.push(jsonObj.data[i].services);
}
/*casper.open('https://zapier.com/hooks/catch/29s1m6/', {
method: 'post',
data: {"title" : zapTitle,
//"contact" : zapContact,
"services" : zapServices
}*/
});
});
});
casper.getHTML(selector)
如果找不到您要查找的元素 (code reference),则会失败并退出脚本。当然,如果你使用的是 PhantomJS 2.0 或 2.1,这个错误是隐藏的。
罪魁祸首是 12th page 上的 "location" : this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span'),
,因为它不存在。在尝试访问它之前,您需要检查选择器是否存在(例如 casper.exists(selector)
)。
这就是 little console.log
debugging 可以做到的。
我正在使用 CasperJS 构建网络抓取工具。我正在尝试抓取 3 页。每页有15个link我要抓取。
在下面的代码片段中,我要抓取的所有 link 都收集在 allLinks
数组中。它的长度是 45。当我记录数组时,我看到了我想要正确抓取的所有 links。然后我遍历每个 link 并访问该页面。当我记录 link
(包含每个 link)时,我仍然可以按预期看到所有 45 个 link。但是,当我在 thenOpen
函数中记录 link
时,我只能从第一页看到 links。所以,它只记录 15 links。它不会记录第 2 页和第 3 页的任何 links。
casper.then(function(){
this.each(allLinks,function(self,link){
console.log("Getting all the links that need to be visited");
console.log(allLinks);
console.log("Getting each link");
console.log(link);
this.thenOpen(link,function(a){
console.log("Inside function that extracts data");
console.log(link);
});
});
});
完整代码如下
var casper = require('casper').create();
var url = casper.cli.get(0);
console.log(url);
var page2 = casper.cli.get(1);
console.log(page2);
jsonObj = { data : [] };
//var url = 'http://www.houzz.com/professionals/c/Nashville--TN/p/15';
var webPage = require('webpage');
zapTitle = [];
zapContact = [];
zapServices = [];
var page = webPage.create();
var nextBtn = "a.navigation-button.next";
var allLinks = [];
casper.start(url);
casper.waitForSelector(nextBtn, processPage);
casper.run();
function processPage() {
for (var i = 1; i <= page2; i = i + 1) {
this.then(function(){
console.log(i);
var pageData = this.evaluate(getPageData);
allLinks = allLinks.concat(pageData);
console.log(allLinks);
if (!this.exists(nextBtn)) {
return;
}
this.thenClick(nextBtn).then(function() {
this.echo(this.getCurrentUrl());
});
});
};
}
function getPageData(){
//return document.title;
var links = document.getElementsByClassName('pro-title');
links = Array.prototype.map.call(links,function(link){
return link.getAttribute('href');
});
return links;
}
casper.then(function(){
this.each(allLinks,function(self,link){
console.log("Inside the each function");
console.log(link);
this.thenOpen(link,function(a){
console.log("Inside function that extracts data");
console.log(link);
var description = this.fetchText('div.profile-about div:nth-child(1)');
description = description.replace(/[\t\n]/g,"");
var name = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(2) div.info-list-text');
name = name.replace(/[<b>Contact</b>: ]/g,"");
jsonObj.data.push({
"title" : this.fetchText('a.profile-full-name'),
"contact" : this.fetchText('div.profile-about div:nth-child(1)'),
"services" : this.getHTML('div.info-list-text span:nth-child(2) span'),
"name" : name,
"location" : this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span'),
"description" : description,
"reviews" : this.getHTML('div.pro-rating a span.pro-review-string span')
});
casper.open('https://zapier.com/hooks/catch/29s1m6/', {
method: 'post',
data: {
"title" : this.fetchText('a.profile-full-name'),
"contact" : this.getHTML('div.pro-contact-methods span.pro-contact-text:nth-child(2)'),
"services" : this.getHTML('div.info-list-text span:nth-child(2) span'),
"name" : name,
"location" : this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span'),
"description" : description,
"reviews" : this.getHTML('div.pro-rating a span.pro-review-string span')
}
});
}).then(function() {
console.log(jsonObj.data.length);
//console.log(jsonObj);
if (jsonObj.data.length == 13) {
console.log(jsonObj.data[13].title);
}
/*for(var i = 0; i < jsonObj.data.length; i = i + 1 ) {
console.log(i);
console.log("zaptitle");
//zapTitle.push(jsonObj.data[i]);
console.log(jsonObj.data[i].title);
//}
}*/
//require('utils').dump(jsonObj.data[2].title);
//require('utils').dump(jsonObj);
//require('utils').dump(jsonObj.data[8]);
//require('utils').dump(zapTitle);
for(var i = 0; i < jsonObj.data.length; i = i + 1 ) {
zapServices.push(jsonObj.data[i].services);
}
/*casper.open('https://zapier.com/hooks/catch/29s1m6/', {
method: 'post',
data: {"title" : zapTitle,
//"contact" : zapContact,
"services" : zapServices
}*/
});
});
});
casper.getHTML(selector)
如果找不到您要查找的元素 (code reference),则会失败并退出脚本。当然,如果你使用的是 PhantomJS 2.0 或 2.1,这个错误是隐藏的。
罪魁祸首是 12th page 上的 "location" : this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span'),
,因为它不存在。在尝试访问它之前,您需要检查选择器是否存在(例如 casper.exists(selector)
)。
这就是 little console.log
debugging 可以做到的。