casperjs:导入 json 并循环导出结果 json
casperjs: import json and cycle export result json
我需要在 casperjs 中导入一批链接(links),并把抓取到的 html 或其他结果导出为 json。但如果一个文件里有 100 万个链接,我就需要某种自动化的方式,比如用一个循环从 json(举例)中依次取出每个链接,再用 casperjs 提取 html,然后写入 json 文件或其他任何格式。下面是我的脚本,但其中的链接是直接写死在脚本里的,而不是从某个文件(db、json 或 csv)中读取。我该如何按需求修改这个脚本?
// CasperJS script: open the login page, submit the credentials, then open a
// second page on the same domain, save its HTML to 'my.html' and wait for a
// marker text to show up.
var casper = require('casper').create({
    // `verbose` and `logLevel` are Casper options rather than page settings,
    // so they have to sit at the top level of the options object to take effect.
    verbose: true,
    logLevel: "debug",
    pageSettings: {
        loadImages: true,
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
        javascriptEnabled: true,
        // NOTE(review): not a documented page setting as far as I can tell;
        // cookies are switched on through phantom.cookiesEnabled below.
        cookiesEnabled: true
    }
});
var fs = require('fs');
var x = require("casper").selectXPath;
phantom.cookiesEnabled = true;
phantom.javascriptEnabled = true;

// Step 1: load the login page.
casper.start().thenOpen("LINK_LOGIN", function() {
    console.log("Link opened...");
});

// Step 2: click the element matched by the XPath selector
// (presumably the link that reveals the login form).
casper.then(function() {
    casper.echo("clicking..");
    casper.click(x("/html/body/div[@id='whais']/ul[@id='undest']/li[@id='login-you']/a"));
});

// Step 3: type the credentials and submit the form.
casper.then(function() {
    console.log("Login...");
    this.sendKeys('input[id="login"]', 'USER');
    this.sendKeys('input[id="password"]', 'PASSWORD');
    casper.echo("click");
    casper.click('input[type="submit"][name="form"]');
    // Also trigger the send button from inside the page context.
    this.evaluate(function() {
        document.getElementById("button-send").click();
    });
});

// Step 4: open the target page and dump its HTML to disk.
// NOTE(review): the HTML is written before waitForText() runs; if the page
// content is filled in dynamically, the dump may need to move into the
// waitForText success callback.
casper.thenOpen("OTHER_LINK_SAME_DOMAIN", function() {
    console.log("page loading...");
    console.log("...write html");
    // fs.write opens, writes and closes in a single call, so no file handle
    // is left open if the write fails.
    fs.write('my.html', this.getHTML(), 'w');
}).waitForText("how are you?", function() {
    this.echo('Found the answer.');
}, function() {
    this.echo('not found answer, time out!');
}, 60000);

casper.run();
谢谢!!!
I have 1 million links in one file.
首先你应该读取那个文件的内容,可以看看 fs.read。
I need something automatic, like a loop that calls each link from a JSON file (for example), then extracts the HTML with casperjs and writes it to a JSON file or whatever.
使用循环打开每个url,然后保存。示例代码:
// Save the HTML of every URL listed in a local file.
// Assumes `casper` (a Casper instance) and `fs` (the PhantomJS fs module)
// are already in scope.

// Load the URL list from the local file. 'links.json' is only an example
// name; the file is expected to hold a JSON array of URL strings.
var url_list = JSON.parse(fs.read('links.json'));

casper.start();

// Queue one navigation step per URL. forEach hands each callback its own
// `position`, so every page is saved under the index of its URL in the list.
url_list.forEach(function (url, position) {
    casper.thenOpen(url, function () {
        fs.write(position + '.html', this.getHTML(), 'w');
    });
});

// The queued steps only execute once run() is called.
casper.run();
我需要在 casperjs 中导入一批链接(links),并把抓取到的 html 或其他结果导出为 json。但如果一个文件里有 100 万个链接,我就需要某种自动化的方式,比如用一个循环从 json(举例)中依次取出每个链接,再用 casperjs 提取 html,然后写入 json 文件或其他任何格式。下面是我的脚本,但其中的链接是直接写死在脚本里的,而不是从某个文件(db、json 或 csv)中读取。我该如何按需求修改这个脚本?
// CasperJS script: open the login page, submit the credentials, then open a
// second page on the same domain, save its HTML to 'my.html' and wait for a
// marker text to show up.
var casper = require('casper').create({
    // `verbose` and `logLevel` are Casper options rather than page settings,
    // so they have to sit at the top level of the options object to take effect.
    verbose: true,
    logLevel: "debug",
    pageSettings: {
        loadImages: true,
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
        javascriptEnabled: true,
        // NOTE(review): not a documented page setting as far as I can tell;
        // cookies are switched on through phantom.cookiesEnabled below.
        cookiesEnabled: true
    }
});
var fs = require('fs');
var x = require("casper").selectXPath;
phantom.cookiesEnabled = true;
phantom.javascriptEnabled = true;

// Step 1: load the login page.
casper.start().thenOpen("LINK_LOGIN", function() {
    console.log("Link opened...");
});

// Step 2: click the element matched by the XPath selector
// (presumably the link that reveals the login form).
casper.then(function() {
    casper.echo("clicking..");
    casper.click(x("/html/body/div[@id='whais']/ul[@id='undest']/li[@id='login-you']/a"));
});

// Step 3: type the credentials and submit the form.
casper.then(function() {
    console.log("Login...");
    this.sendKeys('input[id="login"]', 'USER');
    this.sendKeys('input[id="password"]', 'PASSWORD');
    casper.echo("click");
    casper.click('input[type="submit"][name="form"]');
    // Also trigger the send button from inside the page context.
    this.evaluate(function() {
        document.getElementById("button-send").click();
    });
});

// Step 4: open the target page and dump its HTML to disk.
// NOTE(review): the HTML is written before waitForText() runs; if the page
// content is filled in dynamically, the dump may need to move into the
// waitForText success callback.
casper.thenOpen("OTHER_LINK_SAME_DOMAIN", function() {
    console.log("page loading...");
    console.log("...write html");
    // fs.write opens, writes and closes in a single call, so no file handle
    // is left open if the write fails.
    fs.write('my.html', this.getHTML(), 'w');
}).waitForText("how are you?", function() {
    this.echo('Found the answer.');
}, function() {
    this.echo('not found answer, time out!');
}, 60000);

casper.run();
谢谢!!!
I have 1 million links in one file.
首先你应该读取那个文件的内容,可以看看 fs.read。
I need something automatic, like a loop that calls each link from a JSON file (for example), then extracts the HTML with casperjs and writes it to a JSON file or whatever.
使用循环打开每个url,然后保存。示例代码:
// Save the HTML of every URL listed in a local file.
// Assumes `casper` (a Casper instance) and `fs` (the PhantomJS fs module)
// are already in scope.

// Load the URL list from the local file. 'links.json' is only an example
// name; the file is expected to hold a JSON array of URL strings.
var url_list = JSON.parse(fs.read('links.json'));

casper.start();

// Queue one navigation step per URL. forEach hands each callback its own
// `position`, so every page is saved under the index of its URL in the list.
url_list.forEach(function (url, position) {
    casper.thenOpen(url, function () {
        fs.write(position + '.html', this.getHTML(), 'w');
    });
});

// The queued steps only execute once run() is called.
casper.run();