如何使用 phantom-crawler 将 HTML 源代码打印到控制台
How to print html source to console with phantom-crawler
我刚刚下载并安装了用于 Node.js 的 phantom-crawler,并将以下脚本复制粘贴到名为 crawler.js 的文件中:
var Crawler = require('phantom-crawler');

// Can be initialized with an optional options object.
var crawler = new Crawler();

// `queue` is an array of URLs to be crawled.
// `crawler.fetch(url)` can be used instead of pushing a URL and crawling it.
crawler.queue.push('https://google.com/');

/**
 * Reads one property from a phantomjs page and then closes the page.
 * Pages are like tabs, so the page is closed whether or not the read succeeds.
 *
 * @param {object} page - a page resolved from `crawler.crawl()`
 * @param {string} property - name of the page property to read
 * @returns {Promise} settles with the outcome of `page.getPromise(property)`
 */
function readPropertyAndClose(page, property) {
  // The `Promise` suffix makes the call return a promise instead of taking a callback.
  return page.getPromise(property).then(
    function (value) {
      page.close();
      return value;
    },
    function (err) {
      page.close();
      throw err;
    }
  );
}

/**
 * Kills the phantomjs bridge.
 *
 * @returns {Promise} settles after `exit()` has been called on the phantom instance
 */
function exitPhantom() {
  return crawler.phantom.then(function (p) {
    p.exit();
  });
}

// Extract plainText out of each phantomjs page.
Promise.all(crawler.crawl())
  .then(function (pages) {
    return Promise.all(pages.map(function (page) {
      return readPropertyAndClose(page, 'plainText');
    }));
  })
  .then(function (texts) {
    // texts = array of plain text from the website bodies
    // (also supports ajax requests)
    console.log(texts);
  })
  .then(exitPhantom, function (err) {
    // Shut phantomjs down on failure too, then surface the original error.
    return exitPhantom().then(function () {
      throw err;
    });
  })
  .catch(function (err) {
    console.error(err);
    process.exitCode = 1;
  });
我想将完整的 html 源代码(在本例中为 google 页面)打印到控制台。
我搜索了很多,但没有找到类似的东西,那我该怎么做呢?
获取 content 的 Promise,而不是 plainText 的 Promise。
phantom-crawler 模块使用 node-phantom-simple 模块,而后者又使用 phantomjs。
您可以在 phantomjs wiki 中找到可以调用的属性列表。
var Crawler = require('phantom-crawler');

// Can be initialized with an optional options object.
var crawler = new Crawler();

// `queue` is an array of URLs to be crawled.
// `crawler.fetch(url)` can be used instead of pushing a URL and crawling it.
crawler.queue.push('https://google.com/');

/**
 * Reads one property from a phantomjs page and then closes the page.
 * Pages are like tabs, so the page is closed whether or not the read succeeds.
 *
 * @param {object} page - a page resolved from `crawler.crawl()`
 * @param {string} property - name of the page property to read
 * @returns {Promise} settles with the outcome of `page.getPromise(property)`
 */
function readPropertyAndClose(page, property) {
  // The `Promise` suffix makes the call return a promise instead of taking a callback.
  return page.getPromise(property).then(
    function (value) {
      page.close();
      return value;
    },
    function (err) {
      page.close();
      throw err;
    }
  );
}

/**
 * Kills the phantomjs bridge.
 *
 * @returns {Promise} settles after `exit()` has been called on the phantom instance
 */
function exitPhantom() {
  return crawler.phantom.then(function (p) {
    p.exit();
  });
}

// Extract the `content` property (the page's HTML source) out of each phantomjs page.
Promise.all(crawler.crawl())
  .then(function (pages) {
    return Promise.all(pages.map(function (page) {
      return readPropertyAndClose(page, 'content');
    }));
  })
  .then(function (allHtml) {
    // allHtml = array with the `content` value of every crawled page
    // (also supports ajax requests)
    console.log(allHtml);
  })
  .then(exitPhantom, function (err) {
    // Shut phantomjs down on failure too, then surface the original error.
    return exitPhantom().then(function () {
      throw err;
    });
  })
  .catch(function (err) {
    console.error(err);
    process.exitCode = 1;
  });
我刚刚下载并安装了用于 Node.js 的 phantom-crawler,并将以下脚本复制粘贴到名为 crawler.js 的文件中:
var Crawler = require('phantom-crawler');
// Can be initialized with an optional options object.
var crawler = new Crawler();

// `queue` is an array of URLs to be crawled.
// `crawler.fetch(url)` can be used instead of pushing a URL and crawling it.
crawler.queue.push('https://google.com/');

/**
 * Reads one property from a phantomjs page and then closes the page.
 * Pages are like tabs, so the page is closed whether or not the read succeeds.
 *
 * @param {object} page - a page resolved from `crawler.crawl()`
 * @param {string} property - name of the page property to read
 * @returns {Promise} settles with the outcome of `page.getPromise(property)`
 */
function readPropertyAndClose(page, property) {
  // The `Promise` suffix makes the call return a promise instead of taking a callback.
  return page.getPromise(property).then(
    function (value) {
      page.close();
      return value;
    },
    function (err) {
      page.close();
      throw err;
    }
  );
}

/**
 * Kills the phantomjs bridge.
 *
 * @returns {Promise} settles after `exit()` has been called on the phantom instance
 */
function exitPhantom() {
  return crawler.phantom.then(function (p) {
    p.exit();
  });
}

// Extract plainText out of each phantomjs page.
Promise.all(crawler.crawl())
  .then(function (pages) {
    return Promise.all(pages.map(function (page) {
      return readPropertyAndClose(page, 'plainText');
    }));
  })
  .then(function (texts) {
    // texts = array of plain text from the website bodies
    // (also supports ajax requests)
    console.log(texts);
  })
  .then(exitPhantom, function (err) {
    // Shut phantomjs down on failure too, then surface the original error.
    return exitPhantom().then(function () {
      throw err;
    });
  })
  .catch(function (err) {
    console.error(err);
    process.exitCode = 1;
  });
我想将完整的 html 源代码(在本例中为 google 页面)打印到控制台。
我搜索了很多,但没有找到类似的东西,那我该怎么做呢?
获取 content 的 Promise,而不是 plainText 的 Promise。
phantom-crawler 模块使用 node-phantom-simple 模块,而后者又使用 phantomjs。
您可以在 phantomjs wiki 中找到可以调用的属性列表。
var Crawler = require('phantom-crawler');

// Can be initialized with an optional options object.
var crawler = new Crawler();

// `queue` is an array of URLs to be crawled.
// `crawler.fetch(url)` can be used instead of pushing a URL and crawling it.
crawler.queue.push('https://google.com/');

/**
 * Reads one property from a phantomjs page and then closes the page.
 * Pages are like tabs, so the page is closed whether or not the read succeeds.
 *
 * @param {object} page - a page resolved from `crawler.crawl()`
 * @param {string} property - name of the page property to read
 * @returns {Promise} settles with the outcome of `page.getPromise(property)`
 */
function readPropertyAndClose(page, property) {
  // The `Promise` suffix makes the call return a promise instead of taking a callback.
  return page.getPromise(property).then(
    function (value) {
      page.close();
      return value;
    },
    function (err) {
      page.close();
      throw err;
    }
  );
}

/**
 * Kills the phantomjs bridge.
 *
 * @returns {Promise} settles after `exit()` has been called on the phantom instance
 */
function exitPhantom() {
  return crawler.phantom.then(function (p) {
    p.exit();
  });
}

// Extract the `content` property (the page's HTML source) out of each phantomjs page.
Promise.all(crawler.crawl())
  .then(function (pages) {
    return Promise.all(pages.map(function (page) {
      return readPropertyAndClose(page, 'content');
    }));
  })
  .then(function (allHtml) {
    // allHtml = array with the `content` value of every crawled page
    // (also supports ajax requests)
    console.log(allHtml);
  })
  .then(exitPhantom, function (err) {
    // Shut phantomjs down on failure too, then surface the original error.
    return exitPhantom().then(function () {
      throw err;
    });
  })
  .catch(function (err) {
    console.error(err);
    process.exitCode = 1;
  });