NightmareJS - Web Crawler needs to iterate over JSON data

I'm building a web crawler that will cover 200+ sites. My current code runs off an external JSON file I've built for a dozen or so sites. Example:

[
  {
    "company": "My Company",
    "url": "http://example.com/jobs/",
    "query": "div.job-listings>dt a",
    "link": "div.job-listings>dt a"
  },
  {
    "company": "Another Company",
    "url": "http://anothercompany.com/careers/",
    "query": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a",
    "link": ".clearfix~ .col-sm-6+ .col-sm-6 a , .col-sm-6:nth-child(4) a"
  }
]

When I try async.each, it logs all of the original objects at the top of the function before attempting to enter the Nightmare instance, then returns the error Nothing responds to "goto". When I tried async.eachSeries, it prints the correct results but stops after the first iteration.

var async = require('async');
var data = require('./input.json')
var Nightmare = require('nightmare');
var nightmare = Nightmare({ show: false })

function crawl(data, cb) {
  console.log(data) // with async.each this logs every item up front, then the error occurs
  var nightmare = new Nightmare()
  nightmare
    .goto(data.url) // go to the URL specified in the JSON
    .wait(data.query) // wait until the CSS selector loads
    .evaluate(function (data) {
      var positionsArr = []
      var obj = {}
      obj.company = data.company
      var query = document.querySelectorAll(data.query)
      var link = document.querySelectorAll(data.link)
      /* query and link hold all elements matching the selectors;
         iterate through them, appending the text (innerText) of each
         element, plus the job URL, to obj */
      var i
      for (i = 0; i < query.length; i++) {
        var positionsObj = {}
        positionsObj.title = query[i].innerText.trim()
        // if each position has its own page, link to it
        if (data.link !== null) {
          positionsObj.url = link[i].href
        } else {
          positionsObj.url = data.url
        }
        positionsArr.push(positionsObj)
      }
      obj.positions = positionsArr
      return obj
    }, data)
    .end()
    .then(function (obj) {
      console.log(obj)
      console.log('done')
    })
    .catch(function (error) {
      console.error('error', error);
    });
}


async.eachSeries(data, crawl, function (err){
    console.log('done!');
})

How can I make this work without writing a separate file for each site? Or is there a better way to scrape this many sites?

Source code

You have to use the callback (cb) if you want the second step (and the ones after it) to run:

.end()
.then(function (obj) {
    console.log(obj);
    console.log('done');
    cb();
})
.catch(function (error) {
    console.error('error', error);
    cb(error);
});
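
For context, here is a minimal sketch of how crawl could look with the callback wired in. It reuses the requires and data shape from the question and keeps the per-site Nightmare instance from the original code; the evaluate body is condensed, and the only real change is calling cb() in then/catch:

function crawl(data, cb) {
  var nightmare = new Nightmare()
  nightmare
    .goto(data.url)
    .wait(data.query)
    .evaluate(function (data) {
      // same scraping logic as in the question, condensed
      var positions = []
      var query = document.querySelectorAll(data.query)
      var link = document.querySelectorAll(data.link)
      for (var i = 0; i < query.length; i++) {
        positions.push({
          title: query[i].innerText.trim(),
          url: data.link !== null ? link[i].href : data.url
        })
      }
      return { company: data.company, positions: positions }
    }, data)
    .end()
    .then(function (obj) {
      console.log(obj)
      cb() // tell async.eachSeries this item is done, so it moves to the next site
    })
    .catch(function (error) {
      console.error('error', error)
      cb(error) // propagate the error to the final callback
    });
}

With the callback in place, async.eachSeries waits for each site to finish before starting the next one, and the final callback fires once every entry in the JSON file has been processed (or as soon as one of them passes an error).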