抓取重定向的页面

scraping a page that redirects

我尝试抓取一个简单的页面(需要 cheerio 和 request 模块): https://www.ishares.com/uk/individual/en/products/251824/

代码失败。我认为这是因为:要到达上述页面,必须先在前一个页面上选择 "individual" 或 "institutional",所以请求被重定向了。

我尝试了 url 的不同变体,但都失败了。

如何使用 node.js 获取原始 HTML?

代码如下:

var express = require('express');
var path = require('path');
var request = require('request');
var cheerio = require('cheerio');   // fast flexible implement of jQuery for server.
var fs = require('fs');

// Port number and Express application instance (neither is used further in
// the visible part of this script).
var port = 8000;
var app = express();

// Millisecond timestamps of notable events; slot 0 is the program start.
var timeLog = [Date.now()];

var startedAt = new Date();
console.log('program initiated at time: ' + startedAt);


// example 1:  pull the webpage and print to console
// Several URL variants were tried; only the last assignment is in effect.
var url ="https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf";
url = "https://www.ishares.com/uk/individual/en/products/251824/";
url="https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";


/**
 * Fetches `url`, extracts the text of the `.col-distYield` element and logs
 * it together with the elapsed time since program start.
 *
 * The callback receives (err, resp, body) from `request`. `err` is checked
 * first so that a failed request is reported instead of being handed to the
 * HTML parser without a body.
 */
request(url,function functionName(err,resp,body) {
  if (err) {
    console.log(err);
    return;
  }

  var $ = cheerio.load(body);

  var distYield = $('.col-distYield');
  var distYieldText = distYield.text();
  console.log('we got to line 24');
  console.log(distYieldText);

  // Slot 2 marks the moment the page data was captured and parsed.
  timeLog[2] = Date.now();
  console.log('data capture time: '+(timeLog[2] - timeLog[0])/1000+' seconds');

  //console.log(body);
  console.log('the body was written: success');
});

// Shared handler for stream failures. pipe() does not forward errors from
// the source stream to the destination, so both ends need their own listener;
// without one, an 'error' event is thrown as an uncaught exception.
function logStreamError(err) {
  console.log(err);
}

// example 2:  download webpage and save file
var destination = fs.createWriteStream('./downloads/iSharesSEMB.html');
destination.on('error', logStreamError);
request(url)
  .on('error', logStreamError)
  .pipe(destination);


// example 3: same download, additionally reporting when the file is written
var destination = fs.createWriteStream('./downloads/iSharesSEMB2.html');
request(url)
  .on('error', logStreamError)
  .pipe(destination)
  .on("finish",function () {
    console.log('done');
  })
  .on('error', logStreamError);



// Slot 1 marks the end of the synchronous part of the script; the requests
// started above may still be in flight at this point.
timeLog[1] = Date.now();
var syncRunSeconds = (timeLog[1] - timeLog[0]) / 1000;
var finishedAt = new Date();
console.log('program completed at time: ' + finishedAt);
console.log('Asynchronous program run time: ' + syncRunSeconds + ' seconds');

好的,我让它工作了。我为 request 启用了 cookie 支持,但随后陷入了重定向循环。把请求封装进一个 Promise 之后就可以了。这里只有相关的 HTML 请求部分:

const request = require('request'),
    cheerio = require('cheerio');


// Page to scrape, including the query parameters used in this example.
const url = "https://www.ishares.com/uk/individual/en/products/251824/ishares-jp-morgan-emerging-markets-bond-ucits-etf?siteEntryPassthrough=true&locale=en_GB&userType=individual";

// Options passed to `request`; `jar: true` enables its cookie support.
// Declared with `const` so it is not created as an implicit global
// (an undeclared assignment throws in strict mode / ES modules).
const options = {
    jar: true
};

/**
 * Requests `url` (using the shared `options`) and resolves with the cheerio
 * selection for the `.col-distYield` element of the returned HTML.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<*>} Resolves with the result of `$('.col-distYield')`;
 *   rejects with the request error, or with whatever the HTML parser throws.
 */
const getDistYield = url => {
    return new Promise((resolve, reject) => {
        request(url, options, function(err,resp,body) {
            // Stop here on failure: there is no body to parse, and falling
            // through would throw inside this asynchronous callback.
            if (err) {
                reject(err);
                return;
            }
            try {
                let $ = cheerio.load(body);
                resolve($('.col-distYield'));
            } catch (parseErr) {
                // A throw in this callback would otherwise escape the promise.
                reject(parseErr);
            }
        })
    })
}

// Prints the text content of the selection that getDistYield resolved with.
const printTagText = (tag) => {
    console.log(tag.text())
}

// Reports a failed fetch, or an error thrown while printing.
const reportFailure = (e) => {
    console.error(e)
}

getDistYield(url).then(printTagText).catch(reportFailure)

输出:

Distribution Yield
The distribution yield represents the ratio of distributed income over the last 12 months to the fund’s current NAV.
as of 20-Feb-2018
4.82

此外,请注意我使用了您提供的最后一个 URL。

我希望这对你有用:)

修改了解析部分,以便只获取嵌套元素中的数值(而不是整段文本):

// Replacement for the resolve(...) line inside getDistYield: selects only the
// <span> that is the second child directly under the .col-distYield element.
resolve($('.col-distYield > span:nth-child(2)'));