phantomJS爬取多页面获取错序执行

Question

我正在使用 PhantomJS 来抓取从输入流生成的页面。但是 resultjson 和 Headers 始终是 {} 即使状态是 'success'（第一个 url 可以，但通常第二个开始是 {}）。谁能告诉我有什么问题它。

var page = require('webpage').create();
var system = require('system');
var args = system.args;
var fs = require('fs');
var resultjson = {};
var Headers = {};
var urlType, url, path;
var isInUse = false;


function pageInit(page){
    page.settings.resourceTimeout = 500000;

    page.onResourceTimeout = function (request) {
        console.log("fail");
    }

    page.onConsoleMessage = function (msg, lineNum, sourceID) {
        //console.log(msg + "at line " + lineNum);
    }


    page.onResourceReceived = function (response) {
        if (response.url === url && response.stage === "end") {
            if (response.status === 301 && response.redirectURL !== null) {
                url = response.redirectURL;
            }
            else {
                resultjson.Id = response.id;
                resultjson.Url = response.url;
                resultjson.Time = response.time;
                resultjson.BodySize = response.bodySize;
                resultjson.ContentType = response.contentType;
                resultjson.RedirectURL = response.redirectURL;
                resultjson.Stage = response.stage;
                resultjson.Status = response.status;
                resultjson.StatusText = response.statusText;
                response.headers.forEach(function (header) {
                    Headers[header.name] = header.value;
                });
            }
        }
    };
}
function GetOtherPage(url, path) {
    page.open(url, function (status) {
        console.log(JSON.stringify(resultjson));
        console.log(JSON.stringify(Headers));
        resultjson = {};
        Headers = {};
        if (status !== "success") {
            //fs.write(path, page.content, 'w');
            console.log("fail");
        }
        else {
            fs.write(path, page.content, 'w');
            console.log("success");
        }
        DoTask();
    });
}
function DoTask() {
    page.close();
    page = require('webpage').create();
    pageInit(page);
    //page.content = "";
    urlType = parseInt(system.stdin.readLine());
    if (urlType === -1)
        phantom.exit();
    url = system.stdin.readLine();
    path = system.stdin.readLine();
    if (urlType === 3) {
        GetOtherPage(url, path);
    }
}
DoTask();

结果：

3(input)
https://www.google.com(input)
output.html(input)
{}
{}
success
3(input)
https://www.google.com(input)
output.html(input)
{}
{}
success

更新：似乎当您尝试通过一个 phantomJS 进程获取相同的 url 时，phantomJS 会将它们放入您的磁盘缓存中，因此它不会请求 url 第二个时间，所以 resultjson 和 Headers 是 {}.

Answer 1

属性 response.url 是 normalized，因此它会在键入的那个后面添加一个 '/'。

由于比较 response.url === url 为假，因此它永远不会进入 if。

使用 https://www.google.com/ 进行测试（尾随 '/'），得到以下输出：

3
https://www.google.com/
output.html
{"Id":1,"Url":"https://www.google.com/","Time":"2016-07-29T03:56:24.294Z","ContentType":"text/html; charset=UTF-8","RedirectU
RL":"https://www.google.com.br/?gfe_rd=cr&ei=BNSaV_KoL6KB8QfJrYPoDA","Stage":"end","Status":302,"StatusText":"Found"}
{"Cache-Control":"private","Content-Type":"text/html; charset=UTF-8","Location":"https://www.google.com.br/?gfe_rd=cr&ei=BNSa
V_KoL6KB8QfJrYPoDA","Content-Length":"263","Date":"Fri, 29 Jul 2016 03:56:52 GMT","Alternate-Protocol":"443:quic","Alt-Svc":"
quic=\":443\"; ma=2592000; v=\"36,35,34,33,32,31,30,29,28,27,26,25\""}
success

_{使用 phantomjs v2.1.1 测试}

phantomJS爬取多页面获取错序执行

phantomJS crawling multi pages get misorder execution

javascript

web-crawler

phantomjs