http GET 返回的 HTML 元素和 innerHTML 与在浏览器 DOM 元素中看到的完全不同
http GET returning TOTALLY DIFFERENT HTML elements and innerHTML from what's seen on browser's DOM Elements
目标:
将整个 HTML 页面(此 特定 url 的) 导出到 'newFileOnLocalDisk.txt' 以供离线数据研究。
遇到的问题:
能够将此特定 url 的 HTML 导出到 'newFileOnLocalDisk.txt' ... 但是 ... 导出的 HTML 元素和值与浏览器 DOM 中显示的完全不同。
const http = require('http');
const url = 'http://www.bursamalaysia.com/market/securities/equities/prices/#/?filter=BS02&page=3';
/**
 * Issues an HTTP GET request and resolves with the complete response body.
 *
 * The body is returned exactly as the server sent it; nothing in it is
 * executed or rendered.
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<string>} The response body decoded as UTF-8. Rejects if
 *   the request or the response stream emits an error.
 */
const httpGet = url => {
  return new Promise((resolve, reject) => {
    http.get(url, res => {
      res.setEncoding('utf8');
      let body = '';
      res.on('data', chunk => body += chunk);
      res.on('end', () => resolve(body));
      // Errors raised while the body is still streaming are emitted on the
      // response, not the request; without this listener they would never
      // reach the promise.
      res.on('error', reject);
    }).on('error', reject);
  });
};
/**
 * Downloads the page at `url` and prints the raw response body.
 *
 * @returns {Promise<void>} Settles once the body has been printed; rejects if
 *   the download fails.
 */
async function exportToLocalHTMLfile () {
  const body = await httpGet(url);
  /*********************************************************
   * What's returned to console.log is totally different
   * from what is in browser's DOM Elements / inspect element
   *
   * Question #1:
   * How can we export "as-is or in raw HTML" directly from
   * less-friendly pages like this?
   *
   *********************************************************/
  console.log(body); //<-- will be later exported to TEXT file. Codes removed to keep this article leaner.
}

// Handle rejection explicitly so a failed download is reported instead of
// becoming an unhandled promise rejection.
exportToLocalHTMLfile().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
console.log(body)
返回了以下摘录。这与浏览器的 DOM 元素
完全不同
<html><head>
<meta http-equiv="Pragma" content="no-cache"/>
<meta http-equiv="Expires" content="-1"/>
<meta http-equiv="CacheControl" content="no-cache"/>
<noscript>Please enable JavaScript to view the page content.</noscript>
<APM_DO_NOT_TOUCH>
<script language="javascript">
//<![CDATA[
(function(){
var securemsg;
var packmsg;
var CryptoUtils;
try{(function(){try{var _S,IS,lS=1,LS=1,ZS=1,__=1,i_=1,I_=1,j_=1;for(var J_=0;J_<IS;++J_)lS+=2,LS+=2,ZS+=2,__+=2,i_+=2,I_+=2,j_+=3;_S=lS+LS+ZS+__+i_+I_+j_;window.JS===_S&&(window.JS=++_S)}catch(l_){window.JS=_S}var L_=window.sdkljshr489=!0;function o_(S){window.sdkljshr489&&S&&(L_=!1);return L_}function O_(){}o_(window[O_.name]===O_);o_("undefined"===window.vodsS0);window.vodsS0=null;o_(/\x3c/.test(function(){return"\x3c"})&/x3d/.test(function(){return"0";"x3d"}));
var Z_=window.attachEvent||/mobi/i.test(navigator.userAgent),_ .... (end of excerpt)
请注意,'newFileOnLocalDisk.txt' 中不可读的内容长度为 61,055 个字符,
其中没有任何内容与 id="bm_price_container" 中的数据相似。
如果有比 const http = require('http');
更好的方法或策略来完成这个任务,请指点一下。预先感谢您为解答此问题付出的时间和精力。
您需要一个 Node.js 模块来在无头浏览器中加载页面以执行页面中的所有 JavaScript 代码并动态更新 DOM。例如,您可以尝试 GoogleChrome/puppeteer.
'use strict';
const puppeteer = require('puppeteer');
/**
 * Opens the target page in a headless browser, gives its scripts time to
 * populate the DOM, and prints the resulting HTML.
 *
 * Any failure is logged; the browser is shut down in every case.
 */
(async function main() {
  let browser;
  try {
    browser = await puppeteer.launch();
    const [page] = await browser.pages();
    await page.goto(
      'http://www.bursamalaysia.com/market/securities/equities/prices/#/?filter=BS02&page=3',
      { waitUntil: 'networkidle0' },
    );
    // page.waitFor() is deprecated and absent from later Puppeteer releases;
    // a plain timer gives the same fixed 5 s pause on any version.
    await new Promise(resolve => setTimeout(resolve, 5000));
    const html = await page.content();
    console.log(html);
  } catch (err) {
    console.error(err);
  } finally {
    // Close the browser even when navigation or extraction failed; otherwise
    // the launched browser process is left running.
    if (browser) {
      await browser.close().catch(err => console.error(err));
    }
  }
})();
目标:
将整个 HTML 页面(此 特定 url 的) 导出到 'newFileOnLocalDisk.txt' 以供离线数据研究。
遇到的问题:
能够将此特定 url 的 HTML 导出到 'newFileOnLocalDisk.txt' ... 但是 ... 导出的 HTML 元素和值与浏览器 DOM 中显示的完全不同。
const http = require('http');
const url = 'http://www.bursamalaysia.com/market/securities/equities/prices/#/?filter=BS02&page=3';
/**
 * Issues an HTTP GET request and resolves with the complete response body.
 *
 * The body is returned exactly as the server sent it; nothing in it is
 * executed or rendered.
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<string>} The response body decoded as UTF-8. Rejects if
 *   the request or the response stream emits an error.
 */
const httpGet = url => {
  return new Promise((resolve, reject) => {
    http.get(url, res => {
      res.setEncoding('utf8');
      let body = '';
      res.on('data', chunk => body += chunk);
      res.on('end', () => resolve(body));
      // Errors raised while the body is still streaming are emitted on the
      // response, not the request; without this listener they would never
      // reach the promise.
      res.on('error', reject);
    }).on('error', reject);
  });
};
/**
 * Downloads the page at `url` and prints the raw response body.
 *
 * @returns {Promise<void>} Settles once the body has been printed; rejects if
 *   the download fails.
 */
async function exportToLocalHTMLfile () {
  const body = await httpGet(url);
  /*********************************************************
   * What's returned to console.log is totally different
   * from what is in browser's DOM Elements / inspect element
   *
   * Question #1:
   * How can we export "as-is or in raw HTML" directly from
   * less-friendly pages like this?
   *
   *********************************************************/
  console.log(body); //<-- will be later exported to TEXT file. Codes removed to keep this article leaner.
}

// Handle rejection explicitly so a failed download is reported instead of
// becoming an unhandled promise rejection.
exportToLocalHTMLfile().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
console.log(body)
返回了以下摘录。这与浏览器的 DOM 元素完全不同:
<html><head>
<meta http-equiv="Pragma" content="no-cache"/>
<meta http-equiv="Expires" content="-1"/>
<meta http-equiv="CacheControl" content="no-cache"/>
<noscript>Please enable JavaScript to view the page content.</noscript>
<APM_DO_NOT_TOUCH>
<script language="javascript">
//<![CDATA[
(function(){
var securemsg;
var packmsg;
var CryptoUtils;
try{(function(){try{var _S,IS,lS=1,LS=1,ZS=1,__=1,i_=1,I_=1,j_=1;for(var J_=0;J_<IS;++J_)lS+=2,LS+=2,ZS+=2,__+=2,i_+=2,I_+=2,j_+=3;_S=lS+LS+ZS+__+i_+I_+j_;window.JS===_S&&(window.JS=++_S)}catch(l_){window.JS=_S}var L_=window.sdkljshr489=!0;function o_(S){window.sdkljshr489&&S&&(L_=!1);return L_}function O_(){}o_(window[O_.name]===O_);o_("undefined"===window.vodsS0);window.vodsS0=null;o_(/\x3c/.test(function(){return"\x3c"})&/x3d/.test(function(){return"0";"x3d"}));
var Z_=window.attachEvent||/mobi/i.test(navigator.userAgent),_ .... (end of excerpt)
请注意,'newFileOnLocalDisk.txt' 中不可读的内容长度为 61,055 个字符,其中没有任何内容与 id="bm_price_container" 中的数据相似。
如果有比 const http = require('http');
更好的方法或策略来完成这个任务,请指点一下。预先感谢您为解答此问题付出的时间和精力。
您需要一个 Node.js 模块来在无头浏览器中加载页面以执行页面中的所有 JavaScript 代码并动态更新 DOM。例如,您可以尝试 GoogleChrome/puppeteer.
'use strict';
const puppeteer = require('puppeteer');
/**
 * Opens the target page in a headless browser, gives its scripts time to
 * populate the DOM, and prints the resulting HTML.
 *
 * Any failure is logged; the browser is shut down in every case.
 */
(async function main() {
  let browser;
  try {
    browser = await puppeteer.launch();
    const [page] = await browser.pages();
    await page.goto(
      'http://www.bursamalaysia.com/market/securities/equities/prices/#/?filter=BS02&page=3',
      { waitUntil: 'networkidle0' },
    );
    // page.waitFor() is deprecated and absent from later Puppeteer releases;
    // a plain timer gives the same fixed 5 s pause on any version.
    await new Promise(resolve => setTimeout(resolve, 5000));
    const html = await page.content();
    console.log(html);
  } catch (err) {
    console.error(err);
  } finally {
    // Close the browser even when navigation or extraction failed; otherwise
    // the launched browser process is left running.
    if (browser) {
      await browser.close().catch(err => console.error(err));
    }
  }
})();