Web 定期抓取作业性能
Web scraping periodically job performance
我在节点中制作了一个脚本,用于定期抓取网页作为一项工作,并将数据保存为 MongoDB 中的对象。我需要每隔 5-15 秒或多或少执行一次该功能。但是,我的代码目前性能不佳,您能否提供一些建议或任何帮助?
const $ = require('cheerio');
const MarketModel = require('./models/marketModel');
const mongoose = require('mongoose');
const puppeteer = require('puppeteer');
var schedule = require('node-schedule');
//Connection to DataBase:
//To connect to Development environment DB (Comment line below if not using it)
mongoose.connect('mongodb://localhost:27017/Tradheo', {
useNewUrlParser: true
});
//To connect to DB in cloud:
//mongoose.connect(process.env.MONGO_URI, { useNewUrlParser: true });
mongoose.connection.on('error', error => console.log(error));
mongoose.Promise = global.Promise;
getMarketData = function () {
console.log("Web scraping to get market data...")
let markets = []
let marketSpain = {
country: 'Spain',
name: 'IBEX 35',
companies: []
}
let marketGermany = {
country: 'Germany',
name: 'DAX',
companies: []
}
puppeteer
.launch()
.then(function (browser) {
return browser.newPage();
})
.then(function (page) {
return page.goto('url', {
timeout: 3000000
}).then(function () {
return page.content();
});
})
.then(function (html) {
$("table[class='genTbl closedTbl crossRatesTbl elpTbl elp30'] > tbody > tr", html).each((i, elem) => {
marketSpain.companies.push({
name: $("td[class='bold left noWrap elp plusIconTd'] > a", html).eq(i).html(),
last: $("td", elem).eq(2).text(),
high: $("td", elem).eq(3).text(),
low: $("td", elem).eq(4).text(),
change: $("td", elem).eq(5).text(),
changePerCent: $("td", elem).eq(6).text(),
volume: $("td", elem).eq(7).text(),
time: $("td", elem).eq(8).text(),
purchase: false,
sale: false
});
});
markets.push(marketSpain);
puppeteer
.launch()
.then(function (browser) {
return browser.newPage();
})
.then(function (page) {
return page.goto('url').then(function () {
return page.content();
});
})
.then(function (html) {
$("table[class='genTbl closedTbl crossRatesTbl elpTbl elp30'] > tbody > tr", html).each((i, elem) => {
marketGermany.companies.push({
name: $("td[class='bold left noWrap elp plusIconTd'] > a", html).eq(i).html(),
last: $("td", elem).eq(2).text(),
high: $("td", elem).eq(3).text(),
low: $("td", elem).eq(4).text(),
change: $("td", elem).eq(5).text(),
changePerCent: $("td", elem).eq(6).text(),
volume: $("td", elem).eq(7).text(),
time: $("td", elem).eq(8).text(),
purchase: false,
sale: false
});
});
markets.push(marketGermany)
MarketModel.create({
markets,
}, (err) => {
if (err) return handleError(err);
})
console.log("Done!")
})
.catch(function (err) {
console.log(err);
});
})
.catch(function (err) {
console.log(err);
});
}
var j = schedule.scheduleJob('*/15 * 8-17 * * 1-5', function () {
getMarketData();
});
我得到的输出类型适用于第一个函数调用,但随后它开始抛出 TimeError 异常和 MaxListenersExceededWarning。例如:
Web scraping to get market data...
Done!
Web scraping to get market data...
Web scraping to get market data...
Web scraping to get market data...
{ TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
at Promise.then (/home/javier/Workspace/Tradheo/server/node_modules/puppeteer/lib/Lifecycl eWatcher.js:142:21)
at <anonymous>
-- ASYNC --
at Frame.<anonymous> (/home/javier/Workspace/Tradheo/server/node_modules/puppeteer/lib/helper.j s:111:15)
at Page.goto (/home/javier/Workspace/Tradheo/server/node_modules/puppeteer/lib/Page.js:629:49)
at Page.<anonymous> (/home/javier/Workspace/Tradheo/server/node_modules/puppeteer/lib/helper.j s:112:23)
at /home/javier/Workspace/Tradheo/server/scraping.js:69:33
at <anonymous>
at process._tickCallback (internal/process/next_tick.js:189:7) name: 'TimeoutError' }
Web scraping to get market data...
我认为问题在于您没有关闭 puppeteer 浏览器。
尝试在承诺链的末尾和 catch
中添加 browser.close()
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://news.ycombinator.com', {waitUntil: 'networkidle2'});
await page.pdf({path: 'hn.pdf', format: 'A4'});
await browser.close();
})();
我在节点中制作了一个脚本,用于定期抓取网页作为一项工作,并将数据保存为 MongoDB 中的对象。我需要每隔 5-15 秒或多或少执行一次该功能。但是,我的代码目前性能不佳,您能否提供一些建议或任何帮助?
const $ = require('cheerio');
const MarketModel = require('./models/marketModel');
const mongoose = require('mongoose');
const puppeteer = require('puppeteer');
var schedule = require('node-schedule');
//Connection to DataBase:
//To connect to Development environment DB (Comment line below if not using it)
mongoose.connect('mongodb://localhost:27017/Tradheo', {
useNewUrlParser: true
});
//To connect to DB in cloud:
//mongoose.connect(process.env.MONGO_URI, { useNewUrlParser: true });
mongoose.connection.on('error', error => console.log(error));
mongoose.Promise = global.Promise;
getMarketData = function () {
console.log("Web scraping to get market data...")
let markets = []
let marketSpain = {
country: 'Spain',
name: 'IBEX 35',
companies: []
}
let marketGermany = {
country: 'Germany',
name: 'DAX',
companies: []
}
puppeteer
.launch()
.then(function (browser) {
return browser.newPage();
})
.then(function (page) {
return page.goto('url', {
timeout: 3000000
}).then(function () {
return page.content();
});
})
.then(function (html) {
$("table[class='genTbl closedTbl crossRatesTbl elpTbl elp30'] > tbody > tr", html).each((i, elem) => {
marketSpain.companies.push({
name: $("td[class='bold left noWrap elp plusIconTd'] > a", html).eq(i).html(),
last: $("td", elem).eq(2).text(),
high: $("td", elem).eq(3).text(),
low: $("td", elem).eq(4).text(),
change: $("td", elem).eq(5).text(),
changePerCent: $("td", elem).eq(6).text(),
volume: $("td", elem).eq(7).text(),
time: $("td", elem).eq(8).text(),
purchase: false,
sale: false
});
});
markets.push(marketSpain);
puppeteer
.launch()
.then(function (browser) {
return browser.newPage();
})
.then(function (page) {
return page.goto('url').then(function () {
return page.content();
});
})
.then(function (html) {
$("table[class='genTbl closedTbl crossRatesTbl elpTbl elp30'] > tbody > tr", html).each((i, elem) => {
marketGermany.companies.push({
name: $("td[class='bold left noWrap elp plusIconTd'] > a", html).eq(i).html(),
last: $("td", elem).eq(2).text(),
high: $("td", elem).eq(3).text(),
low: $("td", elem).eq(4).text(),
change: $("td", elem).eq(5).text(),
changePerCent: $("td", elem).eq(6).text(),
volume: $("td", elem).eq(7).text(),
time: $("td", elem).eq(8).text(),
purchase: false,
sale: false
});
});
markets.push(marketGermany)
MarketModel.create({
markets,
}, (err) => {
if (err) return handleError(err);
})
console.log("Done!")
})
.catch(function (err) {
console.log(err);
});
})
.catch(function (err) {
console.log(err);
});
}
var j = schedule.scheduleJob('*/15 * 8-17 * * 1-5', function () {
getMarketData();
});
我得到的输出类型适用于第一个函数调用,但随后它开始抛出 TimeError 异常和 MaxListenersExceededWarning。例如:
Web scraping to get market data...
Done!
Web scraping to get market data...
Web scraping to get market data...
Web scraping to get market data...
{ TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
at Promise.then (/home/javier/Workspace/Tradheo/server/node_modules/puppeteer/lib/Lifecycl eWatcher.js:142:21)
at <anonymous>
-- ASYNC --
at Frame.<anonymous> (/home/javier/Workspace/Tradheo/server/node_modules/puppeteer/lib/helper.j s:111:15)
at Page.goto (/home/javier/Workspace/Tradheo/server/node_modules/puppeteer/lib/Page.js:629:49)
at Page.<anonymous> (/home/javier/Workspace/Tradheo/server/node_modules/puppeteer/lib/helper.j s:112:23)
at /home/javier/Workspace/Tradheo/server/scraping.js:69:33
at <anonymous>
at process._tickCallback (internal/process/next_tick.js:189:7) name: 'TimeoutError' }
Web scraping to get market data...
我认为问题在于您没有关闭 puppeteer 浏览器。 尝试在承诺链的末尾和 catch
中添加 browser.close()const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('https://news.ycombinator.com', {waitUntil: 'networkidle2'});
await page.pdf({path: 'hn.pdf', format: 'A4'});
await browser.close();
})();