当队列为 'full' 时暂停循环
Pausing a loop when the queue is 'full'
我正在用 puppeteer 做网络爬虫。
循环向函数 (loadPage(URL)) 传递了一个有效的url,但想在抓取处理 N 个页面时暂停循环。
我考虑过用超时(基于 puppeteer 运行所花费的平均时间),但我不认为这是一个好的解决方案。不过,我愿意接受任何讨论。
谢谢。
--- 编辑中 供将来参考 ---
const puppeteer = require('puppeteer');
const stores = require('./data.json').stores;
const MAX_CONCURRENT_TASKS = 5;
let TOTAL_PAGES = 0;
/**
 * Crawl entry point: keeps only the active stores from data.json and
 * processes them in batches of MAX_CONCURRENT_TASKS, reusing a fixed
 * pool of browser pages (one page per concurrent task).
 */
const start = async () => {
  //@TODO Create a separate log routine
  console.log('Total de Lojas', stores.length);

  // Keep only the stores flagged as active.
  const activatedStores = stores.filter((store) => store.active);

  //@TODO Create a separate log routine
  console.log('Lojas ativas', activatedStores.length);

  let browser = null;
  try {
    browser = await puppeteer.launch({
      headless: false // Debug purpose
    });

    // One reusable page per concurrent task.
    const pagePool = await Promise.all(Array.from(
      new Array(MAX_CONCURRENT_TASKS),
      () => browser.newPage()
    ));

    while (activatedStores.length !== 0) {
      //@TODO Create a separate log routine
      // Clamp at 0 so the final (partial) batch never logs a negative count.
      console.log(`Stores left: ${Math.max(activatedStores.length - MAX_CONCURRENT_TASKS, 0)}!`);
      await Promise.all(
        activatedStores.splice(0, MAX_CONCURRENT_TASKS)
          .map((store, i) => loadPage(store.siteMap, pagePool[i], store))
      );
    }
  } catch (error) {
    //@TODO create function to generate error logs
    console.error(error);
  } finally {
    // Close the browser even when a batch throws, so Chromium is not leaked.
    if (browser) {
      await browser.close();
    }
  }
};
/**
 * Load a single page: set the user agent, navigate, and log the final URL.
 *
 * @param {string} url - a valid url
 * @param {object} page - a puppeteer Page (from browser.newPage())
 * @param {Object} store - the settings of this store
 */
const loadPage = async (url, page, store) => {
  const opts = {
    timeout: 0, // no navigation timeout
    waitUntil: 'domcontentloaded'
  };
  // setUserAgent returns a Promise; await it so the UA is applied before navigating.
  await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36');
  await page.goto(url, opts);
  //@TODO Create a separate log routine
  console.log(await page.evaluate(() => document.location.href));
};
start()
我无法在此处为您提供代码示例,但您绝对应该了解迭代器和生成器(iterators and generators)的概念。生成器采用非阻塞暂停原理,允许你先做一些计算,停下来处理其他逻辑,然后再返回继续你的计算。
没有代码,很难确定您到底需要什么。也许这个例子可以给你一些提示。
'use strict';
const puppeteer = require('puppeteer');
// Visit 20 example URLs with a small fixed pool of tabs, batch by batch.
(async function main() {
  try {
    // The queue of URLs still to visit.
    const pending = Array.from(
      { length: 20 },
      (_, i) => `https://example.org/?foo=${i}`
    );
    const numberOfConcurrentTasks = 3;

    const browser = await puppeteer.launch();

    // A fixed pool of pages, one per concurrent task.
    const pagePool = await Promise.all(Array.from(
      { length: numberOfConcurrentTasks },
      () => browser.newPage()
    ));

    // splice removes a batch from the queue; loop ends when the queue is empty.
    while (pending.length !== 0) {
      console.log(`URLs left: ${pending.length}.`);
      const batch = pending.splice(0, numberOfConcurrentTasks);
      await Promise.all(batch.map((url, i) => processDoc(url, pagePool[i])));
    }

    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
/**
 * Navigate a pooled page to the given URL and log the resulting location.
 * @param {string} url - address to visit
 * @param {object} page - a puppeteer Page from the pool
 */
async function processDoc(url, page) {
  await page.goto(url);
  const currentHref = await page.evaluate(() => document.location.href);
  console.log(currentHref);
}
我正在用 puppeteer 做网络爬虫。循环向函数 (loadPage(URL)) 传递了一个有效的 url,但想在抓取处理 N 个页面时暂停循环。
我考虑过用超时(基于 puppeteer 运行所花费的平均时间),但我不认为这是一个好的解决方案。不过,我愿意接受任何讨论。
谢谢。
--- 编辑中 供将来参考 ---
const puppeteer = require('puppeteer');
const stores = require('./data.json').stores;
const MAX_CONCURRENT_TASKS = 5;
let TOTAL_PAGES = 0;
/**
 * Crawl entry point: keeps only the active stores from data.json and
 * processes them in batches of MAX_CONCURRENT_TASKS, reusing a fixed
 * pool of browser pages (one page per concurrent task).
 */
const start = async () => {
  //@TODO Create a separate log routine
  console.log('Total de Lojas', stores.length);

  // Keep only the stores flagged as active.
  const activatedStores = stores.filter((store) => store.active);

  //@TODO Create a separate log routine
  console.log('Lojas ativas', activatedStores.length);

  let browser = null;
  try {
    browser = await puppeteer.launch({
      headless: false // Debug purpose
    });

    // One reusable page per concurrent task.
    const pagePool = await Promise.all(Array.from(
      new Array(MAX_CONCURRENT_TASKS),
      () => browser.newPage()
    ));

    while (activatedStores.length !== 0) {
      //@TODO Create a separate log routine
      // Clamp at 0 so the final (partial) batch never logs a negative count.
      console.log(`Stores left: ${Math.max(activatedStores.length - MAX_CONCURRENT_TASKS, 0)}!`);
      await Promise.all(
        activatedStores.splice(0, MAX_CONCURRENT_TASKS)
          .map((store, i) => loadPage(store.siteMap, pagePool[i], store))
      );
    }
  } catch (error) {
    //@TODO create function to generate error logs
    console.error(error);
  } finally {
    // Close the browser even when a batch throws, so Chromium is not leaked.
    if (browser) {
      await browser.close();
    }
  }
};
/**
 * Load a single page: set the user agent, navigate, and log the final URL.
 *
 * @param {string} url - a valid url
 * @param {object} page - a puppeteer Page (from browser.newPage())
 * @param {Object} store - the settings of this store
 */
const loadPage = async (url, page, store) => {
  const opts = {
    timeout: 0, // no navigation timeout
    waitUntil: 'domcontentloaded'
  };
  // setUserAgent returns a Promise; await it so the UA is applied before navigating.
  await page.setUserAgent('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36');
  await page.goto(url, opts);
  //@TODO Create a separate log routine
  console.log(await page.evaluate(() => document.location.href));
};
start()
我无法在此处为您提供代码示例,但您绝对应该了解迭代器和生成器(iterators and generators)的概念。生成器采用非阻塞暂停原理,允许你先做一些计算,停下来处理其他逻辑,然后再返回继续你的计算。
没有代码,很难确定您到底需要什么。也许这个例子可以给你一些提示。
'use strict';
const puppeteer = require('puppeteer');
// Visit 20 example URLs with a small fixed pool of tabs, batch by batch.
(async function main() {
  try {
    // The queue of URLs still to visit.
    const pending = Array.from(
      { length: 20 },
      (_, i) => `https://example.org/?foo=${i}`
    );
    const numberOfConcurrentTasks = 3;

    const browser = await puppeteer.launch();

    // A fixed pool of pages, one per concurrent task.
    const pagePool = await Promise.all(Array.from(
      { length: numberOfConcurrentTasks },
      () => browser.newPage()
    ));

    // splice removes a batch from the queue; loop ends when the queue is empty.
    while (pending.length !== 0) {
      console.log(`URLs left: ${pending.length}.`);
      const batch = pending.splice(0, numberOfConcurrentTasks);
      await Promise.all(batch.map((url, i) => processDoc(url, pagePool[i])));
    }

    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
/**
 * Navigate a pooled page to the given URL and log the resulting location.
 * @param {string} url - address to visit
 * @param {object} page - a puppeteer Page from the pool
 */
async function processDoc(url, page) {
  await page.goto(url);
  const currentHref = await page.evaluate(() => document.location.href);
  console.log(currentHref);
}