如何让 puppeteer 加载网站更快?
How to make puppeteer load websites faster?
我正在使用 puppeteer 实现自动化,它可以正常工作,但用它加载网站比在普通浏览器中加载同一网站要花更多时间。我尝试用下面的代码做缓存:
const puppeteer = require('puppeteer');
// Reference point for the elapsed-time log in test(). It is captured when the
// module loads, so the logged duration covers browser start-up as well as
// the navigation itself.
const time = new Date();

/**
 * Launches headless Chromium from a fixed local executable, navigates to
 * https://example.com/, logs the milliseconds elapsed since `time` and the
 * navigation response, and then closes the browser.
 *
 * @returns {Promise<void>} Resolves after the browser has been closed.
 * @throws Re-throws any error raised while opening the page or navigating;
 *   the browser is still closed in that case.
 */
async function test() {
  const browser = await puppeteer.launch({
    headless: true,
    // String.raw keeps every backslash literal. In an ordinary string literal
    // the "\n" of "\node_modules" is a newline and the remaining backslashes
    // are silently dropped, which produces a path that does not exist.
    executablePath: String.raw`D:\Desktop\node_modules\puppeteer\.local-chromium\win64-848005\chrome-win\chrome.exe`,
    args: ['--no-sandbox'],
  });
  try {
    const page = await browser.newPage();
    const response = await page.goto('https://example.com/');
    console.log(`${new Date() - time}`);
    console.log(response);
  } finally {
    // Close the browser even when navigation fails so no Chromium process
    // is left running.
    await browser.close();
  }
}
它对 example.com 有效:缓存已被存储,加载速度也更快,但我的目标网站似乎不允许存储缓存。
还有其他方法可以加快流程吗?
如果您只是想让网站在抓取时加载得更快,并且不依赖于某些图像或 JavaScript 资源,您可以阻止加载这些资源。
按资源类型阻止
const puppeteer = require('puppeteer');
/**
 * Opens https://bbc.com with every request whose resource type is 'image'
 * aborted, and writes a full-page screenshot to no-images.png.
 *
 * The browser is closed in a `finally` block so a failed navigation or
 * screenshot does not leave a Chromium process behind, and any error is
 * reported instead of being left as an unhandled rejection.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    // Turn interception on and register the handler before navigating.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (req.resourceType() === 'image') {
        req.abort();
      } else {
        req.continue();
      }
    });

    await page.goto('https://bbc.com');
    await page.screenshot({ path: 'no-images.png', fullPage: true });
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
按域阻止
const puppeteer = require('puppeteer');
/**
 * Loads https://theverge.com twice in the same page and logs page metrics
 * after each load: first without any blocking (screenshot before.png), then
 * with requests to a list of third-party domains aborted (screenshot
 * after.png).
 *
 * The browser is closed in a `finally` block so a timeout or navigation
 * error does not leave a Chromium process behind, and any error is reported
 * instead of being left as an unhandled rejection.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: true,
  });
  try {
    const page = await browser.newPage();
    const options = {
      waitUntil: 'networkidle2',
      timeout: 30000,
    };

    // Before: normal navigation
    await page.goto('https://theverge.com', options);
    await page.screenshot({ path: 'before.png', fullPage: true });
    const metrics = await page.metrics();
    console.info(metrics);

    // After: navigation with some domains blocked
    // Array of third-party domains to block
    const blockedDomains = [
      'https://pagead2.googlesyndication.com',
      'https://creativecdn.com',
      'https://www.googletagmanager.com',
      'https://cdn.krxd.net',
      'https://adservice.google.com',
      'https://cdn.concert.io',
      'https://z.moatads.com',
      'https://cdn.permutive.com',
    ];

    // Match the entry itself or the entry followed by "/". A bare
    // startsWith(d) would also match look-alike hosts such as
    // "https://cdn.krxd.net.example.org".
    const isBlocked = (url) =>
      blockedDomains.some((d) => url === d || url.startsWith(`${d}/`));

    await page.setRequestInterception(true);
    page.on('request', (request) => {
      if (isBlocked(request.url())) {
        request.abort();
      } else {
        request.continue();
      }
    });

    await page.goto('https://theverge.com', options);
    await page.screenshot({ path: 'after.png', fullPage: true });
    const metricsAfter = await page.metrics();
    console.info(metricsAfter);
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
我正在使用 puppeteer 实现自动化,它可以正常工作,但用它加载网站比在普通浏览器中加载同一网站要花更多时间。我尝试用下面的代码做缓存:
const puppeteer = require('puppeteer');
// Reference point for the elapsed-time log in test(). It is captured when the
// module loads, so the logged duration covers browser start-up as well as
// the navigation itself.
const time = new Date();

/**
 * Launches headless Chromium from a fixed local executable, navigates to
 * https://example.com/, logs the milliseconds elapsed since `time` and the
 * navigation response, and then closes the browser.
 *
 * @returns {Promise<void>} Resolves after the browser has been closed.
 * @throws Re-throws any error raised while opening the page or navigating;
 *   the browser is still closed in that case.
 */
async function test() {
  const browser = await puppeteer.launch({
    headless: true,
    // String.raw keeps every backslash literal. In an ordinary string literal
    // the "\n" of "\node_modules" is a newline and the remaining backslashes
    // are silently dropped, which produces a path that does not exist.
    executablePath: String.raw`D:\Desktop\node_modules\puppeteer\.local-chromium\win64-848005\chrome-win\chrome.exe`,
    args: ['--no-sandbox'],
  });
  try {
    const page = await browser.newPage();
    const response = await page.goto('https://example.com/');
    console.log(`${new Date() - time}`);
    console.log(response);
  } finally {
    // Close the browser even when navigation fails so no Chromium process
    // is left running.
    await browser.close();
  }
}
它对 example.com 有效:缓存已被存储,加载速度也更快,但我的目标网站似乎不允许存储缓存。
还有其他方法可以加快流程吗?
如果您只是想让网站在抓取时加载得更快,并且不依赖于某些图像或 JavaScript 资源,您可以阻止加载这些资源。
按资源类型阻止
const puppeteer = require('puppeteer');
/**
 * Opens https://bbc.com with every request whose resource type is 'image'
 * aborted, and writes a full-page screenshot to no-images.png.
 *
 * The browser is closed in a `finally` block so a failed navigation or
 * screenshot does not leave a Chromium process behind, and any error is
 * reported instead of being left as an unhandled rejection.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    // Turn interception on and register the handler before navigating.
    await page.setRequestInterception(true);
    page.on('request', (req) => {
      if (req.resourceType() === 'image') {
        req.abort();
      } else {
        req.continue();
      }
    });

    await page.goto('https://bbc.com');
    await page.screenshot({ path: 'no-images.png', fullPage: true });
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
按域阻止
const puppeteer = require('puppeteer');
/**
 * Loads https://theverge.com twice in the same page and logs page metrics
 * after each load: first without any blocking (screenshot before.png), then
 * with requests to a list of third-party domains aborted (screenshot
 * after.png).
 *
 * The browser is closed in a `finally` block so a timeout or navigation
 * error does not leave a Chromium process behind, and any error is reported
 * instead of being left as an unhandled rejection.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: true,
  });
  try {
    const page = await browser.newPage();
    const options = {
      waitUntil: 'networkidle2',
      timeout: 30000,
    };

    // Before: normal navigation
    await page.goto('https://theverge.com', options);
    await page.screenshot({ path: 'before.png', fullPage: true });
    const metrics = await page.metrics();
    console.info(metrics);

    // After: navigation with some domains blocked
    // Array of third-party domains to block
    const blockedDomains = [
      'https://pagead2.googlesyndication.com',
      'https://creativecdn.com',
      'https://www.googletagmanager.com',
      'https://cdn.krxd.net',
      'https://adservice.google.com',
      'https://cdn.concert.io',
      'https://z.moatads.com',
      'https://cdn.permutive.com',
    ];

    // Match the entry itself or the entry followed by "/". A bare
    // startsWith(d) would also match look-alike hosts such as
    // "https://cdn.krxd.net.example.org".
    const isBlocked = (url) =>
      blockedDomains.some((d) => url === d || url.startsWith(`${d}/`));

    await page.setRequestInterception(true);
    page.on('request', (request) => {
      if (isBlocked(request.url())) {
        request.abort();
      } else {
        request.continue();
      }
    });

    await page.goto('https://theverge.com', options);
    await page.screenshot({ path: 'after.png', fullPage: true });
    const metricsAfter = await page.metrics();
    console.info(metricsAfter);
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});