如何使用 Puppeteer 库 return 一个值来抓取动态网页?
How to return a value using the Puppeteer library to scrape dynamic web pages?
我正在使用 cheerio.js 和 puppeteer.js 跟随这个 tutorial 来尝试做一些基本的网络抓取。我并没有完全按照教程进行操作,因为我试图在服务器端编写它,让我的后端处理所有的抓取,然后在将来将该数据传递给前端。
正如现在写的那样,我得到
[nodemon] restarting due to changes...
[nodemon] starting node server.js
Your app is listening on port 8080
[Function]
看起来 dynamicScraper
正在返回 [Function]
,而我希望它是教程中的 html?
主server.js文件
'use strict';
const express = require('express');
const cors = require('cors');
const app = express();
const cheerio = require('./potusScraper');
app.use(express.json());
app.use(
cors({
origin: ['http://localhost:3000']
})
);
app.get('/', (req, res) => {
let { scraper, dynamicScraper } = cheerio;
//dynamicScraper should return html as a string?
dynamicScraper()
.then(html => {
res.send(html);
})
.catch(err => {
console.log(err);
});
});
app.listen(process.env.PORT || 8080, () => {
console.log(`Your app is listening on port ${process.env.PORT || 8080}`);
});
potusScraper.js 文件
'use strict';
const rp = require('request-promise');
const $ = require('cheerio');
const puppeteer = require('puppeteer');
const url = 'https://en.wikipedia.org/wiki/List_of_Presidents_of_the_United_States';
const url2 = 'https://www.reddit.com';
const cheerio = {
scraper: function() {
return rp(url)
.then(html => {
const wikiUrls=[];
for (let i = 0; i < 45; i++) {
wikiUrls.push($('big > a', html)[i].attribs.href);
}
return(wikiUrls);
})
.catch(err => console.log(err))
},
dynamicScraper: function() {
return puppeteer //doesn't look like this works?
.launch()
.then(browser => {
return browser.newPage();
})
.then(page => {
return page.goto(url2)
.then(() => {return page.content});
})
.then(html => {
console.log(html);
return(html);
})
.catch(err => console.log(err));
}
}
module.exports = cheerio;
您将在此代码行中返回 page.content
函数而不是调用它:
.then(page => {
return page.goto(url2)
.then(() => {return page.content});
})
第三行应该是这样的:
.then(() => {return page.content()});
此外,您可以使用简洁的箭头函数来简化代码:
.then(() => page.content());
我正在使用 cheerio.js 和 puppeteer.js 跟随这个 tutorial 来尝试做一些基本的网络抓取。我并没有完全按照教程进行操作,因为我试图在服务器端编写它,让我的后端处理所有的抓取,然后在将来将该数据传递给前端。
正如现在写的那样,我得到
[nodemon] restarting due to changes...
[nodemon] starting node server.js
Your app is listening on port 8080
[Function]
看起来 dynamicScraper
正在返回 [Function]
,而我希望它是教程中的 html?
主server.js文件
'use strict';
const express = require('express');
const cors = require('cors');
const app = express();
const cheerio = require('./potusScraper');
app.use(express.json());
app.use(
cors({
origin: ['http://localhost:3000']
})
);
app.get('/', (req, res) => {
let { scraper, dynamicScraper } = cheerio;
//dynamicScraper should return html as a string?
dynamicScraper()
.then(html => {
res.send(html);
})
.catch(err => {
console.log(err);
});
});
app.listen(process.env.PORT || 8080, () => {
console.log(`Your app is listening on port ${process.env.PORT || 8080}`);
});
potusScraper.js 文件
'use strict';
const rp = require('request-promise');
const $ = require('cheerio');
const puppeteer = require('puppeteer');
const url = 'https://en.wikipedia.org/wiki/List_of_Presidents_of_the_United_States';
const url2 = 'https://www.reddit.com';
const cheerio = {
scraper: function() {
return rp(url)
.then(html => {
const wikiUrls=[];
for (let i = 0; i < 45; i++) {
wikiUrls.push($('big > a', html)[i].attribs.href);
}
return(wikiUrls);
})
.catch(err => console.log(err))
},
dynamicScraper: function() {
return puppeteer //doesn't look like this works?
.launch()
.then(browser => {
return browser.newPage();
})
.then(page => {
return page.goto(url2)
.then(() => {return page.content});
})
.then(html => {
console.log(html);
return(html);
})
.catch(err => console.log(err));
}
}
module.exports = cheerio;
您将在此代码行中返回 page.content
函数而不是调用它:
.then(page => {
return page.goto(url2)
.then(() => {return page.content});
})
第三行应该是这样的:
.then(() => {return page.content()});
此外,您可以使用简洁的箭头函数来简化代码:
.then(() => page.content());