无法即时抓取和打印链接
Can't scrape and print the links on the fly
我在 node.js 中编写了一个脚本,用于从网页中抓取不同标题的链接。执行下面的脚本时,控制台打印的是 undefined,而不是我想要的链接。我定义的选择器是准确的。
我不希望先把链接放入数组再 return 结果;相反,我希望即时打印它们。由于我刚开始结合使用 node.js 和 puppeteer 编写脚本,所以找不出自己犯的错误。
这是我的脚本 (Link to that site):
const puppeteer = require('puppeteer');
/**
 * Opens the tagged-questions page in a Puppeteer-launched browser and logs
 * the href of every `a.question-hyperlink` element from inside the page.
 *
 * The callback passed to `page.evaluate` runs in the browser page, so its
 * `console.log` output goes to the page's console, not to Node's stdout.
 * The callback returns nothing, so the promise resolves with `undefined`.
 *
 * @returns {Promise<undefined>} Settles after the browser has been closed;
 *   rejects with any error raised by Puppeteer.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("https://whosebug.com/questions/tagged/web-scraping");
    const url = await page.evaluate(() => {
      const items = document.querySelectorAll('a.question-hyperlink');
      items.forEach((item) => {
        //would like to keep the following line intact
        console.log(item.getAttribute('href'));
      });
    });
    return url;
  } finally {
    // Close the browser even when navigation or evaluation throws, so the
    // Node process is not left hanging on a leaked browser instance.
    await browser.close();
  }
}

run().then(console.log).catch(console.error);
The following script works just fine if I declare an empty array results, store the scraped links in it, and finally return results,
but I do not wish to do it this way. I would like to stick to the approach I tried above, i.e. printing the results on the fly.
const puppeteer = require('puppeteer');
/**
 * Opens the tagged-questions page in a Puppeteer-launched browser and
 * collects the href of every `a.question-hyperlink` element.
 *
 * @returns {Promise<Array<{url: string|null}>>} One `{ url }` object per
 *   matching anchor, in document order; rejects with any error raised by
 *   Puppeteer.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("https://whosebug.com/questions/tagged/web-scraping");
    // The callback runs inside the page; only its serializable return value
    // is sent back to Node.
    const urls = await page.evaluate(() => {
      const items = document.querySelectorAll('a.question-hyperlink');
      return Array.from(items, (item) => ({
        url: item.getAttribute('href'),
      }));
    });
    return urls;
  } finally {
    // Close the browser even when navigation or evaluation throws.
    await browser.close();
  }
}

run().then(console.log).catch(console.error);
再次强调:我的问题是,如何不把链接存入数组,而是像 console.log(item.getAttribute('href')); 这样即时打印链接?
这个库用起来有点别扭,不过我在 GitHub 的这个讨论帖(https://github.com/GoogleChrome/puppeteer/issues/628)中找到了获取 href 的正确方法。
我的可用代码使用了 await page.$$eval:
/**
 * Launches a browser through Puppeteer, opens the tagged-questions page and
 * returns the resolved `href` of every `a.question-hyperlink` anchor.
 *
 * Progress messages are written to the Node console along the way.
 *
 * @returns {Promise<string[]>} The anchors' `href` values in document order;
 *   rejects with any error raised by Puppeteer.
 */
async function getWhosebugLinks() {
  console.log(`going to launch chromium via puppeteer`);
  const browser = await puppeteer.launch();
  try {
    console.log(`creating page/tab`);
    const page = await browser.newPage();
    await page.goto('https://whosebug.com/questions/tagged/web-scraping');
    console.log("fetched SO web-scraping, now parsing link href");
    // $$eval hands the callback an array of the matching elements and
    // returns whatever the callback returns.
    const matches = await page.$$eval('a.question-hyperlink', (anchors) =>
      anchors.map((a) => a.href)
    );
    console.log("matches = ", matches.length);
    return matches;
  } finally {
    // Close the browser even when a step above throws; an async function
    // then rejects on its own instead of leaving a promise pending forever.
    await browser.close();
  }
}

getWhosebugLinks()
  .then((hrefs) => {
    console.log("hrefs: ", hrefs);
  })
  .catch(console.error);
注意:async 函数会 return 一个 Promise,new Promise 同样会 return 一个 Promise(因此没有必要把两者嵌套在一起)。
话虽如此,您可以直接使用 page 的 console 事件来即时打印它们。用法如下:
// Forward every console message emitted inside the page to Node's stdout.
page.on("console", (msg) => console.log(msg.text()));
// The callback runs in the browser page; its console output reaches Node
// through the listener registered on the previous statement.
await page.evaluate(() => {
  console.log("I will be printed on node console too");
});
高级用法已在另一个回答中讨论过。
要在 evaluate() 内部运行 console.log(),只需在定义 page 的那一行下面添加以下这行代码:
// Relay page-side console messages to Node via the public text() accessor
// instead of reading the private `_text` field.
page.on('console', (msg) => console.log(msg.text()));
所以现在整个代码段将是这样的
const puppeteer = require('puppeteer');
/**
 * Opens the tagged-questions page in a Puppeteer-launched browser and prints
 * the href of every `a.question-hyperlink` element as it is visited.
 *
 * The `console.log` inside `page.evaluate` runs in the browser page; the
 * `console` listener registered on the page relays each message to Node's
 * stdout. The evaluate callback returns nothing, so the promise resolves
 * with `undefined`.
 *
 * @returns {Promise<undefined>} Settles after the browser has been closed;
 *   rejects with any error raised by Puppeteer.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Use the public text() accessor rather than the private `_text` field.
    page.on('console', (msg) => console.log(msg.text()));
    await page.goto("https://whosebug.com/questions/tagged/web-scraping");
    const url = await page.evaluate(() => {
      const items = document.querySelectorAll('a.question-hyperlink');
      items.forEach((item) => {
        //would like to keep the following line intact
        console.log(item.getAttribute('href'));
      });
    });
    return url;
  } finally {
    // Close the browser even when navigation or evaluation throws.
    await browser.close();
  }
}

run().then(console.log).catch(console.error);
希望对您有所帮助
我在 node.js 中编写了一个脚本,用于从网页中抓取不同标题的链接。执行下面的脚本时,控制台打印的是 undefined,而不是我想要的链接。我定义的选择器是准确的。
我不希望先把链接放入数组再 return 结果;相反,我希望即时打印它们。由于我刚开始结合使用 node.js 和 puppeteer 编写脚本,所以找不出自己犯的错误。
这是我的脚本 (Link to that site):
const puppeteer = require('puppeteer');
/**
 * Opens the tagged-questions page in a Puppeteer-launched browser and logs
 * the href of every `a.question-hyperlink` element from inside the page.
 *
 * The callback passed to `page.evaluate` runs in the browser page, so its
 * `console.log` output goes to the page's console, not to Node's stdout.
 * The callback returns nothing, so the promise resolves with `undefined`.
 *
 * @returns {Promise<undefined>} Settles after the browser has been closed;
 *   rejects with any error raised by Puppeteer.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("https://whosebug.com/questions/tagged/web-scraping");
    const url = await page.evaluate(() => {
      const items = document.querySelectorAll('a.question-hyperlink');
      items.forEach((item) => {
        //would like to keep the following line intact
        console.log(item.getAttribute('href'));
      });
    });
    return url;
  } finally {
    // Close the browser even when navigation or evaluation throws, so the
    // Node process is not left hanging on a leaked browser instance.
    await browser.close();
  }
}

run().then(console.log).catch(console.error);
The following script works just fine if I declare an empty array results, store the scraped links in it, and finally return results,
but I do not wish to do it this way. I would like to stick to the approach I tried above, i.e. printing the results on the fly.
const puppeteer = require('puppeteer');
/**
 * Opens the tagged-questions page in a Puppeteer-launched browser and
 * collects the href of every `a.question-hyperlink` element.
 *
 * @returns {Promise<Array<{url: string|null}>>} One `{ url }` object per
 *   matching anchor, in document order; rejects with any error raised by
 *   Puppeteer.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto("https://whosebug.com/questions/tagged/web-scraping");
    // The callback runs inside the page; only its serializable return value
    // is sent back to Node.
    const urls = await page.evaluate(() => {
      const items = document.querySelectorAll('a.question-hyperlink');
      return Array.from(items, (item) => ({
        url: item.getAttribute('href'),
      }));
    });
    return urls;
  } finally {
    // Close the browser even when navigation or evaluation throws.
    await browser.close();
  }
}

run().then(console.log).catch(console.error);
再次强调:我的问题是,如何不把链接存入数组,而是像 console.log(item.getAttribute('href')); 这样即时打印链接?
这个库用起来有点别扭,不过我在 GitHub 的这个讨论帖(https://github.com/GoogleChrome/puppeteer/issues/628)中找到了获取 href 的正确方法。
我的可用代码使用了 await page.$$eval:
/**
 * Launches a browser through Puppeteer, opens the tagged-questions page and
 * returns the resolved `href` of every `a.question-hyperlink` anchor.
 *
 * Progress messages are written to the Node console along the way.
 *
 * @returns {Promise<string[]>} The anchors' `href` values in document order;
 *   rejects with any error raised by Puppeteer.
 */
async function getWhosebugLinks() {
  console.log(`going to launch chromium via puppeteer`);
  const browser = await puppeteer.launch();
  try {
    console.log(`creating page/tab`);
    const page = await browser.newPage();
    await page.goto('https://whosebug.com/questions/tagged/web-scraping');
    console.log("fetched SO web-scraping, now parsing link href");
    // $$eval hands the callback an array of the matching elements and
    // returns whatever the callback returns.
    const matches = await page.$$eval('a.question-hyperlink', (anchors) =>
      anchors.map((a) => a.href)
    );
    console.log("matches = ", matches.length);
    return matches;
  } finally {
    // Close the browser even when a step above throws; an async function
    // then rejects on its own instead of leaving a promise pending forever.
    await browser.close();
  }
}

getWhosebugLinks()
  .then((hrefs) => {
    console.log("hrefs: ", hrefs);
  })
  .catch(console.error);
注意:async 函数会 return 一个 Promise,new Promise 同样会 return 一个 Promise(因此没有必要把两者嵌套在一起)。
话虽如此,您可以直接使用 page 的 console 事件来即时打印它们。用法如下:
// Forward every console message emitted inside the page to Node's stdout.
page.on("console", (msg) => console.log(msg.text()));
// The callback runs in the browser page; its console output reaches Node
// through the listener registered on the previous statement.
await page.evaluate(() => {
  console.log("I will be printed on node console too");
});
高级用法已在另一个回答中讨论过。
要在 evaluate() 内部运行 console.log(),只需在定义 page 的那一行下面添加以下这行代码:
// Relay page-side console messages to Node via the public text() accessor
// instead of reading the private `_text` field.
page.on('console', (msg) => console.log(msg.text()));
所以现在整个代码段将是这样的
const puppeteer = require('puppeteer');
/**
 * Opens the tagged-questions page in a Puppeteer-launched browser and prints
 * the href of every `a.question-hyperlink` element as it is visited.
 *
 * The `console.log` inside `page.evaluate` runs in the browser page; the
 * `console` listener registered on the page relays each message to Node's
 * stdout. The evaluate callback returns nothing, so the promise resolves
 * with `undefined`.
 *
 * @returns {Promise<undefined>} Settles after the browser has been closed;
 *   rejects with any error raised by Puppeteer.
 */
async function run() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Use the public text() accessor rather than the private `_text` field.
    page.on('console', (msg) => console.log(msg.text()));
    await page.goto("https://whosebug.com/questions/tagged/web-scraping");
    const url = await page.evaluate(() => {
      const items = document.querySelectorAll('a.question-hyperlink');
      items.forEach((item) => {
        //would like to keep the following line intact
        console.log(item.getAttribute('href'));
      });
    });
    return url;
  } finally {
    // Close the browser even when navigation or evaluation throws.
    await browser.close();
  }
}

run().then(console.log).catch(console.error);
希望对您有所帮助