如何在 Puppeteer 中迭代 html 标签以获取带通配符的 innerText?
How do you iterate html tags to get innerText with wildcards in Puppeteer?
出于学习目的,我正在尝试抓取此页面上的评论:https://www.tripadvisor.es/Restaurant_Review-g294308-d4754017-Reviews-or10-TAC_ROLL-Quito_Pichincha_Province.html 。每页有 10 条评论,对应下面这组 HTML 选择器(我的代码原本可以从每个页面获取全部 10 条评论,但页面改版后就不行了):
#review_593124597 > div:nth-child(1) > div:nth-child(2) > div:nth-child(5) > div:nth-child(1) > p:nth-child(1)
#review_583146930 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
#review_577877496 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
#review_572957932 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
#review_571417105 > div:nth-child(1) > div:nth-child(2) > div:nth-child(5) > div:nth-child(1) > p:nth-child(1)
#review_565883882 > div:nth-child(1) > div:nth-child(2) > div:nth-child(5) > div:nth-child(1) > p:nth-child(1)
#review_564612180 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
#review_554301618 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
各选择器之间只有两处不同:评论 ID 和第 4 个 div(nth-child 的值在 4 和 5 之间变化,我不知道这是否也会影响得到的 innerText)。我正在尝试获取这些元素的 innerText,但一直没有成功。我目前使用的代码是:
// CSS selector for the review paragraph: a <div> whose id starts with
// "review_", followed by a fixed chain of nth-child steps. The third step is
// hard-coded to nth-child(5).
const comentarios = 'div[id^=review_] > div:nth-child(1) > div:nth-child(2) > div:nth-child(5) > div:nth-child(1) > p:nth-child(1)'
// Class name whose elements are counted below.
const comnetarioLength = 'partial_entry';
// Runs inside the page: scrolls down by one viewport height, then returns the
// number of elements that carry the given class name.
let listLength = await page.evaluate((sel) => {
window.scrollBy(0, window.innerHeight);
return document.getElementsByClassName(sel).length;
}, comnetarioLength);
console.log(listLength);
下面是我以前使用的代码,但页面改版之后,我不知道具体该怎么修改,因为现在每个页面我只能得到第一个 innerText:
// Performs one page lookup per counted element (i runs from 1 to listLength)
// and logs whatever text each lookup returns.
for (let i = 1; i <= listLength; i++) {
// NOTE(review): the selector string in `comentarios`, as defined in the
// preceding snippet, contains no "Index" placeholder, so this replace()
// returns it unchanged and every iteration queries the same selector.
let selectorComentarios = comentarios.replace("Index", i); //<--I know
//this is supposed to be different
let comentario = await page.evaluate((sel) => { // Let's create variables and store values...
// querySelector yields only the first element matching the selector. If
// nothing matches, reading .innerText throws; the empty catch discards the
// error and the callback returns undefined.
try {
let comentarioText = document.querySelector(sel).innerText;
return comentarioText;
}
catch (e) { }
}, selectorComentarios);
console.log(comentario);
}
是类似这样的吗?下面这个脚本会输出一个数组,其中包含前 10 条评论。
'use strict';
const puppeteer = require('puppeteer');
/**
 * Opens the TripAdvisor review page in a browser launched by Puppeteer and
 * logs an array holding the innerText of every `p.partial_entry` element.
 *
 * Any failure (launch, navigation, evaluation, or closing the browser) is
 * logged with console.error rather than rethrown.
 */
(async function main() {
  try {
    const browser = await puppeteer.launch();
    try {
      const [page] = await browser.pages();
      await page.goto('https://www.tripadvisor.es/Restaurant_Review-g294308-d4754017-Reviews-or10-TAC_ROLL-Quito_Pichincha_Province.html');
      // Runs inside the page: collect the text of all matching paragraphs.
      const reviews = await page.evaluate(
        () => [...document.querySelectorAll('p.partial_entry')]
          .map(({ innerText }) => innerText)
      );
      console.log(reviews);
    } finally {
      // Close the browser even when navigation or evaluation fails, so the
      // launched browser process is not left running.
      await browser.close();
    }
  } catch (err) {
    console.error(err);
  }
})();
出于学习目的,我正在尝试抓取此页面上的评论:https://www.tripadvisor.es/Restaurant_Review-g294308-d4754017-Reviews-or10-TAC_ROLL-Quito_Pichincha_Province.html 。每页有 10 条评论,对应下面这组 HTML 选择器(我的代码原本可以从每个页面获取全部 10 条评论,但页面改版后就不行了):
#review_593124597 > div:nth-child(1) > div:nth-child(2) > div:nth-child(5) > div:nth-child(1) > p:nth-child(1)
#review_583146930 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
#review_577877496 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
#review_572957932 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
#review_571417105 > div:nth-child(1) > div:nth-child(2) > div:nth-child(5) > div:nth-child(1) > p:nth-child(1)
#review_565883882 > div:nth-child(1) > div:nth-child(2) > div:nth-child(5) > div:nth-child(1) > p:nth-child(1)
#review_564612180 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
#review_554301618 > div:nth-child(1) > div:nth-child(2) > div:nth-child(4) > div:nth-child(1) > p:nth-child(1)
各选择器之间只有两处不同:评论 ID 和第 4 个 div(nth-child 的值在 4 和 5 之间变化,我不知道这是否也会影响得到的 innerText)。我正在尝试获取这些元素的 innerText,但一直没有成功。我目前使用的代码是:
// CSS selector for the review paragraph: a <div> whose id starts with
// "review_", followed by a fixed chain of nth-child steps. The third step is
// hard-coded to nth-child(5).
const comentarios = 'div[id^=review_] > div:nth-child(1) > div:nth-child(2) > div:nth-child(5) > div:nth-child(1) > p:nth-child(1)'
// Class name whose elements are counted below.
const comnetarioLength = 'partial_entry';
// Runs inside the page: scrolls down by one viewport height, then returns the
// number of elements that carry the given class name.
let listLength = await page.evaluate((sel) => {
window.scrollBy(0, window.innerHeight);
return document.getElementsByClassName(sel).length;
}, comnetarioLength);
console.log(listLength);
下面是我以前使用的代码,但页面改版之后,我不知道具体该怎么修改,因为现在每个页面我只能得到第一个 innerText:
// Performs one page lookup per counted element (i runs from 1 to listLength)
// and logs whatever text each lookup returns.
for (let i = 1; i <= listLength; i++) {
// NOTE(review): the selector string in `comentarios`, as defined in the
// preceding snippet, contains no "Index" placeholder, so this replace()
// returns it unchanged and every iteration queries the same selector.
let selectorComentarios = comentarios.replace("Index", i); //<--I know
//this is supposed to be different
let comentario = await page.evaluate((sel) => { // Let's create variables and store values...
// querySelector yields only the first element matching the selector. If
// nothing matches, reading .innerText throws; the empty catch discards the
// error and the callback returns undefined.
try {
let comentarioText = document.querySelector(sel).innerText;
return comentarioText;
}
catch (e) { }
}, selectorComentarios);
console.log(comentario);
}
是类似这样的吗?下面这个脚本会输出一个数组,其中包含前 10 条评论。
'use strict';
const puppeteer = require('puppeteer');
/**
 * Opens the TripAdvisor review page in a browser launched by Puppeteer and
 * logs an array holding the innerText of every `p.partial_entry` element.
 *
 * Any failure (launch, navigation, evaluation, or closing the browser) is
 * logged with console.error rather than rethrown.
 */
(async function main() {
  try {
    const browser = await puppeteer.launch();
    try {
      const [page] = await browser.pages();
      await page.goto('https://www.tripadvisor.es/Restaurant_Review-g294308-d4754017-Reviews-or10-TAC_ROLL-Quito_Pichincha_Province.html');
      // Runs inside the page: collect the text of all matching paragraphs.
      const reviews = await page.evaluate(
        () => [...document.querySelectorAll('p.partial_entry')]
          .map(({ innerText }) => innerText)
      );
      console.log(reviews);
    } finally {
      // Close the browser even when navigation or evaluation fails, so the
      // launched browser process is not left running.
      await browser.close();
    }
  } catch (err) {
    console.error(err);
  }
})();