如何确定性地使用 CSS 选择器与 puppeteer?
How to use CSS selectors deterministically with puppeteer?
我正在尝试自定义一个 Puppeteer 脚本,用它在 SoundCloud 上播放歌曲并进行录制。我还想通过 CSS 选择器打印歌曲的时长。
我似乎无法让 CSS 选择器正常工作。我使用的 URL 是 https://soundcloud.com/octasine/octasine-audio-example-1
我现在有了一个可用的 CSS 选择器,可以从页面中获取分钟和秒。我遇到的问题是,有时页面尚未渲染完成,我得到的返回值是一个空数组;而使用 await page.waitForNavigation(); 又会导致 Promise 失败。
我还缺少什么,才能让 Puppeteer 更可靠地工作?
这就是我使用 CSS 选择器的方式:
// Collects the innerText of every `span.sc-visuallyhidden` element, logs the
// collected values, and extracts minutes/seconds from the entry that contains
// "Duration". Relies on a `page` object that is not defined in this snippet.
const work = async () => {
const inputsValues = [];
// Only the elements matching the selector at the time of this call are returned.
const inputElements = await page.$$('span.sc-visuallyhidden');
for (const element of inputElements) {
let inputValue;
// Read the element's innerText property, then turn it into a plain value
// (presumably a string, since includes()/split() are called on it below).
inputValue = await element.getProperty('innerText');
inputValue = await inputValue.jsonValue();
if (inputValue.includes('Duration')){
console.log("DURATION");
// Split on spaces and take tokens 1 and 3,
// e.g. "Duration: 2 minutes 26 seconds" -> "2" and "26".
// NOTE(review): `mins` and `secs` are assigned without any declaration in this snippet.
mins = inputValue.split(" ")[1];
secs = inputValue.split(" ")[3];
console.log(mins);
console.log(secs);
console.log(inputValue);
}
inputsValues.push(inputValue);
}
console.log(inputsValues)
}
await work();
我的完整脚本 example.js:
// example.js -- node version v14.17.2 -- dependency installed with npm i puppeteer-stream
// Opens a SoundCloud track page, clicks the play button(s), reads the track
// duration from the hidden spans, and records the page to test.webm for
// 5 seconds plus that duration.
const { launch, getStream } = require("puppeteer-stream");
const fs = require("fs");
// NOTE(review): `Console` is not used anywhere in this script.
const { Console } = require("console");
// Output file for the recording, created next to this script.
const file = fs.createWriteStream(__dirname + "/test.webm");
// Runs the whole flow: navigate, dump HTML, start playback, read duration, record.
async function test() {
const browser = await launch();
const page = await browser.newPage();
await page.goto("https://soundcloud.com/octasine/octasine-audio-example-1");
// await page.waitForNavigation();
// Save the page's current HTML to example.html.
let html_var = await page.content();
// Write the file
fs.writeFile("example.html", html_var, function (err) {
// Checks if there is an error
if (err) return console.log(err);
});
// NOTE(review): this is logged right after writeFile is called, not from its callback.
console.log("Wrote html to example.html");
// await page.click("//a[contains(text(), 'Play')]");
// Click every element that has the class `snippetUXPlayButton`.
await page.evaluate(() => {
let elements = document.getElementsByClassName('snippetUXPlayButton');
for (let element of elements)
element.click();
});
// Collects the innerText of every `span.sc-visuallyhidden` element, logs the
// values, and extracts minutes/seconds from the entry containing "Duration".
const work = async () => {
const inputsValues = [];
// Only the elements matching the selector at the time of this call are returned.
const inputElements = await page.$$('span.sc-visuallyhidden');
for (const element of inputElements) {
let inputValue;
// Read the element's innerText property, then turn it into a plain value
// (presumably a string, since includes()/split() are called on it below).
inputValue = await element.getProperty('innerText');
inputValue = await inputValue.jsonValue();
if (inputValue.includes('Duration')){
console.log("DURATION");
// Split on spaces and take tokens 1 and 3,
// e.g. "Duration: 2 minutes 26 seconds" -> "2" and "26".
// NOTE(review): `mins` and `secs` are never declared in this script; they are
// assigned here and read again in the setTimeout delay below.
mins = inputValue.split(" ")[1];
secs = inputValue.split(" ")[3];
console.log(mins);
console.log(secs);
console.log(inputValue);
}
inputsValues.push(inputValue);
}
console.log(inputsValues)
}
await work();
let page_url = await page.url();
console.log(page_url)
// Log the innerHTML of every `sc-visuallyhidden` element.
// NOTE(review): console.log is called inside the evaluate callback, so this
// output presumably goes to the page's console rather than Node's -- confirm.
await page.evaluate(() => {
let elements = document.getElementsByClassName('sc-visuallyhidden');
for (let element of elements)
console.log(element.innerHTML);
});
// Start capturing audio and video from the page and pipe it into test.webm.
const stream = await getStream(page, { audio: true, video: true });
console.log("recording");
stream.pipe(file);
// After the delay: destroy the stream, close the output file, close the browser.
setTimeout(async () => {
await stream.destroy();
file.close();
console.log("finished");
browser.close();
// Delay = 5 s + mins * 60 s + secs * 1 s.
// NOTE(review): mins/secs are only assigned inside work() when a "Duration" entry was found.
}, 1000 * 5 + mins * 60000 + secs * 1000);
}
// Start the script; the returned promise is neither awaited nor given a catch handler.
test();
在上面的示例脚本中,匹配 span.sc-visuallyhidden 选择器的元素是被动态地逐个填充到 DOM 中的,因此 $$('span.sc-visuallyhidden') 返回的数组长度会随着页面加载而增长。在您填充 inputElements 数组的时候,其中可能还不包含 Duration。
要 100% 确保它出现在您的元素集合中,您需要等待它被渲染到 DOM 中,例如等待它的精确选择器:
// Wait until the duration's hidden span is present in the DOM before querying for it.
await page.waitForSelector('.playbackTimeline__duration > span.sc-visuallyhidden')
我建议将您的 work() 函数重构为使用 page.$$eval 方法,如下所示:
// Map every element matching the selector to its innerText and collect the results in an array.
const inputsValues = await page.$$eval('span.sc-visuallyhidden', elems => elems.map(el => el.innerText))
输出为:
8 months ago, 2,452 plays, View all likes, View all reposts, 10 followers, 2 tracks, 414 plays, View all likes, View all comments, Current time: 0 seconds, Duration: 2 minutes 26 seconds, Current track: Octasine Audio Example 1
...包含:Duration: 2 minutes 26 seconds
之后,您可以像之前那样从这条文本中解析出分钟和秒。
我正在尝试自定义一个 Puppeteer 脚本,用它在 SoundCloud 上播放歌曲并进行录制。我还想通过 CSS 选择器打印歌曲的时长。我似乎无法让 CSS 选择器正常工作。我使用的 URL 是 https://soundcloud.com/octasine/octasine-audio-example-1
我现在有了一个可用的 CSS 选择器,可以从页面中获取分钟和秒。我遇到的问题是,有时页面尚未渲染完成,我得到的返回值是一个空数组;而使用 await page.waitForNavigation(); 又会导致 Promise 失败。
我还缺少什么,才能让 Puppeteer 更可靠地工作?
这就是我使用 CSS 选择器的方式:
// Collects the innerText of every `span.sc-visuallyhidden` element, logs the
// collected values, and extracts minutes/seconds from the entry that contains
// "Duration". Relies on a `page` object that is not defined in this snippet.
const work = async () => {
const inputsValues = [];
// Only the elements matching the selector at the time of this call are returned.
const inputElements = await page.$$('span.sc-visuallyhidden');
for (const element of inputElements) {
let inputValue;
// Read the element's innerText property, then turn it into a plain value
// (presumably a string, since includes()/split() are called on it below).
inputValue = await element.getProperty('innerText');
inputValue = await inputValue.jsonValue();
if (inputValue.includes('Duration')){
console.log("DURATION");
// Split on spaces and take tokens 1 and 3,
// e.g. "Duration: 2 minutes 26 seconds" -> "2" and "26".
// NOTE(review): `mins` and `secs` are assigned without any declaration in this snippet.
mins = inputValue.split(" ")[1];
secs = inputValue.split(" ")[3];
console.log(mins);
console.log(secs);
console.log(inputValue);
}
inputsValues.push(inputValue);
}
console.log(inputsValues)
}
await work();
我的完整脚本 example.js:
// example.js -- node version v14.17.2 -- dependency installed with npm i puppeteer-stream
// Opens a SoundCloud track page, clicks the play button(s), reads the track
// duration from the hidden spans, and records the page to test.webm for
// 5 seconds plus that duration.
const { launch, getStream } = require("puppeteer-stream");
const fs = require("fs");
// NOTE(review): `Console` is not used anywhere in this script.
const { Console } = require("console");
// Output file for the recording, created next to this script.
const file = fs.createWriteStream(__dirname + "/test.webm");
// Runs the whole flow: navigate, dump HTML, start playback, read duration, record.
async function test() {
const browser = await launch();
const page = await browser.newPage();
await page.goto("https://soundcloud.com/octasine/octasine-audio-example-1");
// await page.waitForNavigation();
// Save the page's current HTML to example.html.
let html_var = await page.content();
// Write the file
fs.writeFile("example.html", html_var, function (err) {
// Checks if there is an error
if (err) return console.log(err);
});
// NOTE(review): this is logged right after writeFile is called, not from its callback.
console.log("Wrote html to example.html");
// await page.click("//a[contains(text(), 'Play')]");
// Click every element that has the class `snippetUXPlayButton`.
await page.evaluate(() => {
let elements = document.getElementsByClassName('snippetUXPlayButton');
for (let element of elements)
element.click();
});
// Collects the innerText of every `span.sc-visuallyhidden` element, logs the
// values, and extracts minutes/seconds from the entry containing "Duration".
const work = async () => {
const inputsValues = [];
// Only the elements matching the selector at the time of this call are returned.
const inputElements = await page.$$('span.sc-visuallyhidden');
for (const element of inputElements) {
let inputValue;
// Read the element's innerText property, then turn it into a plain value
// (presumably a string, since includes()/split() are called on it below).
inputValue = await element.getProperty('innerText');
inputValue = await inputValue.jsonValue();
if (inputValue.includes('Duration')){
console.log("DURATION");
// Split on spaces and take tokens 1 and 3,
// e.g. "Duration: 2 minutes 26 seconds" -> "2" and "26".
// NOTE(review): `mins` and `secs` are never declared in this script; they are
// assigned here and read again in the setTimeout delay below.
mins = inputValue.split(" ")[1];
secs = inputValue.split(" ")[3];
console.log(mins);
console.log(secs);
console.log(inputValue);
}
inputsValues.push(inputValue);
}
console.log(inputsValues)
}
await work();
let page_url = await page.url();
console.log(page_url)
// Log the innerHTML of every `sc-visuallyhidden` element.
// NOTE(review): console.log is called inside the evaluate callback, so this
// output presumably goes to the page's console rather than Node's -- confirm.
await page.evaluate(() => {
let elements = document.getElementsByClassName('sc-visuallyhidden');
for (let element of elements)
console.log(element.innerHTML);
});
// Start capturing audio and video from the page and pipe it into test.webm.
const stream = await getStream(page, { audio: true, video: true });
console.log("recording");
stream.pipe(file);
// After the delay: destroy the stream, close the output file, close the browser.
setTimeout(async () => {
await stream.destroy();
file.close();
console.log("finished");
browser.close();
// Delay = 5 s + mins * 60 s + secs * 1 s.
// NOTE(review): mins/secs are only assigned inside work() when a "Duration" entry was found.
}, 1000 * 5 + mins * 60000 + secs * 1000);
}
// Start the script; the returned promise is neither awaited nor given a catch handler.
test();
在上面的示例脚本中,匹配 span.sc-visuallyhidden 选择器的元素是被动态地逐个填充到 DOM 中的,因此 $$('span.sc-visuallyhidden') 返回的数组长度会随着页面加载而增长。在您填充 inputElements 数组的时候,其中可能还不包含 Duration。
要 100% 确保它出现在您的元素集合中,您需要等待它被渲染到 DOM 中,例如等待它的精确选择器:
// Wait until the duration's hidden span is present in the DOM before querying for it.
await page.waitForSelector('.playbackTimeline__duration > span.sc-visuallyhidden')
我建议将您的 work() 函数重构为使用 page.$$eval 方法,如下所示:
// Map every element matching the selector to its innerText and collect the results in an array.
const inputsValues = await page.$$eval('span.sc-visuallyhidden', elems => elems.map(el => el.innerText))
输出为:
8 months ago, 2,452 plays, View all likes, View all reposts, 10 followers, 2 tracks, 414 plays, View all likes, View all comments, Current time: 0 seconds, Duration: 2 minutes 26 seconds, Current track: Octasine Audio Example 1
...包含:Duration: 2 minutes 26 seconds
之后,您可以像之前那样从这条文本中解析出分钟和秒。