如何使用 puppeteer 从 iframe 标签中抓取数据
How to scrape data from iframe tag with puppeteer
我尝试从 iframe/frame 标签中抓取一些数据,但我的代码卡住了。我是 puppeteer 新手,所以请多多包涵。这是该网站的链接。在那里,当我在第一个框架中单击名称时,第二个框架中会出现一些数据;我可以再单击这些数据,并在第三个框架中获取数据。
在代码中,我尝试遍历第一个框架,以获取第二个和第三个框架中的所有数据。
感谢您的任何提示。
我运行了这个命令:
document.querySelector("body > form > font > select > option")
在控制台中,但我找不到在 puppeteer 中运行它的方法。
// First attempt from the question: open the page, then try to read the
// innerHTML of the element named "fmtstatii" through the frame named "stanga",
// and log whatever comes back.
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
// NOTE(review): the URL is an empty string in this listing — presumably it was
// stripped when the question was copied; confirm the intended address.
await page.goto('');
// The callback below runs via page.evaluate and uses the DOM `document` object
// rather than Puppeteer's own frame API.
const iframeParagraph = await page.evaluate(() => {
// NOTE(review): getElementsByName returns a collection of elements, not a
// single element, so the property reads on `iframe` below presumably need an
// index such as [0] — confirm.
const iframe = document.getElementsByName("stanga");
// grab iframe's document object
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
// NOTE(review): same concern — `iframeP` is a collection, so `.innerHTML` on it
// is presumably undefined; confirm.
const iframeP = iframeDoc.getElementsByName("fmtstatii");
return iframeP.innerHTML;
});
console.log(iframeParagraph);
await browser.close();
})();
或
// Second attempt from the question: open the page in a visible browser, click
// the <select>, wait one second, read the text of one <option>, and return it
// as { statie }. This is the version that produces the error quoted below.
const puppeteer = require('puppeteer');
let scrape = async () => {
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
// NOTE(review): the URL is an empty string in this listing — presumably
// stripped; confirm the intended address.
await page.goto('');
// The argument here is the text of a JavaScript expression, not a CSS selector;
// this is the string reported as "not a valid selector" in the error below.
await page.click('document.querySelector("body > form > font > select")');
// Fixed one-second pause before reading the page.
await page.waitFor(1000);
const result = await page.evaluate(() => {
// Same pattern: document.querySelector is handed a string that itself contains
// a document.querySelector(...) call instead of a bare selector.
let statie = document.querySelector('document.querySelector("body > form > font > select > option")').innerText;
return {
statie
}
});
// Not awaited: the function returns without waiting for close() to settle.
browser.close();
return result;
};
// No .catch() is attached, so a rejection from scrape() is unhandled — compare
// the UnhandledPromiseRejectionWarning in the output below.
scrape().then((value) => {
console.log(value); // Success!
});
这是我得到的错误:
[(node:13308) UnhandledPromiseRejectionWarning: Error: Evaluation failed: DOMException: Failed to execute 'querySelector' on 'Document': 'document.querySelector("body > form > font > select")' is not a
valid selector.
at __puppeteer_evaluation_script__:1:33
at ExecutionContext._evaluateInternal (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\ExecutionContext.js:122:13)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at ExecutionContext.<anonymous> (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\helper.js:111:15)
at ElementHandle.$ (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\JSHandle.js:395:50)
at ElementHandle.<anonymous> (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\helper.js:112:23)
at DOMWorld.$ (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\DOMWorld.js:121:34)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at Frame.<anonymous> (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\helper.js:111:15)
at Page.click (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\Page.js:986:29)
at scrape (D:\Zero\ratt_scrap\scrape.js:23:16)
at process._tickCallback (internal/process/next_tick.js:68:7)
(node:13308) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:13308) \[DEP0018\] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.]
你犯了几个错误:
您应该与 Frame 对象而不是 Page 对象交互。
const frame = await page.frames().find(frame => frame.name() === 'stanga'); // Find the right frame.
click() 方法需要 selector <string>,所以你不需要在 click() 方法里面添加 document.querySelector。
await frame.click('body > form > font > select');
要获得所有 innerText,您必须遍历元素。
不要忘记添加 await:您在 close 方法前漏掉了它。
await browser.close();
解决方案:
const puppeteer = require('puppeteer');

/**
 * Collects the text of every <option> of the <select> inside the frame named
 * "stanga" on http://ratt.ro/txt.
 *
 * @returns {Promise<string[]>} The innerText of each option, in document order.
 * @throws {Error} If the page has no frame named "stanga", or if Puppeteer
 *   fails to navigate, click, or find an option before its default timeout.
 */
const scrape = async () => {
  const browser = await puppeteer.launch({headless: false});
  // try/finally so the browser is closed even when a step below throws.
  try {
    const page = await browser.newPage();
    await page.goto('http://ratt.ro/txt');

    // page.frames() returns a plain array, so no await is needed for the lookup.
    const frame = page.frames().find((candidate) => candidate.name() === 'stanga');
    if (!frame) {
      // Fail with a clear message instead of a TypeError on `undefined.click`.
      throw new Error('Frame "stanga" was not found on the page');
    }

    const selectSelector = 'body > form > font > select';
    const optionSelector = `${selectSelector} > option`;

    await frame.click(selectSelector);
    // Wait for the options themselves instead of sleeping a fixed 1000 ms;
    // page.waitFor() is deprecated in newer Puppeteer releases.
    await frame.waitForSelector(optionSelector);

    // $$eval runs the callback inside the frame and returns its serialised result.
    return await frame.$$eval(optionSelector, (options) =>
      options.map((option) => option.innerText)
    );
  } finally {
    await browser.close();
  }
};

scrape()
  .then((value) => {
    console.log(value); // Success!
  })
  .catch((error) => {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
  });
我尝试从 iframe/frame 标签中抓取一些数据,但我的代码卡住了。我是 puppeteer 新手,所以请多多包涵。这是该网站的链接。在那里,当我在第一个框架中单击名称时,第二个框架中会出现一些数据;我可以再单击这些数据,并在第三个框架中获取数据。在代码中,我尝试遍历第一个框架,以获取第二个和第三个框架中的所有数据。
感谢您的任何提示。
我在控制台中运行了这个命令:document.querySelector("body > form > font > select > option"),但我找不到在 puppeteer 中运行它的方法。
// First attempt from the question: open the page, then try to read the
// innerHTML of the element named "fmtstatii" through the frame named "stanga",
// and log whatever comes back.
const puppeteer = require("puppeteer");
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
// NOTE(review): the URL is an empty string in this listing — presumably it was
// stripped when the question was copied; confirm the intended address.
await page.goto('');
// The callback below runs via page.evaluate and uses the DOM `document` object
// rather than Puppeteer's own frame API.
const iframeParagraph = await page.evaluate(() => {
// NOTE(review): getElementsByName returns a collection of elements, not a
// single element, so the property reads on `iframe` below presumably need an
// index such as [0] — confirm.
const iframe = document.getElementsByName("stanga");
// grab iframe's document object
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
// NOTE(review): same concern — `iframeP` is a collection, so `.innerHTML` on it
// is presumably undefined; confirm.
const iframeP = iframeDoc.getElementsByName("fmtstatii");
return iframeP.innerHTML;
});
console.log(iframeParagraph);
await browser.close();
})();
或
// Second attempt from the question: open the page in a visible browser, click
// the <select>, wait one second, read the text of one <option>, and return it
// as { statie }. This is the version that produces the error quoted below.
const puppeteer = require('puppeteer');
let scrape = async () => {
const browser = await puppeteer.launch({headless: false});
const page = await browser.newPage();
// NOTE(review): the URL is an empty string in this listing — presumably
// stripped; confirm the intended address.
await page.goto('');
// The argument here is the text of a JavaScript expression, not a CSS selector;
// this is the string reported as "not a valid selector" in the error below.
await page.click('document.querySelector("body > form > font > select")');
// Fixed one-second pause before reading the page.
await page.waitFor(1000);
const result = await page.evaluate(() => {
// Same pattern: document.querySelector is handed a string that itself contains
// a document.querySelector(...) call instead of a bare selector.
let statie = document.querySelector('document.querySelector("body > form > font > select > option")').innerText;
return {
statie
}
});
// Not awaited: the function returns without waiting for close() to settle.
browser.close();
return result;
};
// No .catch() is attached, so a rejection from scrape() is unhandled — compare
// the UnhandledPromiseRejectionWarning in the output below.
scrape().then((value) => {
console.log(value); // Success!
});
这是我得到的错误:
[(node:13308) UnhandledPromiseRejectionWarning: Error: Evaluation failed: DOMException: Failed to execute 'querySelector' on 'Document': 'document.querySelector("body > form > font > select")' is not a
valid selector.
at __puppeteer_evaluation_script__:1:33
at ExecutionContext._evaluateInternal (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\ExecutionContext.js:122:13)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at ExecutionContext.<anonymous> (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\helper.js:111:15)
at ElementHandle.$ (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\JSHandle.js:395:50)
at ElementHandle.<anonymous> (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\helper.js:112:23)
at DOMWorld.$ (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\DOMWorld.js:121:34)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at Frame.<anonymous> (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\helper.js:111:15)
at Page.click (D:\Zero\ratt_scrap\node_modules\puppeteer\lib\Page.js:986:29)
at scrape (D:\Zero\ratt_scrap\scrape.js:23:16)
at process._tickCallback (internal/process/next_tick.js:68:7)
(node:13308) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:13308) \[DEP0018\] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.]
你犯了几个错误:
您应该与 Frame 对象而不是 Page 对象交互。
const frame = await page.frames().find(frame => frame.name() === 'stanga'); // Find the right frame.
click() 方法需要 selector <string>,所以你不需要在 click() 方法里面添加 document.querySelector。
await frame.click('body > form > font > select');
要获得所有 innerText,您必须遍历元素。
不要忘记添加 await:您在 close 方法前漏掉了它。
await browser.close();
解决方案:
const puppeteer = require('puppeteer');

/**
 * Collects the text of every <option> of the <select> inside the frame named
 * "stanga" on http://ratt.ro/txt.
 *
 * @returns {Promise<string[]>} The innerText of each option, in document order.
 * @throws {Error} If the page has no frame named "stanga", or if Puppeteer
 *   fails to navigate, click, or find an option before its default timeout.
 */
const scrape = async () => {
  const browser = await puppeteer.launch({headless: false});
  // try/finally so the browser is closed even when a step below throws.
  try {
    const page = await browser.newPage();
    await page.goto('http://ratt.ro/txt');

    // page.frames() returns a plain array, so no await is needed for the lookup.
    const frame = page.frames().find((candidate) => candidate.name() === 'stanga');
    if (!frame) {
      // Fail with a clear message instead of a TypeError on `undefined.click`.
      throw new Error('Frame "stanga" was not found on the page');
    }

    const selectSelector = 'body > form > font > select';
    const optionSelector = `${selectSelector} > option`;

    await frame.click(selectSelector);
    // Wait for the options themselves instead of sleeping a fixed 1000 ms;
    // page.waitFor() is deprecated in newer Puppeteer releases.
    await frame.waitForSelector(optionSelector);

    // $$eval runs the callback inside the frame and returns its serialised result.
    return await frame.$$eval(optionSelector, (options) =>
      options.map((option) => option.innerText)
    );
  } finally {
    await browser.close();
  }
};

scrape()
  .then((value) => {
    console.log(value); // Success!
  })
  .catch((error) => {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
  });