卡在从第三帧抓取数据

Stuck on scraping data from third frame

我不是专业人士,只是想从网站上抓取一些数据。这里有人帮我选中了第一个帧(frame),但我还需要从第三个帧抓取数据,并把帧 1、帧 2 和帧 3 的数据拼接成一个结果。这是我的代码:

const puppeteer = require('puppeteer');

/**
 * Opens the target page, clicks one option in the "stanga" frame and one in
 * the "centru" frame, then collects the text of every `body > font` element
 * found by `page.$$eval`.
 *
 * The browser is always closed, even when a step throws.
 *
 * @returns {Promise<string[]>} innerText of each element matched by `body > font`.
 */
let scrape = async() => {
    const browser = await puppeteer.launch({
        headless: false,
        slowMo: 250
    });

    try {
        const page = await browser.newPage();
        await page.goto('', {
            // Option names are case-sensitive: the misspelled "waituntil" was
            // not recognised, so navigation did not wait for network idle.
            waitUntil: "networkidle0"
        });

        // First frame: click the 12th option of its select.
        const frame = page.frames().find(f => f.name() === 'stanga');
        const button = await frame.$('body > form > font > select > option:nth-child(12)');
        // Await the click so a failure rejects scrape() instead of being lost.
        await button.click();
        await page.waitFor(1000);

        // Second frame: click the first option of its select.
        const frame1 = page.frames().find(a => a.name() === 'centru');
        const select = await frame1.$('body > form > font > select > option:nth-child(1)');
        await page.waitFor(500);
        await select.click();
        await page.waitFor(500);

        // NOTE(review): page.$$eval queries the top-level document only;
        // elements that live inside a child frame are not matched by it.
        const result = await page.$$eval("body > font", (options) => {
            return options.map(option => option.innerText);
        });

        return result;
    } finally {
        await browser.close();
    }
};
// Run the scraper and print its result. Failures are reported explicitly
// instead of leaving the promise rejection unhandled.
scrape()
    .then((value) => {
        console.log(value);
    })
    .catch((error) => {
        console.error(error);
        process.exitCode = 1;
    });

感谢您的帮助。

您需要改进抓取脚本:不仅要点击 select 中的选项,还要从 select 元素中提取被选中项的值。

  // Look up the frame named "stanga" among the page's frames.
  const frame = await page.frames().find(candidate => candidate.name() === "stanga");

  // Handle to the <option> that is the 12th child of the select in that frame.
  const select1 = await frame.$("body > form > font > select > option:nth-child(12)");

  // Read the option's text content by evaluating inside the frame.
  const select1Value = await frame.evaluate(option => option.textContent, select1);

select1Value 将保存 select 框中被选中项的值。对下一个帧中的 select2 也必须执行相同的操作。

在您的代码中,您没有选取第三个帧(frame3),这就是您无法从中读取数据的原因。

我已经更新了你的代码,这是我可以从你的代码中得到的结果:

$ node scrape.js
Frame1: AT_Miresei_1
Frame2:  [1]  E1
Frame3: Linia: E12019-07-25 22:29:13Sosire1: 22:55 Sosire2: 23:00

这就是我的最终结果,但还有很多需要改进的地方(代码质量和可读性)。

const puppeteer = require("puppeteer");

/**
 * Scrapes three frames of http://ratt.ro/txt: reads and clicks an option in
 * the "stanga" frame, reads and clicks an option in the "centru" frame, then
 * reads the body text of the "dreapta" frame.
 *
 * The browser is always closed, even when a step throws.
 *
 * @returns {Promise<string>} Three lines of the form
 *   "Frame1: <text>", "Frame2: <text>", "Frame3: <trimmed body text>".
 */
let scrape = async () => {
  const browser = await puppeteer.launch();

  try {
    const page = await browser.newPage();
    await page.goto("http://ratt.ro/txt", {
      // Option names are case-sensitive: the misspelled "waituntil" was not
      // recognised, so navigation did not wait for network idle.
      waitUntil: "networkidle0"
    });

    // Frame 1: read the 12th option's text, then click it.
    const frame = page.frames().find(f => f.name() === "stanga");
    const button = await frame.$(
      "body > form > font > select > option:nth-child(12)"
    );
    const select1Value = await frame.evaluate(
      option => option.textContent,
      button
    );
    // Await the click so a failure rejects scrape() instead of being lost.
    await button.click();
    await page.waitFor(1000);

    // Frame 2: read the first option's text, then click it.
    const frame1 = page.frames().find(a => a.name() === "centru");
    const select = await frame1.$(
      "body > form > font > select > option:nth-child(1)"
    );
    const select2Value = await frame1.evaluate(
      option => option.textContent,
      select
    );
    await page.waitFor(200);
    await select.click();
    await page.waitFor(200);

    // Frame 3: take the whole body text.
    const frame3 = page.frames().find(f => f.name() === "dreapta");
    const element = await frame3.$("body");
    const frame3Text = await frame3.evaluate(
      body => body.textContent,
      element
    );

    return `Frame1: ${select1Value}\nFrame2: ${select2Value}\nFrame3: ${frame3Text.trim()}`;
  } finally {
    await browser.close();
  }
};

// Run the scraper and print its result. Failures are reported explicitly
// instead of leaving the promise rejection unhandled.
scrape()
  .then(value => {
    console.log(value);
  })
  .catch(error => {
    console.error(error);
    process.exitCode = 1;
  });

我已经修复了我们的脚本:

const puppeteer = require('puppeteer');

/**
 * Visits http://example.com/txt and walks over the page's frames:
 *  - for "stanga" and "centru", clicks the first matching option, waits one
 *    second and records the innerText of every option in that frame;
 *  - for "dreapta", records the body's innerText split on runs of newlines.
 *
 * The browser is always closed, even when a step throws.
 *
 * @returns {Promise<Array<Object<string, string[]>>>} One single-key object
 *   per processed frame, keyed by frame name, in the order the frames were
 *   visited.
 */
let scrape = async () => {
  const browser = await puppeteer.launch({ headless: false });

  try {
    const page = await browser.newPage();
    await page.goto('http://example.com/txt', { waitUntil: "networkidle2" });

    const optionSelector = 'body > form > font > select > option';
    const expectedFrames = ['stanga', 'centru'];
    const scrapedText = [];

    // Returns the innerText of every option element inside the given frame.
    const getOptions = (frame) =>
      frame.$$eval(optionSelector, (options) =>
        options.map((option) => option.innerText)
      );

    // The frame list is captured once, before any click is made.
    for (const frame of page.frames()) {
      const name = frame.name();

      if (expectedFrames.includes(name)) {
        await frame.click(optionSelector);
        await page.waitFor(1000);
        const result = await getOptions(frame);

        scrapedText.push({ [name]: result });
      } else if (name === 'dreapta') {
        const result = await frame.$eval('body', (elm) => elm.innerText);

        scrapedText.push({ [name]: result.split(/\n+/g) });
      }
    }

    return scrapedText;
  } finally {
    await browser.close();
  }
};

// Run the scraper and print its result. Failures are reported explicitly
// instead of leaving the promise rejection unhandled.
scrape()
  .then((value) => {
    console.log(value);
  })
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

输出:

[{ 
   stanga: ['Mures','A Saguna', 'A.Guttenbrun_1', ... and more items]
 },
 {
   centru: ['[0] E3'] 
 },
 { 
   dreapta: ['Linia: E3','2019-07-25 23:19:40','Sosire1: 23:39','Sosire2: 23:41'] 
}]