在 Python 中使用 PyQt5 和 Beautiful Soup 抓取 Web 以获取其内容

Scraping Web to Get its contents with PyQt5 and Beautiful Soup in Python

我正在尝试将此处给出的解决方案从 PyQt4 移植到 PyQt5,作为练习。

不知何故,收集到的 HTML 代码在中途丢失了。我在各个方法中加了一些 print() 来了解发生了什么。Callable 方法中的 print() 显示了 HTML 代码;但是在 handleLoadFinished 方法中它是 None,因此函数 funcA 和 funcB 无法工作。

我正在使用的代码是:

import sys, signal
from bs4        import BeautifulSoup
from bs4.dammit import UnicodeDammit
from PyQt5           import QtCore, QtGui
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage as QWebPage

class WebPage(QWebPage):
  """Page that loads (url, func) items one at a time and calls func per page.

  NOTE: this is the non-working PyQt5 port the question is about. The
  functions end up receiving None instead of the HTML; the comments in
  handleLoadFinished point at the lines responsible.
  """

  def __init__(self):
    QWebPage.__init__(self)
    # Run handleLoadFinished every time a page load completes.
    self.loadFinished.connect(self.handleLoadFinished)

  def process(self, items):
    """Store an iterator over `items` and start loading the first one."""
    self._items = iter(items)
    self.fetchNext()

  def fetchNext(self):
    """Start loading the next (url, func) item.

    Returns True when a load was started, False when the iterator is
    exhausted.
    """
    try:
        self._url, self._func = next(self._items)
        self.load(QtCore.QUrl(self._url))
    except StopIteration:
        return False
    return True

  def handleLoadFinished(self):
    """Slot for loadFinished: pass the page to the current item's function."""
    # toHtml() is asynchronous: the HTML is delivered to self.Callable later,
    # and the call itself gives back None, so A is None here.
    A = self.toHtml(self.Callable)
    print('\n\n\n\n\n')
    print("####################### handleLoadFinished: ", A)
    # Same problem: the second argument is toHtml()'s return value (None),
    # not the page's HTML, so self._func never sees the markup.
    self._func(self._url, self.toHtml(self.Callable))
    if not self.fetchNext():
        print('# processing complete')
        #self._exit()

  def Callable(self, html_str):
    """Callback given to toHtml(); this is where the HTML actually arrives."""
    self.html = html_str
    print('####################  Callable html:', self.html)

  def _exit(self):
    """Quit the running QApplication."""
    print("exiting...")
    QApplication.instance().quit()

def funcA(url, html):
  """Print the url and raw html, then parse the html with "html.parser"."""
  for label, value in (('# processing:', url), ('html:', html)):
    print(label, value)
  document = BeautifulSoup(markup=html, features="html.parser")
  # do stuff with document...

def funcB(url, html):
  """Print the url and raw html, then parse the Unicode-normalised markup.

  url  -- address of the page that was loaded.
  html -- markup of that page.
  """
  print('# processing:', url)
  print('html:', html)
  # Name the parser explicitly (the same one funcA uses). Without it
  # BeautifulSoup picks whichever parser happens to be installed and emits a
  # warning, so the resulting tree can differ from machine to machine.
  soup = BeautifulSoup(UnicodeDammit(html).unicode_markup, "html.parser")
  # do stuff with soup...

# Work queue: each entry pairs a URL to load with the function that is called
# for that page as func(url, html).
items = [
          ('http://whosebug.com', funcA),
          ('http://google.com', funcB),
        ]

# Restore the default SIGINT action so Ctrl+C ends the process while the Qt
# event loop is running.
signal.signal(signal.SIGINT, signal.SIG_DFL)
print('Press Ctrl+C to quit\n')
# The QApplication must exist before the page object is created.
app     = QApplication(sys.argv)
webpage = WebPage()
webpage.process(items)
# Run the event loop and use its result as the process exit status.
sys.exit(app.exec_())

任何帮助我理解和纠正它的建议将不胜感激!

在 QtWebEngine 中获取 HTML 是异步的:toHtml() 本身不返回 HTML(所以你得到的是 None),而是稍后把 HTML 传给你提供的回调函数。因此必须把 self._func 作为回调传给 toHtml(),并用 functools.partial() 预先绑定 url 参数:

from functools import partial
import signal
import sys

from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit

from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage as QWebPage


class WebPage(QWebPage):
    """Load a queue of pages one at a time and hand each page's HTML to a function.

    Usage: call ``process(items)`` with an iterable of ``(url, func)`` pairs.
    For every pair the url is loaded and, once its HTML is available,
    ``func(url, html)`` is called. The next url is only requested after that
    call has returned.
    """

    def __init__(self):
        super().__init__()
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, items):
        """Start working through ``items``, an iterable of ``(url, func)`` pairs."""
        self._items = iter(items)
        self.fetchNext()

    def fetchNext(self):
        """Start loading the next queued item.

        Returns:
            True if a load was started, False if the queue is exhausted.
        """
        # Keep the try body down to the one call that signals exhaustion, so
        # a StopIteration from anywhere else is never mistaken for it.
        try:
            self._url, self._func = next(self._items)
        except StopIteration:
            return False
        self.load(QUrl(self._url))
        return True

    def handleLoadFinished(self):
        """Slot for ``loadFinished``: request the HTML of the loaded page."""
        # toHtml() is asynchronous and returns nothing; the markup is passed
        # to the callback later. Bind the current url and function now so the
        # callback does not depend on attributes that fetchNext() overwrites.
        self.toHtml(partial(self._handleHtml, self._url, self._func))

    def _handleHtml(self, url, func, html):
        """Deliver ``html`` to ``func``, then move on to the next queued item."""
        func(url, html)
        # Advance only here, after the current page's HTML has been handled:
        # starting the next load earlier would replace the page while its
        # toHtml() request is still pending, and would report completion
        # before the last page has been processed.
        if not self.fetchNext():
            print("# processing complete")


def funcA(url, html):
    """Print the url and raw html, then parse the html with "html.parser"."""
    for label, value in (("# processing:", url), ("html:", html)):
        print(label, value)
    document = BeautifulSoup(markup=html, features="html.parser")


def funcB(url, html):
    """Print the url and raw html, then parse the Unicode-normalised markup.

    Args:
        url: address of the page that was loaded.
        html: markup of that page.
    """
    print("# processing:", url)
    print("html:", html)
    # Name the parser explicitly (the same one funcA uses). Without it
    # BeautifulSoup picks whichever parser happens to be installed and emits
    # a warning, so the resulting tree can differ from machine to machine.
    soup = BeautifulSoup(UnicodeDammit(html).unicode_markup, "html.parser")


# Work queue: each entry pairs a URL to load with the function that is called
# for that page as func(url, html).
items = [
    ("http://whosebug.com", funcA),
    ("http://google.com", funcB),
]


def main():
    """Create the Qt application, queue every entry of ``items`` and run the event loop."""
    # Restore the default SIGINT action so Ctrl+C ends the process while the
    # Qt event loop is running.
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    print("Press Ctrl+C to quit\n")

    qt_app = QApplication(sys.argv)
    page = WebPage()
    page.process(items)

    exit_code = qt_app.exec_()
    sys.exit(exit_code)


# Start the scraper only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()