如何在 openpyxl 中使用 .append() 来保存从多个网页抓取的内容
How to use .append() with openpyxl to scrape content of multiple web pages
我正在尝试编写一段代码,从网站上抓取《指环王》的剧本并将其粘贴到 Excel 电子表格中。我先为单个页面编写了代码,一切正常;但是,当我添加一个循环来遍历包含剧本的所有页面时,遇到了一个问题:每抓取一个新页面,电子表格中已有的内容就会被覆盖。
查看 openpyxl 的文档后,我了解到应该使用 .append(),并且我尝试在 if 语句创建变量之后这样做(此代码中未显示)。不用说,它没有奏效:我最终得到的电子表格里只有第一页第一个单元格的内容。
我还阅读了一些旧帖子,其中建议使用 .cell() 指定要粘贴文本的单元格;但是,我不确定这是否是最佳解决方案,因为每个网页中表格的行数都不同。
有人能给我指出正确的方向吗?
from selenium import webdriver
import os
import openpyxl
from openpyxl import Workbook
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()

# The script is spread over several pages whose URLs differ only in a
# "<first>to<last>" number range, so the URL is assembled from three parts.
url1 = 'http://www.ageofthering.com/atthemovies/scripts/fellowshipofthering'
url2 = 'to'
url3 = '.php'

# XPath pieces used to address one table cell: .../tr[<r>]/td[<c>]
first = "//*[@id='AutoNumber1']/tbody/tr["
second = "]/td["
third = "]"

fname = 'script.xlsx'

try:
    # main loop: browses the various pages of the script
    # f: first number in page url
    for f in range(1, 38, 4):
        # s: second number in page url
        s = f + 3
        url = url1 + str(f) + url2 + str(s) + url3
        driver.get(url)

        # number of rows in the table, and number of cells in its third row
        rows = len(driver.find_elements(
            By.XPATH, "//*[@id='AutoNumber1']/tbody/tr"))
        columns = len(driver.find_elements(
            By.XPATH, "//*[@id='AutoNumber1']/tbody/tr[3]/td"))

        # loops through the rows (r) and columns (c) of each page
        for r in range(1, rows + 1):
            for c in range(1, columns + 1):
                final = first + str(r) + second + str(c) + third
                try:
                    data = driver.find_element(By.XPATH, final).text
                except NoSuchElementException:
                    # rows with fewer cells than row 3 (e.g. only 1 column)
                    continue

                # writes content of table in an Excel spreadsheet
                if os.path.exists(fname):
                    workbook = openpyxl.load_workbook(fname)
                    worksheet = workbook['Sheet']
                else:
                    workbook = Workbook()
                    worksheet = workbook.active
                # NOTE: r restarts at 1 on every page, so each page is
                # written over the rows of the previous one. This is the
                # overwrite problem the question asks about and is kept
                # here on purpose.
                worksheet.cell(row=r, column=c).value = data
                workbook.save(fname)
finally:
    # closes Chrome even if scraping or saving failed
    driver.quit()
两件事:
- Excel 工作表中的行与网页表格中的行并不一一对应,因此 Excel 的行号需要单独维护。
- 在每一行都打开并保存工作表会大大拖慢整个过程。每个网页保存一次应该就足够了,以防中途出现问题。
这是更新后的代码:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
import os
import openpyxl
from openpyxl import Workbook
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("disable-extensions")
options.add_argument("disable-plugins")
# prevent load error - Error Loading Extension - Failed to load extension
# from ... - Could not load extension from ... Loading of unpacked
# extensions is disabled
options.experimental_options["useAutomationExtension"] = False
# NOTE(review): newer Selenium 4 releases no longer accept the driver path as
# a positional argument; there it has to be passed as
# service=Service(<path>) - confirm against the installed Selenium version.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

# The script is spread over several pages whose URLs differ only in a
# "<first>to<last>" number range, so the URL is assembled from three parts.
url1 = 'http://www.ageofthering.com/atthemovies/scripts/fellowshipofthering'
url2 = 'to'
url3 = '.php'

row_xpath = "//*[@id='AutoNumber1']/tbody/tr"
fname = 'script.xlsx'

# Open the workbook once; it is saved after every page so that a failure
# half-way through keeps the pages scraped so far.
if os.path.exists(fname):
    workbook = openpyxl.load_workbook(fname)
    worksheet = workbook['Sheet']
else:
    workbook = Workbook()
    worksheet = workbook.active

try:
    # main loop: browses the various pages of the script
    # f: first number in page url
    for f in range(1, 38, 4):
        # s: second number in page url
        s = f + 3
        url = url1 + str(f) + url2 + str(s) + url3
        driver.get(url)

        # progress output: last sheet row used so far
        print('Paste Row', worksheet.max_row)

        for table_row in driver.find_elements(By.XPATH, row_xpath):
            # Read the cells this row actually has, so rows with a single
            # column need no exception handling.
            values = [cell.text
                      for cell in table_row.find_elements(By.XPATH, "./td")]
            if not any(values):
                # no cells, or only empty ones: nothing to store
                continue
            if "Scene" in values[0] and "~" in values[0]:
                # add extra empty row if new scene
                worksheet.append([])
            # append() always writes below the last row of the sheet, so
            # pages can never overwrite each other.
            worksheet.append(values)

        workbook.save(fname)
finally:
    # closes Chrome even if scraping or saving failed
    driver.quit()
我正在尝试编写一段代码,从网站上抓取《指环王》的剧本并将其粘贴到 Excel 电子表格中。我先为单个页面编写了代码,一切正常;但是,当我添加一个循环来遍历包含剧本的所有页面时,遇到了一个问题:每抓取一个新页面,电子表格中已有的内容就会被覆盖。
查看 openpyxl 的文档后,我了解到应该使用 .append(),并且我尝试在 if 语句创建变量之后这样做(此代码中未显示)。不用说,它没有奏效:我最终得到的电子表格里只有第一页第一个单元格的内容。
我还阅读了一些旧帖子,其中建议使用 .cell() 指定要粘贴文本的单元格;但是,我不确定这是否是最佳解决方案,因为每个网页中表格的行数都不同。
有人能给我指出正确的方向吗?
from selenium import webdriver
import os
import openpyxl
from openpyxl import Workbook
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()

# The script is spread over several pages whose URLs differ only in a
# "<first>to<last>" number range, so the URL is assembled from three parts.
url1 = 'http://www.ageofthering.com/atthemovies/scripts/fellowshipofthering'
url2 = 'to'
url3 = '.php'

# XPath pieces used to address one table cell: .../tr[<r>]/td[<c>]
first = "//*[@id='AutoNumber1']/tbody/tr["
second = "]/td["
third = "]"

fname = 'script.xlsx'

try:
    # main loop: browses the various pages of the script
    # f: first number in page url
    for f in range(1, 38, 4):
        # s: second number in page url
        s = f + 3
        url = url1 + str(f) + url2 + str(s) + url3
        driver.get(url)

        # number of rows in the table, and number of cells in its third row
        rows = len(driver.find_elements(
            By.XPATH, "//*[@id='AutoNumber1']/tbody/tr"))
        columns = len(driver.find_elements(
            By.XPATH, "//*[@id='AutoNumber1']/tbody/tr[3]/td"))

        # loops through the rows (r) and columns (c) of each page
        for r in range(1, rows + 1):
            for c in range(1, columns + 1):
                final = first + str(r) + second + str(c) + third
                try:
                    data = driver.find_element(By.XPATH, final).text
                except NoSuchElementException:
                    # rows with fewer cells than row 3 (e.g. only 1 column)
                    continue

                # writes content of table in an Excel spreadsheet
                if os.path.exists(fname):
                    workbook = openpyxl.load_workbook(fname)
                    worksheet = workbook['Sheet']
                else:
                    workbook = Workbook()
                    worksheet = workbook.active
                # NOTE: r restarts at 1 on every page, so each page is
                # written over the rows of the previous one. This is the
                # overwrite problem the question asks about and is kept
                # here on purpose.
                worksheet.cell(row=r, column=c).value = data
                workbook.save(fname)
finally:
    # closes Chrome even if scraping or saving failed
    driver.quit()
两件事:
- Excel 工作表中的行与网页表格中的行并不一一对应,因此 Excel 的行号需要单独维护。
- 在每一行都打开并保存工作表会大大拖慢整个过程。每个网页保存一次应该就足够了,以防中途出现问题。
这是更新后的代码:
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
import os
import openpyxl
from openpyxl import Workbook
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("disable-extensions")
options.add_argument("disable-plugins")
# prevent load error - Error Loading Extension - Failed to load extension
# from ... - Could not load extension from ... Loading of unpacked
# extensions is disabled
options.experimental_options["useAutomationExtension"] = False
# NOTE(review): newer Selenium 4 releases no longer accept the driver path as
# a positional argument; there it has to be passed as
# service=Service(<path>) - confirm against the installed Selenium version.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)

# The script is spread over several pages whose URLs differ only in a
# "<first>to<last>" number range, so the URL is assembled from three parts.
url1 = 'http://www.ageofthering.com/atthemovies/scripts/fellowshipofthering'
url2 = 'to'
url3 = '.php'

row_xpath = "//*[@id='AutoNumber1']/tbody/tr"
fname = 'script.xlsx'

# Open the workbook once; it is saved after every page so that a failure
# half-way through keeps the pages scraped so far.
if os.path.exists(fname):
    workbook = openpyxl.load_workbook(fname)
    worksheet = workbook['Sheet']
else:
    workbook = Workbook()
    worksheet = workbook.active

try:
    # main loop: browses the various pages of the script
    # f: first number in page url
    for f in range(1, 38, 4):
        # s: second number in page url
        s = f + 3
        url = url1 + str(f) + url2 + str(s) + url3
        driver.get(url)

        # progress output: last sheet row used so far
        print('Paste Row', worksheet.max_row)

        for table_row in driver.find_elements(By.XPATH, row_xpath):
            # Read the cells this row actually has, so rows with a single
            # column need no exception handling.
            values = [cell.text
                      for cell in table_row.find_elements(By.XPATH, "./td")]
            if not any(values):
                # no cells, or only empty ones: nothing to store
                continue
            if "Scene" in values[0] and "~" in values[0]:
                # add extra empty row if new scene
                worksheet.append([])
            # append() always writes below the last row of the sheet, so
            # pages can never overwrite each other.
            worksheet.append(values)

        workbook.save(fname)
finally:
    # closes Chrome even if scraping or saving failed
    driver.quit()