如何减少代码的运行时间?
How can I decrease the runtime of my code?
是否可以对这段代码进行任何修改以使其 运行 更快?我的代码目前正在运行,但需要 10 个小时以上才能抓取所有 50,000 个配置文件。请告诉我如何减少 运行 时间。谢谢!
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException
Data = []
text = "test"
driver = webdriver.Chrome("/Users/nzalle/Downloads/chromedriver")
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
while len(profile_count) < count: # Get links up to "count"
body.send_keys(Keys.END)
sleep(1)
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
for link in profile_count: # Calling up links
temp = link.get_attribute('href') # temp for
driver.execute_script("window.open('');") # open new tab
driver.switch_to.window(driver.window_handles[1]) # focus new tab
driver.get(temp)
# scrape code
Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
IssuedBy = "Board of Certified Safety Professionals"
CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
try:
AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
except NoSuchElementException:
AccreditedBy = "N/A"
try:
Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
except NoSuchElementException:
Expires = "N/A"
info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
Data.extend(info)
driver.close()
driver.switch_to.window(driver.window_handles[0])
with open("Spredsheet.txt", "w") as output:
output.write(','.join(Data))
driver.close()
除了将 issuedBy
移动到 for 循环之外,因为它不需要重复声明(这仍然只会产生非常非常微不足道的差异),您似乎没有什么可以做的改变。由于这是从互联网上抓取数据,因此最大的限制因素将是您的宽带速度。总的来说,你的系统有一个 O(n) 的大 O,这意味着随着数据集的增加,处理时间线性增加。
处理此脚本的时间肯定主要是宽带速度的瓶颈,您无法控制宽带速度,因此,不,您可以更改的不多
是否可以对这段代码进行任何修改以使其 运行 更快?我的代码目前正在运行,但需要 10 个小时以上才能抓取所有 50,000 个配置文件。请告诉我如何减少 运行 时间。谢谢!
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException
Data = []
text = "test"
driver = webdriver.Chrome("/Users/nzalle/Downloads/chromedriver")
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
while len(profile_count) < count: # Get links up to "count"
body.send_keys(Keys.END)
sleep(1)
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")
for link in profile_count: # Calling up links
temp = link.get_attribute('href') # temp for
driver.execute_script("window.open('');") # open new tab
driver.switch_to.window(driver.window_handles[1]) # focus new tab
driver.get(temp)
# scrape code
Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
IssuedBy = "Board of Certified Safety Professionals"
CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
try:
AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
except NoSuchElementException:
AccreditedBy = "N/A"
try:
Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
except NoSuchElementException:
Expires = "N/A"
info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
Data.extend(info)
driver.close()
driver.switch_to.window(driver.window_handles[0])
with open("Spredsheet.txt", "w") as output:
output.write(','.join(Data))
driver.close()
除了将 issuedBy
移动到 for 循环之外,因为它不需要重复声明(这仍然只会产生非常非常微不足道的差异),您似乎没有什么可以做的改变。由于这是从互联网上抓取数据,因此最大的限制因素将是您的宽带速度。总的来说,你的系统有一个 O(n) 的大 O,这意味着随着数据集的增加,处理时间线性增加。
处理此脚本的时间肯定主要是宽带速度的瓶颈,您无法控制宽带速度,因此,不,您可以更改的不多