Redfin Selenium Scraper 脚本正在输出重复的行
Redfin Selenium Scraper Script is outputting duplicate rows
我用 Selenium 构建了一个网络爬虫来抓取 redfin.com 上的 redfin 估计数据。我遇到的问题是,当我将抓取的数据输出到 csv 时,它经常多次复制行值,我不确定如何修复它。
这是我的代码:
from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, InvalidElementStateException
import pandas as pd
import time
from bs4 import BeautifulSoup
import os
from datetime import datetime
input_file = ".\pa-property-value-tools\input\addresses.xlsx"
input_df = pd.read_excel(input_file)
input_df['Address'] = input_df['Address'].astype(str)
output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate'])
driver = webdriver.Chrome('C:\Users\user\Downloads\chromedriver_win32 (1)\chromedriver.exe')
#driver = webdriver.Firefox(executable_path = 'C:\Users\Morgan.weiss\Downloads\geckodriver-v0.24.0-win64\geckodriver.exe')
def append_date_timestamp(filepath, extension):
    """Return *filepath* suffixed with the current timestamp and *extension*.

    Example: append_date_timestamp("out", "csv") -> "out-2020-01-02 03-04-05.csv"
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H-%M-%S")
    return f"{filepath}-{stamp}.{extension}"
def get_redfin_estimate(address):
    """Look up *address* on redfin.com and return (estimate_text, page_url).

    Uses the module-level Selenium ``driver``. Returns ('N/A', 'N/A') when the
    search result is not an address match or no estimate element can be found.
    """
    driver.get('https://www.redfin.com/')
    print(address)
    # Type the address into the search box, then wait for the autocomplete
    # popup to render. NOTE(review): a fixed sleep is fragile — an explicit
    # WebDriverWait on the popup selector would be more reliable.
    driver.find_element_by_name('searchInputBox').clear()
    driver.find_element_by_name('searchInputBox').send_keys(address)
    time.sleep(3)
    try:
        # Autocomplete results popup: click the active row only when the
        # expanded section is labelled "ADDRESSES"; any other section type
        # means the query did not resolve to a property address.
        pop_up = driver.find_element_by_css_selector("div[data-rf-test-name='expanded-results']")
        if pop_up:
            types = pop_up.find_elements_by_class_name("expanded-row-content")
            for ele in types:
                val = ele.find_element_by_class_name("expanded-type")
                if val.text == "ADDRESSES":
                    ele.find_element_by_css_selector("div[data-rf-test-name='item-row-active']").click()
                else:
                    return ('N/A', 'N/A')
    except:  # NOTE(review): bare except silently swallows any popup failure
        pass
    # Parse whatever page the click (hopefully) navigated to.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        # Primary page layout: the estimate is the first child div of the
        # element with class 'avm'.
        # NOTE(review): {'class', 'avm'} is a *set* literal — likely meant the
        # dict {'class': 'avm'}; confirm against bs4's find() attrs handling.
        price1 = soup.find('div', {'class', 'avm'}).div.text
        print(price1)
        url = driver.current_url if driver.current_url else 'N/A'
        return (price1, url)
    except AttributeError:
        try:
            # Fallback layout: an 'avmLabel' span followed by a 'value' span.
            time.sleep(3)
            price2 = soup.find('span', class_='avmLabel').find_next('span', class_='value').text
            print(price2)
            url = driver.current_url if driver.current_url else 'N/A'
            return (price2, url)
        except:
            # Neither layout matched — report the lookup as unavailable.
            return ('N/A', 'N/A')
# Timestamped output path; raw string so backslashes are taken literally.
outputfile = append_date_timestamp(r".\pa-property-value-tools\output\output", "csv")
count = 0
exception = 0
wait_after = 10  # flush the accumulated rows to disk every N lookups
current_date = datetime.now().strftime("%Y-%m-%d")
driver.get('https://www.redfin.com/')
time.sleep(100)


def _flush(df, path):
    """Append *df* to the CSV at *path*, writing the header only on first creation."""
    if not os.path.isfile(path):
        df.to_csv(path, index=False)
    else:
        df.to_csv(path, mode='a', index=False, header=False)


for row in input_df.itertuples():
    try:
        count += 1
        estimate, url_source = get_redfin_estimate(row.Address)
        output_df = output_df.append({
            'Account': row.Account,
            'Address': row.Address,
            'redfin_estimate': estimate,
            'url': url_source,
            'date_pulled': current_date,
        }, ignore_index=True)
        if count % wait_after == 0:
            _flush(output_df, outputfile)
            # BUG FIX: reset the in-memory buffer after each flush. Without
            # this, every later mode='a' write re-appends ALL previously
            # written rows — the source of the duplicate lines in the CSV.
            output_df = pd.DataFrame(
                columns=['Account', 'Address', 'redfin_estimate', 'url', 'date_pulled'])
            print("Waiting 20 seconds for every " + str(wait_after) + " calls")
            time.sleep(20)
        time.sleep(1)
    except (NoSuchElementException, InvalidElementStateException) as e:
        print(e)
        exception += 1
        print(exception)
        continue
print(exception)
# Flush any rows left over from an incomplete final batch.
if not output_df.empty:
    _flush(output_df, outputfile)
driver.quit()
我认为导致此问题的部分在这里:
outputfile = append_date_timestamp(".\pa-property-value-tools\output\output", "csv")
count = 0
exception = 0
wait_after = 10
current_date = datetime.now().strftime("%Y-%m-%d")
driver.get('https://www.redfin.com/')
time.sleep(100)
for row in input_df.itertuples():
try:
count += 1
estimate,url_source = get_redfin_estimate(row.Address)
output_df = output_df.append({
'Account': row.Account,
'Address': row.Address,
'redfin_estimate':estimate,
'url':url_source,
'date_pulled':current_date
},
ignore_index=True,
)
if count % wait_after == 0:
# if file does not exist write header
if not os.path.isfile(outputfile):
output_df.to_csv(outputfile, index=False)
else: # else it exists so append without writing the header
output_df.to_csv(outputfile, mode='a', index=False, header=False)
#output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate', 'url', 'date_pulled'])
print("Waiting 20 seconds for every " + str(wait_after) + " calls")
time.sleep(20)
time.sleep(1)
except (NoSuchElementException,InvalidElementStateException) as e:
print(e)
exception += 1
print(exception)
continue
print(exception)
if count % wait_after > 0:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
我不确定是什么问题,非常感谢任何建议。
编辑:
对于标记为有问题的代码:这段代码的作用是统计我们遍历地址的次数。每遍历 10 次,就把这批结果输出到 csv 中。每次调用之间都有等待时间,这样我们的 IP 地址就不会被封禁。问题就出在这几行代码中,不知为何我得到了重复的行。
您似乎没有在写入 csv 文件后重置 output_df
。
您在此处附加到数据框:
output_df = output_df.append({
'Account': row.Account,
'Address': row.Address,
'redfin_estimate':estimate,
'url':url_source,
'date_pulled':current_date
},
ignore_index=True,
)
然后再次将 output_df
的内容附加到 csv 文件 mode='a'
:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
这就是多次写入行的原因。
写入 csv 文件后重置数据帧应该可以解决此问题:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
output_df = pd.DataFrame()
我用 Selenium 构建了一个网络爬虫来抓取 redfin.com 上的 redfin 估计数据。我遇到的问题是,当我将抓取的数据输出到 csv 时,它经常多次复制行值,我不确定如何修复它。
这是我的代码:
from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, InvalidElementStateException
import pandas as pd
import time
from bs4 import BeautifulSoup
import os
from datetime import datetime
input_file = ".\pa-property-value-tools\input\addresses.xlsx"
input_df = pd.read_excel(input_file)
input_df['Address'] = input_df['Address'].astype(str)
output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate'])
driver = webdriver.Chrome('C:\Users\user\Downloads\chromedriver_win32 (1)\chromedriver.exe')
#driver = webdriver.Firefox(executable_path = 'C:\Users\Morgan.weiss\Downloads\geckodriver-v0.24.0-win64\geckodriver.exe')
def append_date_timestamp(filepath, extension):
return (
filepath + "-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + "." + extension
)
def get_redfin_estimate(address):
driver.get('https://www.redfin.com/')
print(address)
driver.find_element_by_name('searchInputBox').clear()
driver.find_element_by_name('searchInputBox').send_keys(address)
time.sleep(3)
try:
pop_up = driver.find_element_by_css_selector("div[data-rf-test-name='expanded-results']")
if pop_up:
types = pop_up.find_elements_by_class_name("expanded-row-content")
for ele in types:
val = ele.find_element_by_class_name("expanded-type")
if val.text == "ADDRESSES":
ele.find_element_by_css_selector("div[data-rf-test-name='item-row-active']").click()
else:
return ('N/A', 'N/A')
except:
pass
soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
price1 = soup.find('div', {'class', 'avm'}).div.text
print(price1)
url = driver.current_url if driver.current_url else 'N/A'
return(price1, url)
except AttributeError:
try:
time.sleep(3)
price2 = soup.find('span',class_='avmLabel').find_next('span', class_='value').text
print(price2)
url = driver.current_url if driver.current_url else 'N/A'
return(price2, url)
except:
return('N/A', 'N/A')
outputfile = append_date_timestamp(".\pa-property-value-tools\output\output", "csv")
count = 0
exception = 0
wait_after = 10
current_date = datetime.now().strftime("%Y-%m-%d")
driver.get('https://www.redfin.com/')
time.sleep(100)
for row in input_df.itertuples():
try:
count += 1
estimate,url_source = get_redfin_estimate(row.Address)
output_df = output_df.append({
'Account': row.Account,
'Address': row.Address,
'redfin_estimate':estimate,
'url':url_source,
'date_pulled':current_date
},
ignore_index=True,
)
if count % wait_after == 0:
# if file does not exist write header
if not os.path.isfile(outputfile):
output_df.to_csv(outputfile, index=False)
else: # else it exists so append without writing the header
output_df.to_csv(outputfile, mode='a', index=False, header=False)
#output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate', 'url', 'date_pulled'])
print("Waiting 20 seconds for every " + str(wait_after) + " calls")
time.sleep(20)
time.sleep(1)
except (NoSuchElementException,InvalidElementStateException) as e:
print(e)
exception += 1
print(exception)
continue
print(exception)
if count % wait_after > 0:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
driver.quit()
我认为导致此问题的部分在这里:
outputfile = append_date_timestamp(".\pa-property-value-tools\output\output", "csv")
count = 0
exception = 0
wait_after = 10
current_date = datetime.now().strftime("%Y-%m-%d")
driver.get('https://www.redfin.com/')
time.sleep(100)
for row in input_df.itertuples():
try:
count += 1
estimate,url_source = get_redfin_estimate(row.Address)
output_df = output_df.append({
'Account': row.Account,
'Address': row.Address,
'redfin_estimate':estimate,
'url':url_source,
'date_pulled':current_date
},
ignore_index=True,
)
if count % wait_after == 0:
# if file does not exist write header
if not os.path.isfile(outputfile):
output_df.to_csv(outputfile, index=False)
else: # else it exists so append without writing the header
output_df.to_csv(outputfile, mode='a', index=False, header=False)
#output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate', 'url', 'date_pulled'])
print("Waiting 20 seconds for every " + str(wait_after) + " calls")
time.sleep(20)
time.sleep(1)
except (NoSuchElementException,InvalidElementStateException) as e:
print(e)
exception += 1
print(exception)
continue
print(exception)
if count % wait_after > 0:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
我不确定是什么问题,非常感谢任何建议。
编辑:
对于标记为有问题的代码:这段代码的作用是统计我们遍历地址的次数。每遍历 10 次,就把这批结果输出到 csv 中。每次调用之间都有等待时间,这样我们的 IP 地址就不会被封禁。问题就出在这几行代码中,不知为何我得到了重复的行。
您似乎没有在写入 csv 文件后重置 output_df
。
您在此处附加到数据框:
output_df = output_df.append({
'Account': row.Account,
'Address': row.Address,
'redfin_estimate':estimate,
'url':url_source,
'date_pulled':current_date
},
ignore_index=True,
)
然后再次将 output_df
的内容附加到 csv 文件 mode='a'
:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
这就是多次写入行的原因。
写入 csv 文件后重置数据帧应该可以解决此问题:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
output_df = pd.DataFrame()