Python 3, bs4, webcrawler; error connecting to website
I am trying to build a webcrawler for a specific website, but for some reason it won't connect to the site.
I get an error (raised by my own code) that it can't connect.
Calling the website with Selenium, I can see that it fails to connect.
As a beginner I have probably made a silly mistake, but I can't work out what it is.
I hope you're willing to help me.
import csv
import requests
import datetime
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0, 0)
captcha = input('Press Enter after bypassing Captcha')

# def get_driver():
#     driver = webdriver.Chrome()
#     return driver

def get_driver():
    # initialize options
    options = webdriver.ChromeOptions()
    # pass in headless argument to options
    options.add_argument('--headless')
    # initialize driver
    driver = webdriver.Chrome(chrome_options=options)
    return driver

def connect_to_base(browser, page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    html = None
    links = None
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(base_url)
            # wait for an element with class 'result-content' to load
            # before returning True
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'result-content')))
            return True
        except Exception as ex:
            connection_attempts += 1
            print(f'Error connecting to {base_url}')
            print(f'Attempt #{connection_attempts}')
    return False

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    inside = soup.find_all('a', {'class': 'property-inner'}, {'href'})
    # Make empty lists with header lines
    output_list = []
    listing = 1
    for items in inside:
        href = items.get('href')
        url1 = href.format(page)
        if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']")) > 0:
            browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
        connection_attempts = 0
        while connection_attempts < 3:
            try:
                browser.get(url1)
                WebDriverWait(browser, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'detail-address')))
                return True
            except Exception as ex:
                connection_attempts += 1
                print(f'Error connecting to {base_url}')
                print(f'Attempt #{connection_attempts}')
        details = BeautifulSoup(browser.page_source, 'html')
        adres = details.find_all('div', {'class': 'detail-address'})
        for adresinfo in adres:
            try:
                adres = adres[0].get_text(separator=',', strip=True)
            except IndexError:
                adres = "Unknown"
        kenmerken = details.find_all('div', {'class': 'detail-tab-content kenmerken'})
        try:
            tr_kenmerken = ','.join([td.text.strip() for td in kenmerken[0].select('td.value')])
        except IndexError:
            tr_kenmerken = 'Unknown'
        waarde = details.find_all('div', {'class': 'detail-tab-content woningwaarde'})
        try:
            tr_waarde = ','.join([td.text.strip() for td in waarde[0].select('td.value')])
        except IndexError:
            tr_waarde = 'Unknown'
        informatie = {
            'adres': adres,
            'kenmerken': tr_kenmerken,
            'waarde': tr_waarde,
            'url': href
        }
        output_list.append(informatie)
        listing += 1
    return output_list

def get_load_time(article_url):
    try:
        # set headers
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000)
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as ex:
        load_time = 'Loading Error'
    return load_time

def write_to_file(output_list, filename):
    for row in output_list:
        with open(filename, 'a') as csvfile:
            fieldnames = ['adres', 'kenmerken', 'waarde', 'link']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)

def run_process(page_number, filename, browser):
    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print('Error connecting to jaap')

if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'output_{output_timestamp}.csv'
    browser = get_driver()
    # scrape and crawl
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        run_process(current_page, output_filename, browser)
        current_page = current_page + 1
    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
I see you changed EC.presence_of_element_located((By.ID, {'class':'result-content'}))
to EC.presence_of_element_located((By.CLASS_NAME, 'result-content')).
Next, depending on where the browser opens, you may run into having to bypass/click a JavaScript prompt saying you are OK with and accept cookies.
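If that banner does appear, one way to handle it is to click the accept link before waiting for the results, much like the CookiesOK click already in the question's code. A minimal sketch, assuming the accept link still carries the class 'CookiesOK' (on newer Selenium releases the find_element_by_* helpers are gone, so find_elements with By is used instead):

from selenium.webdriver.common.by import By

def dismiss_cookie_banner(browser):
    # Click the cookie-accept link if it is present; the 'CookiesOK' class
    # is taken from the question's code and may differ on the live site.
    accept_links = browser.find_elements(By.XPATH, "//a[@class='CookiesOK']")
    if accept_links:
        accept_links[0].click()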
But all of this code seems like an awful lot of work, considering the data is stored as JSON inside a script tag in the HTML. Why not just use requests, pull out the JSON, convert it to a dataframe, and write it to csv?
import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize

def run_process(page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    jsonStr = soup.find('script', {'id': 'page-data'}).text
    jsonData = json.loads(jsonStr)
    df = json_normalize(jsonData['properties'])
    return df

if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'
    final_df = pd.DataFrame()
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        df = run_process(current_page)
        final_df = final_df.append(df, sort=True).reset_index(drop=True)
        current_page = current_page + 1
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')
Output:
Scraping page #1...
Scraping page #2...
Scraping page #3...
Elapsed run time: 7.441420555114746 seconds
And a csv file that looks like this:
app area detailsUrl expired houseTypeValue id latLng latLng.latitude latLng.longitude location.city location.street location.zipcode lotSize market numberOfRooms openHouseDate openHouseTimes openhouse photo price priceToShow showoffColor showoffCustomText showoffPhotoText spotlight status veiling
0 False 165 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 6899666 NaN 52.368420 4.833631 AMSTERDAM Hof van Versailles 61 1064NX 216 sale 4 None None False 10014EAAF8B8883668593EFAC9E5FF1C 595000.0 595000.0 None None None False Sale False
1 True 211 /te-koop/noord+holland/groot-amsterdam/amsterd... False Appartement 10585731 NaN 52.327550 4.889076 AMSTERDAM Beysterveld 35 1083KA Onbekend sale 4 None None False E4F9E5BC7BC90B5B92C7BD8D48B7A677 925000.0 925000.0 None None None False Sale False
2 True 111 /te-koop/noord+holland/groot-amsterdam/amsterd... False Dubbele bovenwoning 11731386 NaN 52.341890 4.896053 AMSTERDAM Uiterwaardenstraat 320 2 1079DC Onbekend sale 5 None None False AB9F45B2CD4AD7879C5A80F18092F9D4 750000.0 750000.0 None None None False SoldConditionally False
3 False 269 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 11840681 NaN 52.358266 4.875508 AMSTERDAM Korte van Eeghenstraat 4 1071ER 107 sale 9 None None False A3DF2B1D426B5E4D501503C5D0E66966 3100000.0 3100000.0 None None None False Sale False
4 False 100 /te-koop/noord+holland/groot-amsterdam/amsterd... False Tussenwoning 12152943 NaN 52.421245 4.899478 AMSTERDAM Pieter A v Heijningestraat 9 1035SV 83 sale 5 None None False 55C6F589523FA553D67A709776DD70DD 399000.0 399000.0 None None None False Sale False
5 True 111 /te-koop/noord+holland/groot-amsterdam/amsterd... False Bovenwoning 15796874 NaN NaN NaN AMSTERDAM Eerste Amstelvlietpad 20 1096GB Onbekend sale 3 None None False AE822B627ED096310B9ECBE7756340C8 1200000.0 1200000.0 None None None False Sale False
6 True 76 /te-koop/noord+holland/groot-amsterdam/amsterd... False Benedenwoning 10580650 NaN 52.346010 4.888799 AMSTERDAM Grevelingenstraat 18 HS 1078KP Onbekend sale 2 None None False 6FD1011D917E776DCF4DA836B5FFEE3E 550000.0 550000.0 None None None False SoldConditionally False
7 False 298 /te-koop/noord+holland/groot-amsterdam/amsterd... False Villa 9623182 NaN 52.330610 4.862902 AMSTERDAM Cannenburg 51 1081GW 651 sale 7 None None False 15FA170B99D4E2DEA03B6FC27E3B5B74 2495000.0 2495000.0 None None None False Sale False
8 False 270 /te-koop/noord+holland/groot-amsterdam/amsterd... False Herenhuis 15791215 NaN 52.347780 5.004530 AMSTERDAM Nico Jessekade 189 1087MR 200 sale 9 None None False 6EA5C0CDA0475DFC88A3A918A6B2909A 1549000.0 1549000.0 None None None False SoldConditionally False
9 False 201 /te-koop/noord+holland/groot-amsterdam/amsterd... False Villa 9617942 NaN 52.377391 4.764554 AMSTERDAM Osdorperweg 803 1067SW 1348 sale 6 None None False 4680429D99EC5AC47C950D57A77DF1EB 950000.0 950000.0 None None None False Sale False
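A side note on the pandas calls used here and in the update below: pandas.io.json.json_normalize was later deprecated in favour of pandas.json_normalize, and DataFrame.append was removed in pandas 2.0. On a current pandas install the same flow would look roughly like this (a minimal sketch under those assumptions, keeping the same URL and script-tag id; the output path is just a placeholder):

import json
import requests
import pandas as pd
from bs4 import BeautifulSoup

def scrape_page(page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    soup = BeautifulSoup(requests.get(base_url).text, 'html.parser')
    data = json.loads(soup.find('script', {'id': 'page-data'}).text)
    # pd.json_normalize replaces the old pandas.io.json.json_normalize import
    return pd.json_normalize(data['properties'])

# pd.concat replaces the removed DataFrame.append
final_df = pd.concat([scrape_page(p) for p in range(1, 4)], ignore_index=True)
final_df.to_csv('output.csv', index=False)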
Update:
import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np

def run_process(page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    jsonStr = soup.find('script', {'id': 'page-data'}).text
    jsonData = json.loads(jsonStr)
    df = json_normalize(jsonData['properties'])
    root_URL = 'https://jaap.nl'
    df['detailsUrl'] = root_URL + df['detailsUrl']
    allPropDetails = pd.DataFrame()
    for idx, row in df.iterrows():
        propDetails = pd.DataFrame(index=[0])
        detailLink = row['detailsUrl']
        print('Scraping: %s' % (row['location.street']))
        dfs = pd.read_html(detailLink)
        for each in dfs:
            if each.isnull().all().all():
                continue
            each = each.dropna(axis=0, how='all')
            specialCase = False
            for col in list(each.columns):
                if each[col].dtypes == 'object':
                    if each[col].str.contains('Voorziening').any():
                        specialCase = True
                        break
            if specialCase == True:
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))
                cols1 = list(each.iloc[2:, 0])
                each = each.iloc[2:, :]
                each[1] = each[1] + '---' + each[2]
                each = each.iloc[:, -2]
                each.index = cols1
                each = each.to_frame().T
                propRow = each
                propRow.index = [0]
                temp_df = pd.DataFrame(index=[0])
                for col in propRow.columns:
                    temp_df = temp_df.merge(propRow[col].str.split('---', expand=True).rename(columns={0: col, 1: col + '.distance'}), left_index=True, right_index=True)
                propRow = temp_df
            else:
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))
                temp_df = each.T
                cols = [temp_df.index[0] + '_' + colName for colName in list(temp_df.iloc[0, :])]
                propRow = temp_df.iloc[-1, :]
                propRow.index = cols
                propRow = propRow.to_frame().T
                propRow.index = [0]
            propDetails = propDetails.merge(propRow, left_index=True, right_index=True)
        propDetails.index = [idx]
        allPropDetails = allPropDetails.append(propDetails, sort=True)
    df = df.merge(allPropDetails, how='left', left_index=True, right_index=True)
    return df

if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'
    final_df = pd.DataFrame()
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        df = run_process(current_page)
        final_df = final_df.append(df, sort=True).reset_index(drop=True)
        current_page = current_page + 1
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')