Crawling the next page when the URL remains the same
I need some help scraping the data at the URL below:
https://onland.kbstar.com/quics?page=C060250&keyword=%EB%8F%99%EC%9E%91%EA%B5%AC
I want to scrape the second page as well, but the URL stays exactly the same when I click "2", and I don't know how to handle that. Please help!
Here is the Python code I use to scrape the first page:
from selenium import webdriver
import pandas as pd
import numpy as np
import time
from bs4 import BeautifulSoup
import urllib.request as req
import urllib
import re
from datetime import datetime
import requests

dataframe = pd.DataFrame()

def KB_liveON(area_name):
    query = area_name
    area = urllib.parse.quote(query)
    url = 'https://onland.kbstar.com' \
          + '/quics?page=C060250' \
          + '&keyword=' + str(area)
          # + '#CP'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    table = soup.find('table')
    trs = table.tbody.find_all('tr')

    dataframe = pd.DataFrame()
    value_list = []
    for tr in trs[::1]:
        tds = tr.find_all('td')
        #cols = [' '.join(td.text.strip().split()) for td in tds]
        cols = [td.text.strip().split() for td in tds]
        progress = cols[0]
        location = cols[1]
        complex_name = cols[2]
        area = cols[3]
        sale_price = cols[4]
        sale_price2 = cols[5]
        time = cols[6]
        type_of_sale = cols[7]
        construction_company = cols[8]
        value_list.append([progress, location, complex_name, area, sale_price, sale_price2, time, type_of_sale, construction_company])

    cols = ['progress', 'location', 'complex_name', 'area', 'sale_price', 'sale_price2', 'time', 'type_of_sale', 'construction_company']
    df = pd.DataFrame(value_list, columns=cols)
    return df

kb = KB_liveON('동작구')
dataframe = dataframe.append(kb)
dataframe
First, I installed a Selenium WebDriver on Google Colab. Then I wrote a crawler that scrapes the data from multiple pages.
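(The answer does not show the setup cell itself; below is only a rough sketch of one common way to get headless Chrome running in a Colab notebook. The chromium-chromedriver package name and the chromedriver path are assumptions about Colab's Ubuntu image and may change over time.)

# Hypothetical Colab setup cell; in a notebook these lines are usually written
# with "!" shell magics, subprocess is used here only to keep the sketch plain Python.
# Assumes the notebook runs as root (as Colab does) and that apt provides
# the chromium-chromedriver package.
import subprocess

subprocess.run(['pip', 'install', 'selenium'], check=True)
subprocess.run(['apt-get', 'update'], check=True)
subprocess.run(['apt-get', 'install', '-y', 'chromium-chromedriver'], check=True)
# Put the bundled chromedriver on PATH so webdriver.Chrome('chromedriver', ...) can find it.
subprocess.run(['cp', '/usr/lib/chromium-browser/chromedriver', '/usr/bin/'], check=True)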
Python code:
import time
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

def extract_data(value_list, html_tags):
    soup = BeautifulSoup(html_tags, 'lxml')
    table = soup.find('table')
    trs = table.tbody.find_all('tr')
    for tr in trs[::1]:
        tds = tr.find_all('td')
        cols = [td.text.strip().split() for td in tds]
        progress = cols[0]
        location = cols[1]
        complex_name = cols[2]
        area = cols[3]
        sale_price = cols[4]
        sale_price2 = cols[5]
        time = cols[6]
        type_of_sale = cols[7]
        construction_company = cols[8]
        value_list.append([progress, location, complex_name, area, sale_price, sale_price2, time, type_of_sale, construction_company])
    return value_list

def KB_liveON(area):
    url = 'https://onland.kbstar.com' \
          + '/quics?page=C060250' \
          + '&keyword=' + str(area)
    wd.get(url)

    data_list = []

    # Extract data from first page
    tbl = wd.find_elements_by_class_name("tbl_list")[0]
    html_tags = tbl.get_attribute('outerHTML')
    data_list = extract_data(data_list, html_tags)

    # Find and extract data from other pages except first page
    forms = wd.find_elements_by_xpath("//div[@class='paging']//form")
    for f in forms[1:]:
        f.submit()
        time.sleep(10)
        tbl = wd.find_elements_by_class_name("tbl_list")[0]
        html_tags = tbl.get_attribute('outerHTML')
        data_list = extract_data(data_list, html_tags)
        time.sleep(10)

    cols = ['progress', 'location', 'complex_name', 'area', 'sale_price', 'sale_price2', 'time', 'type_of_sale', 'construction_company']
    df = pd.DataFrame(data_list, columns=cols)
    return df

if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    wd = webdriver.Chrome('chromedriver', options=options)

    df = KB_liveON('동작구')
    print(df)
Running the script prints the scraping results: a single DataFrame containing the table rows collected from every page.
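The pager entries on this site appear to be small HTML forms that post back to the same quics?page=C060250 URL, which is why the address bar never changes; submitting each form with Selenium (forms[1:], skipping the form for the page that is already displayed) reloads the table in place so it can be re-read. As a possible extension, not part of the original answer, the same function could be called for several districts and the frames stacked; the extra district names below are only examples of keywords the site accepts.

# Hypothetical usage: reuse the driver created in the __main__ block above and
# combine the results for a few districts into one DataFrame.
areas = ['동작구', '강남구', '마포구']   # example keywords; any district name the site accepts should work
frames = [KB_liveON(a) for a in areas]
combined = pd.concat(frames, ignore_index=True)
print(combined)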