如何从 python 和 BeautifulSoup 中的 html table 抓取 url
How to scrape url from a html table in python and BeautifulSoup
我想从该网站 (tendersinfo.com) 的 html table 中抓取 url。我能够收集 LOCATION | 日期 | 总结 | 截止日期。但是 SUMMARY 字段有一个指向另一个页面的 url。我想连同这个 url 一起抓取整个 table,所以我抓取的数据变成 LOCATION | 日期 | 总结 | 截止日期 | 网址
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the tender listing table (LOCATION | DATE | SUMMARY | DEADLINE)
# from every paginated listing page.
url = 'https://www.tendersinfo.com/global-information-technology-tenders-{}.php'
amount_of_pages = 4796 #5194

rows = []
# NOTE(review): range(1, amount_of_pages) stops one page short; use
# amount_of_pages + 1 to actually reach the last page.
for i in range(1, amount_of_pages):
    response = rq.get(url.format(i))
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'id': 'datatable'})
        # Column titles come from the <th> cells of the first row.
        headers = []
        for th in table.find("tr").find_all("th"):
            headers.append(th.text.strip())
        # Every row after the header is a data row.
        for tr in table.find_all("tr")[1:]:
            cells = []
            tds = tr.find_all("td")
            if len(tds) == 0:
                # Some rows use <th> cells instead of <td>.
                ths = tr.find_all("th")
                for th in ths:
                    cells.append(th.text.strip())
                    # NOTE(review): overwritten on every cell and never read,
                    # so the URLs are lost — this is the bug the question asks about.
                    links = [th.findAll('a')]
            else:
                for td in tds:
                    cells.append(td.text.strip())
                    # NOTE(review): same problem — collected but never appended to the row.
                    links = [td.findAll('a')]
            rows.append(cells)
您需要获取 <td> 标签下的 <a> 标签,并拉出它的 href 属性。
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd

# Scrape every listing page and collect each table row together with the URL
# found in its SUMMARY cell, so the final columns are the table headers + 'URL'.
url = 'https://www.tendersinfo.com/global-information-technology-tenders-{}.php'
amount_of_pages = 4796 #5194

rows = []
headers = []
# range(start, end) excludes end, so +1 is needed to reach page 4796.
for i in range(1, amount_of_pages + 1):
    response = rq.get(url.format(i))
    print(i)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'id': 'datatable'})
        if table is None:
            # Page rendered without the expected table — skip it instead of crashing.
            continue
        # Capture the header texts once, then add the extra URL column at the end.
        if len(headers) == 0:
            for th in table.find("tr").find_all("th"):
                headers.append(th.text.strip())
            headers.append('URL')
        for tr in table.find_all("tr")[1:]:
            cells = []
            # Default keeps len(cells) == len(headers) even when a row has no link,
            # otherwise pd.DataFrame(..., columns=headers) raises on ragged rows.
            link = None
            for td in tr.find_all("td"):
                cells.append(td.text.strip())
                anchor = td.find('a')
                if anchor:
                    link = anchor['href']
            # Append AFTER the cell loop so the URL is always the LAST column,
            # matching the position of 'URL' in headers. (Appending inside the
            # loop would insert it right after the SUMMARY cell.)
            cells.append(link)
            rows.append(cells)

df = pd.DataFrame(rows, columns=headers)
我想从该网站 (tendersinfo.com) 的 html table 中抓取 url。我能够收集 LOCATION | 日期 | 总结 | 截止日期。但是 SUMMARY 字段有一个指向另一个页面的 url。我想连同这个 url 一起抓取整个 table,所以我抓取的数据变成 LOCATION | 日期 | 总结 | 截止日期 | 网址
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the tender listing table (LOCATION | DATE | SUMMARY | DEADLINE)
# from every paginated listing page.
url = 'https://www.tendersinfo.com/global-information-technology-tenders-{}.php'
amount_of_pages = 4796 #5194

rows = []
# NOTE(review): range(1, amount_of_pages) stops one page short; use
# amount_of_pages + 1 to actually reach the last page.
for i in range(1, amount_of_pages):
    response = rq.get(url.format(i))
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'id': 'datatable'})
        # Column titles come from the <th> cells of the first row.
        headers = []
        for th in table.find("tr").find_all("th"):
            headers.append(th.text.strip())
        # Every row after the header is a data row.
        for tr in table.find_all("tr")[1:]:
            cells = []
            tds = tr.find_all("td")
            if len(tds) == 0:
                # Some rows use <th> cells instead of <td>.
                ths = tr.find_all("th")
                for th in ths:
                    cells.append(th.text.strip())
                    # NOTE(review): overwritten on every cell and never read,
                    # so the URLs are lost — this is the bug the question asks about.
                    links = [th.findAll('a')]
            else:
                for td in tds:
                    cells.append(td.text.strip())
                    # NOTE(review): same problem — collected but never appended to the row.
                    links = [td.findAll('a')]
            rows.append(cells)
您需要获取 <td> 标签下的 <a> 标签,并拉出它的 href 属性。
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd

# Scrape every listing page and collect each table row together with the URL
# found in its SUMMARY cell, so the final columns are the table headers + 'URL'.
url = 'https://www.tendersinfo.com/global-information-technology-tenders-{}.php'
amount_of_pages = 4796 #5194

rows = []
headers = []
# range(start, end) excludes end, so +1 is needed to reach page 4796.
for i in range(1, amount_of_pages + 1):
    response = rq.get(url.format(i))
    print(i)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'id': 'datatable'})
        if table is None:
            # Page rendered without the expected table — skip it instead of crashing.
            continue
        # Capture the header texts once, then add the extra URL column at the end.
        if len(headers) == 0:
            for th in table.find("tr").find_all("th"):
                headers.append(th.text.strip())
            headers.append('URL')
        for tr in table.find_all("tr")[1:]:
            cells = []
            # Default keeps len(cells) == len(headers) even when a row has no link,
            # otherwise pd.DataFrame(..., columns=headers) raises on ragged rows.
            link = None
            for td in tr.find_all("td"):
                cells.append(td.text.strip())
                anchor = td.find('a')
                if anchor:
                    link = anchor['href']
            # Append AFTER the cell loop so the URL is always the LAST column,
            # matching the position of 'URL' in headers. (Appending inside the
            # loop would insert it right after the SUMMARY cell.)
            cells.append(link)
            rows.append(cells)

df = pd.DataFrame(rows, columns=headers)