Crawl data from an HTML table in Python
I'm a beginner at web scraping and I need help getting values from a table. I already get all the required fields (LOCATION, DATE, SUMMARY, DEADLINE). What I want is the URL behind each summary, which points to another page, appended alongside the other fields, like (LOCATION, DATE, SUMMARY, DEADLINE, URL).
Here is my code so far, but it doesn't work.
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.tendersinfo.com/global-information-technology-tenders-{}.php'
amount_of_pages = 2  # 5194
rows = []

for i in range(1, amount_of_pages):
    response = rq.get(url.format(i))
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'id': 'datatable'})
        headers = []
        for th in table.find("tr").find_all("th"):
            headers.append(th.text.strip())
        for tr in table.find_all("tr")[1:]:
            cells = []
            tds = tr.find_all("td")
            if len(tds) == 0:
                ths = tr.find_all("th")
                for th in ths:
                    cells.append(th.text.strip())
            else:
                for td in tds:
                    cells.append(td.text.strip())
                    # this runs once per cell, and td.find('a') returns None
                    # for cells without a link, so ['href'] raises here
                    cells.append('https://www.tendersinfo.com/' + td.find('a')['href'])
            rows.append(cells)
Here you go, I just recoded most of it.
import requests as rq
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.tendersinfo.com/global-information-technology-tenders-{}.php'
amount_of_pages = 10  # Max is 5194 currently
rows = []
headers = []

for i in range(1, amount_of_pages + 1):  # range is end-exclusive, so +1 to get all pages
    response = rq.get(url.format(i))
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {'id': 'datatable'})
        headers = []
        for th in table.find("tr").find_all("th"):
            headers.append(th.text.strip())
        for tr in table.find_all("tr")[1:]:
            cells = []
            tds = tr.find_all("td")
            if len(tds) == 0:
                ths = tr.find_all("th")
                for th in ths:
                    cells.append(th.text.strip())
            else:
                for td in tds:
                    cells.append(td.text.strip())
            rows.append(cells)

pd.DataFrame(rows, columns=headers).to_csv(r"C:\Users\HP\Desktop\Web Scraping (RFP's)\RFP_SCRAPED_DATA.csv", index=False)
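Note that this rewrite drops the URL column the question asked for. A minimal sketch of how the row loop could also capture it, assuming (as the original code does) that the table id is datatable and that each row's only <a> tag is the summary link:

import requests as rq
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://www.tendersinfo.com/global-information-technology-tenders-{}.php'
rows = []
headers = []

for i in range(1, 3):  # two pages as a demo
    response = rq.get(url.format(i))
    if response.status_code != 200:
        continue
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'id': 'datatable'})
    headers = [th.text.strip() for th in table.find("tr").find_all("th")] + ['URL']
    for tr in table.find_all("tr")[1:]:
        tds = tr.find_all("td")
        if not tds:  # header-only rows have no <td>
            continue
        cells = [td.text.strip() for td in tds]
        link = tr.find("a")  # look up the link once per row, not once per cell
        cells.append('https://www.tendersinfo.com/' + link['href'] if link else '')
        rows.append(cells)

pd.DataFrame(rows, columns=headers).to_csv("RFP_SCRAPED_DATA.csv", index=False)

Taking tr.find("a") once per row is what fixes the crash in the question's code: a missing link becomes an empty cell instead of a TypeError.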
Since you're already using pandas, why not use read_html, which returns the extracted tables as a list of DataFrames?
>>> tables = pd.read_html("https://www.tendersinfo.com/global-information-technology-tenders.php")
>>> tables[1]
  LOCATION         DATE                                        SUMMARY     DEADLINE
0    India  21-May-2020  Liquid Crystal Display Lcd Panel Or Monitors.  01-Jun-2020
1    India  21-May-2020                          Random Access Memory.  01-Jun-2020
2    India  21-May-2020         Supply Of Analog Transceiver-handheld.  01-Jun-2020
3    India  21-May-2020                    Supply Of Computer Printers.  01-Jun-2020
4    India  21-May-2020                                 All In One Pc.  01-Jun-2020
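One caveat: read_html keeps only the cell text, so the detail-page URL the question asks for is lost. On pandas 1.5 or newer, the extract_links argument can preserve the hrefs; a rough sketch, assuming the tender table is again the second one on the page and that its links are site-relative:

import pandas as pd

# extract_links="body" makes every body cell a (text, href) tuple,
# where href is None for cells without a link
tables = pd.read_html(
    "https://www.tendersinfo.com/global-information-technology-tenders.php",
    extract_links="body",
)
raw = tables[1]
df = raw.apply(lambda col: col.str[0])  # keep the text part of every cell
# keep the summary link; prefixing the site root assumes relative hrefs
df["URL"] = "https://www.tendersinfo.com/" + raw["SUMMARY"].str[1].fillna("")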
You can easily get the table with pd.read_html and save the data to a CSV file with df.to_csv().
import pandas as pd

# AJAX endpoint behind the site's search; it returns the listing as one HTML table
url = "https://www.tendersinfo.com/ajax_all_new_search.php?country=information-technology&increment=1&%20select=500&%20total=259655&%20search_id=19906&%20order=id&%20imagevalue=1"
df = pd.read_html(url)[0]  # read_html returns a list of DataFrames; take the first
df.to_csv("RFP_SCRAPED_DATA.csv", index=False)
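If one page of 500 rows is not enough, the increment parameter in that URL looks like a page counter; a hedged sketch that loops over it and stacks the pages (whether the other query parameters, such as search_id, stay valid across requests is an assumption):

import pandas as pd

base = ("https://www.tendersinfo.com/ajax_all_new_search.php"
        "?country=information-technology&increment={}&%20select=500"
        "&%20total=259655&%20search_id=19906&%20order=id&%20imagevalue=1")

# fetch the first three pages and concatenate them into one DataFrame
frames = [pd.read_html(base.format(page))[0] for page in range(1, 4)]
pd.concat(frames, ignore_index=True).to_csv("RFP_SCRAPED_DATA.csv", index=False)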