想要从网页及其下一页中抓取
Want to scrape from a web page and its next pages
我想提取公司名称、联系人、国家/地区、电子邮件和电话,并保存到 Excel 文件。我尝试了下面的代码,但它在 Excel 文件中只保留了一个值。如何让它循环抓取第一页以及后续各页?
import csv
import re
import requests
import urllib.request
from bs4 import BeautifulSoup

# Open the output file ONCE, before the page loop. The original opened it
# with mode 'w' on every iteration, truncating the file each time — that is
# why only the last row survived.
with open('filename.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["Name", "Person", "Country", "Email", "Phone"])
    for page in range(10):
        # The directory paginates with a ?page=N query parameter. The
        # original code never used `page`, so it fetched the same first
        # page ten times.
        url = "http://www.aepcindia.com/buyersdirectory?page={}".format(page)
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
        for i in soup('div', {'class': 'view-content'}):
            try:
                name = i.find("div", {"class": "company_name"}).get_text()
                person = i.find("div", {"class": "title"}).get_text()
                country = i.find("div", {"class": "views-field views-field-field-country"}).get_text()
                email = i.find("div", {"class": "email"}).get_text()
                phone = i.find("div", {"class": "telephone_no"}).get_text()
            except AttributeError:
                # A card missing one of the fields (find() returned None):
                # skip it. The original `except: AttributeError` was a bare
                # except followed by a no-op expression, silently hiding
                # every error.
                continue
            print(name, person, country, email, phone)
            # csv.writer handles commas and quoting; no manual
            # string building / .replace() needed.
            csv_writer.writerow([name, person, country, email, phone])
这里是网页的链接:
http://www.aepcindia.com/buyersdirectory
import requests
from bs4 import BeautifulSoup
import csv
# Browser-like User-Agent so the site serves the normal directory markup.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0'}


def main(url):
    """Scrape ten pages of the buyers directory into data.csv.

    `url` is a format template whose ``{}`` placeholder receives the
    zero-based page index (the site paginates via ``?page=N``).
    """
    with requests.Session() as req, open("data.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Title", "Country", "Email", "Phone"])
        for page_index in range(10):
            print(f"Extracting Page# {page_index + 1}")
            response = req.get(url.format(page_index), headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Collect each column independently, then pair them up by
            # position; zip stops at the shortest column.
            names = [tag.text for tag in soup.select("div.company_name")]
            titles = [tag.text for tag in soup.select("div.title")]
            countries = [tag.text
                         for tag in soup.findAll("div", class_="field-content", text=True)]
            emails = [tag.a.text for tag in soup.select("div.email")]
            phones = [tag.text for tag in soup.select("div.telephone_no")]
            writer.writerows(zip(names, titles, countries, emails, phones))


main("http://www.aepcindia.com/buyersdirectory?page={}")
输出:view-online
我想提取公司名称、联系人、国家/地区、电子邮件和电话,并保存到 Excel 文件。我尝试了下面的代码,但它在 Excel 文件中只保留了一个值。如何让它循环抓取第一页以及后续各页?
import csv
import re
import requests
import urllib.request
from bs4 import BeautifulSoup

# Open the output file ONCE, before the page loop. The original opened it
# with mode 'w' on every iteration, truncating the file each time — that is
# why only the last row survived.
with open('filename.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["Name", "Person", "Country", "Email", "Phone"])
    for page in range(10):
        # The directory paginates with a ?page=N query parameter. The
        # original code never used `page`, so it fetched the same first
        # page ten times.
        url = "http://www.aepcindia.com/buyersdirectory?page={}".format(page)
        soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
        for i in soup('div', {'class': 'view-content'}):
            try:
                name = i.find("div", {"class": "company_name"}).get_text()
                person = i.find("div", {"class": "title"}).get_text()
                country = i.find("div", {"class": "views-field views-field-field-country"}).get_text()
                email = i.find("div", {"class": "email"}).get_text()
                phone = i.find("div", {"class": "telephone_no"}).get_text()
            except AttributeError:
                # A card missing one of the fields (find() returned None):
                # skip it. The original `except: AttributeError` was a bare
                # except followed by a no-op expression, silently hiding
                # every error.
                continue
            print(name, person, country, email, phone)
            # csv.writer handles commas and quoting; no manual
            # string building / .replace() needed.
            csv_writer.writerow([name, person, country, email, phone])
这里是网页的链接:http://www.aepcindia.com/buyersdirectory
import requests
from bs4 import BeautifulSoup
import csv
# Browser-like User-Agent so the site serves the normal directory markup.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0'}


def main(url):
    """Scrape ten pages of the buyers directory into data.csv.

    `url` is a format template whose ``{}`` placeholder receives the
    zero-based page index (the site paginates via ``?page=N``).
    """
    with requests.Session() as req, open("data.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Title", "Country", "Email", "Phone"])
        for page_index in range(10):
            print(f"Extracting Page# {page_index + 1}")
            response = req.get(url.format(page_index), headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            # Collect each column independently, then pair them up by
            # position; zip stops at the shortest column.
            names = [tag.text for tag in soup.select("div.company_name")]
            titles = [tag.text for tag in soup.select("div.title")]
            countries = [tag.text
                         for tag in soup.findAll("div", class_="field-content", text=True)]
            emails = [tag.a.text for tag in soup.select("div.email")]
            phones = [tag.text for tag in soup.select("div.telephone_no")]
            writer.writerows(zip(names, titles, countries, emails, phones))


main("http://www.aepcindia.com/buyersdirectory?page={}")
输出:view-online