Python, Beautiful soup, 如何提取数据并打印到csv文件
Python, Beautiful soup, how to extract data and print to csv file
我已经为此折腾了一段时间,但始终找不到或想不出答案。我正在从 Steam 中提取数据,需要弄清楚如何获取平台信息(例如 mac)并将其转换为数字(以字符串形式表示)。例如,如果游戏支持 mac,它在我的列表中应显示为“1”;如果不支持,则显示为“0”。我遇到的问题是:我的代码似乎只判断了一次,然后把所有值都设置成了“1”。
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import timedelta
import datetime
import time
import csv
# Scrape page 1 of Steam's "specials" search results and write the extracted
# fields to products.csv as hand-built comma-separated lines.
# NOTE(review): indentation was lost in this listing, so the loop nesting
# described in the comments below is inferred, not shown -- confirm against
# the original script.
my_url = 'https://store.steampowered.com/search/?specials=1&page=1'
# Open the connection and download the page.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# Parse the HTML.
page_soup = soup(page_html, "html.parser")
# Grab one <div> per listed product.
containers = page_soup.findAll("div", {"class":"responsive_search_name_combined"})
filename = "products.csv"
# NOTE(review): opened without `with`; the file is only closed by the
# f.close() near the end, so an exception before that point leaves it open.
f = open(filename, "w", encoding='UTF-8')
# Header line: 11 column names (every name after the first carries a leading
# space because of the ", " separators).
headers = "Titles, Release_date, Discount, Price before, Price after, Positive review, Reviewers, Win, Lin, Osx, Time \n"
f.write(headers)
#f.write(headers)
#len(containers)
#containers[1]
# Timestamp string for this run; it is appended as the last field of the row.
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(st)
# Presumably everything down to the f.write(...) call is the body of this loop.
for container in containers:
# Title: text of the first <span class="title">.
titles_container = container.findAll("span",{"class":"title"})
titl = titles_container[0].text
print(titl)
# Release date: text of the first <div class="search_released">.
product_container = container.findAll("div",{"class":"search_released"})
product_date = product_container[0].text
print(product_date)
# Discount: text of the first <div class="search_discount">.
product_discount_container = container.findAll("div",{"class":"search_discount"})
product_discount = product_discount_container[0].text
print(product_discount)
# Price before discount: the pattern captures each run of digits together with
# the single non-word character that follows it; the first two captures are
# concatenated. Raises IndexError when fewer than two captures are found.
product_price_container_before = container.findAll("div",{"class":"search_price"})
product_price_before = product_price_container_before[0].text
test = re.findall('(\d+\W)',product_price_before)
testing = test[0] + test[1]
print(testing)
# Price after discount: extract() removes the first <span> from the tree, so
# .text no longer contains that span's text.
# NOTE(review): product_price_after is read again in the f.write(...) call
# below; if this list is empty it is unset or stale from an earlier product.
product_price_container_after = container.findAll("div",{"class":"discounted"})
for product_price_after in product_price_container_after:
product_price_after.find("span").extract()
print(product_price_after.text)
# Reviews: findall with two groups returns (group1, group2) tuples. a[0][0] is
# the "NN%" part of the first match and a[1][1] the 2+-digit number of the
# second match. NOTE(review): same unset/stale risk as above for `a` and `c`
# when there is no review summary.
product_review_container = container.findAll("span",{"class":"search_review_summary"})
for product_review in product_review_container:
prr = product_review.get('data-tooltip-html')
a = re.findall('(\d+%)|(\d+\d+)',prr)
c = a[1][1]
print(c)
# Platforms: the second CSS class of each platform_img span is tested for a
# leading "w", "m" or "l" (\A anchors the match at the start of the string).
product_platform_container = container.findAll("span",{"class":"platform_img"})
for product_platform in product_platform_container:
platform = product_platform.get('class')[1]
platt = re.findall('(\Aw)',platform)
plattt = re.findall('(\Am)',platform)
platttt = re.findall('(\Al)',platform)
print(platt)
print(plattt)
print(platttt)
# NOTE(review): this is the reported "always 1" bug. plattt is either [] or
# ['m'], so the loop body only runs when p == "m": the `elif` branch can never
# execute and macken is never reset to "0". Once one product sets it to "1",
# every later row reuses that value.
for p in plattt:
if "m" in p:
macken = "1"
elif "m" not in p:
macken = "0"
print(macken)
# NOTE(review): `y` is not defined anywhere in this listing. Field order puts
# `y` under "Win", macken (the mac flag) under "Lin" and the literal "blah"
# under "Osx". The title is written without escaping, so a comma in a title
# shifts the columns.
f.write(titl + "," + product_date.replace(",","") + "," + product_discount.replace("\n", "") + "," + testing.replace(",", ".") + "," + product_price_after.text.replace("\n","").replace(" ", "").replace(",",".").replace("\t\t\t\t\t\t\t","") + "," + a[0][0] + "," + c.replace(",","") + "," + y + "," + macken + "," + "blah" + "," + st + "\n")
f.close()
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0 (replaced by on_bad_lines) -- confirm the pandas version in use. The
# returned DataFrame is not assigned to anything here.
pd.read_csv("products.csv", error_bad_lines=False)
我也正在将其写入 csv 文件。所以当我将它写入 csv 文件时,它只会说 1, 1, 1, 1, 1...
我正在从这个页面获取数据:'https://store.steampowered.com/search/?specials=1&page=1'
我知道这个问题有点令人困惑,所以希望您能提供帮助,如果您需要更多代码,请告诉我。
我会这样做:
import csv
# ...
# Build one CSV row per game with 0/1 flags for each supported platform.
# `containers` (the list of search-result containers), `f` (the already-open
# output file) and `re` come from the surrounding script.
rows = []  # created once, before looping over the games
for container in containers:
    # Accumulate matches over *all* platform icons of this game, so a title
    # that supports several platforms gets several flags set in one row.
    win_p, mac_p, linux_p = [], [], []
    for product_platform in container.findAll("span", {"class": "platform_img"}):
        classes = product_platform.get("class") or []
        if len(classes) < 2:
            # No platform-specific class next to "platform_img"; nothing to test.
            continue
        platform = classes[1]
        # Raw strings keep "\A" (start-of-string anchor) from being read as a
        # string escape sequence.
        win_p += re.findall(r"\Aw", platform)
        mac_p += re.findall(r"\Am", platform)
        linux_p += re.findall(r"\Al", platform)
    rows.append({
        "linux": 1 if linux_p else 0,
        "win": 1 if win_p else 0,
        "mac": 1 if mac_p else 0,
    })

# After you parsed all entries...
fieldnames = ['mac', 'win', 'linux']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
说明:在我们用 `re` 识别平台后,我们创建 csv 行,其中 `mac`、`win` 和 `linux` 仅当它们对应的匹配项(`mac_p`、`win_p` 和 `linux_p`)不为空时才会是 1。这里的 `f` 是你已经打开的文件对象。
请查看这篇文章,其中展示了如何在 Python 中使用 csv 文件。
你的 if 语句写错了,这就是你总是得到 1 的原因,请参阅下面的代码!
import requests,csv
from bs4 import BeautifulSoup
# Download page 1 of Steam's "specials" search results and write one CSV row
# per game: its title plus '1'/'0' flags for Windows, macOS and Linux.
req = requests.get(
    'https://store.steampowered.com/search/?specials=1&page=1',
    timeout=30,  # do not hang forever on a stalled connection
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

data = []
for entry in soup.find_all('div', attrs={'class': 'col search_name ellipsis'}):
    title = entry.find('span', attrs={'class': 'title'}).text
    data.append({
        # Keep the title as str: on Python 3 the csv module would write a
        # bytes value as its repr (b'...'). Encoding happens once, when the
        # file is written.
        'title': title,
        # A platform is supported when a <span> carrying that class exists.
        'win': '1' if entry.find('span', attrs={'class': 'win'}) else '0',
        'mac': '1' if entry.find('span', attrs={'class': 'mac'}) else '0',
        'linux': '1' if entry.find('span', attrs={'class': 'linux'}) else '0',
    })

# newline='' is what the csv module expects; the explicit encoding keeps
# non-ASCII titles from depending on the platform's default encoding.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    fields = ['title', 'win', 'mac', 'linux']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)
我已经为此折腾了一段时间,但始终找不到或想不出答案。我正在从 Steam 中提取数据,需要弄清楚如何获取平台信息(例如 mac)并将其转换为数字(以字符串形式表示)。例如,如果游戏支持 mac,它在我的列表中应显示为“1”;如果不支持,则显示为“0”。我遇到的问题是:我的代码似乎只判断了一次,然后把所有值都设置成了“1”。
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import timedelta
import datetime
import time
import csv
# Scrape page 1 of Steam's "specials" search results and write the extracted
# fields to products.csv as hand-built comma-separated lines.
# NOTE(review): indentation was lost in this listing, so the loop nesting
# described in the comments below is inferred, not shown -- confirm against
# the original script.
my_url = 'https://store.steampowered.com/search/?specials=1&page=1'
# Open the connection and download the page.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# Parse the HTML.
page_soup = soup(page_html, "html.parser")
# Grab one <div> per listed product.
containers = page_soup.findAll("div", {"class":"responsive_search_name_combined"})
filename = "products.csv"
# NOTE(review): opened without `with`; the file is only closed by the
# f.close() near the end, so an exception before that point leaves it open.
f = open(filename, "w", encoding='UTF-8')
# Header line: 11 column names (every name after the first carries a leading
# space because of the ", " separators).
headers = "Titles, Release_date, Discount, Price before, Price after, Positive review, Reviewers, Win, Lin, Osx, Time \n"
f.write(headers)
#f.write(headers)
#len(containers)
#containers[1]
# Timestamp string for this run; it is appended as the last field of the row.
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(st)
# Presumably everything down to the f.write(...) call is the body of this loop.
for container in containers:
# Title: text of the first <span class="title">.
titles_container = container.findAll("span",{"class":"title"})
titl = titles_container[0].text
print(titl)
# Release date: text of the first <div class="search_released">.
product_container = container.findAll("div",{"class":"search_released"})
product_date = product_container[0].text
print(product_date)
# Discount: text of the first <div class="search_discount">.
product_discount_container = container.findAll("div",{"class":"search_discount"})
product_discount = product_discount_container[0].text
print(product_discount)
# Price before discount: the pattern captures each run of digits together with
# the single non-word character that follows it; the first two captures are
# concatenated. Raises IndexError when fewer than two captures are found.
product_price_container_before = container.findAll("div",{"class":"search_price"})
product_price_before = product_price_container_before[0].text
test = re.findall('(\d+\W)',product_price_before)
testing = test[0] + test[1]
print(testing)
# Price after discount: extract() removes the first <span> from the tree, so
# .text no longer contains that span's text.
# NOTE(review): product_price_after is read again in the f.write(...) call
# below; if this list is empty it is unset or stale from an earlier product.
product_price_container_after = container.findAll("div",{"class":"discounted"})
for product_price_after in product_price_container_after:
product_price_after.find("span").extract()
print(product_price_after.text)
# Reviews: findall with two groups returns (group1, group2) tuples. a[0][0] is
# the "NN%" part of the first match and a[1][1] the 2+-digit number of the
# second match. NOTE(review): same unset/stale risk as above for `a` and `c`
# when there is no review summary.
product_review_container = container.findAll("span",{"class":"search_review_summary"})
for product_review in product_review_container:
prr = product_review.get('data-tooltip-html')
a = re.findall('(\d+%)|(\d+\d+)',prr)
c = a[1][1]
print(c)
# Platforms: the second CSS class of each platform_img span is tested for a
# leading "w", "m" or "l" (\A anchors the match at the start of the string).
product_platform_container = container.findAll("span",{"class":"platform_img"})
for product_platform in product_platform_container:
platform = product_platform.get('class')[1]
platt = re.findall('(\Aw)',platform)
plattt = re.findall('(\Am)',platform)
platttt = re.findall('(\Al)',platform)
print(platt)
print(plattt)
print(platttt)
# NOTE(review): this is the reported "always 1" bug. plattt is either [] or
# ['m'], so the loop body only runs when p == "m": the `elif` branch can never
# execute and macken is never reset to "0". Once one product sets it to "1",
# every later row reuses that value.
for p in plattt:
if "m" in p:
macken = "1"
elif "m" not in p:
macken = "0"
print(macken)
# NOTE(review): `y` is not defined anywhere in this listing. Field order puts
# `y` under "Win", macken (the mac flag) under "Lin" and the literal "blah"
# under "Osx". The title is written without escaping, so a comma in a title
# shifts the columns.
f.write(titl + "," + product_date.replace(",","") + "," + product_discount.replace("\n", "") + "," + testing.replace(",", ".") + "," + product_price_after.text.replace("\n","").replace(" ", "").replace(",",".").replace("\t\t\t\t\t\t\t","") + "," + a[0][0] + "," + c.replace(",","") + "," + y + "," + macken + "," + "blah" + "," + st + "\n")
f.close()
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0 (replaced by on_bad_lines) -- confirm the pandas version in use. The
# returned DataFrame is not assigned to anything here.
pd.read_csv("products.csv", error_bad_lines=False)
我也正在将其写入 csv 文件。所以当我将它写入 csv 文件时,它只会说 1, 1, 1, 1, 1...
我正在从这个页面获取数据:'https://store.steampowered.com/search/?specials=1&page=1'
我知道这个问题有点令人困惑,所以希望您能提供帮助,如果您需要更多代码,请告诉我。
我会这样做:
import csv
# ...
# Build one CSV row per game with 0/1 flags for each supported platform.
# `containers` (the list of search-result containers), `f` (the already-open
# output file) and `re` come from the surrounding script.
rows = []  # created once, before looping over the games
for container in containers:
    # Accumulate matches over *all* platform icons of this game, so a title
    # that supports several platforms gets several flags set in one row.
    win_p, mac_p, linux_p = [], [], []
    for product_platform in container.findAll("span", {"class": "platform_img"}):
        classes = product_platform.get("class") or []
        if len(classes) < 2:
            # No platform-specific class next to "platform_img"; nothing to test.
            continue
        platform = classes[1]
        # Raw strings keep "\A" (start-of-string anchor) from being read as a
        # string escape sequence.
        win_p += re.findall(r"\Aw", platform)
        mac_p += re.findall(r"\Am", platform)
        linux_p += re.findall(r"\Al", platform)
    rows.append({
        "linux": 1 if linux_p else 0,
        "win": 1 if win_p else 0,
        "mac": 1 if mac_p else 0,
    })

# After you parsed all entries...
fieldnames = ['mac', 'win', 'linux']
writer = csv.DictWriter(f, fieldnames=fieldnames)
writer.writeheader()
writer.writerows(rows)
说明:在我们用 `re` 识别平台后,我们创建 csv 行,其中 `mac`、`win` 和 `linux` 仅当它们对应的匹配项(`mac_p`、`win_p` 和 `linux_p`)不为空时才会是 1。这里的 `f` 是你已经打开的文件对象。
请查看这篇文章,其中展示了如何在 Python 中使用 csv 文件。
你的 if 语句写错了,这就是你总是得到 1 的原因,请参阅下面的代码!
import requests,csv
from bs4 import BeautifulSoup
# Download page 1 of Steam's "specials" search results and write one CSV row
# per game: its title plus '1'/'0' flags for Windows, macOS and Linux.
req = requests.get(
    'https://store.steampowered.com/search/?specials=1&page=1',
    timeout=30,  # do not hang forever on a stalled connection
)
# Fail loudly on an HTTP error instead of silently parsing an error page.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

data = []
for entry in soup.find_all('div', attrs={'class': 'col search_name ellipsis'}):
    title = entry.find('span', attrs={'class': 'title'}).text
    data.append({
        # Keep the title as str: on Python 3 the csv module would write a
        # bytes value as its repr (b'...'). Encoding happens once, when the
        # file is written.
        'title': title,
        # A platform is supported when a <span> carrying that class exists.
        'win': '1' if entry.find('span', attrs={'class': 'win'}) else '0',
        'mac': '1' if entry.find('span', attrs={'class': 'mac'}) else '0',
        'linux': '1' if entry.find('span', attrs={'class': 'linux'}) else '0',
    })

# newline='' is what the csv module expects; the explicit encoding keeps
# non-ASCII titles from depending on the platform's default encoding.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    fields = ['title', 'win', 'mac', 'linux']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)