使用 Python 从网站中提取日期
Date Extracting from a website using Python
# Find string between two strings
def find_between(s: str, first: str, last: str) -> str:
    """Return the part of *s* between *first* and the next *last*.

    The search for *last* begins immediately after the end of the first
    occurrence of *first*.

    Args:
        s: Text to search.
        first: Left delimiter.
        last: Right delimiter.

    Returns:
        The text strictly between the two delimiters, or ``""`` when
        either delimiter cannot be found.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
    except ValueError:
        # str.index raises ValueError when the substring is absent.
        return ""
    return s[start:end]
from urllib.request import urlopen
from lxml import html
import requests
link = "https://www.calendar-12.com/catholic_holidays/2019"

# Download the page; the context manager closes the HTTP response once read.
with urlopen(link) as response:
    content = response.read().decode("utf-8")

# Cut out the table body and split it into per-row chunks of raw HTML.
# NOTE(review): if the served HTML contains no literal "<tbody>" tag,
# find_between returns "" and only the "Day" header is printed -- possibly
# the failure being reported; confirm against the page source.
table = find_between(content, "<tbody>", "</tbody>")
rows = table.split("/tr")

# Build a one-column CSV: a "Day" header followed by one line per row chunk.
lines = ["Day"]
for row in rows:
    day = find_between(row, '">', "</t")
    day = find_between(day, "> ", "</")
    lines.append(day)
csv = "\n".join(lines) + "\n"

print(csv)
这段代码本应从网站中提取日期,但实际上并没有提取到:唯一的输出是 Day。能帮忙找出问题所在吗?
如果我正确理解你的问题,应该这样做:
import lxml.html as lh
import requests
link = "https://www.calendar-12.com/catholic_holidays/2019"

req = requests.get(link)
# Fail early on an HTTP error instead of parsing an error page.
req.raise_for_status()
doc = lh.fromstring(req.text)
tab = doc.xpath('//table')[0]  # first <table> element in the document

# Flatten the text of every link found in the table's classed rows.
# Inserting the marker 'xxx' after 'day,' and splitting on it breaks a
# "<weekday>, <date>" string into its weekday part and its date part.
rows = []
for t in tab.xpath('.//tr[@class]//td/a'):
    rows.extend(t.text.strip().replace('day,', 'day,xxx').split('xxx'))

# Walk the flat list three items at a time: (day, date, holiday).
# Assumes every table row contributes exactly three items -- TODO confirm.
for day, date, holiday in zip(rows[0::3], rows[1::3], rows[2::3]):
    print(day, date, holiday)
    #EDIT:
    #or to store these in variables:
    # NOTE(review): the original indentation was lost; this alternative is
    # assumed to belong inside the loop. With both prints kept, each entry
    # is printed twice -- keep whichever variant is needed.
    a, b, c = day, date, holiday
    print(a, b, c)
输出:
Tuesday, January 1, 2019 Solemnity of Mary, Mother of God
Sunday, January 6, 2019 Epiphany
Tuesday, March 5, 2019 Shrove Tuesday (Mardi Gras)
Wednesday, March 6, 2019 Ash Wednesday
等等
# Find string between two strings
def find_between(s: str, first: str, last: str) -> str:
    """Return the part of *s* between *first* and the next *last*.

    The search for *last* begins immediately after the end of the first
    occurrence of *first*.

    Args:
        s: Text to search.
        first: Left delimiter.
        last: Right delimiter.

    Returns:
        The text strictly between the two delimiters, or ``""`` when
        either delimiter cannot be found.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
    except ValueError:
        # str.index raises ValueError when the substring is absent.
        return ""
    return s[start:end]
from urllib.request import urlopen
from lxml import html
import requests
link = "https://www.calendar-12.com/catholic_holidays/2019"

# Download the page; the context manager closes the HTTP response once read.
with urlopen(link) as response:
    content = response.read().decode("utf-8")

# Cut out the table body and split it into per-row chunks of raw HTML.
# NOTE(review): if the served HTML contains no literal "<tbody>" tag,
# find_between returns "" and only the "Day" header is printed -- possibly
# the failure being reported; confirm against the page source.
table = find_between(content, "<tbody>", "</tbody>")
rows = table.split("/tr")

# Build a one-column CSV: a "Day" header followed by one line per row chunk.
lines = ["Day"]
for row in rows:
    day = find_between(row, '">', "</t")
    day = find_between(day, "> ", "</")
    lines.append(day)
csv = "\n".join(lines) + "\n"

print(csv)
这段代码本应从网站中提取日期,但实际上并没有提取到:唯一的输出是 Day。能帮忙找出问题所在吗?
如果我正确理解你的问题,应该这样做:
import lxml.html as lh
import requests
link = "https://www.calendar-12.com/catholic_holidays/2019"

req = requests.get(link)
# Fail early on an HTTP error instead of parsing an error page.
req.raise_for_status()
doc = lh.fromstring(req.text)
tab = doc.xpath('//table')[0]  # first <table> element in the document

# Flatten the text of every link found in the table's classed rows.
# Inserting the marker 'xxx' after 'day,' and splitting on it breaks a
# "<weekday>, <date>" string into its weekday part and its date part.
rows = []
for t in tab.xpath('.//tr[@class]//td/a'):
    rows.extend(t.text.strip().replace('day,', 'day,xxx').split('xxx'))

# Walk the flat list three items at a time: (day, date, holiday).
# Assumes every table row contributes exactly three items -- TODO confirm.
for day, date, holiday in zip(rows[0::3], rows[1::3], rows[2::3]):
    print(day, date, holiday)
    #EDIT:
    #or to store these in variables:
    # NOTE(review): the original indentation was lost; this alternative is
    # assumed to belong inside the loop. With both prints kept, each entry
    # is printed twice -- keep whichever variant is needed.
    a, b, c = day, date, holiday
    print(a, b, c)
输出:
Tuesday, January 1, 2019 Solemnity of Mary, Mother of God
Sunday, January 6, 2019 Epiphany
Tuesday, March 5, 2019 Shrove Tuesday (Mardi Gras)
Wednesday, March 6, 2019 Ash Wednesday
等等