使用 Python 从网站中提取日期

Extracting dates from a website using Python

# Find string between two strings
def find_between(s, first, last):
    """Return the part of *s* between *first* and the following *last*.

    The search for *last* begins right after the first occurrence of
    *first*, so the delimiters themselves are never part of the result.

    Args:
        s: Text to search.
        first: Opening delimiter.
        last: Closing delimiter.

    Returns:
        The text between the two delimiters, or an empty string when
        either delimiter cannot be found.
    """
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
    except ValueError:
        # str.index raises ValueError when a delimiter is missing.
        return ""
    return s[start:end]

from urllib.request import urlopen
from lxml import html
import requests

link = "https://www.calendar-12.com/catholic_holidays/2019"
# Use the response as a context manager so the connection is closed as
# soon as the body has been read.
with urlopen(link) as response:
    content = response.read().decode("utf-8")

# NOTE(review): find_between returns "" when "<tbody>" does not occur
# literally in the downloaded markup; in that case only the "Day" header is
# printed. Presumably that is the reported symptom - confirm against the raw
# response before relying on this delimiter.
table = find_between(content, "<tbody>", "</tbody>")
rows = table.split("/tr")

# Extract one value per row fragment: first the text between '">' and
# "</t", then, inside that, the text between "> " and "</".
days = []
for row in rows:
    day = find_between(row, '">', "</t")
    day = find_between(day, "> ", "</")
    days.append(day)

# Header line followed by one line per extracted value.
csv = "Day\n" + "".join(day + "\n" for day in days)

print(csv)

这段代码本应从网站中提取日期,但实际上并没有做到:唯一的输出是 "Day"。你能帮忙找出问题所在吗?

如果我正确理解你的问题,应该这样做:

import lxml.html as lh
import requests

link = "https://www.calendar-12.com/catholic_holidays/2019"
req = requests.get(link)

# Parse the downloaded page and take the first <table> in the document.
doc = lh.fromstring(req.text)
tab = doc.xpath('//table')[0]

# Gather the text of every link found in a cell of a row that carries a
# class attribute. Each text is cut right after "day," (presumably
# separating the weekday from the date, judging by the sample output) and
# the pieces are appended to one flat list.
rows = []
for t in tab.xpath('.//tr[@class]//td/a'):
    marked = t.text.strip().replace('day,', 'day,xxx')
    rows += marked.split('xxx')

# Consume the flat list three items at a time through one shared iterator;
# an incomplete trailing group is dropped, as zip stops at the shortest input.
triples = zip(*[iter(rows)] * 3)
for day, date, holiday in triples:
    print(day, date, holiday)
    # EDIT: alternatively, keep the three fields in variables of their own:
    a, b, c = day, date, holiday
    print(a, b, c)

输出:

Tuesday,  January 1, 2019 Solemnity of Mary, Mother of God
Sunday,  January 6, 2019 Epiphany
Tuesday,  March 5, 2019 Shrove Tuesday (Mardi Gras)
Wednesday,  March 6, 2019 Ash Wednesday

等等