Python requests 和 bs4:如何遍历元素的子元素(children)

python requests and bs4 how to navigate through the children of an element

所以这是我的代码

from bs4 import BeautifulSoup
import requests
import time

URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'

# CSS selector for the subject cell whose text is compared between polls.
SUBJECT_SELECTOR = 'td[class^="subject windowbg2"]'
# Seconds to wait between two polls of the board page.
POLL_INTERVAL = 30
# requests waits indefinitely unless a timeout is given, so a single stalled
# connection would otherwise freeze the polling loop.
REQUEST_TIMEOUT = 10


def fetch_newest_book():
    """Download the board page and return the text of the newest book entry.

    Returns:
        The text of the first element matching SUBJECT_SELECTOR, or None
        when the page contains no such element.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    r = requests.get(URL, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.content, "html.parser")
    cell = soup.select_one(SUBJECT_SELECTOR)
    return cell.text if cell is not None else None


# gets the newest book once, as the baseline to compare against
book = fetch_newest_book()

while True:
    try:
        # reloads the page and gets the newest book
        new_book = fetch_newest_book()
    except requests.RequestException as exc:
        # A transient network failure should not end the watcher.
        print(f"could not load the page: {exc}")
    else:
        if new_book is None:
            print("no book entry found on the page")
        elif book == new_book:
            print("no new book found")
        else:
            print(new_book)
            # Reuse the value already extracted instead of querying again.
            book = new_book
    # repeats after POLL_INTERVAL seconds
    time.sleep(POLL_INTERVAL)

但是如果你去网站上看一下,就会发现我获取到的是最新上传的那本书的整段文本,而我希望能够把标题和作者分开。标题和作者位于不同的元素中,但这些元素没有任何可以用来识别它们的标记(比如 class 或 ID)。所以如果你能帮忙,请帮帮我,谢谢。

假设 html 在各条目之间保持一致(我只检查了几个),那么当顶部置顶(sticky)列表下方出现新的文本时(我假设这就是一本新书),你需要提取这本书的 URL 并访问该 URL,然后就可以使用 `:-soup-contains` 按特定文本定位作者和书名,再用 `next_sibling` 来获取所需的值。

注意:为了便于说明,我在这个答案中去掉了 while 循环。重点是 elif 分支中新增的内容。

from bs4 import BeautifulSoup
import requests


URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'

# requests waits indefinitely unless a timeout is given.
REQUEST_TIMEOUT = 10

# gets the newest book
book = ''   # for testing altered this line

# loads page (requested once; a second identical request adds nothing)
r = requests.get(URL, timeout=REQUEST_TIMEOUT)
soup = BeautifulSoup(r.content, "html.parser")
# gets the newest book
new_book = soup.select_one('td[class^="subject windowbg2"]').text
# checks if a new book has been uploaded
if book == new_book:
    print("no new book found")
elif book != new_book:
    print(new_book)
    # Link of the first subject cell that follows a non-sticky cell in a
    # row without a class attribute.
    new_book_url = soup.select_one('tr:not([class]) td:not([class*=stickybg]) ~ .subject a')['href']
    # Load the book's own page; `soup` now refers to that page.
    r = requests.get(new_book_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.content, "html.parser")
    for member in ['TITLE ', 'AUTHOR']:
        # Find the <strong> label by its text, then step two siblings forward.
        # NOTE(review): presumably the first sibling is a separator node and
        # the second holds the value - confirm against the page markup.
        print(soup.select_one(f'strong:-soup-contains("{member}")').next_sibling.next_sibling)