Python requests 和 bs4:如何遍历元素的子节点(children)
python requests and bs4 how to navigate through the children of an element
所以这是我的代码
from bs4 import BeautifulSoup
import requests
import time
URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'
# CSS selector for the subject cell whose text serves as the "newest book" marker.
SUBJECT_SELECTOR = 'td[class^="subject windowbg2"]'
# requests applies no timeout by default; without one a stalled connection
# would block the poller forever.
REQUEST_TIMEOUT = 10
# Seconds to wait between two polls of the board.
POLL_INTERVAL = 30


def fetch_newest_book(url=URL):
    """Load the board page and return the text of the first matching subject cell.

    Args:
        url: Address of the board page to download.

    Returns:
        The text of the first element matching SUBJECT_SELECTOR, or None when
        the page contains no such element.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.content, "html.parser")
    cell = soup.select_one(SUBJECT_SELECTOR)
    return None if cell is None else cell.text


# gets the newest book; this first value is only the baseline and is not printed
book = fetch_newest_book()
while True:
    try:
        # reloads the page and gets the newest book
        new_book = fetch_newest_book()
    except requests.RequestException as exc:
        # A transient network failure should not stop the monitor; report it
        # and try again on the next cycle.
        print(f"could not load page: {exc}")
    else:
        # checks if a new book has been uploaded
        if new_book is None:
            print("no book entry found on page")
        elif new_book == book:
            print("no new book found")
        else:
            print(new_book)
            # Reuse the value already extracted instead of querying the page again.
            book = new_book
    # repeats after 30 seconds
    time.sleep(POLL_INTERVAL)
但是如果你去网站看看,就会发现我获取的是最新一本书的整段文本,而我希望能够将标题和作者分开。标题和作者在不同的元素中,但这些元素没有可用来识别它们的标识(比如 class 或 ID)。如果你能帮忙,请帮忙,谢谢。
假设 html 在各条目之间保持一致(我只检查了几个),那么当在顶部的置顶列表下方找到下一个条目的文本时(我假设这就是一本新书),你需要提取这本书的 url,访问该 url,然后可以使用 `:-soup-contains` 按特定文本定位作者和书名,
再使用 `next_sibling` 来获得所需的返回值。
注意:为了便于说明,本回答去掉了 while 循环。
重点是 elif 分支中新增的代码。
from bs4 import BeautifulSoup
import requests
URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'
# requests applies no timeout by default; without one a stalled connection
# would block the script forever.
REQUEST_TIMEOUT = 10


def load_soup(url):
    """Download *url* and return its body parsed with the built-in html.parser.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(r.content, "html.parser")


def print_book_details(book_url):
    """Open a book's page and print the nodes that follow its TITLE and AUTHOR labels.

    For each label, the first <strong> element containing the label text is
    located and the node two siblings after it is printed. A "not found"
    message is printed instead when the label or that sibling is missing.
    """
    soup = load_soup(book_url)
    # NOTE(review): the trailing space in 'TITLE ' is kept exactly as in the
    # original selector text; presumably it avoids matching another label -
    # confirm against the page markup.
    for member in ['TITLE ', 'AUTHOR']:
        label = soup.select_one(f'strong:-soup-contains("{member}")')
        # The wanted value sits two siblings after the <strong> label; walk
        # the chain step by step so a missing node does not raise.
        first = label.next_sibling if label is not None else None
        value = first.next_sibling if first is not None else None
        if value is None:
            print(f"{member.strip()} not found")
        else:
            print(value)


# gets the newest book
book = ''  # for testing altered this line
# The board page is loaded once; a second identical download would add nothing.
soup = load_soup(URL)
newest = soup.select_one('td[class^="subject windowbg2"]')
# checks if a new book has been uploaded
if newest is None:
    print("no book entry found on page")
elif book == newest.text:
    print("no new book found")
else:
    new_book = newest.text
    print(new_book)
    # Link of the first subject cell that follows a non-sticky cell in a
    # class-less row.
    link = soup.select_one('tr:not([class]) td:not([class*=stickybg]) ~ .subject a')
    new_book_url = link.get('href') if link is not None else None
    if new_book_url is None:
        print("no book link found")
    else:
        print_book_details(new_book_url)
所以这是我的代码
from bs4 import BeautifulSoup
import requests
import time
URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'
# CSS selector for the subject cell whose text serves as the "newest book" marker.
SUBJECT_SELECTOR = 'td[class^="subject windowbg2"]'
# requests applies no timeout by default; without one a stalled connection
# would block the poller forever.
REQUEST_TIMEOUT = 10
# Seconds to wait between two polls of the board.
POLL_INTERVAL = 30


def fetch_newest_book(url=URL):
    """Load the board page and return the text of the first matching subject cell.

    Args:
        url: Address of the board page to download.

    Returns:
        The text of the first element matching SUBJECT_SELECTOR, or None when
        the page contains no such element.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.content, "html.parser")
    cell = soup.select_one(SUBJECT_SELECTOR)
    return None if cell is None else cell.text


# gets the newest book; this first value is only the baseline and is not printed
book = fetch_newest_book()
while True:
    try:
        # reloads the page and gets the newest book
        new_book = fetch_newest_book()
    except requests.RequestException as exc:
        # A transient network failure should not stop the monitor; report it
        # and try again on the next cycle.
        print(f"could not load page: {exc}")
    else:
        # checks if a new book has been uploaded
        if new_book is None:
            print("no book entry found on page")
        elif new_book == book:
            print("no new book found")
        else:
            print(new_book)
            # Reuse the value already extracted instead of querying the page again.
            book = new_book
    # repeats after 30 seconds
    time.sleep(POLL_INTERVAL)
但是如果你去网站看看,就会发现我获取的是最新一本书的整段文本,而我希望能够将标题和作者分开。标题和作者在不同的元素中,但这些元素没有可用来识别它们的标识(比如 class 或 ID)。如果你能帮忙,请帮忙,谢谢。
假设 html 在各条目之间保持一致(我只检查了几个),那么当在顶部的置顶列表下方找到下一个条目的文本时(我假设这就是一本新书),你需要提取这本书的 url,访问该 url,然后可以使用 `:-soup-contains` 按特定文本定位作者和书名,
再使用 `next_sibling` 来获得所需的返回值。
注意:为了便于说明,本回答去掉了 while 循环。
重点是 elif 分支中新增的代码。
from bs4 import BeautifulSoup
import requests
URL = 'http://www.vn-meido.com/k1/index.php?board=17.0'
# requests applies no timeout by default; without one a stalled connection
# would block the script forever.
REQUEST_TIMEOUT = 10


def load_soup(url):
    """Download *url* and return its body parsed with the built-in html.parser.

    Raises:
        requests.RequestException: If the page cannot be downloaded.
    """
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(r.content, "html.parser")


def print_book_details(book_url):
    """Open a book's page and print the nodes that follow its TITLE and AUTHOR labels.

    For each label, the first <strong> element containing the label text is
    located and the node two siblings after it is printed. A "not found"
    message is printed instead when the label or that sibling is missing.
    """
    soup = load_soup(book_url)
    # NOTE(review): the trailing space in 'TITLE ' is kept exactly as in the
    # original selector text; presumably it avoids matching another label -
    # confirm against the page markup.
    for member in ['TITLE ', 'AUTHOR']:
        label = soup.select_one(f'strong:-soup-contains("{member}")')
        # The wanted value sits two siblings after the <strong> label; walk
        # the chain step by step so a missing node does not raise.
        first = label.next_sibling if label is not None else None
        value = first.next_sibling if first is not None else None
        if value is None:
            print(f"{member.strip()} not found")
        else:
            print(value)


# gets the newest book
book = ''  # for testing altered this line
# The board page is loaded once; a second identical download would add nothing.
soup = load_soup(URL)
newest = soup.select_one('td[class^="subject windowbg2"]')
# checks if a new book has been uploaded
if newest is None:
    print("no book entry found on page")
elif book == newest.text:
    print("no new book found")
else:
    new_book = newest.text
    print(new_book)
    # Link of the first subject cell that follows a non-sticky cell in a
    # class-less row.
    link = soup.select_one('tr:not([class]) td:not([class*=stickybg]) ~ .subject a')
    new_book_url = link.get('href') if link is not None else None
    if new_book_url is None:
        print("no book link found")
    else:
        print_book_details(new_book_url)