使用 Selenium 加载所有帖子然后提取帖子
Issue with Selenium to load all posts and then extract posts
我要抓取这个 URL:https://healthunlocked.com/positivewellbeing。
我写了下面的代码:首先点击 "see more posts" 按钮,
加载所有帖子,然后提取每个帖子的全文。我正在尝试运行这段代码,但它耗时太久了!代码已经运行了将近两天,我还在等待它运行完成。我猜它仍然卡在代码第一部分加载帖子的环节,因为我还没有看到任何输出(提取出的帖子)。不知道我的做法对不对?
我的代码如下:
# Load every post on the listing page by clicking "see more posts" until the
# button disappears, then visit each post link and print its body text.
wait = WebDriverWait(driver, 10)
driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

## Load all posts.
# BUG FIX: the original `while (driver.find_element_by_xpath(...))` could never
# evaluate to False -- find_element_by_xpath raises NoSuchElementException once
# the button is gone instead of returning a falsy value, so the loop either ran
# forever or died with an uncaught exception (the likely cause of the
# days-long run). find_elements_by_xpath returns a plain list (empty when the
# button is absent), which gives a clean exit condition without exceptions.
SEE_MORE_BUTTON_XPATH = '//*[@id="__next"]/main/div[2]/div[1]/div[1]/div[3]/div[31]/button'
while True:
    buttons = driver.find_elements_by_xpath(SEE_MORE_BUTTON_XPATH)
    if not buttons:
        break  # no "see more posts" button left -> everything is loaded
    time.sleep(5)  # give the page time to render the newly appended posts
    buttons[0].click()

## Extract posts.
driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
time.sleep(3)
# Collect the href of every post entry on the listing page.
lst_post = [x.get_attribute('href') for x in driver.find_elements_by_xpath("//div[@class='results-post']/a")]
for lst in lst_post:
    time.sleep(5)
    driver.get(lst)
    # Wait (up to 10 s, per the WebDriverWait above) for the post body and the
    # like button to be present before reading them.
    post_body = wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/main/div[2]/div[1]/div[1]/div[1]")))
    like_count = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".post-action--like")))
    print(post_body.text)
    print('\n')
似乎该站点使用 API 来获取 post 的列表并获取 post 数据:
post 列表:https://solaris.healthunlocked.com/posts/positivewellbeing/latest
post url: https://solaris.healthunlocked.com/posts/positivewellbeing/145621054
使用 requests,你可以直接调用这些 API 而不是使用 selenium,这样会快得多。
此外,通过这种方式,您可以通过记录最后一个 post ID 来控制何时停止抓取。例如,如有必要,您可以从停止抓取的地方开始。
以下代码获取上个月创建的所有 post 并获取它们各自的信息:
# Fetch all posts created in the last month through the site's JSON API
# (no browser needed), then fetch each post's body and like count.
import requests
import time
from datetime import datetime, timedelta

# Paginated listing endpoint; each call returns one batch of posts, newest first.
allPostUrl = 'https://solaris.healthunlocked.com/posts/positivewellbeing/latest'

now = datetime.today()
postFromTime = now + timedelta(days=-1*30)  # cutoff: only posts from the last month

fetchAllPost = False
nextPost = ""   # postId of the oldest post seen so far, used as the pagination cursor
posts = []

while not fetchAllPost:
    # Append the cursor parameter only after the first page.
    url = f'{allPostUrl}{f"?createdBeforePostId={nextPost}" if nextPost else ""}'
    print(f"GET {url}")
    # timeout= prevents the loop from hanging forever on a stalled connection;
    # raise_for_status() fails fast on HTTP errors instead of choking on .json().
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    result = r.json()
    posts.extend(result)

    # Stop when the API returns nothing new or we've paged past the time window.
    if len(result) > 0 and nextPost != result[-1]["postId"]:
        lastCreated = datetime.strptime(result[-1]["dateCreated"], '%Y-%m-%dT%H:%M:%S.%fZ')
        if lastCreated < postFromTime:
            fetchAllPost = True
        else:
            nextPost = result[-1]["postId"]
    else:
        fetchAllPost = True

print(f"received {len(posts)} posts")

# Second pass: fetch each post's detail record for its body and rating count.
data = []
for idx, post in enumerate(posts):
    url = f'https://solaris.healthunlocked.com/posts/positivewellbeing/{post["postId"]}'
    print(f"[{idx+1}/{len(posts)}] GET {url}")
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    result = r.json()
    data.append({
        "body": result["body"],
        "likes": result["numRatings"]
    })

print(data)
我闲来无事,所以提出了自己的方法。
它先抓取列表页上的所有链接并逐一访问,然后返回列表页,只抓取尚未访问过的新链接。
# Scrape the HealthUnlocked "positivewellbeing" community with Selenium:
# repeatedly click "See more posts" to extend the listing, collect post links
# not yet visited, open each one, and print its body text plus like count.
# NOTE(review): the outer `while True` never terminates on its own — stop it
# manually (or it ends when a DOM lookup eventually fails).
import time
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException

driver = webdriver.Chrome()
# Implicit wait: every find_element* call polls the DOM for up to 6 s
# before raising, so no explicit WebDriverWait calls are needed below.
driver.implicitly_wait(6)
driver.get("https://healthunlocked.com/positivewellbeing/posts")
# click accept cookies
driver.find_element_by_id("ccc-notify-accept").click()

post_links = set()  # hrefs already collected, so each post is visited once
while True:
    # Return to the listing page (the loop below navigates away to post pages).
    driver.get("https://healthunlocked.com/positivewellbeing/posts")
    # Keep only entries whose class is *exactly* "results-post" — filters out
    # container elements whose class merely contains that token.
    all_posts = [post for post in
                 driver.find_element_by_class_name("results-posts").find_elements_by_class_name("results-post") if
                 "results-post" == post.get_attribute("class")]
    # handle clicking more posts: keep expanding the listing until it shows
    # more posts than we have already collected (i.e. at least one new post).
    while len(all_posts) <= len(post_links):
        see_more_posts = [btn for btn in driver.find_elements_by_class_name("btn-secondary")
                          if btn.text == "See more posts"]
        try:
            see_more_posts[0].click()
        except ElementClickInterceptedException:
            # handle floating box covering "see more posts" button:
            # remove the overlay via JS, then retry the click.
            driver.execute_script("return document.getElementsByClassName('floating-box-sign-up')[0].remove();")
            see_more_posts[0].click()
        all_posts = [post for post in driver.find_element_by_class_name("results-posts").find_elements_by_class_name("results-post") if "results-post" == post.get_attribute("class")]
    # populate links
    start_from = len(post_links)
    for post in all_posts[start_from:]:  # len(post_links): <-- to avoid visiting same links
        # save link
        link = post.find_element_by_tag_name("a").get_attribute("href")
        post_links.add(link)
    # visit the site and scrape info (only the links added this round)
    for post_site in list(post_links)[start_from:]:
        driver.get(post_site)
        post_text = driver.find_element_by_class_name("post-body").text
        for btn in driver.find_element_by_class_name("post-actions__buttons").find_elements_by_tag_name("button"):
            if "Like" in btn.text:
                # Button text looks like "Like (3)"; take the digit after "(".
                # NOTE(review): single-character index — breaks for counts >= 10.
                post_like = btn.text.split()[1][1]
        print(f"\n{post_text}\nLikes -->{post_like}\n")
我要抓取这个 URL:https://healthunlocked.com/positivewellbeing。
我写了下面的代码:首先点击 "see more posts" 按钮,
加载所有帖子,然后提取每个帖子的全文。我正在尝试运行这段代码,但它耗时太久了!代码已经运行了将近两天,我还在等待它运行完成。我猜它仍然卡在代码第一部分加载帖子的环节,因为我还没有看到任何输出(提取出的帖子)。不知道我的做法对不对?
我的代码如下:
# Load every post on the listing page by clicking "see more posts" until the
# button disappears, then visit each post link and print its body text.
wait = WebDriverWait(driver, 10)
driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

## Load all posts.
# BUG FIX: the original `while (driver.find_element_by_xpath(...))` could never
# evaluate to False -- find_element_by_xpath raises NoSuchElementException once
# the button is gone instead of returning a falsy value, so the loop either ran
# forever or died with an uncaught exception (the likely cause of the
# days-long run). find_elements_by_xpath returns a plain list (empty when the
# button is absent), which gives a clean exit condition without exceptions.
SEE_MORE_BUTTON_XPATH = '//*[@id="__next"]/main/div[2]/div[1]/div[1]/div[3]/div[31]/button'
while True:
    buttons = driver.find_elements_by_xpath(SEE_MORE_BUTTON_XPATH)
    if not buttons:
        break  # no "see more posts" button left -> everything is loaded
    time.sleep(5)  # give the page time to render the newly appended posts
    buttons[0].click()

## Extract posts.
driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
time.sleep(3)
# Collect the href of every post entry on the listing page.
lst_post = [x.get_attribute('href') for x in driver.find_elements_by_xpath("//div[@class='results-post']/a")]
for lst in lst_post:
    time.sleep(5)
    driver.get(lst)
    # Wait (up to 10 s, per the WebDriverWait above) for the post body and the
    # like button to be present before reading them.
    post_body = wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/main/div[2]/div[1]/div[1]/div[1]")))
    like_count = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".post-action--like")))
    print(post_body.text)
    print('\n')
似乎该站点使用 API 来获取 post 的列表并获取 post 数据:
post 列表:https://solaris.healthunlocked.com/posts/positivewellbeing/latest
post url: https://solaris.healthunlocked.com/posts/positivewellbeing/145621054
使用 requests,你可以直接调用这些 API 而不是使用 selenium,这样会快得多。
此外,通过这种方式,您可以通过记录最后一个 post ID 来控制何时停止抓取。例如,如有必要,您可以从停止抓取的地方开始。
以下代码获取上个月创建的所有 post 并获取它们各自的信息:
# Fetch all posts created in the last month through the site's JSON API
# (no browser needed), then fetch each post's body and like count.
import requests
import time
from datetime import datetime, timedelta

# Paginated listing endpoint; each call returns one batch of posts, newest first.
allPostUrl = 'https://solaris.healthunlocked.com/posts/positivewellbeing/latest'

now = datetime.today()
postFromTime = now + timedelta(days=-1*30)  # cutoff: only posts from the last month

fetchAllPost = False
nextPost = ""   # postId of the oldest post seen so far, used as the pagination cursor
posts = []

while not fetchAllPost:
    # Append the cursor parameter only after the first page.
    url = f'{allPostUrl}{f"?createdBeforePostId={nextPost}" if nextPost else ""}'
    print(f"GET {url}")
    # timeout= prevents the loop from hanging forever on a stalled connection;
    # raise_for_status() fails fast on HTTP errors instead of choking on .json().
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    result = r.json()
    posts.extend(result)

    # Stop when the API returns nothing new or we've paged past the time window.
    if len(result) > 0 and nextPost != result[-1]["postId"]:
        lastCreated = datetime.strptime(result[-1]["dateCreated"], '%Y-%m-%dT%H:%M:%S.%fZ')
        if lastCreated < postFromTime:
            fetchAllPost = True
        else:
            nextPost = result[-1]["postId"]
    else:
        fetchAllPost = True

print(f"received {len(posts)} posts")

# Second pass: fetch each post's detail record for its body and rating count.
data = []
for idx, post in enumerate(posts):
    url = f'https://solaris.healthunlocked.com/posts/positivewellbeing/{post["postId"]}'
    print(f"[{idx+1}/{len(posts)}] GET {url}")
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    result = r.json()
    data.append({
        "body": result["body"],
        "likes": result["numRatings"]
    })

print(data)
我闲来无事,所以提出了自己的方法。它先抓取列表页上的所有链接并逐一访问,然后返回列表页,只抓取尚未访问过的新链接。
# Scrape the HealthUnlocked "positivewellbeing" community with Selenium:
# repeatedly click "See more posts" to extend the listing, collect post links
# not yet visited, open each one, and print its body text plus like count.
# NOTE(review): the outer `while True` never terminates on its own — stop it
# manually (or it ends when a DOM lookup eventually fails).
import time
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException

driver = webdriver.Chrome()
# Implicit wait: every find_element* call polls the DOM for up to 6 s
# before raising, so no explicit WebDriverWait calls are needed below.
driver.implicitly_wait(6)
driver.get("https://healthunlocked.com/positivewellbeing/posts")
# click accept cookies
driver.find_element_by_id("ccc-notify-accept").click()

post_links = set()  # hrefs already collected, so each post is visited once
while True:
    # Return to the listing page (the loop below navigates away to post pages).
    driver.get("https://healthunlocked.com/positivewellbeing/posts")
    # Keep only entries whose class is *exactly* "results-post" — filters out
    # container elements whose class merely contains that token.
    all_posts = [post for post in
                 driver.find_element_by_class_name("results-posts").find_elements_by_class_name("results-post") if
                 "results-post" == post.get_attribute("class")]
    # handle clicking more posts: keep expanding the listing until it shows
    # more posts than we have already collected (i.e. at least one new post).
    while len(all_posts) <= len(post_links):
        see_more_posts = [btn for btn in driver.find_elements_by_class_name("btn-secondary")
                          if btn.text == "See more posts"]
        try:
            see_more_posts[0].click()
        except ElementClickInterceptedException:
            # handle floating box covering "see more posts" button:
            # remove the overlay via JS, then retry the click.
            driver.execute_script("return document.getElementsByClassName('floating-box-sign-up')[0].remove();")
            see_more_posts[0].click()
        all_posts = [post for post in driver.find_element_by_class_name("results-posts").find_elements_by_class_name("results-post") if "results-post" == post.get_attribute("class")]
    # populate links
    start_from = len(post_links)
    for post in all_posts[start_from:]:  # len(post_links): <-- to avoid visiting same links
        # save link
        link = post.find_element_by_tag_name("a").get_attribute("href")
        post_links.add(link)
    # visit the site and scrape info (only the links added this round)
    for post_site in list(post_links)[start_from:]:
        driver.get(post_site)
        post_text = driver.find_element_by_class_name("post-body").text
        for btn in driver.find_element_by_class_name("post-actions__buttons").find_elements_by_tag_name("button"):
            if "Like" in btn.text:
                # Button text looks like "Like (3)"; take the digit after "(".
                # NOTE(review): single-character index — breaks for counts >= 10.
                post_like = btn.text.split()[1][1]
        print(f"\n{post_text}\nLikes -->{post_like}\n")