Crawling: Extract all text and links (href and ng-href) from an AngularJS website and crawl
After a lot of effort trying to crawl an AngularJS page behind single sign-on, I have arrived at the code below. The code runs fine: it logs in, opens the desired page, and scrapes it, but I am not getting all of the links and text present on the Angular-rendered site. My XPath appears to be correct.
It is also not crawling the links that are being extracted. What do I need to change in the code to extract all the text from the site and from the subsequent pages?
import scrapy
from scrapy import signals
from scrapy.http import TextResponse
from scrapy.xlib.pydispatch import dispatcher
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from ps_crawler.items import PsCrawlerItem
import time
from selenium.webdriver.common.keys import Keys


class SISSpider(scrapy.Spider):
    name = "SIS"
    allowed_domains = ["domain.com"]
    start_urls = ["https://domain.com/login?"]

    def __init__(self):
        self.driver = webdriver.Chrome()
        # close the browser when the spider finishes
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.close()

    def parse(self, response):
        # Selenium part of the job: log in through the SSO form
        self.driver.get("https://domain.com/login?")
        time.sleep(5)
        self.driver.find_element_by_xpath('//*[@id="Login"]/div[2]/div[1]/div[2]/form/div[1]/input').send_keys("ssasdad")
        self.driver.find_element_by_xpath('//*[@id="Login"]/div[2]/div[1]/div[2]/form/div[2]/input').send_keys("")
        #self.driver.find_element_by_xpath('//*[@id="login"]').click()
        more_btn = WebDriverWait(self.driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, '//*[@id="login"]'))
        )
        time.sleep(5)
        more_btn.click()
        time.sleep(5)
        # open the admin page in a new tab and switch to it
        self.driver.execute_script("window.open('https://domain.com/#/admin','_blank');")
        time.sleep(10)
        window_now = self.driver.window_handles[1]
        self.driver.switch_to_window(window_now)
        ## stop when we reach the desired page
        #if self.driver.current_url.endswith('page=20'):
        #    break
        # now Scrapy should do the job: wrap the rendered DOM in a response
        time.sleep(10)
        response = TextResponse(url=self.driver.current_url, body=self.driver.page_source, encoding='utf-8')
        time.sleep(10)
        for post in response.xpath('//div'):
            item = PsCrawlerItem()
            print post.xpath('a/span/text()').extract(), post.xpath('a/@href').extract(), post.xpath('a/@ng-href').extract()
You just need to tweak your XPath slightly, as follows. Your relative paths under each `//div` only reach text inside `a/span`, whereas an absolute `//text()` collects every text node in the document. Hope this solves the problem.
for post in response.xpath('//body'):
    print post.xpath('//text()').extract(), post.xpath('//a//@href').extract()
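The snippet above only collects `href`, while the question also asks for `ng-href` and for crawling the extracted links. As a minimal sketch of one way to do both (not the answerer's code; it assumes the imports from the question, the same `TextResponse` built from `driver.page_source`, and a hypothetical `parse_page` callback), you could hand the links back to Scrapy:

def extract_and_follow(self, response):
    # all visible text on the rendered page
    texts = response.xpath('//body//text()').extract()
    # gather both plain and Angular-generated link attributes
    links = (response.xpath('//a/@href').extract() +
             response.xpath('//a/@ng-href').extract())
    for link in links:
        # skip unresolved Angular template bindings such as {{item.url}}
        if '{{' in link:
            continue
        # resolve relative URLs and let Scrapy schedule the follow-up request;
        # parse_page is a hypothetical callback you would define yourself
        yield scrapy.Request(response.urljoin(link), callback=self.parse_page)

Note that because the followed pages are also rendered by Angular, a plain Scrapy request will not execute their JavaScript; the follow-up requests would still need to go through Selenium (for example in a downloader middleware) for the content to be present.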