Scrapy:如何在抓取时忽略所有 Javascript、JQuery、...
Scrapy: How to ignore all Javascript, JQuery, ... while scraping
我想我有一个解决方案,但当然有些网站有不同的结构,它在那里不起作用。我需要知道如何删除所有 Javascript、JQuery 以及网站源代码中可能包含的所有非纯文本代码。
我在 MySpider.py 和 items.py 上尝试了此解决方案(Scraping text without javascript code using scrapy)。我不知道为什么 remove_tags_with_content 在这里不起作用,但它确实无效。目前可以运行的文件如下所示:
MySpider.py
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.utils.markup import remove_tags_with_content
from Scrapy_One.items import Items_Main
class MySpider(CrawlSpider):
    """Crawl www.abb.de and extract the visible body text of every page.

    Pages whose URL matches legal/about/privacy-style keywords are skipped
    via the LinkExtractor ``deny`` pattern.
    """
    name = 'spiderName'
    allowed_domains = ['abb.de']
    start_urls = ['http://www.abb.de/']
    # NOTE: the deny pattern must be a raw string — the original used plain
    # strings, so `\w`/`\W` were invalid escape sequences (SyntaxWarning on
    # modern Python). The regex itself is unchanged.
    rules = (
        Rule(
            LinkExtractor(
                allow=('', ),
                deny=(r'/(\w|\W)*([Ii]mpressum|[Aa]bout|[Pp]rivacy|[Tt]erms|[Cc]opyright|[Hh]elp|[Hh]ilfe|[Dd]atenschutz|[Rr]echtliche(\w|\W)*[Hh]inweis|[Hh]aftungsausschlu)'),
                unique=True,
            ),
            callback='parse_stuff',
            follow=True,
        ),
    )

    def parse_stuff(self, response):
        """Return a list of items, each holding all text nodes under <body>.

        NOTE(review): the loop iterates the '//html' selector list, but the
        loader is built from ``response`` and an absolute XPath, so every
        iteration produces an identical item; '//html' matches once, so in
        practice one item is returned.
        """
        hxs = Selector(response)
        sites = hxs.xpath('//html')
        items_main = []
        for site in sites:
            loader = ItemLoader(item=Items_Main(), response=response)
            loader.add_xpath('fragment', '//body//text()')
            items_main.append(loader.load_item())
        return items_main
items.py
from scrapy.item import Item, Field
from scrapy.loader.processors import MapCompose, Join, TakeFirst
#from scrapy.utils.markup import remove_tags_with_content
from w3lib.html import replace_escape_chars, remove_tags
class Items_Main(Item):
    """Single-field item carrying the visible text of one crawled page."""
    # Each extracted chunk is whitespace-stripped, stripped of markup tags,
    # and unescaped; all chunks are then joined into one string on output.
    fragment = Field(
        input_processor=MapCompose(str.strip, remove_tags, replace_escape_chars),
        output_processor=Join(),
    )
我知道这不是我想要的(删除每个 Javascript-,JQuery-,...代码)但这是我必须采用的当前情况。
因此,如果您对如何摆脱它有任何建议,我想尝试一下。
我想我找到了答案(至少对我有用)。
我将以下行从 MySpider.py loader.add_xpath('fragment', '//body//text()')
更改为 loader.add_xpath('fragment', '//*[not(self::script)]/text()')
。
所以现在这个文件的完整代码是:
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from Scrapy_One.items import Items_Main
class MySpider(CrawlSpider):
    """Crawl www.example.de and extract visible text, excluding <script> content.

    Uses ``//*[not(self::script)]/text()`` so JavaScript bodies embedded in
    the page are not collected. Legal/about/privacy/contact-style URLs are
    skipped via the ``deny`` pattern.
    """
    name = 'spiderName'
    allowed_domains = ['example.de']
    start_urls = ['http://www.example.de/']
    # NOTE: the deny pattern must be a raw string — the original used plain
    # strings, so `\w`/`\W` were invalid escape sequences (SyntaxWarning on
    # modern Python). The regex itself is unchanged.
    rules = (
        Rule(
            LinkExtractor(
                allow=('', ),
                deny=(r'/(\w|\W)*([Ii]mpressum|[Aa]bout|[Pp]rivacy|[Tt]erms|[Cc]opyright|[Hh]elp|[Hh]ilfe|[Dd]atenschutz|[KkCc]onta[kc]t|[Rr]echtliche(\w|\W)*[Hh]inweis|[Hh]aftungsausschlu)'),
                unique=True,
            ),
            callback='parse_stuff',
            follow=True,
        ),
    )

    def parse_stuff(self, response):
        """Return items with all non-<script> text nodes of the page.

        NOTE(review): the loop iterates the '//body' selector list, but the
        loader is built from ``response`` and an absolute XPath, so each
        iteration yields an identical item; '//body' matches once, so in
        practice one item is returned.
        """
        hxs = Selector(response)
        sites = hxs.xpath('//body')
        items_main = []
        for site in sites:
            loader = ItemLoader(item=Items_Main(), response=response)
            loader.add_xpath('fragment', '//*[not(self::script)]/text()')
            items_main.append(loader.load_item())
        return items_main
我想我有一个解决方案,但当然有些网站有不同的结构,它在那里不起作用。我需要知道如何删除所有 Javascript、JQuery 以及网站源代码中可能包含的所有非纯文本代码。
我在 MySpider.py 和 items.py 上尝试了此解决方案(Scraping text without javascript code using scrapy)。我不知道为什么 remove_tags_with_content 在这里不起作用,但它确实无效。目前可以运行的文件如下所示:
MySpider.py
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.utils.markup import remove_tags_with_content
from Scrapy_One.items import Items_Main
class MySpider(CrawlSpider):
    """Crawl www.abb.de and extract the visible body text of every page.

    Pages whose URL matches legal/about/privacy-style keywords are skipped
    via the LinkExtractor ``deny`` pattern.
    """
    name = 'spiderName'
    allowed_domains = ['abb.de']
    start_urls = ['http://www.abb.de/']
    # NOTE: the deny pattern must be a raw string — the original used plain
    # strings, so `\w`/`\W` were invalid escape sequences (SyntaxWarning on
    # modern Python). The regex itself is unchanged.
    rules = (
        Rule(
            LinkExtractor(
                allow=('', ),
                deny=(r'/(\w|\W)*([Ii]mpressum|[Aa]bout|[Pp]rivacy|[Tt]erms|[Cc]opyright|[Hh]elp|[Hh]ilfe|[Dd]atenschutz|[Rr]echtliche(\w|\W)*[Hh]inweis|[Hh]aftungsausschlu)'),
                unique=True,
            ),
            callback='parse_stuff',
            follow=True,
        ),
    )

    def parse_stuff(self, response):
        """Return a list of items, each holding all text nodes under <body>.

        NOTE(review): the loop iterates the '//html' selector list, but the
        loader is built from ``response`` and an absolute XPath, so every
        iteration produces an identical item; '//html' matches once, so in
        practice one item is returned.
        """
        hxs = Selector(response)
        sites = hxs.xpath('//html')
        items_main = []
        for site in sites:
            loader = ItemLoader(item=Items_Main(), response=response)
            loader.add_xpath('fragment', '//body//text()')
            items_main.append(loader.load_item())
        return items_main
items.py
from scrapy.item import Item, Field
from scrapy.loader.processors import MapCompose, Join, TakeFirst
#from scrapy.utils.markup import remove_tags_with_content
from w3lib.html import replace_escape_chars, remove_tags
class Items_Main(Item):
    """Single-field item carrying the visible text of one crawled page."""
    # Each extracted chunk is whitespace-stripped, stripped of markup tags,
    # and unescaped; all chunks are then joined into one string on output.
    fragment = Field(
        input_processor=MapCompose(str.strip, remove_tags, replace_escape_chars),
        output_processor=Join(),
    )
我知道这不是我想要的(删除每个 Javascript-,JQuery-,...代码)但这是我必须采用的当前情况。 因此,如果您对如何摆脱它有任何建议,我想尝试一下。
我想我找到了答案(至少对我有用)。
我将以下行从 MySpider.py loader.add_xpath('fragment', '//body//text()')
更改为 loader.add_xpath('fragment', '//*[not(self::script)]/text()')
。
所以现在这个文件的完整代码是:
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
from Scrapy_One.items import Items_Main
class MySpider(CrawlSpider):
    """Crawl www.example.de and extract visible text, excluding <script> content.

    Uses ``//*[not(self::script)]/text()`` so JavaScript bodies embedded in
    the page are not collected. Legal/about/privacy/contact-style URLs are
    skipped via the ``deny`` pattern.
    """
    name = 'spiderName'
    allowed_domains = ['example.de']
    start_urls = ['http://www.example.de/']
    # NOTE: the deny pattern must be a raw string — the original used plain
    # strings, so `\w`/`\W` were invalid escape sequences (SyntaxWarning on
    # modern Python). The regex itself is unchanged.
    rules = (
        Rule(
            LinkExtractor(
                allow=('', ),
                deny=(r'/(\w|\W)*([Ii]mpressum|[Aa]bout|[Pp]rivacy|[Tt]erms|[Cc]opyright|[Hh]elp|[Hh]ilfe|[Dd]atenschutz|[KkCc]onta[kc]t|[Rr]echtliche(\w|\W)*[Hh]inweis|[Hh]aftungsausschlu)'),
                unique=True,
            ),
            callback='parse_stuff',
            follow=True,
        ),
    )

    def parse_stuff(self, response):
        """Return items with all non-<script> text nodes of the page.

        NOTE(review): the loop iterates the '//body' selector list, but the
        loader is built from ``response`` and an absolute XPath, so each
        iteration yields an identical item; '//body' matches once, so in
        practice one item is returned.
        """
        hxs = Selector(response)
        sites = hxs.xpath('//body')
        items_main = []
        for site in sites:
            loader = ItemLoader(item=Items_Main(), response=response)
            loader.add_xpath('fragment', '//*[not(self::script)]/text()')
            items_main.append(loader.load_item())
        return items_main