从 Spider 转移到 CrawlSpider
Move from Spider to CrawlSpider
我试图从一般的蜘蛛转移到 CrawlSpider 来利用规则。然而,
我的爬虫不再那样工作了。你看到我哪里做错了吗?
之前:
class GitHubSpider(scrapy.Spider):
    """Crawl GitHub user-search results and yield each engineer's username.

    Original (working) plain-Spider version: parse() manually follows
    profile links and pagination links.
    """

    name = "github"
    start_urls = [
        "https://github.com/search?p=1&q=React+Django&type=Users",
    ]

    def parse(self, response):
        """Follow each profile link to parse_engineer, then recurse on pagination."""
        engineer_links = response.css("a.mr-1::attr(href)")
        yield from response.follow_all(engineer_links, self.parse_engineer)
        pagination_links = response.css(".next_page::attr(href)")
        yield from response.follow_all(pagination_links, self.parse)

    def parse_engineer(self, response):
        """Extract the username from a single profile page."""
        yield {
            "username": response.css(".vcard-username::text").get().strip(),
        }
新(不工作):
class GitHubSpider(CrawlSpider):
    """CrawlSpider version: rules replace the manual parse() plumbing.

    Fix for the "not working" version: LinkExtractor's restrict_css must
    select *elements* — the extractor pulls href attributes from the matched
    elements itself — so the ``::attr(href)`` pseudo-elements are invalid
    there and have been removed. Also note ``("...")`` is just a string,
    not a tuple; a plain string is passed instead.
    """

    name = "github"
    start_urls = [
        "https://github.com/search?p=1&q=React+Django&type=Users",
    ]
    rules = (
        # Profile links: select the <a> elements, not their href attribute.
        Rule(
            LinkExtractor(restrict_css="a.mr-1"),
            callback="parse_engineer",
        ),
        # Pagination links: no callback, so they are followed recursively.
        Rule(LinkExtractor(restrict_css=".next_page")),
    )

    def parse_engineer(self, response):
        """Extract the username from a single profile page."""
        yield {
            "username": response.css(".vcard-username::text").get().strip(),
        }
现在,它正在运行:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class GitHubSpider(CrawlSpider):
    """Working CrawlSpider: restrict_css selects elements only."""

    name = "github"
    # Domains must be strings — a bare github.com would raise NameError
    # when the class body is evaluated.
    allowed_domains = ["github.com"]
    start_urls = [
        "https://github.com/search?p=1&q=React+Django&type=Users",
    ]
    rules = (
        Rule(LinkExtractor(restrict_css="a.mr-1"), callback="parse_engineer"),
        Rule(LinkExtractor(restrict_css=".next_page")),
    )

    def parse_engineer(self, response):
        """Extract the username from a single profile page."""
        yield {
            "username": response.css(".vcard-username::text").get().strip(),
        }
我试图从一般的蜘蛛转移到 CrawlSpider 来利用规则。然而, 我的爬虫不再那样工作了。你看到我哪里做错了吗?
之前:
class GitHubSpider(scrapy.Spider):
    """Crawl GitHub user-search results and yield each engineer's username.

    Original (working) plain-Spider version: parse() manually follows
    profile links and pagination links.
    """

    name = "github"
    start_urls = [
        "https://github.com/search?p=1&q=React+Django&type=Users",
    ]

    def parse(self, response):
        """Follow each profile link to parse_engineer, then recurse on pagination."""
        engineer_links = response.css("a.mr-1::attr(href)")
        yield from response.follow_all(engineer_links, self.parse_engineer)
        pagination_links = response.css(".next_page::attr(href)")
        yield from response.follow_all(pagination_links, self.parse)

    def parse_engineer(self, response):
        """Extract the username from a single profile page."""
        yield {
            "username": response.css(".vcard-username::text").get().strip(),
        }
新(不工作):
class GitHubSpider(CrawlSpider):
    """CrawlSpider version: rules replace the manual parse() plumbing.

    Fix for the "not working" version: LinkExtractor's restrict_css must
    select *elements* — the extractor pulls href attributes from the matched
    elements itself — so the ``::attr(href)`` pseudo-elements are invalid
    there and have been removed. Also note ``("...")`` is just a string,
    not a tuple; a plain string is passed instead.
    """

    name = "github"
    start_urls = [
        "https://github.com/search?p=1&q=React+Django&type=Users",
    ]
    rules = (
        # Profile links: select the <a> elements, not their href attribute.
        Rule(
            LinkExtractor(restrict_css="a.mr-1"),
            callback="parse_engineer",
        ),
        # Pagination links: no callback, so they are followed recursively.
        Rule(LinkExtractor(restrict_css=".next_page")),
    )

    def parse_engineer(self, response):
        """Extract the username from a single profile page."""
        yield {
            "username": response.css(".vcard-username::text").get().strip(),
        }
现在,它正在运行:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class GitHubSpider(CrawlSpider):
    """Working CrawlSpider: restrict_css selects elements only."""

    name = "github"
    # Domains must be strings — a bare github.com would raise NameError
    # when the class body is evaluated.
    allowed_domains = ["github.com"]
    start_urls = [
        "https://github.com/search?p=1&q=React+Django&type=Users",
    ]
    rules = (
        Rule(LinkExtractor(restrict_css="a.mr-1"), callback="parse_engineer"),
        Rule(LinkExtractor(restrict_css=".next_page")),
    )

    def parse_engineer(self, response):
        """Extract the username from a single profile page."""
        yield {
            "username": response.css(".vcard-username::text").get().strip(),
        }