How to parse embedded links through Python Scrapy spider
I'm trying to use Python's Scrapy to extract course catalog information from a website. The problem is that each course has a link to its own full page, and I need to iterate through those pages one by one to extract their information, which then needs to be fed into an SQL database. However, I don't know how to change the spider's URL successively. Attached below is the code I have so far.
import scrapy


def find_between(s, first, last):
    try:
        start = s.index(first) + len(first)
        end = s.index(last, start)
        return s[start:end]
    except ValueError:
        return ""


class QuoteSpider(scrapy.Spider):
    name = 'courses'
    start_urls = [
        'http://catalog.aucegypt.edu/content.php?catoid=36&navoid=1738',
    ]

    def parse(self, response):
        # pages in span+ a
        all_courses = response.css('.width a')
        for course in all_courses:
            courseURL = course.xpath('@href').extract()
            cleanCourseURL = find_between(str(courseURL), "['", "']")
            fullURL = "http://catalog.aucegypt.edu/" + cleanCourseURL
            # iterate through urls
            QuoteSpider.start_urls += fullURL
            courseName = response.css('.block_content')
            yield {
                'courseNum': fullURL,
                'test': courseName
            }
Usually you need to yield this new URL and handle it with the corresponding callback:
def parse(self, response):
    # pages in span+ a
    all_courses = response.css('.width a')
    for course in all_courses:
        courseURL = course.xpath('@href').extract()
        cleanCourseURL = find_between(str(courseURL), "['", "']")
        fullURL = "http://catalog.aucegypt.edu/" + cleanCourseURL
        courseName = response.css('.block_content')
        yield scrapy.Request(
            url=fullURL,
            callback=self.parse_course,
            cb_kwargs={
                'course_name': courseName,
            },
        )

def parse_course(self, response, course_name):
    # parse your course here...
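
To round this out, here is a minimal sketch of what parse_course could look like. The .block_content selector and the field names are assumptions, since the markup of an individual course page isn't shown; adjust them to the real HTML:

def parse_course(self, response, course_name):
    # Hypothetical selector -- replace with whatever the course page actually uses.
    text = response.css('.block_content ::text').getall()
    yield {
        'course_name': course_name,
        'course_url': response.url,
        'description': ' '.join(t.strip() for t in text if t.strip()),
    }

Two side notes: cb_kwargs requires Scrapy 1.7 or later (on older versions, pass data through the request's meta dict instead), and yield response.follow(course, callback=self.parse_course, ...) inside the loop would let Scrapy resolve the relative href for you, making the find_between string surgery unnecessary.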
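
Since the question also mentions feeding the results into an SQL database, the idiomatic Scrapy way is an item pipeline rather than database code inside the spider. Below is a rough sketch using the standard-library sqlite3 module; the table schema, database filename, and the CoursePipeline class name are all made up for illustration:

import sqlite3

class CoursePipeline:
    def open_spider(self, spider):
        # One connection per crawl; courses.db is an arbitrary example filename.
        self.conn = sqlite3.connect('courses.db')
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS courses (name TEXT, url TEXT, description TEXT)'
        )

    def close_spider(self, spider):
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        # Insert every item the spider yields; return it so later pipelines still see it.
        self.conn.execute(
            'INSERT INTO courses VALUES (?, ?, ?)',
            (item.get('course_name'), item.get('course_url'), item.get('description')),
        )
        return item

To activate it, register the class in the project's settings.py, e.g. ITEM_PIPELINES = {'myproject.pipelines.CoursePipeline': 300} (the module path depends on your project layout).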