Need help crawling multiple web pages with Scrapy in Python, following the next-page link from each page in a loop
I am currently scraping several sites at once, and I need to follow the next-page link found on each scraped page, so that the crawl keeps moving from every page to its next page. Note that the second page of each site has the same div structure as the first.
Spider.py
import scrapy

from ..items import MynewsItem  # adjust the import path to your project layout

class UstodaySpider(scrapy.Spider):
    name = 'usatoday'
    start_urls = [
        'https://en.wikipedia.org/wiki/India',
        'https://en.wikipedia.org/wiki/USA',
    ]

    def parse(self, response):
        items = MynewsItem()
        print("**********************************")
        print(type(response))
        print(response.url)
        all_section = response.css('a.gnt_m_flm_a')
        for quote in all_section:
            # quote is already the <a> element, so query it directly
            news_provider_id = '14'
            news_title = quote.xpath('text()').extract()
            news_details = quote.xpath('@data-c-br').extract()
            news_image = quote.css('img.gnt_m_flm_i').xpath('@data-gl-srcset').extract()
            news_page_url = quote.xpath('@href').extract()
            items['news_provider_id'] = news_provider_id
            items['news_title'] = news_title
            items['news_details'] = news_details
            items['news_image'] = news_image
            items['news_page_url'] = news_page_url
            yield items
            # This only builds a string; no request is ever scheduled,
            # so the crawl never actually reaches the next page.
            next_page = 'https://en.wikipedia.org/wiki/India' + str(news_page_url)
            print(next_page)
Pipeline.py
import mysql.connector

class MynewsPipeline(object):

    def __init__(self):
        self.create_connection()
        self.create_table()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            password='',
            database='mydb',
            port='3306'
        )
        self.curr = self.conn.cursor()

    def create_table(self):
        self.curr.execute("""DROP TABLE IF EXISTS news_crawl_newsdetails""")
        self.curr.execute("""CREATE TABLE news_crawl_newsdetails(
            news_provider_id text,
            news_title text,
            news_details text,
            news_image text,
            news_page_url text
        )""")

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        self.curr.execute("""INSERT INTO news_crawl_newsdetails
            (news_provider_id, news_title, news_details, news_image, news_page_url)
            VALUES (%s, %s, %s, %s, %s)""", (
            item['news_provider_id'],
            item['news_title'][0],
            item['news_details'][0],
            item['news_image'][0],
            item['news_page_url'][0]
        ))
        self.conn.commit()
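For the pipeline to run at all, it must be enabled in settings.py. A minimal sketch, assuming the project package is named mynews (the dotted path is an assumption; use your project's real path):

# settings.py
# 'mynews.pipelines.MynewsPipeline' is a guessed path based on the
# class name above; adjust it to match your actual project layout.
ITEM_PIPELINES = {
    'mynews.pipelines.MynewsPipeline': 300,
}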
Items.py
import scrapy
class MynewsItem(scrapy.Item):
    news_provider_id = scrapy.Field()
    news_title = scrapy.Field()
    news_details = scrapy.Field()
    news_image = scrapy.Field()
    news_page_url = scrapy.Field()
    news_des = scrapy.Field()
You can try this approach:
You should find the XPath of the next_page element. It can be a link or a button that points to the next page:
next_page = response.xpath('--xpath expression--').extract_first()
if next_page is not None:
    next_page_link = response.urljoin(next_page)
    yield scrapy.Request(url=next_page_link, callback=self.parse)
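As a concrete illustration, many sites mark their pagination anchor with rel="next". The selector below is an assumption about the page markup, not something taken from the actual site:

# Hypothetical example: assumes the page exposes a rel="next" link;
# replace the XPath with whatever the real markup uses.
next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
if next_page is not None:
    yield scrapy.Request(response.urljoin(next_page), callback=self.parse)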
This is roughly what your parse function would look like:
def parse(self, response):
    items = MynewsItem()
    print("**********************************")
    print(type(response))
    print(response.url)
    all_section = response.css('a.gnt_m_flm_a')
    for quote in all_section:
        news_provider_id = '14'
        news_title = quote.xpath('text()').extract()
        news_details = quote.xpath('@data-c-br').extract()
        news_image = quote.css('img.gnt_m_flm_i').xpath('@data-gl-srcset').extract()
        news_page_url = quote.xpath('@href').extract()
        items['news_provider_id'] = news_provider_id
        items['news_title'] = news_title
        items['news_details'] = news_details
        items['news_image'] = news_image
        items['news_page_url'] = news_page_url
        yield items
    # Fill in the XPath that matches the site's next-page link.
    next_page = response.xpath('').extract_first()
    if next_page is not None:
        next_page_link = response.urljoin(next_page)
        yield scrapy.Request(url=next_page_link, callback=self.parse)
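Two caveats worth noting: creating MynewsItem once outside the loop means every yield references the same mutable item, so it is safer to instantiate it per link; and Scrapy's response.follow resolves relative URLs for you. A final sketch under those points, where the rel="next" XPath is again a placeholder assumption:

def parse(self, response):
    for quote in response.css('a.gnt_m_flm_a'):
        # Create a fresh item per link so earlier yields are not overwritten.
        items = MynewsItem()
        items['news_provider_id'] = '14'
        items['news_title'] = quote.xpath('text()').extract()
        items['news_details'] = quote.xpath('@data-c-br').extract()
        items['news_image'] = quote.css('img.gnt_m_flm_i').xpath('@data-gl-srcset').extract()
        items['news_page_url'] = quote.xpath('@href').extract()
        yield items
    # Placeholder selector: replace with the site's real next-page XPath.
    next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
    if next_page is not None:
        # response.follow resolves relative URLs against response.url.
        yield response.follow(next_page, callback=self.parse)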