How do I get all the data when tags are deeply nested?
I am trying to get the movie data for the first day (today is 8/25). From the website I found that the href is
https://www.vscinemas.com.tw/vsweb/theater/detail.aspx?id=16#movieTime-1508017172
I want to get all the movie data (name, gradle, time, type, etc.), for example:
FAST AND FURIOUS 9
childview
數位
['11:30', '12:45', '14:25', '15:30', '17:15', '18:15', '20:00', '21:00', '22:45', '23:45']
1
IMAX
['12:00', '16:45', '21:30']
2
['GC 數位']
['13:00', '16:10', '18:00', '20:50', '23:35']
However, the site's markup has a quirk: if a movie has multiple screening types, such as IMAX or 4DX, another <div class="movieDay" /> is nested inside, and the rest of the movie data ends up under that <div />; and of course, if there are still more types, it nests yet again!
By the time I wrote inThreeNode I realized this approach was wrong, because I would have to keep checking and descending into nested nodes over and over. There must be some way to express this rule and write cleaner code, but I don't know how to implement it.
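Based on that description, the markup looks roughly like this (a hypothetical outline inferred from the XPaths below, not the site's exact HTML):

article#movieTime-1508017172
    h2                (movie name)
    h1 > span         (gradle, e.g. childview)
    div.movieDay      (h4 = type, ul > li > a = times)
        div.movieDay      (second type such as IMAX, nested because the first div is never closed)
            div.movieDay      (a third type nests again, and the next movie's data ends up inside as well)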
Here is my spider; any help would be greatly appreciated.
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import sys
import time


class TesttwoSpider(scrapy.Spider):
    name = 'testtwo'
    allowed_domains = ['www.vscinemas.com.tw/vsweb/']
    start_urls = ['https://www.vscinemas.com.tw/vsweb/theater/detail.aspx?id=16']

    # print(sys.path)
    # use selenium
    # options = Options()
    # options.add_argument("--disable-notifications")
    # chrome = webdriver.Chrome('/Users/motogod19/chromedriver', chrome_options=options)
    # chrome.get("https://www.facebook.com/")

    def parse(self, response):
        print('************ Start parse ************')
        # childview is one of the movie types
        # teenager is one of the movie types
        firstHtmlNodes = response.xpath('//div[@class="theaterTime"]/article[@id="movieTime-1508017172"]')
        for firstHtmlNode in firstHtmlNodes:
            cnName = firstHtmlNode.xpath('./h2/text()').extract()
            print('Check length')
            print(len(cnName))
            getFistFloorEnNameLength = len(cnName)
            for index in range(len(cnName)):
                movieName = firstHtmlNode.xpath('./h2[{}]/text()'.format(index+1)).extract_first()
                movieGradle = firstHtmlNode.xpath('./h1[{}]/span/@class'.format(index+1)).extract_first()
                movieType = firstHtmlNode.xpath('./div[{}]/h4/text()'.format(index+1)).extract_first()
                movieTime = firstHtmlNode.xpath('./div[{}]/ul/li/a/text()'.format(index+1)).extract()
                print(movieName)
                print(movieGradle)
                print(movieType)
                print(movieTime)
                # last one: check whether <div class="movieDay" /> exists or not
                if getFistFloorEnNameLength == index + 1:
                    nextNode = firstHtmlNode.xpath('./div[{}]/div[1]'.format(index+1))
                    if len(nextNode) != 0:
                        movieType = nextNode.xpath('./h4/text()').extract_first()
                        movieTime = nextNode.xpath('./ul/li/a/text()').extract()
                        print('1')
                        print(movieType)
                        print(movieTime)
                        inTwoNode = nextNode.xpath('./div[1]')
                        if len(inTwoNode) != 0:
                            movieType = inTwoNode.xpath('./h4/text()').extract()
                            movieTime = inTwoNode.xpath('./ul/li/a/text()').extract()
                            print('2')
                            print(movieType)
                            print(movieTime)
                            # I stopped here, because I would have to check for nesting over and over again
                            inThreeNode = inTwoNode.xpath('./h2/text()')
                    else:
                        movieName = firstHtmlNode.xpath('./div[{}]/h2/text()'.format(index+1))
                        print('3')
                        print(movieName)
The HTML code on this page is broken (the <div class="movieDay"> tags are not closed properly). You can try editing the HTML manually and closing the tags shown above (I use a regular expression):
import re
import scrapy
from scrapy.http import HtmlResponse


class TesttwoSpider(scrapy.Spider):
    name = '68916451'
    allowed_domains = ['www.vscinemas.com.tw/vsweb/']
    start_urls = ['https://www.vscinemas.com.tw/vsweb/theater/detail.aspx?id=16']

    def parse(self, response):
        content = response.text
        # Close every <div class="movieDay"> right after its </ul>; the
        # back-reference \1 re-emits the tag that followed, so the next
        # <div> or </div> is not swallowed by the substitution.
        content = re.sub(r'</ul>\s*(</?div)', r'</ul>\n</div>\n\1', content)
        # with open('Samples/68916451.html', 'w', encoding='utf-8') as f:
        #     f.write(content)
        response = HtmlResponse(url=response.url, body=content, encoding='utf-8')
        for movie_day in response.xpath('//article[last()]//div[@class="movieDay"]'):
            movie_title = movie_day.xpath('./preceding-sibling::h2[1]/text()').get()
            movie_type = movie_day.xpath('./h4/text()').get()
            movie_time = movie_day.xpath('./ul/li/a/text()').getall()
            yield {
                'Title': movie_title,
                'Type': movie_type,
                'Time': movie_time
            }
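To sanity-check the repair, here is a minimal, self-contained sketch that applies the same regex outside of Scrapy. The HTML fragment is invented to imitate the broken markup described in the question; it is not the site's actual page:

import re
from scrapy.http import HtmlResponse

# Hypothetical fragment: the inner movieDay div is never closed, so the
# IMAX block nests inside the Digital block, as the question describes.
broken = '''<article>
<h2>MOVIE A</h2>
<div class="movieDay"><h4>Digital</h4><ul><li><a>11:30</a></li></ul>
<div class="movieDay"><h4>IMAX</h4><ul><li><a>12:00</a></li></ul>
</div>
<h2>MOVIE B</h2>
<div class="movieDay"><h4>GC Digital</h4><ul><li><a>13:00</a></li></ul>
</div>
</article>'''

# Same repair as in the spider; lxml ignores any leftover stray </div>.
fixed = re.sub(r'</ul>\s*(</?div)', r'</ul>\n</div>\n\1', broken)

response = HtmlResponse(url='http://example.com/', body=fixed, encoding='utf-8')
for movie_day in response.xpath('//div[@class="movieDay"]'):
    print(movie_day.xpath('./preceding-sibling::h2[1]/text()').get(),
          movie_day.xpath('./h4/text()').get(),
          movie_day.xpath('./ul/li/a/text()').getall())
# MOVIE A Digital ['11:30']
# MOVIE A IMAX ['12:00']
# MOVIE B GC Digital ['13:00']

Because every repaired div becomes a sibling of its movie's <h2>, preceding-sibling::h2[1] always resolves to the nearest movie title above it, no matter how many screening types a movie has.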