I'm building a data-scraping program with scrapy-selenium.
On the target site, the link to the next page is

<button class="" onclick="location.href='/used/btT/index3.html?NOTKEI=1';return false;">次のページ</button>

so I coded it as shown below. Page 1 is scraped without any problem, but from page 2 onward, although the spider does seem to navigate and collect something after each transition, the amount of data it picks up shrinks little by little until eventually nothing is retrieved at all. I would appreciate any guidance on where the problem lies.
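Since the next-page URL is embedded in that onclick value, it looks like it could be pulled straight out of the attribute inside a Scrapy callback. A minimal sketch (not wired into the spider; the regex assumes the handler always has the location.href='…' form):

import re

# Sketch: extract the relative next-page URL from the button's onclick attribute
onclick = response.xpath('//button[contains(@onclick, "location.href")]/@onclick').get()
next_url = None
if onclick:
    m = re.search(r"location\.href='([^']+)'", onclick)
    if m:
        next_url = m.group(1)  # e.g. /used/btT/index3.html?NOTKEI=1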
What I want to achieve
Scrape the data from page 2 onward as smoothly as page 1.
Problem / error message
From page 2 onward, the number of items retrieved keeps shrinking until eventually nothing is retrieved at all.
Relevant source code
【exam_z.py】
import scrapy
from scrapy_selenium import SeleniumRequest
from time import sleep
from selenium.webdriver.common.keys import Keys
from scrapy.selector import Selector
import logging
from sensor.items import SensorItem
from scrapy.loader import ItemLoader


class ExamZSpider(scrapy.Spider):
    name = 'exam_z'

    def start_requests(self):
        yield SeleniumRequest(
            url='https://www.example.net/used/search.php?STID=CS210610&NOTKEI=1&BT=T',
            wait_time=3,
            callback=self.parse
        )

    def parse(self, response):
        # driver = response.meta['driver']
        products = response.xpath('//h3[@class="cassetteMain__title"]')
        for elem in products:
            yield response.follow(url=elem.xpath('.//a/@href').get(), callback=self.parse_item)

        # Click the "next page" button
        driver = response.meta['driver']
        next_page = driver.find_element_by_xpath('//*[@id="js-resultBar"]/div[2]/div/div[2]/button[2]')
        next_page.click()
        sleep(3)
        if next_page:
            driver = response.meta['driver']
            yield SeleniumRequest(
                url=driver.current_url,
                wait_time=3,
                callback=self.parse
            )

    def parse_item(self, response):
        logging.info(response.url)
        loader = ItemLoader(item=SensorItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_xpath('title', '//h2[@class="title1"]/span/text()')
        loader.add_xpath('price', '//p[@class="basePrice__price"]/span/text()')
        loader.add_xpath('maker', '//h2[@class="title1"]/text()')
        loader.add_xpath('name', '//h2[@class="title1"]/text()')
        loader.add_xpath('year', '//p[contains(text(),"年式")]/following-sibling::p[1]/text()')
        loader.add_xpath('mileage', '//div[@class="specWrap__box"][2]/p[1]/following-sibling::p/text()')
        loader.add_xpath('inspection', '//div[@class="specWrap__box"][3]/p[1]/following-sibling::p/text()')
        loader.add_xpath('transmission', '//th[contains(text(),"ミッション")]/following-sibling::td/text()')
        loader.add_xpath('company', '//div[@class="sideDetailInfo__contents__inner"]/p/a/text()')
        loader.add_xpath('address', '//div[@class="sideDetailInfo__contents__inner"]/p[2]/text()')
        loader.add_xpath('tel', '//div[@class="ppc"]//p[@class="ppc__phoneNum__item"]/text()[1]')
        yield loader.load_item()
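As far as I can tell, parse() clicks the "next page" button on the single shared Selenium driver and then re-requests driver.current_url, so I wonder whether the driver state and the queued Scrapy responses drift apart as more requests pile up. For comparison, here is a rough sketch of a parse() that pages by following the URL from the onclick attribute (same regex idea as above) instead of clicking; this is untested and only an alternative I'm considering:

import re
from scrapy_selenium import SeleniumRequest

def parse(self, response):
    # Detail pages, same as before
    for elem in response.xpath('//h3[@class="cassetteMain__title"]'):
        yield response.follow(url=elem.xpath('.//a/@href').get(), callback=self.parse_item)

    # Build the next-page request from the onclick attribute instead of clicking
    onclick = response.xpath('//*[@id="js-resultBar"]/div[2]/div/div[2]/button[2]/@onclick').get()
    if onclick:
        m = re.search(r"location\.href='([^']+)'", onclick)
        if m:
            yield SeleniumRequest(
                url=response.urljoin(m.group(1)),
                wait_time=3,
                callback=self.parse,
            )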
【settings.py】
# Scrapy settings for example project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'example'

SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'example (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'ja',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'sensor.middlewares.SensorSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'example.pipelines.ExamplePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

from shutil import which

SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')
SELENIUM_DRIVER_ARGUMENTS = ['-headless']  # '--headless' if using chrome instead of firefox

FEED_EXPORT_ENCODING = 'utf-8'
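One thing I noticed while re-reading settings.py is that HTTPCACHE_ENABLED = True. I'm not sure whether the cache even applies to SeleniumRequests, but to rule it out while debugging the pagination problem I can turn it off, either in settings.py as below or per run with scrapy crawl exam_z -s HTTPCACHE_ENABLED=False:

# Disable the HTTP cache while debugging the pagination problem
HTTPCACHE_ENABLED = False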
【items.py】
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
from itemloaders.processors import TakeFirst, MapCompose, Join


def convert_integer(element):
    if element:
        return int(element)
    return 0


def strip_period(element):
    if element:
        return element.replace('.', '')
    return element


def get_maker(element):
    if element:
        return element[:element.find(' ')]
    return element


def get_name(element):
    if element:
        return element[element.find(' ') + len(' '):].replace(' ', '')
    return element


def get_mileage(element):
    if element:
        return element.replace('万', '000').replace('km', '')
    return element


def get_address(element):
    if element:
        return element.replace('住所:', '')
    return element


def get_tel(element):
    if element:
        return element.replace('-', '')
    return element


class SensorItem(scrapy.Item):
    url = scrapy.Field(
        output_processor=TakeFirst()
    )
    title = scrapy.Field(
        output_processor=TakeFirst()
    )
    price = scrapy.Field(
        input_processor=MapCompose(strip_period),
        output_processor=Join('')
        # input_processor=MapCompose(convert_join),
        # output_processor=TakeFirst()
    )
    maker = scrapy.Field(
        input_processor=MapCompose(get_maker),
        output_processor=TakeFirst()
    )
    name = scrapy.Field(
        input_processor=MapCompose(get_name),
        output_processor=TakeFirst()
    )
    year = scrapy.Field(
        input_processor=MapCompose(convert_integer),
        output_processor=TakeFirst()
    )
    mileage = scrapy.Field(
        input_processor=MapCompose(strip_period, get_mileage),
        output_processor=Join('')
        # output_processor=TakeFirst()
    )
    inspection = scrapy.Field(
        output_processor=TakeFirst()
    )
    transmission = scrapy.Field(
        output_processor=TakeFirst()
    )
    company = scrapy.Field(
        output_processor=TakeFirst()
    )
    address = scrapy.Field(
        input_processor=MapCompose(get_address),
        output_processor=TakeFirst()
    )
    tel = scrapy.Field(
        input_processor=MapCompose(get_tel, convert_integer),
        output_processor=TakeFirst()
    )
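Unrelated to the pagination problem, but as a sanity check on the item processors, the mileage chain can be exercised on its own; '1.5万km' here is just an assumed example of the site's mileage text:

from itemloaders.processors import MapCompose, Join

# Same helpers as in items.py, repeated so this snippet runs standalone
def strip_period(element):
    return element.replace('.', '') if element else element

def get_mileage(element):
    return element.replace('万', '000').replace('km', '') if element else element

values = MapCompose(strip_period, get_mileage)(['1.5万km'])
print(values)            # ['15000']
print(Join('')(values))  # '15000'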
What I tried
I rewrote the next-page code as below, but the spider did not navigate to page 2 or beyond.
driver = response.meta['driver']
next_page = driver.find_element_by_xpath('//*[@id="js-resultBar"]/div[2]/div/div[2]/button[2]')
next_page.click()
if next_page:
    driver = response.meta['driver']
    yield response.follow(url=driver.current_url, callback=self.parse)
    # yield response.follow(url=next_page, callback=self.parse)
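I'm also thinking about replacing the fixed sleep with an explicit wait, so the spider only continues once the result list has actually re-rendered after the click. A rough drop-in for the next-page block inside parse() (untested; it assumes the old button goes stale and the cassetteMain__title elements reappear when the next page loads):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = response.meta['driver']
next_page = driver.find_element_by_xpath('//*[@id="js-resultBar"]/div[2]/div/div[2]/button[2]')
next_page.click()

# Wait for the old page to be torn down, then for the result cassettes to be present again
WebDriverWait(driver, 10).until(EC.staleness_of(next_page))
WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'cassetteMain__title'))
)

yield SeleniumRequest(url=driver.current_url, wait_time=3, callback=self.parse)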
Python 3.9.12
scrapy 2.4.1
scrapy-selenium 0.0.7
