Scrapy + Selenium:每个网址被请求两次

import scrapy

from selenium import webdriver

class ProductSpider(scrapy.Spider):
    """Spider that paginates eBay search results with a Selenium browser.

    Scrapy downloads each start URL through its own scheduler, then
    ``parse`` loads the same URL again in Selenium so JavaScript-driven
    pagination ("next" links) can be clicked.  This is the duplicated
    request the question complains about.
    """

    name = "product_spider"
    allowed_domains = ['ebay.com']
    start_urls = ['http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40']

    def __init__(self, *args, **kwargs):
        # Forward Scrapy's constructor arguments so crawler wiring
        # (name/kwargs injection) keeps working.
        super().__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def parse(self, response):
        """Drive the browser through every results page of ``response.url``.

        Stops when no "next page" link is found; the browser is always
        shut down, even if clicking raises.
        """
        # Local imports keep this snippet self-contained; Selenium 4
        # removed ``find_element_by_xpath`` in favour of ``By`` locators.
        from selenium.common.exceptions import WebDriverException
        from selenium.webdriver.common.by import By

        self.driver.get(response.url)
        try:
            while True:
                try:
                    # Raises NoSuchElementException (a WebDriverException)
                    # when there is no next-page link — that ends the loop.
                    # In the original code this call sat OUTSIDE the try,
                    # so reaching the last page crashed the spider and
                    # leaked the browser process.
                    next_link = self.driver.find_element(
                        By.XPATH, '//td[@class="pagn-next"]/a')
                    next_link.click()
                    # get the data and write it to scrapy items
                except WebDriverException:
                    break
        finally:
            # quit() terminates the browser AND the driver process;
            # close() would only close the current window.
            self.driver.quit()

此解决方案效果很好,但它会对同一个URL请求两次:一次来自Scrapy调度器,另一次来自Selenium WebDriver。

与不使用Selenium的纯Scrapy请求相比,完成这项工作将花费两倍的时间。如何避免这种情况?

回答:

这是解决此问题的一个有用技巧:创建一个用Selenium渲染页面的Web服务,并在本地运行它:

from flask import Flask, request, make_response

from flask_restful import Resource, Api

from selenium import webdriver

from selenium.webdriver.chrome.options import Options

# Module-level Flask application and its flask_restful API wrapper.
# Both must exist before resources are registered below.
app = Flask(__name__)

api = Api(app)

class Selenium(Resource):
    """REST resource that renders a URL with a shared headless Chrome.

    GET /?url=<percent-encoded-url> loads the page in the browser and
    returns the fully rendered HTML (``page_source``).
    """

    # Lazily created, process-wide WebDriver shared by every request;
    # reusing one browser avoids a costly startup per request.
    _driver = None

    @staticmethod
    def getDriver():
        """Return the singleton Chrome driver, creating it on first use."""
        if not Selenium._driver:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            # Selenium 4 removed the deprecated ``chrome_options``
            # keyword; the supported parameter name is ``options``.
            Selenium._driver = webdriver.Chrome(options=chrome_options)
        return Selenium._driver

    @property
    def driver(self):
        # Convenience accessor so handler code can write ``self.driver``.
        return Selenium.getDriver()

    def get(self):
        """Handle GET: render the ``url`` query argument, return its HTML.

        Raises a 400 via flask_restful if ``url`` is missing from the
        query string (``request.args['url']`` raises on absence).
        """
        url = str(request.args['url'])
        self.driver.get(url)
        return make_response(self.driver.page_source)

# Expose the Selenium renderer at the service root: GET /?url=...
api.add_resource(Selenium, '/')

if __name__ == '__main__':
    # Development server only (debug=True); use a WSGI server in production.
    app.run(debug=True)

现在,访问 http://127.0.0.1:5000/?url=<网址> 将通过Selenium的Chrome/Firefox驱动程序返回渲染后的网页。

现在我们的蜘蛛会是什么样子,

import urllib
import urllib.parse

import scrapy

class ProductSpider(scrapy.Spider):
    """Spider that fetches pages through the local Selenium-rendering service.

    Each target URL is wrapped as ``http://127.0.0.1:5000/?url=<quoted>``
    so the Flask/Selenium proxy returns the JavaScript-rendered HTML and
    each page is downloaded only once.
    """

    name = 'products'
    # '127.0.0.1' must be allowed, otherwise Scrapy's OffsiteMiddleware
    # silently drops every request to the local rendering service.
    allowed_domains = ['ebay.com', '127.0.0.1']

    # Target pages to render through the proxy.
    urls = [
        'http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40',
    ]

    def start_requests(self):
        """Yield one request per target URL, routed via the local renderer."""
        for target in self.urls:
            # Python 3: ``urllib.quote`` no longer exists; the function
            # moved to ``urllib.parse.quote``.
            proxied = 'http://127.0.0.1:5000/?url={}'.format(
                urllib.parse.quote(target))
            yield scrapy.Request(proxied)

    def parse(self, response):
        # Extract the "next page" cell from the rendered HTML.
        yield {
            'field': response.xpath('//td[@class="pagn-next"]/a'),
        }

以上是《Scrapy + Selenium:每个网址被请求两次》的全部内容, 来源链接: utcz.com/qa/423462.html

回到顶部