Scrapy + Selenium 对每个网址请求两次
import scrapy
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By


class ProductSpider(scrapy.Spider):
    """Walk eBay search-result pages by clicking "next" in a Firefox browser.

    Scrapy downloads each start URL to produce ``response``; ``parse`` then
    loads the same URL again in the Selenium-driven browser, so every start
    URL is fetched twice (once by Scrapy, once by Selenium).
    """

    name = "product_spider"
    allowed_domains = ['ebay.com']
    start_urls = ['http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40']

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Firefox WebDriver session."""
        # Forward arguments so the base Spider initialiser still runs.
        super(ProductSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def parse(self, response):
        """Open ``response.url`` in the browser and click through the pages.

        The loop ends when the "next" link can no longer be located or
        clicked. The browser is shut down afterwards, even if an unexpected
        error escapes the loop.
        """
        try:
            self.driver.get(response.url)
            while True:
                try:
                    # Looking the link up inside the ``try`` lets a missing
                    # link (last page) end the loop instead of raising.
                    next_link = self.driver.find_element(
                        By.XPATH, '//td[@class="pagn-next"]/a'
                    )
                    next_link.click()
                    # get the data and write it to scrapy items
                except WebDriverException:
                    break
        finally:
            # quit() ends the whole WebDriver session, not just the window.
            self.driver.quit()
此解决方案效果很好,但它会对同一个 URL 请求两次:一次由 Scrapy 调度器发出,另一次由 Selenium WebDriver 发出。
与不使用 Selenium 的普通 Scrapy 请求相比,完成这项工作要花费两倍的时间。如何避免这种情况?
回答:
这里有一个可以解决该问题的实用技巧:
为 Selenium 创建一个 Web 服务,并在本地运行它:
from flask import Flask, request, make_response
from flask_restful import Resource, Api
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Flask application and the flask_restful API object that resources are
# registered on.
app = Flask(__name__)
api = Api(app)
class Selenium(Resource):
    """REST resource that loads a URL in headless Chrome and returns its HTML."""

    # Single WebDriver shared by every request; created lazily on first use.
    _driver = None

    @staticmethod
    def getDriver():
        """Return the shared headless Chrome driver, creating it if needed."""
        if Selenium._driver is None:
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
            Selenium._driver = webdriver.Chrome(options=chrome_options)
        return Selenium._driver

    @property
    def driver(self):
        """The shared WebDriver instance."""
        return Selenium.getDriver()

    def get(self):
        """Handle GET: load the ``url`` query parameter and return the page source."""
        url = str(request.args['url'])
        self.driver.get(url)
        # page_source is the browser's current view of the document.
        return make_response(self.driver.page_source)
# Serve the Selenium resource at the root path.
api.add_resource(Selenium, '/')

if __name__ == '__main__':
    # Start Flask's built-in server with debug mode enabled.
    app.run(debug=True)
现在,这个 Web 服务会使用 Selenium 的 Chrome / Firefox 驱动程序返回渲染后的网页。
接下来,我们的爬虫会是这样:
try:  # Python 3
    from urllib.parse import quote
except ImportError:  # Python 2
    from urllib import quote

import scrapy


class ProductSpider(scrapy.Spider):
    """Fetch pages through the local Selenium web service instead of directly.

    Each target URL is percent-encoded and passed as the ``url`` query
    parameter of the service at ``http://127.0.0.1:5000/``.
    """

    name = 'products'
    allowed_domains = ['ebay.com']
    urls = [
        'http://www.ebay.com/sch/i.html?_odkw=books&_osacat=0&_trksid=p2045573.m570.l1313.TR0.TRC0.Xpython&_nkw=python&_sacat=0&_from=R40',
    ]

    def start_requests(self):
        """Yield one request to the local service per entry in ``urls``."""
        for url in self.urls:
            # Percent-encode the target so its own ``?``/``&`` characters are
            # not parsed as parameters of the service URL.
            service_url = 'http://127.0.0.1:5000/?url={}'.format(quote(url))
            yield scrapy.Request(service_url)

    def parse(self, response):
        """Yield an item holding the selection of the "next page" link."""
        # NOTE(review): this yields the selector result itself, not extracted
        # text; confirm whether downstream exporters expect extracted values.
        yield {
            'field': response.xpath('//td[@class="pagn-next"]/a'),
        }
以上是 每个网址两次Scrapy + Selenium请求两次 的全部内容, 来源链接: utcz.com/qa/423462.html