请问如何使爬虫运行得更快?
爬虫小白,参考网上的代码改写后用来爬取自己想要的网页,但是速度很慢,请教如何才能让爬取速度变快?
# Standard library.
import json
import logging
import re
import time
import urllib.error
import urllib.request
from urllib import parse

# Third-party packages.
import aiohttp
import xlwt
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Scrapy is installed on demand. The guard has to run before any other
# scrapy import, otherwise a missing package fails before the guard is reached.
try:
    import scrapy
except ImportError:
    # `!pip install ...` is IPython-only syntax and a SyntaxError in plain
    # Python. Running pip through the current interpreter works in both
    # notebooks and scripts and installs into the active environment.
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "scrapy"])
    import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
from scrapy.utils.project import get_project_settings
class JsonWriterPipeline:
    """Item pipeline that appends every scraped item to ``test.jl``.

    The output is in JSON Lines form: one JSON object per line. The file is
    opened when the spider starts and closed when it finishes.
    """

    def open_spider(self, spider):
        """Open (and truncate) the output file ``test.jl`` for writing."""
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        self.file = open('test.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file opened by ``open_spider``."""
        self.file.close()

    def process_item(self, item, spider):
        """Serialize ``item`` as one JSON line and return it unchanged.

        Returning the item lets any later pipeline stage keep processing it.
        """
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
class MySpider(scrapy.Spider):
    """Spider that walks the haodf.com 2020 sitemap and yields page names.

    Crawl flow:
      1. ``parse``: on the 2020 sitemap page, follow the links whose text
         contains "2020-01".
      2. ``parse_info``: on each of those listing pages, follow every item
         link and the "next page" link (paginating through the listing).
      3. ``parse_info_detail``: on each item page, yield the text of the
         ``h1.doctor-name`` element as ``{'name': ...}``.
    """

    name = "haodaifu"
    allowed_domains = ['haodf.com']
    # NOTE: this is the main throttle on crawl speed. Scrapy waits roughly
    # this many seconds between consecutive requests to the same site, so
    # lowering it (or enabling AutoThrottle / raising the concurrency
    # settings) makes the crawl faster, at a higher risk of being blocked.
    download_delay = 2
    start_urls = ['https://www.haodf.com/sitemap-zx/2020/']
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},  # Used for pipeline 1
        'FEED_FORMAT': 'json',  # Used for pipeline 2
        'FEED_URI': 'testzixun.json',  # Used for pipeline 2
    }

    def parse(self, response):
        """Follow the per-date links found on the 2020 sitemap page.

        Only links whose text contains "2020-01" are followed; each one is
        handed to ``parse_info``.
        """
        date_links = response.xpath(
            '//div[4]/div[2]/li/a[contains(text(), "2020-01")]/@href'
        )
        for link in date_links:
            # Collect every date URL from the 2020 page.
            url = response.urljoin(link.extract())
            yield scrapy.Request(url, callback=self.parse_info)

    def parse_info(self, response):
        """Follow every item link on a listing page, then the next page.

        Item links go to ``parse_info_detail``; the pagination link, when
        present, is fed back into this method.
        """
        for link in response.xpath('//div[3]/div[1]/li/a/@href'):
            # Collect each page URL listed under this date.
            url = response.urljoin(link.extract())
            yield scrapy.Request(url, callback=self.parse_info_detail)

        next_page = response.xpath(
            "//div[3]/div[2]/div/div/a[contains(text(), '下一页')]/@href"
        )
        # An empty selector list is falsy, so the last page stops here.
        if next_page:
            # Link to the next page of the listing.
            u = response.urljoin(next_page.extract_first())
            print(u)
            yield scrapy.Request(u, callback=self.parse_info)

    def parse_info_detail(self, response):
        """Yield ``{'name': ...}`` taken from the page's ``h1.doctor-name``.

        ``extract_first`` yields ``None`` when the element is missing.
        """
        yield {
            'name': response.xpath(
                '//h1[@class="doctor-name"]/text()'
            ).extract_first()
        }
# Run the spider in-process with a custom User-Agent header.
process = CrawlerProcess(
    settings={
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    }
)
# Schedule the spider, then start the crawl; per the Scrapy docs, start()
# blocks until crawling has finished.
process.crawl(MySpider)
process.start()
以上是 请问如何使爬虫进行的更快? 的全部内容, 来源链接: utcz.com/p/937861.html