请问如何使爬虫运行得更快?

请问如何使爬虫运行得更快?

我是爬虫新手,参照网上的代码改写成了爬取自己所需网页的爬虫,但运行速度很慢,请教如何提高爬取速度?

import aiohttp

import re

import urllib.request, urllib.error

import xlwt

from selenium import webdriver

import time

import scrapy

from scrapy.http import Request

from urllib import parse

from selenium.webdriver.chrome.options import Options

try:
    import scrapy
except ImportError:
    # Scrapy is missing: install it with the interpreter that is running this
    # code, so the package lands in the active environment. This works both
    # in a notebook and in a plain script (unlike the IPython-only `!pip`).
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "scrapy"])
    import scrapy

from scrapy.crawler import CrawlerProcess

from scrapy.utils.project import get_project_settings

import logging

import json

class JsonWriterPipeline(object):
    """Item pipeline that appends every scraped item to ``test.jl``.

    The output is in JSON Lines format: one JSON object per line.
    """

    def open_spider(self, spider):
        """Open the output file when the spider starts."""
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding (the scraped text is Chinese).
        self.file = open('test.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as one JSON line and return it unchanged."""
        # ensure_ascii=False keeps non-ASCII text human-readable instead of
        # emitting \uXXXX escapes; the line is still valid JSON.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

class MySpider(scrapy.Spider):
    """Crawl the haodf.com 2020 consultation sitemap and yield doctor names.

    Crawl flow: year index page -> per-date listing pages (following
    pagination) -> individual detail pages.
    """

    name = "haodaifu"
    allowed_domains = ['haodf.com']

    # Delay (in seconds) Scrapy waits between consecutive requests to the
    # same site. This is the main throughput limiter of the crawl: lowering
    # it (and tuning CONCURRENT_REQUESTS / AutoThrottle in the settings)
    # speeds the crawl up, at the cost of putting more load on the server.
    download_delay = 2

    start_urls = ['https://www.haodf.com/sitemap-zx/2020/']

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        # Output 1: custom pipeline defined in this file (must run as __main__).
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        # Output 2: Scrapy's built-in feed export.
        # NOTE(review): newer Scrapy releases prefer the single FEEDS setting
        # over FEED_FORMAT / FEED_URI - confirm against the installed version.
        'FEED_FORMAT': 'json',
        'FEED_URI': 'testzixun.json',
    }

    def parse(self, response):
        """Follow every date link on the year page whose text contains "2020-01"."""
        for link in response.xpath('//div[4]/div[2]/li/a[contains(text(), "2020-01")]/@href'):
            # Collect the per-date URLs from the 2020 index page.
            url = response.urljoin(link.extract())
            yield scrapy.Request(url, callback=self.parse_info)

    def parse_info(self, response):
        """Follow every detail link on a date listing page, then the next page."""
        for link in response.xpath('//div[3]/div[1]/li/a/@href'):
            # Collect the detail-page URLs listed for this date.
            url = response.urljoin(link.extract())
            yield scrapy.Request(url, callback=self.parse_info_detail)

        # Pagination: the anchor whose text contains "下一页" ("next page").
        next_page = response.xpath("//div[3]/div[2]/div/div/a[contains(text(), '下一页')]/@href")
        if next_page:
            u = response.urljoin(next_page.extract_first())  # next-page URL
            print(u)
            yield scrapy.Request(u, callback=self.parse_info)

    def parse_info_detail(self, response):
        """Yield one item holding the doctor name found on a detail page."""
        yield {
            'name': response.xpath('//h1[@class="doctor-name"]/text()').extract_first()
        }

# Run the crawl only when this file is executed directly (script or notebook
# cell), so that importing the module does not start a blocking crawl. The
# spider's pipeline path ('__main__.JsonWriterPipeline') already assumes the
# file is the main module.
if __name__ == "__main__":
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    })
    process.crawl(MySpider)
    # Blocks until the crawl has finished.
    process.start()

以上是 请问如何使爬虫进行的更快? 的全部内容, 来源链接: utcz.com/p/937861.html

回到顶部