请问如何使爬虫进行得更快？

Z时代
2024-02-17
分类：IT

请问如何使爬虫进行得更快？

我是爬虫小白，参照网上的代码改写了一份，用来爬取自己想要的网页，但是运行很慢，请教如何提高爬取速度？

import aiohttp
import re  
import urllib.request, urllib.error
import xlwt
from selenium import webdriver
import time
import scrapy
from scrapy.http import Request
from urllib import parse
from selenium.webdriver.chrome.options import Options
# Import scrapy, installing it on demand if it is missing.
# NOTE(review): scrapy is already imported unconditionally earlier in this
# file, so this fallback only matters if that earlier import is removed.
import subprocess
import sys

try:
    import scrapy
except ImportError:
    # Install with the interpreter that is running this code so the package
    # lands in the same environment; this works both as a plain script and
    # inside a notebook (the original "!pip" form is IPython-only syntax).
    subprocess.check_call([sys.executable, "-m", "pip", "install", "scrapy"])
    import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import logging
import json
class JsonWriterPipeline:
    """Item pipeline that appends every item to ``test.jl`` as one JSON line."""

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        self.file = open('test.jl', mode='w')

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Write *item* as a single JSON line and pass it on unchanged."""
        serialized = json.dumps(dict(item))
        self.file.write(f"{serialized}\n")
        return item
class MySpider(scrapy.Spider):
    """Spider that walks the haodf.com 2020 sitemap and collects doctor names.

    Crawl order: the start URL lists date pages (only links whose text
    contains "2020-01" are followed); each date page lists detail-page links
    plus an optional "next page" link; each detail page yields one item.
    """

    name = "haodaifu"
    allowed_domains = ['haodf.com']
    # Scrapy waits roughly this many seconds between consecutive requests to
    # the same site, so this value is the main throttle on crawl speed.
    download_delay = 2
    start_urls = ['https://www.haodf.com/sitemap-zx/2020/']
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        'FEED_FORMAT':'json',                                 # Used for pipeline 2
        'FEED_URI': 'testzixun.json'                        # Used for pipeline 2
    }

    def parse(self, response):
        """Yield one request per date link found on the 2020 index page."""
        for link in response.xpath('//div[4]/div[2]/li/a[contains(text(), "2020-01")]/@href'):
            # Collect every date URL link from the 2020 page.
            url = response.urljoin(link.extract())
            yield scrapy.Request(url, callback=self.parse_info)

    def parse_info(self, response):
        """Yield requests for each detail link, then follow the next page."""
        for link in response.xpath('//div[3]/div[1]/li/a/@href'):
            # Collect each page URL link from the date page.
            url = response.urljoin(link.extract())
            yield scrapy.Request(url, callback=self.parse_info_detail)
        next_page = response.xpath("//div[3]/div[2]/div/div/a[contains(text(), '下一页')]/@href")
        if next_page:
            # Link to the next page of the same listing.
            u = response.urljoin(next_page.extract_first())
            print(u)
            yield scrapy.Request(u, callback=self.parse_info)

    def parse_info_detail(self, response):
        """Yield an item holding the text of the ``h1.doctor-name`` heading."""
        yield {
            'name': response.xpath('//h1[@class="doctor-name"]/text()').extract_first()
        }
# Build a crawler process with a browser-like User-Agent, register the
# spider, then start crawling; start() blocks until the crawl is finished.
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
process.crawl(MySpider)
process.start()

以上是《请问如何使爬虫进行得更快？》的全部内容，来源链接：utcz.com/p/937861.html

请问如何使爬虫进行得更快？

其他人也看了：