Scrapy can't download images

pipeline

# -*- coding: utf-8 -*-
import re

import pymysql
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

class SlideblockveificationPipeline:

    def open_spider(self, spider):
        db = spider.settings.get('MYSQL_DB_NAME', 'test')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        port = spider.settings.get('MYSQL_PORT', 3306)
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', '')
        # Step 2: connect to the database. pymysql.connect() takes the host
        # (defaults to localhost), the user (defaults to the current user),
        # the password (defaults to empty), the database name (no default),
        # the TCP port (defaults to 3306) and the character set (optional);
        # the attribute name after self. is your own choice.
        self.db_conn = pymysql.connect(host=host, port=port, db=db,
                                       user=user, passwd=passwd, charset='utf8')
        # Step 3: get a cursor via connection.cursor(); all later database
        # work is done through the cursor's execute() method.
        self.db_cur = self.db_conn.cursor()

    def close_spider(self, spider):
        # Step 5: commit the pending inserts
        self.db_conn.commit()
        # Step 6: close the connection
        self.db_conn.close()

    def process_item(self, item, spider):
        self.insert_db(item)
        return item

    def insert_db(self, item):
        values = (
            item['Bg'],
            item['slideblock'],
        )
        sql = 'INSERT INTO Image(bg,slideblock) VALUES(%s,%s)'
        # Step 4: run the statement through the cursor
        # (only committed in close_spider)
        self.db_cur.execute(sql, values)
        print("Insert executed")
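The INSERT assumes an Image table with bg and slideblock columns already exists in the configured database. The post doesn't show the schema, but a minimal DDL along these lines would match (the id column and the column sizes are assumptions):

CREATE TABLE Image (
    id INT AUTO_INCREMENT PRIMARY KEY,  -- hypothetical surrogate key
    bg VARCHAR(1024),                   -- background image URL
    slideblock VARCHAR(1024)            -- slider-block image URL
);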

class ImgsPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        image_url = item["Bg"]
        print("this is the pipeline")
        print(image_url)
        header = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        }
        # pass the URL along in meta so file_path() can use it as the file name
        yield scrapy.Request(image_url, headers=header, meta={'name': item['Bg']})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        name = request.meta['name']  # the image name passed in via meta above
        # Strip the characters Windows forbids in file names; without this
        # step the files come out garbled or fail to download at all.
        name = re.sub(r'[?\\*|“<>:/]', '', name)
        # Append the image extension
        filename = name + '.jpg'
        print(filename)
        return filename
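The sanitizing step in file_path() can be checked on its own; a minimal standalone sketch (the URL here is made up for illustration):

import re

url = "https://example.com/captcha/img?id=1"
name = re.sub(r'[?\\*|“<>:/]', '', url)  # same character class as above
print(name + '.jpg')  # -> httpsexample.comcaptchaimgid=1.jpg

Since the full URL is used as the file name, every character Windows rejects (: / ? \ * | < > and the curly quote) has to be removed first, which is exactly what this re.sub does.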

Spider file

# -*- coding: utf-8 -*-
import time

import scrapy
from scrapy import signals
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

from SlideblockVeification.items import SlideblockveificationItem


class RunSpider(scrapy.Spider):
    name = 'run'
    start_urls = ['https://accounts.douban.com/passport/login?source=group']

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=r'E:/python/chromedriver_win32/chromedriver.exe')
        self.wait = WebDriverWait(self.driver, 10)
        # plainly put, super().__init__() just runs the parent class's __init__
        super().__init__()

    def parse(self, response):
        self.driver.maximize_window()
        self.driver.get('https://accounts.douban.com/passport/login?source=group')
        time.sleep(5)
        # switch to the password-login tab and submit the credentials
        self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[1]/ul[1]/li[2]').click()
        self.driver.find_element_by_id("username").send_keys('13949094212')
        self.driver.find_element_by_id("password").send_keys('1234444')
        self.driver.implicitly_wait(5)
        self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[2]/div[1]/div[4]/a').click()
        time.sleep(15)
        try:
            self.driver.find_element_by_xpath('//*[@id="tcaptcha_iframe"]')
            has_captcha = True
        except Exception:
            has_captcha = False
        if has_captcha:
            print("captcha present")
            iframe2 = self.driver.find_element_by_xpath("//*[@class='tcaptcha-transform']/iframe")
            self.driver.switch_to.frame(iframe2)
            item = SlideblockveificationItem()
            item["Bg"] = self.driver.find_element_by_xpath('//*[@id="slideBg"]').get_attribute('src')
            print(item["Bg"])
            item["slideblock"] = self.driver.find_element_by_xpath('//*[@id="slideBlock"]').get_attribute('src')
            yield item
        else:
            print("no captcha needed")

    # The code below is the documented way to shut the browser down
    # automatically through the spider_closed signal.
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(RunSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.closeSpider, signals.spider_closed)
        return spider

    def closeSpider(self):
        self.driver.quit()
        print("closeSpider ran")
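One more thing worth guarding against in this setup (a sketch, not part of the original code): ImgsPipeline builds a scrapy.Request directly from item["Bg"], so the captcha <img> src must be an absolute http(s) URL. If get_attribute('src') returns an empty string or a data: URI, the Request constructor raises "ValueError: Missing scheme in request url" and no image is ever downloaded. A check before yield item makes that failure visible:

src = item["Bg"]
if src and src.startswith(("http://", "https://")):
    yield item
else:
    self.logger.error("Bg is not a downloadable URL: %r", src)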

settings

# -*- coding: utf-8 -*-

# Scrapy settings for SlideblockVeification project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'SlideblockVeification'

MEDIA_ALLOW_REDIRECTS = True

SPIDER_MODULES = ['SlideblockVeification.spiders']
NEWSPIDER_MODULE = 'SlideblockVeification.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'SlideblockVeification (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'SlideblockVeification.middlewares.SlideblockveificationSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'SlideblockVeification.middlewares.seleniumEdit': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Lower numbers run first: the MySQL pipeline (500) sees each item
# before the image download pipeline (550).
ITEM_PIPELINES = {
    'SlideblockVeification.pipelines.SlideblockveificationPipeline': 500,
    'SlideblockVeification.pipelines.ImgsPipeline': 550,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

IMAGES_URLS_FIELD = "Bg"
# custom save path for downloaded images
IMAGES_STORE = "image"
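A note on these two settings: IMAGES_URLS_FIELD only matters to the stock ImagesPipeline, whose default get_media_requests does roughly the following (a simplified sketch of the library behavior, not code from this project). Because ImgsPipeline overrides that method, the setting is effectively unused here, which is why the plain string in item["Bg"] works at all; the default pipeline would expect a list of URLs. IMAGES_STORE = "image" is a relative path, resolved against the directory the crawl is started from.

# roughly what the default ImagesPipeline does with IMAGES_URLS_FIELD
def get_media_requests(self, item, info):
    urls = item.get(self.images_urls_field, [])  # expects a *list* of URLs
    return [scrapy.Request(u) for u in urls]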

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
