Scrapy can't download images

pipeline

# -*- coding: utf-8 -*-
import re

import pymysql
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

class SlideblockveificationPipeline:

    def open_spider(self, spider):
        db = spider.settings.get('MYSQL_DB_NAME', 'test')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        port = spider.settings.get('MYSQL_PORT', 3306)
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', '')
        # Step 2: connect to the database. pymysql.connect() takes the host
        # (defaults to localhost), the user (defaults to the current user),
        # the password (defaults to empty), the database name (no default),
        # the TCP port (defaults to 3306) and the character set (optional);
        # the attribute name after self. is your own choice.
        self.db_conn = pymysql.connect(host=host, port=port, db=db,
                                       user=user, passwd=passwd, charset='utf8')
        # Step 3: get a cursor via connection.cursor(); all later database
        # work is done through the cursor's execute() method.
        self.db_cur = self.db_conn.cursor()

    def close_spider(self, spider):
        # Step 5: commit the pending inserts
        self.db_conn.commit()
        # Step 6: close the connection
        self.db_conn.close()

    def process_item(self, item, spider):
        self.insert_db(item)
        return item

    def insert_db(self, item):
        values = (
            item['Bg'],
            item['slideblock'],
        )
        sql = 'INSERT INTO Image(bg,slideblock) VALUES(%s,%s)'
        # Step 4: run the statement through the cursor
        # (only committed in close_spider)
        self.db_cur.execute(sql, values)
        print("Insert executed")
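The INSERT assumes an Image table with bg and slideblock columns already exists in the configured database. The post doesn't show the schema, but a minimal DDL along these lines would match (the id column and the column sizes are assumptions):

CREATE TABLE Image (
    id INT AUTO_INCREMENT PRIMARY KEY,  -- hypothetical surrogate key
    bg VARCHAR(1024),                   -- background image URL
    slideblock VARCHAR(1024)            -- slider-block image URL
);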

class ImgsPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        image_url = item["Bg"]
        print("this is the pipeline")
        print(image_url)
        header = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        }
        # pass the URL along in meta so file_path() can use it as the file name
        yield scrapy.Request(image_url, headers=header, meta={'name': item['Bg']})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        name = request.meta['name']  # the image name passed in via meta above
        # Strip the characters Windows forbids in file names; without this
        # step the files come out garbled or fail to download at all.
        name = re.sub(r'[?\\*|“<>:/]', '', name)
        # Append the image extension
        filename = name + '.jpg'
        print(filename)
        return filename
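The sanitizing step in file_path() can be checked on its own; a minimal standalone sketch (the URL here is made up for illustration):

import re

url = "https://example.com/captcha/img?id=1"
name = re.sub(r'[?\\*|“<>:/]', '', url)  # same character class as above
print(name + '.jpg')  # -> httpsexample.comcaptchaimgid=1.jpg

Since the full URL is used as the file name, every character Windows rejects (: / ? \ * | < > and the curly quote) has to be removed first, which is exactly what this re.sub does.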

Spider file

# -*- coding: utf-8 -*-
import time

import scrapy
from scrapy import signals
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

from SlideblockVeification.items import SlideblockveificationItem


class RunSpider(scrapy.Spider):
    name = 'run'
    start_urls = ['https://accounts.douban.com/passport/login?source=group']

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=r'E:/python/chromedriver_win32/chromedriver.exe')
        self.wait = WebDriverWait(self.driver, 10)
        # plainly put, super().__init__() just runs the parent class's __init__
        super().__init__()

    def parse(self, response):
        self.driver.maximize_window()
        self.driver.get('https://accounts.douban.com/passport/login?source=group')
        time.sleep(5)
        # switch to the password-login tab and submit the credentials
        self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[1]/ul[1]/li[2]').click()
        self.driver.find_element_by_id("username").send_keys('13949094212')
        self.driver.find_element_by_id("password").send_keys('1234444')
        self.driver.implicitly_wait(5)
        self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[2]/div[1]/div[4]/a').click()
        time.sleep(15)
        try:
            self.driver.find_element_by_xpath('//*[@id="tcaptcha_iframe"]')
            has_captcha = True
        except Exception:
            has_captcha = False
        if has_captcha:
            print("captcha present")
            iframe2 = self.driver.find_element_by_xpath("//*[@class='tcaptcha-transform']/iframe")
            self.driver.switch_to.frame(iframe2)
            item = SlideblockveificationItem()
            item["Bg"] = self.driver.find_element_by_xpath('//*[@id="slideBg"]').get_attribute('src')
            print(item["Bg"])
            item["slideblock"] = self.driver.find_element_by_xpath('//*[@id="slideBlock"]').get_attribute('src')
            yield item
        else:
            print("no captcha needed")

    # The code below is the documented way to shut the browser down
    # automatically through the spider_closed signal.
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(RunSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.closeSpider, signals.spider_closed)
        return spider

    def closeSpider(self):
        self.driver.quit()
        print("closeSpider ran")
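One more thing worth guarding against in this setup (a sketch, not part of the original code): ImgsPipeline builds a scrapy.Request directly from item["Bg"], so the captcha <img> src must be an absolute http(s) URL. If get_attribute('src') returns an empty string or a data: URI, the Request constructor raises "ValueError: Missing scheme in request url" and no image is ever downloaded. A check before yield item makes that failure visible:

src = item["Bg"]
if src and src.startswith(("http://", "https://")):
    yield item
else:
    self.logger.error("Bg is not a downloadable URL: %r", src)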

settings

# -*- coding: utf-8 -*-

# Scrapy settings for SlideblockVeification project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'SlideblockVeification'

MEDIA_ALLOW_REDIRECTS = True

SPIDER_MODULES = ['SlideblockVeification.spiders']
NEWSPIDER_MODULE = 'SlideblockVeification.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'SlideblockVeification (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'SlideblockVeification.middlewares.SlideblockveificationSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'SlideblockVeification.middlewares.seleniumEdit': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Lower numbers run first: the MySQL pipeline (500) sees each item
# before the image download pipeline (550).
ITEM_PIPELINES = {
    'SlideblockVeification.pipelines.SlideblockveificationPipeline': 500,
    'SlideblockVeification.pipelines.ImgsPipeline': 550,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

IMAGES_URLS_FIELD = "Bg"
# custom save path for downloaded images
IMAGES_STORE = "image"
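A note on these two settings: IMAGES_URLS_FIELD only matters to the stock ImagesPipeline, whose default get_media_requests does roughly the following (a simplified sketch of the library behavior, not code from this project). Because ImgsPipeline overrides that method, the setting is effectively unused here, which is why the plain string in item["Bg"] works at all; the default pipeline would expect a list of URLs. IMAGES_STORE = "image" is a relative path, resolved against the directory the crawl is started from.

# roughly what the default ImagesPipeline does with IMAGES_URLS_FIELD
def get_media_requests(self, item, info):
    urls = item.get(self.images_urls_field, [])  # expects a *list* of URLs
    return [scrapy.Request(u) for u in urls]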

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
