Scrapy can't download images
pipeline
# -*- coding: utf-8 -*-
import pymysql
import scrapy
# Define your item pipelines here
from SlideblockVeification.items import SlideblockveificationItem
import re
import os
import shutil
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class SlideblockveificationPipeline:
    def open_spider(self, spider):
        db = spider.settings.get('MYSQL_DB_NAME', 'test')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        port = spider.settings.get('MYSQL_PORT', 3306)
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', '')
        # Step 2: connect to the database. pymysql.connect() takes the host
        # name (default: localhost), the login name (default: current user),
        # the password (default: empty), the database to open (no default),
        # the TCP port MySQL listens on (default: 3306) and the character
        # set; the attribute name after self is yours to choose.
        self.db_conn = pymysql.connect(host=host, port=port, db=db,
                                       user=user, passwd=passwd, charset='utf8')
        # Step 3: get a cursor via connection.cursor(). Like a mouse pointer,
        # the cursor is what every later database operation goes through,
        # using its execute() method.
        self.db_cur = self.db_conn.cursor()

    def close_spider(self, spider):
        # Step 5: commit the queued statements
        self.db_conn.commit()
        # Step 6: close the connection
        self.db_conn.close()

    def process_item(self, item, spider):
        self.insert_db(item)
        return item

    def insert_db(self, item):
        values = (
            item['Bg'],
            item['slideblock'],
        )
        sql = 'INSERT INTO Image(bg,slideblock) VALUES(%s,%s)'
        # Step 4: run the statement through the cursor
        self.db_cur.execute(sql, values)
        print("Insert succeeded")
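
# For reference, a minimal table matching the INSERT above. This is a sketch;
# the schema never appears in the post, so the column types are assumptions
# (both fields hold image URLs):
#
#   CREATE TABLE Image (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       bg VARCHAR(1024),
#       slideblock VARCHAR(1024)
#   );
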
class ImgsPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        image_url = item["Bg"]
        print("Inside the image pipeline")
        print(image_url)
        header = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        }
        yield scrapy.Request(image_url, headers=header, meta={'name': item['Bg']})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        name = request.meta['name']  # the image name passed along via meta above
        # Strip the characters Windows forbids in file names; skip this step
        # and the names come out garbled or the download fails outright.
        name = re.sub(r'[?\\*|"<>:/]', '', name)
        # Append the image extension to the sanitised name
        filename = name + '.jpg'
        print(filename)
        return filename
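
To see what file_path actually produces, the sanitising step can be exercised on its own. This is a standalone sketch; sample_url is a made-up URL, and the regex is the same one used above:

import re

def make_filename(name):
    # Drop the characters Windows forbids in file names, then append the
    # extension - the same two steps file_path performs.
    name = re.sub(r'[?\\*|"<>:/]', '', name)
    return name + '.jpg'

sample_url = 'https://example.com/captcha/slide_bg?img_index=1'
print(make_filename(sample_url))  # httpsexample.comcaptchaslide_bgimg_index=1.jpg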
Spider file
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy import signals
from SlideblockVeification.items import SlideblockveificationItem
class RunSpider(scrapy.Spider):
    name = 'run'
    start_urls = ['https://accounts.douban.com/passport/login?source=group']

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=r'E:/python/chromedriver_win32/chromedriver.exe')
        self.wait = WebDriverWait(self.driver, 10)
        # Plainly put, super().__init__() just runs the parent class's __init__
        super().__init__()

    def parse(self, response):
        self.driver.maximize_window()
        self.driver.get('https://accounts.douban.com/passport/login?source=group')
        time.sleep(5)
        self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[1]/ul[1]/li[2]').click()
        self.driver.find_element_by_id("username").send_keys('13949094212')
        self.driver.find_element_by_id("password").send_keys('1234444')
        self.driver.implicitly_wait(5)
        self.driver.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div[2]/div[1]/div[4]/a').click()
        time.sleep(15)
        try:
            self.driver.find_element_by_xpath('//*[@id="tcaptcha_iframe"]')
            has_captcha = True
        except Exception:
            has_captcha = False
        if has_captcha:
            print("Captcha present")
            iframe2 = self.driver.find_element_by_xpath("//*[@class='tcaptcha-transform']/iframe")
            self.driver.switch_to.frame(iframe2)
            item = SlideblockveificationItem()
            item["Bg"] = self.driver.find_element_by_xpath('//*[@id="slideBg"]').get_attribute('src')
            print(item["Bg"])
            item["slideblock"] = self.driver.find_element_by_xpath('//*[@id="slideBlock"]').get_attribute('src')
            yield item
        else:
            print("No captcha needed this time")

    # The code below is the officially documented way to close the browser
    # automatically via the spider_closed signal
    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(RunSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.closeSpider, signals.spider_closed)
        return spider

    def closeSpider(self):
        self.driver.quit()
        print("Close handler ran")
settings
# -*- coding: utf-8 -*-
# Scrapy settings for SlideblockVeification project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'SlideblockVeification'
MEDIA_ALLOW_REDIRECTS = True
SPIDER_MODULES = ['SlideblockVeification.spiders']
NEWSPIDER_MODULE = 'SlideblockVeification.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'SlideblockVeification (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'SlideblockVeification.middlewares.SlideblockveificationSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'SlideblockVeification.middlewares.seleniumEdit': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'SlideblockVeification.pipelines.SlideblockveificationPipeline': 500,
    'SlideblockVeification.pipelines.ImgsPipeline': 550,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
IMAGES_URLS_FIELD = "Bg"
# Custom save path for downloaded images
IMAGES_STORE = "image"
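# Two hedged notes on the image settings above (observations, not from the
# original post):
# - IMAGES_URLS_FIELD is only consulted by the stock ImagesPipeline and
#   expects a list of URLs; since ImgsPipeline overrides get_media_requests,
#   the setting is effectively unused here.
# - A relative IMAGES_STORE such as "image" resolves against the directory
#   scrapy is launched from; an absolute path avoids surprises, e.g.:
#       import os
#       IMAGES_STORE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'image')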
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'