Scrapy 1.8.0: Image Downloader


Create the template spider

(global:) scrapy startproject bmw : create the project

(inside the project:) scrapy genspider -t crawl bmw_spider "car.autohome.com.cn" : create the crawl spider
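For reference, the layout these two commands generate looks roughly like this (the spider file name follows the genspider argument):

bmw/
    scrapy.cfg
    bmw/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            bmw_spider.py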

items.py

import scrapy


class BmwItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    category = scrapy.Field()
    # The following two fields are required by ImagesPipeline:
    # image_urls is an iterable of image URLs, images stores the downloaded image info
    image_urls = scrapy.Field()
    images = scrapy.Field()
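After the images pipeline has downloaded everything, Scrapy fills the images field with one dict per file. A rough sketch of what that looks like (the URL and hashes below are made up for illustration):

# Illustrative only: what ImagesPipeline writes back into item["images"]
item["images"] = [
    {
        "url": "https://car3.autoimg.cn/cardfs/product/example.jpg",   # hypothetical source URL
        "path": "full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg",   # default path before our override
        "checksum": "2b00042f7481c7b056c4b410d28f33cf",                # md5 of the downloaded file
    },
]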

spider.py

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from bmw.items import BmwItem


class BmwSpiderSpider(CrawlSpider):
    name = "bmw_spider"
    allowed_domains = ["car.autohome.com.cn"]
    start_urls = ["https://car.autohome.com.cn/pic/series/2139.html"]

    rules = (
        Rule(LinkExtractor(allow=r"https://car.autohome.com.cn/pic/series/2139.+"),
             callback="parse_page"),
    )

    def parse_page(self, response):
        category = response.xpath("//div[@class='uibox']/div/text()").get()
        srcs = response.xpath("//div[contains(@class, 'uibox-con')]/ul/li//img/@src").getall()
        # Strip the thumbnail prefix and make each URL absolute to get the full-size image
        srcs = list(map(lambda x: response.urljoin(x.replace("240x180_0_q95_c42_", "")), srcs))
        yield BmwItem(category=category, image_urls=srcs)
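The replace call turns Autohome's thumbnail URLs into full-size ones. A minimal sketch of the transformation, using a made-up src value:

# Illustrative: how one thumbnail src becomes a full-size, absolute URL
src = "//car3.autoimg.cn/cardfs/product/240x180_0_q95_c42_autohomecar__example.jpg"  # hypothetical value
full = src.replace("240x180_0_q95_c42_", "")
# response.urljoin(full) then adds the scheme of the current page, giving
# "https://car3.autoimg.cn/cardfs/product/autohomecar__example.jpg"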

pipelines.py

import os

from scrapy.pipelines.images import ImagesPipeline


# Subclass ImagesPipeline so images can be sorted into per-category folders
class BMWImagesPipeline(ImagesPipeline):
    # Runs before file_path; attach the item to each download request
    # so that file_path can read its category later
    def get_media_requests(self, item, info):
        request_objs = super(BMWImagesPipeline, self).get_media_requests(item, info)
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    # Override the download path: <category>/<image name> instead of full/<image name>
    def file_path(self, request, response=None, info=None):
        path = super(BMWImagesPipeline, self).file_path(request, response, info)
        category_path = request.item.get("category")  # item attached in get_media_requests
        image_name = path.replace("full/", "")
        image_path = os.path.join(category_path, image_name)
        return image_path
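By default ImagesPipeline names files "full/<sha1 of the URL>.jpg"; the override above swaps the "full/" prefix for the item's category. A rough worked example (hash and category value are illustrative):

# Illustrative: how the override rewrites the default path
default_path = "full/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg"  # what super().file_path() returns
category = "外观"                                                    # hypothetical category taken from the item
# result: "外观/0a79c461a4062ac383dc4fade7bc09f1384a3910.jpg", stored under IMAGES_STORE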

settings.py

import os

ITEM_PIPELINES = {
    # "bmw.pipelines.BmwPipeline": 300,
    # "scrapy.pipelines.images.ImagesPipeline": 1,
    "bmw.pipelines.BMWImagesPipeline": 1,
}

# Save images under an img/ folder at the project root (two levels up from settings.py)
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), "img")
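With everything in place, start the crawl from the project directory; downloaded images should then appear under img/<category>/:

scrapy crawl bmw_spider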


