Scrapy1.8.0之图片下载器
创建模板爬虫 (全局): scrapy startproject [BMW]
(项目:) scrapy genspider -t crawl bmw_spider ["car.autohome.com.cn"] : 创建爬虫
items.py
class BmwItem(scrapy.Item):
    """Item carrying one category of car images scraped from autohome."""

    # Label of the image group this item belongs to (page section title).
    category = scrapy.Field()
    # The next two fields are required by scrapy's ImagesPipeline:
    # image_urls is an iterable of image URLs to download,
    # images is filled in by the pipeline with the download results.
    image_urls = scrapy.Field()
    images = scrapy.Field()
spider.py
class BmwSpiderSpider(CrawlSpider):
    """Crawl the picture pages of series 2139 on car.autohome.com.cn and
    yield one BmwItem per page, holding the category title and the
    full-resolution image URLs."""

    name = "bmw_spider"
    allowed_domains = ["car.autohome.com.cn"]
    start_urls = ["https://car.autohome.com.cn/pic/series/2139.html"]

    rules = (
        # Follow every series-2139 picture page and hand it to parse_page.
        Rule(
            LinkExtractor(allow=r"https://car.autohome.com.cn/pic/series/2139.+"),
            callback="parse_page",
        ),
    )

    def parse_page(self, response):
        """Extract the category title and the original-size image URLs.

        Bug fix: the original used double quotes inside double-quoted
        XPath strings ("//div[@class="uibox"]..."), which is a
        SyntaxError in Python — the inner quotes now use single quotes.
        """
        category = response.xpath("//div[@class='uibox']/div/text()").get()
        srcs = response.xpath(
            "//div[contains(@class, 'uibox-con')]/ul/li//img/@src"
        ).getall()
        # Thumbnail URLs embed a "240x180_0_q95_c42_" size prefix; stripping
        # it yields the original-size image. urljoin resolves the
        # protocol-relative //... src values against the response URL.
        srcs = [
            response.urljoin(src.replace("240x180_0_q95_c42_", ""))
            for src in srcs
        ]
        yield BmwItem(category=category, image_urls=srcs)
pipelines.py
import os

from scrapy.pipelines.images import ImagesPipeline
# 继承ImagesPipeline
class BMWImagesPipeline(ImagesPipeline):
# 获取其他Item, 在file_path之前运行,返回值会给request
def get_media_requests(self, item, info):
request_objs = super(BMWImagesPipeline, self).get_media_requests(item, info)
for request_obj in request_objs:
request_obj.item = item
return request_objs
# 重写下载路径
def file_path(self, request, response=None, info=None):
path = super(BMWImagesPipeline, self).file_path(request, response, info)
category_path = request.item.get("category") # 获取get_media_requests返回的request
image_name = path.replace("full/", "")
image_path = os.path.join(category_path, image_name)
return image_path
settings.py
# Enable only the customized image pipeline (low number = higher priority).
ITEM_PIPELINES = {
    # "bmw.pipelines.BmwPipeline": 300,
    # "scrapy.pipelines.images.ImagesPipeline": 1
    "bmw.pipelines.BMWImagesPipeline": 1,
}

# Root directory where ImagesPipeline stores downloads:
# an "img" folder two levels above this settings module.
_project_root = os.path.dirname(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(_project_root, "img")
点击获取项目
以上是 Scrapy1.8.0之图片下载器 的全部内容, 来源链接: utcz.com/z/511960.html