Scrapy:自定义图像管道以重命名默认的图像文件名

所有图像都已成功下载到我指定的文件夹中,但我无法在图像保存到硬盘之前按自己的意愿为其命名。

这是我的代码

pipelines.py

class jellyImagesPipeline(ImagesPipeline):
    """Images pipeline from the question's first attempt.

    It defines a key builder based on the item's ``image_name`` and issues
    one download request per image URL.
    """

    def image_key(self, url, item):
        """Return the storage key ``full/<image_name>.jpg`` for *item*.

        *url* is accepted but not used.
        """
        # NOTE(review): confirm that the installed ImagesPipeline actually
        # calls image_key() with these two arguments.
        return 'full/%s.jpg' % item['image_name']

    def get_media_requests(self, item, info):
        """Yield a ``Request`` for every URL listed in ``item['image_urls']``."""
        print('Entered get_media_request')
        for url in item['image_urls']:
            yield Request(url)

Image_spider.py

 def getImage(self, response):
     """Build a ``JellyfishItem`` for the image served at ``response.url``.

     The image name is taken from ``response.meta['image_name']``, so the
     request that led here must have set that meta key.
     """
     image_item = JellyfishItem()
     # The response itself is the image, so its URL is the only download URL.
     image_item['image_urls'] = [response.url]
     image_item['image_name'] = response.meta['image_name']
     return image_item

我需要在代码中进行哪些更改?

更新1

pipelines.py

class jellyImagesPipeline(ImagesPipeline):
    """Images pipeline from the question's "update 1" attempt.

    The image name travels from the item to the download request through
    ``Request.meta`` and is used to yield an additional, custom-named copy
    of each downloaded image.
    """

    def image_custom_key(self, response):
        """Return ``full/<name>.jpg`` using the first entry of ``meta['image_name']``."""
        print('\n\n image_custom_key \n\n')
        # Indexing with [0] assumes image_name is a sequence of names.
        chosen_name = response.meta['image_name'][0]
        custom_key = 'full/%s.jpg' % chosen_name
        print(" ".join(("custom image key:", custom_key)))
        return custom_key

    def get_images(self, response, request, info):
        """Yield the parent pipeline's images, then one custom-named image."""
        print("\n\n get_images \n\n")
        parent_images = super(jellyImagesPipeline, self).get_images(response, request, info)
        for path, img, data in parent_images:
            yield path, img, data
        # NOTE(review): the original indentation was lost; the statements
        # below are assumed to run once, after the loop above.
        custom_key = self.image_custom_key(response)
        source_image = Image.open(StringIO(response.body))
        converted, data = self.convert_image(source_image)
        yield custom_key, converted, data

    def get_media_requests(self, item, info):
        """Return one ``Request`` per image URL, each carrying ``image_name`` in meta."""
        print("\n\nget_media_requests\n")
        requests = []
        for url in item.get('image_urls', []):
            requests.append(Request(url, meta={'image_name': item["image_name"]}))
        return requests

更新2

def image_key(self, image_name):
    """Return the storage key ``homeshop/<image_name>.jpg``.

    Prints a trace line and the computed key before returning it.
    """
    print('entered into image_key')
    key = 'homeshop/%s.jpg' % image_name
    print(key)
    return key

def get_images(self, request):
    """Yield a single storage key derived from ``request.url``.

    Only the key is yielded; no image object or buffer accompanies it.
    """
    # NOTE(review): confirm this signature and yield shape match what the
    # parent pipeline expects from get_images().
    print('\nEntered into get_images')
    yield self.image_key(request.url)

def get_media_requests(self, item, info):
    """Yield one ``Request`` for the item's first image URL.

    The item's ``image_name`` is printed and attached to the request's meta
    so it is available again when the response arrives.
    """
    print('\n\nEntered media_request')
    name = item['image_name']
    print(name)
    # Only the first URL is requested; any further URLs are ignored.
    first_url = item['image_urls'][0]
    yield Request(first_url, meta={'image_name': name})

def item_completed(self, results, item, info):
    """Print the item's image URLs, its image name and every download result.

    This is a debugging aid only.
    """
    # NOTE(review): nothing is returned here, so the caller receives None
    # rather than the item -- confirm that is intended.
    print('\n\nentered into item_completed\n')
    print(" ".join(('Name : ', str(item['image_urls']))))
    print(item['image_name'])
    for outcome in results:
        print(outcome)

回答:

在 pipelines.py

from scrapy.contrib.pipeline.images import ImagesPipeline

from scrapy.http import Request

from PIL import Image

from cStringIO import StringIO

import re

class jellyImagesPipeline(ImagesPipeline):
    """Images pipeline that saves each full-size image under a custom name.

    The spider puts the desired name in ``item['image_name']``.  That value
    is attached to every download request through ``Request.meta`` and is
    used in ``get_images`` to replace the key produced by the parent
    pipeline.

    NOTE: on newer Scrapy releases the documented way to customise file
    names is to override ``file_path()``; this class targets the legacy
    ``scrapy.contrib`` API imported by this file.
    """

    # Matches the key the parent pipeline gives the converted original image,
    # i.e. "full/<lowercase hex string>.jpg".  A raw string with an escaped
    # dot is used so only a literal ".jpg" suffix matches, and the character
    # class no longer accepts commas.
    CONVERTED_ORIGINAL = re.compile(r'^full/[0-9a-f]+\.jpg$')

    def get_media_requests(self, item, info):
        """Return one ``Request`` per image URL, carrying ``image_name`` in meta.

        An item without ``image_urls`` produces an empty list.
        """
        print("get_media_requests")
        return [Request(x, meta={'image_name': item["image_name"]})
                for x in item.get('image_urls', [])]

    def get_images(self, response, request, info):
        """Yield the parent's ``(key, image, buf)`` triples, renaming the original.

        Only keys matching ``CONVERTED_ORIGINAL`` are rewritten; every other
        key yielded by the parent pipeline is passed through unchanged.
        """
        print("get_images")
        for key, image, buf in super(jellyImagesPipeline, self).get_images(response, request, info):
            if self.CONVERTED_ORIGINAL.match(key):
                key = self.change_filename(key, response)
            yield key, image, buf

    def change_filename(self, key, response):
        """Return ``full/<image_name>.jpg`` built from ``response.meta``.

        ``image_name`` may be a list/tuple of names (the first one is used)
        or a single string.  *key* is the original key and is not used.
        """
        name = response.meta['image_name']
        # Only index into real sequences: indexing a plain string would
        # silently truncate the name to its first character.
        if isinstance(name, (list, tuple)):
            name = name[0]
        return "full/%s.jpg" % name

在 settings.py 中,确保有以下配置:

# Enable the custom images pipeline, referenced by its dotted import path
# (presumably the project package is named "jelly" -- adjust to your project).
ITEM_PIPELINES = ['jelly.pipelines.jellyImagesPipeline']

# Placeholder for the directory where images should be stored; replace it
# with a real path.
IMAGES_STORE = '/path/to/where/you/want/to/store/images'

Spider 示例:从 Python.org 主页获取图像;已保存图像的名称(和路径)将遵循站点结构,即保存在名为 www.python.org 的文件夹中。

from scrapy.spider import BaseSpider

from scrapy.selector import HtmlXPathSelector

from scrapy.item import Item, Field

import urlparse

class CustomItem(Item):
    """Item describing images to download and the name to save them under."""

    # URLs of the images to download.
    image_urls = Field()

    # Custom file name(s).  This is the key the spider assigns and the
    # pipeline reads; it must be declared because an Item rejects
    # assignments to undeclared fields.
    image_name = Field()

    # Previously declared plural spelling, kept so existing code that
    # references it keeps working.
    image_names = Field()

    # Result field; presumably filled in by the images pipeline once the
    # downloads finish.
    images = Field()

class ImageSpider(BaseSpider):
    """Example spider that emits one ``CustomItem`` per ``<img>`` on python.org."""

    name = "customimg"
    allowed_domains = ["www.python.org"]
    start_urls = ['http://www.python.org']

    def parse(self, response):
        """Return a list of items, one for each ``<img>`` element in *response*."""
        selector = HtmlXPathSelector(response)
        collected = []
        for img_node in selector.select('//img'):
            sources = img_node.select('@src').extract()
            entry = CustomItem()
            # Resolve every src against the page URL to get absolute URLs.
            entry['image_urls'] = [urlparse.urljoin(response.url, src) for src in sources]
            # The name information for the image; replace with a real name.
            entry['image_name'] = ['whatever_you_want']
            collected.append(entry)
        return collected

以上是 Scrapy:使用重命名默认图像名称自定义图像管道 的全部内容, 来源链接: utcz.com/qa/425965.html

回到顶部