Scrapy Images Downloading

My spider runs without showing any errors, but the images are not being stored in the folder. These are my scraping files:

Spider.py:

import scrapy
import re
import os
import urlparse

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem, ListResidentialItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["someurl.com"]
    start_urls = [
        "someurl.com"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            img_url = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract()[0]
            yield scrapy.Request(urlparse.urljoin(response.url, img_url), callback=self.parseBasicListingInfo, meta={'item': item})

    def parseBasicListingInfo(item, response):
        item = response.request.meta['item']
        item = ListResidentialItem()
        try:
            image_urls = map(unicode.strip, response.xpath('//a[@itemprop="contentUrl"]/@data-href').extract())
            item['image_urls'] = [x for x in image_urls]
        except IndexError:
            item['image_urls'] = ''
        return item

settings.py:

from scrapy.settings.default_settings import ITEM_PIPELINES
from scrapy.pipelines.images import ImagesPipeline

BOT_NAME = 'production'

SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'

ROBOTSTXT_OBEY = True
DEPTH_PRIORITY = 1
IMAGE_STORE = '/images'
CONCURRENT_REQUESTS = 250
DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {
    'scrapy.contrib.pipeline.images.ImagesPipeline': 300,
}

items.py:

# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
    img_url = scrapy.Field()

# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    pass

My pipelines file is empty, and I'm not sure what I should add to pipelines.py.

Answer:

This is what I ended up with, and it works:

spider.py:

import scrapy
import re
import urlparse

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem
from production.items import ImageItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["url"]
    start_urls = [
        "startingurl.com"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            img_url = sel.xpath('//a[@idd="followclaslink"]/@href').extract()[0]
            # Follow the link to the photos page; the item travels along in meta.
            yield scrapy.Request(urlparse.urljoin(response.url, img_url), callback=self.parseImages, meta={'item': item})

    def parseImages(self, response):
        # Yield one ImageItem per <img> tag; the ImagesPipeline downloads
        # every URL listed in the item's image_urls field.
        for elem in response.xpath("//img"):
            img_url = elem.xpath("@src").extract_first()
            yield ImageItem(image_urls=[img_url])
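A note on the image URLs: the ImagesPipeline can only download absolute URLs, so if the page uses relative src attributes the downloads fail without an obvious error. Below is a small sketch of the same callback with each URL resolved against the page URL, reusing the urlparse import already shown above; treat it as an illustration, not part of the original answer:

    def parseImages(self, response):
        # Variant of the callback above: resolve relative <img src> values
        # against the page URL so the ImagesPipeline can fetch them.
        for elem in response.xpath("//img"):
            img_url = elem.xpath("@src").extract_first()
            if img_url:
                yield ImageItem(image_urls=[urlparse.urljoin(response.url, img_url)])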

settings.py:

BOT_NAME = 'production'

SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'

ROBOTSTXT_OBEY = True
IMAGES_STORE = '/Users/home/images'
DOWNLOAD_DELAY = 2

ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}

# Disable cookies (enabled by default)
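IMAGES_STORE must point to a directory the crawler can write to; the pipeline creates a full/ subfolder there on the first download. The images pipeline also honors a few optional settings if you need them (the values below are only illustrative and not part of the original configuration):

    # Optional ImagesPipeline settings -- illustrative values only
    IMAGES_EXPIRES = 90        # don't re-download images fetched in the last 90 days
    IMAGES_MIN_WIDTH = 110     # silently drop images smaller than 110x110 px
    IMAGES_MIN_HEIGHT = 110
    IMAGES_THUMBS = {          # also store scaled-down copies under thumbs/<name>/
        'small': (50, 50),
        'big': (270, 270),
    }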

items.py:

# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
    img_url = scrapy.Field()

# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()

pipelines.py:

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Schedule one download request per URL in the item's image_urls field.
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # Collect the storage paths of the successfully downloaded images
        # and drop the item if none of the downloads succeeded.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
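Note that with ITEM_PIPELINES pointing at the stock scrapy.pipelines.images.ImagesPipeline, the MyImagesPipeline subclass above is never actually invoked. If you want the custom item_completed check to run, register the subclass instead and add an image_paths field to the item it writes to, roughly as sketched below (the module path is an assumption based on the class living in production/pipelines.py):

    # settings.py -- register the custom pipeline instead of the built-in one
    # (module path assumed from the 'production' project and the pipelines.py file above)
    ITEM_PIPELINES = {'production.pipelines.MyImagesPipeline': 1}

    # items.py -- item_completed() assigns to item['image_paths'], so the field must exist
    class ImageItem(scrapy.Item):
        image_urls = scrapy.Field()
        images = scrapy.Field()
        image_paths = scrapy.Field()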

