How to run Scrapy from a Python script

I don't know where I should put my spider code or how to call it from a main function. Please help. Here is some sample code:

This snippet can be used to run Scrapy spiders independent of scrapyd or the scrapy command line tool, and to use them from a script.

The multiprocessing library is used in order to work around a bug in Twisted, in which you cannot restart an already running reactor (or, in this case, a Scrapy instance).

[Here](http://groups.google.com/group/scrapy-users/browse_thread/thread/f332fc5b749d401a) is the mailing-list discussion for this snippet.

```python
#!/usr/bin/python
import os
os.environ.setdefault('SCRAPY_SETTINGS_MODULE', 'project.settings')  # must be set before the other scrapy imports

from scrapy import log, signals, project
from scrapy.xlib.pydispatch import dispatcher
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from multiprocessing import Process, Queue


class CrawlerScript():

    def __init__(self):
        self.crawler = CrawlerProcess(settings)
        if not hasattr(project, 'crawler'):
            self.crawler.install()
        self.crawler.configure()
        self.items = []
        dispatcher.connect(self._item_passed, signals.item_passed)

    def _item_passed(self, item):
        self.items.append(item)

    def _crawl(self, queue, spider_name):
        spider = self.crawler.spiders.create(spider_name)
        if spider:
            self.crawler.queue.append_spider(spider)
        self.crawler.start()
        self.crawler.stop()
        queue.put(self.items)

    def crawl(self, spider):
        queue = Queue()
        p = Process(target=self._crawl, args=(queue, spider,))
        p.start()
        p.join()
        return queue.get(True)


# Usage
if __name__ == "__main__":
    log.start()

    """
    This example runs spider1 and then spider2 three times.
    """
    items = list()
    crawler = CrawlerScript()
    items.append(crawler.crawl('spider1'))
    for i in range(3):
        items.append(crawler.crawl('spider2'))
    print items

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: joehillen
# date  : Oct 24, 2010
```

Answer:

All the other answers reference Scrapy v0.x. According to the updated documentation, Scrapy 1.0 requires:

```python
import scrapy
from scrapy.crawler import CrawlerProcess


class MySpider(scrapy.Spider):
    # Your spider definition
    ...


process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(MySpider)
process.start()  # the script will block here until the crawling is finished
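```

If the spider lives inside an existing Scrapy project, the same 1.0+ API also covers the two things the old snippet did by hand: loading the project settings and collecting the scraped items. Below is a minimal sketch, assuming the script is run from the project directory (so `get_project_settings()` can locate `scrapy.cfg`) and that `'spider1'` is a hypothetical spider name registered in that project:

```python
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project's settings.py instead of setting
# SCRAPY_SETTINGS_MODULE by hand as the old snippet did.
process = CrawlerProcess(get_project_settings())

# Collect scraped items via the item_scraped signal, the successor
# of the item_passed signal used in the 0.x snippet.
items = []

def collect_item(item, response, spider):
    items.append(item)

crawler = process.create_crawler('spider1')  # 'spider1' is a placeholder name
crawler.signals.connect(collect_item, signal=signals.item_scraped)

process.crawl(crawler)
process.start()  # blocks here until the crawl is finished

print(items)
```

Note that `process.start()` starts the Twisted reactor, which cannot be restarted within the same process; that is the very limitation the old multiprocessing snippet works around. To run several crawls from one script, call `process.crawl()` once per spider before the single `process.start()`, or see the `CrawlerRunner` examples in the "Run Scrapy from a script" section of the documentation.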
