Scrapy start_urls

from scrapy.spider import Spider

from scrapy.selector import Selector

from dirbot.items import Website

class DmozSpider(Spider):
    """Spider that scrapes the two dmoz.org Python directory pages listed below.

    Only the URLs in ``start_urls`` are requested: ``parse`` returns items
    and never issues further requests, so links found on those pages are
    not followed.
    """

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """Build one ``Website`` item per directory entry found in the response.

        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []
        for site in sites:
            item = Website()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            # Raw string so that ``\s`` is not an invalid string escape; the
            # pattern matches "- " followed by the rest of the text up to a
            # carriage return.
            item['description'] = site.xpath('text()').re(r'-\s[^\n]*\r')
            items.append(item)
        return items

但是为什么它只抓取这两个网页呢?我看到设置了 allowed_domains = ["dmoz.org"],而这两个页面还包含指向 dmoz.org 域内其他页面的链接!为什么它不去抓取那些页面呢?

回答:

类属性 start_urls 只包含起始网址——仅此而已。如果你已经提取出想要抓取的其他页面的网址,请在 parse 回调中 yield 相应的请求(Request),并为这些请求指定[另一个]回调:

class Spider(BaseSpider):
    """Example spider that chains callbacks: main page -> category -> item.

    ``start_urls`` only seeds the crawl; every further page is requested
    explicitly by yielding a ``Request`` whose ``callback`` names the method
    that should handle its response.
    """

    name = 'my_spider'
    start_urls = [
        'http://www.domain.com/'
    ]
    allowed_domains = ['domain.com']

    def parse(self, response):
        """Parse the main page and yield one request per category link.

        Category hrefs are taken from the ``a`` children (all but the first)
        of the element with id ``tSubmenuContent``; each response is routed
        to ``parseCategory``.
        """
        hxs = HtmlXPathSelector(response)
        urls = hxs.select("//*[@id='tSubmenuContent']/a[position()>1]/@href").extract()
        for url in urls:
            # hrefs may be relative, so resolve them against the page URL.
            url = urlparse.urljoin(response.url, url)
            self.log('Found category url: %s' % url)
            yield Request(url, callback=self.parseCategory)

    def parseCategory(self, response):
        """Parse a category page and yield one request per item link.

        Item hrefs are taken from links inside ``td.tListDesc`` cells under
        the element with id ``_list``; each response is routed to
        ``parseItem``.
        """
        hxs = HtmlXPathSelector(response)
        links = hxs.select("//*[@id='_list']//td[@class='tListDesc']/a/@href").extract()
        for link in links:
            itemLink = urlparse.urljoin(response.url, link)
            self.log('Found item link: %s' % itemLink, log.DEBUG)
            yield Request(itemLink, callback=self.parseItem)

    def parseItem(self, response):
        """Handle an item page (body left as a placeholder in this example)."""
        ...

以上是 Scrapy start_urls 的全部内容, 来源链接: utcz.com/qa/411856.html

回到顶部