抓取下一页

我有以下代码用于scrapy框架:

# -*- coding: utf-8 -*-

import scrapy

from scrapy.contrib.spiders import Rule

from scrapy.linkextractors import LinkExtractor

from lxml import html

class Scrapy1Spider(scrapy.Spider):
    """Spider for the sfbay.craigslist.org "npo" search listing.

    Starts from the single search URL in ``start_urls`` and prints how many
    listing rows each received page contains.
    """

    name = "scrapy1"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    # NOTE(review): this class derives from scrapy.Spider, not CrawlSpider,
    # so this `rules` tuple is presumably never consulted -- which would
    # explain why the "next" link is not followed. Confirm against the
    # Scrapy documentation for CrawlSpider.
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse", follow= True),)

    def parse(self, response):
        """Count the listing rows on ``response`` and print the count."""
        # Re-parse the response body with lxml and select the listing rows.
        site = html.fromstring(response.body_as_unicode())
        titles = site.xpath('//div[@class="content"]/p[@class="row"]')
        # Python 2 print statement: row count followed by an 'AAAA' marker.
        print len(titles), 'AAAA'

但是问题是我得到100个结果,但没有转到下一页。

回答:

你定义的 rules 没有生效,因为你继承的是 scrapy.Spider 而不是 CrawlSpider;只有 CrawlSpider 才会处理 rules。

因此,你必须手动为下一页创建请求(Request),如下所示:

# -*- coding: utf-8 -*-

import scrapy

from scrapy.contrib.spiders import Rule

from scrapy.linkextractors import LinkExtractor

from lxml import html

class Scrapy1Spider(scrapy.Spider):
    """Crawl the sfbay.craigslist.org "npo" search results page by page.

    A plain ``scrapy.Spider`` does not process ``rules`` (only
    ``CrawlSpider`` does), so pagination is done by hand: ``parse`` yields a
    request for the "next" link and Scrapy calls ``parse`` again with the
    downloaded page.
    """

    name = "craiglist"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    def parse(self, response):
        """Print the number of listing rows, then follow the "next" link.

        Args:
            response: the downloaded search-results page.

        Yields:
            scrapy.Request: a request for the next results page, when the
            page has a "next" button carrying a non-empty href.
        """
        # Use the response's own selector; no need to re-parse the body
        # with lxml.
        titles = response.xpath('//div[@class="content"]/p[@class="row"]')
        # A single pre-formatted argument prints the same text under both
        # Python 2 and Python 3.
        print('%d AAAA' % len(titles))

        # follow next page links
        next_href = response.xpath(
            './/a[@class="button next"]/@href'
        ).extract_first()
        if next_href:
            # urljoin resolves the href against the current page URL rather
            # than assuming a fixed scheme and host.
            yield scrapy.Request(
                url=response.urljoin(next_href),
                callback=self.parse,
            )

或者像下面这样改用 CrawlSpider:

# -*- coding: utf-8 -*-

import scrapy

from scrapy.spiders import CrawlSpider, Rule

from scrapy.linkextractors import LinkExtractor

from lxml import html

class Scrapy1Spider(CrawlSpider):
    """Crawl the sfbay.craigslist.org "npo" search results via CrawlSpider.

    The single rule follows the "next" button on every page and hands each
    followed page to ``parse_page``. ``parse_start_url`` sends the start page
    through the same handler, because rule callbacks only run for links that
    were followed, not for the start URLs themselves.
    """

    name = "craiglist"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )

    rules = (
        Rule(
            LinkExtractor(
                allow=(),
                restrict_xpaths=('//a[@class="button next"]',),
            ),
            callback="parse_page",
            follow=True,
        ),
    )

    def parse_start_url(self, response, **kwargs):
        """Process the first results page with the same handler as the rest.

        Extra keyword arguments are accepted and ignored so the hook works
        whether or not the framework passes any.
        """
        return self.parse_page(response)

    def parse_page(self, response):
        """Print the number of listing rows found on ``response``."""
        # Use the response's own selector; no need to re-parse the body
        # with lxml.
        titles = response.xpath('//div[@class="content"]/p[@class="row"]')
        # A single pre-formatted argument prints the same text under both
        # Python 2 and Python 3.
        print('%d AAAA' % len(titles))

以上是 抓取下一页 的全部内容, 来源链接: utcz.com/qa/423676.html

回到顶部