抓取下一页

Z时代
2024-01-10
分类：问答

我有以下代码用于scrapy框架：

# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
class Scrapy1Spider(scrapy.Spider):
    name = "scrapy1"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse", follow= True),)
    def parse(self, response):
        site = html.fromstring(response.body_as_unicode())
        titles = site.xpath('//div[@class="content"]/p[@class="row"]')
        print len(titles), 'AAAA'

但是问题是我得到100个结果，但没有转到下一页。

回答：

你rule未使用，因为你没有使用CrawlSpider。

因此，你必须requests手动创建下一页，如下所示：

# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
class Scrapy1Spider(scrapy.Spider):
    name = "craiglist"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )
    Rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse", follow= True),)
    def parse(self, response):
        site = html.fromstring(response.body_as_unicode())
        titles = site.xpath('//div[@class="content"]/p[@class="row"]')
        print len(titles), 'AAAA'
        # follow next page links
        next_page = response.xpath('.//a[@class="button next"]/@href').extract()
        if next_page:
            next_href = next_page[0]
            next_page_url = 'http://sfbay.craigslist.org' + next_href
            request = scrapy.Request(url=next_page_url)
            yield request

或者CrawlSpider像这样使用：

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
class Scrapy1Spider(CrawlSpider):
    name = "craiglist"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = (
        'http://sfbay.craigslist.org/search/npo',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_page", follow= True),)
    def parse_page(self, response):
        site = html.fromstring(response.body_as_unicode())
        titles = site.xpath('//div[@class="content"]/p[@class="row"]')
        print len(titles), 'AAAA'

以上是抓取下一页的全部内容，来源链接： utcz.com/qa/423676.html

抓取下一页

回答：

其他人也看了：