Python web crawler?

I'm trying to crawl the 健康界 (cn-healthcare.com) site with the spider below, but it just keeps running and never returns anything.

# coding: utf-8
'''
健康界 (cn-healthcare) news spider
'''
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http import Request
from scrapy.selector import Selector
from scrapy import log
from Scrapy_demo.items import *
import requests
import time


class HealthSpider(CrawlSpider):

    name = 'health'
    allowed_domains = ['www.cn-healthcare.com/']
    download_delay = 1
    start_urls = ['http://www.cn-healthcare.com/', ]

    def parse(self, response):
        # collect the article links shown on the index page
        response_selector = Selector(response)
        index_url = response_selector.css("div.indextitle-text").xpath('a/@href').extract()

        for detail_link in index_url:
            print 'detail_link'
            print detail_link
            if detail_link:
                try:
                    yield Request(url=detail_link, callback=self.parse_items)
                except:
                    yield Request(url=detail_link, callback=self.parse_items)
                    log.msg("Page " + detail_link + " parse ERROR, try again !", level=log.WARNING)

        # millisecond timestamp used as the ?_= cache-busting parameter
        mytime = str(time.time())
        result = mytime[:10] + mytime[11:14]
        first_url = 'http://www.cn-healthcare.com/api/column//kxw/{page_index}?_=%s' % result

        # page through the column API and queue every article it returns
        for index in range(100):
            try:
                next_url = first_url.replace('{page_index}', str(index))
                print next_url
                r = requests.get(next_url).json()  # dict parsed from the JSON response
                print type(r.get('data'))
                print (r.get('data'))[0]['url']
                for j in range(len(r.get('data'))):
                    obj = 'http://www.cn-healthcare.com/' + r.get('data')[j]['url']
                    yield Request(url=obj, callback=self.parse_items)
            except TypeError:
                print 'error'

    def parse_items(self, response):
        # placeholder item extraction, just to confirm the callback runs
        item = HealthItem()
        sel = Selector(response)
        item['title'] = "ok"
        print 'Spider is work'
        yield item
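
For debugging the "nothing comes back" symptom, it can help to call the column API outside of Scrapy first. The sketch below is not part of the original post: it rebuilds the same request the spider constructs (a millisecond timestamp as the ?_= cache-buster plus a page index in the path) and prints the article URLs found under the 'data' key. The endpoint and JSON layout are assumptions taken from the question's code and may have changed on the live site.

# Standalone check of the column API used in parse(); the endpoint and the
# 'data'/'url' fields are assumptions taken from the question, not verified.
import time
import requests

def fetch_column_page(page_index):
    # int(time.time() * 1000) is the millisecond timestamp that the original
    # string slicing (mytime[:10] + mytime[11:14]) approximates
    cache_buster = int(time.time() * 1000)
    url = 'http://www.cn-healthcare.com/api/column//kxw/%d?_=%d' % (page_index, cache_buster)
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.json()

if __name__ == '__main__':
    payload = fetch_column_page(0)
    articles = payload.get('data') or []          # 'data' may be missing or null
    for entry in articles:
        # the spider prefixes the site root, so 'url' is treated as a relative path
        print('http://www.cn-healthcare.com/' + entry['url'])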


Answer:

Try:

r.json().get('count')

Answer:

Indexing like result['count'] raises an error when the key doesn't exist, and the data you crawled probably has no 'count' key. Use .get(key, default) to set a default value, so that when the data can't be fetched the default is returned instead.
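
To make that concrete, here is a minimal sketch (not from the original thread) of .get(key, default) versus plain indexing on the parsed JSON dict; the 'count' and 'data' key names come from the answers and the question's code, and the payload below is invented for illustration.

# .get(key, default) never raises; [] indexing raises KeyError for a missing key.
result = {'data': [{'url': '...'}]}        # example payload with no 'count' key

print(result.get('count', 0))              # -> 0, the supplied default
print(result.get('data', []))              # -> the article list

try:
    print(result['count'])                 # plain indexing on a missing key
except KeyError:
    print('no count field in this response')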

