【Python】使用scrapy爬取新浪微博，无法进入回调的parse_item函数，怎么回事呢？

Z时代
2024-01-10
分类：技术分享

为什么会无法进入parse_item函数呢？自己将weibo.com的url全部换成csdn之后就可以了，甚至用的还是weibo的cookie。自己怀疑是不是因为微博重定向的原因，代码如下：

import scrapy
import re
from scrapy.selector import Selector
from scrapy.http import Request
from tutorial.items import DmozItem
from string import maketrans
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
def extractData(regex, content, index=1):
    """Return one capture group of the first match of *regex* in *content*.

    Args:
        regex: Regular-expression pattern to search for.
        content: Text to search.
        index: Capture group to return (defaults to the first group).

    Returns:
        The text of group *index* from the first match, or the string
        ``'0'`` when the pattern does not occur in *content*.
    """
    match = re.search(regex, content)
    if match:
        return match.group(index)
    # Callers get the sentinel string '0' rather than None on a miss.
    return '0'
class DmozSpider(CrawlSpider):
    """Crawl weibo.com from a single profile page and extract the user name.

    The spider issues one authenticated start request (cookies + browser-like
    headers) and follows every link that ``LinkExtractor`` finds, handing
    each followed page to ``parse_item``.
    """

    name = "dmoz"
    allowed_domains = ["weibo.com"]
    download_delay = 2

    # The rule callback only runs for links LinkExtractor can find in the
    # fetched HTML; it is never invoked for the start response itself.
    # NOTE(review): the regex in parse_item suggests the markup arrives as an
    # escaped string inside a script, in which case few or no <a href> links
    # may be extractable -- verify against a real response.
    rules = [
        Rule(LinkExtractor(allow='/'), callback='parse_item', follow=True)
    ]

    headers = {
        "Accept": "*/*",
        # Advertise only encodings Scrapy decodes out of the box; offering
        # "sdch, br" can yield a compressed body that is never decoded.
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Connection": "keep-alive",
        # "Host": "login.sina.com.cn",
        "Referer": "http://weibo.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36"
    }

    # Session cookies copied from a logged-in browser (placeholder values).
    cookies = {
        'ALF': '我的cookie',
        'Apache': '我的cookie',
        'SCF': '我的cookie',
        'SINAGLOBAL': '我的cookie',
        'SSOLoginState': '我的cookie',
        'SUB': '我的cookie',
        'SUBP': '我的cookie',
        'SUHB': '我的cookie',
        'TC-Page-G0': '我的cookie',
        'TC-Ugrow-G0': '我的cookie',
        'TC-V5-G0': '我的cookie',
        'ULV': '我的cookie',
        'UOR': '我的cookie',
        'WBStorage': '我的cookie',
        'YF-Page-G0': '我的cookie',
        'YF-Ugrow-G0': '我的cookie',
        'YF-V5-G0': '我的cookie',
        '_s_tentry': '-',
        'log_sid_t': '我的cookie',
        'un': '我的cookie',
    }

    def start_requests(self):
        """Return the single authenticated request that seeds the crawl."""
        return [
            Request(
                "http://weibo.com/u/2010226570?refer_flag=1001030101_&is_all=1",
                cookies=self.cookies,
                headers=self.headers,
            )
        ]

    def parse_start_url(self, response, **kwargs):
        """Route the start response through ``parse_item``.

        CrawlSpider passes start-URL responses to this hook, whose default
        implementation produces nothing; without this override the profile
        page requested in ``start_requests`` would never reach ``parse_item``.
        """
        return self.parse_item(response)

    def parse_item(self, response):
        """Extract the user name from *response* and yield it as a DmozItem."""
        # Single-argument parenthesised print behaves the same on Python 2
        # and also parses on Python 3.
        print("comehere!")
        # Non-greedy group: stop at the first closing <\/h1> instead of the
        # last one on the line.
        regexID = r'class=\\"username\\">(.*?)\<\\/h1>'
        # NOTE(review): on Python 3 response.body is bytes and would not be
        # searchable with a str pattern -- confirm the target interpreter.
        content = response.body
        item = DmozItem()
        user_id = extractData(regexID, content, 1)
        item['ID'] = user_id
        print(user_id)
        yield item