Python crawler: error caused by NetEase Cloud Music's comment encryption?
I've recently been learning how to scrape the comments on NetEase Cloud Music songs and have run into a problem I'd like to ask for help with.
Taking this song as an example, the comments are clearly fetched through an encrypted ajax request.
After reading through some code written by others, I roughly imitated it, but ran into problems. Since SegmentFault's pre tag limits the length, pasting the code in one piece would hurt readability, so I'll paste it in blocks. The environment is Python 3; running it under Python 2 may cause character-encoding issues.
The NetEase Cloud Music comments to be scraped
Code
encrypt1.py, the module that handles the encryption
from Crypto.Cipher import AES
import codecs
import os
import base64
import json
import requests

def aesEncrypt(text, secKey):
    # Pad the plaintext to a multiple of 16 bytes, AES-CBC encrypt it, then base64-encode
    pad = 16 - len(text) % 16
    text = text + pad * chr(pad)
    encryptor = AES.new(secKey, AES.MODE_CBC, '0102030405060708')
    ciphertext = encryptor.encrypt(text)
    ciphertext = base64.b64encode(ciphertext)
    return str(ciphertext)

def rsaEncrypt(text, pubKey, modulus):
    # Textbook RSA on the reversed text: (text ** e) mod n, returned as a zero-padded hex string
    text = text[::-1]
    rs = int(codecs.encode(text.encode('utf-8'), 'hex'), 16) ** int(pubKey, 16) % int(modulus, 16)
    return format(rs, 'x').zfill(256)

def createSecretKey(size):
    # Random 16-character hex string used as the per-request AES key
    return (''.join(map(lambda xx: (hex(xx)[2:]), os.urandom(size))))[0:16]
encrypt2.py, the second encryption-handling module; it ties the three functions above together into the overall routine, and this is where the problem shows up: print(len(req.json())) outputs 0. Note: username and password must be filled in for the program to run.
from encrypt1 import *
import json
import requests

def readEver(songId):
    # Ajax comment endpoint for one song
    url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(songId) + '/?csrf_token='
    headers = {'Cookie': 'appver=1.5.0.75771;', 'Referer': 'http://music.163.com/'}
    text = {'username': '', 'password': '', 'rememberLogin': 'true'}
    text = json.dumps(text)
    nonce = '0CoJUm6Qyw8W8jud'
    secKey = createSecretKey(16)
    encText = aesEncrypt(aesEncrypt(text, nonce), secKey)  # params
    pubKey = '010001'
    modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
    encSecKey = rsaEncrypt(secKey, pubKey, modulus)  # encSecKey
    data = {'params': encText, 'encSecKey': encSecKey}
    req = requests.post(url, headers=headers, data=data)
    print(len(req.json()))  # this prints 0
    total = req.json()['total']
    if int(total) > 10000:
        print(songId, total)
    else:
        pass
The main crawler program
from bs4 import BeautifulSoup
from pprint import pprint
from encrypt1 import *
from encrypt2 import *
import requests

Default_Header = {
    'Referer': 'http://music.163.com/',
    'Host': 'music.163.com',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0 Iceweasel/38.3.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate'}

BASE_URL = 'http://music.163.com'
_session = requests.session()
_session.headers.update(Default_Header)

def getPage(pageIndex):
    # Fetch one page of the hot-playlist listing and walk every playlist on it
    pageUrl = 'http://music.163.com/discover/playlist/?order=hot&cat=全部&limit=35&offset=' + pageIndex
    soup = BeautifulSoup(_session.get(pageUrl).content)
    songList = soup.findAll('a', attrs={'class': 'tit f-thide s-fc0'})
    for i in songList:
        print(i['href'])
        getPlayList(i['href'])

def getPlayList(playListId):
    # Fetch a playlist page and query the comment count of every song in it
    playListUrl = BASE_URL + playListId
    soup = BeautifulSoup(_session.get(playListUrl).content)
    songList = soup.find('ul', attrs={'class': 'f-hide'})
    for i in songList.findAll('li'):
        startIndex = (i.find('a'))['href']
        songId = startIndex.split('=')[1]
        readEver(songId)

if __name__ == '__main__':
    for i in range(1, 43):
        getPage(str(i*35))
Error message
As mentioned above, the error happens because req.json() comes back with length 0, so req.json()['total'] raises an exception (see the debugging sketch after the traceback).
Traceback (most recent call last):
  File "wangyi.py", line 40, in <module>
    getPage(str(i*35))
  File "wangyi.py", line 24, in getPage
    getPlayList(i['href'])
  File "wangyi.py", line 33, in getPlayList
    readEver(songId)
  File "/home/testenv/encrypt.py", line 40, in readEver
    total = req.json()['total']
  File "/home/testenv/.local/lib/python3.5/site-packages/requests/models.py", line 850, in json
    return complexjson.loads(self.text, **kwargs)
  File "/usr/lib/python3.5/json/__init__.py", line 319, in loads
    return _default_decoder.decode(s)
  File "/usr/lib/python3.5/json/decoder.py", line 339, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/usr/lib/python3.5/json/decoder.py", line 357, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
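One way to narrow this down before touching the encryption code is to look at the raw response instead of calling .json() straight away. Below is a minimal debugging sketch; debugPost is a hypothetical helper meant to stand in for the requests.post / req.json() lines inside readEver, and it only reads attributes that a requests.Response actually has.

import requests

def debugPost(url, headers, data):
    # Post the encrypted form data and show what actually comes back,
    # instead of parsing the body as JSON blindly.
    req = requests.post(url, headers=headers, data=data)
    print(req.status_code)        # a non-200 status usually means the request was rejected
    print(repr(req.text[:200]))   # an empty body or an HTML error page explains the JSONDecodeError
    if req.text.strip():
        return req.json()         # only parse when the body is non-empty
    return None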
Problem description
It's clearly something going wrong in the encryption routine. How should I go about fixing this?
Answer:
I wrote a comment crawler a while ago and it didn't seem to involve any encryption; has NetEase changed its approach? You can take a look at what I wrote for reference; the link is here.
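The crawler referred to in this answer is not reproduced on the page. Purely as an illustration of the kind of unencrypted request it might have used, here is a minimal sketch; the /api/v1/resource/comments path and the example song id are assumptions based on older write-ups and may no longer be accepted by the server.

import requests

# Assumed legacy, unencrypted comment endpoint; not guaranteed to still work.
songId = 26584163  # assumed example song id
url = 'http://music.163.com/api/v1/resource/comments/R_SO_4_{}?limit=20&offset=0'.format(songId)
headers = {'Referer': 'http://music.163.com/'}
resp = requests.get(url, headers=headers)
print(resp.status_code)
print(resp.text[:200])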
Answer:
If the requests are encrypted and you still want to do this in Python with plain HTTP requests, then you have to analyze its JavaScript encryption library; since the front end can display the data, the front end must be handling the decryption.
If you don't want to go to that much trouble, you can let a browser load the page first and then grab the content from it, which is more convenient.
You can use selenium to fetch the comments; a rough sketch follows.
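A minimal sketch of that approach, assuming selenium and a matching browser driver are installed; the song id, the iframe name 'g_iframe', and the 'itm' class used for a single comment are all assumptions and may need adjusting against the live page.

from selenium import webdriver
import time

driver = webdriver.Firefox()
driver.get('http://music.163.com/#/song?id=26584163')  # assumed example song page
time.sleep(5)  # crude wait for the page and the ajax-loaded comments

# The song page renders its real content inside an iframe, commonly named 'g_iframe'.
driver.switch_to.frame('g_iframe')

# 'itm' is an assumed class name for one comment block.
for item in driver.find_elements_by_class_name('itm'):
    print(item.text)

driver.quit()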