Python爬虫关于网易云音乐的评论加密方式的报错?

最近正在学习抓取网易云歌曲的评论,遇到了一点问题,想求助一下各位网友。

以这首歌为例子,很显然网易云音乐的ajax评论是加密的。
很明显网易云的ajax是要加密的

通过一些查看前辈们的代码,我大致模仿了一下。可是遇到了一些问题。由于segmentfault的pre标签限制了长度,代码直接贴出来可读性不好,所以我分块贴一下。环境是python3,使用python2可能会出现字符编码问题。

抓取的网易云音乐的评论


代码

encrypt1.py,用于处理加密信息的模块

from Crypto.Cipher import AES

import codecs

import os

import base64

import json

import requests

def aesEncrypt(text, secKey):

pad = 16 - len(text) % 16

text = text + pad * chr(pad)

encryptor = AES.new(secKey, 2, '0102030405060708')

ciphertext = encryptor.encrypt(text)

ciphertext = base64.b64encode(ciphertext)

return str(ciphertext)

def rsaEncrypt(text, pubKey, modulus):

text = text[::-1]

rs = int(codecs.encode(text.encode('utf-8'), 'hex'), 16)**int(pubKey, 16) % int(modulus, 16)

return format(rs, 'x').zfill(256)

def createSecretKey(size):

return (''.join(map(lambda xx: (hex(xx)[2:]), os.urandom(size))))[0:16]

encrypt2.py,用于处理加密信息的模块2,就是结合上面的三个方法总处理程序,问题就是处在了这里。print(len(req.json())的输出结果是0。注释:要填写username和password程序才能运行

from encrypt1 import *

import json

import requests

def readEver(songId):

url = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_' + str(songId) + '/?csrf_token=' # commment by ajax

headers = {'Cookie': 'appver=1.5.0.75771;', 'Referer': 'http://music.163.com/'}

text = {'username': '', 'password': '', 'rememberLogin': 'true'}

text = json.dumps(text)

nonce = '0CoJUm6Qyw8W8jud'

secKey = createSecretKey(16)

encText = aesEncrypt(aesEncrypt(text, nonce), secKey) # params

pubKey = '010001'

modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'

encSecKey = rsaEncrypt(secKey, pubKey, modulus) # enSecKey

data = {'params': encText, 'encSecKey': encSecKey}

req = requests.post(url, headers=headers, data=data)

print(len(req.json()) # 输出结果是0

total = req.json()['total']

if int(total) > 10000:

print(songId, total)

else:

pass

爬虫主程序

from bs4 import BeautifulSoup

from pprint import pprint

from encrypt1 import *

from encrypt2 import *

import requests

Default_Header = {

'Referer':'http://music.163.com/',

'Host':'music.163.com',

'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0 Iceweasel/38.3.0',

'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

'Accept-Encoding':'gzip, deflate'}

BASE_URL = 'http://music.163.com'

_session = requests.session()

_session.headers.update(Default_Header)

def getPage(pageIndex):

pageUrl = 'http://music.163.com/discover/playlist/?order=hot&cat=全部&limit=35&offset=' + pageIndex

soup = BeautifulSoup(_session.get(pageUrl).content)

songList = soup.findAll('a', attrs={'class':'tit f-thide s-fc0'})

for i in songList:

print(i['href'])

getPlayList(i['href'])

def getPlayList(playListId):

playListUrl = BASE_URL + playListId

soup = BeautifulSoup(_session.get(playListUrl).content)

songList = soup.find('ul', attrs={'class': 'f-hide'})

for i in songList.findAll('li'):

startIndex = (i.find('a'))['href']

songId = startIndex.split('=')[1]

readEver(songId)

if __name__=='__main__':

for i in range(1,43):

getPage(str(i*35))


报错信息

正如前面所言,出错的原因是req.json返回的长度为0,所以req.json()['total']会报错。

Traceback (most recent call last):

File "wangyi.py", line 40, in <module>

getPage(str(i*35))

File "wangyi.py", line 24, in getPage

getPlayList(i['href'])

File "wangyi.py", line 33, in getPlayList

readEver(songId)

File "/home/testenv/encrypt.py", line 40, in readEver

total = req.json()['total']

File "/home/testenv/.local/lib/python3.5/site-packages/requests/models.py", line 850, in json

return complexjson.loads(self.text, **kwargs)

File "/usr/lib/python3.5/json/__init__.py", line 319, in loads

return _default_decoder.decode(s)

File "/usr/lib/python3.5/json/decoder.py", line 339, in decode

obj, end = self.raw_decode(s, idx=_w(s, 0).end())

File "/usr/lib/python3.5/json/decoder.py", line 357, in raw_decode

raise JSONDecodeError("Expecting value", s, err.value) from None

json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)


问题描述

很明显应该是加密程序出现了问题,请问各位网友我应该怎么处理这个问题?

回答:

我之前写过一个爬评论的爬虫,貌似没有什么加密啊,难道是网易的策略改了?你可以参考一下我写的,地址在这

回答:

如果加密,还是只想用python通过请求来做,那么就必须分析它的JS加密库,既然前台可以显示,肯定前台就做了解密。

不想那么麻烦,那么可以利用浏览器加载之后在获取内容,这样不更方便么。
可以使用selenium来达到获取评论的目的

以上是 Python爬虫关于网易云音乐的评论加密方式的报错? 的全部内容, 来源链接: utcz.com/a/165298.html

回到顶部