python爬虫:使用账号、密码和验证码登录知乎网页

python

先上代码,后分析出现的问题:

 1 #coding:utf-8

2 import re

3 from bs4 import BeautifulSoup

4 import gzip

5 import urllib.request

6 import urllib.parse

7 import http.cookiejar

8 import ssl

9 import time

10

def get_opener(heads):
    """Build a urllib opener that keeps cookies and sends fixed headers.

    heads: dict mapping HTTP header name -> value; every header is
    attached to each request made through the returned opener.
    Returns an OpenerDirector whose cookie jar persists the Zhihu session.
    """
    jar = http.cookiejar.CookieJar()
    cookie_processor = urllib.request.HTTPCookieProcessor(jar)
    opener = urllib.request.build_opener(cookie_processor)
    # addheaders expects a list of (name, value) tuples.
    opener.addheaders = list(heads.items())
    return opener

20

def ungzip(data):
    """Return *data* gzip-decompressed, or unchanged if it is not gzipped.

    The server may answer with or without Content-Encoding: gzip, so a
    decompression failure simply means the payload was already plain bytes.
    """
    try:
        print("正在解压....")
        data = gzip.decompress(data)
        print("解压完成")
    except (OSError, EOFError):
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/MemoryError and hid real bugs.
        # gzip.BadGzipFile (non-gzip input) is an OSError subclass;
        # EOFError covers a truncated stream.
        print("无需解压")
    return data

29

if __name__ == "__main__":
    import json  # used below to parse the login response safely

    # Disable TLS certificate verification (Python >= 2.7.9 / 3.4 verifies
    # by default and the target chain fails validation here).
    # NOTE(review): this turns off ALL certificate checks — tolerable for a
    # throwaway scraper only, never for anything handling sensitive data.
    ssl._create_default_https_context = ssl._create_unverified_context

    heads = {
        "Accept": "text/html, application/xhtml+xml, */*",
        "Accept-Language": "zh-CN",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "DNT": "1",
        "Connection": "Keep-Alive",
    }
    opener = get_opener(heads)

    # Step 1: fetch the home page and extract the hidden _xsrf CSRF token
    # that must accompany the login POST.
    url = "https://www.zhihu.com/"
    op = opener.open(url)
    data1 = ungzip(op.read()).decode('utf-8')
    soup = BeautifulSoup(data1, "html.parser")
    _xsrf = soup.find("input", {'type': 'hidden'}).get("value")

    # SECURITY: credentials hard-coded in source — move them to environment
    # variables or prompt with input()/getpass before sharing this script.
    password = "hzc19911005"
    phone_num = "13267243809"

    # Step 2: download the captcha through the SAME opener (same cookie
    # session, otherwise the captcha is rejected as expired); the millisecond
    # timestamp in r= busts caching. A human reads the image and types it in.
    captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
    captchadata = opener.open(captcha_url).read()
    with open("1.gif", 'wb') as file:
        file.write(captchadata)
    yanzhengma = input("captcha:")

    # Step 3: POST the login form. Per the article, a "captcha_type" field
    # must NOT be included or the server rejects the request.
    postdata = {
        "_xsrf": _xsrf,
        "password": password,
        "phone_num": phone_num,
        "captcha": yanzhengma,
    }
    postdata = urllib.parse.urlencode(postdata).encode()
    login_url = "https://www.zhihu.com/login/phone_num"
    op2 = opener.open(login_url, postdata)
    data = ungzip(op2.read()).decode("utf-8")
    print(data)

    # BUG FIX: the response body is JSON ({"r": 0, ...}); parse it with
    # json.loads instead of dict(eval(data)), which would execute arbitrary
    # server-supplied text as Python code.
    result = json.loads(data)
    if result["r"] == 0:
        print("登录成功")

1、出现“SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)”:

Python 2.7.9 之后版本引入了一个新特性

 

当你urllib.urlopen一个 https 的时候会验证一次 SSL 证书 

 

当目标使用的是自签名的证书时就会爆出一个

 

urllib.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)> 的错误消息,

 

处理方法:

 

import ssl 

ssl._create_default_https_context = ssl._create_unverified_context

 

2、出现验证码错误,返回: 验证码过期:{ "r": 1, "errcode": 1991829, "data": {"captcha":"验证码回话无效 :(","name":"ERR_VERIFY_CAPTCHA_SESSION_INVALID"}, "msg": "验证码回话无效 :(" }:

  1. 发给服务器的post数据没有带验证码:"captcha",解决办法:postdata={
            "_xsrf":_xsrf,
            "password":password,
            #"captcha_type":captcha_type,#不能带有这个字段
            "phone_num":phone_num,
            "captcha":yanzhengma
            }
  2. 验证码过期,解决办法:先从url="https://www.zhihu.com/captcha.gif?r=%d&type=login"% (time.time() * 1000)下载图片保存在本地,然后人工识别,手动输入验证码

1 captcha_url="https://www.zhihu.com/captcha.gif?r=%d&type=login"% (time.time() * 1000)

2 captchadata=opener.open(captcha_url).read()

3 with open("1.gif",'wb') as file:

4 file.write(captchadata)

5 yanzhengma=input("captcha:")

 

 

 

 

以上是 python爬虫:使用账号、密码和验证码登录知乎网页 的全部内容, 来源链接: utcz.com/z/386564.html

回到顶部