python爬虫:使用账号、密码和验证码登录知乎网页

先上代码,后分析出现的问题:
# coding: utf-8
import gzip
import http.cookiejar
import json
import re
import ssl
import time
import urllib.parse
import urllib.request
import zlib

from bs4 import BeautifulSoup

def get_opener(heads):
    """Build a cookie-aware URL opener that sends the given default headers.

    Args:
        heads: Mapping of HTTP header name to header value.

    Returns:
        A ``urllib.request.OpenerDirector`` with an ``HTTPCookieProcessor``
        backed by a fresh ``CookieJar``; its ``addheaders`` is the list of
        ``(name, value)`` pairs taken from ``heads``.
    """
    cj = http.cookiejar.CookieJar()
    pro = urllib.request.HTTPCookieProcessor(cj)
    opener = urllib.request.build_opener(pro)
    # addheaders expects a list of (name, value) tuples, which is exactly
    # what dict.items() yields.
    opener.addheaders = list(heads.items())
    return opener

def ungzip(data):
    """Return ``data`` gzip-decompressed, or unchanged if it is not gzip.

    Args:
        data: Raw response body as bytes.

    Returns:
        The decompressed bytes when ``data`` is a valid gzip stream;
        otherwise ``data`` itself, untouched.
    """
    print("正在解压....")
    try:
        data = gzip.decompress(data)
    except (OSError, EOFError, zlib.error):
        # Not gzip (bad magic -> OSError/BadGzipFile), truncated (EOFError)
        # or corrupt deflate payload (zlib.error): keep the body as-is.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        print("无需解压")
    else:
        print("解压完成")
    return data

if __name__ == "__main__":
    # Log in to zhihu.com with phone number + password + manually typed
    # captcha, printing the server's JSON reply.

    # Turn off HTTPS certificate verification for every urllib request in
    # this process (works around CERTIFICATE_VERIFY_FAILED).
    # NOTE(review): this removes protection against man-in-the-middle
    # attacks; acceptable for a throw-away experiment only.
    ssl._create_default_https_context = ssl._create_unverified_context

    heads = {
        "Accept": "text/html, application/xhtml+xml, */*",
        "Accept-Language": "zh-CN",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0",
        "Accept-Encoding": "gzip, deflate",
        "Host": "www.zhihu.com",
        "DNT": "1",
        "Connection": "Keep-Alive",
    }
    # One opener is reused for every request so that cookies set by the
    # home page and the captcha request are sent along with the login POST.
    opener = get_opener(heads)

    # Step 1: load the home page and pull the _xsrf token out of it.
    url = "https://www.zhihu.com/"
    with opener.open(url) as op:
        data1 = ungzip(op.read()).decode("utf-8")

    soup = BeautifulSoup(data1, "html.parser")
    # The first hidden <input> on the page is used as the _xsrf token.
    # NOTE(review): presumably that input is name="_xsrf" — confirm against
    # the current page markup.
    xsrf_input = soup.find("input", {"type": "hidden"})
    if xsrf_input is None:
        raise RuntimeError("no hidden <input> (_xsrf token) found on " + url)
    _xsrf = xsrf_input.get("value")

    # NOTE(review): credentials are hard-coded in the source; load them from
    # the environment or prompt for them before using this for real.
    password = "hzc19911005"
    phone_num = "13267243809"

    # Step 2: download a fresh captcha image (millisecond timestamp as the
    # "r" query parameter) and let the user read it from 1.gif.
    captcha_url = "https://www.zhihu.com/captcha.gif?r=%d&type=login" % (time.time() * 1000)
    with opener.open(captcha_url) as captcha_resp:
        captchadata = captcha_resp.read()
    with open("1.gif", "wb") as file:
        file.write(captchadata)
    yanzhengma = input("captcha:")

    # Step 3: post the login form.
    postdata = {
        "_xsrf": _xsrf,
        "password": password,
        # Do NOT send a "captcha_type" field; the login fails with it.
        "phone_num": phone_num,
        "captcha": yanzhengma,
    }
    postdata = urllib.parse.urlencode(postdata).encode()
    login_url = "https://www.zhihu.com/login/phone_num"
    with opener.open(login_url, postdata) as op2:
        login_data = op2.read()
    data = ungzip(login_data).decode("utf-8")
    print(data)

    # The reply is JSON. Parse it as JSON instead of eval(): eval() would
    # execute whatever the server sends and also fails on true/false/null.
    result = json.loads(data)
    if result["r"] == 0:
        print("登录成功")

1、出现“SSLError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)”:
Python 2.7.9 及之后的版本引入了一个新特性:
当你用 urllib.urlopen 打开一个 https 链接时,会验证一次 SSL 证书;
当目标使用的是自签名的证书时,就会抛出一个
urllib.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:581)> 的错误消息。
处理方法(注意:这会全局关闭证书校验,仅建议在调试时使用):
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
2、出现验证码错误,服务器返回验证码过期(会话无效)的提示:{ "r": 1, "errcode": 1991829, "data": {"captcha":"验证码回话无效 :(","name":"ERR_VERIFY_CAPTCHA_SESSION_INVALID"}, "msg": "验证码回话无效 :(" }。可能的原因和解决办法:
- 发给服务器的 POST 数据没有带验证码字段 "captcha"。解决办法:在 postdata 中加上该字段:
postdata={
 "_xsrf":_xsrf,
 "password":password,
 #"captcha_type":captcha_type,#不能带有这个字段
 "phone_num":phone_num,
 "captcha":yanzhengma
 }
- 验证码过期。解决办法:先用同一个 opener(这样验证码请求和登录请求共用同一份 cookie)从 url="https://www.zhihu.com/captcha.gif?r=%d&type=login"% (time.time() * 1000) 下载图片保存在本地,然后人工识别,手动输入验证码:
captcha_url="https://www.zhihu.com/captcha.gif?r=%d&type=login"% (time.time() * 1000)
captchadata=opener.open(captcha_url).read()
with open("1.gif",'wb') as file:
    file.write(captchadata)
yanzhengma=input("captcha:")
以上是 python爬虫:使用账号、密码和验证码登录知乎网页 的全部内容, 来源链接: utcz.com/z/386564.html





