spider.2 Crawler Basics
import urllib.request

# 1. Create the request object (Request())
url = "http://..."
# 1.1 Keep several User-Agent strings and pick one at random for each visit:
# agent_list = [agent1, agent2, agent3, agent4, agent5]
# agent = random.choice(agent_list)
headers = {
    "User-Agent": "",  # disguise the client to get past simple anti-crawler checks; with 1.1 use "User-Agent": agent
    "Cookie": "",      # a Cookie can simulate a logged-in session
}
# 1.2 Create the custom request object
req = urllib.request.Request(url, headers=headers)
# 2. Get the response object (urlopen())
res = urllib.request.urlopen(req)
# 3. Read the content (read().decode("utf-8"))
html = res.read().decode("utf-8")
# decode(): bytes -> string
# encode(): string -> bytes
# Steps 2 and 3 can be combined:
# html = urllib.request.urlopen(req).read().decode("utf-8")
print(html)
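The commented lines in step 1.1 only hint at rotating User-Agent strings. Below is a minimal, self-contained sketch of that idea; the agent strings and the URL are placeholders, not values from the original.

import random
import urllib.request

# Hypothetical pool of User-Agent strings; replace them with real browser values.
agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/79.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) Firefox/80.0",
]
agent = random.choice(agent_list)  # pick a different identity for each request
req = urllib.request.Request("http://www.example.com", headers={"User-Agent": agent})
html = urllib.request.urlopen(req).read().decode("utf-8")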
I. Basic steps of a Python crawler
from urllib import request

# 1. Build a handler object (an object dedicated to processing requests)
http_handler = request.HTTPHandler()
# 2. Create a custom opener
opener = request.build_opener(http_handler)
# 3. Create a custom request object
req = request.Request("http://www.baidu.com")
# 4.1 Send the request and get the response
# response = opener.open(req).read()
# 4.2 Or install the custom opener globally, so requests sent with urlopen() also use it
request.install_opener(opener)
response = request.urlopen(req).read()
print(response)
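A custom opener pays off once the handler does something non-default. As a small sketch (not part of the original), HTTPHandler accepts a debuglevel argument that makes urllib print the raw HTTP exchange, which helps when debugging a crawler:

from urllib import request

# Assumption for illustration: debuglevel=1 echoes the outgoing request and the
# response headers to stdout.
debug_handler = request.HTTPHandler(debuglevel=1)
opener = request.build_opener(debug_handler)
request.install_opener(opener)
html = request.urlopen("http://www.baidu.com").read().decode("utf-8")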
II. Custom opener
from urllib import request
import urllib.parse

# 1. Read the keyword from the terminal
key = input("Enter the search term: ")
wd = {"wd": key}  # dict of query parameters
url = "http://www.baidu.com/s?"
# 2. URL-encode the parameters with urlencode()
wdd = urllib.parse.urlencode(wd)
# 3. Append the encoded query string to the URL
url = url + wdd
# 4. Create the request object
req = request.Request(url)
# 5. Get the response
response = request.urlopen(req).read().decode()
print(response)
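urlencode() joins the key=value pairs of a dict with & and percent-encodes anything unsafe; for a single value, urllib.parse.quote() does the same job. A small sketch with made-up query values:

import urllib.parse

params = {"wd": "python 爬虫", "pn": 10}      # hypothetical Baidu query parameters
query = urllib.parse.urlencode(params)        # 'wd=python+%E7%88%AC%E8%99%AB&pn=10'
single = urllib.parse.quote("python 爬虫")    # 'python%20%E7%88%AC%E8%99%AB'
print("http://www.baidu.com/s?" + query)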
III. Handling GET requests with urlencode encoding
import re
import urllib.parse
from urllib import request

# 1. Build the request headers
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/79.0.3928.4 Safari/537.36"
}
url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
# 2. Read the user's input from the terminal
key = input("Enter the content to search: ")
formdata = {
    "i": key,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16003477829589",
    "sign": "3f351e5f7e0d84706ef063ccabe3e169",
    "lts": "1600347782958",
    "bv": "cb9a601990a9118249221b303a87fd75",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME",
}
# 3. Convert the form data to bytes
data = urllib.parse.urlencode(formdata).encode(encoding="utf-8")
# 4. Send the request, get the response, and read the content
req = request.Request(url, data=data, headers=header)
resp = request.urlopen(req).read().decode()
# 5. Regular expression: extract whatever sits between "tgt":" and "}]]}
pat = r'"tgt":"(.*?)"}]]}'
result = re.findall(pat, resp)
print(result[0])
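Because the form asks for doctype=json, the reply can also be parsed with the json module instead of a regular expression. This is a sketch under the assumption that the body looks like {"translateResult": [[{"src": ..., "tgt": ...}]]}; the exact field layout is not guaranteed by the original.

import json

# resp is the decoded response body from step 4 above.
resp_dict = json.loads(resp)
translation = resp_dict["translateResult"][0][0]["tgt"]  # assumed key layout
print(translation)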
IV. Handling POST requests: Youdao Translate
from urllib import request

list1 = ["http://www.baidu.com", "http://www.baidu.com",
         "http://www.baidu25234234235454254243.com",  # deliberately unreachable
         "http://www.baidu.com", "http://www.baidu.com"]
i = 0
for url in list1:
    i += 1
    try:
        request.urlopen(url)
    except Exception as e:
        print(e)
    print("Request", i, "finished")
Exception handling
base_url = ("https://movie.douban.com/j/chart/top_list?"
            "type=11&interval_id=100%3A90&action=&start={}&limit=20")
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/79.0.3928.4 Safari/537.36"
}
i = 0
while True:
    url = base_url.format(i * 20)
    # str.format() example: "Site: {name}, address {url}".format(name="菜鸟教程", url="www.runoob.com")
    req = request.Request(url, headers=header)
    res = request.urlopen(req).read().decode()
    print(res)
    if res == "" or res == "[]":  # an empty body or empty JSON array means no more pages
        break
    i += 1
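Each page of this AJAX endpoint comes back as JSON, so the body can be decoded instead of printed raw. The sketch below reuses base_url and header from the loop above; the field names title and score and the one-second pause are assumptions, not taken from the original.

import json
import time
from urllib import request

i = 0
while True:
    page_req = request.Request(base_url.format(i * 20), headers=header)
    page = request.urlopen(page_req).read().decode()
    movies = json.loads(page)      # the endpoint is assumed to return a JSON array
    if not movies:                 # an empty array means the last page was reached
        break
    for m in movies:
        print(m.get("title"), m.get("score"))  # assumed field names
    time.sleep(1)                  # pause between requests (polite-crawling assumption)
    i += 1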
V. Using AJAX requests
import ssl
from urllib import request

url = "https://www.12306.cn/mormhweb/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/79.0.3928.4 Safari/537.36"
}
req = request.Request(url,headers=header)
# Ignore certificate verification
context = ssl._create_unverified_context()
res = request.urlopen(req,context=context).read().decode()
print(res)
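ssl._create_unverified_context() is a private helper. The same effect is available through the public ssl API by building a default context and switching verification off; a sketch, not part of the original:

import ssl
from urllib import request

context = ssl.create_default_context()
context.check_hostname = False       # stop matching the certificate against the hostname
context.verify_mode = ssl.CERT_NONE  # stop verifying the certificate chain entirely

res = request.urlopen("https://www.12306.cn/mormhweb/", context=context).read().decode()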
VI. Using HTTPS requests
url = "https://www.qiushibaike.com/text/"header
= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64;"" x64; rv:80.0) Gecko/20100101 "
"Firefox/80.0"
}
# 构造请求
res = requests.get(url,headers=header)
info = res.text
infos = re.findall(r"<div class="content">s*<span>s*(.+)s*</span>",info)
for info in infos:
with open("duanzi.txt","a",encoding="utf-8") as f:
f.write(info + "
")
print(infos)
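With requests it is worth checking the status code (and, if needed, forcing the encoding) before running a regular expression over .text. A minimal sketch, not part of the original; the UTF-8 assumption may not match the real page:

import requests

res = requests.get("https://www.qiushibaike.com/text/",
                   headers={"User-Agent": "Mozilla/5.0"})
if res.status_code == 200:      # only parse successful responses
    res.encoding = "utf-8"      # assumption: the page is UTF-8 encoded
    print(len(res.text), "characters downloaded")
else:
    print("Request failed with status", res.status_code)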
VII. Qiushibaike example
That is all of spider.2 Crawler Basics. Source: utcz.com/z/530908.html