spider.2 Web Scraper Basics


import random
import urllib.request

# 1. Create a request object (Request())
url = "http://..."

# 1.1 Keep several User-Agent strings and pick one at random for each visit
# agents = [agent1, agent2, agent3, agent4, agent5]
# agent = random.choice(agents)
headers = {
    "User-Agent": "",  # disguise the client to evade anti-scraping checks  # 1.1 "User-Agent": agent,
    "Cookie": "",      # a Cookie can simulate a logged-in session
}

# 1.2 Create a custom request object
req = urllib.request.Request(url, headers=headers)

# 2. Get the response object (urlopen())
res = urllib.request.urlopen(req)

# 3. Read the content (read().decode("utf-8"))
html = res.read().decode("utf-8")
# decode(): bytes -> str
# encode(): str -> bytes

# 2-3. The two steps can be combined:
# html = urllib.request.urlopen(req).read().decode("utf-8")

print(html)
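As a quick, self-contained illustration of step 1.1, the sketch below rotates the User-Agent on every run; the two UA strings and the target URL are stand-in examples, not values from the original.

import random
import urllib.request

# Stand-in User-Agent strings; in practice, collect real ones from browsers
agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0",
]

url = "http://www.baidu.com"  # example target
headers = {"User-Agent": random.choice(agents)}  # a different UA each run

req = urllib.request.Request(url, headers=headers)
html = urllib.request.urlopen(req).read().decode("utf-8")
print(html[:200])  # first 200 characters as a sanity check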

1. Basic steps of a Python scraper

from urllib import request

# 1. Build a handler object (an object dedicated to processing requests)
http_handler = request.HTTPHandler()

# 2. Create a custom opener
opener = request.build_opener(http_handler)

# 3. Create a custom request object
req = request.Request("http://www.baidu.com")

# 4.1 Send the request and get the response
# response = opener.open(req).read()

# 4.2 Install the custom opener globally, so requests sent via urlopen()
#     also go through it
request.install_opener(opener)
response = request.urlopen(req).read()

print(response)
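Custom openers pay off once you plug in other handlers; here is a minimal sketch assuming a local proxy, with the proxy address as a placeholder rather than a working endpoint:

from urllib import request

# ProxyHandler routes traffic through a proxy; this address is a placeholder
proxy_handler = request.ProxyHandler({"http": "http://127.0.0.1:8888"})
opener = request.build_opener(proxy_handler)
request.install_opener(opener)

# From here on, urlopen() goes through the proxy:
# response = request.urlopen("http://www.baidu.com").read()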

2. Custom openers

from urllib import parse, request

# 1. Read the search term from the terminal
key = input("Enter a search term: ")
wd = {"wd": key}  # dict
url = "http://www.baidu.com/s?"

# 2. Encode the query string with urlencode()
wdd = parse.urlencode(wd)

# 3. Join the base URL and the encoded query
url = url + wdd

# 4. Create the request object
req = request.Request(url)

# 5. Get the response
response = request.urlopen(req).read().decode()

print(response)
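urlencode() is what turns non-ASCII input into percent-escapes; a quick, self-contained demonstration (the keyword here is just an example):

from urllib import parse

print(parse.urlencode({"wd": "爬虫"}))  # wd=%E7%88%AC%E8%99%AB
print(parse.quote("爬虫"))             # %E7%88%AC%E8%99%AB (single value, no key=)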

3. Handling GET requests with urlencode

import re
from urllib import parse, request

# 1. Build the request headers
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/79.0.3928.4 Safari/537.36"
}

url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"

# 2. Read the text to translate from the terminal
key = input("Enter the text to translate: ")

formdata = {
    "i": key,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    # salt/sign/lts/bv were captured from the browser and expire over time
    "salt": "16003477829589",
    "sign": "3f351e5f7e0d84706ef063ccabe3e169",
    "lts": "1600347782958",
    "bv": "cb9a601990a9118249221b303a87fd75",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_REALTlME",
}

# 3. Encode the form data and convert it to bytes (POST bodies must be bytes)
data = parse.urlencode(formdata).encode(encoding="utf-8")

# 4. Send the request, get the response, read the content
req = request.Request(url, data=data, headers=header)
resp = request.urlopen(req).read().decode()

# 5. Use a regex to pull out whatever sits between "tgt":" and "}]]}
pat = r'"tgt":"(.*?)"}]]}'
result = re.findall(pat, resp)

print(result[0])
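Since the form asks for doctype=json, parsing the response with the json module is sturdier than a regex. A self-contained sketch, using a miniature stand-in for the JSON text and assuming Youdao's commonly observed translateResult layout:

import json

# Miniature stand-in for the JSON text that resp holds
resp = '{"type":"EN2ZH_CN","translateResult":[[{"src":"like","tgt":"喜欢"}]]}'
parsed = json.loads(resp)
print(parsed["translateResult"][0][0]["tgt"])  # 喜欢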

4. Handling POST requests: Youdao Translate

from urllib import request

list1 = [
    "http://www.baidu.com",
    "http://www.baidu.com",
    "http://www.baidu25234234235454254243.com",  # deliberately unreachable host
    "http://www.baidu.com",
    "http://www.baidu.com",
]

i = 0
for url in list1:
    i += 1
    try:
        request.urlopen(url)
    except Exception as e:
        print(e)
    print("Request", i, "finished")

-- Exception handling

from urllib import request

base_url = ("https://movie.douban.com/j/chart/top_list?"
            "type=11&interval_id=100%3A90&action=&start={}&limit=20")

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/79.0.3928.4 Safari/537.36"
}

i = 0
while True:
    url = base_url.format(i * 20)
    # str.format() fills the {} slot, e.g.
    # "site: {name}, url: {url}".format(name="runoob", url="www.runoob.com")
    req = request.Request(url, headers=header)
    res = request.urlopen(req).read().decode()
    print(res)
    # The endpoint returns a JSON array, so an exhausted listing comes back
    # as "[]" rather than an empty string
    if res == "" or res == "[]" or res is None:
        break
    i += 1
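Each page is a JSON array of movie objects; a self-contained sketch of parsing one page, with stand-in data and field names (title, score) assumed from this endpoint's usual responses:

import json

# Stand-in for one page of results from the endpoint above
res = '[{"title": "Movie A", "score": "9.6"}, {"title": "Movie B", "score": "9.3"}]'
for movie in json.loads(res):
    print(movie["title"], movie["score"])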

5. Working with Ajax requests

import ssl
from urllib import request

url = "https://www.12306.cn/mormhweb/"

header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/79.0.3928.4 Safari/537.36"
}

req = request.Request(url, headers=header)

# Build a context that skips certificate verification
context = ssl._create_unverified_context()
res = request.urlopen(req, context=context).read().decode()

print(res)
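The leading underscore marks ssl._create_unverified_context() as private API; the same unverified context can be built from public calls:

import ssl

context = ssl.create_default_context()
context.check_hostname = False        # skip hostname matching
context.verify_mode = ssl.CERT_NONE   # skip certificate validation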

6. Working with HTTPS requests

url = "https://www.qiushibaike.com/text/"

header = {

"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64;"

" x64; rv:80.0) Gecko/20100101 "

"Firefox/80.0"

}

# 构造请求

res = requests.get(url,headers=header)

info = res.text

infos = re.findall(r"<div class="content">s*<span>s*(.+)s*</span>",info)

for info in infos:

with open("duanzi.txt","a",encoding="utf-8") as f:

f.write(info + "

")

print(infos)
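Re-opening the file for every match works, but opening it once around the loop is tidier; a sketch of the same write step, assuming infos already holds the matches from above:

# Open once, append every match, close automatically on leaving the block
with open("duanzi.txt", "a", encoding="utf-8") as f:
    for item in infos:
        f.write(item + "\n")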

7. Qiushibaike example
