16python爬虫之Requests库爬取海量图片

python

Requests 是一个 Python 的 HTTP 客户端库。

Requests支持HTTP连接保持和连接池,支持使用cookie保持会话,支持文件上传,支持自动确定响应内容的编码,支持国际化的URL和POST数据自动编码。

在python内置模块的基础上进行了高度的封装,从而使得python进行网络请求时,变得人性化,使用Requests可以轻而易举的完成浏览器可有的任何操作。现代,国际化,友好。

requests会自动实现持久连接keep-alive

开源地址:https://github.com/kennethreitz/requests

中文文档:http://docs.python-requests.org/zh_CN/latest/index.html

目录

一、Requests基础

二、发送请求与接收响应(基本GET请求)

三、发送请求与接收响应(基本POST请求)

四、response属性

五、代理

六、cookie和session

七、案例

一、Requests基础

1.安装Requests库

pip install  requests

2.使用Requests库

import requests

二、发送请求与接收响应(基本GET请求)

response = requests.get(url)

1.传送 params 参数

  • 参数包含在url中

response = requests.get("http://httpbin.org/get?name=zhangsan&age=22")

print(response.text)

  • 通过get方法传送参数

data = {

"name": "zhangsan",

"age": 30

}

response = requests.get("http://httpbin.org/get", params=data)

print(response.text)

2.模拟发送请求头(传送headers参数)

headers = {

"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"

}

response = requests.get("http://httpbin.org/get", headers=headers)

print(response.text)

三、发送请求与接收响应(基本POST请求)

response = requests.post(url, data = data, headers=headers)

四、response属性

属性

描述

response.text

获取str类型(Unicode编码)的响应

response.content

获取bytes类型的响应

response.status_code

获取响应状态码

response.headers

获取响应头

response.request

获取响应对应的请求

五、代理

proxies = {

"http": "https://175.44.148.176:9000",

"https": "https://183.129.207.86:14002"

}

response = requests.get("https://www.baidu.com/", proxies=proxies)

六、cookie和session

  • 使用cookie和session的好处:很多网站必须登录之后(或者获取某种权限之后)才能够请求到相关数据。
  • 使用cookie和session的弊端:一套cookie和session往往和一个用户对应,请求太快、请求次数太多,容易被服务器识别为爬虫,从而使账号受到损害。

1.不需要cookie的时候尽量不去使用cookie。

2.为了获取登录之后的页面,我们必须发送带有cookies的请求,此时为了确保账号安全应该尽量降低数据采集速度。

1.cookie

(1)获取cookie信息

response.cookies

2.session

(1)构造session会话对象

session = requests.session()

示例:

def login_renren():
    """Log in to renren.com through one session and print a login-protected page.

    The same ``requests`` session is used for the login POST and the
    follow-up GET, so cookies stored by the session after the login are
    available to the second request. The account and password values are
    placeholders that must be replaced before running.
    """
    login_url = "http://www.renren.com/SysHome.do"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    login_data = {
        "email": "账号",
        "password": "密码"
    }

    # The context manager closes the session when the block exits, even if
    # one of the requests raises.
    with requests.session() as session:
        # Session-level headers are sent with every request made through the
        # session, so the GET below carries the same User-Agent as the POST.
        session.headers.update(headers)
        session.post(login_url, data=login_data)
        response = session.get("http://www.renren.com/971909762/newsfeed/photo")
        print(response.text)

login_renren()

七、案例

案例1:百度贴吧页面爬取(GET请求)

import requests

import sys

class BaiduTieBa:
    """Download the first ``pn`` list pages of a Baidu Tieba forum into ``./pages/``.

    Attributes:
        name: Forum name; used as the ``kw`` query value and in saved file names.
        url: URL prefix ending in ``pn=``; a page offset is appended to it.
        headers: Request headers sent with every page request.
        url_list: One URL per page, with offsets 0, 50, 100, ...
    """

    def __init__(self, name, pn):
        """Build the list of page URLs and print it.

        :param name: forum name
        :param pn: number of pages to download
        """
        self.name = name
        # The offset is appended per page below. It must not also be
        # formatted into the prefix, otherwise the two numbers concatenate
        # (page count 3 and offset 50 would become ``pn=350``).
        self.url = "http://tieba.baidu.com/f?kw={}&ie=utf-8&pn=".format(name)
        self.headers = {
            # An old browser User-Agent is used on purpose; per the original
            # author's note, this browser does not support JS.
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)"
        }
        self.url_list = [self.url + str(page * 50) for page in range(pn)]
        print(self.url_list)

    def get_data(self, url):
        """Request *url* with ``self.headers`` and return the raw body.

        :param url: page URL to fetch
        :return: response body as bytes
        """
        response = requests.get(url, headers=self.headers)
        return response.content

    def save_data(self, data, num):
        """Write *data* to ``./pages/<name>_<num>.html``.

        :param data: page body as bytes
        :param num: page index used in the file name
        """
        import os

        # Create the output directory on demand so the first write does not
        # fail with FileNotFoundError.
        os.makedirs("./pages/", exist_ok=True)
        file_name = "./pages/" + self.name + "_" + str(num) + ".html"
        with open(file_name, "wb") as f:
            f.write(data)

    def run(self):
        """Fetch every URL in ``url_list`` and save it under its list index."""
        for num, url in enumerate(self.url_list):
            self.save_data(self.get_data(url), num)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: {} <forum_name> <page_count>".format(sys.argv[0]))
    name = sys.argv[1]
    pn = int(sys.argv[2])
    baidu = BaiduTieBa(name, pn)
    baidu.run()

案例2:金山词霸翻译(POST请求)

import requests

import sys

import json

class JinshanCiBa:
    """Send text to the iciba (Jinshan Ciba) AJAX endpoint and print the translation."""

    def __init__(self, words):
        """Prepare the endpoint URL, request headers and form data.

        :param words: text to translate; sent as the ``w`` form field
        """
        self.url = "http://fy.iciba.com/ajax.php?a=fy"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.post_data = {
            "f": "auto",
            "t": "auto",
            "w": words
        }

    def get_data(self):
        """POST ``self.post_data`` to ``self.url`` and return the body as text.

        :return: response body as str
        """
        response = requests.post(self.url, data=self.post_data, headers=self.headers)
        return response.text

    def show_translation(self):
        """Fetch the response, decode it as JSON and print the translation.

        Prints ``content["word_mean"]`` when ``status`` is 0,
        ``content["out"]`` when ``status`` is 1, and ``None`` for any other
        (or missing) status.
        """
        # json.loads() takes no ``encoding`` argument on Python 3.9+ (passing
        # it raises TypeError); the body is already a decoded str.
        json_data = json.loads(self.get_data())
        status = json_data.get("status")

        if status == 0:
            translation = json_data["content"]["word_mean"]
        elif status == 1:
            translation = json_data["content"]["out"]
        else:
            translation = None
        print(translation)

    def run(self):
        """Entry point: print the translation of the configured words."""
        self.show_translation()


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: {} <words>".format(sys.argv[0]))
    words = sys.argv[1]
    ciba = JinshanCiBa(words)
    ciba.run()

案例3:百度贴吧图片爬取

(1)普通版

从已下载页面中提取url来爬取图片(页面下载方法见案例1)

from lxml import etree

import requests

class DownloadPhoto:
    """Extract image URLs from saved Tieba list pages and download them to ``./photo/``."""

    def download_img(self, url):
        """Download one image and save it under ``./photo/``.

        The file name is the part of *url* after its last ``/``.

        :param url: image URL
        """
        import os

        response = requests.get(url)
        file_name = url[url.rfind("/") + 1:]
        print("下载图片:" + file_name)

        # Create the output directory on demand so the write cannot fail
        # with FileNotFoundError.
        os.makedirs("./photo/", exist_ok=True)
        save_name = "./photo/" + file_name
        with open(save_name, "wb") as f:
            f.write(response.content)

    def parse_photo_url(self, page):
        """Parse a saved HTML page and download every image it references.

        :param page: path of the saved HTML file
        """
        html = etree.parse(page, etree.HTMLParser())
        # Single quotes inside the XPath keep the Python string literal
        # intact; nested double quotes would be a SyntaxError.
        nodes = html.xpath("//a[contains(@class, 'thumbnail')]/img/@bpic")
        print(nodes)
        print(len(nodes))
        for node in nodes:
            self.download_img(node)


if __name__ == "__main__":
    import os

    download = DownloadPhoto()
    for i in range(6000):
        page = "./pages/校花_{}.html".format(i)
        # Fewer than 6000 pages may have been saved; skip the missing ones
        # instead of aborting on the first absent file.
        if os.path.exists(page):
            download.parse_photo_url(page)

(2)多线程版

main.py

import requests

from lxml import etree

from file_download import DownLoadExecutioner, file_download

class XiaoHua:
    """Crawl a Tieba forum page by page and queue its thumbnail images for download."""

    def __init__(self, init_url):
        """Store the first page URL and create the download worker.

        :param init_url: URL of the first list page to crawl
        """
        self.init_url = init_url
        self.download_executioner = DownLoadExecutioner()

    def start(self):
        """Start the download worker thread, then crawl from ``init_url``."""
        self.download_executioner.start()
        self.download_img(self.init_url)

    def download_img(self, url):
        """Queue the images found on *url* and on every page that follows it.

        Pages are walked in a loop rather than by recursion, so a long
        forum cannot exhaust the recursion limit. The crawl ends when a
        page has no "next" link.

        :param url: URL of the list page to start from
        """
        while url:
            html_text = file_download(url, type="text")
            html = etree.HTML(html_text)
            if html is None:
                # Nothing parseable came back; there is no link to follow.
                break

            # Single quotes inside the XPath keep the Python string literal
            # intact; nested double quotes would be a SyntaxError.
            img_urls = html.xpath("//a[contains(@class,'thumbnail')]/img/@bpic")
            self.download_executioner.put_task(img_urls)

            # Link to the next page; an empty result means this was the last
            # page. "http:" is prepended as before (presumably the href is
            # scheme-relative).
            next_page = html.xpath("//div[@id='frs_list_pager']/a[contains(@class,'next')]/@href")
            url = "http:" + next_page[0] if next_page else None


if __name__ == "__main__":
    x = XiaoHua("http://tieba.baidu.com/f?kw=校花&ie=utf-8")
    x.start()

file_download.py

import requests

import threading

from queue import Queue

def file_download(url, type="content"):
    """Fetch *url* with a browser User-Agent and return the response body.

    :param url: address to request
    :param type: ``"text"`` to get the body as str; any other value
        (default ``"content"``) returns the raw bytes
    :return: ``r.text`` or ``r.content`` of the response
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    reply = requests.get(url, headers={"User-Agent": user_agent})
    return reply.text if type == "text" else reply.content

class DownLoadExecutioner(threading.Thread):
    """Worker thread that downloads queued image URLs into ``save_dir``."""

    # Queue item that tells run() to leave its loop.
    _STOP = object()

    def __init__(self):
        super().__init__()
        # Bounded queue: put_task() blocks while 50 URLs are already waiting.
        self.q = Queue(maxsize=50)
        # Directory the images are saved to.
        self.save_dir = "./img/"
        # Number of images downloaded so far.
        self.index = 0

    def put_task(self, urls):
        """Queue one URL, or every URL of a list, for download.

        :param urls: a single URL or a list of URLs
        """
        if isinstance(urls, list):
            for url in urls:
                self.q.put(url)
        else:
            self.q.put(urls)

    def stop(self):
        """Ask the worker to exit once the URLs queued before this call are done."""
        self.q.put(self._STOP)

    def run(self):
        """Download queued URLs one at a time until stop() is requested."""
        import os

        # Create the output directory on demand so writes cannot fail with
        # FileNotFoundError.
        os.makedirs(self.save_dir, exist_ok=True)

        while True:
            url = self.q.get()
            if url is self._STOP:
                break
            try:
                content = file_download(url)
                # The file name is the part of the URL after its last "/".
                file_name = url[url.rfind("/") + 1:]
                save_name = self.save_dir + file_name
                with open(save_name, "wb+") as f:
                    f.write(content)
            except Exception as exc:
                # Worker boundary: one failed download must not kill the only
                # consumer, or put_task() would block forever on a full queue.
                print("{}下载失败! 原因:{}".format(url, exc))
            else:
                self.index += 1
                print(save_name + "下载成功! 当前已下载图片总数:" + str(self.index))

(3)线程池版

main.py

import requests

from lxml import etree

from file_download_pool import DownLoadExecutionerPool, file_download

class XiaoHua:
    """Crawl a Tieba forum page by page and submit its thumbnail images to a download pool."""

    def __init__(self, init_url):
        """Store the first page URL and create the download pool.

        :param init_url: URL of the first list page to crawl
        """
        self.init_url = init_url
        self.download_executioner = DownLoadExecutionerPool()

    def start(self):
        """Crawl from ``init_url``."""
        self.download_img(self.init_url)

    def download_img(self, url):
        """Submit the images found on *url* and on every page that follows it.

        Pages are walked in a loop rather than by recursion, so a long
        forum cannot exhaust the recursion limit. The crawl ends when a
        page has no "next" link.

        :param url: URL of the list page to start from
        """
        while url:
            html_text = file_download(url, type="text")
            html = etree.HTML(html_text)
            if html is None:
                # Nothing parseable came back; there is no link to follow.
                break

            # Single quotes inside the XPath keep the Python string literal
            # intact; nested double quotes would be a SyntaxError.
            img_urls = html.xpath("//a[contains(@class,'thumbnail')]/img/@bpic")
            self.download_executioner.put_task(img_urls)

            # Link to the next page; an empty result means this was the last
            # page. "http:" is prepended as before (presumably the href is
            # scheme-relative).
            next_page = html.xpath("//div[@id='frs_list_pager']/a[contains(@class,'next')]/@href")
            url = "http:" + next_page[0] if next_page else None


if __name__ == "__main__":
    x = XiaoHua("http://tieba.baidu.com/f?kw=校花&ie=utf-8")
    x.start()

file_download_pool.py

import requests

import concurrent.futures as futures

def file_download(url, type="content"):
    """Fetch *url* with a browser User-Agent and return the response body.

    :param url: address to request
    :param type: ``"text"`` to get the body as str; any other value
        (default ``"content"``) returns the raw bytes
    :return: ``r.text`` or ``r.content`` of the response
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    reply = requests.get(url, headers={"User-Agent": user_agent})
    return reply.text if type == "text" else reply.content

class DownLoadExecutionerPool():
    """Download images concurrently on a 30-worker thread pool into ``save_dir``."""

    def __init__(self):
        import threading

        super().__init__()
        # Directory the images are saved to.
        self.save_dir = "./img_pool/"
        # Number of images downloaded so far.
        self.index = 0
        # Guards self.index, which save_img() updates from several pool
        # threads at once.
        self._index_lock = threading.Lock()
        # Thread pool that runs save_img().
        self.ex = futures.ThreadPoolExecutor(max_workers=30)

    def put_task(self, urls):
        """Submit one URL, or every URL of a list, to the pool.

        :param urls: a single URL or a list of URLs
        """
        if isinstance(urls, list):
            for url in urls:
                self.ex.submit(self.save_img, url)
        else:
            self.ex.submit(self.save_img, urls)

    def save_img(self, url):
        """Download *url* and write it to ``save_dir``; runs on a pool thread.

        :param url: image URL; the file name is the part after its last ``/``
        """
        import os

        try:
            content = file_download(url)
            file_name = url[url.rfind("/") + 1:]
            save_name = self.save_dir + file_name
            # Create the output directory on demand so the write cannot fail
            # with FileNotFoundError.
            os.makedirs(self.save_dir, exist_ok=True)
            with open(save_name, "wb+") as f:
                f.write(content)
        except Exception as exc:
            # The Future returned by submit() is never inspected, so an
            # exception raised here would otherwise disappear silently.
            print("{}下载失败! 原因:{}".format(url, exc))
            return

        with self._index_lock:
            self.index += 1
            total = self.index
        print(save_name + "下载成功! 当前已下载图片总数:" + str(total))

作者:Recalcitrant

链接:https://www.jianshu.com/p/140012f88f8e

Requests 是一个 Python 的 HTTP 客户端库。

Requests支持HTTP连接保持和连接池,支持使用cookie保持会话,支持文件上传,支持自动确定响应内容的编码,支持国际化的URL和POST数据自动编码。

在python内置模块的基础上进行了高度的封装,从而使得python进行网络请求时,变得人性化,使用Requests可以轻而易举的完成浏览器可有的任何操作。现代,国际化,友好。

requests会自动实现持久连接keep-alive

![image](//upload-images.jianshu.io/upload_images/17476284-aaaae0326f700dc7.png)

开源地址:https://github.com/kennethreitz/requests

中文文档:http://docs.python-requests.org/zh_CN/latest/index.html

目录

一、Requests基础

二、发送请求与接收响应(基本GET请求)

三、发送请求与接收响应(基本POST请求)

四、response属性

五、代理

六、cookie和session

七、案例

一、Requests基础

1.安装Requests库

pip install  requests

2.使用Requests库

import requests

二、发送请求与接收响应(基本GET请求)

response = requests.get(url)

1.传送 params 参数

  • 参数包含在url中

response = requests.get("http://httpbin.org/get?name=zhangsan&age=22")

print(response.text)

  • 通过get方法传送参数

data = {

"name": "zhangsan",

"age": 30

}

response = requests.get("http://httpbin.org/get", params=data)

print(response.text)

2.模拟发送请求头(传送headers参数)

headers = {

"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"

}

response = requests.get("http://httpbin.org/get", headers=headers)

print(response.text)

三、发送请求与接收响应(基本POST请求)

response = requests.post(url, data = data, headers=headers)

四、response属性

属性

描述

response.text

获取str类型(Unicode编码)的响应

response.content

获取bytes类型的响应

response.status_code

获取响应状态码

response.headers

获取响应头

response.request

获取响应对应的请求

五、代理

proxies = {

"http": "https://175.44.148.176:9000",

"https": "https://183.129.207.86:14002"

}

response = requests.get("https://www.baidu.com/", proxies=proxies)

六、cookie和session

  • 使用cookie和session的好处:很多网站必须登录之后(或者获取某种权限之后)才能够请求到相关数据。
  • 使用cookie和session的弊端:一套cookie和session往往和一个用户对应,请求太快、请求次数太多,容易被服务器识别为爬虫,从而使账号受到损害。

1.不需要cookie的时候尽量不去使用cookie。

2.为了获取登录之后的页面,我们必须发送带有cookies的请求,此时为了确保账号安全应该尽量降低数据采集速度。

1.cookie

(1)获取cookie信息

response.cookies

2.session

(1)构造session会话对象

session = requests.session()

示例:

def login_renren():
    """Log in to renren.com through one session and print a login-protected page.

    The same ``requests`` session is used for the login POST and the
    follow-up GET, so cookies stored by the session after the login are
    available to the second request. The account and password values are
    placeholders that must be replaced before running.
    """
    login_url = "http://www.renren.com/SysHome.do"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }
    login_data = {
        "email": "账号",
        "password": "密码"
    }

    # The context manager closes the session when the block exits, even if
    # one of the requests raises.
    with requests.session() as session:
        # Session-level headers are sent with every request made through the
        # session, so the GET below carries the same User-Agent as the POST.
        session.headers.update(headers)
        session.post(login_url, data=login_data)
        response = session.get("http://www.renren.com/971909762/newsfeed/photo")
        print(response.text)

login_renren()

七、案例

案例1:百度贴吧页面爬取(GET请求)

import requests

import sys

class BaiduTieBa:
    """Download the first ``pn`` list pages of a Baidu Tieba forum into ``./pages/``.

    Attributes:
        name: Forum name; used as the ``kw`` query value and in saved file names.
        url: URL prefix ending in ``pn=``; a page offset is appended to it.
        headers: Request headers sent with every page request.
        url_list: One URL per page, with offsets 0, 50, 100, ...
    """

    def __init__(self, name, pn):
        """Build the list of page URLs and print it.

        :param name: forum name
        :param pn: number of pages to download
        """
        self.name = name
        # The offset is appended per page below. It must not also be
        # formatted into the prefix, otherwise the two numbers concatenate
        # (page count 3 and offset 50 would become ``pn=350``).
        self.url = "http://tieba.baidu.com/f?kw={}&ie=utf-8&pn=".format(name)
        self.headers = {
            # An old browser User-Agent is used on purpose; per the original
            # author's note, this browser does not support JS.
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)"
        }
        self.url_list = [self.url + str(page * 50) for page in range(pn)]
        print(self.url_list)

    def get_data(self, url):
        """Request *url* with ``self.headers`` and return the raw body.

        :param url: page URL to fetch
        :return: response body as bytes
        """
        response = requests.get(url, headers=self.headers)
        return response.content

    def save_data(self, data, num):
        """Write *data* to ``./pages/<name>_<num>.html``.

        :param data: page body as bytes
        :param num: page index used in the file name
        """
        import os

        # Create the output directory on demand so the first write does not
        # fail with FileNotFoundError.
        os.makedirs("./pages/", exist_ok=True)
        file_name = "./pages/" + self.name + "_" + str(num) + ".html"
        with open(file_name, "wb") as f:
            f.write(data)

    def run(self):
        """Fetch every URL in ``url_list`` and save it under its list index."""
        for num, url in enumerate(self.url_list):
            self.save_data(self.get_data(url), num)


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: {} <forum_name> <page_count>".format(sys.argv[0]))
    name = sys.argv[1]
    pn = int(sys.argv[2])
    baidu = BaiduTieBa(name, pn)
    baidu.run()

案例2:金山词霸翻译(POST请求)

import requests

import sys

import json

class JinshanCiBa:
    """Send text to the iciba (Jinshan Ciba) AJAX endpoint and print the translation."""

    def __init__(self, words):
        """Prepare the endpoint URL, request headers and form data.

        :param words: text to translate; sent as the ``w`` form field
        """
        self.url = "http://fy.iciba.com/ajax.php?a=fy"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.post_data = {
            "f": "auto",
            "t": "auto",
            "w": words
        }

    def get_data(self):
        """POST ``self.post_data`` to ``self.url`` and return the body as text.

        :return: response body as str
        """
        response = requests.post(self.url, data=self.post_data, headers=self.headers)
        return response.text

    def show_translation(self):
        """Fetch the response, decode it as JSON and print the translation.

        Prints ``content["word_mean"]`` when ``status`` is 0,
        ``content["out"]`` when ``status`` is 1, and ``None`` for any other
        (or missing) status.
        """
        # json.loads() takes no ``encoding`` argument on Python 3.9+ (passing
        # it raises TypeError); the body is already a decoded str.
        json_data = json.loads(self.get_data())
        status = json_data.get("status")

        if status == 0:
            translation = json_data["content"]["word_mean"]
        elif status == 1:
            translation = json_data["content"]["out"]
        else:
            translation = None
        print(translation)

    def run(self):
        """Entry point: print the translation of the configured words."""
        self.show_translation()


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: {} <words>".format(sys.argv[0]))
    words = sys.argv[1]
    ciba = JinshanCiBa(words)
    ciba.run()

案例3:百度贴吧图片爬取

(1)普通版

从已下载页面中提取url来爬取图片(页面下载方法见案例1)

from lxml import etree

import requests

class DownloadPhoto:
    """Extract image URLs from saved Tieba list pages and download them to ``./photo/``."""

    def download_img(self, url):
        """Download one image and save it under ``./photo/``.

        The file name is the part of *url* after its last ``/``.

        :param url: image URL
        """
        import os

        response = requests.get(url)
        file_name = url[url.rfind("/") + 1:]
        print("下载图片:" + file_name)

        # Create the output directory on demand so the write cannot fail
        # with FileNotFoundError.
        os.makedirs("./photo/", exist_ok=True)
        save_name = "./photo/" + file_name
        with open(save_name, "wb") as f:
            f.write(response.content)

    def parse_photo_url(self, page):
        """Parse a saved HTML page and download every image it references.

        :param page: path of the saved HTML file
        """
        html = etree.parse(page, etree.HTMLParser())
        # Single quotes inside the XPath keep the Python string literal
        # intact; nested double quotes would be a SyntaxError.
        nodes = html.xpath("//a[contains(@class, 'thumbnail')]/img/@bpic")
        print(nodes)
        print(len(nodes))
        for node in nodes:
            self.download_img(node)


if __name__ == "__main__":
    import os

    download = DownloadPhoto()
    for i in range(6000):
        page = "./pages/校花_{}.html".format(i)
        # Fewer than 6000 pages may have been saved; skip the missing ones
        # instead of aborting on the first absent file.
        if os.path.exists(page):
            download.parse_photo_url(page)

(2)多线程版

main.py

import requests

from lxml import etree

from file_download import DownLoadExecutioner, file_download

class XiaoHua:
    """Crawl a Tieba forum page by page and queue its thumbnail images for download."""

    def __init__(self, init_url):
        """Store the first page URL and create the download worker.

        :param init_url: URL of the first list page to crawl
        """
        self.init_url = init_url
        self.download_executioner = DownLoadExecutioner()

    def start(self):
        """Start the download worker thread, then crawl from ``init_url``."""
        self.download_executioner.start()
        self.download_img(self.init_url)

    def download_img(self, url):
        """Queue the images found on *url* and on every page that follows it.

        Pages are walked in a loop rather than by recursion, so a long
        forum cannot exhaust the recursion limit. The crawl ends when a
        page has no "next" link.

        :param url: URL of the list page to start from
        """
        while url:
            html_text = file_download(url, type="text")
            html = etree.HTML(html_text)
            if html is None:
                # Nothing parseable came back; there is no link to follow.
                break

            # Single quotes inside the XPath keep the Python string literal
            # intact; nested double quotes would be a SyntaxError.
            img_urls = html.xpath("//a[contains(@class,'thumbnail')]/img/@bpic")
            self.download_executioner.put_task(img_urls)

            # Link to the next page; an empty result means this was the last
            # page. "http:" is prepended as before (presumably the href is
            # scheme-relative).
            next_page = html.xpath("//div[@id='frs_list_pager']/a[contains(@class,'next')]/@href")
            url = "http:" + next_page[0] if next_page else None


if __name__ == "__main__":
    x = XiaoHua("http://tieba.baidu.com/f?kw=校花&ie=utf-8")
    x.start()

file_download.py

import requests

import threading

from queue import Queue

def file_download(url, type="content"):
    """Fetch *url* with a browser User-Agent and return the response body.

    :param url: address to request
    :param type: ``"text"`` to get the body as str; any other value
        (default ``"content"``) returns the raw bytes
    :return: ``r.text`` or ``r.content`` of the response
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    reply = requests.get(url, headers={"User-Agent": user_agent})
    return reply.text if type == "text" else reply.content

class DownLoadExecutioner(threading.Thread):
    """Worker thread that downloads queued image URLs into ``save_dir``."""

    # Queue item that tells run() to leave its loop.
    _STOP = object()

    def __init__(self):
        super().__init__()
        # Bounded queue: put_task() blocks while 50 URLs are already waiting.
        self.q = Queue(maxsize=50)
        # Directory the images are saved to.
        self.save_dir = "./img/"
        # Number of images downloaded so far.
        self.index = 0

    def put_task(self, urls):
        """Queue one URL, or every URL of a list, for download.

        :param urls: a single URL or a list of URLs
        """
        if isinstance(urls, list):
            for url in urls:
                self.q.put(url)
        else:
            self.q.put(urls)

    def stop(self):
        """Ask the worker to exit once the URLs queued before this call are done."""
        self.q.put(self._STOP)

    def run(self):
        """Download queued URLs one at a time until stop() is requested."""
        import os

        # Create the output directory on demand so writes cannot fail with
        # FileNotFoundError.
        os.makedirs(self.save_dir, exist_ok=True)

        while True:
            url = self.q.get()
            if url is self._STOP:
                break
            try:
                content = file_download(url)
                # The file name is the part of the URL after its last "/".
                file_name = url[url.rfind("/") + 1:]
                save_name = self.save_dir + file_name
                with open(save_name, "wb+") as f:
                    f.write(content)
            except Exception as exc:
                # Worker boundary: one failed download must not kill the only
                # consumer, or put_task() would block forever on a full queue.
                print("{}下载失败! 原因:{}".format(url, exc))
            else:
                self.index += 1
                print(save_name + "下载成功! 当前已下载图片总数:" + str(self.index))

(3)线程池版

main.py

import requests

from lxml import etree

from file_download_pool import DownLoadExecutionerPool, file_download

class XiaoHua:
    """Crawl a Tieba forum page by page and submit its thumbnail images to a download pool."""

    def __init__(self, init_url):
        """Store the first page URL and create the download pool.

        :param init_url: URL of the first list page to crawl
        """
        self.init_url = init_url
        self.download_executioner = DownLoadExecutionerPool()

    def start(self):
        """Crawl from ``init_url``."""
        self.download_img(self.init_url)

    def download_img(self, url):
        """Submit the images found on *url* and on every page that follows it.

        Pages are walked in a loop rather than by recursion, so a long
        forum cannot exhaust the recursion limit. The crawl ends when a
        page has no "next" link.

        :param url: URL of the list page to start from
        """
        while url:
            html_text = file_download(url, type="text")
            html = etree.HTML(html_text)
            if html is None:
                # Nothing parseable came back; there is no link to follow.
                break

            # Single quotes inside the XPath keep the Python string literal
            # intact; nested double quotes would be a SyntaxError.
            img_urls = html.xpath("//a[contains(@class,'thumbnail')]/img/@bpic")
            self.download_executioner.put_task(img_urls)

            # Link to the next page; an empty result means this was the last
            # page. "http:" is prepended as before (presumably the href is
            # scheme-relative).
            next_page = html.xpath("//div[@id='frs_list_pager']/a[contains(@class,'next')]/@href")
            url = "http:" + next_page[0] if next_page else None


if __name__ == "__main__":
    x = XiaoHua("http://tieba.baidu.com/f?kw=校花&ie=utf-8")
    x.start()

file_download_pool.py

import requests

import concurrent.futures as futures

def file_download(url, type="content"):
    """Fetch *url* with a browser User-Agent and return the response body.

    :param url: address to request
    :param type: ``"text"`` to get the body as str; any other value
        (default ``"content"``) returns the raw bytes
    :return: ``r.text`` or ``r.content`` of the response
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    reply = requests.get(url, headers={"User-Agent": user_agent})
    return reply.text if type == "text" else reply.content

class DownLoadExecutionerPool():
    """Download images concurrently on a 30-worker thread pool into ``save_dir``."""

    def __init__(self):
        import threading

        super().__init__()
        # Directory the images are saved to.
        self.save_dir = "./img_pool/"
        # Number of images downloaded so far.
        self.index = 0
        # Guards self.index, which save_img() updates from several pool
        # threads at once.
        self._index_lock = threading.Lock()
        # Thread pool that runs save_img().
        self.ex = futures.ThreadPoolExecutor(max_workers=30)

    def put_task(self, urls):
        """Submit one URL, or every URL of a list, to the pool.

        :param urls: a single URL or a list of URLs
        """
        if isinstance(urls, list):
            for url in urls:
                self.ex.submit(self.save_img, url)
        else:
            self.ex.submit(self.save_img, urls)

    def save_img(self, url):
        """Download *url* and write it to ``save_dir``; runs on a pool thread.

        :param url: image URL; the file name is the part after its last ``/``
        """
        import os

        try:
            content = file_download(url)
            file_name = url[url.rfind("/") + 1:]
            save_name = self.save_dir + file_name
            # Create the output directory on demand so the write cannot fail
            # with FileNotFoundError.
            os.makedirs(self.save_dir, exist_ok=True)
            with open(save_name, "wb+") as f:
                f.write(content)
        except Exception as exc:
            # The Future returned by submit() is never inspected, so an
            # exception raised here would otherwise disappear silently.
            print("{}下载失败! 原因:{}".format(url, exc))
            return

        with self._index_lock:
            self.index += 1
            total = self.index
        print(save_name + "下载成功! 当前已下载图片总数:" + str(total))

作者:Recalcitrant

链接:https://www.jianshu.com/p/140012f88f8e

在线练习:https://www.520mg.com/it

以上是 16python爬虫之Requests库爬取海量图片 的全部内容, 来源链接: utcz.com/z/530127.html

回到顶部