Python: a guided introduction to web crawlers
One article to get you up to speed on Python web scraping.
I. What is a web crawler:
1. Plain-language definition: a crawler is a program that simulates a human requesting a website. It can request pages automatically, grab the data, and then extract the valuable parts according to a set of rules.
2. Formal definition: see the Baidu Baike entry.
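To make that definition concrete, here is a minimal sketch of the request → extract loop, written with the requests and BeautifulSoup libraries covered later in this article; the URL, file name, and CSS selector are placeholders for illustration only.
# sketch.py (hypothetical example, not one of the numbered demos)
import requests
from bs4 import BeautifulSoup

def crawl(url):
    # 1. Simulate a browser request for the page
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=10)
    # 2. Apply a rule (here a CSS selector) to extract the valuable data
    soup = BeautifulSoup(response.text, "lxml")
    return [a.get_text() for a in soup.select("a")]

print(crawl("http://www.baidu.com"))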
II. Python urllib:
# demo01.py (basic urllib usage)
# Import the urllib library (no installation needed, it ships with Python)
import urllib.request
# Request Baidu and receive the response
response = urllib.request.urlopen("http://www.baidu.com/")
# Print the page
print(response.read().decode('utf-8'))
# demo02.py (usage walkthrough)
# urllib usage walkthrough
# urlopen: urllib.request.urlopen(url, data, timeout)
import urllib.request
import urllib.parse
import urllib.error
import socket
"""
A:
response = urllib.request.urlopen('http://www.baidu.com/')
print(response.read().decode('utf-8'))
B:
data = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read())
C:
response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
print(response.read())
"""
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
# demo03.py (responses)
# urllib responses
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com/")
# Print the response type
print(type(response))
# Print the status code
print(response.status)
# Print the response headers
print(response.getheaders())
# demo04.py (Request in detail)
# The Request class in detail
import urllib.request
from urllib import parse
"""
A:
request = urllib.request.Request('http://www.baidu.com')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
B:
url = "http://httpbin.org/post"
# Specify the request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    "Host": "api.github.com"
}
# Request data
dict = {
    "name": "Germey"
}
data = bytes(parse.urlencode(dict), encoding='utf-8')
request = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
"""
url = "http://httpbin.org/post"
# 请求数据
dict = {
"name":"Germey"
}
data = bytes(parse.urlencode(dict),encoding=\'utf-8\')
request = urllib.request.Request(url=url,data=data,method=\'POST\')
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")
response = urllib.request.urlopen(request)
print(response.read().decode(\'utf-8\'))
# demo05.py (proxies)
# handler (proxy)
import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    "http": "http://xxx.xxx.xxx.xxx:xxxx",
    "https": "https://xxx.xxx.xxx.xxx:xxxx"
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
# demo06.py (cookies)
# Cookies
import http.cookiejar
import urllib.request
"""
A: Basic use of http.cookiejar
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
B: MozillaCookieJar saves the site's cookies to a local file
filename = "utils/cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
C: LWPCookieJar also saves the site's cookies to a local file
filename = "utils/cookie01.txt"
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
D: Use the cookies saved in the file
"""
cookie = http.cookiejar.LWPCookieJar()
cookie.load('utils/cookie01.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
# demo07.py (exception handling)
# Exception handling
import urllib.request
from urllib import error
"""
A: Basic use of urllib.error
try:
    response = urllib.request.urlopen('http://www.baidu.com')
except error.URLError as e:
    print(e.reason)
B:
try:
    response = urllib.request.urlopen('http://www.baidu.com/')
    print(response.read().decode('utf-8'))
except error.URLError as e:
    print(e.reason)
else:
    print("*************")
C: timeout
try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(e.reason)
"""
# A link that does not exist
try:
    response = urllib.request.urlopen("http://www.abcdhaha2.com/")
    html = response.read().decode('utf-8')
    print(html)
except error.URLError as e:
    print(e.reason)
# demo08.py (URL parsing)
from urllib.parse import urlparse
from urllib.parse import urlunparse
from urllib.parse import urljoin
from urllib.parse import urlencode
# Syntax: urlparse("url", scheme='http|https', allow_fragments=True)
# A
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment')
print(type(result))
print(result)
# B
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme="https")
print(result)
# C
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment', allow_fragments=True)
print(result)
# D
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)
# E
result = urlparse('https://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)
# F (urlunparse)
data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))
# G (urljoin)
# Syntax: urljoin("base url", "suffix to join")
print(urljoin("https://www.cnblogs.com/xingxingnbsp/p/xxxxxxxxx.html", "12129466.html"))
# H (urlencode)
params = {
    'name': 'hello_urllib',
    'age': 18
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)
III. Python requests:
1. Install the requests library: pip install requests
# demo01.py
# Basic requests usage
import requests
response = requests.get("http://www.baidu.com")
print(type(response))          # Print the response type
print(response.status_code)    # Print the status code
print(type(response.text))     # Print the type of the response body
print(response.text)           # Print the response body
print(response.cookies)        # Print the response cookies
2. Request methods (see the combined sketch after this list):
1 requests.get('url')
2 requests.post('url')
3 requests.put('url')
4 requests.patch('url')
5 requests.delete('url')
6 requests.head('url')
7 requests.options('url')
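Each of these maps to an HTTP verb. As a quick sketch (my own addition, not one of the numbered demos), they can all be tried against httpbin.org, the echo service already used above; every call should come back with status 200.
# demo_methods.py (hypothetical file name)
import requests

print(requests.get('http://httpbin.org/get').status_code)
print(requests.post('http://httpbin.org/post').status_code)
print(requests.put('http://httpbin.org/put').status_code)
print(requests.patch('http://httpbin.org/patch').status_code)
print(requests.delete('http://httpbin.org/delete').status_code)
print(requests.head('http://httpbin.org/get').status_code)      # headers only, no body
print(requests.options('http://httpbin.org/get').status_code)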
3. Basic GET requests:
# demo02.py
import requests
"""
A:
response = requests.get('http://www.baidu.com')
print(response.text)
B:
response = requests.get('http://httpbin.org/get?name=hello&age=22')
print(response.text)
"""
data = {
    "name": "hello",
    "age": 22
}
response = requests.get('http://httpbin.org/get', params=data)
print(response.text)
4. Parsing JSON:
# demo03.py
# Parse JSON
import requests
response = requests.get('https://api.jinse.com/v6/www/information/list?catelogue_key=news&limit=23&information_id=18762945&flag=down&version=9.9.9&_source=www')
print(type(response))
print(response.json())
print(type(response.json()))
5. Fetching binary data
# demo04.py
import requests
"""
A:
response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
print(type(response.text))
print(type(response.content))
print(response.text)
print(response.content)
"""
response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
with open('images/image.png', 'wb') as f:
    f.write(response.content)   # the with block closes the file automatically
6. Adding headers:
# demo05.py
import requests
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.get("http://www.baidu.com",headers=headers)
print(response.text)
7. Basic POST requests
# demo06.py
import requests
"""
A:
data = {
"name":"hello",
"age":22
}
response = requests.post("http://httpbin.org/post",data=data)
print(response.text)
"""
data = {
"name":"hello",
"age":22
}
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.post("http://httpbin.org/post",data=data,headers=headers)
print(response.text)
8. Responses (response attributes):
# demo07.py
import requests
response = requests.get('http://www.baidu.com')
print(type(response.status_code), response.status_code)   # Print the status-code type and the status code
print(type(response.headers), response.headers)           # Print the headers type and the response headers
print(type(response.cookies), response.cookies)           # Print the cookies type and the cookies
print(type(response.url), response.url)                   # Print the URL type and the URL
print(type(response.history), response.history)           # Print the redirect history
9. Checking status codes:
# demo08.py
import requests
"""
A:
response = requests.get('http://www.baidu.com')
# This uses a Python conditional (ternary) expression
exit() if not response.status_code == requests.codes.ok else print('request successfully')
B:
response = requests.get('http://www.baidu.com')
# This uses a Python conditional (ternary) expression
exit() if not response.status_code == 200 else print('request successfully')
"""
response = requests.get('http://www.baidu.com')
if not response.status_code == 200:
    exit()
else:
    print('request successfully')
# The three approaches above all mean the same thing
10. Advanced operations:
# demo09.py
import requests
# A: Uploading files ---------------------------------------------------------
files = {
    "files": open('images/image.png', 'rb')
}
# Posting to httpbin.org/post echoes the upload back, so the result is easy to inspect
response = requests.post('http://httpbin.org/post', files=files)
print(response.text)
# B: Getting cookies ---------------------------------------------------------
response = requests.get('http://www.baidu.com')
print(response.cookies)
for key, value in response.cookies.items():
    print(key + "=" + value)
# C: Session persistence -----------------------------------------------------
# Without a session, the cookie set by the first request is not sent with the second
requests.get('http://httpbin.org/cookies/set/number/123456789')
response = requests.get('http://httpbin.org/cookies')
print(response.text)
# With a session, the cookie is kept across requests
s = requests.session()
s.get('http://httpbin.org/cookies/set/number/123456789')
response = s.get('http://httpbin.org/cookies')
print(response.text)
# D: Proxy settings ----------------------------------------------------------
# Method 1:
proxies = {
    'http': 'http://ip:port',
    'https': 'https://ip:port'
}
response = requests.get('http://www.baidu.com', proxies=proxies)
print(response.status_code)
# Method 2: proxies that require a username and password
proxies = {
    'http': 'http://user:password@ip:port/',
    'https': 'https://user:password@ip:port/'
}
response = requests.get('http://www.baidu.com', proxies=proxies)
print(response.status_code)
# Method 3: SOCKS5 proxies (requires: pip install requests[socks])
proxies = {
    'http': 'socks5://ip:port',
    'https': 'socks5://ip:port'
}
response = requests.get('http://www.baidu.com', proxies=proxies)
print(response.status_code)
# E: Certificate verification -------------------------------------------------
response = requests.get('https://www.12306.cn')
print(response.status_code)
# Skip SSL certificate verification
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)
# Note: change 'path/server.crt', 'path/key' to your own certificate paths
response = requests.get('https://www.12306.cn', cert=('path/server.crt', 'path/key'))
print(response.status_code)
# F: Timeout settings --------------------------------------------------------
from requests.exceptions import ReadTimeout
try:
    response = requests.get('http://www.taobao.com', timeout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("Timeout")
# G: Authentication ----------------------------------------------------------
from requests.auth import HTTPBasicAuth
response = requests.get('http://www.taobao.com', auth=HTTPBasicAuth('user', '123'))
print(response.status_code)
# A tuple is shorthand for HTTPBasicAuth
response = requests.get('http://www.taobao.com', auth=('user', '123'))
print(response.status_code)
# H: Exception handling ------------------------------------------------------
from requests.exceptions import ReadTimeout, ConnectionError, HTTPError, RequestException
try:
    response = requests.get('http://www.taobao.com', timeout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("Timeout")
except HTTPError:
    print("HTTPError")
except ConnectionError:
    print("ConnectionError")
except RequestException:
    print("Error")
IV. The BeautifulSoup library in detail (an HTML parser):
1. Install: pip install beautifulsoup4 (the demos below also use the lxml parser: pip install lxml)
2. Basic BeautifulSoup usage:
# demo01.py
# Basic use of BeautifulSoup
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
<h2>这是一个列表</h2>
<ul>
<li>选项1</li>
<li>选项2</li>
<li>选项3</li>
<li>选项4</li>
<li>选项5</li>
<li>选项6</li>
<li>选项7</li>
<li>选项8</li>
<li>选项9</li>
</ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())
print(soup.title.string)
3. Tag selectors (only return the first match):
# demo02.py
# BeautifulSoup tag selectors (only the first match is returned)
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
<h2>这是一个列表</h2>
<ul>
<li>选项1</li>
<li>选项2</li>
<li>选项3</li>
<li>选项4</li>
<li>选项5</li>
<li>选项6</li>
<li>选项7</li>
<li>选项8</li>
<li>选项9</li>
</ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.title)
print(type(soup.title))
print(soup.head)
print(soup.li)
4. Getting the tag name:
# demo03.py
# BeautifulSoup: get the tag name
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.title.name)
5. Getting tag attributes:
# demo04.py
# BeautifulSoup: get tag attributes
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<p class="font-p"></p>
<a href="http://www.baidu.com">百度一下 你就知道</a>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.p.attrs)
print(soup.p.attrs["class"])
print(soup.a.attrs["href"])
6. Getting text content:
# demo05.py
# BeautifulSoup: get text content
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<p>div</p>
<a href="http://www.baidu.com">百度一下 你就知道</a>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.p.string)
print(soup.a.string)
7. Nested selection:
# demo06.py
# Nested selection
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
<h2>这是一个列表</h2>
<ul>
<li>选项1</li>
</ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.ul.li.string)
8. Child and descendant nodes:
# demo07.py
# Child and descendant nodes
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
<h2>这是一个列表</h2>
<ul><li>选项1</li><li>选项2</li><li><a href="http://www.baidu.com">百度一下 你就知道</a></li></ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.ul.contents)     # All direct children, returned as a list
print(soup.ul.children)     # An iterator over the direct children
print(soup.ul.descendants)  # A generator over all descendants
for i, child in enumerate(soup.ul.descendants):
    print(i, child)
9. Parent and ancestor nodes:
# demo08.py
# Parent and ancestor nodes
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div>
<ol>
<li><a href="http://www.baidu.com">百度一下 你就知道</a></li>
</ol>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent)          # The direct parent node
print(type(soup.a.parents))   # A generator over all ancestor nodes
print(list(enumerate(soup.a.parents)))
10. Sibling nodes:
# demo09.py
# Sibling nodes
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
<h1>我是一个大大的H1</h1>
<h2>我是一个大大的H2</h2>
<p>我是一个简单的p标签</p>
<h3>我是一个大大的H3</h3>
<h4>我是一个大大的H4</h4>
</div>
</body>
</html>
"""
html = html.replace('\n', '').replace(' ', '')    # Strip the "\n" and spaces from the HTML
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.p.next_siblings)))      # All siblings after the current node
print(list(enumerate(soup.p.previous_siblings)))  # All siblings before the current node
11. Standard selectors (***important***)
# demo10.py
from bs4 import BeautifulSoup
# Standard selectors (important; worth revisiting several times)
# Syntax: find_all(name, attrs, recursive, text, **kwargs)
"""
find returns the first matching element; find_all returns all matching elements
1. find_parent()          # Returns the direct parent node
2. find_parents()         # Returns all ancestor nodes
3. find_next_sibling()    # Returns the next sibling after the current node
4. find_next_siblings()   # Returns all siblings after the current node
5. find_all_next()        # Returns all matching nodes after the current node
6. find_next()            # Returns the first matching node after the current node
7. find_all_previous()    # Returns all matching nodes before the current node
8. find_previous()        # Returns the first matching node before the current node
(these relative methods are illustrated in the short sketch after demo10.py below)
"""
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<h2>这是一个列表</h2>
<ul id="list-1">
<li class="zhangsan">选项1</li>
<li class="zhangsan">选项2</li>
<li class="zhangsan">选项3</li>
</ul>
<ul id="list-2">
<li class="lisi">选项1</li>
<li class="lisi">选项2</li>
<li class="lisi">选项3</li>
</ul>
</div>
</body>
</html>
"""
# A: name --------------------------------------------------------------
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('ul'))           # Get all ul tags, returned as a list
print(type(soup.find_all('ul')[0]))  # Get the element type
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))
# B: attrs -------------------------------------------------------------
# Method 1:
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={"id": "list-1"}))   # Get all elements with id "list-1"
print(soup.find_all(attrs={"class": "lisi"}))  # Get all elements with class "lisi"
# Method 2:
print(soup.find_all(id="list-1"))      # Get all elements with id "list-1"
print(soup.find_all(class_="lisi"))    # Get all elements with class "lisi"
# Both methods above produce the same result
# C: text --------------------------------------------------------------
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text="选项1"))
# D: CSS selectors (***) -----------------------------------------------
# 1:
soup = BeautifulSoup(html, 'lxml')
print(soup.select('#list-2'))    # ID selector
print(soup.select('.zhangsan'))  # class selector
print(soup.select('ul li'))      # tag selector
print(soup.select('#divid h2'))  # ID and tag combined
# 2:
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))
# 3: attribute access
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.get('id'))
    print(ul['id'])
# 4: getting text content
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())
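The relative find_* methods listed in the demo10.py docstring are not exercised by demo10.py itself, so here is a short, self-contained sketch (a hypothetical demo10_extra.py, reusing the same kind of markup) showing what each call returns:
# demo10_extra.py (hypothetical, added for illustration)
from bs4 import BeautifulSoup

html = """
<div id="divid">
    <h2>这是一个列表</h2>
    <ul id="list-1">
        <li class="zhangsan">选项1</li>
        <li class="zhangsan">选项2</li>
        <li class="zhangsan">选项3</li>
    </ul>
</div>
"""
soup = BeautifulSoup(html, 'lxml')
li = soup.find('li')                 # the first <li>
print(li.find_parent('ul'))          # direct ancestor <ul>
print(li.find_parents('div'))        # all matching ancestors, as a list
print(li.find_next_sibling())        # the sibling <li> right after it
print(li.find_next_siblings())       # all sibling <li> tags after it
print(soup.h2.find_all_next('li'))   # every <li> that appears after the <h2>
print(li.find_previous('h2'))        # the first <h2> that appears before the <li>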
V. The pyquery library in detail
1. Install: pip install pyquery
2. Initialization:
# demo01.py
# Initialization
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<h2>这是一个列表</h2>
<ul id="list-1">
<li class="zhangsan">选项1</li>
<li class="zhangsan">选项2</li>
<li class="zhangsan">选项3</li>
</ul>
<ul id="list-2">
<li class="lisi">选项1</li>
<li class="lisi">选项2</li>
<li class="lisi">选项3</li>
</ul>
</div>
</body>
</html>
"""
# A: Initialize from a string -------------------------------------------------------------------------------------------
doc = PyQuery(html)
print(doc('li'))
# B: Initialize from a URL ----------------------------------------------------------------------------------------------
doc = PyQuery(url="http://www.baidu.com")
print(doc('head'))
# C: Initialize from a file (create index.html in the same directory with the same markup as above) ----------------------
# This approach can raise: UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 187: illegal multibyte sequence
# Removing the Chinese characters from the HTML file avoids it, but that is not recommended (the real issue is the default file encoding)
# doc = PyQuery(filename='index.html')
# print(doc('li'))
# A cleaner approach is to open the file yourself with an explicit encoding:
with open("index.html", "r", encoding="utf-8") as f:
    doc = f.read()
result = PyQuery(doc)
print(result('li'))
3. Basic CSS selectors:
# demo02.py
# Basic CSS selectors
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<h2>这是一个列表</h2>
<ul id="list-1">
<li class="zhangsan">选项1</li>
<li class="zhangsan">选项2</li>
<li class="zhangsan">选项3</li>
</ul>
<ul id="list-2">
<li class="lisi">选项1</li>
<li class="lisi">选项2</li>
<li class="lisi">选项3</li>
</ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
print(doc('#divid #list-1 li'))
4. Finding elements:
A: Child elements
# demo03.py
# Child elements
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<h2>这是一个列表</h2>
<ul id="list-1">
<li class="zhangsan">选项1</li>
<li class="zhangsan">选项2</li>
<li class="zhangsan">选项3</li>
</ul>
<ul id="list-2">
<li class="lisi">选项1</li>
<li class="lisi">选项2</li>
<li class="lisi">选项3</li>
</ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
items = doc('#list-1')
print(type(items))
print(items)
li_list = items.find('li')
print(type(li_list))
print(li_list)
B: Parent elements
# demo04.py
# Parent elements
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<h2>这是一个列表</h2>
<ul id="list-1">
<li class="zhangsan">选项1</li>
<li class="zhangsan">选项2</li>
<li class="zhangsan">选项3</li>
</ul>
<ul id="list-2">
<li class="lisi">选项1</li>
<li class="lisi">选项2</li>
<li class="lisi">选项3</li>
</ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
items = doc('#list-1')
container = items.parent()
print(type(container))
print(container)
parents = items.parents()
print(type(parents))
print(parents)
C: Sibling elements
# demo05.py
# Sibling elements
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<h2>这是一个列表</h2>
<ul id="list-1">
<li class="zhangsan">选项1</li>
<li class="zhangsan">选项2</li>
<li class="zhangsan">选项3</li>
</ul>
<ul id="list-2">
<li class="lisi">选项1</li>
<li class="lisi">选项2</li>
<li class="lisi">选项3</li>
</ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
lis = doc('#list-1 .zhangsan')
print(lis.siblings())
print(lis.siblings('.zhangsan'))
D: Iterating over elements
# demo06.py
# Iteration
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<h2>这是一个列表</h2>
<ul id="list-1">
<li class="zhangsan">选项1</li>
<li class="zhangsan">选项2</li>
<li class="zhangsan">选项3</li>
</ul>
<ul id="list-2">
<li class="lisi">选项1</li>
<li class="lisi">选项2</li>
<li class="lisi">选项3</li>
</ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
lis = doc('#list-2 .lisi')
print(lis)
li_list = doc('.lisi').items()
print(type(li_list))
for li in li_list:
    print(li)
E: Getting information (tag attributes)
# demo07.py
# Getting information (attributes)
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<a href="http://www.baidu.com">百度一下 你就知道</a>
</div>
</body>
</html>
"""
doc = PyQuery(html)
a = doc('#divid a')
print(a)
print(a.attr('href'))
print(a.attr.href)
F: Getting text
# demo08.py
# Getting text
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<a href="http://www.baidu.com">百度一下 你就知道</a>
</div>
</body>
</html>
"""
doc = PyQuery(html)
a = doc('#divid a')
print(a)
print(a.text())
G: Getting HTML
# demo09.py
# Getting HTML
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<a href="http://www.baidu.com">百度一下 你就知道</a>
</div>
</body>
</html>
"""
doc = PyQuery(html)
div = doc('#divid')
print(div)
print(div.html())
H: DOM manipulation
# demo10.py
# DOM manipulation
from pyquery import PyQuery
html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>BeautifulSoup 学习</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
<h2>这是一个列表</h2>
<ul id="list-1">
<li class="zhangsan">选项1</li>
<li class="zhangsan">选项2</li>
<li class="zhangsan">选项3</li>
</ul>
<ul id="list-2">
<li class="lisi">选项1</li>
<li class="lisi">选项1</li>
<li class="lisi">选项1</li>
</ul>
</div>
</body>
</html>
"""
# 1. addClass,removeClass ----------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc('.lisi')
print(li)
li.remove_class(\'lisi\')
print(li)
li.add_class(\'zhangsan\')
print(li)
# 2. attr,css ----------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc('.zhangsan')
print(li)
li.attr('name', 'link')
print(li)
li.css('font-size', '40px')
print(li)
# 3. remove ------------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
div = doc('#divid')
print(div.text())
div.find('h2').remove()
print(div.text())
# 4. Pseudo-class selectors --------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc('.zhangsan:first-child')      # Get the first item in the list
print(li)
li = doc('.zhangsan:last-child')       # Get the last item in the list
print(li)
li = doc('.zhangsan:nth-child(2)')     # Get the second item in the list
print(li)
li = doc('.zhangsan:gt(0)')            # Get all items with index greater than 0
print(li)
li = doc('.zhangsan:nth-child(1n)')    # Get every item from the first onward (including the first)
print(li)
li = doc('.zhangsan:contains(选项3)')  # Get the item whose text contains "选项3"
print(li)
VI. The selenium library in detail (an automated-testing tool)
In crawling, selenium is mainly used to deal with pages rendered by JavaScript
1. Install: pip install selenium
2. Basic usage:
# demo01.py
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
"""
项目目标:实现百度搜索
1. 创建浏览器对象 请求百度
2. 元素定位输入框
3. 输入搜索内容
4. 点击回车
"""
# 创建浏览器对象(我用的是谷歌浏览器)
browser = webdriver.Chrome()
try:
# 请求百度
browser.get("http://www.baidu.com")
# 定位输入框
input = browser.find_element_by_id(\'kw\')
# 输入搜索内容
input.send_keys("selenium")
# 点击回车
input.send_keys(Keys.ENTER)
# 打印当前的url地址
print(browser.current_url)
# 打印cookies
print(browser.get_cookies())
# 打印页面
print(browser.page_source)
except Exception as e:
print(e,"=============================")
finally:
browser.close()
"""
有可能会遇到的错误
1. selenium.common.exceptions.WebDriverException: Message: \'chromedriver\' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
这是由于程序找不到 chromedriver 驱动
解决:
下载 chromedriver (http://chromedriver.storage.googleapis.com/index.html)
注意版本:版本对照表 (https://blog.csdn.net/BinGISer/article/details/88559532)
2. selenium.common.exceptions.SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 78
这是由于 ChromeDriver 和 Chrome 版本不对应
解决:
删除之前下载的 chromedriver
重新下载 chromedriver (http://chromedriver.storage.googleapis.com/index.html)
注意版本:版本对照表 (https://blog.csdn.net/BinGISer/article/details/88559532)
大功告成
"""
3. Declaring a browser object
# demo02.py
# selenium: declaring a browser
from selenium import webdriver
browser = webdriver.Chrome()    # Chrome
browser = webdriver.Firefox()   # Firefox
browser = webdriver.Edge()      # Microsoft Edge
browser = webdriver.PhantomJS() # Headless browser
browser = webdriver.Safari()    # Safari
4. Visiting a page
# demo03.py
import time
from selenium import webdriver
# Declare a browser object
browser = webdriver.Chrome()
# Visit Taobao
browser.get('https://www.taobao.com')
# Maximize the browser window
browser.maximize_window()
# Pause for 5 seconds
time.sleep(5)
# Print the page source
print(browser.page_source)
# Close the browser
browser.close()
5. Finding elements (single element)
# demo04.py
# Find an element (single element)
from selenium import webdriver
# Declare a browser object
browser = webdriver.Chrome()
# Visit Taobao
browser.get('https://www.taobao.com')
# Maximize the browser window
browser.maximize_window()
# Locate the Taobao search box (all three ways work)
input_id = browser.find_element_by_id('q')
input_selector = browser.find_element_by_css_selector('#q')
input_xpath = browser.find_element_by_xpath('//*[@id="q"]')
print(input_id)
print(input_selector)
print(input_xpath)
# Close the browser
browser.close()
"""
Common methods for finding a single element:
browser.find_element_by_xpath()
browser.find_element_by_name()
browser.find_element_by_link_text()
browser.find_element_by_partial_link_text()
browser.find_element_by_tag_name()
browser.find_element_by_class_name()
browser.find_element_by_css_selector()
"""
6. Finding elements (multiple elements)
# demo05.py
# Find elements (multiple elements)
from selenium import webdriver
# Declare a browser object
browser = webdriver.Chrome()
# Visit Taobao
browser.get('https://www.taobao.com')
# Maximize the browser window
browser.maximize_window()
# Find all elements with class="J_Cat a-all"
li_list = browser.find_elements_by_css_selector('.J_Cat')
print(li_list)
# Close the browser
browser.close()
"""
Common methods for finding multiple elements:
browser.find_elements_by_xpath()
browser.find_elements_by_name()
browser.find_elements_by_link_text()
browser.find_elements_by_partial_link_text()
browser.find_elements_by_tag_name()
browser.find_elements_by_class_name()
browser.find_elements_by_css_selector()
"""
7. Interacting with elements
# demo06.py
import time
from selenium import webdriver
# Declare a browser object
browser = webdriver.Chrome()
# Request Taobao
browser.get("https://www.taobao.com")
# Maximize the window
browser.maximize_window()
# Locate the search box
input = browser.find_element_by_id('q')
# Type "内存条"
input.send_keys("内存条")
time.sleep(3)
# Clear the search box
input.clear()
time.sleep(5)
# Type "1T硬盘"
input.send_keys("1T硬盘")
# Locate the search button
button = browser.find_element_by_class_name('btn-search')
# Click the search button
button.click()
time.sleep(10)
# Close the browser
browser.close()
8. Executing JavaScript
# demo07.py
# Execute JavaScript
from selenium import webdriver
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
# Scroll to the bottom of the page
browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
# Show an alert box
browser.execute_script('alert("To Bottom")')
9. Getting element info (attributes)
# demo08.py
# Get element info (attributes)
from selenium import webdriver
browser = webdriver.Chrome()
url = "https://www.zhihu.com/"
browser.get(url)
logo = browser.find_element_by_css_selector('.SignFlowHomepage-logo')
print(logo)
print(logo.get_attribute('src'))
browser.close()
10. Getting element info (text value)
# demo09.py
# Get element info (text value)
from selenium import webdriver
browser = webdriver.Chrome()
url = "https://www.zhihu.com/explore"
browser.get(url)
input = browser.find_element_by_id('Popover1-toggle')
input.send_keys('新冠病毒')
print(input.text)
11. Getting element info (ID, location, tag name, size)
# demo10.py
# Get element info (ID, location, tag name, size)
from selenium import webdriver
browser = webdriver.Chrome()
url = "https://www.zhihu.com/explore"
browser.get(url)
input = browser.find_element_by_id('Popover1-toggle')
print(input.id)
print(input.location)
print(input.tag_name)
print(input.size)
browser.close()
12. Getting element info (iframe)
# demo11.py
# Get element info (iframe)
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
browser = webdriver.Chrome()
url = "https://www.runoob.com/try/try.php?filename=tryjquery_hide"
browser.get(url)
browser.switch_to.frame('iframeResult')
button = browser.find_element_by_css_selector('button')
print(button)
try:
    logo = browser.find_element_by_class_name('logo')
except NoSuchElementException:
    print('NO LOGO')
finally:
    browser.switch_to.parent_frame()
logo = browser.find_element_by_class_name('logo')
print(logo)
print(logo.text)
browser.close()
13. Waits
# demo12.py
# Waits
"""
An explicit wait is conditional waiting
An implicit wait is unconditional waiting
Implicit waits
With an implicit wait, if WebDriver does not find an element in the DOM it keeps waiting, and only raises a
"no such element" exception once the configured time runs out. In other words, when an element is not
immediately present, the implicit wait keeps retrying the DOM lookup for a while; the default time is 0.
Explicit waits
You specify a condition and a maximum wait time. If the element has still not been found within that time,
an exception is raised. Later code runs only once the condition is met, which makes this the more flexible option.
It mainly relies on the expected_conditions helpers under selenium.webdriver.support.
"""
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.get('http://www.taobao.com')
browser.maximize_window()
browser.implicitly_wait(10)
wait = WebDriverWait(browser, 10)
input = wait.until(EC.presence_of_element_located((By.ID, 'q')))
button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn-search')))
print(input)
print(button)
browser.close()
14. Browser forward and back
# demo13.py
# Browser forward and back
import time
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
time.sleep(1)
browser.get('https://www.taobao.com')
time.sleep(1)
browser.get('https://www.cnblogs.com/xingxingnbsp/')
time.sleep(1)
browser.back()
time.sleep(2)
browser.forward()
time.sleep(2)
browser.close()
15. Cookies
# demo14.py
# Cookies
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())
browser.add_cookie({"name":"name","domain":"www.zhihu.com","value":"germey"})
print(browser.get_cookies())
browser.delete_all_cookies()
print(browser.get_cookies())
browser.close()
16. Tab management (note: switch_to.window replaces the older switch_to_window)
# demo15.py
# Tab management
import time
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
time.sleep(2)
browser.execute_script('window.open()')
print(browser.window_handles)
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(2)
browser.get('https://www.cnblogs.com/xingxingnbsp/')
time.sleep(3)
browser.close()
17. Exception handling
# demo16.py
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
except TimeoutException:
    print('Time Out')
try:
    browser.find_element_by_id('hello')
except NoSuchElementException:
    print('No Element')
finally:
    browser.close()
Excerpted from: https://www.cnblogs.com/xingxingnbsp/p/12129466.html