An Introduction to Python Web Scraping

One article to walk you through "Python web scraping".

I. What is a web crawler:

    1. In plain terms: a crawler is a program that simulates a human requesting a website. It can request pages automatically, grab the data that comes back, and then extract the valuable parts according to a set of rules.

    2. Formal definition: see the Baidu Baike entry.
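
As a tiny illustration of that request-then-extract loop, here is a minimal sketch using the requests and BeautifulSoup libraries introduced later in this article; the URL and the extraction rule are placeholders for whatever site you actually target:

import requests
from bs4 import BeautifulSoup

# 1. Request the page the way a browser would
response = requests.get("http://www.example.com/")
# 2. Parse the HTML that comes back
soup = BeautifulSoup(response.text, 'lxml')
# 3. Extract the valuable part with a rule, here every link's text and href
for a in soup.find_all('a'):
    print(a.get_text(), a.get('href'))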

II. Python urllib:

# demo01.py (basic usage of urllib)

# Import urllib (part of the standard library, nothing to install)
import urllib.request

# Request Baidu and receive the response
response = urllib.request.urlopen("http://www.baidu.com/")

# Print the page
print(response.read().decode('utf-8'))

# demo02.py (urlopen explained)

# urllib usage
# urlopen: urllib.request.urlopen('url', data, timeout)

import urllib.request
import urllib.parse
import urllib.error
import socket

"""
A:
response = urllib.request.urlopen('http://www.baidu.com/')
print(response.read().decode('utf-8'))

B:
data = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read())

C:
response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
print(response.read())
"""

try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    # e.reason is a socket.timeout instance when the request times out
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")

# demo03.py (response)

# The urllib response object
import urllib.request

response = urllib.request.urlopen("http://www.baidu.com/")

# Print the response type
print(type(response))

# Print the status code
print(response.status)

# Print the response headers
print(response.getheaders())

# demo04.py (Request in detail)

# Request in detail
import urllib.request
from urllib import parse

"""
A:
request = urllib.request.Request('http://www.baidu.com')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

B:
url = "http://httpbin.org/post"
# Specify the request headers (Host should match the requested site)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    "Host": "httpbin.org"
}
# Request data
dict = {
    "name": "Germey"
}
data = bytes(parse.urlencode(dict), encoding='utf-8')
request = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
"""

url = "http://httpbin.org/post"
# Request data
dict = {
    "name": "Germey"
}
data = bytes(parse.urlencode(dict), encoding='utf-8')
request = urllib.request.Request(url=url, data=data, method='POST')
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

# demo05.py (proxy)

# handler (proxy)
import urllib.request

# Replace the placeholder addresses with a real proxy of your own
proxy_handler = urllib.request.ProxyHandler({
    "http": "http://xxx.xxx.xxx.xxx:xxxx",
    "https": "https://xxx.xxx.xxx.xxx:xxxx"
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

# demo06.py (cookie)

# cookie
import http.cookiejar
import urllib.request

"""
A: basic use of http.cookiejar
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

B: MozillaCookieJar saves the site's cookies to a local file
filename = "utils/cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

C: LWPCookieJar also saves the site's cookies to a local file (in LWP format)
filename = "utils/cookie01.txt"
cookie = http.cookiejar.LWPCookieJar(filename)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)

D: use the cookies stored in the file
"""

cookie = http.cookiejar.LWPCookieJar()
cookie.load('utils/cookie01.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

# demo07.py (exception handling)

# Exception handling
import urllib.request
from urllib import error

"""
A: basic use of urllib.error
try:
    response = urllib.request.urlopen('http://www.baidu.com')
except error.URLError as e:
    print(e.reason)

B:
try:
    response = urllib.request.urlopen('http://www.baidu.com/')
    print(response.read().decode('utf-8'))
except error.URLError as e:
    print(e.reason)
else:
    print("*************")

C: timeout
try:
    response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(e.reason)
"""

# A link that does not exist
try:
    response = urllib.request.urlopen("http://www.abcdhaha2.com/")
    html = response.read().decode('utf-8')
    print(html)
except error.URLError as e:
    print(e.reason)

# demo08.py (URL parsing)

from urllib.parse import urlparse
from urllib.parse import urlunparse
from urllib.parse import urljoin
from urllib.parse import urlencode

# Syntax: urlparse("url", scheme='http|https', allow_fragments=True)

# A
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment')
print(type(result))
print(result)

# B
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme="https")
print(result)

# C
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment', allow_fragments=True)
print(result)

# D
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

# E
result = urlparse('https://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)

# F (urlunparse)
data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))

# G (urljoin)
# Syntax: urljoin("base url", "suffix to join")
print(urljoin("https://www.cnblogs.com/xingxingnbsp/p/xxxxxxxxx.html", "12129466.html"))

# H (urlencode)
params = {
    'name': 'hello_urllib',
    'age': 18
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)

III. Python requests:

1. Install the requests library: pip install requests

# demo01.py

# Basic usage of requests
import requests

response = requests.get("http://www.baidu.com")
print(type(response))        # Print the response type
print(response.status_code)  # Print the status code
print(type(response.text))   # Print the type of the response body
print(response.text)         # Print the response body
print(response.cookies)      # Print the response cookies

2. Request methods:

1 requests.get('url')

2 requests.post('url')

3 requests.put('url')

4 requests.patch('url')

5 requests.delete('url')

6 requests.head('url')

7 requests.options('url')
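
Only get and post are demonstrated in the demos below, so here is a minimal sketch of the remaining verbs, assuming httpbin.org is reachable (it simply echoes the request back); the file name is hypothetical:

# demo_verbs.py (not one of the original demos)
import requests

print(requests.put('http://httpbin.org/put', data={'name': 'hello'}).status_code)
print(requests.patch('http://httpbin.org/patch', data={'age': 23}).status_code)
print(requests.delete('http://httpbin.org/delete').status_code)
print(requests.head('http://httpbin.org/get').headers)                  # HEAD returns headers only, no body
print(requests.options('http://httpbin.org/get').headers.get('Allow'))  # OPTIONS lists the allowed methods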

3. Basic GET requests:

# demo02.py

import requests

"""
A:
response = requests.get('http://www.baidu.com')
print(response.text)

B:
response = requests.get('http://httpbin.org/get?name=hello&age=22')
print(response.text)
"""

data = {
    "name": "hello",
    "age": 22
}
response = requests.get('http://httpbin.org/get', params=data)
print(response.text)

4. Parsing JSON:

# demo03.py

# Parse JSON
import requests

response = requests.get('https://api.jinse.com/v6/www/information/list?catelogue_key=news&limit=23&information_id=18762945&flag=down&version=9.9.9&_source=www')
print(type(response))
print(response.json())
print(type(response.json()))

5. Fetching binary data:

# demo04.py

import requests

"""
A:
response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
print(type(response.text))
print(type(response.content))
print(response.text)
print(response.content)
"""

response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
# response.content holds the raw bytes; write them to a file ("with" closes the file automatically)
with open('images/image.png', 'wb') as f:
    f.write(response.content)

6. Adding headers:

# demo05.py

import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.get("http://www.baidu.com", headers=headers)
print(response.text)

7. Basic POST requests:

# demo06.py

import requests

"""
A:
data = {
    "name": "hello",
    "age": 22
}
response = requests.post("http://httpbin.org/post", data=data)
print(response.text)
"""

data = {
    "name": "hello",
    "age": 22
}
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.post("http://httpbin.org/post", data=data, headers=headers)
print(response.text)

8. The response object (response attributes):

# demo07.py

import requests

response = requests.get('http://www.baidu.com')
print(type(response.status_code), response.status_code)  # Type of the status code and the status code itself
print(type(response.headers), response.headers)          # Type of the headers and the response headers
print(type(response.cookies), response.cookies)          # Type of the cookies and the cookies
print(type(response.url), response.url)                  # Type of the URL and the URL
print(type(response.history), response.history)          # Redirect history

9. Checking the status code:

# demo08.py

import requests

"""
A:
response = requests.get('http://www.baidu.com')
# A Python conditional expression
exit() if not response.status_code == requests.codes.ok else print('request successfully')

B:
response = requests.get('http://www.baidu.com')
# A Python conditional expression
exit() if not response.status_code == 200 else print('request successfully')
"""

response = requests.get('http://www.baidu.com')
if not response.status_code == 200:
    exit()
else:
    print('request successfully')

# The three variants above mean exactly the same thing

10. Advanced usage:

# demo09.py

import requests

# A: Uploading files ----------------------------------------------------------------
# Note: httpbin.org/post echoes the uploaded file back (the original posted to baidu.com, which does not accept uploads)
files = {
    "files": open('images/image.png', 'rb')
}
response = requests.post('http://httpbin.org/post', files=files)
print(response.text)

# B: Getting cookies -------------------------------------------------------------
response = requests.get('http://www.baidu.com')
print(response.cookies)
for key, value in response.cookies.items():
    print(key + "=" + value)

# C: Session persistence --------------------------------------------------------------
# Two separate requests.get() calls do not share cookies, so the cookie set here is lost
requests.get('http://httpbin.org/cookies/set/number/123456789')
response = requests.get('http://httpbin.org/cookies')
print(response.text)

# A Session object keeps cookies across requests
s = requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456789')
response = s.get('http://httpbin.org/cookies')
print(response.text)

# D: Proxies --------------------------------------------------------------
# Option 1:
proxies = {
    'http': 'http://ip:port',
    'https': 'https://ip:port'
}
response = requests.get('http://www.baidu.com', proxies=proxies)
print(response.status_code)

# Option 2 (proxy with username and password):
proxies = {
    'http': 'http://user:password@ip:port/',
    'https': 'https://user:password@ip:port/'
}
response = requests.get('http://www.baidu.com', proxies=proxies)
print(response.status_code)

# Option 3 (SOCKS5 proxy):
proxies = {
    'http': 'socks5://ip:port',
    'https': 'socks5://ip:port'
}
response = requests.get('http://www.baidu.com', proxies=proxies)
print(response.status_code)

# E: Certificate verification ----------------------------------------------------------------
response = requests.get('http://www.12306.cn')
print(response.status_code)

response = requests.get('http://www.12306.cn', verify=False)
print(response.status_code)

# Change 'path/server.crt', 'path/key' to your own certificate paths
response = requests.get('http://www.12306.cn', cert=('path/server.crt', 'path/key'))
print(response.status_code)

# F: Timeouts ----------------------------------------------------------------
from requests.exceptions import ReadTimeout
try:
    response = requests.get('http://www.taobao.com', timeout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("Timeout")

# G: Authentication ----------------------------------------------------------------
from requests.auth import HTTPBasicAuth
response = requests.get('http://www.taobao.com', auth=HTTPBasicAuth('user', '123'))
print(response.status_code)

response = requests.get('http://www.taobao.com', auth=('user', '123'))
print(response.status_code)

# H: Exception handling ----------------------------------------------------------------
from requests.exceptions import ReadTimeout, ConnectionError, HTTPError, RequestException
try:
    response = requests.get('http://www.taobao.com', timeout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("Timeout")
except HTTPError:
    print("HTTPError")
except ConnectionError:
    print("ConnectionError")
except RequestException:
    print("Error")

IV. BeautifulSoup in detail (an HTML parser):

1. Install: pip install beautifulsoup4 (the lxml parser used below also needs: pip install lxml)

2. Basic usage of BeautifulSoup:

# demo01.py

# Basic usage of BeautifulSoup

from bs4 import BeautifulSoup

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div>

<h2>这是一个列表</h2>

<ul>

<li>选项1</li>

<li>选项2</li>

<li>选项3</li>

<li>选项4</li>

<li>选项5</li>

<li>选项6</li>

<li>选项7</li>

<li>选项8</li>

<li>选项9</li>

</ul>

</div>

</body>

</html>

"""

soup = BeautifulSoup(html, 'lxml')

print(soup.prettify())

print(soup.title.string)

3. Tag selectors (each returns only the first match):

# demo02.py

# BeautifulSoup tag selectors (only the first matching tag is returned)

from bs4 import BeautifulSoup

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div>

<h2>这是一个列表</h2>

<ul>

<li>选项1</li>

<li>选项2</li>

<li>选项3</li>

<li>选项4</li>

<li>选项5</li>

<li>选项6</li>

<li>选项7</li>

<li>选项8</li>

<li>选项9</li>

</ul>

</div>

</body>

</html>

"""

soup = BeautifulSoup(html, 'lxml')

print(soup.title)

print(type(soup.title))

print(soup.head)

print(soup.li)

4. Getting a tag's name:

# demo03.py

# BeautifulSoup: get a tag's name

from bs4 import BeautifulSoup

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

</body>

</html>

"""

soup = BeautifulSoup(html, 'lxml')

print(soup.title.name)

5. Getting tag attributes:

# demo04.py

# BeautifulSoup: get tag attributes

from bs4 import BeautifulSoup

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<p class="font-p"></p>

<a href="http://www.baidu.com">百度一下 你就知道</a>

</body>

</html>

"""

soup = BeautifulSoup(html, 'lxml')

print(soup.p.attrs)

print(soup.p.attrs["class"])

print(soup.a.attrs["href"])

6. Getting text content:

# demo05.py

# BeautifulSoup: get a tag's text content

from bs4 import BeautifulSoup

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<p>div</p>

<a href="http://www.baidu.com">百度一下 你就知道</a>

</body>

</html>

"""

soup = BeautifulSoup(html, 'lxml')

print(soup.p.string)

print(soup.a.string)

7. Nested selection:

# demo06.py

# Nested selection

from bs4 import BeautifulSoup

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div>

<h2>这是一个列表</h2>

<ul>

<li>选项1</li>

</ul>

</div>

</body>

</html>

"""

soup = BeautifulSoup(html, 'lxml')

print(soup.ul.li.string)

8. Child and descendant nodes:

# demo07.py

# Child and descendant nodes

from bs4 import BeautifulSoup

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div>

<h2>这是一个列表</h2>

<ul><li>选项1</li><li>选项2</li><li><a href="http://www.baidu.com">百度一下 你就知道</a></li></ul>

</div>

</body>

</html>

"""

soup = BeautifulSoup(html, 'lxml')
print(soup.ul.contents)     # All direct children, returned as a list
print(soup.ul.children)     # All direct children, returned as an iterator
print(soup.ul.descendants)  # All descendants, returned as a generator
for i, child in enumerate(soup.ul.descendants):
    print(i, child)

9. Parent and ancestor nodes:

# demo08.py

# Parent and ancestor nodes

from bs4 import BeautifulSoup

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>Title</title>

</head>

<body>

<div>

<ol>

<li><a href="http://www.baidu.com">百度一下 你就知道</a></li>

</ol>

</div>

</body>

</html>

"""

soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent)         # The direct parent node
print(type(soup.a.parents))  # All ancestor nodes (a generator)
print(list(enumerate(soup.a.parents)))

10. Sibling nodes:

# demo09.py

# Sibling nodes

from bs4 import BeautifulSoup

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div>

<h1>我是一个大大的H1</h1>

<h2>我是一个大大的H2</h2>

<p>我是一个简单的p标签</p>

<h3>我是一个大大的H3</h3>

<h4>我是一个大大的H4</h4>

</div>

</body>

</html>

"""

html = html.replace('\n', '').replace(' ', '')  # Strip the "\n" characters and spaces from the HTML
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.p.next_siblings)))      # All siblings after the current node
print(list(enumerate(soup.p.previous_siblings)))  # All siblings before the current node

11. Standard selectors (*** key point ***)

# demo10.py

from bs4 import BeautifulSoup

# Standard selectors (important, worth re-reading)
# Syntax: find_all(name, attrs, recursive, text, **kwargs)

"""
find returns the first matching element; find_all returns all matching elements

1. find_parent()         # Returns the direct parent node
2. find_parents()        # Returns all ancestor nodes
3. find_next_sibling()   # Returns the first sibling after the current node
4. find_next_siblings()  # Returns all siblings after the current node
5. find_all_next()       # Returns all matching nodes after the current node
6. find_next()           # Returns the first matching node after the current node
7. find_all_previous()   # Returns all matching nodes before the current node
8. find_previous()       # Returns the first matching node before the current node

(A short sketch of these navigation methods follows after demo10.py below.)
"""

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<h2>这是一个列表</h2>

<ul id="list-1">

<li class="zhangsan">选项1</li>

<li class="zhangsan">选项2</li>

<li class="zhangsan">选项3</li>

</ul>

<ul id="list-2">

<li class="lisi">选项1</li>

<li class="lisi">选项2</li>

<li class="lisi">选项3</li>

</ul>

</div>

</body>

</html>

"""

# A: name --------------------------------------------------------------
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('ul'))           # All ul tags, returned as a list
print(type(soup.find_all('ul')[0]))  # Type of a single result
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))

# B: attrs -------------------------------------------------------------
# Option 1:
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={"id": "list-1"}))   # All elements with id "list-1"
print(soup.find_all(attrs={"class": "lisi"}))  # All elements with class "lisi"

# Option 2:
print(soup.find_all(id="list-1"))    # All elements with id "list-1"
print(soup.find_all(class_="lisi"))  # All elements with class "lisi"

# Both options produce the same result

# C: text --------------------------------------------------------------
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text="选项1"))

# D: CSS selectors (***) -----------------------------------------------------
# 1:
soup = BeautifulSoup(html, 'lxml')
print(soup.select('#list-2'))    # ID selector
print(soup.select('.zhangsan'))  # class selector
print(soup.select('ul li'))      # tag selector
print(soup.select('#divid h2'))  # ID and tag combined

# 2:
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))

# 3: Getting attributes
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.get('id'))
    print(ul['id'])

# 4: Getting text
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())
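
The navigation methods listed in the docstring at the top of demo10.py are not exercised above; here is a minimal sketch that keeps using the same html and soup objects (the variable name first_li is only illustrative):

first_li = soup.find('li', class_='zhangsan')  # the first <li class="zhangsan">
print(first_li.find_parent('ul'))              # its enclosing <ul id="list-1">
print(first_li.find_next_sibling('li'))        # the <li> immediately after it
print(first_li.find_next_siblings('li'))       # all later <li> siblings
print(first_li.find_previous('h2'))            # the nearest <h2> before it in the document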

V. pyquery in detail

1. Install: pip install pyquery

2. Initialization:

# demo01.py

# Initialization

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<h2>这是一个列表</h2>

<ul id="list-1">

<li class="zhangsan">选项1</li>

<li class="zhangsan">选项2</li>

<li class="zhangsan">选项3</li>

</ul>

<ul id="list-2">

<li class="lisi">选项1</li>

<li class="lisi">选项2</li>

<li class="lisi">选项3</li>

</ul>

</div>

</body>

</html>

"""

# A: Initialize from a string -------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
print(doc('li'))

# B: Initialize from a URL ----------------------------------------------------------------------------------------------------------
doc = PyQuery(url="http://www.baidu.com")
print(doc('head'))

# C: Initialize from a file (create index.html in the same directory, with the same markup as above) --------------------------------
# This approach can raise: UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 187: illegal multibyte sequence
# Removing the Chinese characters from the HTML file works around it, but that is not a real fix
# doc = PyQuery(filename='index.html')
# print(doc('li'))

# A cleaner workaround: read the file yourself with an explicit encoding
with open("index.html", "r", encoding="utf-8") as f:
    doc = f.read()
result = PyQuery(doc)
print(result('li'))

3. Basic CSS selectors:

# demo02.py

# Basic CSS selectors

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<h2>这是一个列表</h2>

<ul id="list-1">

<li class="zhangsan">选项1</li>

<li class="zhangsan">选项2</li>

<li class="zhangsan">选项3</li>

</ul>

<ul id="list-2">

<li class="lisi">选项1</li>

<li class="lisi">选项2</li>

<li class="lisi">选项3</li>

</ul>

</div>

</body>

</html>

"""

doc = PyQuery(html)
print(doc('#divid #list-1 li'))

4. Finding elements:

A: Child elements

# demo03.py

# Child elements

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<h2>这是一个列表</h2>

<ul id="list-1">

<li class="zhangsan">选项1</li>

<li class="zhangsan">选项2</li>

<li class="zhangsan">选项3</li>

</ul>

<ul id="list-2">

<li class="lisi">选项1</li>

<li class="lisi">选项2</li>

<li class="lisi">选项3</li>

</ul>

</div>

</body>

</html>

"""

doc = PyQuery(html)
items = doc('#list-1')
print(type(items))
print(items)
li_list = items.find('li')
print(type(li_list))
print(li_list)

B: Parent elements

# demo04.py

# Parent elements

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<h2>这是一个列表</h2>

<ul id="list-1">

<li class="zhangsan">选项1</li>

<li class="zhangsan">选项2</li>

<li class="zhangsan">选项3</li>

</ul>

<ul id="list-2">

<li class="lisi">选项1</li>

<li class="lisi">选项2</li>

<li class="lisi">选项3</li>

</ul>

</div>

</body>

</html>

"""

doc = PyQuery(html)
items = doc('#list-1')
container = items.parent()
print(type(container))
print(container)
parents = items.parents()
print(type(parents))
print(parents)

C: Sibling elements

# demo05.py

# Sibling elements

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<h2>这是一个列表</h2>

<ul id="list-1">

<li class="zhangsan">选项1</li>

<li class="zhangsan">选项2</li>

<li class="zhangsan">选项3</li>

</ul>

<ul id="list-2">

<li class="lisi">选项1</li>

<li class="lisi">选项2</li>

<li class="lisi">选项3</li>

</ul>

</div>

</body>

</html>

"""

doc = PyQuery(html)
lis = doc('#list-1 .zhangsan')
print(lis.siblings())
print(lis.siblings('.zhangsan'))

D: Iteration

# demo06.py

# Iteration

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<h2>这是一个列表</h2>

<ul id="list-1">

<li class="zhangsan">选项1</li>

<li class="zhangsan">选项2</li>

<li class="zhangsan">选项3</li>

</ul>

<ul id="list-2">

<li class="lisi">选项1</li>

<li class="lisi">选项2</li>

<li class="lisi">选项3</li>

</ul>

</div>

</body>

</html>

"""

doc = PyQuery(html)
lis = doc('#list-2 .lisi')
print(lis)
li_list = doc('.lisi').items()
print(type(li_list))
for li in li_list:
    print(li)

E: Getting information (tag attributes)

# demo07.py

# Getting information (attributes)

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<a href="http://www.baidu.com">百度一下 你就知道</a>

</div>

</body>

</html>

"""

doc = PyQuery(html)
a = doc('#divid a')
print(a)
print(a.attr('href'))
print(a.attr.href)

F: Getting text

# demo08.py

# Getting text

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<a href="http://www.baidu.com">百度一下 你就知道</a>

</div>

</body>

</html>

"""

doc = PyQuery(html)
a = doc('#divid a')
print(a)
print(a.text())

G: Getting HTML

# demo09.py

# Getting HTML

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<a href="http://www.baidu.com">百度一下 你就知道</a>

</div>

</body>

</html>

"""

doc = PyQuery(html)
div = doc('#divid')
print(div)
print(div.html())

H: DOM manipulation

# demo10.py

# DOM manipulation

from pyquery import PyQuery

html = """

<!DOCTYPE html>

<html lang="en">

<head>

<meta charset="UTF-8">

<title>BeautifulSoup 学习</title>

</head>

<body>

<h1>BeautifulSoup</h1>

<div id="divid">

<h2>这是一个列表</h2>

<ul id="list-1">

<li class="zhangsan">选项1</li>

<li class="zhangsan">选项2</li>

<li class="zhangsan">选项3</li>

</ul>

<ul id="list-2">

<li class="lisi">选项1</li>

<li class="lisi">选项1</li>

<li class="lisi">选项1</li>

</ul>

</div>

</body>

</html>

"""

# 1. addClass, removeClass ----------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc('.lisi')
print(li)
li.remove_class('lisi')
print(li)
li.add_class('zhangsan')
print(li)

# 2. attr, css ----------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc('.zhangsan')
print(li)
li.attr('name', 'link')
print(li)
li.css('font-size', '40px')
print(li)

# 3. remove ------------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
div = doc('#divid')
print(div.text())
# Remove the <h2> from the div, then print the remaining text
div.find('h2').remove()
print(div.text())

# 4. Pseudo-class selectors ---------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc('.zhangsan:first-child')      # The first item in the list
print(li)
li = doc('.zhangsan:last-child')       # The last item in the list
print(li)
li = doc('.zhangsan:nth-child(2)')     # The second item in the list
print(li)
li = doc('.zhangsan:gt(0)')            # All items with index greater than 0
print(li)
li = doc('.zhangsan:nth-child(1n)')    # Every item from the first one onward (including the first)
print(li)
li = doc('.zhangsan:contains(选项3)')  # The item whose text contains "选项3"
print(li)

VI. selenium in detail (an automated testing tool)

In web scraping, selenium is mainly used to handle pages rendered with JavaScript.

1. Install: pip install selenium

2. Basic usage:

# demo01.py

from selenium import webdriver

from selenium.webdriver.common.keys import Keys

"""
Goal: perform a Baidu search
1. Create a browser object and request Baidu
2. Locate the search input box
3. Type the search term
4. Press Enter
"""

# Create the browser object (Chrome is used here)
browser = webdriver.Chrome()
try:
    # Request Baidu
    browser.get("http://www.baidu.com")
    # Locate the search input box
    input = browser.find_element_by_id('kw')
    # Type the search term
    input.send_keys("selenium")
    # Press Enter
    input.send_keys(Keys.ENTER)
    # Print the current URL
    print(browser.current_url)
    # Print the cookies
    print(browser.get_cookies())
    # Print the page source
    print(browser.page_source)
except Exception as e:
    print(e, "=============================")
finally:
    browser.close()

"""

有可能会遇到的错误

1. selenium.common.exceptions.WebDriverException: Message: \'chromedriver\' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home

这是由于程序找不到 chromedriver 驱动

解决:

下载 chromedriver (http://chromedriver.storage.googleapis.com/index.html)

注意版本:版本对照表 (https://blog.csdn.net/BinGISer/article/details/88559532)

2. selenium.common.exceptions.SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 78

这是由于 ChromeDriver 和 Chrome 版本不对应

解决:

删除之前下载的 chromedriver

重新下载 chromedriver (http://chromedriver.storage.googleapis.com/index.html)

注意版本:版本对照表 (https://blog.csdn.net/BinGISer/article/details/88559532)

大功告成

"""

3. Creating a browser object

# demo02.py

# selenium: creating a browser object (each line needs the matching driver installed)
from selenium import webdriver

browser = webdriver.Chrome()     # Chrome
browser = webdriver.Firefox()    # Firefox
browser = webdriver.Edge()       # Edge
browser = webdriver.PhantomJS()  # Headless PhantomJS browser
browser = webdriver.Safari()     # Safari

4. Visiting a page

# demo03.py

import time
from selenium import webdriver

# Create the browser object
browser = webdriver.Chrome()
# Visit Taobao
browser.get('https://www.taobao.com')
# Maximize the browser window
browser.maximize_window()
# Wait 5 seconds
time.sleep(5)
# Print the page source
print(browser.page_source)
# Close the browser
browser.close()

5. Finding elements (a single element)

# demo04.py

# Finding elements (a single element)
from selenium import webdriver

# Create the browser object
browser = webdriver.Chrome()
# Visit Taobao
browser.get('https://www.taobao.com')
# Maximize the browser window
browser.maximize_window()
# Locate the Taobao search box (all three ways work)
input_id = browser.find_element_by_id('q')
input_selector = browser.find_element_by_css_selector('#q')
input_xpath = browser.find_element_by_xpath('//*[@id="q"]')
print(input_id)
print(input_selector)
print(input_xpath)
# Close the browser
browser.close()

"""
Common methods for finding a single element:

browser.find_element_by_xpath()

browser.find_element_by_name()

browser.find_element_by_link_text()

browser.find_element_by_partial_link_text()

browser.find_element_by_tag_name()

browser.find_element_by_class_name()

browser.find_element_by_css_selector()

"""

6. Finding elements (multiple elements)

# demo05.py

# Finding elements (multiple elements)
from selenium import webdriver

# Create the browser object
browser = webdriver.Chrome()
# Visit Taobao
browser.get('https://www.taobao.com')
# Maximize the browser window
browser.maximize_window()
# Find all elements with class "J_Cat a-all"
li_list = browser.find_elements_by_css_selector('.J_Cat')
print(li_list)
# Close the browser
browser.close()

"""
Common methods for finding multiple elements:

browser.find_elements_by_xpath()

browser.find_elements_by_name()

browser.find_elements_by_link_text()

browser.find_elements_by_partial_link_text()

browser.find_elements_by_tag_name()

browser.find_elements_by_class_name()

browser.find_elements_by_css_selector()

"""

7. Interacting with elements

# demo06.py

import time
from selenium import webdriver

# Create the browser object
browser = webdriver.Chrome()
# Visit Taobao
browser.get("https://www.taobao.com")
# Maximize the window
browser.maximize_window()
# Locate the search box
input = browser.find_element_by_id('q')
# Type "内存条" (RAM)
input.send_keys("内存条")
time.sleep(3)
# Clear the search box
input.clear()
time.sleep(5)
# Type "1T硬盘" (1 TB hard drive)
input.send_keys("1T硬盘")
# Locate the search button
button = browser.find_element_by_class_name('btn-search')
# Click the search button
button.click()
time.sleep(10)
# Close the browser
browser.close()

8. Executing JavaScript

# demo07.py

# Executing JavaScript
from selenium import webdriver

browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
# Scroll to the bottom of the page
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
# Pop up an alert box
browser.execute_script('alert("To Bottom")')

9. Getting element information (attributes)

# demo08.py

# Getting element information (attributes)
from selenium import webdriver

browser = webdriver.Chrome()
url = "https://www.zhihu.com/"
browser.get(url)
logo = browser.find_element_by_css_selector('.SignFlowHomepage-logo')
print(logo)
print(logo.get_attribute('src'))
browser.close()

10. Getting element information (text)

# demo09.py

# Getting element information (text)
from selenium import webdriver

browser = webdriver.Chrome()
url = "https://www.zhihu.com/explore"
browser.get(url)
input = browser.find_element_by_id('Popover1-toggle')
input.send_keys('新冠病毒')
print(input.text)

11. Getting element information (ID, location, tag name, size)

# demo10.py

# Getting element information (ID, location, tag name, size)
from selenium import webdriver

browser = webdriver.Chrome()
url = "https://www.zhihu.com/explore"
browser.get(url)
input = browser.find_element_by_id('Popover1-toggle')
print(input.id)
print(input.location)
print(input.tag_name)
print(input.size)
browser.close()

12. Getting element information (iframe)

# demo11.py

# Getting element information (iframe)
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

browser = webdriver.Chrome()
url = "https://www.runoob.com/try/try.php?filename=tryjquery_hide"
browser.get(url)
# Switch into the child frame; elements outside it are no longer visible
browser.switch_to.frame('iframeResult')
button = browser.find_element_by_css_selector('button')
print(button)
try:
    logo = browser.find_element_by_class_name('logo')
except NoSuchElementException:
    print('NO LOGO')
finally:
    # Switch back to the parent frame, where the logo can be found
    browser.switch_to.parent_frame()

logo = browser.find_element_by_class_name('logo')
print(logo)
print(logo.text)
browser.close()

13. Waits

# demo12.py

# Waits

"""
An explicit wait is a conditional wait; an implicit wait is an unconditional wait.

Implicit wait:
With an implicit wait, if WebDriver does not find an element in the DOM it keeps waiting, and only raises a "no such element"
exception once the configured time runs out. In other words, when an element is not immediately available, the driver waits
for a while before querying the DOM again. The default wait time is 0.

Explicit wait:
You specify a condition and a maximum wait time. If the element is not found within that time, an exception is raised.
The code after the wait only runs once the condition is met, which makes this option more flexible.
It mainly relies on the expected_conditions classes under selenium.webdriver.support.
"""

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('http://www.taobao.com')
browser.maximize_window()
# Implicit wait: every element lookup waits up to 10 seconds
browser.implicitly_wait(10)
# Explicit wait: wait up to 10 seconds for the given conditions
wait = WebDriverWait(browser, 10)
input = wait.until(EC.presence_of_all_elements_located((By.ID, 'q')))
button = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.btn-search')))
print(input)
print(button)
browser.close()

14. Browser back and forward

# demo13.py

# Browser back and forward
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
time.sleep(1)
browser.get('https://www.taobao.com')
time.sleep(1)
browser.get('https://www.cnblogs.com/xingxingnbsp/')
time.sleep(1)
browser.back()
time.sleep(2)
browser.forward()
time.sleep(2)
browser.close()

15. Cookies

# demo14.py

# Cookies
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())
browser.add_cookie({"name": "name", "domain": "www.zhihu.com", "value": "germey"})
print(browser.get_cookies())
browser.delete_all_cookies()
print(browser.get_cookies())
browser.close()

16. Tab management (the switch_to_window call below is deprecated in newer Selenium versions)

# demo15.py

# Tab management
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
time.sleep(2)
# Open a new tab via JavaScript
browser.execute_script('window.open()')
print(browser.window_handles)
# Switch to the second tab
browser.switch_to_window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(2)
browser.get('https://www.cnblogs.com/xingxingnbsp/')
time.sleep(3)
browser.close()

17. Exception handling

# demo16.py

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
except TimeoutException:
    print('Time Out')

try:
    browser.find_element_by_id('hello')
except NoSuchElementException:
    print('No Element')
finally:
    browser.close()

Excerpted from: https://www.cnblogs.com/xingxingnbsp/p/12129466.html
