Python3爬虫(十四) 验证码处理

python

 Infi-chu:

http://www.cnblogs.com/Infi-chu/

一、图形验证码识别
1.使用tesserocr

import tesserocr

from PIL import Image

# Open a locally stored captcha image and run OCR on the in-memory image.
image = Image.open('test.jpg')
result = tesserocr.image_to_text(image)
print(result)

# Shortcut: let tesserocr read the image file from disk and return its text.
import tesserocr
print(tesserocr.file_to_text('test.jpg'))

2.处理验证码图片
convert()方法,可将图片转化为灰度图像、二值化图像

# Convert the captcha to a greyscale ("L" mode) image and preview it.
image = image.convert('L')
image.show()

# Pillow's built-in binarisation ("1" mode). It is kept under its own name so
# that the custom-threshold pass below still starts from the greyscale image
# instead of from an image that has already been reduced to two levels.
# NOTE(review): the article states the default threshold is 127 - confirm
# against the Pillow documentation for the installed version.
default_binary = image.convert('1')

# Custom binarisation: build a 256-entry lookup table that maps every grey
# level below the threshold to 0 and every other level to 1.
threshold = 80
table = [0 if level < threshold else 1 for level in range(256)]
image = image.point(table, '1')
image.show()  # the characters should now stand out more clearly

# Run OCR on the cleaned-up image.
result = tesserocr.image_to_text(image)
print(result)

二、滑动验证码识别
滑动验证码就如同用一块拼图去在图片中填充
1.滑动验证码特点:
防模拟
防伪造
防暴力

2.如何识别:
采用浏览器模拟验证

3.初始化:

# Placeholder credentials used to fill in the login form.
EMAIL = 'test@test.com'
PASSWORD = '123456'


class CrackGeetest():
    """Holds the browser session and credentials for the Geetest login page."""

    def __init__(self):
        """Start a Chrome session and prepare an explicit wait helper."""
        self.url = 'https://account.geetest.com/login'
        self.browser = webdriver.Chrome()
        # Explicit wait bound to this browser, created with a timeout of 20.
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD

4.模拟点击:

# 寻找按钮

def get_geetest_button(self):
    """Wait for the initial verification button and return it.

    Blocks on ``self.wait`` until the element with class
    ``geetest_radar_tip`` is clickable.
    """
    button = self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip'))
    )
    return button

# 点击验证按钮

# Locate the verification button and click it in one step.
self.get_geetest_button().click()

5.识别缺口:
首先对比原图和现图,利用selenium选取图片元素,得到位置和size,然后获取截图

# 

# 获取位置和size

def position(self):
    """Return the captcha image's bounding box on the page.

    Returns:
        A ``(top, bottom, left, right)`` tuple computed from the element's
        ``location`` and ``size``.
    """
    img = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img'))
    )
    # Pause before reading the geometry (presumably to let the image finish
    # rendering - confirm against the target page).
    time.sleep(2)
    location = img.location
    size = img.size
    top = location['y']
    bottom = location['y'] + size['height']
    left = location['x']
    right = location['x'] + size['width']
    return (top, bottom, left, right)

# 获取网页截图

def get_geetest_image(self, name='captcha.png'):
    """Crop the captcha out of a full-page screenshot and return it.

    Args:
        name: File name for the captcha; currently unused by this function.

    Returns:
        The cropped captcha image, i.e. the result of ``screenshot.crop``.
    """
    # position() yields (top, bottom, left, right) for the captcha element.
    top, bottom, left, right = self.position()
    print('验证码位置', top, bottom, left, right)
    screenshot = self.get_screenshot()
    # crop() expects the box as (left, upper, right, lower).
    captcha = screenshot.crop((left, top, right, bottom))
    return captcha

# 获取第二张图片(带有缺口的图片)

def get_slider(self):
    """Wait for the slider handle and return it.

    Blocks on ``self.wait`` until the element with class
    ``geetest_slider_button`` is clickable.
    """
    slider = self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button'))
    )
    return slider

# 点击后出现缺口

# Fetch the slider handle and click it in one step.
self.get_slider().click()

# 再调用 get_geetest_image() 函数获取第二张图,将两张图分别命名为 img1 和 img2

'''

对比图像的缺口,需要遍历图片的每一个坐标点,获取两张图片对应像素点的RGB数据,如果差距在一定范围内,则代表两个像素相同,接着继续对比下一个像素点。如果差距在一定范围之外,则说明不是相同的像素点,则该位置就是缺口位置

'''

def is_pixel_equal(self, img1, img2, x, y, threshold=60):
    """Tell whether two images have (nearly) the same colour at one point.

    Args:
        img1: First image; must provide ``load()`` returning pixel access.
        img2: Second image, same requirements as ``img1``.
        x: Column of the pixel to compare.
        y: Row of the pixel to compare.
        threshold: A channel counts as equal when the absolute difference
            is strictly below this value. Defaults to 60.

    Returns:
        True when the first three channels all differ by less than
        ``threshold``, otherwise False.
    """
    pixel1 = img1.load()[x, y]
    pixel2 = img2.load()[x, y]
    # Only the first three channels (R, G, B) take part in the comparison.
    return all(
        abs(pixel1[channel] - pixel2[channel]) < threshold
        for channel in range(3)
    )

def get_gap(self, img1, img2):
    """Find the x offset of the first column where the two images differ.

    Args:
        img1: Reference image; ``img1.size`` gives ``(width, height)``.
        img2: Image to compare against ``img1``.

    Returns:
        The x coordinate of the first differing pixel, scanning column by
        column, or the starting offset (60) when no pixel differs.
    """
    # Scanning starts at x = 60 (presumably to skip the slider piece drawn
    # at the left edge - confirm against the captcha layout).
    left = 60
    width, height = img1.size[0], img1.size[1]
    for i in range(left, width):
        for j in range(height):
            if not self.is_pixel_equal(img1, img2, i, j):
                return i
    return left

6.模拟拖动:

def get_track(self, distance):
    """Build a list of per-step x offsets that covers ``distance``.

    The motion accelerates over the first 4/5 of the distance and
    decelerates over the remainder, using the uniform-acceleration
    formulas ``v = v0 + a*t`` and ``s = v0*t + a*t**2 / 2``.

    Args:
        distance: Total offset to cover.

    Returns:
        A list of integer step sizes (each step rounded), empty when
        ``distance`` is not positive.
    """
    track = []
    current = 0
    # Switch from acceleration to deceleration at 4/5 of the distance.
    mid = distance * 4 / 5
    t = 0.2  # duration of one step
    v = 0    # current velocity
    while current < distance:
        a = 2 if current < mid else -3
        v0 = v
        v = v0 + a * t
        # "**" is exponentiation; "^" would be bitwise XOR and fails on floats.
        move = v0 * t + 1 / 2 * a * t ** 2
        current += move
        track.append(round(move))
    return track

def move_to_gap(self, slider, tracks):
    """Drag the slider along the given sequence of x offsets.

    Args:
        slider: The slider element to press and hold.
        tracks: Iterable of horizontal offsets, applied one at a time.
    """
    # Press and hold the slider handle.
    ActionChains(self.browser).click_and_hold(slider).perform()
    # Replay every step as its own horizontal mouse movement.
    for step in tracks:
        drag = ActionChains(self.browser).move_by_offset(xoffset=step, yoffset=0)
        drag.perform()
    # Pause briefly before letting go of the mouse button.
    time.sleep(0.3)
    ActionChains(self.browser).release().perform()

三、点触验证码识别
1.和12306的验证码类似
2.思路:
文字识别、图像识别
3.使用超级鹰平台识别
修改Python API

import requests

from hashlib import md5

class Chaojiying(object):
    """Small HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """Store the account data that is sent along with every request.

        Args:
            username: Chaojiying account name.
            password: Plain-text password; only its MD5 hex digest is kept.
            soft_id: Software ID sent as ``softid``.
        """
        self.username = username
        # Only the MD5 hex digest is stored and sent (as 'pass2').
        self.password = md5(password.encode('utf-8')).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
        }

    def post_pic(self, im, codetype):
        """Upload a captcha image for recognition.

        Args:
            im: Image content uploaded as the ``userfile`` form file.
            codetype: Captcha type code sent as ``codetype``.

        Returns:
            The decoded JSON body of the response.
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('test.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def report_error(self, im_id):
        """Report a previously submitted image by its ID.

        Args:
            im_id: Image ID sent as ``id``.

        Returns:
            The decoded JSON body of the response.
        """
        params = {'id': im_id}
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

4.初始化:

# Placeholder credentials for the target site's login form.
EMAIL = 'test@test.com'
PASSWORD = ''

# Chaojiying account settings.
CHAOJIYING_USERNAME = 'test'
CHAOJIYING_PASSWORD = ''
CHAOJIYING_SOFT_ID = 893590  # software ID
CHAOJIYING_KIND = 9102  # captcha type code passed to post_pic()


class CrackTouClick():
    """Holds the browser session, credentials and Chaojiying client."""

    def __init__(self):
        """Start a Chrome session and create the recognition client."""
        self.url = '输入要识别的网站'
        self.browser = webdriver.Chrome()
        # Explicit wait bound to this browser, created with a timeout of 20.
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD
        # Chaojiying takes (username, password, soft_id); the captcha kind is
        # supplied per request through post_pic() instead.
        self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAOJIYING_SOFT_ID)

5.获取验证码:

def open(self):
    """Load the login page and fill in the e-mail and password fields."""
    self.browser.get(self.url)
    email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
    password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
    # Each credential goes into its own field.
    email.send_keys(self.email)
    password.send_keys(self.password)

def get_touclick_button(self):
    """Wait for the initial verification button and return it.

    Blocks on ``self.wait`` until the element with class
    ``touclick-hod-wrap`` is clickable.
    """
    button = self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'touclick-hod-wrap'))
    )
    return button

def get_touclick_element(self):
    """Wait for the captcha container element and return it.

    Blocks on ``self.wait`` until the element with class
    ``touclick-pub-content`` is present in the DOM.
    """
    element = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'touclick-pub-content'))
    )
    return element

def get_position(self):
    """Return the captcha element's bounding box on the page.

    Returns:
        A ``(top, bottom, left, right)`` tuple computed from the element's
        ``location`` and ``size``.
    """
    element = self.get_touclick_element()
    # Pause before reading the geometry (presumably to let the captcha
    # finish rendering - confirm against the target page).
    time.sleep(1)
    location = element.location
    size = element.size
    top = location['y']
    bottom = location['y'] + size['height']
    left = location['x']
    right = location['x'] + size['width']
    return (top, bottom, left, right)

def get_screenshot(self):
    """Take a browser screenshot and return it as an opened image."""
    png_bytes = self.browser.get_screenshot_as_png()
    # Wrap the PNG bytes in a file-like object so Image.open can read them.
    return Image.open(BytesIO(png_bytes))

def get_touclick_image(self, name='captcha.png'):
    """Crop the captcha out of a full-page screenshot and return it.

    Args:
        name: File name for the captcha; currently unused by this function.

    Returns:
        The cropped captcha image, i.e. the result of ``screenshot.crop``.
    """
    top, bottom, left, right = self.get_position()
    print('验证码位置', top, bottom, left, right)
    screenshot = self.get_screenshot()
    # crop() expects the box as (left, upper, right, lower).
    captcha = screenshot.crop((left, top, right, bottom))
    return captcha

6.识别验证码:

# Grab the captcha crop and serialise it to PNG bytes in memory.
image = self.get_touclick_image()
bytes_array = BytesIO()
image.save(bytes_array, format='PNG')

# Upload the raw PNG bytes together with the captcha type code.
res = self.chaojiying.post_pic(bytes_array.getvalue(), CHAOJIYING_KIND)
print(res)

def get_points(self, captcha_result):
    """Parse the recognised click coordinates out of a result mapping.

    The ``pic_str`` entry is split on ``|`` into groups, and each group is
    split on ``,`` into integers.

    Args:
        captcha_result: Mapping that may contain a ``pic_str`` entry.

    Returns:
        A list of integer lists, one per group; an empty list when
        ``pic_str`` is missing or empty.
    """
    pic_str = captcha_result.get('pic_str')
    if not pic_str:
        # Nothing was recognised, so there is nothing to click.
        return []
    groups = pic_str.split('|')
    locations = [[int(number) for number in group.split(',')] for group in groups]
    return locations

def touch_click_words(self, locations):
    """Click each given point, as an offset relative to the captcha element.

    Args:
        locations: Iterable of ``[x, y]`` offsets.
    """
    for point in locations:
        print(point)
        chain = ActionChains(self.browser)
        # The element is looked up again for every click.
        target = self.get_touclick_element()
        chain.move_to_element_with_offset(target, point[0], point[1]).click().perform()
        # Pause between consecutive clicks.
        time.sleep(1)

 

以上是 Python3爬虫(十四) 验证码处理 的全部内容, 来源链接: utcz.com/z/386561.html

回到顶部