[深度学习]Tensorflow破解验证码
缘起
- 登录
- IP
- 验证码
在爬虫的世界里,大家一定会经常遇到以上这些问题。除了甩锅给第三方打码平台、IP提供商,或者使用 copy cookie 这样的方案之外,大家也可以尝试自己解决,多去思考一些原理。本次就对爬虫中遇到的字符型验证码做一些自己的总结和分析。
准备
- Tensorflow 1.0.1
- captcha 0.2.2
思路
现在解决验证码一般就几种方式:打码平台、手动输入,或者自己训练模型识别。打码平台没其它缺点,就是费钱;手动输入费时间,而且自动化程度相当差;自己训练则需要时间,而且需要相当的算法功底。但是自从2012年 Deep learning 兴起之后,CNN(卷积神经网络)大行其道,端到端训练已经相当简化了,简化到只要准备好足够的训练数据集,输出的结果都不会太差。
训练
首先准备数据集,这里我们使用 captcha 这个类库,它可以方便地生成数据集。现实中,我们需要请标注(label)人员来标注这些数据集,数据集越多,训练效果越好。数据集的比例尽量准备成 4:1,即4份训练数据集、1份验证数据集,好比8万的训练集配2万的验证集。
配置文件
# -*- coding: utf-8 -*-
# Character classes a captcha may be drawn from.
NUMBER = '0123456789'
CHAR_SMALL = 'abcdefghijklmnopqrstuvwxyz'
CHAR_BIG = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
# Number of characters per captcha.
# NOTE(review): the previous comment here read "test the training accuracy
# with 1 character", which does not match the value 6 - confirm the intent.
MAX_CAPTCHA = 6
# Alphabet actually used for generation and encoding; upper-case letters are
# currently left out (append "+ CHAR_BIG" to include them).
VALIDATE_STRING = NUMBER + CHAR_SMALL # + CHAR_BIG
# Width of one one-hot slot (number of distinct characters).
CHAR_SET_LEN = len(VALIDATE_STRING)
# Size, in pixels, requested from the captcha generator.
IMAGE_HEIGHT = 60
IMAGE_WIDTH = 160
# Font size handed to the captcha generator.
FONT_SIZE = 35
# Accuracy threshold at which the training loop stops.
MAX_ACCURACY = 0.9
生成数据集
# -*- coding: utf-8 -*-
from captcha.image import ImageCaptcha
import numpy as np
import matplotlib.pyplot as plt
# from config import NUMBER, CHAR_SMALL, CHAR_BIG, MAX_CAPTCHA, CHAR_SET_LEN, FONT_SIZE
import config
from PIL import Image
import random
# Module-level cache: character -> index in config.VALIDATE_STRING.
# Filled on first use by prepare_char_dict().
char_dict = {}
# Module-level cache: index in config.VALIDATE_STRING -> character.
# Filled on first use by prepare_number_dict().
number_dict = {}
def __gen_random_captcha_text(char_set=config.VALIDATE_STRING, size=None):
    """Return a random string made of the first ``size`` shuffled characters.

    The characters of ``char_set`` are shuffled and a prefix of length
    ``size`` is joined, so every position of ``char_set`` is used at most
    once and the result is never longer than ``char_set``.  With
    ``size=None`` the whole shuffled set is returned.

    Raises:
        ValueError: if ``char_set`` is empty or is not a ``str``.
    """
    is_usable = bool(char_set) and isinstance(char_set, str)
    if not is_usable:
        raise ValueError('get the empty char_set')

    shuffled = list(char_set)
    random.shuffle(shuffled)
    return ''.join(shuffled[:size])
def gen_random_captcha_image():
    """Render one random captcha and return ``(text, pixels)``.

    ``text`` is the label string (up to ``config.MAX_CAPTCHA`` characters)
    and ``pixels`` is the rendered picture converted to a NumPy array.
    """
    generator = ImageCaptcha(
        width=config.IMAGE_WIDTH,
        height=config.IMAGE_HEIGHT,
        font_sizes=[config.FONT_SIZE],
    )
    label = __gen_random_captcha_text(size=config.MAX_CAPTCHA)
    # Whatever generate() returns is handed straight to PIL for decoding.
    rendered = Image.open(generator.generate(label))
    return label, np.array(rendered)
def gen_require_captcha_image():
    """Generate captchas until one has exactly the configured RGB shape.

    Returns:
        ``(text, image)`` where ``image.shape`` equals
        ``(config.IMAGE_HEIGHT, config.IMAGE_WIDTH, 3)``.
    """
    expected_shape = (config.IMAGE_HEIGHT, config.IMAGE_WIDTH, 3)
    while True:
        label, pixels = gen_random_captcha_image()
        if pixels.shape != expected_shape:
            # Wrong size or channel count: discard and draw another one.
            continue
        return label, pixels
def convert2gray(img):
    """Turn a colour image into a grey-scale one by averaging its channels.

    Colour carries no useful information for recognising the characters, so
    it is dropped before feature extraction.  Arrays that already have two
    or fewer dimensions are returned unchanged (same object).
    """
    if len(img.shape) <= 2:
        return img
    # A plain mean over the last axis is the fast conversion; the standard
    # luminance formula would be 0.2989 * R + 0.5870 * G + 0.1140 * B.
    return np.mean(img, axis=-1)
def prepare_char_dict():
    """Return the character -> index table, building it on first use.

    The table is stored in the module-level ``char_dict`` and maps every
    character of ``config.VALIDATE_STRING`` to its position in that string.
    """
    if not char_dict:
        char_dict.update(
            (char, position)
            for position, char in enumerate(config.VALIDATE_STRING)
        )
    return char_dict
def prepare_number_dict():
    """Return the index -> character table, building it on first use.

    The table is stored in the module-level ``number_dict`` and maps every
    position of ``config.VALIDATE_STRING`` to the character found there.
    """
    if not number_dict:
        # enumerate() already yields (index, character) pairs.
        number_dict.update(enumerate(config.VALIDATE_STRING))
    return number_dict
def text_to_array(text):
    """One-hot encode ``text`` into a flat ``int8`` vector.

    The vector has ``config.MAX_CAPTCHA * config.CHAR_SET_LEN`` entries.
    Character number ``i`` of ``text`` sets the entry at
    ``i * CHAR_SET_LEN + k``, where ``k`` is the character's index in the
    table returned by ``prepare_char_dict()``.

    Raises:
        KeyError: if ``text`` contains a character missing from that table.
    """
    lookup = prepare_char_dict()
    slot_width = config.CHAR_SET_LEN
    encoded = np.zeros(config.MAX_CAPTCHA * slot_width, dtype=np.int8)
    for position, char in enumerate(text):
        encoded[position * slot_width + lookup[char]] = 1
    return encoded
def array_to_text(arr):
    """Decode a flat one-hot vector back into its text.

    Every non-zero entry of ``arr`` is expected at flat position
    ``slot * CHAR_SET_LEN + k``; ``k`` is recovered with a modulo by the slot
    width and mapped to a character through ``prepare_number_dict()``.

    Args:
        arr: 1-D NumPy array laid out as produced by ``text_to_array``.

    Returns:
        The decoded string, one character per non-zero entry, in order.
    """
    num_dict_tmp = prepare_number_dict()
    char_pos = arr.nonzero()[0]
    # Reduce modulo the slot width only.  Scaling the modulus by the running
    # count of non-zero entries gives the same answer for a well-formed
    # vector but produces out-of-range keys as soon as a slot holds zero or
    # several set entries.
    return ''.join(num_dict_tmp[val % config.CHAR_SET_LEN] for val in char_pos)
def show_image_text():
    """Generate one random captcha and display it together with its label."""
    label, pixels = gen_random_captcha_image()
    figure = plt.figure()
    axes = figure.add_subplot(111)
    # Coordinates are axes-relative (transAxes): near the top-left corner.
    axes.text(0.1, 0.9, label, ha='center', va='center', transform=axes.transAxes)
    plt.imshow(pixels)
    plt.show()


if __name__ == '__main__':
    # Manual check: render a random captcha with its label.
    show_image_text()
正式训练模型
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
from gen_image import text_to_array
from config import MAX_CAPTCHA, CHAR_SET_LEN, IMAGE_HEIGHT, IMAGE_WIDTH, MAX_ACCURACY
from gen_image import gen_require_captcha_image
# Flattened grey-scale images: one row of IMAGE_HEIGHT * IMAGE_WIDTH values per sample.
x_input = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT * IMAGE_WIDTH])
# Flattened one-hot labels: MAX_CAPTCHA slots of CHAR_SET_LEN entries each.
y_input = tf.placeholder(tf.float32, [None, CHAR_SET_LEN * MAX_CAPTCHA])
# Dropout keep probability, fed per run (0.75 while training, 1.0 when evaluating).
keep_prob = tf.placeholder(tf.float32)
def convert2gray(img):
    """Average the colour channels of ``img`` to obtain a grey-scale image.

    Colour does not help when recognising captcha characters.  Input with
    two or fewer dimensions is handed back untouched.
    """
    has_channels = len(img.shape) > 2
    return np.mean(img, -1) if has_channels else img
def __weight_variable(shape, stddev=0.01):
    """Create a weight ``tf.Variable`` of ``shape`` initialised from a normal distribution."""
    return tf.Variable(tf.random_normal(shape, stddev=stddev))
def __bias_variable(shape, stddev=0.1):
    """Create a bias ``tf.Variable`` of ``shape`` initialised from a normal distribution."""
    return tf.Variable(tf.random_normal(shape=shape, stddev=stddev))
def __conv2d(x, w):
    """Convolve ``x`` with kernel ``w`` using SAME padding."""
    # strides is the step size of the sliding window; 1 in every dimension.
    unit_stride = [1, 1, 1, 1]
    return tf.nn.conv2d(x, w, strides=unit_stride, padding='SAME')
def __max_pool_2x2(x):
    """Apply max pooling with a 2x2 window and stride 2, SAME padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')
def gen_next_batch(batch_size=100):
    """Generate a fresh batch of captcha images and their one-hot labels.

    Args:
        batch_size: number of samples to generate (default 100).

    Returns:
        ``(batch_x, batch_y)``: ``batch_x`` has shape
        ``[batch_size, IMAGE_HEIGHT * IMAGE_WIDTH]`` and holds the flattened
        grey-scale pixels divided by 255; ``batch_y`` has shape
        ``[batch_size, MAX_CAPTCHA * CHAR_SET_LEN]`` and holds the labels
        encoded by ``text_to_array``.
    """
    batch_x = np.zeros([batch_size, IMAGE_HEIGHT * IMAGE_WIDTH])
    batch_y = np.zeros([batch_size, MAX_CAPTCHA * CHAR_SET_LEN])
    # range() exists on both Python 2 and 3; xrange() is Python 2 only.
    for i in range(batch_size):
        text, image = gen_require_captcha_image()
        # Grey-scale is enough: colour does not help extract character shapes.
        image = convert2gray(image)
        # Float divisor so the scaling is a true division even if the pixel
        # array has an integer dtype under Python 2.
        batch_x[i, :] = image.flatten() / 255.0
        batch_y[i, :] = text_to_array(text)
    return batch_x, batch_y
def create_layer(x_input, keep_prob):
    """Build the captcha CNN and return its output logits.

    Architecture: three stages of [5x5 convolution -> ReLU -> 2x2 max-pool]
    with 32, 64 and 64 filters, a 1024-unit fully connected layer with
    dropout, and a linear output layer.

    Args:
        x_input: float tensor of shape ``[batch, IMAGE_HEIGHT * IMAGE_WIDTH]``
            holding row-major flattened grey-scale images.
        keep_prob: dropout keep probability for the fully connected layer.

    Returns:
        Tensor of shape ``[batch, MAX_CAPTCHA * CHAR_SET_LEN]`` with the
        un-normalised logits.
    """
    # The batches are flattened from (height, width) arrays, so height must
    # come first when the 2-D layout is restored; [width, height] would
    # scramble the pixel rows.
    # NOTE: checkpoints trained with the old [width, height] layout still load
    # (variable shapes are unchanged) but the model must be retrained.
    x_image = tf.reshape(x_input, shape=[-1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])

    # Convolution stage 1: 32 kernels of 5x5 over the single grey channel.
    w_c1 = __weight_variable([5, 5, 1, 32], stddev=0.1)
    b_c1 = __bias_variable([32], stddev=0.1)
    h_c1 = tf.nn.relu(tf.nn.bias_add(__conv2d(x_image, w_c1), b_c1))
    h_pool1 = __max_pool_2x2(h_c1)

    # Convolution stage 2: 64 kernels of 5x5.
    w_c2 = __weight_variable([5, 5, 32, 64], stddev=0.1)
    b_c2 = __bias_variable([64], stddev=0.1)
    h_c2 = tf.nn.relu(tf.nn.bias_add(__conv2d(h_pool1, w_c2), b_c2))
    h_pool2 = __max_pool_2x2(h_c2)

    # Convolution stage 3: 64 kernels of 5x5.
    w_c3 = __weight_variable([5, 5, 64, 64], stddev=0.1)
    b_c3 = __bias_variable([64], stddev=0.1)
    h_c3 = tf.nn.relu(tf.nn.bias_add(__conv2d(h_pool2, w_c3), b_c3))
    h_pool3 = __max_pool_2x2(h_c3)

    # Size of one flattened feature map, read from the static shape instead
    # of being hard-coded.  Each SAME-padded 2x2 pool halves height and width
    # (rounding up), so a 60x160 input gives 8 * 20 * 64 values.
    flat_dim = 1
    for dim in h_pool3.get_shape().as_list()[1:]:
        flat_dim *= dim

    # Fully connected layer 1.
    w_fc1 = __weight_variable([flat_dim, 1024], stddev=0.1)
    b_fc1 = __bias_variable([1024])
    h_pool3_flat = tf.reshape(h_pool3, [-1, flat_dim])
    h_fc1 = tf.nn.relu(tf.add(tf.matmul(h_pool3_flat, w_fc1), b_fc1))
    # Dropout is applied to the fully connected layer only.
    h_fc1_dropout = tf.nn.dropout(h_fc1, keep_prob)

    # Fully connected layer 2 (output): one logit per character per slot.
    w_output = __weight_variable([1024, MAX_CAPTCHA * CHAR_SET_LEN], stddev=0.1)
    b_output = __bias_variable([MAX_CAPTCHA * CHAR_SET_LEN])
    y_output = tf.add(tf.matmul(h_fc1_dropout, w_output), b_output)
    return y_output
def create_loss(layer, y_input):
    """Return the mean sigmoid cross-entropy between logits ``layer`` and labels ``y_input``."""
    per_element = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_input, logits=layer)
    return tf.reduce_mean(per_element)
def create_accuracy(output, y_input):
    """Return the per-character accuracy of ``output`` against ``y_input``.

    Both tensors are viewed as ``[batch, MAX_CAPTCHA, CHAR_SET_LEN]``; a
    character slot counts as correct when the arg-max of the prediction
    equals the arg-max of the label.  The result is the mean over all slots
    of all samples, not the share of fully correct captchas.
    """
    slot_shape = [-1, MAX_CAPTCHA, CHAR_SET_LEN]
    predicted_idx = tf.argmax(tf.reshape(output, slot_shape), 2)
    label_idx = tf.argmax(tf.reshape(y_input, slot_shape), 2)
    hits = tf.cast(tf.equal(predicted_idx, label_idx), tf.float32)
    return tf.reduce_mean(hits)
def train():
    """Train the captcha model until it reaches ``MAX_ACCURACY``, then save it.

    Each step trains on a freshly generated batch of 64 captchas with a
    dropout keep probability of 0.75.  Every 50 steps the accuracy is
    measured on a new batch of 100 captchas; once it reaches
    ``MAX_ACCURACY`` a checkpoint named "break.model" is written (tagged
    with the step number) and training stops.
    """
    # Build the network, the loss and the accuracy metric.
    layer_output = create_layer(x_input, keep_prob)
    loss = create_loss(layer_output, y_input)
    accuracy = create_accuracy(layer_output, y_input)
    train_step = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    # Used to save the model once training is finished.
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        i = 0
        while True:
            i = i + 1
            batch_x, batch_y = gen_next_batch(64)
            _, _loss = sess.run([train_step, loss],
                                feed_dict={x_input: batch_x, y_input: batch_y, keep_prob: 0.75})
            print(i, _loss)
            # Accuracy is only evaluated every 50 steps.
            if i % 50 != 0:
                continue
            batch_x_test, batch_y_test = gen_next_batch(100)
            acc = sess.run(accuracy, feed_dict={x_input: batch_x_test, y_input: batch_y_test, keep_prob: 1.})
            print('step is %s' % i, 'and accy is %s' % acc)
            # ">=" rather than ">": hitting the threshold exactly must also
            # save the model, otherwise training could end with no
            # checkpoint on disk.
            if acc >= MAX_ACCURACY:
                print('current acc > %s ,stop now' % MAX_ACCURACY)
                saver.save(sess, "break.model", global_step=i)
                break


if __name__ == '__main__':
    train()
验证结果
# -*- coding: utf-8 -*-
from gen_model import create_layer
import tensorflow as tf
import config
import numpy as np
from gen_image import convert2gray, gen_random_captcha_image, array_to_text
def crack_captcha(captcha_image):
    """Predict the text of one captcha using the latest checkpoint in the current directory.

    Args:
        captcha_image: flattened grey-scale image with
            ``config.IMAGE_HEIGHT * config.IMAGE_WIDTH`` values.

    Returns:
        The predicted captcha string.
    """
    pixel_count = config.IMAGE_HEIGHT * config.IMAGE_WIDTH
    x_input = tf.placeholder(tf.float32, [None, pixel_count])
    keep_prob = tf.placeholder(tf.float32)  # dropout keep probability
    output = create_layer(x_input, keep_prob)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('.'))
        slots = tf.reshape(output, [-1, config.MAX_CAPTCHA, config.CHAR_SET_LEN])
        predict = tf.argmax(slots, 2)
        # A batch of one image; dropout is disabled for inference.
        batch_indices = sess.run(predict, feed_dict={x_input: [captcha_image], keep_prob: 1})
        char_indices = batch_indices[0].tolist()
        # Re-encode the predicted indices as a one-hot vector so that
        # array_to_text can decode them.
        one_hot = np.zeros(config.MAX_CAPTCHA * config.CHAR_SET_LEN)
        for slot, char_index in enumerate(char_indices):
            one_hot[slot * config.CHAR_SET_LEN + char_index] = 1
        return array_to_text(one_hot)
def validate_image():
    """Generate one random captcha, run the model on it and print label vs. prediction."""
    label, pixels = gen_random_captcha_image()
    gray = convert2gray(pixels)
    # Divide the pixel values by 255 before feeding them to the model; the
    # input scale strongly affects how the loss updates and has to suit the
    # learning rate.
    # refer https://www.youtube.com/watch?v=pU5TG_X7b6E&index=6&list=PLwY2GJhAPWRcZxxVFpNhhfivuW0kX15yG 21:00
    normalized = gray.flatten() / 255
    predict_text = crack_captcha(normalized)
    print("label is : {} <----> predict is : {}".format(label, predict_text))


if __name__ == '__main__':
    validate_image()
结果
训练时间的长短与字符长度和训练字符种类完全相关:4位纯数字,大概4000个训练批次以内就可以达到90%以上的准确率。当然,GPU跑得更快一点。
源码所在
- 原文作者:大鱼
- 原文链接:https://brucedone.com/archives/1005/
- 版权声明:本作品采用知识共享署名-非商业性使用-禁止演绎 4.0 国际许可协议进行许可,非商业转载请注明出处(作者,原文链接),商业转载请联系作者获得授权。