缘起
- 登录
- IP
- 验证码
在爬虫的世界里,大家一定会经常遇到以上这些问题。除了依赖第三方打码平台、IP提供商、copy cookie 这类方案之外,大家也可以尝试自己解决,多去思考背后的原理。本文就对爬虫中遇到的字符型验证码做一些总结和分析
准备
- Tensorflow 1.0.1
- captcha 0.2.2
思路
现在解决验证码一般就几种方式:打码平台、手动输入,或者自己训练模型识别。打码平台没其它缺点,就是费钱;手动输入费时间,而且自动化程度相当差;自己训练需要时间,而且需要相当的算法功底。但是自从2012年 Deep Learning 兴起之后,CNN(卷积神经网络)大行其道,端到端训练已经相当简化了,简化到只要准备好足够的训练数据集,输出的结果都不会太差
训练
首先准备数据集。这里我们使用 captcha 这个类库,它可以方便地生成数据集;现实中,我们往往要请标注(label)人员来标注这些数据集。数据集越多,训练效果越好。训练集与验证集的比例尽量准备成 4:1,即4份训练数据、1份验证数据,好比8万的训练集配2万的验证集
配置文件
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
# -*- coding: utf-8 -*-
"""Shared settings for captcha generation, model training and validation."""

# Building blocks of the captcha alphabet.
NUMBER = '0123456789'
CHAR_SMALL = 'abcdefghijklmnopqrstuvwxyz'
CHAR_BIG = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

# Number of characters in every generated captcha; the label vector has
# MAX_CAPTCHA * CHAR_SET_LEN entries. Lower it (e.g. to 1) for a quick
# experiment on how fast the model converges.
MAX_CAPTCHA = 6

# Alphabet actually used. Upper-case letters are left out for now; append
# CHAR_BIG to enlarge the character set.
VALIDATE_STRING = NUMBER + CHAR_SMALL  # + CHAR_BIG
CHAR_SET_LEN = len(VALIDATE_STRING)

# Size of the generated captcha image, in pixels.
IMAGE_HEIGHT = 60
IMAGE_WIDTH = 160

# Font size handed to the captcha generator.
FONT_SIZE = 35

# Accuracy threshold at which training stops and the model is saved.
MAX_ACCURACY = 0.9
生成数据集
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
# -*- coding: utf-8 -*-
"""Generate random captcha images and convert labels to/from one-hot vectors."""
from captcha.image import ImageCaptcha
import numpy as np
import matplotlib.pyplot as plt
import config
from PIL import Image
import random

# Lazily filled lookup tables shared by the encode/decode helpers below.
char_dict = {}    # character -> index in config.VALIDATE_STRING
number_dict = {}  # index in config.VALIDATE_STRING -> character


def __gen_random_captcha_text(char_set=config.VALIDATE_STRING, size=None):
    """Return ``size`` random characters drawn from ``char_set``.

    The alphabet is shuffled and then truncated, so a character never
    appears twice in one result; ``size=None`` returns the whole shuffled
    alphabet.
    NOTE(review): real-world captchas can repeat characters, which this
    generator never produces -- confirm that is acceptable for training.

    Raises:
        ValueError: if ``char_set`` is empty or is not a ``str``.
    """
    if not char_set or not isinstance(char_set, str):
        raise ValueError('get the empty char_set')
    shuffled = list(char_set)
    random.shuffle(shuffled)
    return ''.join(shuffled[0:size])


def gen_random_captcha_image():
    """Render one random captcha.

    Returns:
        A ``(text, image)`` tuple: the captcha text and the rendered image
        as a numpy array.
    """
    generator = ImageCaptcha(width=config.IMAGE_WIDTH,
                             height=config.IMAGE_HEIGHT,
                             font_sizes=[config.FONT_SIZE])
    text = __gen_random_captcha_text(size=config.MAX_CAPTCHA)
    captcha_image = Image.open(generator.generate(text))
    return text, np.array(captcha_image)


def gen_require_captcha_image():
    """Like ``gen_random_captcha_image`` but retry until the shape matches.

    Only images of shape ``(IMAGE_HEIGHT, IMAGE_WIDTH, 3)`` are returned, so
    callers can flatten them into fixed-size feature vectors.
    """
    expected_shape = (config.IMAGE_HEIGHT, config.IMAGE_WIDTH, 3)
    while True:
        text, image = gen_random_captcha_image()
        if image.shape == expected_shape:
            return text, image


def convert2gray(img):
    """Collapse a colour image to grayscale; 2-D input is returned as is.

    Colour carries no useful information for recognising the characters.
    A plain channel mean is used because it is fast; the conventional
    weighting would be ``0.2989 * r + 0.5870 * g + 0.1140 * b``.
    """
    if len(img.shape) > 2:
        return np.mean(img, -1)
    return img


def prepare_char_dict():
    """Return the character -> index table, building it on first use."""
    if not char_dict:
        char_dict.update(
            {char: index for index, char in enumerate(config.VALIDATE_STRING)})
    return char_dict


def prepare_number_dict():
    """Return the index -> character table, building it on first use."""
    if not number_dict:
        number_dict.update(dict(enumerate(config.VALIDATE_STRING)))
    return number_dict


def text_to_array(text):
    """Encode ``text`` as a flat one-hot vector.

    The vector has ``MAX_CAPTCHA * CHAR_SET_LEN`` int8 entries; the character
    at position ``i`` sets the bit at ``i * CHAR_SET_LEN + index_of_char``.

    Raises:
        KeyError: if ``text`` contains a character outside the alphabet.
        IndexError: if ``text`` is longer than ``MAX_CAPTCHA``.
    """
    lookup = prepare_char_dict()
    encoded = np.zeros(config.MAX_CAPTCHA * config.CHAR_SET_LEN, dtype=np.int8)
    for position, char in enumerate(text):
        encoded[position * config.CHAR_SET_LEN + lookup[char]] = 1
    return encoded


def array_to_text(arr):
    """Decode a flat one-hot vector produced by ``text_to_array``.

    Each set bit is mapped back to a character through its offset inside
    its own ``CHAR_SET_LEN``-wide slot. Taking the modulus by the slot width
    (instead of by a multiple that depends on how many bits came before)
    keeps the decoding correct even when some positions are empty.
    """
    lookup = prepare_number_dict()
    set_bits = arr.nonzero()[0]
    return ''.join(lookup[bit % config.CHAR_SET_LEN] for bit in set_bits)


def show_image_text():
    """Display one random captcha together with its text label."""
    text, image = gen_random_captcha_image()
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.text(0.1, 0.9, text, ha='center', va='center',
              transform=axes.transAxes)
    plt.imshow(image)
    plt.show()


if __name__ == '__main__':
    # Preview a sample only when run as a script. Other scripts import this
    # module, and an unguarded call would open a blocking plot window on
    # every import.
    show_image_text()
正式训练模型
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
# -*- coding: utf-8 -*-
"""Build and train a small CNN that reads fixed-length text captchas."""
import tensorflow as tf
import numpy as np
from gen_image import text_to_array
from config import MAX_CAPTCHA, CHAR_SET_LEN, IMAGE_HEIGHT, IMAGE_WIDTH, MAX_ACCURACY
from gen_image import gen_require_captcha_image

# Graph inputs: flattened grayscale images, flat one-hot labels and the
# dropout keep probability.
x_input = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT * IMAGE_WIDTH])
y_input = tf.placeholder(tf.float32, [None, CHAR_SET_LEN * MAX_CAPTCHA])
keep_prob = tf.placeholder(tf.float32)


def convert2gray(img):
    """Collapse a colour image to grayscale; 2-D input is returned as is.

    Colour carries no useful information for recognising the characters.
    """
    if len(img.shape) > 2:
        return np.mean(img, -1)
    return img


def __weight_variable(shape, stddev=0.01):
    """Create a weight variable initialised from a normal distribution."""
    initial = tf.random_normal(shape, stddev=stddev)
    return tf.Variable(initial)


def __bias_variable(shape, stddev=0.1):
    """Create a bias variable initialised from a normal distribution."""
    initial = tf.random_normal(shape=shape, stddev=stddev)
    return tf.Variable(initial)


def __conv2d(x, w):
    """2-D convolution with stride 1 and SAME padding."""
    # `strides` is the step of the sliding window along each dimension.
    return tf.nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')


def __max_pool_2x2(x):
    """2x2 max pooling with stride 2 (halves height and width, rounding up)."""
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')


def __conv_relu_pool(x, in_channels, out_channels):
    """One 5x5 convolution + bias + ReLU stage followed by 2x2 max pooling."""
    # Weight first, then bias: the creation order fixes the automatically
    # assigned variable names that checkpoints are keyed on.
    w = __weight_variable([5, 5, in_channels, out_channels], stddev=0.1)
    b = __bias_variable([out_channels], stddev=0.1)
    h = tf.nn.relu(tf.nn.bias_add(__conv2d(x, w), b))
    return __max_pool_2x2(h)


def gen_next_batch(batch_size=100):
    """Generate a fresh batch of training samples.

    Returns:
        ``(batch_x, batch_y)``: ``batch_x`` holds flattened grayscale images
        scaled to ``[0, 1]`` and ``batch_y`` the matching one-hot labels.
    """
    batch_x = np.zeros([batch_size, IMAGE_HEIGHT * IMAGE_WIDTH])
    batch_y = np.zeros([batch_size, MAX_CAPTCHA * CHAR_SET_LEN])
    # `range` instead of `xrange` so the script also runs on Python 3.
    for i in range(batch_size):
        text, image = gen_require_captcha_image()
        # Drop the colour channels; they do not help to recognise shapes.
        image = convert2gray(image)
        # Float divisor: never fall into integer division on Python 2.
        batch_x[i, :] = image.flatten() / 255.0
        batch_y[i, :] = text_to_array(text)
    return batch_x, batch_y


def create_layer(x_input, keep_prob):
    """Build the network and return the output logits.

    Three conv/pool stages are followed by a 1024-unit fully connected layer
    with dropout and a linear output layer with
    ``MAX_CAPTCHA * CHAR_SET_LEN`` logits.
    """
    # Samples are flattened row by row from (height, width) arrays, so the
    # image must be restored as [batch, height, width, channels]. Swapping
    # height and width here would scramble the spatial layout seen by the
    # convolutions.
    x_image = tf.reshape(x_input, shape=[-1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])

    h_pool1 = __conv_relu_pool(x_image, 1, 32)
    h_pool2 = __conv_relu_pool(h_pool1, 32, 64)
    h_pool3 = __conv_relu_pool(h_pool2, 64, 64)

    # Fully connected layer 1. Its input size is taken from the pooled
    # tensor's static shape (8 * 20 * 64 for 60x160 images) rather than
    # hard-coded, so it follows the image size set in config.
    flat_dim = int(np.prod(h_pool3.get_shape().as_list()[1:]))
    w_fc1 = __weight_variable([flat_dim, 1024], stddev=0.1)
    b_fc1 = __bias_variable([1024])
    h_pool3_flat = tf.reshape(h_pool3, [-1, flat_dim])
    h_fc1 = tf.nn.relu(tf.add(tf.matmul(h_pool3_flat, w_fc1), b_fc1))
    h_fc1_dropout = tf.nn.dropout(h_fc1, keep_prob)

    # Fully connected layer 2: raw logits, one per (position, character).
    w_output = __weight_variable([1024, MAX_CAPTCHA * CHAR_SET_LEN], stddev=0.1)
    b_output = __bias_variable([MAX_CAPTCHA * CHAR_SET_LEN])
    y_output = tf.add(tf.matmul(h_fc1_dropout, w_output), b_output)
    return y_output


def create_loss(layer, y_input):
    """Mean sigmoid cross-entropy between the logits and the one-hot labels."""
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y_input, logits=layer))
    return loss


def create_accuracy(output, y_input):
    """Per-character accuracy.

    This is the fraction of character positions predicted correctly, not
    the fraction of captchas that are entirely correct.
    """
    predict = tf.reshape(output, [-1, MAX_CAPTCHA, CHAR_SET_LEN])
    max_idx_p = tf.argmax(predict, 2)
    max_idx_l = tf.argmax(tf.reshape(y_input, [-1, MAX_CAPTCHA, CHAR_SET_LEN]), 2)
    correct_pred = tf.equal(max_idx_p, max_idx_l)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    return accuracy


def train():
    """Train until the accuracy reaches ``MAX_ACCURACY``, then save the model."""
    layer_output = create_layer(x_input, keep_prob)
    loss = create_loss(layer_output, y_input)
    accuracy = create_accuracy(layer_output, y_input)
    train_step = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        i = 0
        while True:
            i = i + 1
            batch_x, batch_y = gen_next_batch(64)
            _, _loss = sess.run([train_step, loss],
                                feed_dict={x_input: batch_x, y_input: batch_y, keep_prob: 0.75})
            print(i, _loss)
            # Measure the accuracy on a fresh batch every 50 steps.
            if i % 50 == 0:
                batch_x_test, batch_y_test = gen_next_batch(100)
                acc = sess.run(accuracy,
                               feed_dict={x_input: batch_x_test, y_input: batch_y_test, keep_prob: 1.})
                print('step is %s' % i, 'and accy is %s' % acc)
                # Stop and save as soon as the threshold is reached. `>=`
                # makes sure a run that lands exactly on the threshold is
                # saved too instead of ending without a checkpoint.
                if acc >= MAX_ACCURACY:
                    print('current acc > %s ,stop now' % MAX_ACCURACY)
                    saver.save(sess, "break.model", global_step=i)
                    break


if __name__ == '__main__':
    train()
验证结果
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
# -*- coding: utf-8 -*-
"""Restore the trained model and check it against a freshly generated captcha."""
from gen_model import create_layer
import tensorflow as tf
import config
import numpy as np
from gen_image import convert2gray, gen_random_captcha_image, array_to_text
from gen_image import gen_require_captcha_image


def crack_captcha(captcha_image):
    """Predict the text of one captcha.

    Args:
        captcha_image: flattened grayscale image with
            ``IMAGE_HEIGHT * IMAGE_WIDTH`` values scaled to ``[0, 1]``.

    Returns:
        The predicted captcha text.

    Raises:
        IOError: if the current directory contains no checkpoint.
    """
    checkpoint = tf.train.latest_checkpoint('.')
    if checkpoint is None:
        # Fail with a clear message instead of an obscure restore error.
        raise IOError('no checkpoint found in the current directory, '
                      'train the model first')

    # Build the network in its own graph so that repeated calls always
    # create the variables under the same names as in the checkpoint.
    with tf.Graph().as_default():
        x_input = tf.placeholder(tf.float32, [None, config.IMAGE_HEIGHT * config.IMAGE_WIDTH])
        keep_prob = tf.placeholder(tf.float32)  # dropout keep probability
        output = create_layer(x_input, keep_prob)
        predict = tf.argmax(
            tf.reshape(output, [-1, config.MAX_CAPTCHA, config.CHAR_SET_LEN]), 2)
        saver = tf.train.Saver()
        with tf.Session() as sess:
            saver.restore(sess, checkpoint)
            # Dropout is disabled (keep_prob = 1) for inference.
            text_list = sess.run(predict,
                                 feed_dict={x_input: [captcha_image], keep_prob: 1.0})

    # Turn the per-position character indices back into a one-hot vector
    # and let the shared decoder convert it to text.
    char_indices = text_list[0].tolist()
    vector = np.zeros(config.MAX_CAPTCHA * config.CHAR_SET_LEN)
    for position, char_index in enumerate(char_indices):
        vector[position * config.CHAR_SET_LEN + char_index] = 1
    return array_to_text(vector)


def validate_image():
    """Generate one captcha, run the model on it and print label vs. prediction."""
    # Use the shape-checked generator: the model input has a fixed size, and
    # an image of any other shape could not be fed to the placeholder.
    text, image = gen_require_captcha_image()
    image = convert2gray(image)
    # Map the pixel values to 0 -> 1. The scale of the inputs really affects
    # how the loss updates, so it has to suit the learning rate.
    # refer https://www.youtube.com/watch?v=pU5TG_X7b6E&index=6&list=PLwY2GJhAPWRcZxxVFpNhhfivuW0kX15yG 21:00
    image = image.flatten() / 255.0
    predict_text = crack_captcha(image)
    print("label is : {} <----> predict is : {}".format(text, predict_text))


if __name__ == '__main__':
    validate_image()
结果
训练时间的长短与字符长度和训练字符的种类密切相关。4位纯数字大概在4000个训练批次以内就可以达到90%以上的准确率;当然,用GPU跑会更快一点