缘起

  • 登录
  • IP
  • 验证码

在爬虫的世界里面,大家一定会经常遇到以上的这些问题。除了甩锅给第三方打码平台、IP提供商,或者采用copy cookie这样的方案之外,大家也可以尝试自己解决,多去思考一些原理。本次就将爬虫中遇到的字符型验证码做一些自己的总结和分析。

准备

  • Tensorflow 1.0.1
  • captcha 0.2.2

思路

现在解决验证码一般就几种方式:打码平台、手动输入,或者自己训练模型识别。打码平台没其它缺点,就是费钱;手动输入费时间,而且自动化程度相当差;自己训练则需要时间,而且需要相当的算法功底。但是自从2012年Deep Learning兴起之后,CNN(卷积神经网络)大行其道,端到端训练已经相当简化了,简化到只要准备好足够的训练数据集,输出的结果都不会太差。

训练

首先准备数据集,这里我们使用captcha这个类库,它可以方便地生成数据集。现实中,我们往往要请标注(label)人员来标注这些数据集,数据集越多,训练效果越好。训练集与验证集的比例尽量准备成4:1,即4份训练数据集,1份验证数据集,好比8万的训练集配2万的验证集。

配置文件

    # -*- coding: utf-8 -*-
    # Character classes a captcha can be built from.
    NUMBER = '0123456789'
    CHAR_SMALL = 'abcdefghijklmnopqrstuvwxyz'
    CHAR_BIG = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    
    # Number of characters in every generated captcha (also the number of
    # one-hot slots in a label vector).
    MAX_CAPTCHA = 6
    # Alphabet actually used; upper-case letters are currently left out.
    VALIDATE_STRING = NUMBER + CHAR_SMALL  # + CHAR_BIG
    # Size of one one-hot slot in a label vector.
    CHAR_SET_LEN = len(VALIDATE_STRING)
    
    # Captcha image geometry (pixels) and font size passed to ImageCaptcha.
    IMAGE_HEIGHT = 60
    IMAGE_WIDTH = 160
    FONT_SIZE = 35
    
    # Validation accuracy at which training stops and the model is saved.
    MAX_ACCURACY = 0.9 

生成数据集

    # -*- coding: utf-8 -*-
    
    from captcha.image import ImageCaptcha
    import numpy as np
    import matplotlib.pyplot as plt
    # from config import NUMBER, CHAR_SMALL, CHAR_BIG, MAX_CAPTCHA, CHAR_SET_LEN, FONT_SIZE
    import config
    from PIL import Image
    import random
    
    # Module-level lookup tables, filled on first use by prepare_char_dict()
    # (character -> index) and prepare_number_dict() (index -> character).
    char_dict = {}
    number_dict = {}
    
    
    def __gen_random_captcha_text(char_set=config.VALIDATE_STRING, size=None):
        """Return a random string built from the characters of ``char_set``.
    
        :param char_set: non-empty ``str`` holding the allowed characters.
        :param size: number of characters to produce.  ``None`` keeps the
            historical behaviour of returning a random permutation of the
            whole ``char_set``.
        :raises ValueError: if ``char_set`` is empty or not a ``str``.
        """
        # char_set must be a non-empty str
        if not char_set or not isinstance(char_set, str):
            raise ValueError('get the empty char_set')
    
        if size is None:
            shuffled = list(char_set)
            random.shuffle(shuffled)
            return ''.join(shuffled)
    
        # Draw each position independently.  Shuffling and slicing could never
        # repeat a character and silently returned fewer than ``size``
        # characters whenever ``size`` exceeded ``len(char_set)``.
        return ''.join(random.choice(char_set) for _ in range(size))
    
    
    def gen_random_captcha_image():
        """Render one random captcha.
    
        :return: ``(text, pixels)`` where ``text`` is the captcha string and
            ``pixels`` is the rendered image converted to a numpy array.
        """
        renderer = ImageCaptcha(
            width=config.IMAGE_WIDTH,
            height=config.IMAGE_HEIGHT,
            font_sizes=[config.FONT_SIZE],
        )
        text = __gen_random_captcha_text(size=config.MAX_CAPTCHA)
    
        # generate() yields a file-like object that PIL can open directly.
        pixels = np.array(Image.open(renderer.generate(text)))
        return text, pixels
    
    
    def gen_require_captcha_image():
        """Generate captchas until one has the configured shape.
    
        :return: ``(text, image)`` with ``image.shape`` equal to
            ``(IMAGE_HEIGHT, IMAGE_WIDTH, 3)``.
        """
        wanted_shape = (config.IMAGE_HEIGHT, config.IMAGE_WIDTH, 3)
        while True:
            text, image = gen_random_captcha_image()
            if image.shape != wanted_shape:
                # Wrong size or channel count: discard and try again.
                continue
            return text, image
    
    
    def convert2gray(img):
        """Collapse a colour image to grayscale.
    
        Colour carries no useful information for recognising the characters,
        so the channels are simply averaged.
    
        :param img: array-like with a ``shape``; inputs with at most two
            dimensions are returned unchanged.
        :return: the mean over the last axis for inputs with more than two
            dimensions, otherwise ``img`` itself.
        """
        if len(img.shape) <= 2:
            return img
    
        # Plain averaging is the fast approximation; the luminance-weighted
        # form would be 0.2989 * R + 0.5870 * G + 0.1140 * B.
        return np.mean(img, -1)
    
    
    def prepare_char_dict():
        """Return the character -> index table, building it on first use.
    
        The module-level ``char_dict`` is filled in place from
        ``config.VALIDATE_STRING`` and the same object is returned every time.
        """
        if not char_dict:
            char_dict.update(
                (ch, pos) for pos, ch in enumerate(config.VALIDATE_STRING)
            )
        return char_dict
    
    
    def prepare_number_dict():
        """Return the index -> character table, building it on first use.
    
        The module-level ``number_dict`` is filled in place from
        ``config.VALIDATE_STRING`` and the same object is returned every time.
        """
        if not number_dict:
            # enumerate() already yields (index, character) pairs.
            number_dict.update(enumerate(config.VALIDATE_STRING))
        return number_dict
    
    
    def text_to_array(text):
        """Encode ``text`` as a flat one-hot vector.
    
        The vector has ``MAX_CAPTCHA`` consecutive slots of ``CHAR_SET_LEN``
        entries; in slot ``i`` the entry of the i-th character is set to 1.
    
        :param text: captcha string made of characters of VALIDATE_STRING.
        :return: ``np.int8`` array of length ``MAX_CAPTCHA * CHAR_SET_LEN``.
        """
        lookup = prepare_char_dict()
        slot_len = config.CHAR_SET_LEN
    
        one_hot = np.zeros(config.MAX_CAPTCHA * slot_len, dtype=np.int8)
        for slot, ch in enumerate(text):
            one_hot[slot * slot_len + lookup[ch]] = 1
    
        return one_hot
    
    
    def array_to_text(arr):
        """Decode a flat one-hot vector back into its captcha text.
    
        :param arr: numpy array laid out as consecutive slots of
            ``CHAR_SET_LEN`` entries (the format produced by text_to_array).
        :return: the characters of all non-zero entries, in position order.
        """
        num_dict_tmp = prepare_number_dict()
    
        # The offset inside a slot is the flat position modulo the slot length.
        # The earlier modulus (enumerate index * CHAR_SET_LEN) only worked when
        # every slot held exactly one non-zero entry and raised KeyError once
        # a slot was empty.
        return ''.join(
            num_dict_tmp[pos % config.CHAR_SET_LEN] for pos in arr.nonzero()[0]
        )
    
    def show_image_text():
        """Display one freshly generated captcha with its text as an overlay."""
        text, image = gen_random_captcha_image()
    
        figure = plt.figure()
        axes = figure.add_subplot(111)
        # Position is given in axes coordinates (transform=axes.transAxes).
        axes.text(0.1, 0.9, text, ha='center', va='center', transform=axes.transAxes)
    
        plt.imshow(image)
        plt.show()
    
    # Script entry point: preview one generated captcha.
    if __name__ == '__main__':
        # Manual encode/decode round trip, kept for reference (Python 2 print):
        # arr = text_to_array('0142')
        # print array_to_text(arr)
        show_image_text()

正式训练模型

    # -*- coding: utf-8 -*-
    import tensorflow as tf
    import numpy as np
    from gen_image import text_to_array
    from config import MAX_CAPTCHA, CHAR_SET_LEN, IMAGE_HEIGHT, IMAGE_WIDTH, MAX_ACCURACY
    from gen_image import gen_require_captcha_image
    
    # Flattened images, one row of IMAGE_HEIGHT * IMAGE_WIDTH values per sample.
    x_input = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT * IMAGE_WIDTH])
    # Labels, one row of MAX_CAPTCHA * CHAR_SET_LEN values per sample.
    y_input = tf.placeholder(tf.float32, [None, CHAR_SET_LEN * MAX_CAPTCHA])
    # Dropout keep probability, fed at run time (0.75 for training, 1 for evaluation).
    keep_prob = tf.placeholder(tf.float32)
    
    
    def convert2gray(img):
        """Average the channels of a colour image to obtain a grayscale one.
    
        Colour does not help with recognising the characters.  Inputs that
        have at most two dimensions are handed back untouched.
        """
        has_channel_axis = len(img.shape) > 2
        return np.mean(img, -1) if has_channel_axis else img
    
    
    def __weight_variable(shape, stddev=0.01):
        """Create a ``tf.Variable`` of ``shape`` initialised with normally
        distributed random values of the given standard deviation."""
        return tf.Variable(tf.random_normal(shape, stddev=stddev))
    
    
    def __bias_variable(shape, stddev=0.1):
        """Create a bias ``tf.Variable`` of ``shape`` initialised with normally
        distributed random values of the given standard deviation."""
        start_values = tf.random_normal(shape=shape, stddev=stddev)
        bias = tf.Variable(start_values)
        return bias
    
    
    def __conv2d(x, w):
        """Convolve ``x`` with the kernel ``w`` using 'SAME' padding."""
        # strides is the step taken along each input dimension; 1 everywhere.
        unit_strides = [1, 1, 1, 1]
        return tf.nn.conv2d(x, w, strides=unit_strides, padding='SAME')
    
    
    def __max_pool_2x2(x):
        """Max-pool ``x`` with a 2x2 window and stride 2, using 'SAME' padding."""
        window = [1, 2, 2, 1]
        return tf.nn.max_pool(x, ksize=window, strides=[1, 2, 2, 1], padding='SAME')
    
    
    def gen_next_batch(batch_size=100):
        """Generate one batch of freshly rendered captchas.
    
        :param batch_size: number of samples in the batch (100 by default).
        :return: ``(batch_x, batch_y)``; ``batch_x`` holds one flattened
            grayscale image per row, divided by 255, and ``batch_y`` the
            matching label vector from text_to_array.
        """
        batch_x = np.zeros([batch_size, IMAGE_HEIGHT * IMAGE_WIDTH])
        batch_y = np.zeros([batch_size, MAX_CAPTCHA * CHAR_SET_LEN])
    
        # range() works on Python 2 and 3; xrange() is a NameError on Python 3.
        for i in range(batch_size):
            text, image = gen_require_captcha_image()
    
            # Grayscale only: colour does not help with the character shapes.
            image = convert2gray(image)
    
            # Float divisor so the scaling never becomes integer division.
            batch_x[i, :] = image.flatten() / 255.0
            batch_y[i, :] = text_to_array(text)
    
        return batch_x, batch_y
    
    
    def create_layer(x_input, keep_prob):
        """Build the network: three conv/pool stages and two dense layers.
    
        :param x_input: float tensor of flattened images, one row of
            ``IMAGE_HEIGHT * IMAGE_WIDTH`` values per sample.
        :param keep_prob: dropout keep probability for the dense layer.
        :return: raw (un-activated) output tensor with
            ``MAX_CAPTCHA * CHAR_SET_LEN`` values per sample.
    
        NOTE: the input is now reshaped as [batch, height, width, 1].  Variable
        shapes are unchanged, but a checkpoint trained with the previous
        [batch, width, height, 1] layout should be retrained.
        """
        # The images are flattened row by row (height first, then width), so the
        # 2-D layout must be restored in that order.  Reshaping to
        # [width, height] re-chunks the pixels and scrambles the image.
        x_image = tf.reshape(x_input, shape=[-1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])
    
        # Convolution stage 1: 32 kernels of 5x5 over the single gray channel.
        w_c1 = __weight_variable([5, 5, 1, 32], stddev=0.1)
        b_c1 = __bias_variable([32], stddev=0.1)
        h_c1 = tf.nn.relu(tf.nn.bias_add(__conv2d(x_image, w_c1), b_c1))
        h_pool1 = __max_pool_2x2(h_c1)
    
        # Convolution stage 2: 64 kernels of 5x5.
        w_c2 = __weight_variable([5, 5, 32, 64], stddev=0.1)
        b_c2 = __bias_variable([64], stddev=0.1)
        h_c2 = tf.nn.relu(tf.nn.bias_add(__conv2d(h_pool1, w_c2), b_c2))
        h_pool2 = __max_pool_2x2(h_c2)
    
        # Convolution stage 3: 64 kernels of 5x5.
        w_c3 = __weight_variable([5, 5, 64, 64], stddev=0.1)
        b_c3 = __bias_variable([64], stddev=0.1)
        h_c3 = tf.nn.relu(tf.nn.bias_add(__conv2d(h_pool2, w_c3), b_c3))
        h_pool3 = __max_pool_2x2(h_c3)
    
        # Each 2x2 'SAME' pooling halves both spatial sizes, rounding up:
        # 60 -> 30 -> 15 -> 8 and 160 -> 80 -> 40 -> 20 for the default config.
        # Read the flattened size from the tensor instead of hard-coding
        # 20 * 8 * 64, so other image sizes keep working.
        _, pooled_h, pooled_w, pooled_c = h_pool3.get_shape().as_list()
        flat_size = pooled_h * pooled_w * pooled_c
    
        # Fully connected layer 1, followed by dropout.
        w_fc1 = __weight_variable([flat_size, 1024], stddev=0.1)
        b_fc1 = __bias_variable([1024])
        h_pool3_flat = tf.reshape(h_pool3, [-1, flat_size])
        h_fc1 = tf.nn.relu(tf.add(tf.matmul(h_pool3_flat, w_fc1), b_fc1))
        h_fc1_dropout = tf.nn.dropout(h_fc1, keep_prob)
    
        # Fully connected layer 2: one value per (position, character) pair.
        w_output = __weight_variable([1024, MAX_CAPTCHA * CHAR_SET_LEN], stddev=0.1)
        b_output = __bias_variable([MAX_CAPTCHA * CHAR_SET_LEN])
        y_output = tf.add(tf.matmul(h_fc1_dropout, w_output), b_output)
    
        return y_output
    
    
    def create_loss(layer, y_input):
        """Return the mean sigmoid cross-entropy between the network output
        ``layer`` (used as logits) and the labels ``y_input``."""
        per_element = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_input, logits=layer)
        return tf.reduce_mean(per_element)
    
    
    def create_accuracy(output, y_input):
        """Return the fraction of character positions predicted correctly.
    
        Both tensors are viewed as ``[batch, MAX_CAPTCHA, CHAR_SET_LEN]`` and
        compared by the arg-max of every position.
        """
        per_char_shape = [-1, MAX_CAPTCHA, CHAR_SET_LEN]
    
        predicted_idx = tf.argmax(tf.reshape(output, per_char_shape), 2)
        label_idx = tf.argmax(tf.reshape(y_input, per_char_shape), 2)
    
        hits = tf.cast(tf.equal(predicted_idx, label_idx), tf.float32)
        return tf.reduce_mean(hits)
    
    
    def train():
        """Train the network until the validation accuracy reaches MAX_ACCURACY.
    
        Batches of 64 generated captchas are used for each training step.
        Every 50 steps the accuracy is measured on a fresh batch of 100; once
        it reaches ``MAX_ACCURACY`` the model is saved with the checkpoint
        prefix ``break.model`` and training stops.
        """
        # Build the network, the loss and the accuracy metric.
        layer_output = create_layer(x_input, keep_prob)
        loss = create_loss(layer_output, y_input)
        accuracy = create_accuracy(layer_output, y_input)
    
        train_step = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
        # Used to save the model once training has finished.
        saver = tf.train.Saver()
    
        with tf.Session() as sess:
    
            tf.global_variables_initializer().run()
            i = 0
    
            while True:
                i = i + 1
                batch_x, batch_y = gen_next_batch(64)
                _, _loss = sess.run([train_step, loss],
                                    feed_dict={x_input: batch_x, y_input: batch_y, keep_prob: 0.75})
    
                print(i, _loss)
    
                # Measure the accuracy only every 50 steps.
                if i % 50 != 0:
                    continue
    
                batch_x_test, batch_y_test = gen_next_batch(100)
                acc = sess.run(accuracy, feed_dict={x_input: batch_x_test, y_input: batch_y_test, keep_prob: 1.})
                print('step is %s' % i, 'and accy is %s' % acc)
    
                # Use >= so that an accuracy exactly equal to the target also
                # saves the model; with the old '<' loop test and '>' save test
                # that case ended training without writing a checkpoint.
                if acc >= MAX_ACCURACY:
                    print('current acc > %s  ,stop now' % MAX_ACCURACY)
                    saver.save(sess, "break.model", global_step=i)
                    break
    
    
    # Script entry point: run the training loop.
    if __name__ == '__main__':
        train() 

验证结果

    # -*- coding: utf-8 -*-
    from gen_model import create_layer
    import tensorflow as tf
    import config
    import numpy as np
    from gen_image import convert2gray, gen_random_captcha_image, array_to_text
    
    
    def crack_captcha(captcha_image):
        """Predict the text of one captcha using the latest saved checkpoint.
    
        :param captcha_image: flattened image with
            ``IMAGE_HEIGHT * IMAGE_WIDTH`` values.
        :return: the predicted captcha text.
        """
        x_input = tf.placeholder(tf.float32, [None, config.IMAGE_HEIGHT * config.IMAGE_WIDTH])
        keep_prob = tf.placeholder(tf.float32)  # dropout
        output = create_layer(x_input, keep_prob)
    
        saver = tf.train.Saver()
        with tf.Session() as sess:
            # Load the most recent checkpoint found in the working directory.
            saver.restore(sess, tf.train.latest_checkpoint('.'))
    
            per_char = tf.reshape(output, [-1, config.MAX_CAPTCHA, config.CHAR_SET_LEN])
            predict = tf.argmax(per_char, 2)
            text_list = sess.run(predict, feed_dict={x_input: [captcha_image], keep_prob: 1})
    
            # Re-encode the predicted indices as a one-hot vector so that
            # array_to_text can decode them.
            one_hot = np.zeros(config.MAX_CAPTCHA * config.CHAR_SET_LEN)
            for slot, char_index in enumerate(text_list[0].tolist()):
                one_hot[slot * config.CHAR_SET_LEN + char_index] = 1
    
            return array_to_text(one_hot)
    
    
    def validate_image():
        """Generate one captcha, run the model on it and print label vs. prediction."""
        text, image = gen_random_captcha_image()
    
        # Divide by 255 so the input is on the same scale as the training data;
        # the input scale strongly affects how the loss updates and has to suit
        # the learning rate.
        # refer https://www.youtube.com/watch?v=pU5TG_X7b6E&index=6&list=PLwY2GJhAPWRcZxxVFpNhhfivuW0kX15yG 21:00
        pixels = convert2gray(image).flatten() / 255
    
        predict_text = crack_captcha(pixels)
        print("label is : {} <----> predict is : {}".format(text, predict_text))
    
    
    # Script entry point: check the trained model against one random captcha.
    if __name__ == '__main__':
        validate_image()

结果

训练时间的长短和字符长度、训练字符种类完全相关。4位纯数字,大概4000次以内的训练批次,就可以达到90%以上准确率。当然,GPU跑得更快一点。

源码所在

https://github.com/BruceDone/tensorflow-demos