Python3.4 12306 3月验证码识别

Posted 2020-09-19

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了Python3.4 12306 3月验证码识别相关的知识，希望对你有一定的参考价值。

import ssl
import json
from PIL import Image
import requests
import re
import urllib.request as urllib2
# Replace ssl.create_default_context with the non-verifying context factory
# when the interpreter provides one.
# SECURITY: any code that calls ssl.create_default_context() after this point
# gets a context that does not verify certificates.
# NOTE(review): urllib may build its HTTPS context through
# ssl._create_default_https_context rather than this name -- confirm that the
# patch actually has the intended effect on the urlopen() calls in this file.
if hasattr(ssl, '_create_unverified_context'):
    ssl.create_default_context = ssl._create_unverified_context

# User-Agent header value sent with the stu.baidu.com requests.
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36"
# URL that get_img() downloads the captcha image from.
pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.21191171556711197"
def imgCut():
    """Crop a fixed box out of ./tmp.jpg and save it as ./text.jpg.

    ``./tmp.jpg`` must already exist in the current working directory;
    ``./text.jpg`` is the file that ocrApi() uploads. Returns None.
    """
    # (left, upper, right, lower) pixel box handed to Image.crop.
    box = (120, 0, 290, 25)
    # The context manager closes the source image file once the crop is saved.
    with Image.open('./tmp.jpg') as pic_obj:
        region = pic_obj.crop(box)
        region.save('./text.jpg')
def ocrApi():
    """Upload ./text.jpg to the cn.docs88.com converter and return its text output.

    The image is POSTed as a multipart form; the response body is turned into
    a result URL on the same host, which is then downloaded and decoded.

    Returns:
        The decoded body of the result URL as a str.

    Raises:
        Whatever ``open``, ``requests.post`` or ``urlopen`` raise on I/O or
        network failure; no errors are handled here.
    """
    filename = './text.jpg'
    upload_pic_url = "http://cn.docs88.com/pdftowordupload2.php"
    filename_tmp = filename.split('/')[-1]
    headers_fake = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'cn.docs88.com',
        'Origin': 'http://cn.docs88.com',
        'User-Agent': 'Mozilla/5.0 (KHTML, like Gecko) Chrome/41.0.2272.89',
        'X-Requested-With': 'ShockwaveFlash/17.0.0.134',
    }
    para = {
        'Filename': filename_tmp,
        'sourcename': filename_tmp,
        'sourcelanguage': 'cn',
        'desttype': 'txt',
        'Upload': 'Submit Query',
    }
    # Keep the file open only for the duration of the upload.
    with open(filename, 'rb') as pic_file:
        upload_pic = requests.post(
            upload_pic_url,
            data=para,
            files={"Filedata": pic_file},
            headers=headers_fake,
        )

    # str() of a bytes body looks like "b'...'"; [5:-1] drops the "b'" prefix,
    # the first three characters of the body, and the closing quote.
    # NOTE(review): presumably the service prefixes the result path with three
    # characters that must be skipped -- confirm against a live response.
    text_result_url = 'http://cn.docs88.com/' + str(upload_pic.content)[5:-1]
    with urllib2.urlopen(text_result_url) as result_resp:
        text_result = result_resp.read().decode()
    return text_result
def get_img():
    """Download the image at ``pic_url``, store it as ./tmp.jpg and open it.

    Returns:
        The result of ``Image.open('./tmp.jpg')`` for the freshly written file.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib2.urlopen(pic_url) as resp:
        raw = resp.read()
    with open('./tmp.jpg', 'wb') as fp:
        fp.write(raw)
    return Image.open('./tmp.jpg')
def get_sub_img(im, x, y):
    """Crop one tile out of the image grid and return it.

    Args:
        im: object exposing ``crop((left, top, right, bottom))``.
        x: column index, asserted to be within 0..3.
        y: row index, asserted to be within 0..2.

    Returns:
        Whatever ``im.crop`` returns for the computed box.
    """
    assert 0 <= x <= 3
    assert 0 <= y <= 2
    # The original note on this function said 68, but the arithmetic uses 67.
    tile = 67
    # Distance between the origins of two neighbouring tiles (tile + 5 gap).
    pitch = tile + 5
    x0 = 5 + pitch * x
    y0 = 41 + pitch * y
    return im.crop((x0, y0, x0 + tile, y0 + tile))
def baidu_stu_lookup(im):
    """Run an image through stu.baidu.com and return the extracted keywords.

    The image is saved to ./query_temp_img.png, its bytes are POSTed to the
    upload endpoint, and the response body is passed (URL-quoted) as
    ``queryImageUrl`` to the search page. The search page HTML is handed to
    baidu_stu_html_extract().

    Args:
        im: object exposing ``save(path)`` that writes an image file.

    Returns:
        The string returned by baidu_stu_html_extract() for the result page.
    """
    url = "http://stu.baidu.com/n/image?fr=html5&needRawImageUrl=true&id=WU_FILE_0&name=233.png&type=image%2Fpng&lastModifiedDate=Mon+Mar+16+2015+20%3A49%3A11+GMT%2B0800+(CST)&size="
    im.save("./query_temp_img.png")
    with open("./query_temp_img.png", 'rb') as img_file:
        raw = img_file.read()
    # The upload URL ends with "size=", so append the payload length.
    url = url + str(len(raw))
    req = urllib2.Request(url, raw, {'Content-Type': 'image/png', 'User-Agent': UA})
    with urllib2.urlopen(req) as upload_resp:
        resp_url = upload_resp.read()
    # resp_url is the raw bytes body of the upload response; quote() accepts bytes.
    url = "http://stu.baidu.com/n/searchpc?queryImageUrl=" + urllib2.quote(resp_url)
    req = urllib2.Request(url, headers={'User-Agent': UA})
    with urllib2.urlopen(req) as resp:
        html = resp.read().decode()
    return baidu_stu_html_extract(html)
def baidu_stu_html_extract(html):
    """Pull the keyword list out of a search result page.

    Looks for the first ``keywords:'...'`` fragment in *html*, un-escapes it
    into JSON, and joins the ``keyword`` field of every item with ``|``.

    Args:
        html: page source as a str.

    Returns:
        The keywords joined by ``|``, or the literal ``'[UNKOWN]'`` (spelling
        kept as-is because it is a runtime value) when the fragment is missing
        or the list is empty.

    Raises:
        ValueError: if the un-escaped fragment is not valid JSON.
        KeyError / TypeError: if an item has no ``keyword`` entry.
    """
    found = re.search(r"keywords:'(.*?)'", html)
    if found is None:
        return '[UNKOWN]'
    # The page embeds JSON with quotes written as the text \x22 and with
    # doubled backslashes; undo both before parsing.
    json_str = found.group(1).replace('\\x22', '"').replace('\\\\', '\\')
    result = [item['keyword'] for item in json.loads(json_str)]
    return '|'.join(result) if result else '[UNKOWN]'
if __name__ == '__main__':
    # Download the captcha, crop the strip that goes to OCR, and read it.
    im = get_img()
    imgCut()
    captcha_text = ocrApi()
    print(captcha_text)

    # Look up each of the 2 x 4 tiles; keys are 1-based tile numbers in
    # row-major order, values are the keyword strings for that tile.
    dic_list = {}
    count = 0
    for y in range(2):
        for x in range(4):
            count += 1
            im2 = get_sub_img(im, x, y)
            result = baidu_stu_lookup(im2)
            dic_list[count] = result
            print((y, x), result)

    # Compare on the stripped text so the characters tested are the ones the
    # emptiness check is based on; whitespace inside the text is skipped so it
    # cannot match a keyword string by accident.
    wanted = [ch for ch in captcha_text.strip() if not ch.isspace()]
    if wanted:
        print('\n可能的结果是:')
        maybe_result = []
        # One entry per tile, in tile order, when any OCR character occurs in
        # that tile's keyword string.
        for v in sorted(dic_list):
            if any(ch in dic_list[v] for ch in wanted):
                maybe_result.append('%s --- %s' % (v, dic_list[v]))
        for r in maybe_result:
            print(r)
        # Only report failure when nothing matched (the former for/else ran
        # unconditionally because the loop never breaks).
        if not maybe_result:
            print('False')

<span style="font-family: Arial, Helvetica, sans-serif;">改自 https://gist.github.com/Evi1m0/fbbdb1ba7c66cc4e1bb2</span>

<span style="font-family: Arial, Helvetica, sans-serif;"></span><h2 style="font-family: 'Microsoft Yahei', sans-serif; margin: 0px; padding: 0px; line-height: 26px;">转载请注明作者与出处：<a target="_blank" href="http://blog.csdn.net/u013511642" style="color: rgb(202, 0, 0); text-decoration: none;">http://blog.csdn.net/u013511642</a>   王小涛_同學</h2>