tesseract 验证码识别前预处理之去干扰线

Posted 我的时光穿梭机

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了tesseract 验证码识别前预处理之去干扰线相关的知识,希望对你有一定的参考价值。

常见干扰线有以下几种模式。针对不同模式,采用不同策略处理可达到较好的效果。此处未穷举所有可能的模式,但可以发现,处理方法分两种:一是利用目标字符和背景的色彩差异,二是利用目标字符和背景的形状差异。在背景颜色固定且与目标字符颜色差异较大时,用色差法处理效果较佳;在背景颜色与目标字符颜色差异较小甚至无差异时,可针对二者呈现出的不同形态规律编写相应算法进行处理。

from PIL import Image
import numpy as np
import cv2
# Remove interference lines from a captcha image before handing it to OCR.
# Two families of techniques are shown: colour matching (Methods 1 and 3) and
# shape matching (Methods 2 and 4). Methods 1-3 are kept as commented-out
# alternatives for other interference patterns; Method 4 is the active one.
# NOTE(review): the published listing lost its indentation; the nesting used
# below (4-1, 4-2 and 4-3 all run for every pixel) is reconstructed from the
# statement order -- confirm against the intended logic.
WHITE = (255, 255, 255, 255)  # opaque white in RGBA

imgpath = r'C:\Users\Kelv\Pictures\Veri_codes\test66.png'
img2 = Image.open(imgpath)
# The last component of each pixel tuple is the alpha channel, which sets the
# transparency of the processed area: 0 is fully transparent, 255 is opaque.
# It is only available in 'RGBA' mode. For image modes see
# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
img2 = img2.convert('RGBA')
width, height = img2.size
for y in range(height):
    for x in range(width):
        try:
            r, g, b, a = img2.getpixel((x, y))

            # Method 1:
            # for regularly repeated background patterns
            # if (r,g,b) == (128,191,255) or (r,g,b) == (227,218,237):
            #     r,g,b,a = (255,255,255,255)

            # Method 2:
            # for randomly distributed fixed-shape dots
            # if 55<r<75 and 1<g<20 and 10<b<20:
            #     r,g,b,a = (255,255,255,255)
            #     for i in range(-8,8):
            #         for j in range(-8,8):
            #             img2.putpixel((x+i,y+j),(r,g,b,a))

            # Method 3:
            # for straight line in specific colors
            # if (148<r<255 and 108<g<248 and 0<=b<130) or (210<r<255 and 183<g<244 and 134<=b<194) \
            #         or (219<r<255 and 183<g<244 and 200<=b<255) or (23<r<80 and 33<g<55 and 0<=b<45) \
            #         or (67<r<127 and 43<g<100 and 31<=b<110) or (167<r<187 and 123<g<143 and 81<=b<101) \
            #         or (87<r<147 and 130<g<144 and 101<=b<170):
            #     r,g,b,a = (255,255,255,255)
            #     img2.putpixel((x,y),(r,g,b,a))

            # Method 4-1:
            # for cubic-pixel line
            # A pixel in one of the two line colours that is the top-left
            # corner of a uniform 2x2 square is erased together with the
            # other three pixels of that square.
            if (r, g, b) in ((101, 101, 254), (254, 101, 101)):
                current = (r, g, b, a)
                square = ((x + 1, y), (x, y + 1), (x + 1, y + 1))
                if all(img2.getpixel(pos) == current for pos in square):
                    r, g, b, a = WHITE
                    img2.putpixel((x, y), WHITE)
                    for pos in square:
                        img2.putpixel(pos, WHITE)

            # Method 4-2:
            # If the pixel's R+G+B equals the mean R+G+B of one horizontal and
            # one vertical neighbour, erase all three. The four corner
            # orientations are tried in a fixed order and only the first
            # match is applied.
            for dx, dy in ((1, 1), (-1, 1), (-1, -1), (1, -1)):
                beside = (x + dx, y)
                above_or_below = (x, y + dy)
                neighbour_mean = np.sum(
                    img2.getpixel(beside)[:3]
                    + img2.getpixel(above_or_below)[:3]) / 2
                if np.sum((r, g, b)) == neighbour_mean:
                    r, g, b, a = WHITE
                    img2.putpixel((x, y), WHITE)
                    img2.putpixel(beside, WHITE)
                    img2.putpixel(above_or_below, WHITE)
                    break

            # Method 4-3:
            # Erase an isolated pixel, i.e. one whose eight neighbours are all
            # opaque white. The neighbours are read eagerly so that a pixel on
            # the right/bottom border is skipped as a whole. Only the centre
            # is written: the neighbours are already white when the test holds.
            neighbours = [
                img2.getpixel((x - 1, y - 1)), img2.getpixel((x, y - 1)), img2.getpixel((x + 1, y - 1)),
                img2.getpixel((x - 1, y)), img2.getpixel((x + 1, y)),
                img2.getpixel((x - 1, y + 1)), img2.getpixel((x, y + 1)), img2.getpixel((x + 1, y + 1)),
            ]
            if all(pixel == WHITE for pixel in neighbours):
                img2.putpixel((x, y), WHITE)
        except IndexError:
            # A neighbour lies outside the image (right/bottom border): leave
            # the rest of this pixel's processing undone and move on.
            # NOTE(review): negative coordinates (x-1 / y-1 on the first
            # column/row) may wrap to the opposite edge in Pillow instead of
            # raising -- confirm, and add explicit bounds checks if so.
            continue

# Save next to the input as "<name>x.png". rsplit keeps dots that appear in
# directory names from truncating the output path.
img2.save(imgpath.rsplit('.', 1)[0] + 'x.png')
img2.show()

Pattern 1

before processing

after processing

tesseract 验证码识别前预处理之去干扰线

Pattern 2

before processing

tesseract 验证码识别前预处理之去干扰线

after processing

tesseract 验证码识别前预处理之去干扰线

Pattern 3

before processing

tesseract 验证码识别前预处理之去干扰线

after processing

tesseract 验证码识别前预处理之去干扰线

Pattern 4

before processing

after processing


以上是关于tesseract 验证码识别前预处理之去干扰线的主要内容,如果未能解决你的问题,请参考以下文章

爬虫遇到验证码?Python助你弹窗处理和验证码识别

Python - PIL-pytesseract-tesseract验证码识别

自动化脚本无法处理验证码?Python图片识别库Tesseract实战

python+tesseract验证码识别的一点小心得

验证码识别

图形验证码识别——图像预处理:二值化