使用广度优先搜索、OpenCV 和 Tesseract 绕过验证码
Posted 深度学习与计算机视觉
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了使用广度优先搜索、OpenCV 和 Tesseract 绕过验证码的相关知识,希望对你有一定的参考价值。
验证码
方法
- 转换为灰度
- 中值滤波器(内核大小 3)
- 图像阈值处理
- 岛屿去除
- 中值滤波器(内核大小 3)
def bfs(visited, queue, array, node):
    """Collect the 4-connected region of zero-valued cells reachable from ``node``.

    The search is iterative (breadth-first) rather than recursive, so large
    regions cannot hit the recursion limit.

    Args:
        visited: Set that receives every reached ``(row, col)`` tuple. It is
            mutated in place and is the only output of the function.
        queue: List used as the initial work list. Entries already in it are
            expanded before ``node``; it is always empty on return.
        array: 2-D array exposing ``shape`` and ``array[row, col]`` indexing.
            Only cells equal to 0 are traversed.
        node: ``(row, col)`` start coordinate. It is added to ``visited``
            without checking its own value.
    """
    # Local import keeps this function self-contained; deque gives O(1)
    # pops from the left, whereas list.pop(0) is O(n).
    from collections import deque

    def getNeighboor(array, node):
        """Return the in-bounds 4-neighbours of ``node`` whose value is 0."""
        row, col = node[0], node[1]
        n_rows, n_cols = array.shape[0], array.shape[1]
        # Same probing order as before: down, up, right, left.
        candidates = (
            (row + 1, col),
            (row - 1, col),
            (row, col + 1),
            (row, col - 1),
        )
        # The lower bound is inclusive (>= 0): row 0 and column 0 are valid
        # cells. The previous "> 0" test silently excluded them.
        return [
            (r, c)
            for r, c in candidates
            if 0 <= r < n_rows and 0 <= c < n_cols and array[r, c] == 0
        ]

    # Move the caller's pending entries into a deque; the caller's list ends
    # up empty, exactly as it did when it was drained with pop(0).
    pending = deque(queue)
    queue.clear()
    pending.append(node)
    visited.add(node)

    while pending:
        current_node = pending.popleft()
        for neighboor in getNeighboor(array, current_node):
            if neighboor not in visited:
                visited.add(neighboor)
                pending.append(neighboor)
def removeIsland(img_arr, threshold):
    """Erase small clusters of 0-valued pixels from ``img_arr``.

    Pixel convention: black is 0 and white is 1. Each cluster found by
    ``bfs`` holding at most ``threshold`` pixels is painted white (1).
    Larger clusters are parked at the temporary value 2 so the scan does not
    pick them up again, then restored to 0 at the end.

    Args:
        img_arr: Array of 0/1 pixels. It is modified in place while scanning.
        threshold: Largest cluster size (in pixels) that is still removed.

    Returns:
        A new array in which every temporary 2 has been turned back into 0.
    """
    # Keep going as long as an unprocessed black pixel remains.
    while (img_arr == 0).any():
        rows, cols = np.where(img_arr == 0)
        seed = (rows[0], cols[0])

        component = set()
        bfs(component, [], img_arr, seed)

        # 1 erases a small island; 2 marks a large cluster as already seen.
        fill = 1 if len(component) <= threshold else 2
        for pixel in component:
            img_arr[pixel[0], pixel[1]] = fill

    # Restore the large clusters to black.
    return np.where(img_arr == 2, 0, img_arr)
# Load the captcha image. cv2.imread does not raise on failure; it returns
# None, so fail early with a clear message instead of inside cvtColor.
img = cv2.imread("temp.png")
if img is None:
    raise FileNotFoundError('Could not read image "temp.png"')
# Convert to grayscale. cv2.imread yields channels in BGR order, so the BGR
# conversion code is needed for the red/blue luminance weights to be right.
c_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Median filter (kernel size 3)
out = cv2.medianBlur(c_gray, 3)
# Image thresholding: brighter than 195 -> 1 (white), everything else -> 0
# (black). Done in one step so a grey level of exactly 1 is not mistaken for
# an already-thresholded white pixel.
out = np.where(out > 195, 1, 0).astype(np.uint8)
# Islands removing with threshold = 30
out = removeIsland(out, 30)
# Median filter (kernel size 3)
out = cv2.medianBlur(out, 3)
# Convert to Image type (scaled back to 0/255) and pass it to tesseract
im = Image.fromarray(out * 255)
print(pytesseract.image_to_string(im))
结果
- 它包含正常和大写的字母数字字符。
- 验证码字符串大小为 4 或 5。
- 必须仅包含字母数字字符。
- 长度在 4 到 5 之间。
结论
以上是关于使用广度优先搜索OpenCV 和 Tesseract 绕过验证码的主要内容,如果未能解决你的问题,请参考以下文章