难以用 tesseract 检测数字

Posted 2023-03-16

技术标签:

【中文标题】难以用 tesseract 检测数字【英文标题】：Difficulty detecting digits with tesseract 【发布时间】：2021-12-28 14:31:48 【问题描述】：

我在检测以下类型图片上的文字时遇到了一些困难：

tesseract 似乎很难区分数字和图表。我的目标是找到每个数字及其位置。

对这张图片，我运行了以下代码，它本应在检测到的文本周围画出矩形：

import cv2
import pytesseract
from pytesseract import Output
import numpy as np


# Point pytesseract at the Tesseract executable used on this machine.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract'

# --- Pre-processing: grayscale -> median blur -> inverted Otsu threshold -> dilation ---
img = cv2.imread('Temp/VE_cropped.png')
img_processed = cv2.medianBlur(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), 3)
_, img_processed = cv2.threshold(
    img_processed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
)
kernel = np.ones((2, 2), np.uint8)
img_processed = cv2.dilate(img_processed, kernel, iterations=1)

# --- OCR: one entry per detected element, returned as a dict of parallel lists ---
dict_wordsDetected = pytesseract.image_to_data(img_processed, output_type=Output.DICT)

# Go back to three channels so that coloured rectangles can be drawn.
img_processed = cv2.cvtColor(img_processed, cv2.COLOR_GRAY2RGB)

# --- Draw a rectangle, padded by 10 px on every side, around each detection ---
pad = 10
boxes = zip(
    dict_wordsDetected['left'],
    dict_wordsDetected['top'],
    dict_wordsDetected['width'],
    dict_wordsDetected['height'],
)
for x, y, w, h in boxes:
    top_left = (x - pad, y - pad)
    bottom_right = (x + w + pad, y + h + pad)
    img_processed = cv2.rectangle(img_processed, top_left, bottom_right, (0, 0, 255), 2)

cv2.imshow("processed", img_processed)
cv2.waitKey(0)

运行后得到如下结果：

【问题讨论】：

确实如此，但即使是白底黑字。您只需添加：img_processed = cv2.threshold(img_processed, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1] AFTER img_processed = cv2.dilate(img_processed, kernel, iterations = 1) 除了在找到的文本周围移动矩形之外没有任何效果... 正如我所说，我的问题是 tesseract 无法识别 0455、0435 或 0453 等数字。您建议的命令只是调整红色矩形的大小，但我的问题是在绘制矩形之前. 对不起，我的帖子很清楚，需要这些插图。这是我必须使用的图像，为什么不显示呢？该框应该围绕数字。这只是为了表明 tesseract 找不到数字。这就是我在这里寻求帮助的原因：让tesseract找到数字，我就能找到坐标 【参考方案1】：

我想我明白你想要什么。首先，Tesseract 可以很好地解决许多问题，尤其是在那些图像易于 OCR 的示例中——也就是图像没有复杂背景的情况。在您的情况下，仅靠 Tesseract 或简单的图像阈值处理并不够，您必须在 OCR 之前对图像做更多的预处理。要解决您的问题，您必须清理图像，尽量只保留数字。这可能是一项艰苦的工作。

最近，我正在寻找一个代码来为具有复杂背景的图像应用 OCR，我找到了一些解决方案，我将向您展示的代码是基于这个 solution。

要提取数字（或者说尝试提取），您需要遵循以下步骤：

1. 将图像转换为灰度
2. 使用 Otsu 方法和反相操作进行图像阈值处理
3. 应用距离变换
4. 应用形态学运算，清理图像中的小点
5. 应用膨胀操作来放大数字
6. 查找轮廓，并根据每个轮廓的宽度和高度进行过滤
7. 为每个轮廓创建凸包（hull）对象，组成列表
8. 绘制凸包对象
9. 对掩码使用膨胀操作
10. 通过按位运算取回分割出的区域
11. 对预处理后的图像进行 OCR
12. 输出结果

我在这里展示的代码并不完美，我认为它可以改进，但我想向您展示解决问题的起点。

import cv2
import pytesseract
from pytesseract import Output
import numpy as np
import imutils

def _show_and_wait(title, image):
    """Display *image* in a window named *title* and block until a key is pressed."""
    cv2.imshow(title, image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()


# loading and resizing image
img = cv2.imread('ABV5H.png')
if img is None:
    # cv2.imread reports a missing/unreadable file by returning None instead of raising
    raise FileNotFoundError("Could not read image 'ABV5H.png'")
# The image is kept in OpenCV's native BGR channel order: cv2.imshow and
# cv2.imwrite interpret colour arrays as BGR, so converting to RGB here would
# swap red and blue in every colour image that is displayed or saved.
img = imutils.resize(img, width=900)

# gray scale
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_show_and_wait("Gray", gray)

# thresholding with Otsu method and inverse operation
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
_show_and_wait("Threshold", thresh)

# distance transform, rescaled to 0..255 (uint8) and thresholded again with Otsu
dist = cv2.distanceTransform(thresh, cv2.DIST_L2, 5)
dist = cv2.normalize(dist, dist, 0, 1.0, cv2.NORM_MINMAX)
dist = (dist * 255).astype('uint8')
dist = cv2.threshold(dist, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
_show_and_wait("Distance Transformation", dist)

# Morphological operation kernel (2,2) and OPEN method
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (2, 2))
opening = cv2.morphologyEx(dist, cv2.MORPH_OPEN, kernel)
cv2.imwrite("morphology.jpg", opening)
_show_and_wait("Morphology", opening)

# dilate operation to enlarge the numbers
# NOTE: `kernel` is rebound to the (3,3) cross here and is reused later in the script.
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
dilation = cv2.dilate(opening, kernel, iterations=1)
cv2.imwrite("dilated.jpg", dilation)
_show_and_wait("dilated", dilation)

# finding and grabbing the contours
cnts = cv2.findContours(dilation.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
output = img.copy()
for contour in cnts:
    cv2.drawContours(output, [contour], -1, (0, 0, 255), 3)
# Save the contour overlay itself (the original wrote `dilation` to this file).
cv2.imwrite("contours.jpg", output)
_show_and_wait("Contours", output)

# filtering the contours: keep those whose bounding box satisfies
# 5 <= width < 75 and 15 < height <= 35 (pixels)
nums = []
for c in cnts:
    _, _, box_w, box_h = cv2.boundingRect(c)
    if 5 <= box_w < 75 and 15 < box_h <= 35:
        nums.append(c)

output2 = img.copy()
for kept in nums:
    cv2.drawContours(output2, [kept], -1, (0, 0, 255), 2)
cv2.imshow("Filter", output2)
cv2.imwrite("filter.jpg", output2)
cv2.waitKey(0)
cv2.destroyAllWindows()

# one convex hull per retained contour
hull = [cv2.convexHull(c, False) for c in nums]

# empty (all-zero) single-channel image with the same height/width as `dilation`
mask = np.zeros(dilation.shape[:2], dtype='uint8')

# outline every hull on the mask with value 255, 1 px thick, 8-connected line
for single_hull in hull:
    cv2.drawContours(mask, [single_hull], -1, 255, thickness=1, lineType=8)

# dilating the mask (15 iterations with the current `kernel`) to have a
# proper image for the bitwise operation
mask = cv2.dilate(mask, kernel, iterations=15)
cv2.imshow("Dilated Mask", mask)
cv2.imwrite("dilated-mask.jpg", mask)
cv2.waitKey(0)
cv2.destroyAllWindows()

# bitwise operation: keep `dilation` only where the mask is non-zero
final = cv2.bitwise_and(dilation, dilation, mask=mask)
cv2.imshow("Pre-processed Image", final)
cv2.imwrite("pre-processed.jpg", final)
cv2.waitKey(0)
cv2.destroyAllWindows()


# page segmentation mode and character white list (digits only)
config = '--psm 12 -c tessedit_char_whitelist=0123456789'

# OCR'ing the image; the result is a dict of parallel lists
# ('text', 'left', 'top', 'width', 'height', ...)
dict_wordsDetected = pytesseract.image_to_data(
    final, config=config, output_type=Output.DICT
)

# filtering the detections and making a list of index:
# strip spaces first so that whitespace-only entries are dropped too
index = []
for idx, txt in enumerate(dict_wordsDetected['text']):
    cleaned = txt.replace(" ", "")
    if cleaned:
        dict_wordsDetected['text'][idx] = cleaned
        index.append(idx)

# draw a padded box and the recognised text for every retained detection
for i in index:
    x = dict_wordsDetected['left'][i]
    y = dict_wordsDetected['top'][i]
    w = dict_wordsDetected['width'][i]
    h = dict_wordsDetected['height'][i]
    cv2.rectangle(img, (x - 10, y - 10), (x + w + 10, y + h + 10), (0, 0, 255), 2)
    # The original built the label with "".format(...), which always yields an
    # empty string, so nothing was ever written next to the boxes.
    text = "{}".format(dict_wordsDetected['text'][i])
    cv2.putText(img, text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2)

cv2.imshow("Voilà le résultat", img)
cv2.imwrite('result.jpg', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

可视化一些操作

（我暂时无法上传我的图片。有一些图片的超链接。这些图片对应一些图片预处理步骤）

Output image after dilation · filtered contours · Mask after the hull operation and dilation · pre-processed image (the image that will be OCR'ed) · Results

结果

如您所见，我们可以在输入图像上找到数字。我们得到了一些不错的检测结果，但另一方面也有不准确的输出，主要原因在于图像预处理：即使已经做了若干变换，图像仍然带有噪声。解决您问题的关键就是图像预处理。另外需要记住的一点是，Tesseract 并不完美，它需要质量好的图像才能正常工作。除此之外，您还需要了解 --psm 模式（页面分割模式）来改进 OCR 效果，并使用白名单来避免不需要的检测结果。正如我所说，我们已经得到了不错的结果，但如果您的任务只能使用 OpenCV 和 Tesseract，我认为您还可以进一步改进；此外，也存在比这更简单的其他方法。

Si tu as besoin d'aide, tu peux me contacter, je préfère parler français que l'anglais。

【讨论】：

Merci beaucoup pour ton aide ! En effet j'aimerais bien pouvoir échanger avec toi sur ces pratiques, mais impossible d'envoyer de messages directs depuis ***. Souhaites-tu que je t'envoie mon Linkedin ? De rien. Je suis en reconversion vers l'intelligence artificielle, connaître ce genre de technique est fondamental pour travailler avec Computer Vision. Donc on peut se contacter pour quelques échanges, c'est toujours important de parler, on apprend beaucoup.

以上是关于难以用 tesseract 检测数字的主要内容，如果未能解决你的问题，请参考以下文章