Question

I have an image that has some text in it. I want to send the image to OCR but the image has some white noise in it so the OCR results aren't that great. I've tried to erode/dilate the image but couldn't get the perfect threshold to work. Since all the text in the images will be perfectly horizontal I tried the Hough Transform.

Here is what the image looks like when I run the sample hough transform program bundled with OpenCV.

Question

  • How can I black out everything except where the red lines are? OR how can I crop out a separate image for each of the areas highlighted by the red lines?

  • I would only like to concentrate on lines that are horizontal, I can discard the diagonal lines.

Either option will work for me when sending the image to OCR. However, I'd like to try both to see which gives the best results.

Was it helpful?

Solution

How-tos, with output

  • How can I black out everything except where the red lines are?
    • dotess2()
    • ['Footel text goes he: e\n', 'Some mole hele\n', 'Some Text Here\n']
  • OR how can I crop out a separate image for each of the areas highlighted by the red lines?
    • dotess1()
    • ['Foolel text goes he: e\n', 'Some mole hele\n', 'Some Text Here\n', 'Directions\n']

code

# -*- coding: utf-8 -*- 
import cv2
import numpy as np
import math
import subprocess
import os
import operator

# Clean-up / initialisation.

# Characters counted as OCR noise when recognised lines are scored.
# The leading backslash is escaped explicitly: '\/' is an invalid escape
# sequence that newer Python versions warn about; '\\/' is the same value.
junk = '\\/,-‘’“ ”?.\';!{§_~!@#$%^&*()_+-|:}»£[]¢€¥°><'

# Scratch directory holding one cropped image per detected text line.
# It is created on demand and emptied at start-up so that crops left over
# from an earlier run are not OCR'd again.
tmpdir = './tmp'
if not os.path.exists(tmpdir):
    os.makedirs(tmpdir)
for path, subdirs, files in os.walk(tmpdir):
    for name in files:
        os.remove(os.path.join(path, name))

# When the preprocessing is not perfect, there will be junk in the OCR result.
# This is a crude means of pushing the junk lines to the end of the list.
def resfilter(res, junk_chars=None):
    """Return the distinct lines of *res*, most text-like first.

    Every distinct line is scored character by character: -1 for a character
    contained in ``junk_chars``, +0.5 for a digit and +1 for anything else
    (line terminators included). The lines are returned in descending score
    order; lines with the same score keep the order in which they first
    appear in *res*.

    res        -- iterable of strings, e.g. the lines read from tesseract
    junk_chars -- string of characters treated as noise; when omitted the
                  module-level ``junk`` string is used
    """
    if junk_chars is None:
        junk_chars = junk

    scores = {}
    ordered = []
    for line in res:
        if line in scores:
            # duplicate line: already scored
            continue
        score = 0.
        for ch in line:
            if ch in junk_chars:
                score -= 1
            elif ch.isdigit():
                score += .5
            else:
                score += 1
        scores[line] = score
        ordered.append(line)

    # list.sort is stable, also with reverse=True, so ties stay in
    # first-seen order instead of depending on dict/set iteration order.
    ordered.sort(key=scores.get, reverse=True)
    return ordered

def dotess1():
    """OCR every cropped image under ``tmpdir`` in line mode and print the result.

    Each file found under ``tmpdir`` is loaded, converted to grayscale,
    written to 'tmp.jpg' and handed to tesseract with '-psm 7'. The non-blank
    lines tesseract writes to 'tessres.txt' are collected over all crops,
    ranked with resfilter() and printed.

    Files that OpenCV cannot decode, and crops for which tesseract exits with
    a non-zero status, are skipped.
    """
    res = []
    for path, subdirs, files in os.walk(tmpdir):
        for name in files:
            fpath = os.path.join(path, name)
            img = cv2.imread(fpath)
            if img is None:
                # cv2.imread returns None for a file it cannot decode
                continue
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

            # Disabled hint: if the text is too small/contains noise etc,
            # resize and maintain aspect ratio:
            #   if gray.shape[1]<100:
            #       gray=cv2.resize(gray,(int(100/gray.shape[0]*gray.shape[1]),100))

            cv2.imwrite('tmp.jpg', gray)
            args = ['tesseract.exe', 'tmp.jpg', 'tessres', '-psm', '7', '-l', 'eng']
            # communicate() drains the pipes; subprocess.call() with PIPE can
            # block forever once the child fills a pipe buffer.
            proc = subprocess.Popen(args, stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            proc.communicate()
            if proc.returncode != 0:
                # tessres.txt would be missing or left over from a previous crop
                continue
            with open('tessres.txt') as f:
                res.extend(line for line in f if line.strip() != '')
    print(resfilter(res))


def dotess2():
    """OCR 'clean.jpg' in page mode and print the recognised lines.

    Runs tesseract with '-psm 3' on 'clean.jpg', reads the non-blank lines it
    writes to 'tessres.txt', ranks them with resfilter() and prints them.
    If tesseract exits with a non-zero status nothing is read and an empty
    result is printed.
    """
    res = []
    args = ['tesseract.exe', 'clean.jpg', 'tessres', '-psm', '3', '-l', 'eng']
    # communicate() drains the pipes; subprocess.call() with PIPE can block
    # forever once the child fills a pipe buffer.
    proc = subprocess.Popen(args, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    proc.communicate()
    if proc.returncode == 0:
        # only trust tessres.txt when this run actually produced it
        with open('tessres.txt') as f:
            res.extend(line for line in f if line.strip() != '')
    print(resfilter(res))

# ---------------------------------------------------------------------------
# start of code
# ---------------------------------------------------------------------------
img = cv2.imread('c:/data/ocr3.png')
if img is None:
    # cv2.imread signals an unreadable/missing file by returning None
    raise IOError("could not read image 'c:/data/ocr3.png'")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# apertureSize is given by keyword: the 4th positional parameter of cv2.Canny
# is the optional output array, not the aperture size.
canny = cv2.Canny(gray, 50, 200, apertureSize=3)
cv2.imshow('canny', canny)

# remove the actual horizontal lines so that hough wont detect them:
# opening with a 1x11 horizontal kernel keeps only long horizontal edge runs,
# which are then subtracted from the edge map.
linek = np.zeros((11, 11), dtype=np.uint8)
linek[5, ...] = 1
horizontal = cv2.morphologyEx(canny, cv2.MORPH_OPEN, linek, iterations=1)
canny -= horizontal
cv2.imshow('canny no horizontal lines', canny)

# draw a fat line so that you can box it up
# theta resolution of pi/2 restricts detection to axis-aligned segments.
# minLineLength/maxLineGap are keywords so they cannot land in the optional
# output-array slot `lines`.
lines = cv2.HoughLinesP(canny, 1, math.pi/2, 50,
                        minLineLength=50, maxLineGap=20)
linemask = np.zeros(gray.shape, gray.dtype)
if lines is not None:  # HoughLinesP returns None when nothing is detected
    # reshape copes with both the (1, N, 4) and the (N, 1, 4) result layout
    for x1, y1, x2, y2 in np.reshape(lines, (-1, 4)):
        if y1 == y2:  # check horizontal
            cv2.line(linemask, (int(x1), int(y1)), (int(x2), int(y2)), 255, 30)

cv2.imshow('linemask', linemask)

# * two methods of doing ocr, line mode and page mode
# * boxmask is used so that a clean image can be saved for page mode
# * for every detected box, the roi is cropped and saved so that tesseract
#   can be run in line mode
boxmask = np.zeros(gray.shape, gray.dtype)
# [-2] selects the contour list whether findContours returns two values
# (contours, hierarchy) or three (image, contours, hierarchy).
contours = cv2.findContours(linemask, cv2.RETR_LIST,
                            cv2.CHAIN_APPROX_SIMPLE)[-2]
for idx, cnt in enumerate(contours, 1):
    x, y, w, h = cv2.boundingRect(cnt)
    roi = img[y:y+h, x:x+w].copy()
    cv2.imwrite('%s/%s.jpg' % (tmpdir, str(idx)), roi)
    cv2.rectangle(boxmask, (x, y), (x+w, y+h), 255, -1)

# everything outside the detected boxes is blacked out
clean = img & cv2.cvtColor(boxmask, cv2.COLOR_GRAY2BGR)
cv2.imshow('clean', clean)
cv2.imwrite('clean.jpg', clean)
cv2.imshow('img', img)

dotess1()
dotess2()
cv2.waitKey(0)
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top