Extract words in rectangles from text

Question 1

I wrote the following program in C++ using OpenCV (I'm not familiar with Java + OpenCV). I've included the output for the two sample images that you provided. You may have to adjust the thresholds in the contour-filtering section for other images.

#include "stdafx.h"

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <iostream>

using namespace cv;
using namespace std;

int _tmain(int argc, _TCHAR* argv[])
{
    // load image as grayscale
    Mat im = imread(INPUT_FILE, CV_LOAD_IMAGE_GRAYSCALE);

    Mat morph;
    // morphological closing with a column filter : retain only large vertical edges
    Mat morphKernelV = getStructuringElement(MORPH_RECT, Size(1, 7));
    morphologyEx(im, morph, MORPH_CLOSE, morphKernelV);

    Mat bwV;
    // binarize: will contain only large vertical edges
    threshold(morph, bwV, 0, 255.0, CV_THRESH_BINARY | CV_THRESH_OTSU);

    // morphological closing with a row filter : retain only large horizontal edges
    Mat morphKernelH = getStructuringElement(MORPH_RECT, Size(7, 1));
    morphologyEx(im, morph, MORPH_CLOSE, morphKernelH);

    Mat bwH;
    // binarize: will contain only large horizontal edges
    threshold(morph, bwH, 0, 255.0, CV_THRESH_BINARY | CV_THRESH_OTSU);

    // combine the virtical and horizontal edges
    Mat bw = bwV & bwH;
    threshold(bw, bw, 128.0, 255.0, CV_THRESH_BINARY_INV);

    // just for illustration
    Mat rgb;
    cvtColor(im, rgb, CV_GRAY2BGR);

    // find contours
    vector<vector<Point>> contours;
    vector<Vec4i> hierarchy;
    findContours(bw, contours, hierarchy, CV_RETR_CCOMP, CV_CHAIN_APPROX_SIMPLE, Point(0, 0));
    // filter contours by area to obtain boxes
    double areaThL = bw.rows * .04 * bw.cols * .06;
    double areaThH = bw.rows * .7 * bw.cols * .7;
    double area = 0;
    for(int idx = 0; idx >= 0; idx = hierarchy[idx][0])
    {
        area = contourArea(contours[idx]); 
        if (area > areaThL && area < areaThH)
        {
            drawContours(rgb, contours, idx, Scalar(0, 0, 255), 2, 8, hierarchy);
            // take bounding rectangle. better to use filled countour as a mask
            // to extract the rectangle because then you won't get any stray elements
            Rect rect = boundingRect(contours[idx]);
            cout << "rect: (" << rect.x << ", " << rect.y << ") " << rect.width << " x " << rect.height << endl;
            Mat imRect(im, rect);
        }
    }

    return 0;
}

Result for the first image:

enter image description here

Result for the second image:

enter image description here

Question 2

I'm not sure whether "real" image processing skills are necessary.

Once you start tackling this problem with OpenCV, Sobel/Canny filters, edge detections and Hough transforms, it starts becoming rather involved. But maybe all this is not necessary here.

It all depends on how "predictable" the input is. That's why I asked in the comments whether the image can serve as a test case. IF the rectangles are always axis-aligned and don't have noise, distortions and interruptions, this can be solved with some trivial loops and pixel comparisons.

So IF you have potentially noisy or distorted input images, then ... good luck, you may have to acquire quite some image processing skills. If the image is not distorted or noisy, a solution like this one might be sufficient:

import java.awt.BorderLayout;
import java.awt.Dimension;
import java.awt.Graphics2D;
import java.awt.GridLayout;
import java.awt.Rectangle;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import javax.imageio.ImageIO;
import javax.swing.ImageIcon;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.SwingUtilities;


/**
 * Finds axis-aligned rectangles with dark outlines in an image by plain pixel
 * comparisons and shows the interior of every rectangle found as a separate
 * sub-image below the original image.
 *
 * The detection only looks at the top and left edge of a candidate, so it is
 * meant for clean input without noise, distortion or interrupted lines.
 */
public class RectangleInImageTest
{
    /** Image that is processed when no file name is given on the command line. */
    private static final String DEFAULT_INPUT_FILE = "gcnc2.jpg";

    /**
     * Loads the image, extracts the rectangle contents and opens the viewer.
     *
     * @param args optional: args[0] is the path of the image to process;
     *        without arguments {@code gcnc2.jpg} is used
     * @throws IOException if the file cannot be read or is not in an image
     *         format that ImageIO can decode
     */
    public static void main(String[] args) throws IOException
    {
        String fileName = args.length > 0 ? args[0] : DEFAULT_INPUT_FILE;
        BufferedImage loaded = ImageIO.read(new File(fileName));
        if (loaded == null)
        {
            // ImageIO.read reports "no suitable reader" with null instead of
            // an exception; without this check the conversion below would
            // fail with a NullPointerException.
            throw new IOException(
                "Not a supported image file: " + fileName);
        }
        final BufferedImage image = convertToARGB(loaded);
        final List<BufferedImage> subImages = scan(image);

        SwingUtilities.invokeLater(new Runnable()
        {
            @Override
            public void run()
            {
                createAndShowGUI(image, subImages);
            }
        });
    }

    /**
     * Builds the frame: the full image in the center and a scrollable row
     * with all extracted sub-images at the bottom.
     *
     * @param image the complete input image
     * @param subImages the rectangle contents extracted from the image
     */
    private static void createAndShowGUI(
        BufferedImage image,
        List<BufferedImage> subImages)
    {
        JFrame f = new JFrame();
        f.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
        f.getContentPane().setLayout(new BorderLayout());

        f.getContentPane().add(new JLabel(new ImageIcon(image)),
            BorderLayout.CENTER);

        // One row, as many columns as there are sub-images
        JPanel p = new JPanel(new GridLayout(1,0));
        for (BufferedImage subImage : subImages)
        {
            p.add(new JLabel(new ImageIcon(subImage)));
        }
        JPanel pp = new JPanel(new GridLayout(1,1));
        pp.setPreferredSize(new Dimension(800, 100));
        pp.add(new JScrollPane(p));
        f.getContentPane().add(pp, BorderLayout.SOUTH);
        f.setSize(800,800);
        f.setLocationRelativeTo(null);
        f.setVisible(true);
    }


    /**
     * Returns a copy of the given image with type TYPE_INT_ARGB.
     *
     * @param image the image to convert
     * @return a new ARGB image with the same size and content
     */
    public static BufferedImage convertToARGB(BufferedImage image)
    {
        BufferedImage newImage = new BufferedImage(
            image.getWidth(), image.getHeight(),
            BufferedImage.TYPE_INT_ARGB);
        Graphics2D g = newImage.createGraphics();
        g.drawImage(image, 0, 0, null);
        g.dispose();
        return newImage;
    }

    /**
     * Scans the image row by row for dark pixels that look like the upper
     * left corner of a rectangle and copies the interior of each sufficiently
     * large rectangle into its own image.
     *
     * @param image the image to scan
     * @return the rectangle interiors, in scan order
     */
    private static List<BufferedImage> scan(BufferedImage image)
    {
        List<BufferedImage> result = new ArrayList<BufferedImage>();
        int w = image.getWidth();
        int h = image.getHeight();
        for (int y=0; y<h; y++)
        {
            for (int x=0; x<w; x++)
            {
                int rgb = image.getRGB(x, y);
                if (!isBlack(rgb))
                {
                    continue;
                }
                if (!isUpperLeftCorner(image, x, y))
                {
                    continue;
                }
                Rectangle rectangle = extractRectangle(image, x,y);
                if (!isValidRectangle(rectangle))
                {
                    continue;
                }
                System.out.println("Rectangle "+rectangle);

                // Copy the rectangle without its outermost pixel on each
                // side, so that a one pixel wide outline is not part of the
                // sub-image. The size is positive because of the minimum
                // size enforced by isValidRectangle.
                BufferedImage part = new BufferedImage(
                    rectangle.width-2, rectangle.height-2, 
                    BufferedImage.TYPE_INT_ARGB);
                Graphics2D g = part.createGraphics();
                g.drawImage(image, 
                    0, 0, rectangle.width-2, rectangle.height-2,
                    x+1, y+1, x+rectangle.width-1, y+rectangle.height-1, null);
                g.dispose();
                result.add(part);
            }
        }
        return result;
    }

    /**
     * Returns whether the red, green and blue component of the given color
     * are all below 128. The alpha component is ignored.
     *
     * @param rgb the color as returned by BufferedImage.getRGB
     * @return true if the color counts as black
     */
    private static boolean isBlack(int rgb)
    {
        final int threshold = 128;
        int r = (rgb >> 16) & 0xFF;
        int g = (rgb >>  8) & 0xFF;
        int b = (rgb      ) & 0xFF;
        return 
            r < threshold &&
            g < threshold &&
            b < threshold;
    }

    /**
     * Returns whether the pixel at (x,y) can be the upper left corner of a
     * rectangle outline: the left, upper and all four diagonal neighbors
     * must lie inside the image and must not be black. The right and the
     * lower neighbor are not examined here.
     *
     * @param image the image
     * @param x the x-coordinate of the candidate pixel
     * @param y the y-coordinate of the candidate pixel
     * @return true if all six examined neighbors are valid and white
     */
    private static boolean isUpperLeftCorner(BufferedImage image, int x, int y)
    {
        if (!isValidAndWhite(image, x-1, y  )) return false;
        if (!isValidAndWhite(image, x  , y-1)) return false;
        if (!isValidAndWhite(image, x-1, y-1)) return false;
        if (!isValidAndWhite(image, x+1, y-1)) return false;
        if (!isValidAndWhite(image, x-1, y+1)) return false;
        if (!isValidAndWhite(image, x+1, y+1)) return false;
        return true;
    }

    /**
     * Returns whether (x,y) lies inside the image and the pixel there is not
     * black according to isBlack.
     *
     * @param image the image
     * @param x the x-coordinate, may be outside of the image
     * @param y the y-coordinate, may be outside of the image
     * @return false for coordinates outside of the image or black pixels
     */
    private static boolean isValidAndWhite(
        BufferedImage image, int x, int y)
    {
        int w = image.getWidth();
        int h = image.getHeight();
        if (x < 0 || x >= w)
        {
            return false;
        }
        if (y < 0 || y >= h)
        {
            return false;
        }
        int rgb = image.getRGB(x, y);
        return !isBlack(rgb);
    }


    /**
     * Measures the rectangle whose upper left corner is at (x0,y0) by
     * following the black pixels downwards along column x0 and to the right
     * along row y0 until the first non-black pixel.
     *
     * When a run of black pixels reaches the image border without hitting a
     * non-black pixel, the corresponding extent stays 0, and the result is
     * rejected by isValidRectangle.
     *
     * @param image the image
     * @param x0 the x-coordinate of the upper left corner
     * @param y0 the y-coordinate of the upper left corner
     * @return the rectangle spanned by the left and the top edge
     */
    private static Rectangle extractRectangle(
        BufferedImage image, int x0, int y0)
    {
        int w = image.getWidth();
        int h = image.getHeight();

        int x1 = x0;
        int y1 = y0;
        for (int y=y0; y<h; y++)
        {
            int rgb = image.getRGB(x0, y);
            if (!isBlack(rgb))
            {
                y1 = y;
                break;
            }
        }
        for (int x=x0; x<w; x++)
        {
            int rgb = image.getRGB(x, y0);
            if (!isBlack(rgb))
            {
                x1 = x;
                break;
            }
        }
        return new Rectangle(x0, y0, x1-x0, y1-y0);  
    }


    /**
     * Returns whether the rectangle is at least 16 pixels wide and 8 pixels
     * high. Smaller candidates are discarded by the scan.
     *
     * @param r the rectangle to check
     * @return true if the rectangle is large enough
     */
    private static boolean isValidRectangle(Rectangle r)
    {
        final int minWidth = 16;
        final int minHeight = 8;
        return r.width >= minWidth && r.height >= minHeight;
    }

}

Question 3

Here's an algorithm that I demonstrated on a similar project using OpenCV:

Find the squares on the original image;
Extract (crop) the squares found so each become a new image;
Perform OCR detection on each image.

Most of these references are not in Java, but I assume you have the skills to convert C/C++ code to Java (btw, cv::Mat is equivalent to IplImage).

Question 4

First of all, I hope you already have some image processing background, because you will need it to continue :)

Here is a link about ways to do it : https://dsp.stackexchange.com/questions/3324/how-to-detect-edges-and-rectangles

But to sum up, the most commonly used method is to apply a Canny edge detector and then a Hough transform to detect the straight lines, and then to find the rectangles from those results. In fact, Hough is usually used to detect straight lines, and a rectangle is just 4 straight lines with a 90° angle between adjacent ones. So using all of this you may be able to refine your search ;)

Hope it will help ;)

Question 5

One possible solution is to perform connected component analysis after binarizing the image with an adaptive threshold. Then calculate the median width of the connected components; if a connected component's width is more than 5 times the median width, that component is one of the rectangles we are looking for. The following code illustrates this idea:

    Mat im = imread(inputFileName,CV_LOAD_IMAGE_GRAYSCALE);
Mat outputIm(im.rows,im.cols,CV_8U, Scalar(0));


    Mat bi;



// step 1: adaptive thresholding 
adaptiveThreshold(im,bi,255,ADAPTIVE_THRESH_GAUSSIAN_C,THRESH_BINARY,7,50);

threshold(bi, bi, 128.0, 255.0, CV_THRESH_BINARY_INV);


    // step 2: connected component analysis
std::vector<std::vector<cv::Point> > contours;

findContours(bi, contours, CV_RETR_EXTERNAL , CV_CHAIN_APPROX_NONE);  

    // step 3: analyze these blobs
double area;
std::vector<double> areaArray;
for(int i=0; i<contours.size(); i++)
{
    cv::Rect rect = boundingRect(contours[i]);
    area = rect.width;
    areaArray.push_back(area);
}
std::vector<double> sortedAreaArray;
sortedAreaArray = areaArray;
size_t n = sortedAreaArray.size() / 2;
    nth_element(sortedAreaArray.begin(), sortedAreaArray.begin()+n, sortedAreaArray.end());

double medianArea = sortedAreaArray[n];

for(int i=0; i<contours.size(); i++)
{
    if(areaArray[i]>5*medianArea)
    {
        for(int j=0; j<contours[i].size(); j++)
        {
            int x = contours[i][j].x;
            int y = contours[i][j].y;
            int pos = x+y*bi.cols;
            outputIm.data[pos]=255;
        }
    }

}
imwrite(outputFileName,outputIm);

The output rectangles can be shown:

enter image description here