Question

I am trying to extract data from short, non-uniform product descriptions in order to partially automate making product web pages for my company's online store. Unfortunately, the descriptions are not uniform. Thanks to this site, I have learned enough about regex to make a fair stab at it.

In the product measurement method, here is a failing test.

w. This product is 68 cm by 22 cm by 73 cm -- Length: 68 cm Width: 73 cm Height:

But this very similar test doesn't fail. Why?

x. This product is 68 cm x 22 cm x 73 cm -- Length: 68 cm Width: 22 cm Height: 73 cm

Here is the test class. As a regex noob, I'm sure that I'm not doing it very efficiently, so suggestions on efficiency would be great. And please let me know if you can think of any other failing test cases.

import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class TesterClass {
    public static final String[] testArray = 
    {
        "a. Dynabrade 4\" Discs 34-333-102",
        "b. Mercer 4 Inch Discs",
        "c. Mercer 4in Discs",
        "d. Carbo CleanAir 6\' Vacuum Tube Attachment",
        "e. 4 feet",
        "f. 4 Ft",
        "g. 4 foot",
        "h. Carborundum 2-3/4\" Tape 4 yd Roll 97580",
        "i. I want 5 Inches and later 6 Feet",
        "j. I don't want this one pyrex 9",
        "k. This is 4 Inches x 5 Inches x 6 Feet in Size",
        "l. This one is 6 x 5 Inches",
        "m. This is 4\" x 5\" x 6\' in size",
        "n. Something 4-3/4\" Long",
        "o. I don't want 9 xtreme things",
        "p. I don't want 9 men",
        "q. 674m",
        "r. 4 Inches",
        "s. 5x8",
        "t. P58\"",
        "u. 5 x 7",
        "v. 6 yards",
        "w. This product is 68 cm by 22 cm by 73 cm",
        "x. This product is 68 cm x 22 cm x 73 cm"
    };
    public static final String[] FIELDNAMES = {"Length: ","Width: ","Height: "};

    public static final String 
        MEASURE = "(f(ee)?(oo)?t|in(ch)?(es)?|y(ar)?d(s)?|cm|m|meter(s)?|\"|\')",
        MAYBE_MEASURE = MEASURE + "?",
        NUMBER = "([0-9\\-/]+)",
        X = "[(x)(by)]",
        SPACE = "\\s",
        MAYBE_SPACE = "\\s?",
        SPACE_OR_END = "(\\s|$)",
        START = "^",
        END = "$",
        TAB = "\t";

    public static final Pattern 
        regular = Pattern.compile(NUMBER + MAYBE_SPACE + MEASURE + SPACE_OR_END, Pattern.CASE_INSENSITIVE),
        lengthXwidth = Pattern.compile(NUMBER + MAYBE_SPACE + MAYBE_MEASURE + MAYBE_SPACE + X + MAYBE_SPACE + NUMBER + MAYBE_MEASURE, Pattern.CASE_INSENSITIVE),
        beforeX = Pattern.compile(NUMBER + MAYBE_SPACE + MAYBE_MEASURE + MAYBE_SPACE + END, Pattern.CASE_INSENSITIVE),
        afterX = Pattern.compile(START + MAYBE_SPACE + NUMBER + MAYBE_SPACE + MAYBE_MEASURE, Pattern.CASE_INSENSITIVE),
        measure = Pattern.compile(MAYBE_SPACE + MEASURE, Pattern.CASE_INSENSITIVE);


    public static void main(String[] args)
    {
        for (String testString:testArray)
        {
            String[] fields = {"","",""};
            int match = 0;
            Matcher lengthXwidthMatcher = lengthXwidth.matcher(testString);
            Matcher regularMatcher = regular.matcher(testString);

            if (lengthXwidthMatcher.find())
            {
                String[] split = testString.split(X);
                for (int i = 0; i<split.length; i++)
                {
                    Matcher beforeXMatcher = beforeX.matcher(split[i]);
                    Matcher afterXMatcher = afterX.matcher(split[i]);
                    if (beforeXMatcher.find() && match==0)
                    {
                        fields[0] = beforeXMatcher.group();
                        match++;
                    }
                    if (afterXMatcher.find())
                    {
                        if (match==1)
                        {
                            fields[1] = afterXMatcher.group();
                            match++;
                        }
                        else if (match==2)
                        {
                            fields[2] = afterXMatcher.group();
                            match++;
                        }
                    }
                }
                Matcher lengthHasMeasure = measure.matcher(fields[0]);
                Matcher widthHasMeasure = measure.matcher(fields[1]);
                Matcher heightHasMeasure = measure.matcher(fields[2]);
                if (heightHasMeasure.find()==true)
                {
                    if (lengthHasMeasure.find()==false)
                    {
                        fields[0] = fields[0] + heightHasMeasure.group();
                    }
                    if (widthHasMeasure.find()==false)
                    {
                        fields[1] = fields[1] + heightHasMeasure.group();
                    }
                }
                else if (widthHasMeasure.find()==true)
                {
                    if (lengthHasMeasure.find()==false)
                    {
                        fields[0] = fields[0] + widthHasMeasure.group();
                    }
                }
            }
            else if(regularMatcher.find())
            {
                fields[0] = regularMatcher.group();
                match++;
                while (regularMatcher.find() && match<3)
                {
                    fields[match] = regularMatcher.group();
                }
            }

            System.out.println(testString + " -- " + TAB + FIELDNAMES[0] + fields[0] + TAB + FIELDNAMES[1] + fields[1] + TAB + FIELDNAMES[2] + fields[2]);
        }
    }
}

No correct solution

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top