Question

I am creating a small application that will open a Word document, scan it for credit card numbers (in several different formats), replace the matching text, and then save and close the document.

My code is fairly simple:

using System;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using Word = Microsoft.Office.Interop.Word;

namespace ParseFilesAndRemoveRegExp
{
    /// <summary>
    /// Console entry point: performs one search-and-replace pass over the document.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            FileManagement m = new FileManagement();
            m.OpenSearchAndReplace();
        }
    }

    /// <summary>
    /// Drives Word through COM interop: opens a document, replaces every match of a
    /// search pattern, and saves the document.
    /// </summary>
    class FileManagement
    {
        Word.Application wordapp;

        /// <summary>
        /// Starts the Word automation instance. If Word cannot be started the error is
        /// written to standard error and <c>wordapp</c> stays null, which
        /// <see cref="OpenSearchAndReplace"/> checks before doing any work.
        /// </summary>
        public FileManagement()
        {
            try
            {
                wordapp = new Word.Application();
            }
            catch (Exception ex)
            {
                // Report the failure instead of silently discarding it.
                Console.Error.WriteLine(ex);
            }
        }

        /// <summary>
        /// Opens the document, runs a replace-all with wildcard matching enabled, and
        /// saves the result. The document is closed and Word is shut down afterwards
        /// whether or not the operation succeeded, so an instance can be used for one
        /// pass only. Failures are written to standard error and are not rethrown.
        /// </summary>
        internal void OpenSearchAndReplace()
        {
            if (wordapp == null)
            {
                Console.Error.WriteLine("Word is not available; nothing was processed.");
                return;
            }

            object nullobj = System.Reflection.Missing.Value;
            // Changes are persisted explicitly through doc.Save(); on shutdown nothing
            // else should be saved and no save prompt should appear.
            object doNotSave = Word.WdSaveOptions.wdDoNotSaveChanges;
            Word.Document doc = null;

            try
            {
                // Verbatim literal: backslashes are written once.
                object filename = @"c:\temp\document.docx";
                object replaceAll = Word.WdReplace.wdReplaceAll;
                object matchWildCards = true;

                doc = wordapp.Documents.Open(ref filename, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                             ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                             ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
                doc.Activate();
                wordapp.Selection.Find.ClearFormatting();

                // Known-working alternative for 16 consecutive digits:
                //wordapp.Selection.Find.Text = "[0-9]{16}";

                // This is the expression Word rejects with the COMException discussed in
                // the text; it is kept exactly as posted. Note that "\b" in a regular C#
                // string literal is the backspace escape, and that Word evaluates the
                // text with its own wildcard syntax because MatchWildcards is true.
                wordapp.Selection.Find.Text = "\b(?:[0-9][ -]*?){13,16}\b";
                wordapp.Selection.Find.Replacement.ClearFormatting();
                wordapp.Selection.Find.Replacement.Text = "---Cardnumber automatically removed---";

                wordapp.Selection.Find.Execute(ref nullobj, ref nullobj, ref nullobj, ref matchWildCards,
                                    ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                    ref replaceAll, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
                doc.Save();
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine(ex);
            }
            finally
            {
                // Always release the document and the Word process; previously Word was
                // only quit on failure, leaving a hidden instance running after success.
                try
                {
                    if (doc != null)
                    {
                        doc.Close(ref doNotSave, ref nullobj, ref nullobj);
                    }
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine(ex);
                }

                try
                {
                    wordapp.Quit(ref doNotSave, ref nullobj, ref nullobj);
                }
                catch (Exception ex)
                {
                    Console.Error.WriteLine(ex);
                }

                // The automation instance is gone; make later calls take the guard above.
                wordapp = null;
            }
        }
    }
}

However - I get an exception when I run it: "System.Runtime.InteropServices.COMException (0x800A15B8): The Find What text contains a Pattern Match expression which is not valid".

I thought this might have something to do with the characters I sent to Word, so I had already replaced \d with [0-9], but that made no difference. If I run it with [0-9]{16}, it replaces 1234567891012345 with the string I want to use.

Can anyone help me out here? Do I have to search with a number of different regex to manage a document, or can this be done with one simple regex like the one I already have?

Was it helpful?

Solution 3

Doing it the very simple way gave me something that worked:

// Run one replace-all pass per card-number layout, in this order:
// 16 consecutive digits, four hyphen-separated groups, four space-separated groups.
string[] cardNumberPatterns =
{
    "[0-9]{16}",
    "[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}",
    "[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}"
};

foreach (string pattern in cardNumberPatterns)
{
    wordapp.Selection.Find.Text = pattern;

    wordapp.Selection.Find.Execute(ref nullobj, ref nullobj, ref nullobj, ref matchWildCards,
                                   ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                   ref replaceAll, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
}

It is not a very nice setup, but hey - it works. It removed numbers like XXXXXXXXXXXXXXXX, XXXX XXXX XXXX XXXX and XXXX-XXXX-XXXX-XXXX. I will add other patterns if necessary.

OTHER TIPS

Try \\b instead of \b. Otherwise, the C# compiler treats \b as the escape sequence for the backspace character (ASCII code 8), so that character ends up in the string and you won't get a match.

Have you tried escaping?:

// Verbatim string literal: the @ prefix keeps every backslash as a literal character,
// so \b reaches Word unchanged instead of being processed as a C# escape sequence.
wordapp.Selection.Find.Text = @"\b(?:[0-9][ -]*?){13,16}\b"; 

If that doesn't work, you need to start with a simple Regular Expression (or in fact just a plain text word), verify it works and then build up the RegEx in stages.

My guess would be that Word has its own flavour of regex. Have you tried opening a document in Word and using that regex in the Find and Replace dialog?

Actually, according to http://www.regexinference.com/documentation/Microsoft-Word-Wildcards-as-Regular-Expressions.html, Word doesn't support non-capturing parentheses, so you're going to have to come up with a different solution.

The following is the best solution we have so far; it also catches numbers that are spread over more than a single line. It does not use MS Word, but it should get you what you want.

/// <summary>
/// Alternation of the card-number layouts that get masked: "-*" or "CC" followed by two
/// word characters and 15-16 digits, a bare run of 15-16 digits, four hyphen-separated
/// groups of four digits, and hyphen-separated groups of 4, 6 and 5 digits.
/// </summary>
private const string _creditCardPatternMatchingExpression = @"(?m:-[*]\w{2}\d{15,16})|(?m:CC\w{2}\d{15,16})|(?m:\d{15,16})|(\d{4}-\d{4}-\d{4}-\d{4})|(\d{4}-\d{6}-\d{5})";

/// <summary>Text that replaces every matched card number.</summary>
private const string _creditCardMask = "CCXXXXXXXXXXXXXX";

/// <summary>Matcher built once from the pattern and shared by every call.</summary>
private static readonly Regex _creditCardRegex = new Regex(_creditCardPatternMatchingExpression, RegexOptions.Compiled);

/// <summary>
/// Masks credit card numbers in the given text.
/// </summary>
/// <param name="contentThatMayHaveCreditCardData">Text to sanitise; may be null or empty.</param>
/// <returns>
/// The input unchanged when it is null or empty. Otherwise the text with every match
/// replaced by the mask. If a number only becomes visible after line breaks, spaces,
/// hyphens, &lt;br&gt; tags and &amp;nbsp; entities are removed, the returned text is that
/// lower-cased, stripped form with the matches masked.
/// </returns>
public static string CleanCreditCardData(this String contentThatMayHaveCreditCardData)
{
    // Regex.Replace throws on null input; there is nothing to mask in that case.
    if (string.IsNullOrEmpty(contentThatMayHaveCreditCardData))
    {
        return contentThatMayHaveCreditCardData;
    }

    // First pass: mask numbers that appear in one piece in the original text.
    string initiallyCleanedUpData = _creditCardRegex.Replace(contentThatMayHaveCreditCardData, _creditCardMask);

    // Second pass: remove the separators that can split a number across lines or
    // groups. Lower-casing first lets the tag/entity replacements match regardless of
    // case; the invariant culture keeps the result independent of machine settings.
    string completeSpaceEnterCleanedUpVersion = initiallyCleanedUpData
        .ToLowerInvariant()
        .Replace("\r\n", "")
        .Replace("\n", "")
        .Replace(" ", "")
        .Replace("-", "")
        .Replace("<br>", "")
        .Replace("<br />", "")
        .Replace("<br/>", "")
        .Replace("&nbsp;", "");

    if (_creditCardRegex.IsMatch(completeSpaceEnterCleanedUpVersion))
    {
        return _creditCardRegex.Replace(completeSpaceEnterCleanedUpVersion, _creditCardMask);
    }

    return initiallyCleanedUpData;
}
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top