Question

I'm looking for an API that will visually show HTML differences in structure, characters/words, and style. This tool must also support double-byte characters and be flexible enough for me to add it to my existing website so that I can easily show the results of the comparison. I'm currently using the Component Software COM implementation, which doesn't support double-byte characters and hasn't been updated in about six years.

Was it helpful?

Solution 2

This is what I used:

[http://code.google.com/p/google-diff-match-patch/][1]

I had to write my own methods to do the compare, but after a little work it looks fine. This implementation compares text exactly as passed in, so it works fine if you are just comparing two text strings. My diff_prettyHtml call was changed to:

/// <summary>
/// Renders a list of diffs as an HTML report of plain text.
/// </summary>
/// <remarks>
/// Every diff's text is HTML-escaped (ampersand, less-than, greater-than) and its
/// newlines become "&lt;br&gt;". Insertions are wrapped in an ins element, deletions
/// in a del element (both with class 'diff'), and unchanged text in a span.
/// </remarks>
/// <param name="diffs">The diffs to render, in order.</param>
/// <returns>The concatenated HTML for all diffs.</returns>
public string diff_prettyHtml(List<Diff> diffs)
    {
        StringBuilder markup = new StringBuilder();
        foreach (Diff piece in diffs)
        {
            // Escape first so the compared text can never be read as markup.
            string escaped = piece.text
                .Replace("&", "&amp;")
                .Replace("<", "&lt;")
                .Replace(">", "&gt;")
                .Replace("\n", "<br>");

            if (piece.operation == Operation.INSERT)
            {
                markup.Append("<ins class='diff'>");
                markup.Append(escaped);
                markup.Append("</ins>");
            }
            else if (piece.operation == Operation.DELETE)
            {
                markup.Append("<del class='diff'>");
                markup.Append(escaped);
                markup.Append("</del>");
            }
            else if (piece.operation == Operation.EQUAL)
            {
                markup.Append("<span>");
                markup.Append(escaped);
                markup.Append("</span>");
            }
        }

        return markup.ToString();
    }

Now if you want to do a compare preview of 2 html strings this is a little different. This is what I did:

// Diff the two HTML strings, then render the differences as preview markup.
var differ = new DiffMatchPatch.diff_match_patch();
List<DiffMatchPatch.Diff> changes = differ.diff_main(oldHtml, newHtml);
string preview = differ.diff_previewHtml(changes);
return preview;


/// <summary>
/// Renders a list of diffs as an HTML preview without escaping the diff text.
/// </summary>
/// <remarks>
/// Inserted text is wrapped in an ins element and deleted text in a del element
/// (both with class 'diff'); unchanged text is emitted exactly as it is. Because
/// nothing is escaped, any markup inside the diff text reaches the output verbatim.
/// </remarks>
/// <param name="diffs">The diffs to render, in order.</param>
/// <returns>The concatenated preview markup.</returns>
public string diff_previewHtml(List<Diff> diffs) {
      StringBuilder preview = new StringBuilder();
      foreach (Diff change in diffs) {
        string fragment = change.text;
        if (change.operation == Operation.INSERT) {
          preview.Append("<ins class='diff'>");
          preview.Append(fragment);
          preview.Append("</ins>");
        } else if (change.operation == Operation.DELETE) {
          preview.Append("<del class='diff'>");
          preview.Append(fragment);
          preview.Append("</del>");
        } else if (change.operation == Operation.EQUAL) {
          preview.Append(fragment);
        }
      }
      return preview.ToString();
    }

The unicode class is as follows:

using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;

namespace HtmlCompare
{
    /// <summary>
    /// Replaces HTML tags with single placeholder characters so that markup can be
    /// compared as plain text, and later restores the tags and tidies the ins/del
    /// markers that a diff inserted around them.
    /// </summary>
    /// <remarks>
    /// The tag-to-placeholder table lives on the instance, so the same instance must
    /// be used for <see cref="html2plain"/> and the later <see cref="plain2html"/>.
    /// </remarks>
    class Unicoder
    {
        // Tag text -> placeholder (a one-character string).
        private readonly Dictionary<string, string> _htmlHash = new Dictionary<string, string>();

        // Placeholder character -> tag text; the inverse of _htmlHash.
        private readonly Dictionary<char, string> _tagsByPlaceholder = new Dictionary<char, string>();

        // Kept byte-for-byte. The first alternative matches "<" up to the next ">" and
        // its trailing lazy ".*?" matches nothing, so every match is one "<...>" tag.
        private const string _htmlPattern = @"<(S*?)[^>]*>.*?|<.*?\/>";

        // Tag types that are always kept inside the ins/del wrapper.
        private readonly List<string> _blockElements = "img,br".Split(',').ToList<string>();

        // Next placeholder code point; 44032 is U+AC00.
        // NOTE(review): U+AC00 starts the Hangul syllables block, so input that already
        // contains those characters can be mistaken for placeholders - confirm whether a
        // Private Use Area base (e.g. U+E000) would suit the callers better.
        private int _currentHash = 44032;

        /// <summary>
        /// Returns the placeholder assigned to <paramref name="tag"/>, assigning the
        /// next free one the first time a tag is seen.
        /// </summary>
        /// <param name="tag">The exact tag text, e.g. <c>&lt;b&gt;</c>.</param>
        /// <returns>A one-character string standing in for the tag.</returns>
        /// <exception cref="System.ArgumentOutOfRangeException">
        /// Thrown by <c>char.ConvertFromUtf32</c> once the counter reaches the surrogate range.
        /// </exception>
        public string pushHash(string tag)
        {
            string placeholder;
            if (!_htmlHash.TryGetValue(tag, out placeholder))
            {
                //_htmlHash[tag] = char.Parse("\\u" + Convert.ToString(_currentHash,16));
                placeholder = char.ConvertFromUtf32(_currentHash);
                _htmlHash[tag] = placeholder;
                _tagsByPlaceholder[placeholder[0]] = tag;
                _currentHash++;
            }
            return placeholder;
        }

        // Regex callback: swaps a matched tag for its placeholder.
        private string tagMatch(Match tag)
        {
            return pushHash(tag.Value);
        }

        /// <summary>
        /// Replaces every HTML tag in <paramref name="html"/> with its placeholder character.
        /// </summary>
        /// <param name="html">The markup to flatten.</param>
        /// <returns>The text with one placeholder character per tag.</returns>
        public string html2plain(string html)
        {
            return Regex.Replace(html, _htmlPattern, tagMatch, RegexOptions.IgnoreCase | RegexOptions.Multiline);
        }

        /// <summary>
        /// Restores the tags behind each placeholder character and then runs
        /// <see cref="CleanUpMisplacedDiffTags"/> on the result.
        /// </summary>
        /// <param name="plain">Text containing placeholders issued by this instance.</param>
        /// <returns>The markup with tags restored and ins/del markers tidied.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="plain"/> is null.</exception>
        public string plain2html(string plain)
        {
            if (plain == null)
            {
                throw new System.ArgumentNullException("plain");
            }

            // One literal pass: a tag is never treated as a regex replacement pattern
            // (so "$0", "$&" or "$$" inside a tag survive) and restored tag text is
            // never scanned again for placeholders.
            StringBuilder html = new StringBuilder(plain.Length);
            foreach (char current in plain)
            {
                string tag;
                if (_tagsByPlaceholder.TryGetValue(current, out tag))
                {
                    html.Append(tag);
                }
                else
                {
                    html.Append(current);
                }
            }
            return CleanUpMisplacedDiffTags(html.ToString());
        }

        /// <summary>
        /// Rewrites every ins/del element so that unbalanced tags inside it are moved
        /// outside the element, and drops ins/del elements that wrap nothing.
        /// </summary>
        /// <param name="html">Markup containing ins/del diff markers.</param>
        /// <returns>The tidied markup.</returns>
        public string CleanUpMisplacedDiffTags(string html)
        {
            // Singleline lets "." cross line breaks; without it an ins/del element whose
            // contents contain a newline is never matched, and the "\n" handling in
            // DiffTagMatch and GoThroughDiffParts could never run.
            return Regex.Replace(html,
                @"(\<((ins|del).*?)\>)(.*?)(\<\/((ins|del).*?)\>)",
                DiffTagMatch,
                RegexOptions.IgnoreCase | RegexOptions.Multiline | RegexOptions.Singleline);
        }

        // Regex callback for one ins/del element: group 1 is the opening marker,
        // group 4 the contents and group 5 the closing marker.
        private string DiffTagMatch(Match tag)
        {
            string tagStart = tag.Groups[1].Value;
            string tagEnd = tag.Groups[5].Value;
            string contents = tag.Groups[4].Value;

            if (string.IsNullOrEmpty(contents))
            {
                return string.Empty; // we don't want the ins/del tag around nothing
            }
            if (string.IsNullOrWhiteSpace(contents)) // spacing should be kept
            {
                return string.Concat(tagStart, "&nbsp;", contents.Replace("\n", "<br />"), tagEnd);
            }
            return ProcessDiffTag(tagStart, tagEnd, contents);
        }

        // Splits the contents of one ins/del element into text and tag parts, in
        // document order, and reassembles them with unbalanced tags moved outside.
        private string ProcessDiffTag(string tagStart, string tagEnd, string contents)
        {
            MatchCollection matches = Regex.Matches(contents,
                    _htmlPattern,
                    RegexOptions.IgnoreCase | RegexOptions.Multiline);

            if (matches.Count == 0)
            {
                return string.Concat(tagStart, contents, tagEnd);
            }

            List<TagDefinition> parts = new List<TagDefinition>();
            int consumed = 0; // index just past the last tag handled so far

            foreach (Match currentMatch in matches)
            {
                // Text sitting between the previous tag and this one.
                if (currentMatch.Index > consumed)
                {
                    AddTagDefinition(parts, CreateTextPart(contents.Substring(consumed, currentMatch.Index - consumed)));
                }

                TagDefinition tagPart = new TagDefinition();
                tagPart.Tag = true;
                tagPart.OpeningTag = !IsClosingTag(currentMatch.Value);
                tagPart.TagType = GetTagType(currentMatch.Value);
                tagPart.Text = currentMatch.Value;
                AddTagDefinition(parts, tagPart);

                consumed = currentMatch.Index + currentMatch.Length;
            }

            // Text after the final tag. It is added after the loop so it is never lost
            // and always follows that tag.
            if (consumed < contents.Length)
            {
                AddTagDefinition(parts, CreateTextPart(contents.Substring(consumed)));
            }

            return GoThroughDiffParts(parts, tagStart, tagEnd);
        }

        // Builds a part that carries plain text rather than a tag.
        private static TagDefinition CreateTextPart(string text)
        {
            TagDefinition part = new TagDefinition();
            part.Tag = false;
            part.Text = text;
            return part;
        }

        // Extracts the lower-cased element name from a tag: "<B class='x'>" -> "b",
        // "</p>" -> "p", "<br/>" -> "br".
        private string GetTagType(string tag)
        {
            int start = tag.StartsWith("</", System.StringComparison.Ordinal) ? 2 : 1; // skip "</" or "<"
            if (start >= tag.Length)
            {
                return string.Empty;
            }

            int end = start;
            while (end < tag.Length
                && !char.IsWhiteSpace(tag[end])
                && tag[end] != '/'
                && tag[end] != '>')
            {
                end++;
            }

            return tag.Substring(start, end - start).ToLowerInvariant();
        }

        // A tag is a closing tag when it begins with "</" (the same test GetTagType uses),
        // so "</" appearing inside an attribute value does not count.
        private bool IsClosingTag(string tag)
        {
            return tag.StartsWith("</", System.StringComparison.Ordinal);
        }

        // Appends a part to the list. A closing tag is paired with the nearest earlier
        // opening tag of the same type that has no partner yet; text parts are never paired.
        private void AddTagDefinition(List<TagDefinition> list, TagDefinition tag)
        {
            if (tag.Tag && !tag.OpeningTag)
            {
                for (int index = list.Count - 1; index >= 0; index--)
                {
                    TagDefinition candidate = list[index];
                    if (candidate.Tag
                        && candidate.OpeningTag
                        && candidate.MatchingIndex == -1
                        && candidate.TagType == tag.TagType)
                    {
                        tag.MatchingIndex = index;
                        candidate.MatchingIndex = list.Count; // index 'tag' receives below
                        break;
                    }
                }
            }

            list.Add(tag);
        }

        // Reassembles the parts as [unpaired opening tags]<ins>[everything else]</ins>[unpaired closing tags].
        // Text, paired tags and block elements (img, br) stay inside the wrapper.
        private string GoThroughDiffParts(List<TagDefinition> parts, string startTag, string endTag)
        {
            StringBuilder before = new StringBuilder();
            StringBuilder middle = new StringBuilder();
            StringBuilder after = new StringBuilder();

            foreach (TagDefinition definition in parts)
            {
                bool unpairedTag = definition.Tag
                    && definition.MatchingIndex == -1
                    && !_blockElements.Contains(definition.TagType);

                if (!unpairedTag)
                {
                    middle.Append(definition.Text);
                }
                else if (definition.OpeningTag)
                {
                    before.Append(definition.Text);
                }
                else
                {
                    after.Append(definition.Text);
                }
            }

            string inner = middle.ToString();
            StringBuilder result = new StringBuilder();
            result.Append(before.ToString());

            if (inner.Length > 0) // we don't want the ins/del tag around nothing
            {
                if (string.IsNullOrWhiteSpace(inner)) // spacing should be kept
                {
                    inner = "&nbsp;" + inner.Replace("\n", "<br />");
                }
                result.Append(startTag).Append(inner).Append(endTag);
            }

            result.Append(after.ToString());
            return result.ToString();
        }

        // One piece of an ins/del element's contents: either a tag or a run of text.
        private class TagDefinition
        {
            public bool Tag { get; set; }          // true for a tag, false for text
            public string TagType { get; set; }    // lower-cased element name; empty for text
            public string Text { get; set; }       // the exact source text of this part
            public int MatchingIndex { get; set; } // list index of the paired tag, or -1
            public bool OpeningTag { get; set; }   // true when the tag is not a closing tag

            public TagDefinition()
            {
                this.Tag = false;
                this.Text = string.Empty;
                this.TagType = string.Empty;
                this.MatchingIndex = -1;
                this.OpeningTag = false;
            }
        }
    }
}

OTHER TIPS

The only two tools I found that can do something like that are http://changedetection.com and http://imnosy.com. Both let you specify a URL and watch it for changes.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top