Question

I'm trying to add <abbr> tags to acronyms found on a website. I'm running this as a Chrome Extension but I'm fairly certain the problem is within the javascript itself and doesn't have much to do with the Chrome stuff (I'll include the source just in case anyways)

I should mention that I'm using a lot of code from this link which was suggested on another answer. Unfortunately I'm getting unexpected results as my end goal differs a bit from what is discussed there.

First I have an array of acronyms (shortened here, I included the whole thing on JSFiddle)

"ITPHR": "Inside-the-park home run:hits on which the batter successfully touched all four bases, without the contribution of a fielding error or the ball going outside the ball park.",
"pNERD": "Pitcher&#39;s NERD: expected aesthetic pleasure of watching an individual pitcher",
"RISP": "Runner In Scoring Position: a breakdown of the batter&#39;s batting average with runners in scoring position, which include runners at second and third bases.",
"SBA/ATT": "Stolen base attempts: total number of times the player has attempted to steal a base (SB+CS)",

then the matchText() function from the previously linked article

/**
 * Walks the direct children of `node`, recursing into element children, and
 * runs `regex` against the data of every text node it meets.
 *
 * For each match the text node is split at the match offset, the matched text
 * is removed from the start of the tail node, and `callback` is invoked with
 * the head text node followed by the arguments String.prototype.replace passes
 * to its replacement function (full match, capture groups, offset, string).
 *
 * @param {Node} node - Node whose children are scanned.
 * @param {RegExp} regex - Pattern to look for in text nodes.
 * @param {Function} callback - Called once per match (see above).
 * @param {string[]} [excludeElements] - Lower-case tag names that are not
 *     descended into; defaults to script, style, iframe and canvas.
 * @returns {Node} The `node` that was passed in.
 */
var matchText = function (node, regex, callback, excludeElements) {
    excludeElements || (excludeElements = ['script', 'style', 'iframe', 'canvas']);
    // NOTE(review): if `node` has no children, `child` is null here and the
    // first `child.nodeType` access below throws.
    var child = node.firstChild;
    do {
        switch (child.nodeType) {
            case 1:
                // Element node: skip excluded tags, otherwise recurse.
                if (excludeElements.indexOf(child.tagName.toLowerCase()) > -1) {
                    // `continue` targets the do/while, so control jumps to the
                    // loop condition and moves on to the next sibling.
                    continue;
                }
                matchText(child, regex, callback, excludeElements);
                break;
            case 3:
                // Text node: replace() is used only for its per-match callback;
                // the string it returns is discarded.
                child.data.replace(regex, function (all) {
                    // The second-to-last replace() argument is the match offset
                    // within the string that replace() was called on.
                    var args = [].slice.call(arguments),
                        offset = args[args.length - 2],
                        newTextNode = child.splitText(offset);
                    // Drop the matched text from the start of the tail node.
                    newTextNode.data = newTextNode.data.substr(all.length);
                    callback.apply(window, [child].concat(args));
                    // NOTE(review): with a global regex, later offsets still
                    // refer to the original string, while `child` now points at
                    // the shortened tail node, so a second match in the same
                    // text node is split at the wrong position.
                    child = newTextNode;
                });
             break;
        }
    } while (child = child.nextSibling);
    return node;
}

and finally my code that cycles through the array of acronyms and searches all the terms one by one (this might not be the optimal way of doing things, please let me know if you have a better idea)

// For every key of `acronyms`, search the page for that acronym and insert an
// <abbr> element carrying its definition as the title.
var abbrList = Object.keys(acronyms);
for (var i = 0; i < abbrList.length; i++) {
    var abbrev = abbrList[i];
    // String patterns make replace() substitute only the first occurrence, so
    // only the first '%', '+' and '/' of the acronym get escaped.
    // NOTE(review): no `var` declaration for abbrevSearch is visible in this
    // snippet, so this assignment presumably creates a global.
    abbrevSearch = abbrev.replace('%', '\\%').replace('+', '\\+').replace('/', '\\/');
    console.log("Looking for " + abbrev);
    // NOTE(review): getElementsByTagName("*") returns a collection rather than
    // a single node, while matchText reads `node.firstChild` - confirm that
    // this is the intended first argument.
    matchText(document.body.getElementsByTagName("*"), new RegExp("\\b" + abbrevSearch + "\\b", "g"), function (node, match, offset) {
        var span = document.createElement("abbr");
        // span.className = "sabrabbr"; // If someone decides to style them
        // Only the first '&#39;' entity in the definition is turned into an
        // apostrophe (string pattern, see above).
        span.setAttribute("title", acronyms[abbrev].replace('&#39;', '\''));
        span.textContent = match;
        // Place the <abbr> right after the text node that precedes the match.
        node.parentNode.insertBefore(span, node.nextSibling);
    });
}

As a reference here are the Chrome-specific files:

manifest.json

{
  "name": "SABR Acronyms",
  "version": "0.1",
  "manifest_version": 2,
  "description": "Adds tooltips with a definition to commonly used acronyms in baseball.",

  "icons": {
    "16" : "images/16.png",
    "48" : "images/48.png",
    "128" : "images/128.png"
  },

  "permissions": [
    "activeTab"
  ],

  "browser_action": {
    "default_icon": "images/16.png",
    "default_title": "SABR Acronyms"
  },

  "content_scripts": [
  {
    "matches": ["http://*/*"],
    "js": ["content.js","jquery.min.js"],
    "css": ["sabr.css"]
  }
  ],

  "web_accessible_resources": ["content.js", "sabr.js", "sabr.css","jquery.min.js","jquery-2.0.3.min.map"]
}

content.js

// Load sabr.js into the page: create a <script> element whose src is the URL
// returned by chrome.extension.getURL('sabr.js') and append it to <head>
// (or to the document element when there is no <head>).
var s = document.createElement('script');
s.src = chrome.extension.getURL('sabr.js');
(document.head||document.documentElement).appendChild(s);
// Once the script has loaded, take the <script> element out of the document
// again.
s.onload = function() {
    s.parentNode.removeChild(s);
};

I uploaded everything on JSFiddle since it's the easiest way to see the code in action. I copied the <body>...</body> of a page containing an article with a few of the acronyms being used. A lot of them should be picked up but aren't. Exact matches are also picked up but not all the time. There also seems to be a problem with single/2-letter acronyms (such as IP in the table). The regular expression is quite simple, I thought \b would do the trick.

Thanks!

Était-ce utile?

La solution

There were a couple of issues with your code (or maybe a little more).

  1. Chrome detects word-boundaries in its own way, so \b does not work as expected (e.g. a . is considered part of a word).

  2. You were using the global modifier, which returned the indices of all the matches it found. But when handling each match, you modified the content of child.data, so the indices that referred to the original child.data were rendered useless. This problem would only come up whenever there was more than one match in a single TextNode. (Note that once this error caused an exception to be raised, execution was aborted, so no further TextNodes were processed.)

  3. The acronyms were searched for (and replaced) in the order of appearance in the acronym list. This could lead to cases where only a substring of an acronym would be recognised as another acronym and incorrectly replaced. E.g. if ERA was searched for before ERA+, all ERA+ occurrences in the DOM would be replaced by <abbr ...>ERA</abbr>+ and would not be recognised as ERA+ occurrences later on.

  4. Similarly to the above problem, a substring of an already processed acronym could subsequently be recognised as another acronym and partially replaced. E.g. if ERA+ was searched for before ERA, the following would happen:
    ERA+
    -> <abbr (title_for_ERA+)>ERA+</abbr>
    -> <abbr (title_for_ERA+)><abbr (title_for_ERA)>ERA</abbr>+</abbr>

  5. Your one-letter "acronyms" would also match characters they shouldn't (e.g. E in E-mail, G in Paul G. etc).


(Among many possible ways) I chose to address the above problems like this:

For (1):
Instead of using \b...\b I used (^|[^A-Za-z0-9_])(...)([^A-Za-z0-9_]|$).
This will look for one character that is not a word character before and after our acronym under search (or settle for string start (^) or end ($) respectively). Since the matched characters (if any) before and after the actual acronym match need to be put back in the regular TextNodes, 3 backreferences are created and handled appropriately in the replace callback (see code below).

For (2):
I removed the global modifier and matched one occurrence at a time.
This also required a slight modification, so that the new TextNode, created with the part of child.data after the current match, is subsequently searched as well.

For (3):
Before starting the search and replace operations I ordered the array of acronyms by decreasing length, so longer acronyms were searched for (and replaced) before shorter acronyms (which could possibly be a substring of the former). E.g. ERA+ is always replaced before ERA, IP/GS is always replaced before IP etc.
(Note that this solves problem (3), but we still have to deal with (4).)

For (4):
Every time I create a new <abbr> node I add a class to it. Later on, when I encounter an element with that special class, I skip it (as I don't want any replacements to happen in a substring of an already matched acronym).

For (5):
Well, I am good, but I am not Jon Skeet :)
There is not much you can do about it, unless you want to bring on some AI, but I suppose it is not much of a problem either (i.e. you can live with it).

(As already mentioned, the above solutions are neither the only ones available nor, probably, the optimal ones.)


That said, here is my version of the code (with a few more minor (for the most part stylistic) changes):

/**
 * Walks the subtree under `node` and reports acronym matches found in its
 * text nodes to `callback`.
 *
 * `regex` is expected to be non-global and to have three capture groups:
 * (1) the boundary character before the acronym (or the empty string at the
 * start of the text), (2) the acronym itself, (3) the boundary character after
 * it. Only group 2 is cut out of the text; the boundary characters stay in
 * the surrounding text nodes.
 *
 * For a match, the text node is split right before the acronym, the acronym is
 * removed from the start of the tail node, and `callback(headTextNode,
 * acronym)` is called. The callback must insert exactly one node directly
 * after `headTextNode`: the walk steps over that inserted node and then
 * continues with the tail, so the rest of the text is searched as well.
 *
 * Elements whose className is exactly 'sabrabbr' (already wrapped acronyms)
 * and elements listed in `excludeElements` are not descended into.
 *
 * @param {Node} node - Root of the subtree to scan.
 * @param {RegExp} regex - Three-group, non-global pattern (see above).
 * @param {Function} callback - Receives (textNode, matchedAcronym).
 * @param {string[]} [excludeElements] - Lower-case tag names to skip; defaults
 *     to script, style, iframe and canvas.
 * @returns {Node} The `node` that was passed in.
 */
var matchText = function (node, regex, callback, excludeElements) {
    excludeElements
            || (excludeElements = ['script', 'style', 'iframe', 'canvas']);
    var child = node.firstChild;
    if (!child) {
        // Nothing to scan; return the node just like the non-empty path does.
        return node;
    }

    do {
        switch (child.nodeType) {
            case 1:
                if ((child.className === 'sabrabbr') ||
                        (excludeElements.indexOf(
                                child.tagName.toLowerCase()) > -1)) {
                    // `continue` jumps to the loop condition, i.e. on to the
                    // next sibling, without descending into this element.
                    continue;
                }
                matchText(child, regex, callback, excludeElements);
                break;
            case 3:
                // replace() is used only to obtain the match details; the
                // string it returns is discarded.
                child.data.replace(regex, function (fullMatch, g1, g2, g3, idx) {
                    // Skip the leading boundary character so that it stays in
                    // the head text node.
                    var offset = idx + g1.length;
                    // Declared locally: an undeclared assignment would leak a
                    // global and throws in strict-mode / module code.
                    var newTextNode = child.splitText(offset);
                    newTextNode.data = newTextNode.data.slice(g2.length);
                    callback.apply(window, [child, g2]);
                    // Step onto the node the callback inserted; the loop
                    // condition then advances to the tail text node.
                    child = child.nextSibling;
                });
                break;
        }
    } while (child = child.nextSibling);
    return node;
};

// Process longer acronyms first, so that e.g. "ERA+" is wrapped before the
// shorter "ERA" gets a chance to match inside it.
var abbrList = Object.keys(acronyms).sort(function(a, b) {
    return b.length - a.length;
});
for (var i = 0; i < abbrList.length; i++) {
    var abbrev = abbrList[i];
    // Escape every regex metacharacter, and every occurrence of it, so the
    // acronym is always matched literally. A string pattern passed to
    // replace() would only touch the first occurrence.
    var abbrevSearch = abbrev.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    console.log("Looking for " + abbrev);
    // Groups: (1) non-word character or start of text, (2) the acronym,
    // (3) non-word character or end of text. Deliberately non-global:
    // matchText handles one match at a time and re-scans the remainder.
    var regex = new RegExp("(^|[^A-Za-z0-9_])(" + abbrevSearch
                           + ")([^A-Za-z0-9_]|$)", "");
    matchText(document.body, regex, function (node, match) {
        var span = document.createElement("abbr");
        // The class marks the element as processed, so matchText will not
        // search inside it for shorter acronyms.
        span.className = "sabrabbr";
        // Decode every apostrophe entity in the definition, not just the first.
        span.title = acronyms[abbrev].replace(/&#39;/g, '\'');
        span.textContent = match;
        // Insert the <abbr> directly after the text node preceding the match.
        node.parentNode.insertBefore(span, node.nextSibling);
    });
}

For the noble few that made it this far, there is, also, this short demo.

Licencié sous: CC-BY-SA avec attribution
Non affilié à StackOverflow
scroll top