// Demo: strip punctuation from a sentence, then split it into words.
// Works with non-ASCII letters (e.g. "fußball") because we never match
// on letter classes — we only remove known punctuation and split on
// whitespace.
let data = "your fußball, hasn't! flees.";
// Remove unwanted punctuation, in this case full-stops,
// commas, and exclamation marks.
data = data.replace(/[.,!]/g, '');
// Split on runs of whitespace and KEEP the result — the original code
// discarded the array returned by split().
const words = data.split(/\s+/); // ["your", "fußball", "hasn't", "flees"]
How to stop the node.js `natural` tokenizer from splitting words that contain special characters
-
18-07-2023 - |
Frage
I'm using node natural tokenizer feature, which splits a sentence into words. Usually it's supposed to work as
// Question snippet: tokenize an English sentence with the `natural`
// library's WordTokenizer.
var natural = require('natural'),
tokenizer = new natural.WordTokenizer();
console.log(tokenizer.tokenize("your dog has't flees."));
// Returns [ 'your', 'dog', 'has', "n't", 'flees' ]
It works fine, however, when used with German or French words, it splits up the words into two, such as
// Question snippet: the same tokenizer applied to a German word.
// As the output below shows, the non-ASCII letter "ß" is dropped,
// splitting the word in two.
var natural = require('natural'),
tokenizer = new natural.WordTokenizer();
console.log(tokenizer.tokenize("fußball"));
// Returns ['fu', 'ball']
Which is not correct.
Anyone knows how to avoid that?
Or maybe you know a simpler way to split sentences into words in JavaScript / Node.js?
Thanks!
Lösung
Andere Tipps
The natural docs state
[...] At the moment, most of the algorithms are English-specific
So, I wouldn't expect it to work out-of-the-box without some work on your part.
However, if all you want to do is split a string along whitespace boundaries, use something like this:
// Split a sentence on whitespace boundaries; punctuation stays attached
// to the adjacent word (note "flees." keeps its full stop).
var s = "your dog hasn't flees.";
var parts = s.split(/\s+/);
console.log(parts); // ["your", "dog", "hasn't", "flees."]
Lizenziert unter: CC-BY-SA mit Zuschreibung
Nicht verbunden mit StackOverflow