Answering my own question, for the sake of those who might look for something like this in the future.
It appears as though my proposed solution is indeed the way to go. I have gone ahead and implemented it, and am posting it here. It comprises two classes: a token filter and a token filter factory. Usage should be obvious to anyone versed in Solr.
A link to a quick write-up I did for this: http://blog.nitzanshaked.net/solr-domain-name-tokenizer/
The files:
DomainNameTokenFilterFactory.java
package com.clarityray.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import com.clarityray.solr.analysis.DomainNameTokenFilter;
/**
 * Factory that builds {@link DomainNameTokenFilter} instances from the
 * attribute map of an analyzer configuration.
 *
 * <p>Recognised arguments and their defaults:
 * <ul>
 *   <li>{@code withOriginal} (default {@code true})</li>
 *   <li>{@code minLen} (default {@code 2})</li>
 *   <li>{@code maxLen} (default {@code -1})</li>
 * </ul>
 * The values are handed to the filter unchanged; see the filter for their
 * exact meaning.
 */
public class DomainNameTokenFilterFactory extends TokenFilterFactory {

    private final int minLen;
    private final int maxLen;
    private final boolean withOriginal;

    /**
     * Reads the filter settings from {@code args}.
     *
     * @param args configuration arguments for this factory
     * @throws IllegalArgumentException if {@code args} is not empty after the
     *         known settings have been read
     */
    public DomainNameTokenFilterFactory(Map<String,String> args) {
        super(args);

        this.minLen = getInt(args, "minLen", 2);
        this.maxLen = getInt(args, "maxLen", -1);
        this.withOriginal = getBoolean(args, "withOriginal", true);

        // NOTE(review): this check relies on the base-class getters removing
        // the keys they read, so that whatever is left is unrecognised --
        // confirm against the Lucene version in use.
        final boolean allConsumed = args.isEmpty();
        if (!allConsumed) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
    }

    /**
     * Wraps the given stream in a {@link DomainNameTokenFilter} configured
     * with this factory's settings.
     *
     * @param ts the upstream token stream
     * @return the wrapping filter
     */
    @Override
    public TokenStream create(TokenStream ts) {
        final DomainNameTokenFilter filter =
            new DomainNameTokenFilter(ts, this.minLen, this.maxLen, this.withOriginal);
        return filter;
    }
}
DomainNameTokenFilter.java
package com.clarityray.solr.analysis;
import java.util.Queue;
import java.util.LinkedList;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
 * Token filter that expands dotted tokens (such as domain names) into their
 * contiguous, dot-separated sub-sequences.
 *
 * <p>A token without a {@code '.'} is passed through untouched. A token with
 * one is split on dots, and every run of {@code minLen..maxLen} consecutive
 * parts is emitted as its own token, shortest runs first. For
 * {@code "a.b.c"} with {@code minLen=2} and no upper bound the output is
 * {@code "a.b"}, {@code "b.c"}, {@code "a.b.c"}.
 *
 * <p>The first token emitted for an input token keeps the position increment
 * set by the upstream stream. Every further token generated from the same
 * input is stacked on that position with an increment of 0.
 */
public class DomainNameTokenFilter extends TokenFilter {

    private final CharTermAttribute charTermAttr;
    private final PositionIncrementAttribute posIncAttr;

    /** Tokens generated from the current input token that are still to be emitted. */
    private final Queue<String> output;

    private final int minLen;
    private final int maxLen;
    private final boolean withOriginal;

    /**
     * @param ts           the upstream token stream
     * @param minLen       minimum number of dot-separated parts in a generated
     *                     token; values below 1 are treated as 1
     * @param maxLen       maximum number of parts in a generated token; a value
     *                     of 0 or less means "no upper bound"
     * @param withOriginal whether the unmodified input token must also be
     *                     emitted when it is not already one of the generated
     *                     tokens
     */
    public DomainNameTokenFilter(TokenStream ts, int minLen, int maxLen, boolean withOriginal) {
        super(ts);
        this.charTermAttr = addAttribute(CharTermAttribute.class);
        this.posIncAttr = addAttribute(PositionIncrementAttribute.class);
        this.output = new LinkedList<String>();
        this.minLen = minLen;
        this.maxLen = maxLen;
        this.withOriginal = withOriginal;
    }

    /**
     * Joins {@code arr[start..end]} (both inclusive) with {@code glue}.
     * Returns the empty string when {@code end < start}.
     */
    private static String join(String glue, String[] arr, int start, int end) {
        if (end < start) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        sb.append(arr[start]);
        for (int i = start + 1; i <= end; ++i) {
            sb.append(glue);
            sb.append(arr[i]);
        }
        return sb.toString();
    }

    /**
     * Discards any tokens still queued from a previous stream so that a
     * reused filter instance never leaks them into the next one.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        output.clear();
    }

    @Override
    public boolean incrementToken() throws IOException {
        // first -- emit any tokens that are already queued up
        if (!output.isEmpty()) {
            charTermAttr.setEmpty();
            charTermAttr.append(output.poll());
            // stacked on the same position as the first token of this group
            posIncAttr.setPositionIncrement(0);
            return true;
        }

        // nothing queued: pull the next token from the input stream
        if (!input.incrementToken()) {
            return false;
        }

        // get the text for the current token
        String s = charTermAttr.toString();

        // if the input does not look like a domain name, we leave it as is
        if (s.indexOf('.') == -1) {
            return true;
        }

        // create all sub-sequences, shortest first
        String[] subParts = s.split("[.]");
        int longest = (this.maxLen > 0)
            ? Math.min(this.maxLen, subParts.length)
            : subParts.length;
        // a length below 1 would only produce empty tokens
        int shortest = Math.max(1, this.minLen);
        for (int len = shortest; len <= longest; ++len) {
            for (int start = 0; start + len <= subParts.length; ++start) {
                output.add(join(".", subParts, start, start + len - 1));
            }
        }

        // preserve the original if so asked, unless it was already generated
        // as the full-length sub-sequence
        if (withOriginal && !output.contains(s)) {
            output.add(s);
        }

        // Nothing qualified (e.g. the token is "." or has fewer parts than
        // minLen): pass the token through untouched instead of polling an
        // empty queue, which would emit the literal term "null".
        if (output.isEmpty()) {
            return true;
        }

        // Emit the first generated token. The position increment is left
        // exactly as the upstream stream set it, so position gaps (or stacked
        // tokens) produced by earlier filters are preserved.
        charTermAttr.setEmpty();
        charTermAttr.append(output.poll());
        return true;
    }
}
Hope this helps someone.