Adding spell check to Examine search in Umbraco 8
There is an excellent blog post by Lars-Erik Aabech on how to build a spell checker for search in Umbraco 7, and I have been looking for a way to update it for v8.
Background
The fundamental concepts of Lars' version are exactly what we want to do with this version: index all the text content of pages, and then, when a search returns no results, look up the searched words in that index.
Code
Firstly we need to create a new index! We're giving it the name `SpellCheckIndex` and defining one single field, `word`, which will hold all the text content from each of our pages.
/// <summary>
/// Declares the Lucene index used by the spell checker.
/// </summary>
public class SpellCheckIndexCreator : LuceneIndexCreator
{
    // Used both as the index name and as the name passed to the directory factory.
    private const string IndexName = "SpellCheckIndex";

    /// <summary>
    /// Builds the spell-check index with a single full-text field named "word".
    /// </summary>
    /// <returns>A one-element array containing the configured index.</returns>
    public override IEnumerable<IIndex> Create()
    {
        var luceneDirectory = CreateFileSystemLuceneDirectory(IndexName);

        // Only one field is defined; it receives the combined text of each page.
        var wordField = new FieldDefinition("word", FieldDefinitionTypes.FullText);
        var fieldDefinitions = new FieldDefinitionCollection(wordField);

        var analyzer = new StandardAnalyzer(Version.LUCENE_30);

        var spellCheckIndex = new LuceneIndex(IndexName, luceneDirectory, fieldDefinitions, analyzer);
        return new[] { spellCheckIndex };
    }
}
/// <summary>
/// Fills the "SpellCheckIndex" index with value sets built from content items.
/// </summary>
public class SpellCheckIndexPopulator : IndexPopulator
{
    private readonly SpellCheckValueSetBuilder _spellCheckValueSetBuilder;
    private readonly IContentService _contentService;

    /// <summary>
    /// Stores the collaborators and registers this populator for "SpellCheckIndex".
    /// </summary>
    /// <param name="spellCheckValueSetBuilder">Converts content items into value sets.</param>
    /// <param name="contentService">Source of the content items to index.</param>
    public SpellCheckIndexPopulator(SpellCheckValueSetBuilder spellCheckValueSetBuilder, IContentService contentService)
    {
        _spellCheckValueSetBuilder = spellCheckValueSetBuilder;
        _contentService = contentService;
        RegisterIndex("SpellCheckIndex");
    }

    /// <summary>
    /// Pages through all descendants of the root node and pushes their value sets
    /// into every supplied index.
    /// </summary>
    /// <param name="indexes">The indexes to populate; nothing happens when it is null or empty.</param>
    protected override void PopulateIndexes(IReadOnlyList<IIndex> indexes)
    {
        // Nothing to populate: avoid paging through the content for no reason.
        if (indexes == null || indexes.Count == 0)
        {
            return;
        }

        // NOTE(review): -1 is presumably the id of the content root - confirm.
        const int rootNode = -1;
        const int pageSize = 10000;

        IContent[] content;
        long totalRecords;
        int pageIndex = 0;

        do
        {
            content = _contentService.GetPagedDescendants(rootNode, pageIndex, pageSize, out totalRecords).ToArray();

            if (content.Length > 0)
            {
                // GetValueSets is a lazy iterator; materialize it once so the value sets
                // are not rebuilt from scratch for every index they are handed to.
                var valueSets = _spellCheckValueSetBuilder.GetValueSets(content).ToList();

                foreach (var index in indexes)
                {
                    index.IndexItems(valueSets);
                }
            }

            pageIndex++;
        }
        // A short (or empty) page means the last page has been processed.
        while (content.Length == pageSize);
    }
}
/// <summary>
/// Builds one value set per content item, containing the cleaned, lower-cased
/// text of selected properties in a single "word" field.
/// </summary>
public class SpellCheckValueSetBuilder : IValueSetBuilder<IContent>
{
    // Aliases of the properties whose text is indexed.
    // NOTE(review): these are matched against property aliases only, so "nodeName"
    // contributes nothing unless a property actually uses that alias - confirm
    // whether the node's own name should be indexed as well.
    private static readonly string[] IndexedAliases = { "nodeName", "title", "content", "metaDescription", "gridContent" };

    // Matches punctuation, except single quotes in the middle of a word (e.g. we're, don't).
    private static readonly Regex PunctuationPattern = new Regex(@"[^\w' ]+|'(?!\w)|(?<!\w)'");

    /// <summary>
    /// Lazily produces a value set for each supplied content item.
    /// </summary>
    /// <param name="content">The content items to convert; null yields no value sets.</param>
    /// <returns>One value set per item, with id = content id and category "word".</returns>
    public IEnumerable<ValueSet> GetValueSets(params IContent[] content)
    {
        if (content == null)
        {
            yield break;
        }

        foreach (var c in content)
        {
            var cleanValues = new List<string>();
            var properties = c.Properties.Where(x => IndexedAliases.Contains(x.Alias));
            CollectCleanValues(properties, cleanValues);

            // All cleaned property texts are merged into one space-separated string.
            var indexValues = new Dictionary<string, object>()
            {
                ["word"] = string.Join(" ", cleanValues)
            };

            yield return new ValueSet(c.Id.ToString(), "word", indexValues);
        }
    }

    /// <summary>
    /// Appends the cleaned published text of every text box, text area and grid
    /// property to <paramref name="cleanValues"/>. Other editors are ignored.
    /// </summary>
    /// <param name="properties">The properties to read.</param>
    /// <param name="cleanValues">The list that receives the cleaned texts.</param>
    private void CollectCleanValues(IEnumerable<Property> properties, List<string> cleanValues)
    {
        foreach (var property in properties)
        {
            string editorAlias = property.PropertyType.PropertyEditorAlias;
            bool isPlainText = editorAlias == Constants.PropertyEditors.Aliases.TextBox
                || editorAlias == Constants.PropertyEditors.Aliases.TextArea;
            bool isGrid = editorAlias == Constants.PropertyEditors.Aliases.Grid;

            if (!isPlainText && !isGrid)
            {
                continue;
            }

            foreach (var value in property.Values.WhereNotNull())
            {
                // Only published values are indexed.
                if (value.PublishedValue == null)
                {
                    continue;
                }

                if (isPlainText)
                {
                    cleanValues.Add(CleanValue(value));
                }

                if (isGrid)
                {
                    string json = value.PublishedValue.ToString();
                    GridDataModel gridContent = GridDataModel.Deserialize(json);

                    // Skip grid values that yield no model or no text instead of
                    // aborting the whole indexing run with a NullReferenceException.
                    string searchableText = gridContent?.GetSearchableText();
                    if (searchableText != null)
                    {
                        cleanValues.Add(CleanValue(searchableText));
                    }
                }
            }
        }
    }

    /// <summary>
    /// Strips HTML from the published value, decodes HTML entities and then applies
    /// the same cleaning as <see cref="CleanValue(string)"/>.
    /// </summary>
    /// <param name="value">A property value whose PublishedValue is not null.</param>
    /// <returns>The cleaned, lower-cased text.</returns>
    private static string CleanValue(PropertyValue value)
    {
        // Strip anything that's HTML
        string text = HttpUtility.HtmlDecode(value.PublishedValue.ToString().StripHtml());
        return CleanValue(text);
    }

    /// <summary>
    /// Replaces newlines and punctuation with spaces and lower-cases the result.
    /// </summary>
    /// <param name="value">The raw text; null or empty yields an empty string.</param>
    /// <returns>The cleaned, lower-cased text.</returns>
    private static string CleanValue(string value)
    {
        if (string.IsNullOrEmpty(value))
        {
            return string.Empty;
        }

        // Replace newlines
        string result = value.Replace("\r", " ").Replace("\n", " ");
        // Replace punctuation (except single quotes in the middle of word, e.g. we're, don't)
        result = PunctuationPattern.Replace(result, " ");
        // Lowercase all results
        return result.ToLowerInvariant();
    }
}
And finally, yes, I'm working on getting this up and running for v9 too 😁