It looks like interest in Sitecore's implementation of the Lucene index has grown since the Dream Core event, and developers have run into an issue with old data being kept in the index repository. In this article I want to show you how to work around this issue.
First of all, let’s see why it’s happening. I ran into this issue when I started playing with the new implementation of the Lucene index in Sitecore 6. When I created an output of the results, I saw duplicates of my data in there. I started debugging my code and found that Lucene somehow recognizes raw GUIDs, which breaks the search criteria that Sitecore uses to find items during the update/delete procedure.
To solve this issue I had to create an additional field in the Lucene index (_shorttemplateid) and store in it the item's template GUID in short, lower-case form (ShortID.Encode(item.TemplateID).ToLowerInvariant()). I then overrode the AddMatchCriteria method and its dependent properties so that the short template GUID is used in the matching criteria. Below is the code example.
- namespace LuceneExamples
- {
- public class DatabaseCrawler : Sitecore.Search.Crawlers.DatabaseCrawler
- {
- #region Fields
- // Set to true by IncludeTemplate; read through the HasIncludes property.
- private bool _hasIncludes;
- // Set to true by ExcludeTemplate; read through the HasExcludes property.
- private bool _hasExcludes;
- // Maps a template id (as passed to IncludeTemplate/ExcludeTemplate) to true when the template
- // is included and false when it is excluded.
- private readonly Dictionary<string, bool> _templateFilter;
- // CustomField entries registered through AddCustomField.
- private readonly ArrayList _customFields;
- #endregion Fields
- #region ctor
- /// <summary>
- /// Creates a crawler with an empty template filter and no custom fields.
- /// </summary>
- public DatabaseCrawler()
- {
-     // Template ids are GUID strings whose letter case may differ between the configuration and
-     // item.TemplateID.ToString() (used by IsMatch), so keys are compared case-insensitively.
-     _templateFilter = new Dictionary<string, bool>(StringComparer.OrdinalIgnoreCase);
-     _customFields = new ArrayList();
- }
- #endregion ctor
- #region Base class methods
- /// <summary>
- /// Adds the item's fields to the Lucene document.
- /// Overridden so that date and datetime fields are written through IndexDateFields (which uses
- /// Constants.DateTimeFormat); the original author notes that range queries on date values are not
- /// possible otherwise. Also adds the _shorttemplateid field and the configured custom fields.
- /// </summary>
- /// <param name="document">Lucene document being populated</param>
- /// <param name="item">Sitecore item whose fields are indexed</param>
- /// <param name="versionSpecific">When true only non-shared fields are added; when false only shared fields</param>
- protected override void AddAllFields(Document document, Item item, bool versionSpecific)
- {
-     Assert.ArgumentNotNull(document, "document");
-     Assert.ArgumentNotNull(item, "item");
-     Sitecore.Collections.FieldCollection itemFields = item.Fields;
-     itemFields.ReadAll();
-     foreach (Sitecore.Data.Fields.Field current in itemFields)
-     {
-         // Skip fields without a key and fields on the wrong side of the shared/versioned split.
-         if (string.IsNullOrEmpty(current.Key) || current.Shared == versionSpecific)
-         {
-             continue;
-         }
-         bool isText = base.IsTextField(current);
-         if (IndexAllFields)
-         {
-             bool isDate = current.TypeKey == "date" || current.TypeKey == "datetime";
-             if (isDate)
-             {
-                 IndexDateFields(document, current.Key, current.Value);
-             }
-             else
-             {
-                 document.Add(CreateField(current.Key, current.Value, isText, 1f));
-             }
-         }
-         // Text fields additionally feed the built-in content field.
-         if (isText)
-         {
-             document.Add(CreateField(BuiltinFields.Content, current.Value, true, 1f));
-         }
-     }
-     AddShortTemplateId(document, item);
-     AddCustomFields(document, item);
- }
- /// <summary>
- /// Adds one Lucene field to the document for every custom field registered on this crawler.
- /// </summary>
- /// <param name="document">Lucene document</param>
- /// <param name="item">Sitecore data item the custom field values are read from</param>
- private void AddCustomFields(Document document, Item item)
- {
-     foreach (CustomField custom in _customFields)
-     {
-         string luceneName = custom.LuceneFieldName;
-         // NOTE(review): presumably GetFieldValue never returns null; confirm against CustomField.
-         string luceneValue = custom.GetFieldValue(item);
-         Fieldable created = CreateField(luceneName, luceneValue, custom.StorageType, custom.IndexType, Boost);
-         document.Add(created);
-     }
- }
- /// <summary>
- /// Builds a Lucene field from the given name, value, storage and index options and applies the boost to it.
- /// </summary>
- /// <param name="fieldKey">Field name</param>
- /// <param name="fieldValue">Field value</param>
- /// <param name="storeType">Storage option</param>
- /// <param name="indexType">Index type</param>
- /// <param name="boost">Boost value set on the created field</param>
- /// <returns>The newly created field</returns>
- private Fieldable CreateField(string fieldKey, string fieldValue, Field.Store storeType, Field.Index indexType, float boost)
- {
-     Field luceneField = new Field(fieldKey, fieldValue, storeType, indexType);
-     luceneField.SetBoost(boost);
-     return luceneField;
- }
- /// <summary>
- /// Parses a configuration entry for a custom field and adds it to the collection of custom fields.
- /// </summary>
- /// <param name="node">Configuration entry; must not be null</param>
- /// <exception cref="InvalidOperationException">Thrown when CustomField.ParseConfigNode returns null for the entry.</exception>
- public void AddCustomField(XmlNode node)
- {
-     // Fail with a clear argument error instead of dereferencing a null node below.
-     Assert.ArgumentNotNull(node, "node");
-     CustomField field = CustomField.ParseConfigNode(node);
-     if (field == null)
-     {
-         throw new InvalidOperationException("Could not parse custom field entry: " + node.OuterXml);
-     }
-     _customFields.Add(field);
- }
- /// <summary>
- /// Adds the clauses that identify documents belonging to this crawler: the database, the root path
- /// and, when template filters are configured, one clause per filtered template on the
- /// _shorttemplateid field. Per the original author's note, the short template id is used so that
- /// combined/boolean queries can reference templates, and the same criteria serve update/delete actions.
- /// </summary>
- /// <param name="query">Boolean query the clauses are appended to</param>
- protected override void AddMatchCriteria(BooleanQuery query)
- {
-     Term databaseTerm = new Term(BuiltinFields.Database, Database);
-     query.Add(new TermQuery(databaseTerm), BooleanClause.Occur.MUST);
-     Term rootTerm = new Term(BuiltinFields.Path, Sitecore.Data.ShortID.Encode(Root).ToLowerInvariant());
-     query.Add(new TermQuery(rootTerm), BooleanClause.Occur.MUST);
-     if (!HasIncludes && !HasExcludes)
-     {
-         return;
-     }
-     // NOTE(review): in Lucene, SHOULD clauses are optional when the query also contains MUST clauses,
-     // so included templates may not actually narrow the match; confirm this is intended.
-     foreach (KeyValuePair<string, bool> entry in TemplateFilter)
-     {
-         string shortTemplateId = Sitecore.Data.ShortID.Encode(entry.Key).ToLowerInvariant();
-         BooleanClause.Occur occurrence;
-         if (entry.Value)
-         {
-             occurrence = BooleanClause.Occur.SHOULD;
-         }
-         else
-         {
-             occurrence = BooleanClause.Occur.MUST_NOT;
-         }
-         query.Add(new TermQuery(new Term(Constants.ShortTemplate, shortTemplateId)), occurrence);
-     }
- }
- /// <summary>
- /// Decides whether an item belongs to this crawler. Overridden because this class keeps its own
- /// include/exclude state (_hasIncludes, _hasExcludes and the template filter).
- /// </summary>
- /// <param name="item">Sitecore item to test</param>
- /// <returns>
- /// False when the root item is not an ancestor of the item; true when no template filter is configured;
- /// otherwise the filter entry for the item's template, or, when there is no entry, true only if no
- /// templates were explicitly included.
- /// </returns>
- protected override bool IsMatch(Item item)
- {
-     Assert.ArgumentNotNull(item, "item");
-     if (!RootItem.Axes.IsAncestorOf(item))
-     {
-         return false;
-     }
-     bool filterConfigured = HasIncludes || HasExcludes;
-     if (!filterConfigured)
-     {
-         return true;
-     }
-     bool included;
-     bool listed = TemplateFilter.TryGetValue(item.TemplateID.ToString(), out included);
-     // Templates absent from the filter pass only when the filter consists purely of exclusions.
-     return listed ? included : !HasIncludes;
- }
- /// <summary>
- /// Marks a template as included in the index. Declared with <c>new</c> so that the filter state kept
- /// by this class is updated; the original author notes this is required by the AddMatchCriteria override.
- /// </summary>
- /// <param name="templateId">Template id; must not be null or empty</param>
- public new void IncludeTemplate(string templateId)
- {
-     Assert.ArgumentNotNullOrEmpty(templateId, "templateId");
-     HasIncludes = true;
-     TemplateFilter[templateId] = true;
- }
- /// <summary>
- /// Marks a template as excluded from the index. Declared with <c>new</c> so that the filter state kept
- /// by this class is updated; the original author notes this is required by the AddMatchCriteria override.
- /// </summary>
- /// <param name="templateId">Template id; must not be null or empty</param>
- public new void ExcludeTemplate(string templateId)
- {
-     Assert.ArgumentNotNullOrEmpty(templateId, "templateId");
-     HasExcludes = true;
-     TemplateFilter[templateId] = false;
- }
- #endregion Base class methods
- /// <summary>
- /// Converts a Sitecore date or datetime field value to the Constants.DateTimeFormat layout and adds
- /// it to the document as an untokenized field. An empty string is indexed when the parsed value
- /// equals DateTime.MinValue.
- /// </summary>
- /// <param name="doc">Lucene document object</param>
- /// <param name="fieldKey">Field name</param>
- /// <param name="fieldValue">Raw field value, parsed with Sitecore.DateUtil.IsoDateToDateTime</param>
- private void IndexDateFields(Document doc, string fieldKey, string fieldValue)
- {
-     DateTime parsed = Sitecore.DateUtil.IsoDateToDateTime(fieldValue);
-     // Format with the invariant culture: the indexed value must be identical on every server,
-     // whereas the current culture may use a different calendar and so yield different digits.
-     string luceneDate = parsed != DateTime.MinValue
-         ? parsed.ToString(Constants.DateTimeFormat, System.Globalization.CultureInfo.InvariantCulture)
-         : string.Empty;
-     doc.Add(CreateField(fieldKey, luceneDate, false, 1f));
- }
- /// <summary>
- /// Adds the item's template id, encoded as a lower-case ShortID, to the document under the
- /// Constants.ShortTemplate field name.
- /// </summary>
- /// <param name="doc">Lucene document object</param>
- /// <param name="item">Sitecore item</param>
- private void AddShortTemplateId(Document doc, Item item)
- {
-     string shortTemplateId = Sitecore.Data.ShortID.Encode(item.TemplateID).ToLowerInvariant();
-     Fieldable templateField = CreateField(Constants.ShortTemplate, shortTemplateId, false, 1f);
-     doc.Add(templateField);
- }
- #region Properties
- /// <summary>
- /// Gets or sets the flag stored in _hasIncludes, which IncludeTemplate sets to true.
- /// </summary>
- protected bool HasIncludes
- {
-     get { return _hasIncludes; }
-     set { _hasIncludes = value; }
- }
- /// <summary>
- /// Gets or sets the flag stored in _hasExcludes, which ExcludeTemplate sets to true.
- /// </summary>
- protected bool HasExcludes
- {
-     get { return _hasExcludes; }
-     set { _hasExcludes = value; }
- }
- /// <summary>
- /// Gets the template filter: template id mapped to true (included) or false (excluded).
- /// </summary>
- protected Dictionary<string, bool> TemplateFilter
- {
-     get { return _templateFilter; }
- }
- /// <summary>
- /// Gets the crawler's root item, looked up on every access from the configured database using the
- /// invariant language, the latest version and with security checks disabled.
- /// </summary>
- protected Item RootItem
- {
-     get
-     {
-         Item root = Sitecore.Data.Managers.ItemManager.GetItem(
-             Root,
-             Sitecore.Globalization.Language.Invariant,
-             Sitecore.Data.Version.Latest,
-             Sitecore.Data.Database.GetDatabase(Database),
-             Sitecore.SecurityModel.SecurityCheck.Disable);
-         return root;
-     }
- }
- #endregion Properties
- }
- }
This should solve the issue, and it also adds a Lucene-recognizable format for Sitecore date and datetime field types. In addition, it makes it possible to build Combined and Boolean search queries.
Update. Code for the Constants class:
1: namespace LuceneExamples
2: {
3: public class Constants
4: {
5: // Name of the extra index field that holds the template id in ShortID format.
6: public const string ShortTemplate = "_shorttemplateid";
7:
8: // Searchable date-time format; date and datetime field values are written to the index in this layout.
9: public const string DateTimeFormat = "yyyyMMddHHmmss";
10:
11: // Item ID of the Lucene settings item; the original note gives its path as /sitecore/system/Settings/Lucene.
12: public const string LuceneSettingsPath = "{89783047-026C-45B5-AB5B-338E4A22446C}";
13: }
14: }
Hope it saves someone a minute or two.
15 comments:
Hi Ivan,
We've just run into this problem and the code you posted did the trick! Thank you so much!
One question -- you say the root cause of the issue is that "Lucene somehow recognizes raw GUID's". Could you explain a bit more what you mean by that? How were the old GUIDs breaking Lucene?
Thanks!
rusty
Thanks a lot Ivan, this solved our duplicate indexing issue.
Rusty,
DatabaseCrawler uses item.TemplateID.ToString() to add a template GUID. This API converts the GUID into an upper-case string. The Lucene StandardAnalyzer (which is used by default) parses upper-case values in a specific way. In other words, the outcome is not a GUID anymore. That's why it breaks the search.
If you convert the GUID into a lower-case string, then it should work fine.
--Ivan
hi,
Great Article. I tried to enhance the crawler by padding the numeric values with "00" on left so that I can successfuly execute range queries on numeric fields. So to do the same, I used the same code that you have shown for DateFields, but it don't work. While debugging the code I can see field value is padded. But when I see IndexViewer the value goes unpadded. I tested this more by adding several characters at the end of the field and they also don't appear in the indexViewer. Please help me, I am stuck :(
Hi Ivan,
Great Article, helped a lot otherwise we could have dumped Lucene approach for our work.
Posting my question again.
I tried to enhance the crawler by padding the numeric values with "00" on left so that I can successfuly execute range queries on numeric fields. So to do the same, I used the same code that you have shown for DateFields, but it don't work. While debugging the code I can see field value is padded. But when I see IndexViewer the value goes unpadded. I tested this more by adding several characters at the end of the field and they also don't appear in the indexViewer. Please help me, I am stuck :(
Thanks,
mT
Try to use Luke to see if the value for the field is padded. When you add a numeric field to the index make sure that you use UN_TOKENIZED option.
Here is an article that explains how to treat numeric values: http://wiki.apache.org/lucene-java/SearchNumericalFields
--Ivan
Thanks Ivan for suggesting Luke. Seems like things are working fine, I downloaded Luke and updated IndexViewer and yes things work.
I went through your article on creating Lucene indexes (Part-2) and then came to know about two kind of indexes of sitecore. Again thanks for clearing the confusion.
Thanks,
mT
Is this issue corrected with the Sitecore CMS 6.2.0 rev.100831 (Update-4) release?
Yes, it was one of the fixes of 6.2.0 Update-4.
Hi Ivan,
The moment I add the IncludeTemplate and ExcludeTemplate methods to my DatabaseCrawler it stops excluding items whose template ID's are included in the include hint="list:ExcludeTemplate" section in the web.config for the index.
Any reason that might be happening?
Thanks,
Asif
Asif,
There was one method missing from the provided code. The IsMatch method should use customized _hasIncludes, _hasExcludes and templateFilter variables.
I've added missing method to the code. Try it out.
Hey, Ivan. I'm trying to put this code into my project and I'm seeing errors for some of the constants:
'Sitecore.Constants' does not contain a definition for 'ShortTemplate'
'Sitecore.Constants' does not contain a definition for 'DateTimeFormat'
Any idea what I might be missing?
Thanks in advance.
joshjs
Hi Josh,
I updated the article with missing code sample.
I'd recommend you to take a look at Advanced Database Crawler shared source component (http://trac.sitecore.net/AdvancedDatabaseCrawler). It's a complete solution that is quite extensible and is based on most stable version of Sitecore Lucene API.
Many thanks. :)
data analysis reporting services
SQIAR (http://www.sqiar.com/solutions/technology/tableau) is a leading Business Intelligence company.Sqiar Consultants Provide Tableau Software Consultancy To small and Medium size of organization.
Post a Comment