Question
How can we tune our CQ5 search functionality, exclude certain properties from being indexed, enable spell checking, stemming or other features?
Answer, Resolution
The search module in CQ5 is built using CRX. CRX uses an embedded Apache Lucene index to implement search. CRX provides various parameters and configurations to allow you to fine tune the index to meet your needs.
SearchIndex configuration:
First of all, a CRX repository can have one or many workspaces and each workspace has its own search index. In addition to this, each workspace's search index can be configured separately in the workspace's workspace.xml file.
In a default CQ5 installation you only have one CRX workspace called "crx.default". This workspace stores all your site's content (excluding versions). The search index configuration for this workspace can be found under crx-quickstart/workspaces/crx.default/workspace.xml. All configurations for this can be made within the <SearchIndex> element. A reference of all available parameters can be found in the Apache Jackrabbit Search configuration documentation.
Here's an example workspace.xml SearchIndex section:
...
<SearchIndex class="com.day.crx.query.lucene.LuceneHandler">
    <!-- Location of this workspace's search index -->
    <param name="path" value="${wsp.home}/index"/>
</SearchIndex>
...
indexing_config.xml configuration:
The configurations in workspace.xml allow you to tune the behavior of the search index; however, they don't let you modify what content is being indexed. To configure this, we need to look at the indexing_config.xml file, which allows us to specify indexing rules.
- In CQ versions <= 5.4, the default configuration file indexing_config.xml can be found under crx-quickstart/server/runtime/0/_crx/WEB-INF/classes.
In CQ 5.5, the default configuration file is embedded in the repository bundle deployed in the OSGi container.
- Copy the default search index configuration file indexing_config.xml under crx-quickstart/repository/workspaces/crx.default/indexing_config.xml.
* For CQ5.4 and older versions you can find this file under crx-quickstart/server/runtime/0/_crx/WEB-INF/classes/indexing_config.xml.
* For CQ5.5, we copied the default indexing_config.xml file below for your convenience.
- Now we can set a path for our custom indexing_config.xml in the SearchIndex element of workspace.xml. Open crx-quickstart/workspaces/crx.default/workspace.xml in a text editor and add an indexingConfiguration parameter as shown below:
...
<SearchIndex class="com.day.crx.query.lucene.LuceneHandler">
    <param name="path" value="${wsp.home}/index"/>
    <param name="resultFetchSize" value="50"/>
    <!-- Path of the custom indexing rules file copied into the workspace directory -->
    <param name="indexingConfiguration" value="${wsp.home}/indexing_config.xml"/>
</SearchIndex>
Applies to
CQ 5.2+
indexing_config.xml from CQ5.5
<?xml version="1.0"?>
<!DOCTYPE configuration SYSTEM "http://jackrabbit.apache.org/dtd/indexing-configuration-1.2.dtd">
<!--
  Default indexing configuration shipped with CQ 5.5.
  The index-rule elements decide which properties of a node are indexed;
  the aggregate elements decide which descendant nodes are folded into the
  fulltext index of a parent node.
-->
<configuration
    xmlns:cq="http://www.day.com/jcr/cq/1.0"
    xmlns:dam="http://www.day.com/dam/1.0"
    xmlns:nt="http://www.jcp.org/jcr/nt/1.0"
    xmlns:jcr="http://www.jcp.org/jcr/1.0"
    xmlns:sling="http://sling.apache.org/jcr/sling/1.0">

  <!-- Do not index content of subassets -->
  <!-- The rule deliberately lists no properties for matching nt:resource nodes. -->
  <index-rule nodeType="nt:resource"
              condition="ancestor::subassets/@jcr:primaryType='{http://www.jcp.org/jcr/nt/1.0}unstructured'">
  </index-rule>

  <!--
    Exclude some well known properties from the node scope
    fulltext index. Do not add rules below this one, since
    this rule matches any node and acts as a default/fallback.
  -->
  <index-rule nodeType="nt:base">
    <property nodeScopeIndex="false">analyticsProvider</property>
    <property nodeScopeIndex="false">analyticsSnippet</property>
    <property nodeScopeIndex="false">hideInNav</property>
    <property nodeScopeIndex="false">offTime</property>
    <property nodeScopeIndex="false">onTime</property>
    <property nodeScopeIndex="false">cq:allowedTemplates</property>
    <property nodeScopeIndex="false">cq:childrenOrder</property>
    <property nodeScopeIndex="false">cq:cugEnabled</property>
    <property nodeScopeIndex="false">cq:cugPrincipals</property>
    <property nodeScopeIndex="false">cq:cugRealm</property>
    <property nodeScopeIndex="false">cq:designPath</property>
    <property nodeScopeIndex="false">cq:isCancelledForChildren</property>
    <property nodeScopeIndex="false">cq:isDeep</property>
    <property nodeScopeIndex="false">cq:lastModified</property>
    <property nodeScopeIndex="false">cq:lastModifiedBy</property>
    <property nodeScopeIndex="false">cq:lastPublished</property>
    <property nodeScopeIndex="false">cq:lastPublishedBy</property>
    <property nodeScopeIndex="false">cq:lastReplicated</property>
    <property nodeScopeIndex="false">cq:lastReplicatedBy</property>
    <property nodeScopeIndex="false">cq:lastReplicationAction</property>
    <property nodeScopeIndex="false">cq:lastReplicationStatus</property>
    <property nodeScopeIndex="false">cq:lastRolledout</property>
    <property nodeScopeIndex="false">cq:lastRolledoutBy</property>
    <property nodeScopeIndex="false">cq:name</property>
    <property nodeScopeIndex="false">cq:parentPath</property>
    <property nodeScopeIndex="false">cq:segments</property>
    <property nodeScopeIndex="false">cq:siblingOrder</property>
    <property nodeScopeIndex="false">cq:template</property>
    <property nodeScopeIndex="false">cq:trigger</property>
    <property nodeScopeIndex="false">cq:versionComment</property>
    <property nodeScopeIndex="false">jcr:createdBy</property>
    <property nodeScopeIndex="false">jcr:lastModifiedBy</property>
    <property nodeScopeIndex="false">sling:alias</property>
    <property nodeScopeIndex="false">sling:resourceType</property>
    <property nodeScopeIndex="false">sling:vanityPath</property>
    <!-- Catch-all regular expression so that all remaining properties stay indexed with default settings -->
    <property isRegexp="true">.*:.*</property>
  </index-rule>

  <!-- Cq Page for jcr:contains(jcr:content, "...") searches -->
  <!-- Includes descendants up to four levels below the cq:PageContent node. -->
  <aggregate primaryType="cq:PageContent">
    <include>*</include>
    <include>*/*</include>
    <include>*/*/*</include>
    <include>*/*/*/*</include>
  </aggregate>

  <!-- Folds asset metadata and the original rendition into the dam:Asset node -->
  <aggregate primaryType="dam:Asset">
    <include>jcr:content</include>
    <include>jcr:content/metadata</include>
    <include>jcr:content/metadata/*</include>
    <include>jcr:content/renditions</include>
    <include>jcr:content/renditions/original</include>
    <include>jcr:content/renditions/original/jcr:content</include>
    <!-- child axis orderby index -->
    <include>jcr:content/renditions/original/jcr:content/jcr:lastModified</include>
  </aggregate>

  <!-- nt:file child axis orderby index -->
  <aggregate primaryType="nt:file">
    <include>jcr:content</include>
    <include>jcr:content/jcr:lastModified</include>
  </aggregate>

  <!-- cq:Page child axis orderby index -->
  <aggregate primaryType="cq:Page">
    <include>jcr:content</include>
    <include>jcr:content/cq:lastModified</include>
  </aggregate>
</configuration>