-
Notifications
You must be signed in to change notification settings - Fork 187
Make solr field/schema resolvers respect the tokenized field of attributes #7003
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -178,9 +178,6 @@ public DynamicSchemaResolver(List<String> additionalFields) { | |
| textSortCharacterLimit = 127; | ||
| } | ||
| fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX); | ||
| fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED); | ||
| fieldsCache.add( | ||
| Metacard.ID + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED + SchemaFields.HAS_CASE); | ||
|
|
||
| fieldsCache.add(SchemaFields.METACARD_TYPE_FIELD_NAME); | ||
| fieldsCache.add(SchemaFields.METACARD_TYPE_OBJECT_FIELD_NAME); | ||
|
|
@@ -273,10 +270,10 @@ void addFields(Metacard metacard, SolrInputDocument solrInputDocument) | |
| if (AttributeFormat.XML.equals(format) | ||
| && solrInputDocument.getFieldValue( | ||
| formatIndexName + getSpecialIndexSuffix(AttributeFormat.STRING)) | ||
| == null) { | ||
| == null | ||
| && ad.isTokenized()) { | ||
| List<String> parsedTexts = parseTextFrom(attributeValues); | ||
|
|
||
| // parsedTexts => *_txt_tokenized | ||
| // parsedTexts => *_txt_tokenized - only if attribute is tokenized | ||
| String specialStringIndexName = | ||
| ad.getName() | ||
| + getFieldSuffix(AttributeFormat.STRING) | ||
|
|
@@ -294,12 +291,14 @@ void addFields(Metacard metacard, SolrInputDocument solrInputDocument) | |
| solrInputDocument.addField( | ||
| ad.getName() + getFieldSuffix(AttributeFormat.STRING), truncatedValues); | ||
|
|
||
| // *_txt_tokenized | ||
| solrInputDocument.addField( | ||
| ad.getName() | ||
| + getFieldSuffix(AttributeFormat.STRING) | ||
| + getSpecialIndexSuffix(AttributeFormat.STRING), | ||
| attributeValues); | ||
| // *_txt_tokenized - only if attribute is tokenized | ||
| if (ad.isTokenized()) { | ||
| solrInputDocument.addField( | ||
| ad.getName() | ||
| + getFieldSuffix(AttributeFormat.STRING) | ||
| + getSpecialIndexSuffix(AttributeFormat.STRING), | ||
| attributeValues); | ||
| } | ||
| } else if (AttributeFormat.OBJECT.equals(format)) { | ||
| ByteArrayOutputStream byteArrayOS = new ByteArrayOutputStream(); | ||
| List<Serializable> byteArrays = new ArrayList<>(); | ||
|
|
@@ -594,13 +593,12 @@ String getField( | |
| Map<String, Serializable> enabledFeatures) { | ||
|
|
||
| final String fieldSuffix = getFieldSuffix(format); | ||
| String fieldName = | ||
| propertyName | ||
| + fieldSuffix | ||
| + (isSearchedAsExactValue ? "" : getSpecialIndexSuffix(format, enabledFeatures)); | ||
| String baseName = propertyName + fieldSuffix; | ||
| String extendedName = | ||
| baseName + (isSearchedAsExactValue ? "" : getSpecialIndexSuffix(format, enabledFeatures)); | ||
|
|
||
| if (fieldsCache.contains(fieldName)) { | ||
| return fieldName; | ||
| if (fieldsCache.contains(extendedName)) { | ||
| return extendedName; | ||
| } | ||
|
|
||
| switch (format) { | ||
|
|
@@ -613,13 +611,14 @@ String getField( | |
| default: | ||
| break; | ||
| } | ||
|
|
||
| // if we have the base name in the cache but not the extended name assume the extended doesn't | ||
| // exist otherwise fallback to the old convention | ||
| String bestGuess = fieldsCache.contains(baseName) ? baseName : extendedName; | ||
| LOGGER.debug( | ||
| "Could not find exact schema field name for [{}], attempting to search with [{}]", | ||
| propertyName, | ||
| fieldName); | ||
|
|
||
| return fieldName; | ||
| bestGuess); | ||
| return bestGuess; | ||
| } | ||
|
|
||
| private String getFieldSuffix(AttributeFormat format) { | ||
|
|
@@ -658,8 +657,11 @@ String getCaseSensitiveField( | |
| && mappedPropertyName.endsWith(SchemaFields.PHONETICS)) { | ||
| return mappedPropertyName; | ||
| } | ||
| // TODO We can check if this field really does exist | ||
| return mappedPropertyName + SchemaFields.HAS_CASE; | ||
| String potentialField = mappedPropertyName + SchemaFields.HAS_CASE; | ||
| if (fieldsCache.contains(potentialField)) { | ||
| return potentialField; | ||
| } | ||
| return fieldsCache.contains(mappedPropertyName) ? mappedPropertyName : potentialField; | ||
| } | ||
|
|
||
| private String getSpecialIndexSuffix(AttributeFormat format) { | ||
|
|
@@ -706,6 +708,18 @@ private void addToFieldsCache(AttributeDescriptor descriptor) { | |
|
|
||
| fieldsCache.add(descriptor.getName() + schemaFields.getFieldSuffix(format)); | ||
|
|
||
| if (format.equals(AttributeFormat.GEOMETRY)) { | ||
| fieldsCache.add( | ||
| descriptor.getName() | ||
| + schemaFields.getFieldSuffix(format) | ||
| + getSpecialIndexSuffix(format)); | ||
| return; | ||
| } | ||
|
|
||
| if (!descriptor.isTokenized()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not 100% confident that I know what all of these fields are for, but it looks like there are at least two cases where fields were added to the cache that will no longer be hit:
|
||
| return; | ||
| } | ||
|
|
||
| if (!getSpecialIndexSuffix(format).equals("")) { | ||
| fieldsCache.add( | ||
| descriptor.getName() | ||
|
|
@@ -892,9 +906,7 @@ private Set<String> getAnyTextFields() { | |
| .collect(Collectors.toList()); | ||
| } | ||
|
|
||
| return fields.stream() | ||
| .map(field -> field + SchemaFields.TEXT_SUFFIX) | ||
| .collect(Collectors.toSet()); | ||
| return new HashSet<>(fields); | ||
| } | ||
|
|
||
| private Set<String> getAnyGeoFields() { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -313,19 +313,14 @@ public SolrQuery propertyIsEqualTo(String propertyName, String literal, boolean | |
| private String wildcardSolrQuery( | ||
| String searchPhrase, String propertyName, boolean isCaseSensitive, boolean isExact) { | ||
| String solrQuery; | ||
| String tokenized = resolver.getSpecialIndexSuffix(AttributeFormat.STRING, enabledFeatures); | ||
| if (Metacard.ANY_TEXT.equals(propertyName)) { | ||
| solrQuery = | ||
| resolver | ||
| .anyTextFields() | ||
| .map( | ||
| field -> { | ||
| if (!isExact) { | ||
| return field + tokenized; | ||
| } else { | ||
| return field; | ||
| } | ||
| }) | ||
| field -> | ||
| resolver.getField( | ||
| field, AttributeFormat.STRING, isExact, Collections.emptyMap())) | ||
| .map( | ||
| textField -> { | ||
| if (isCaseSensitive && !isExact) { | ||
|
|
@@ -339,7 +334,7 @@ private String wildcardSolrQuery( | |
| } else { | ||
| String field = getMappedPropertyName(propertyName, AttributeFormat.STRING, true); | ||
| if (!isExact) { | ||
| field += tokenized; | ||
| field = getMappedPropertyName(propertyName, AttributeFormat.STRING, false); | ||
| } | ||
|
Comment on lines
336
to
338
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ✏️ We can remove this case and pass |
||
| if (isCaseSensitive && !isExact) { | ||
| field = resolver.getCaseSensitiveField(field, enabledFeatures); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ | |
| import static org.hamcrest.Matchers.hasItem; | ||
| import static org.hamcrest.Matchers.hasSize; | ||
| import static org.hamcrest.Matchers.is; | ||
| import static org.hamcrest.Matchers.notNullValue; | ||
| import static org.hamcrest.Matchers.nullValue; | ||
| import static org.mockito.ArgumentMatchers.eq; | ||
| import static org.mockito.Mockito.RETURNS_DEEP_STUBS; | ||
|
|
@@ -54,7 +55,7 @@ | |
|
|
||
| public class DynamicSchemaResolverTest { | ||
|
|
||
| private static final int INITIAL_FIELDS_CACHE_COUNT = 5; | ||
| private static final int INITIAL_FIELDS_CACHE_COUNT = 3; | ||
|
|
||
| private static final ObjectMapper METACARD_TYPE_MAPPER = | ||
| MetacardTypeMapperFactory.newObjectMapper(); | ||
|
|
@@ -180,7 +181,7 @@ public void testAnyTextFieldPropertyParsing() throws Exception { | |
| // Perform Test | ||
| List<String> fields = resolver.anyTextFields().collect(Collectors.toList()); | ||
| Truth.assertThat(fields) | ||
| .containsExactly("metadata_txt", "title_txt", "description_txt", "ext.extracted.text_txt"); | ||
| .containsExactly("metadata", "title", "description", "ext.extracted.text"); | ||
| } | ||
|
|
||
| @Test | ||
|
|
@@ -301,4 +302,203 @@ public void getFieldExactValue() { | |
| dynamicSchemaResolver.getField("unknown", AttributeFormat.STRING, true, enabledFeatures), | ||
| is("unknown_txt")); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAddFieldsWithXmlAttributeNotTokenized() throws Exception { | ||
| // Setup - XML attribute that is NOT tokenized | ||
| String metacardTypeName = "test"; | ||
| Set<AttributeDescriptor> attributeDescriptors = new HashSet<>(1); | ||
| String attributeName = "metadata"; | ||
| boolean indexed = true; | ||
| boolean stored = true; | ||
| boolean tokenized = false; | ||
| boolean multiValued = false; | ||
|
|
||
| attributeDescriptors.add( | ||
| new TestAttributeDescriptorImpl( | ||
| attributeName, | ||
| attributeName, | ||
| indexed, | ||
| stored, | ||
| tokenized, | ||
| multiValued, | ||
| BasicTypes.XML_TYPE)); | ||
|
|
||
| Attribute mockAttribute = mock(Attribute.class); | ||
| when(mockAttribute.getValue()).thenReturn("<root>test</root>"); | ||
| when(mockAttribute.getValues()).thenReturn(Collections.singletonList("<root>test</root>")); | ||
|
|
||
| Metacard mockMetacard = mock(Metacard.class, RETURNS_DEEP_STUBS); | ||
| when(mockMetacard.getMetacardType().getName()).thenReturn(metacardTypeName); | ||
| when(mockMetacard.getMetacardType().getAttributeDescriptors()).thenReturn(attributeDescriptors); | ||
| when(mockMetacard.getAttribute(attributeName)).thenReturn(mockAttribute); | ||
|
|
||
| SolrInputDocument solrInputDocument = new SolrInputDocument(); | ||
| DynamicSchemaResolver resolver = new DynamicSchemaResolver(); | ||
|
|
||
| // Perform Test | ||
| resolver.addFields(mockMetacard, solrInputDocument); | ||
|
|
||
| // Verify - tokenized=false means the special string index should NOT be populated | ||
| assertThat(solrInputDocument.getFieldValue("metadata_txt_tokenized"), is(nullValue())); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAddFieldsWithXmlAttributeTokenized() throws Exception { | ||
| // Setup - XML attribute that IS tokenized | ||
| String metacardTypeName = "test"; | ||
| Set<AttributeDescriptor> attributeDescriptors = new HashSet<>(1); | ||
| String attributeName = "metadata"; | ||
| boolean indexed = true; | ||
| boolean stored = true; | ||
| boolean tokenized = true; | ||
| boolean multiValued = false; | ||
|
|
||
| attributeDescriptors.add( | ||
| new TestAttributeDescriptorImpl( | ||
| attributeName, | ||
| attributeName, | ||
| indexed, | ||
| stored, | ||
| tokenized, | ||
| multiValued, | ||
| BasicTypes.XML_TYPE)); | ||
|
|
||
| Attribute mockAttribute = mock(Attribute.class); | ||
| when(mockAttribute.getValue()).thenReturn("<root>test</root>"); | ||
| when(mockAttribute.getValues()).thenReturn(Collections.singletonList("<root>test</root>")); | ||
|
|
||
| Metacard mockMetacard = mock(Metacard.class, RETURNS_DEEP_STUBS); | ||
| when(mockMetacard.getMetacardType().getName()).thenReturn(metacardTypeName); | ||
| when(mockMetacard.getMetacardType().getAttributeDescriptors()).thenReturn(attributeDescriptors); | ||
| when(mockMetacard.getAttribute(attributeName)).thenReturn(mockAttribute); | ||
|
|
||
| SolrInputDocument solrInputDocument = new SolrInputDocument(); | ||
| DynamicSchemaResolver resolver = new DynamicSchemaResolver(); | ||
|
|
||
| // Perform Test | ||
| resolver.addFields(mockMetacard, solrInputDocument); | ||
|
|
||
| // Verify - tokenized=true means the special string index SHOULD be populated | ||
| assertThat(solrInputDocument.getFieldValue("metadata_txt_tokenized"), notNullValue()); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAddFieldsWithStringAttributeNotTokenizedAddsTxtNotTokenizedField() | ||
| throws Exception { | ||
| // Setup - STRING attribute that is NOT tokenized | ||
| String metacardTypeName = "test"; | ||
| Set<AttributeDescriptor> attributeDescriptors = new HashSet<>(1); | ||
| String attributeName = "title"; | ||
| boolean indexed = true; | ||
| boolean stored = true; | ||
| boolean tokenized = false; | ||
| boolean multiValued = false; | ||
|
|
||
| attributeDescriptors.add( | ||
| new TestAttributeDescriptorImpl( | ||
| attributeName, | ||
| attributeName, | ||
| indexed, | ||
| stored, | ||
| tokenized, | ||
| multiValued, | ||
| BasicTypes.STRING_TYPE)); | ||
|
|
||
| Attribute mockAttribute = mock(Attribute.class); | ||
| when(mockAttribute.getValue()).thenReturn("test value"); | ||
| when(mockAttribute.getValues()).thenReturn(Collections.singletonList("test value")); | ||
|
|
||
| Metacard mockMetacard = mock(Metacard.class, RETURNS_DEEP_STUBS); | ||
| when(mockMetacard.getMetacardType().getName()).thenReturn(metacardTypeName); | ||
| when(mockMetacard.getMetacardType().getAttributeDescriptors()).thenReturn(attributeDescriptors); | ||
| when(mockMetacard.getAttribute(attributeName)).thenReturn(mockAttribute); | ||
|
|
||
| SolrInputDocument solrInputDocument = new SolrInputDocument(); | ||
| DynamicSchemaResolver resolver = new DynamicSchemaResolver(); | ||
|
|
||
| // Perform Test | ||
| resolver.addFields(mockMetacard, solrInputDocument); | ||
|
|
||
| // Verify - tokenized=false means the tokenized field should NOT be populated | ||
| assertThat(solrInputDocument.getFieldValue("title_txt"), is("test value")); | ||
| assertThat(solrInputDocument.getFieldValue("title_txt_tokenized"), is(nullValue())); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAddFieldsWithStringAttributeTokenizedAddsBothTxtAndTokenizedField() | ||
| throws Exception { | ||
| // Setup - STRING attribute that IS tokenized | ||
| String metacardTypeName = "test"; | ||
| Set<AttributeDescriptor> attributeDescriptors = new HashSet<>(1); | ||
| String attributeName = "title"; | ||
| boolean indexed = true; | ||
| boolean stored = true; | ||
| boolean tokenized = true; | ||
| boolean multiValued = false; | ||
|
|
||
| attributeDescriptors.add( | ||
| new TestAttributeDescriptorImpl( | ||
| attributeName, | ||
| attributeName, | ||
| indexed, | ||
| stored, | ||
| tokenized, | ||
| multiValued, | ||
| BasicTypes.STRING_TYPE)); | ||
|
|
||
| Attribute mockAttribute = mock(Attribute.class); | ||
| when(mockAttribute.getValue()).thenReturn("test value"); | ||
| when(mockAttribute.getValues()).thenReturn(Collections.singletonList("test value")); | ||
|
|
||
| Metacard mockMetacard = mock(Metacard.class, RETURNS_DEEP_STUBS); | ||
| when(mockMetacard.getMetacardType().getName()).thenReturn(metacardTypeName); | ||
| when(mockMetacard.getMetacardType().getAttributeDescriptors()).thenReturn(attributeDescriptors); | ||
| when(mockMetacard.getAttribute(attributeName)).thenReturn(mockAttribute); | ||
|
|
||
| SolrInputDocument solrInputDocument = new SolrInputDocument(); | ||
| DynamicSchemaResolver resolver = new DynamicSchemaResolver(); | ||
|
|
||
| // Perform Test | ||
| resolver.addFields(mockMetacard, solrInputDocument); | ||
|
|
||
| // Verify - tokenized=true means both fields SHOULD be populated | ||
| assertThat(solrInputDocument.getFieldValue("title_txt"), is("test value")); | ||
| assertThat(solrInputDocument.getFieldValue("title_txt_tokenized"), is("test value")); | ||
| } | ||
|
|
||
| @Test | ||
| public void testGetFieldNumericalFallsBackToRequestedSuffix() { | ||
| // When no numerical field exists in cache, should fall back to the requested suffix | ||
| // fieldsCache is empty for a new resolver | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ✏️ It will have a minimum of three items, but yes it will not have anything for these fields. |
||
| assertThat( | ||
| dynamicSchemaResolver.getField( | ||
| "unknown", AttributeFormat.INTEGER, false, Collections.emptyMap()), | ||
| is("unknown_int")); | ||
| assertThat( | ||
| dynamicSchemaResolver.getField( | ||
| "unknown", AttributeFormat.LONG, false, Collections.emptyMap()), | ||
| is("unknown_lng")); | ||
| assertThat( | ||
| dynamicSchemaResolver.getField( | ||
| "unknown", AttributeFormat.DOUBLE, false, Collections.emptyMap()), | ||
| is("unknown_dbl")); | ||
| assertThat( | ||
| dynamicSchemaResolver.getField( | ||
| "unknown", AttributeFormat.FLOAT, false, Collections.emptyMap()), | ||
| is("unknown_flt")); | ||
| assertThat( | ||
| dynamicSchemaResolver.getField( | ||
| "unknown", AttributeFormat.SHORT, false, Collections.emptyMap()), | ||
| is("unknown_shr")); | ||
| } | ||
|
|
||
| @Test | ||
| public void testGetFieldXmlReturnsTextPathSuffix() { | ||
| // XML format should use TEXT_PATH (_tpt) suffix | ||
| assertThat( | ||
| dynamicSchemaResolver.getField( | ||
| "metadata", AttributeFormat.XML, false, Collections.emptyMap()), | ||
| is("metadata_xml_tpt")); | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.