Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ public class CoreAttributes implements Core, MetacardType {
METACARD_TAGS,
true /* indexed */,
true /* stored */,
false /* tokenized */,
true /* tokenized */,
true /* multivalued */,
BasicTypes.STRING_TYPE));
descriptors.add(
Expand Down
Comment thread
clockard marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,6 @@ public DynamicSchemaResolver(List<String> additionalFields) {
textSortCharacterLimit = 127;
}
fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX);
fieldsCache.add(Metacard.ID + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED);
fieldsCache.add(
Metacard.ID + SchemaFields.TEXT_SUFFIX + SchemaFields.TOKENIZED + SchemaFields.HAS_CASE);

fieldsCache.add(SchemaFields.METACARD_TYPE_FIELD_NAME);
fieldsCache.add(SchemaFields.METACARD_TYPE_OBJECT_FIELD_NAME);
Expand Down Expand Up @@ -273,10 +270,10 @@ void addFields(Metacard metacard, SolrInputDocument solrInputDocument)
if (AttributeFormat.XML.equals(format)
&& solrInputDocument.getFieldValue(
formatIndexName + getSpecialIndexSuffix(AttributeFormat.STRING))
== null) {
== null
&& ad.isTokenized()) {
List<String> parsedTexts = parseTextFrom(attributeValues);

// parsedTexts => *_txt_tokenized
// parsedTexts => *_txt_tokenized - only if attribute is tokenized
String specialStringIndexName =
ad.getName()
+ getFieldSuffix(AttributeFormat.STRING)
Expand All @@ -294,12 +291,14 @@ void addFields(Metacard metacard, SolrInputDocument solrInputDocument)
solrInputDocument.addField(
ad.getName() + getFieldSuffix(AttributeFormat.STRING), truncatedValues);

// *_txt_tokenized
solrInputDocument.addField(
ad.getName()
+ getFieldSuffix(AttributeFormat.STRING)
+ getSpecialIndexSuffix(AttributeFormat.STRING),
attributeValues);
// *_txt_tokenized - only if attribute is tokenized
if (ad.isTokenized()) {
solrInputDocument.addField(
ad.getName()
+ getFieldSuffix(AttributeFormat.STRING)
+ getSpecialIndexSuffix(AttributeFormat.STRING),
attributeValues);
}
} else if (AttributeFormat.OBJECT.equals(format)) {
ByteArrayOutputStream byteArrayOS = new ByteArrayOutputStream();
List<Serializable> byteArrays = new ArrayList<>();
Expand Down Expand Up @@ -594,13 +593,12 @@ String getField(
Map<String, Serializable> enabledFeatures) {

final String fieldSuffix = getFieldSuffix(format);
String fieldName =
propertyName
+ fieldSuffix
+ (isSearchedAsExactValue ? "" : getSpecialIndexSuffix(format, enabledFeatures));
String baseName = propertyName + fieldSuffix;
String extendedName =
baseName + (isSearchedAsExactValue ? "" : getSpecialIndexSuffix(format, enabledFeatures));

if (fieldsCache.contains(fieldName)) {
return fieldName;
if (fieldsCache.contains(extendedName)) {
return extendedName;
}

switch (format) {
Expand All @@ -613,13 +611,14 @@ String getField(
default:
break;
}

// if we have the base name in the cache but not the extended name assume the extended doesn't
// exist otherwise fallback to the old convention
String bestGuess = fieldsCache.contains(baseName) ? baseName : extendedName;
LOGGER.debug(
"Could not find exact schema field name for [{}], attempting to search with [{}]",
propertyName,
fieldName);

return fieldName;
bestGuess);
return bestGuess;
}

private String getFieldSuffix(AttributeFormat format) {
Expand Down Expand Up @@ -658,8 +657,11 @@ String getCaseSensitiveField(
&& mappedPropertyName.endsWith(SchemaFields.PHONETICS)) {
return mappedPropertyName;
}
// TODO We can check if this field really does exist
return mappedPropertyName + SchemaFields.HAS_CASE;
String potentialField = mappedPropertyName + SchemaFields.HAS_CASE;
if (fieldsCache.contains(potentialField)) {
return potentialField;
}
return fieldsCache.contains(mappedPropertyName) ? mappedPropertyName : potentialField;
}

private String getSpecialIndexSuffix(AttributeFormat format) {
Expand Down Expand Up @@ -706,6 +708,18 @@ private void addToFieldsCache(AttributeDescriptor descriptor) {

fieldsCache.add(descriptor.getName() + schemaFields.getFieldSuffix(format));

if (format.equals(AttributeFormat.GEOMETRY)) {
fieldsCache.add(
descriptor.getName()
+ schemaFields.getFieldSuffix(format)
+ getSpecialIndexSuffix(format));
return;
}

if (!descriptor.isTokenized()) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not 100% confident that I know what all of these fields are for, but it looks like there are at least two cases where fields were added to the cache that will no longer be hit:

  1. _phonetics for untokenized string fields when phonetics is enabled.
    a. I don't know enough about this feature to know what this would impact.
  2. _tpt for untokenized XML fields.
    a. This one I am really not sure about because I don't see any references to _tpt in our schema.

return;
}

if (!getSpecialIndexSuffix(format).equals("")) {
fieldsCache.add(
descriptor.getName()
Expand Down Expand Up @@ -892,9 +906,7 @@ private Set<String> getAnyTextFields() {
.collect(Collectors.toList());
}

return fields.stream()
.map(field -> field + SchemaFields.TEXT_SUFFIX)
.collect(Collectors.toSet());
return new HashSet<>(fields);
}

private Set<String> getAnyGeoFields() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -313,19 +313,14 @@ public SolrQuery propertyIsEqualTo(String propertyName, String literal, boolean
private String wildcardSolrQuery(
String searchPhrase, String propertyName, boolean isCaseSensitive, boolean isExact) {
String solrQuery;
String tokenized = resolver.getSpecialIndexSuffix(AttributeFormat.STRING, enabledFeatures);
if (Metacard.ANY_TEXT.equals(propertyName)) {
solrQuery =
resolver
.anyTextFields()
.map(
field -> {
if (!isExact) {
return field + tokenized;
} else {
return field;
}
})
field ->
resolver.getField(
field, AttributeFormat.STRING, isExact, Collections.emptyMap()))
.map(
textField -> {
if (isCaseSensitive && !isExact) {
Expand All @@ -339,7 +334,7 @@ private String wildcardSolrQuery(
} else {
String field = getMappedPropertyName(propertyName, AttributeFormat.STRING, true);
if (!isExact) {
field += tokenized;
field = getMappedPropertyName(propertyName, AttributeFormat.STRING, false);
}
Comment on lines 336 to 338
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

✏️ We can remove this case and pass isExact as the last argument above.

if (isCaseSensitive && !isExact) {
field = resolver.getCaseSensitiveField(field, enabledFeatures);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import static org.hamcrest.Matchers.hasItem;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.RETURNS_DEEP_STUBS;
Expand Down Expand Up @@ -54,7 +55,7 @@

public class DynamicSchemaResolverTest {

private static final int INITIAL_FIELDS_CACHE_COUNT = 5;
private static final int INITIAL_FIELDS_CACHE_COUNT = 3;

private static final ObjectMapper METACARD_TYPE_MAPPER =
MetacardTypeMapperFactory.newObjectMapper();
Expand Down Expand Up @@ -180,7 +181,7 @@ public void testAnyTextFieldPropertyParsing() throws Exception {
// Perform Test
List<String> fields = resolver.anyTextFields().collect(Collectors.toList());
Truth.assertThat(fields)
.containsExactly("metadata_txt", "title_txt", "description_txt", "ext.extracted.text_txt");
.containsExactly("metadata", "title", "description", "ext.extracted.text");
}

@Test
Expand Down Expand Up @@ -301,4 +302,203 @@ public void getFieldExactValue() {
dynamicSchemaResolver.getField("unknown", AttributeFormat.STRING, true, enabledFeatures),
is("unknown_txt"));
}

@Test
public void testAddFieldsWithXmlAttributeNotTokenized() throws Exception {
// Setup - XML attribute that is NOT tokenized
String metacardTypeName = "test";
Set<AttributeDescriptor> attributeDescriptors = new HashSet<>(1);
String attributeName = "metadata";
boolean indexed = true;
boolean stored = true;
boolean tokenized = false;
boolean multiValued = false;

attributeDescriptors.add(
new TestAttributeDescriptorImpl(
attributeName,
attributeName,
indexed,
stored,
tokenized,
multiValued,
BasicTypes.XML_TYPE));

Attribute mockAttribute = mock(Attribute.class);
when(mockAttribute.getValue()).thenReturn("<root>test</root>");
when(mockAttribute.getValues()).thenReturn(Collections.singletonList("<root>test</root>"));

Metacard mockMetacard = mock(Metacard.class, RETURNS_DEEP_STUBS);
when(mockMetacard.getMetacardType().getName()).thenReturn(metacardTypeName);
when(mockMetacard.getMetacardType().getAttributeDescriptors()).thenReturn(attributeDescriptors);
when(mockMetacard.getAttribute(attributeName)).thenReturn(mockAttribute);

SolrInputDocument solrInputDocument = new SolrInputDocument();
DynamicSchemaResolver resolver = new DynamicSchemaResolver();

// Perform Test
resolver.addFields(mockMetacard, solrInputDocument);

// Verify - tokenized=false means the special string index should NOT be populated
assertThat(solrInputDocument.getFieldValue("metadata_txt_tokenized"), is(nullValue()));
}

@Test
public void testAddFieldsWithXmlAttributeTokenized() throws Exception {
// Setup - XML attribute that IS tokenized
String metacardTypeName = "test";
Set<AttributeDescriptor> attributeDescriptors = new HashSet<>(1);
String attributeName = "metadata";
boolean indexed = true;
boolean stored = true;
boolean tokenized = true;
boolean multiValued = false;

attributeDescriptors.add(
new TestAttributeDescriptorImpl(
attributeName,
attributeName,
indexed,
stored,
tokenized,
multiValued,
BasicTypes.XML_TYPE));

Attribute mockAttribute = mock(Attribute.class);
when(mockAttribute.getValue()).thenReturn("<root>test</root>");
when(mockAttribute.getValues()).thenReturn(Collections.singletonList("<root>test</root>"));

Metacard mockMetacard = mock(Metacard.class, RETURNS_DEEP_STUBS);
when(mockMetacard.getMetacardType().getName()).thenReturn(metacardTypeName);
when(mockMetacard.getMetacardType().getAttributeDescriptors()).thenReturn(attributeDescriptors);
when(mockMetacard.getAttribute(attributeName)).thenReturn(mockAttribute);

SolrInputDocument solrInputDocument = new SolrInputDocument();
DynamicSchemaResolver resolver = new DynamicSchemaResolver();

// Perform Test
resolver.addFields(mockMetacard, solrInputDocument);

// Verify - tokenized=true means the special string index SHOULD be populated
assertThat(solrInputDocument.getFieldValue("metadata_txt_tokenized"), notNullValue());
}

@Test
public void testAddFieldsWithStringAttributeNotTokenizedAddsTxtNotTokenizedField()
throws Exception {
// Setup - STRING attribute that is NOT tokenized
String metacardTypeName = "test";
Set<AttributeDescriptor> attributeDescriptors = new HashSet<>(1);
String attributeName = "title";
boolean indexed = true;
boolean stored = true;
boolean tokenized = false;
boolean multiValued = false;

attributeDescriptors.add(
new TestAttributeDescriptorImpl(
attributeName,
attributeName,
indexed,
stored,
tokenized,
multiValued,
BasicTypes.STRING_TYPE));

Attribute mockAttribute = mock(Attribute.class);
when(mockAttribute.getValue()).thenReturn("test value");
when(mockAttribute.getValues()).thenReturn(Collections.singletonList("test value"));

Metacard mockMetacard = mock(Metacard.class, RETURNS_DEEP_STUBS);
when(mockMetacard.getMetacardType().getName()).thenReturn(metacardTypeName);
when(mockMetacard.getMetacardType().getAttributeDescriptors()).thenReturn(attributeDescriptors);
when(mockMetacard.getAttribute(attributeName)).thenReturn(mockAttribute);

SolrInputDocument solrInputDocument = new SolrInputDocument();
DynamicSchemaResolver resolver = new DynamicSchemaResolver();

// Perform Test
resolver.addFields(mockMetacard, solrInputDocument);

// Verify - tokenized=false means the tokenized field should NOT be populated
assertThat(solrInputDocument.getFieldValue("title_txt"), is("test value"));
assertThat(solrInputDocument.getFieldValue("title_txt_tokenized"), is(nullValue()));
}

@Test
public void testAddFieldsWithStringAttributeTokenizedAddsBothTxtAndTokenizedField()
throws Exception {
// Setup - STRING attribute that IS tokenized
String metacardTypeName = "test";
Set<AttributeDescriptor> attributeDescriptors = new HashSet<>(1);
String attributeName = "title";
boolean indexed = true;
boolean stored = true;
boolean tokenized = true;
boolean multiValued = false;

attributeDescriptors.add(
new TestAttributeDescriptorImpl(
attributeName,
attributeName,
indexed,
stored,
tokenized,
multiValued,
BasicTypes.STRING_TYPE));

Attribute mockAttribute = mock(Attribute.class);
when(mockAttribute.getValue()).thenReturn("test value");
when(mockAttribute.getValues()).thenReturn(Collections.singletonList("test value"));

Metacard mockMetacard = mock(Metacard.class, RETURNS_DEEP_STUBS);
when(mockMetacard.getMetacardType().getName()).thenReturn(metacardTypeName);
when(mockMetacard.getMetacardType().getAttributeDescriptors()).thenReturn(attributeDescriptors);
when(mockMetacard.getAttribute(attributeName)).thenReturn(mockAttribute);

SolrInputDocument solrInputDocument = new SolrInputDocument();
DynamicSchemaResolver resolver = new DynamicSchemaResolver();

// Perform Test
resolver.addFields(mockMetacard, solrInputDocument);

// Verify - tokenized=true means both fields SHOULD be populated
assertThat(solrInputDocument.getFieldValue("title_txt"), is("test value"));
assertThat(solrInputDocument.getFieldValue("title_txt_tokenized"), is("test value"));
}

@Test
public void testGetFieldNumericalFallsBackToRequestedSuffix() {
// When no numerical field exists in cache, should fall back to the requested suffix
// fieldsCache is empty for a new resolver
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

✏️ It will have a minimum of three items, but yes it will not have anything for these fields.

assertThat(
dynamicSchemaResolver.getField(
"unknown", AttributeFormat.INTEGER, false, Collections.emptyMap()),
is("unknown_int"));
assertThat(
dynamicSchemaResolver.getField(
"unknown", AttributeFormat.LONG, false, Collections.emptyMap()),
is("unknown_lng"));
assertThat(
dynamicSchemaResolver.getField(
"unknown", AttributeFormat.DOUBLE, false, Collections.emptyMap()),
is("unknown_dbl"));
assertThat(
dynamicSchemaResolver.getField(
"unknown", AttributeFormat.FLOAT, false, Collections.emptyMap()),
is("unknown_flt"));
assertThat(
dynamicSchemaResolver.getField(
"unknown", AttributeFormat.SHORT, false, Collections.emptyMap()),
is("unknown_shr"));
}

@Test
public void testGetFieldXmlReturnsTextPathSuffix() {
// XML format should use TEXT_PATH (_tpt) suffix
assertThat(
dynamicSchemaResolver.getField(
"metadata", AttributeFormat.XML, false, Collections.emptyMap()),
is("metadata_xml_tpt"));
}
}
Loading
Loading