fix indexing bug with data coming from arrays

fixes #295
2015-12-21 14:59:32 -05:00 · 2015-12-21 14:59:32 -05:00 · 8efbd556a3
parent 7bb58e1be4
commit 8efbd556a3
8 changed files with 240 additions and 27 deletions
--- a/index/firestorm/analysis.go
+++ b/index/firestorm/analysis.go
@ -24,34 +24,55 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
 		Rows:  make([]index.IndexRow, 0, 100),
 	}

+	// information we collate as we merge fields with same name
+	fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
+	fieldLengths := make(map[uint16]int)
+	fieldIncludeTermVectors := make(map[uint16]bool)
+	fieldNames := make(map[uint16]string)
+
 	for _, field := range d.Fields {
 		fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
 		if newFieldRow != nil {
 			rv.Rows = append(rv.Rows, newFieldRow)
 		}
+		fieldNames[fieldIndex] = field.Name()

 		// add the _id row
 		rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, []byte(d.ID), d.Number, 0, 0, nil))

 		if field.Options().IsIndexed() {
-
 			fieldLength, tokenFreqs := field.Analyze()
-
-			// see if any of the composite fields need this
-			for _, compositeField := range d.CompositeFields {
-				compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
+			existingFreqs := fieldTermFreqs[fieldIndex]
+			if existingFreqs == nil {
+				fieldTermFreqs[fieldIndex] = tokenFreqs
+			} else {
+				existingFreqs.MergeAll(field.Name(), tokenFreqs)
+				fieldTermFreqs[fieldIndex] = existingFreqs
 			}
-
-			// encode this field
-			indexRows := f.indexField(d.ID, d.Number, field, fieldIndex, fieldLength, tokenFreqs)
-			rv.Rows = append(rv.Rows, indexRows...)
+			fieldLengths[fieldIndex] += fieldLength
+			fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
 		}

 		if field.Options().IsStored() {
 			storeRow := f.storeField(d.ID, d.Number, field, fieldIndex)
 			rv.Rows = append(rv.Rows, storeRow)
 		}
+	}

+	// walk through the collated information and process
+	// once for each indexed field (unique name)
+	for fieldIndex, tokenFreqs := range fieldTermFreqs {
+		fieldLength := fieldLengths[fieldIndex]
+		includeTermVectors := fieldIncludeTermVectors[fieldIndex]
+
+		// see if any of the composite fields need this
+		for _, compositeField := range d.CompositeFields {
+			compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
+		}
+
+		// encode this field
+		indexRows := f.indexField(d.ID, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
+		rv.Rows = append(rv.Rows, indexRows...)
 	}

 	// now index the composite fields
@ -63,7 +84,7 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
 		if compositeField.Options().IsIndexed() {
 			fieldLength, tokenFreqs := compositeField.Analyze()
 			// encode this field
-			indexRows := f.indexField(d.ID, d.Number, compositeField, fieldIndex, fieldLength, tokenFreqs)
+			indexRows := f.indexField(d.ID, d.Number, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
 			rv.Rows = append(rv.Rows, indexRows...)
 		}
 	}
@ -71,14 +92,14 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
 	return rv
 }

-func (f *Firestorm) indexField(docID string, docNum uint64, field document.Field, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow {
+func (f *Firestorm) indexField(docID string, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow {

 	rows := make([]index.IndexRow, 0, 100)
 	fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))

 	for _, tf := range tokenFreqs {
 		var termFreqRow *TermFreqRow
-		if field.Options().IncludeTermVectors() {
+		if includeTermVectors {
 			tv, newFieldRows := f.termVectorsFromTokenFreq(fieldIndex, tf)
 			rows = append(rows, newFieldRows...)
 			termFreqRow = NewTermFreqRow(fieldIndex, tf.Term, []byte(docID), docNum, uint64(tf.Frequency()), fieldNorm, tv)
--- a/index/firestorm/analysis_test.go
+++ b/index/firestorm/analysis_test.go
@ -80,8 +80,8 @@ func TestAnalysis(t *testing.T) {
 				Rows: []index.IndexRow{
 					NewFieldRow(1, "name"),
 					NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
-					NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
 					NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")),
+					NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
 				},
 			},
 		},
--- a/index/upside_down/analysis.go
+++ b/index/upside_down/analysis.go
@ -10,6 +10,7 @@
 package upside_down

 import (
+	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/document"
 	"github.com/blevesearch/bleve/index"
 )
@ -24,25 +25,34 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 	backIndexTermEntries := make([]*BackIndexTermEntry, 0)
 	backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)

+	// information we collate as we merge fields with same name
+	fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
+	fieldLengths := make(map[uint16]int)
+	fieldIncludeTermVectors := make(map[uint16]bool)
+	fieldNames := make(map[uint16]string)
+
+	// walk all the fields, record stored fields now
+	// place information about indexed fields into map
+	// this collates information across fields with
+	// same names (arrays)
 	for _, field := range d.Fields {
 		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
 		if newFieldRow != nil {
 			rv.Rows = append(rv.Rows, newFieldRow)
 		}
+		fieldNames[fieldIndex] = field.Name()

 		if field.Options().IsIndexed() {
-
 			fieldLength, tokenFreqs := field.Analyze()
-
-			// see if any of the composite fields need this
-			for _, compositeField := range d.CompositeFields {
-				compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
+			existingFreqs := fieldTermFreqs[fieldIndex]
+			if existingFreqs == nil {
+				fieldTermFreqs[fieldIndex] = tokenFreqs
+			} else {
+				existingFreqs.MergeAll(field.Name(), tokenFreqs)
+				fieldTermFreqs[fieldIndex] = existingFreqs
 			}
-
-			// encode this field
-			indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, field, fieldIndex, fieldLength, tokenFreqs)
-			rv.Rows = append(rv.Rows, indexRows...)
-			backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
+			fieldLengths[fieldIndex] += fieldLength
+			fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
 		}

 		if field.Options().IsStored() {
@ -53,6 +63,23 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult

 	}

+	// walk through the collated information and process
+	// once for each indexed field (unique name)
+	for fieldIndex, tokenFreqs := range fieldTermFreqs {
+		fieldLength := fieldLengths[fieldIndex]
+		includeTermVectors := fieldIncludeTermVectors[fieldIndex]
+
+		// see if any of the composite fields need this
+		for _, compositeField := range d.CompositeFields {
+			compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
+		}
+
+		// encode this field
+		indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
+		rv.Rows = append(rv.Rows, indexRows...)
+		backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
+	}
+
 	// now index the composite fields
 	for _, compositeField := range d.CompositeFields {
 		fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(compositeField.Name())
@ -62,7 +89,7 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
 		if compositeField.Options().IsIndexed() {
 			fieldLength, tokenFreqs := compositeField.Analyze()
 			// encode this field
-			indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, compositeField, fieldIndex, fieldLength, tokenFreqs)
+			indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
 			rv.Rows = append(rv.Rows, indexRows...)
 			backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
 		}
--- a/index/upside_down/upside_down.go
+++ b/index/upside_down/upside_down.go
@ -502,7 +502,7 @@ func encodeFieldType(f document.Field) byte {
 	return fieldType
 }

-func (udc *UpsideDownCouch) indexField(docID string, field document.Field, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) ([]index.IndexRow, []*BackIndexTermEntry) {
+func (udc *UpsideDownCouch) indexField(docID string, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) ([]index.IndexRow, []*BackIndexTermEntry) {

 	rows := make([]index.IndexRow, 0, 100)
 	backIndexTermEntries := make([]*BackIndexTermEntry, 0, len(tokenFreqs))
@ -510,7 +510,7 @@ func (udc *UpsideDownCouch) indexField(docID string, field document.Field, field

 	for k, tf := range tokenFreqs {
 		var termFreqRow *TermFrequencyRow
-		if field.Options().IncludeTermVectors() {
+		if includeTermVectors {
 			tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
 			rows = append(rows, newFieldRows...)
 			termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
--- a/index_test.go
+++ b/index_test.go
@ -22,8 +22,9 @@ import (
 	"time"

 	"encoding/json"
-	"github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer"
 	"strconv"
+
+	"github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer"
 )

 func TestCrud(t *testing.T) {
@ -1229,3 +1230,84 @@ func TestDateTimeFieldMappingIssue287(t *testing.T) {
 		t.Fatal(err)
 	}
 }
+
+func TestDocumentFieldArrayPositionsBug295(t *testing.T) {
+	defer func() {
+		err := os.RemoveAll("testidx")
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	index, err := New("testidx", NewIndexMapping())
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// index a document with an array of strings
+	err = index.Index("k", struct {
+		Messages []string
+		Another  string
+		MoreData []string
+	}{
+		Messages: []string{
+			"bleve",
+			"bleve",
+		},
+		Another: "text",
+		MoreData: []string{
+			"a",
+			"b",
+			"c",
+			"bleve",
+		},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// search for it in the messages field
+	tq := NewTermQuery("bleve").SetField("Messages")
+	tsr := NewSearchRequest(tq)
+	results, err := index.Search(tsr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if results.Total != 1 {
+		t.Fatalf("expected 1 result, got %d", results.Total)
+	}
+	if len(results.Hits[0].Locations["Messages"]["bleve"]) != 2 {
+		t.Fatalf("expected 2 locations of 'bleve', got %d", len(results.Hits[0].Locations["Messages"]["bleve"]))
+	}
+	if results.Hits[0].Locations["Messages"]["bleve"][0].ArrayPositions[0] != 0 {
+		t.Errorf("expected array position to be 0")
+	}
+	if results.Hits[0].Locations["Messages"]["bleve"][1].ArrayPositions[0] != 1 {
+		t.Errorf("expected array position to be 1")
+	}
+
+	// search for it again, this time without restricting the query to a field
+	tq = NewTermQuery("bleve")
+	tsr = NewSearchRequest(tq)
+	results, err = index.Search(tsr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if results.Total != 1 {
+		t.Fatalf("expected 1 result, got %d", results.Total)
+	}
+	if len(results.Hits[0].Locations["Messages"]["bleve"]) != 2 {
+		t.Fatalf("expected 2 locations of 'bleve', got %d", len(results.Hits[0].Locations["Messages"]["bleve"]))
+	}
+	if results.Hits[0].Locations["Messages"]["bleve"][0].ArrayPositions[0] != 0 {
+		t.Errorf("expected array position to be 0")
+	}
+	if results.Hits[0].Locations["Messages"]["bleve"][1].ArrayPositions[0] != 1 {
+		t.Errorf("expected array position to be 1")
+	}
+
+	err = index.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+}
--- a/test/tests/employee/data/emp10508560.json
+++ b/test/tests/employee/data/emp10508560.json
@ -0,0 +1,44 @@
+{
+  "salary": 104561.8,
+  "_type": "emp",
+  "name": "Deirdre Reed",
+  "mutated": 0,
+  "is_manager": true,
+  "dept": "Accounts",
+  "join_date": "2003-05-28T21:29:00",
+  "manages": {
+    "team_size": 9,
+    "reports": [
+      "Gallia Julián",
+      "Duvessa Nicolás",
+      "Beryl Thomas",
+      "Deirdre Julián",
+      "Antonia Gerónimo",
+      "Ciara Young",
+      "Riona Richardson IX",
+      "Severin Jr.",
+      "Perdita Morgan"
+    ]
+  },
+  "languages_known": [
+    "English",
+    "Spanish",
+    "German",
+    "Italian",
+    "French",
+    "Arabic",
+    "Africans",
+    "Hindi",
+    "Vietnamese",
+    "Urdu",
+    "Dutch",
+    "Quechua",
+    "Japanese",
+    "Chinese",
+    "Nepalese",
+    "Thai",
+    "Malay"
+  ],
+  "emp_id": "10508560",
+  "email": "deirdre@mcdiabetes.com"
+}
--- a/test/tests/employee/mapping.json
+++ b/test/tests/employee/mapping.json
@ -0,0 +1 @@
+{}
--- a/test/tests/employee/searches.json
+++ b/test/tests/employee/searches.json
@ -0,0 +1,38 @@
+[
+	{
+		"search": {
+			"from": 0,
+			"size": 10,
+			"query": {
+				"field": "manages.reports",
+				"term": "julián"
+			}
+		},
+		"result": {
+			"total_hits": 1,
+			"hits": [
+				{
+					"id": "emp10508560",
+					"locations": {
+						"manages.reports": {
+							"julián": [
+								{
+									"pos": 2,
+									"start": 7,
+									"end": 14,
+									"array_positions":[0]
+								},
+								{
+									"pos": 2,
+									"start": 8,
+									"end": 15,
+									"array_positions":[3]
+								}
+							]
+						}
+					}
+				}
+			]
+		}
+	}
+]