0
0
Fork 0

fix indexing bug with data coming from arrays

fixes #295
This commit is contained in:
Marty Schoch 2015-12-21 14:59:32 -05:00
parent 7bb58e1be4
commit 8efbd556a3
8 changed files with 240 additions and 27 deletions

View File

@ -24,34 +24,55 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
Rows: make([]index.IndexRow, 0, 100),
}
// information we collate as we merge fields with same name
fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
fieldLengths := make(map[uint16]int)
fieldIncludeTermVectors := make(map[uint16]bool)
fieldNames := make(map[uint16]string)
for _, field := range d.Fields {
fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
if newFieldRow != nil {
rv.Rows = append(rv.Rows, newFieldRow)
}
fieldNames[fieldIndex] = field.Name()
// add the _id row
rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, []byte(d.ID), d.Number, 0, 0, nil))
if field.Options().IsIndexed() {
fieldLength, tokenFreqs := field.Analyze()
// see if any of the composite fields need this
for _, compositeField := range d.CompositeFields {
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
existingFreqs := fieldTermFreqs[fieldIndex]
if existingFreqs == nil {
fieldTermFreqs[fieldIndex] = tokenFreqs
} else {
existingFreqs.MergeAll(field.Name(), tokenFreqs)
fieldTermFreqs[fieldIndex] = existingFreqs
}
// encode this field
indexRows := f.indexField(d.ID, d.Number, field, fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
fieldLengths[fieldIndex] += fieldLength
fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
}
if field.Options().IsStored() {
storeRow := f.storeField(d.ID, d.Number, field, fieldIndex)
rv.Rows = append(rv.Rows, storeRow)
}
}
// walk through the collated information and process
// once for each indexed field (unique name)
for fieldIndex, tokenFreqs := range fieldTermFreqs {
fieldLength := fieldLengths[fieldIndex]
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
// see if any of the composite fields need this
for _, compositeField := range d.CompositeFields {
compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
}
// encode this field
indexRows := f.indexField(d.ID, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
}
// now index the composite fields
@ -63,7 +84,7 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
if compositeField.Options().IsIndexed() {
fieldLength, tokenFreqs := compositeField.Analyze()
// encode this field
indexRows := f.indexField(d.ID, d.Number, compositeField, fieldIndex, fieldLength, tokenFreqs)
indexRows := f.indexField(d.ID, d.Number, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
}
}
@ -71,14 +92,14 @@ func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
return rv
}
func (f *Firestorm) indexField(docID string, docNum uint64, field document.Field, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow {
func (f *Firestorm) indexField(docID string, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) []index.IndexRow {
rows := make([]index.IndexRow, 0, 100)
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
for _, tf := range tokenFreqs {
var termFreqRow *TermFreqRow
if field.Options().IncludeTermVectors() {
if includeTermVectors {
tv, newFieldRows := f.termVectorsFromTokenFreq(fieldIndex, tf)
rows = append(rows, newFieldRows...)
termFreqRow = NewTermFreqRow(fieldIndex, tf.Term, []byte(docID), docNum, uint64(tf.Frequency()), fieldNorm, tv)

View File

@ -80,8 +80,8 @@ func TestAnalysis(t *testing.T) {
Rows: []index.IndexRow{
NewFieldRow(1, "name"),
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")),
NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
},
},
},

View File

@ -10,6 +10,7 @@
package upside_down
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
)
@ -24,25 +25,34 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
backIndexTermEntries := make([]*BackIndexTermEntry, 0)
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
// information we collate as we merge fields with same name
fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
fieldLengths := make(map[uint16]int)
fieldIncludeTermVectors := make(map[uint16]bool)
fieldNames := make(map[uint16]string)
// walk all the fields, record stored fields now
// place information about indexed fields into map
// this collates information across fields with
// same names (arrays)
for _, field := range d.Fields {
fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
if newFieldRow != nil {
rv.Rows = append(rv.Rows, newFieldRow)
}
fieldNames[fieldIndex] = field.Name()
if field.Options().IsIndexed() {
fieldLength, tokenFreqs := field.Analyze()
// see if any of the composite fields need this
for _, compositeField := range d.CompositeFields {
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
existingFreqs := fieldTermFreqs[fieldIndex]
if existingFreqs == nil {
fieldTermFreqs[fieldIndex] = tokenFreqs
} else {
existingFreqs.MergeAll(field.Name(), tokenFreqs)
fieldTermFreqs[fieldIndex] = existingFreqs
}
// encode this field
indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, field, fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
fieldLengths[fieldIndex] += fieldLength
fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
}
if field.Options().IsStored() {
@ -53,6 +63,23 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
}
// walk through the collated information and process
// once for each indexed field (unique name)
for fieldIndex, tokenFreqs := range fieldTermFreqs {
fieldLength := fieldLengths[fieldIndex]
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
// see if any of the composite fields need this
for _, compositeField := range d.CompositeFields {
compositeField.Compose(fieldNames[fieldIndex], fieldLength, tokenFreqs)
}
// encode this field
indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, includeTermVectors, fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
}
// now index the composite fields
for _, compositeField := range d.CompositeFields {
fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(compositeField.Name())
@ -62,7 +89,7 @@ func (udc *UpsideDownCouch) Analyze(d *document.Document) *index.AnalysisResult
if compositeField.Options().IsIndexed() {
fieldLength, tokenFreqs := compositeField.Analyze()
// encode this field
indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, compositeField, fieldIndex, fieldLength, tokenFreqs)
indexRows, indexBackIndexTermEntries := udc.indexField(d.ID, compositeField.Options().IncludeTermVectors(), fieldIndex, fieldLength, tokenFreqs)
rv.Rows = append(rv.Rows, indexRows...)
backIndexTermEntries = append(backIndexTermEntries, indexBackIndexTermEntries...)
}

View File

@ -502,7 +502,7 @@ func encodeFieldType(f document.Field) byte {
return fieldType
}
func (udc *UpsideDownCouch) indexField(docID string, field document.Field, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) ([]index.IndexRow, []*BackIndexTermEntry) {
func (udc *UpsideDownCouch) indexField(docID string, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies) ([]index.IndexRow, []*BackIndexTermEntry) {
rows := make([]index.IndexRow, 0, 100)
backIndexTermEntries := make([]*BackIndexTermEntry, 0, len(tokenFreqs))
@ -510,7 +510,7 @@ func (udc *UpsideDownCouch) indexField(docID string, field document.Field, field
for k, tf := range tokenFreqs {
var termFreqRow *TermFrequencyRow
if field.Options().IncludeTermVectors() {
if includeTermVectors {
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
rows = append(rows, newFieldRows...)
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)

View File

@ -22,8 +22,9 @@ import (
"time"
"encoding/json"
"github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer"
"strconv"
"github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer"
)
func TestCrud(t *testing.T) {
@ -1229,3 +1230,84 @@ func TestDateTimeFieldMappingIssue287(t *testing.T) {
t.Fatal(err)
}
}
// TestDocumentFieldArrayPositionsBug295 is the regression test for issue #295.
// It indexes a single document whose Messages field is an array holding the
// term "bleve" twice, then checks that a term search reports exactly one
// result with two "bleve" locations under Messages, carrying array positions
// 0 and 1. The same check is run twice: once with the query restricted to the
// Messages field, and once with no field set on the query.
func TestDocumentFieldArrayPositionsBug295(t *testing.T) {
	// Registered first so that it runs last, after the index has been closed.
	defer func() {
		err := os.RemoveAll("testidx")
		if err != nil {
			t.Fatal(err)
		}
	}()

	index, err := New("testidx", NewIndexMapping())
	if err != nil {
		t.Fatal(err)
	}
	// Close the index on every exit path (including early t.Fatal calls) so
	// the directory is never removed while the index is still open.
	defer func() {
		err := index.Close()
		if err != nil {
			t.Fatal(err)
		}
	}()

	// index a document with an array of strings
	err = index.Index("k", struct {
		Messages []string
		Another  string
		MoreData []string
	}{
		Messages: []string{
			"bleve",
			"bleve",
		},
		Another: "text",
		MoreData: []string{
			"a",
			"b",
			"c",
			"bleve",
		},
	})
	if err != nil {
		t.Fatal(err)
	}

	// search for it in the messages field
	tq := NewTermQuery("bleve").SetField("Messages")
	tsr := NewSearchRequest(tq)
	results, err := index.Search(tsr)
	if err != nil {
		t.Fatal(err)
	}

	// checkResults validates whatever the captured `results` variable holds
	// at the time of the call; label identifies the search in failure output.
	// Lengths are verified before indexing so that a regression is reported
	// as a test failure rather than an index-out-of-range panic.
	checkResults := func(label string) {
		if results.Total != 1 {
			t.Fatalf("%s: expected 1 result, got %d", label, results.Total)
		}
		if len(results.Hits) < 1 {
			t.Fatalf("%s: expected at least 1 hit, got %d", label, len(results.Hits))
		}
		locations := results.Hits[0].Locations["Messages"]["bleve"]
		if len(locations) != 2 {
			t.Fatalf("%s: expected 2 locations of 'bleve', got %d", label, len(locations))
		}
		if len(locations[0].ArrayPositions) < 1 || locations[0].ArrayPositions[0] != 0 {
			t.Errorf("%s: expected array position to be 0", label)
		}
		if len(locations[1].ArrayPositions) < 1 || locations[1].ArrayPositions[0] != 1 {
			t.Errorf("%s: expected array position to be 1", label)
		}
	}
	checkResults("field search")

	// search for it in all
	tq = NewTermQuery("bleve")
	tsr = NewSearchRequest(tq)
	results, err = index.Search(tsr)
	if err != nil {
		t.Fatal(err)
	}
	checkResults("search without field")
}

View File

@ -0,0 +1,44 @@
{
"salary": 104561.8,
"_type": "emp",
"name": "Deirdre Reed",
"mutated": 0,
"is_manager": true,
"dept": "Accounts",
"join_date": "2003-05-28T21:29:00",
"manages": {
"team_size": 9,
"reports": [
"Gallia Julián",
"Duvessa Nicolás",
"Beryl Thomas",
"Deirdre Julián",
"Antonia Gerónimo",
"Ciara Young",
"Riona Richardson IX",
"Severin Jr.",
"Perdita Morgan"
]
},
"languages_known": [
"English",
"Spanish",
"German",
"Italian",
"French",
"Arabic",
"Africans",
"Hindi",
"Vietnamese",
"Urdu",
"Dutch",
"Quechua",
"Japanese",
"Chinese",
"Nepalese",
"Thai",
"Malay"
],
"emp_id": "10508560",
"email": "deirdre@mcdiabetes.com"
}

View File

@ -0,0 +1 @@
{}

View File

@ -0,0 +1,38 @@
[
{
"search": {
"from": 0,
"size": 10,
"query": {
"field": "manages.reports",
"term": "julián"
}
},
"result": {
"total_hits": 1,
"hits": [
{
"id": "emp10508560",
"locations": {
"manages.reports": {
"julián": [
{
"pos": 2,
"start": 7,
"end": 14,
"array_positions":[0]
},
{
"pos": 2,
"start": 8,
"end": 15,
"array_positions":[3]
}
]
}
}
}
]
}
}
]