From 89dc2c22bc71426554fbcc81fe6c6046bbd9258e Mon Sep 17 00:00:00 2001 From: dtynn Date: Sun, 17 May 2015 13:07:14 +0800 Subject: [PATCH 1/2] update TermVector --- index/index.go | 9 ++++---- index/upside_down/row.go | 39 +++++++++++++++++++++++++++----- index/upside_down/row_test.go | 10 ++++++-- index/upside_down/upside_down.go | 22 ++++++++++-------- 4 files changed, 58 insertions(+), 22 deletions(-) diff --git a/index/index.go b/index/index.go index c0040ecd..40c35d81 100644 --- a/index/index.go +++ b/index/index.go @@ -61,10 +61,11 @@ type IndexReader interface { type FieldTerms map[string][]string type TermFieldVector struct { - Field string - Pos uint64 - Start uint64 - End uint64 + Field string + ArrayPositions []uint64 + Pos uint64 + Start uint64 + End uint64 } type TermFieldDoc struct { diff --git a/index/upside_down/row.go b/index/upside_down/row.go index 1e3bb942..44f28766 100644 --- a/index/upside_down/row.go +++ b/index/upside_down/row.go @@ -259,14 +259,15 @@ func (dr *DictionaryRow) parseDictionaryV(value []byte) error { // TERM FIELD FREQUENCY type TermVector struct { - field uint16 - pos uint64 - start uint64 - end uint64 + field uint16 + arrayPositions []uint64 + pos uint64 + start uint64 + end uint64 } func (tv *TermVector) String() string { - return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end) + return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions) } type TermFrequencyRow struct { @@ -319,7 +320,11 @@ func (tfr *TermFrequencyRow) DictionaryRowKey() []byte { func (tfr *TermFrequencyRow) Value() []byte { used := 0 - buf := make([]byte, 8+8+(len(tfr.vectors)*(8+8+8+8))) + bufLen := 8 + 8 + for _, vector := range tfr.vectors { + bufLen += 8 + 8 + 8 + 8 + (1+len(vector.arrayPositions))*8 + } + buf := make([]byte, bufLen) used += binary.PutUvarint(buf[used:used+8], tfr.freq) @@ -332,6 +337,10 @@ func (tfr *TermFrequencyRow) Value() []byte { used += binary.PutUvarint(buf[used:used+8], vector.pos) used += binary.PutUvarint(buf[used:used+8], vector.start) used += binary.PutUvarint(buf[used:used+8], vector.end) + used += binary.PutUvarint(buf[used:used+8], uint64(len(vector.arrayPositions))) + for _, arrayPosition := range vector.arrayPositions { + used += binary.PutUvarint(buf[used:used+8], arrayPosition) + } } return buf[0:used] } @@ -431,6 +440,24 @@ func (tfr *TermFrequencyRow) parseV(value []byte) error { } currOffset += bytesRead + var arrayPositionsLen uint64 = 0 + arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:]) + if bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen") + } + currOffset += bytesRead + + if arrayPositionsLen > 0 { + tv.arrayPositions = make([]uint64, arrayPositionsLen) + for i := 0; uint64(i) < arrayPositionsLen; i++ { + tv.arrayPositions[i], bytesRead = binary.Uvarint(value[currOffset:]) + if bytesRead <= 0 { + return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i) + } + currOffset += bytesRead + } + } + tfr.vectors = append(tfr.vectors, &tv) // try to read next record (may not exist) field, bytesRead = binary.Uvarint(value[currOffset:]) diff --git a/index/upside_down/row_test.go b/index/upside_down/row_test.go index b7ce53c9..dc8c6a02 100644 --- a/index/upside_down/row_test.go +++ b/index/upside_down/row_test.go @@ -61,13 +61,19 @@ func TestRows(t *testing.T) { { NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}), []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, - []byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 2, 23, 31, 0, 3, 43, 51}, + []byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0}, }, // test larger varints { NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}), []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, - []byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 150, 17, 23, 31, 0, 3, 43, 51}, + []byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0}, + }, + // test vectors with arrayPositions + { + NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}), + []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5}, }, { NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil), diff --git a/index/upside_down/upside_down.go b/index/upside_down/upside_down.go index acaa423a..f82947ff 100644 --- a/index/upside_down/upside_down.go +++ b/index/upside_down/upside_down.go @@ -415,7 +415,7 @@ func (udc *UpsideDownCouch) indexField(docID string, field document.Field, field for _, tf := range tokenFreqs { var termFreqRow *TermFrequencyRow if field.Options().IncludeTermVectors() { - tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf) + tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, field.ArrayPositions(), tf) rows = append(rows, newFieldRows...) termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv) } else { @@ -542,7 +542,7 @@ func frequencyFromTokenFreq(tf *analysis.TokenFreq) int { return len(tf.Locations) } -func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) { +func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, arrayPositions []uint64, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) { rv := make([]*TermVector, len(tf.Locations)) newFieldRows := make([]UpsideDownCouchRow, 0) @@ -557,10 +557,11 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis. } } tv := TermVector{ - field: fieldIndex, - pos: uint64(l.Position), - start: uint64(l.Start), - end: uint64(l.End), + field: fieldIndex, + arrayPositions: arrayPositions, + pos: uint64(l.Position), + start: uint64(l.Start), + end: uint64(l.End), } rv[i] = &tv } @@ -574,10 +575,11 @@ func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) [] for i, tv := range in { fieldName := udc.fieldIndexCache.FieldName(tv.field) tfv := index.TermFieldVector{ - Field: fieldName, - Pos: tv.pos, - Start: tv.start, - End: tv.end, + Field: fieldName, + ArrayPositions: tv.arrayPositions, + Pos: tv.pos, + Start: tv.start, + End: tv.end, } rv[i] = &tfv } From b4f74960310312739922a6da36d2c77c779bfcff Mon Sep 17 00:00:00 2001 From: dtynn Date: Mon, 18 May 2015 15:16:35 +0800 Subject: [PATCH 2/2] update the index format version number --- index/upside_down/upside_down.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/upside_down/upside_down.go b/index/upside_down/upside_down.go index f82947ff..7e7cbe27 100644 --- a/index/upside_down/upside_down.go +++ b/index/upside_down/upside_down.go @@ -30,7 +30,7 @@ var VersionKey = []byte{'v'} var UnsafeBatchUseDetected = fmt.Errorf("bleve.Batch is NOT thread-safe, modification after execution detected") -const Version uint8 = 4 +const Version uint8 = 5 var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version)