0
0

Merge branch 'feature/term_vector' of https://github.com/tukdesk/bleve into tukdesk-feature/term_vector

This commit is contained in:
Marty Schoch 2015-07-29 14:31:15 -04:00
commit c1c4941dde
4 changed files with 59 additions and 23 deletions

View File

@ -61,10 +61,11 @@ type IndexReader interface {
type FieldTerms map[string][]string type FieldTerms map[string][]string
type TermFieldVector struct { type TermFieldVector struct {
Field string Field string
Pos uint64 ArrayPositions []uint64
Start uint64 Pos uint64
End uint64 Start uint64
End uint64
} }
type TermFieldDoc struct { type TermFieldDoc struct {

View File

@ -259,14 +259,15 @@ func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
// TERM FIELD FREQUENCY // TERM FIELD FREQUENCY
type TermVector struct { type TermVector struct {
field uint16 field uint16
pos uint64 arrayPositions []uint64
start uint64 pos uint64
end uint64 start uint64
end uint64
} }
func (tv *TermVector) String() string { func (tv *TermVector) String() string {
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end) return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
} }
type TermFrequencyRow struct { type TermFrequencyRow struct {
@ -319,7 +320,11 @@ func (tfr *TermFrequencyRow) DictionaryRowKey() []byte {
func (tfr *TermFrequencyRow) Value() []byte { func (tfr *TermFrequencyRow) Value() []byte {
used := 0 used := 0
buf := make([]byte, 8+8+(len(tfr.vectors)*(8+8+8+8))) bufLen := 8 + 8
for _, vector := range tfr.vectors {
bufLen += 8 + 8 + 8 + 8 + (1+len(vector.arrayPositions))*8
}
buf := make([]byte, bufLen)
used += binary.PutUvarint(buf[used:used+8], tfr.freq) used += binary.PutUvarint(buf[used:used+8], tfr.freq)
@ -332,6 +337,10 @@ func (tfr *TermFrequencyRow) Value() []byte {
used += binary.PutUvarint(buf[used:used+8], vector.pos) used += binary.PutUvarint(buf[used:used+8], vector.pos)
used += binary.PutUvarint(buf[used:used+8], vector.start) used += binary.PutUvarint(buf[used:used+8], vector.start)
used += binary.PutUvarint(buf[used:used+8], vector.end) used += binary.PutUvarint(buf[used:used+8], vector.end)
used += binary.PutUvarint(buf[used:used+8], uint64(len(vector.arrayPositions)))
for _, arrayPosition := range vector.arrayPositions {
used += binary.PutUvarint(buf[used:used+8], arrayPosition)
}
} }
return buf[0:used] return buf[0:used]
} }
@ -431,6 +440,24 @@ func (tfr *TermFrequencyRow) parseV(value []byte) error {
} }
currOffset += bytesRead currOffset += bytesRead
var arrayPositionsLen uint64 = 0
arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:])
if bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen")
}
currOffset += bytesRead
if arrayPositionsLen > 0 {
tv.arrayPositions = make([]uint64, arrayPositionsLen)
for i := 0; uint64(i) < arrayPositionsLen; i++ {
tv.arrayPositions[i], bytesRead = binary.Uvarint(value[currOffset:])
if bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i)
}
currOffset += bytesRead
}
}
tfr.vectors = append(tfr.vectors, &tv) tfr.vectors = append(tfr.vectors, &tv)
// try to read next record (may not exist) // try to read next record (may not exist)
field, bytesRead = binary.Uvarint(value[currOffset:]) field, bytesRead = binary.Uvarint(value[currOffset:])

View File

@ -61,13 +61,19 @@ func TestRows(t *testing.T) {
{ {
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}), NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 2, 23, 31, 0, 3, 43, 51}, []byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
}, },
// test larger varints // test larger varints
{ {
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}), NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, []byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 150, 17, 23, 31, 0, 3, 43, 51}, []byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
},
// test vectors with arrayPositions
{
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
}, },
{ {
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil), NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),

View File

@ -30,7 +30,7 @@ var VersionKey = []byte{'v'}
var UnsafeBatchUseDetected = fmt.Errorf("bleve.Batch is NOT thread-safe, modification after execution detected") var UnsafeBatchUseDetected = fmt.Errorf("bleve.Batch is NOT thread-safe, modification after execution detected")
const Version uint8 = 4 const Version uint8 = 5
var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version) var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version)
@ -415,7 +415,7 @@ func (udc *UpsideDownCouch) indexField(docID string, field document.Field, field
for _, tf := range tokenFreqs { for _, tf := range tokenFreqs {
var termFreqRow *TermFrequencyRow var termFreqRow *TermFrequencyRow
if field.Options().IncludeTermVectors() { if field.Options().IncludeTermVectors() {
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf) tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, field.ArrayPositions(), tf)
rows = append(rows, newFieldRows...) rows = append(rows, newFieldRows...)
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv) termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
} else { } else {
@ -542,7 +542,7 @@ func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
return len(tf.Locations) return len(tf.Locations)
} }
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) { func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, arrayPositions []uint64, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
rv := make([]*TermVector, len(tf.Locations)) rv := make([]*TermVector, len(tf.Locations))
newFieldRows := make([]UpsideDownCouchRow, 0) newFieldRows := make([]UpsideDownCouchRow, 0)
@ -557,10 +557,11 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
} }
} }
tv := TermVector{ tv := TermVector{
field: fieldIndex, field: fieldIndex,
pos: uint64(l.Position), arrayPositions: arrayPositions,
start: uint64(l.Start), pos: uint64(l.Position),
end: uint64(l.End), start: uint64(l.Start),
end: uint64(l.End),
} }
rv[i] = &tv rv[i] = &tv
} }
@ -574,10 +575,11 @@ func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []
for i, tv := range in { for i, tv := range in {
fieldName := udc.fieldIndexCache.FieldName(tv.field) fieldName := udc.fieldIndexCache.FieldName(tv.field)
tfv := index.TermFieldVector{ tfv := index.TermFieldVector{
Field: fieldName, Field: fieldName,
Pos: tv.pos, ArrayPositions: tv.arrayPositions,
Start: tv.start, Pos: tv.pos,
End: tv.end, Start: tv.start,
End: tv.end,
} }
rv[i] = &tfv rv[i] = &tfv
} }