Merge branch 'feature/term_vector' of https://github.com/tukdesk/bleve into tukdesk-feature/term_vector
This commit is contained in:
commit
c1c4941dde
|
@ -62,6 +62,7 @@ type FieldTerms map[string][]string
|
||||||
|
|
||||||
type TermFieldVector struct {
|
type TermFieldVector struct {
|
||||||
Field string
|
Field string
|
||||||
|
ArrayPositions []uint64
|
||||||
Pos uint64
|
Pos uint64
|
||||||
Start uint64
|
Start uint64
|
||||||
End uint64
|
End uint64
|
||||||
|
|
|
@ -260,13 +260,14 @@ func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
|
||||||
|
|
||||||
type TermVector struct {
|
type TermVector struct {
|
||||||
field uint16
|
field uint16
|
||||||
|
arrayPositions []uint64
|
||||||
pos uint64
|
pos uint64
|
||||||
start uint64
|
start uint64
|
||||||
end uint64
|
end uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (tv *TermVector) String() string {
|
func (tv *TermVector) String() string {
|
||||||
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end)
|
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
|
||||||
}
|
}
|
||||||
|
|
||||||
type TermFrequencyRow struct {
|
type TermFrequencyRow struct {
|
||||||
|
@ -319,7 +320,11 @@ func (tfr *TermFrequencyRow) DictionaryRowKey() []byte {
|
||||||
|
|
||||||
func (tfr *TermFrequencyRow) Value() []byte {
|
func (tfr *TermFrequencyRow) Value() []byte {
|
||||||
used := 0
|
used := 0
|
||||||
buf := make([]byte, 8+8+(len(tfr.vectors)*(8+8+8+8)))
|
bufLen := 8 + 8
|
||||||
|
for _, vector := range tfr.vectors {
|
||||||
|
bufLen += 8 + 8 + 8 + 8 + (1+len(vector.arrayPositions))*8
|
||||||
|
}
|
||||||
|
buf := make([]byte, bufLen)
|
||||||
|
|
||||||
used += binary.PutUvarint(buf[used:used+8], tfr.freq)
|
used += binary.PutUvarint(buf[used:used+8], tfr.freq)
|
||||||
|
|
||||||
|
@ -332,6 +337,10 @@ func (tfr *TermFrequencyRow) Value() []byte {
|
||||||
used += binary.PutUvarint(buf[used:used+8], vector.pos)
|
used += binary.PutUvarint(buf[used:used+8], vector.pos)
|
||||||
used += binary.PutUvarint(buf[used:used+8], vector.start)
|
used += binary.PutUvarint(buf[used:used+8], vector.start)
|
||||||
used += binary.PutUvarint(buf[used:used+8], vector.end)
|
used += binary.PutUvarint(buf[used:used+8], vector.end)
|
||||||
|
used += binary.PutUvarint(buf[used:used+8], uint64(len(vector.arrayPositions)))
|
||||||
|
for _, arrayPosition := range vector.arrayPositions {
|
||||||
|
used += binary.PutUvarint(buf[used:used+8], arrayPosition)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return buf[0:used]
|
return buf[0:used]
|
||||||
}
|
}
|
||||||
|
@ -431,6 +440,24 @@ func (tfr *TermFrequencyRow) parseV(value []byte) error {
|
||||||
}
|
}
|
||||||
currOffset += bytesRead
|
currOffset += bytesRead
|
||||||
|
|
||||||
|
var arrayPositionsLen uint64 = 0
|
||||||
|
arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:])
|
||||||
|
if bytesRead <= 0 {
|
||||||
|
return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen")
|
||||||
|
}
|
||||||
|
currOffset += bytesRead
|
||||||
|
|
||||||
|
if arrayPositionsLen > 0 {
|
||||||
|
tv.arrayPositions = make([]uint64, arrayPositionsLen)
|
||||||
|
for i := 0; uint64(i) < arrayPositionsLen; i++ {
|
||||||
|
tv.arrayPositions[i], bytesRead = binary.Uvarint(value[currOffset:])
|
||||||
|
if bytesRead <= 0 {
|
||||||
|
return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i)
|
||||||
|
}
|
||||||
|
currOffset += bytesRead
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
tfr.vectors = append(tfr.vectors, &tv)
|
tfr.vectors = append(tfr.vectors, &tv)
|
||||||
// try to read next record (may not exist)
|
// try to read next record (may not exist)
|
||||||
field, bytesRead = binary.Uvarint(value[currOffset:])
|
field, bytesRead = binary.Uvarint(value[currOffset:])
|
||||||
|
|
|
@ -61,13 +61,19 @@ func TestRows(t *testing.T) {
|
||||||
{
|
{
|
||||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
|
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
|
||||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
||||||
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 2, 23, 31, 0, 3, 43, 51},
|
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
|
||||||
},
|
},
|
||||||
// test larger varints
|
// test larger varints
|
||||||
{
|
{
|
||||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
|
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
|
||||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
||||||
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 150, 17, 23, 31, 0, 3, 43, 51},
|
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
|
||||||
|
},
|
||||||
|
// test vectors with arrayPositions
|
||||||
|
{
|
||||||
|
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
|
||||||
|
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
||||||
|
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
|
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
|
||||||
|
|
|
@ -30,7 +30,7 @@ var VersionKey = []byte{'v'}
|
||||||
|
|
||||||
var UnsafeBatchUseDetected = fmt.Errorf("bleve.Batch is NOT thread-safe, modification after execution detected")
|
var UnsafeBatchUseDetected = fmt.Errorf("bleve.Batch is NOT thread-safe, modification after execution detected")
|
||||||
|
|
||||||
const Version uint8 = 4
|
const Version uint8 = 5
|
||||||
|
|
||||||
var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version)
|
var IncompatibleVersion = fmt.Errorf("incompatible version, %d is supported", Version)
|
||||||
|
|
||||||
|
@ -415,7 +415,7 @@ func (udc *UpsideDownCouch) indexField(docID string, field document.Field, field
|
||||||
for _, tf := range tokenFreqs {
|
for _, tf := range tokenFreqs {
|
||||||
var termFreqRow *TermFrequencyRow
|
var termFreqRow *TermFrequencyRow
|
||||||
if field.Options().IncludeTermVectors() {
|
if field.Options().IncludeTermVectors() {
|
||||||
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
|
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, field.ArrayPositions(), tf)
|
||||||
rows = append(rows, newFieldRows...)
|
rows = append(rows, newFieldRows...)
|
||||||
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
|
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
|
||||||
} else {
|
} else {
|
||||||
|
@ -542,7 +542,7 @@ func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
|
||||||
return len(tf.Locations)
|
return len(tf.Locations)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
|
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, arrayPositions []uint64, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
|
||||||
rv := make([]*TermVector, len(tf.Locations))
|
rv := make([]*TermVector, len(tf.Locations))
|
||||||
newFieldRows := make([]UpsideDownCouchRow, 0)
|
newFieldRows := make([]UpsideDownCouchRow, 0)
|
||||||
|
|
||||||
|
@ -558,6 +558,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
|
||||||
}
|
}
|
||||||
tv := TermVector{
|
tv := TermVector{
|
||||||
field: fieldIndex,
|
field: fieldIndex,
|
||||||
|
arrayPositions: arrayPositions,
|
||||||
pos: uint64(l.Position),
|
pos: uint64(l.Position),
|
||||||
start: uint64(l.Start),
|
start: uint64(l.Start),
|
||||||
end: uint64(l.End),
|
end: uint64(l.End),
|
||||||
|
@ -575,6 +576,7 @@ func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []
|
||||||
fieldName := udc.fieldIndexCache.FieldName(tv.field)
|
fieldName := udc.fieldIndexCache.FieldName(tv.field)
|
||||||
tfv := index.TermFieldVector{
|
tfv := index.TermFieldVector{
|
||||||
Field: fieldName,
|
Field: fieldName,
|
||||||
|
ArrayPositions: tv.arrayPositions,
|
||||||
Pos: tv.pos,
|
Pos: tv.pos,
|
||||||
Start: tv.start,
|
Start: tv.start,
|
||||||
End: tv.end,
|
End: tv.end,
|
||||||
|
|
Loading…
Reference in New Issue
Block a user