update TermVector
This commit is contained in:
parent
8f70def63b
commit
89dc2c22bc
|
@ -61,10 +61,11 @@ type IndexReader interface {
|
|||
type FieldTerms map[string][]string
|
||||
|
||||
// TermFieldVector is the exported description of a single term occurrence,
// identified by field name (the internal TermVector carries a numeric field
// index instead, which is resolved to a name when this struct is built).
//
// Pos, Start and End are copied from the internal vector's pos/start/end;
// ArrayPositions is copied from its arrayPositions.
//
// NOTE(review): this is a rendered diff hunk (-61,10 +61,11). The first
// Field/Pos/Start/End group appears to be the pre-commit field list and the
// second group, which adds ArrayPositions, the post-commit one.
type TermFieldVector struct {
|
||||
Field string
|
||||
Pos uint64
|
||||
Start uint64
|
||||
End uint64
|
||||
Field string
|
||||
ArrayPositions []uint64
|
||||
Pos uint64
|
||||
Start uint64
|
||||
End uint64
|
||||
}
|
||||
|
||||
type TermFieldDoc struct {
|
||||
|
|
|
@ -259,14 +259,15 @@ func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
|
|||
// TERM FIELD FREQUENCY
|
||||
|
||||
// TermVector is the row-level record of one occurrence of a term.
//
// field is the numeric field index; pos, start and end are filled from the
// token location's Position, Start and End; arrayPositions is filled from
// the indexed field's ArrayPositions().
//
// In TermFrequencyRow.Value each vector's pos, start and end are written as
// uvarints, followed by the number of arrayPositions and then each array
// position, also as uvarints.
//
// NOTE(review): this is a rendered diff hunk (-259,14 +259,15). The first
// field/pos/start/end group appears to be the pre-commit field list and the
// second group, which adds arrayPositions, the post-commit one.
type TermVector struct {
|
||||
field uint16
|
||||
pos uint64
|
||||
start uint64
|
||||
end uint64
|
||||
field uint16
|
||||
arrayPositions []uint64
|
||||
pos uint64
|
||||
start uint64
|
||||
end uint64
|
||||
}
|
||||
|
||||
// String returns a human-readable one-line rendering of the vector's fields.
//
// NOTE(review): this is a rendered diff hunk, so two return statements are
// shown. The first appears to be the pre-commit line; the second additionally
// prints arrayPositions using the Go-syntax verb %#v. In both format strings
// "End" is not followed by a colon, unlike the other labels.
func (tv *TermVector) String() string {
|
||||
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end)
|
||||
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
|
||||
}
|
||||
|
||||
type TermFrequencyRow struct {
|
||||
|
@ -319,7 +320,11 @@ func (tfr *TermFrequencyRow) DictionaryRowKey() []byte {
|
|||
|
||||
func (tfr *TermFrequencyRow) Value() []byte {
|
||||
used := 0
|
||||
buf := make([]byte, 8+8+(len(tfr.vectors)*(8+8+8+8)))
|
||||
bufLen := 8 + 8
|
||||
for _, vector := range tfr.vectors {
|
||||
bufLen += 8 + 8 + 8 + 8 + (1+len(vector.arrayPositions))*8
|
||||
}
|
||||
buf := make([]byte, bufLen)
|
||||
|
||||
used += binary.PutUvarint(buf[used:used+8], tfr.freq)
|
||||
|
||||
|
@ -332,6 +337,10 @@ func (tfr *TermFrequencyRow) Value() []byte {
|
|||
used += binary.PutUvarint(buf[used:used+8], vector.pos)
|
||||
used += binary.PutUvarint(buf[used:used+8], vector.start)
|
||||
used += binary.PutUvarint(buf[used:used+8], vector.end)
|
||||
used += binary.PutUvarint(buf[used:used+8], uint64(len(vector.arrayPositions)))
|
||||
for _, arrayPosition := range vector.arrayPositions {
|
||||
used += binary.PutUvarint(buf[used:used+8], arrayPosition)
|
||||
}
|
||||
}
|
||||
return buf[0:used]
|
||||
}
|
||||
|
@ -431,6 +440,24 @@ func (tfr *TermFrequencyRow) parseV(value []byte) error {
|
|||
}
|
||||
currOffset += bytesRead
|
||||
|
||||
var arrayPositionsLen uint64 = 0
|
||||
arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:])
|
||||
if bytesRead <= 0 {
|
||||
return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen")
|
||||
}
|
||||
currOffset += bytesRead
|
||||
|
||||
if arrayPositionsLen > 0 {
|
||||
tv.arrayPositions = make([]uint64, arrayPositionsLen)
|
||||
for i := 0; uint64(i) < arrayPositionsLen; i++ {
|
||||
tv.arrayPositions[i], bytesRead = binary.Uvarint(value[currOffset:])
|
||||
if bytesRead <= 0 {
|
||||
return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i)
|
||||
}
|
||||
currOffset += bytesRead
|
||||
}
|
||||
}
|
||||
|
||||
tfr.vectors = append(tfr.vectors, &tv)
|
||||
// try to read next record (may not exist)
|
||||
field, bytesRead = binary.Uvarint(value[currOffset:])
|
||||
|
|
|
@ -61,13 +61,19 @@ func TestRows(t *testing.T) {
|
|||
{
|
||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
|
||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
||||
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 2, 23, 31, 0, 3, 43, 51},
|
||||
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
|
||||
},
|
||||
// test larger varints
|
||||
{
|
||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
|
||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
||||
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 150, 17, 23, 31, 0, 3, 43, 51},
|
||||
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
|
||||
},
|
||||
// test vectors with arrayPositions
|
||||
{
|
||||
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
|
||||
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
|
||||
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
|
||||
},
|
||||
{
|
||||
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),
|
||||
|
|
|
@ -415,7 +415,7 @@ func (udc *UpsideDownCouch) indexField(docID string, field document.Field, field
|
|||
for _, tf := range tokenFreqs {
|
||||
var termFreqRow *TermFrequencyRow
|
||||
if field.Options().IncludeTermVectors() {
|
||||
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
|
||||
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, field.ArrayPositions(), tf)
|
||||
rows = append(rows, newFieldRows...)
|
||||
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
|
||||
} else {
|
||||
|
@ -542,7 +542,7 @@ func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
|
|||
return len(tf.Locations)
|
||||
}
|
||||
|
||||
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
|
||||
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, arrayPositions []uint64, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
|
||||
rv := make([]*TermVector, len(tf.Locations))
|
||||
newFieldRows := make([]UpsideDownCouchRow, 0)
|
||||
|
||||
|
@ -557,10 +557,11 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
|
|||
}
|
||||
}
|
||||
tv := TermVector{
|
||||
field: fieldIndex,
|
||||
pos: uint64(l.Position),
|
||||
start: uint64(l.Start),
|
||||
end: uint64(l.End),
|
||||
field: fieldIndex,
|
||||
arrayPositions: arrayPositions,
|
||||
pos: uint64(l.Position),
|
||||
start: uint64(l.Start),
|
||||
end: uint64(l.End),
|
||||
}
|
||||
rv[i] = &tv
|
||||
}
|
||||
|
@ -574,10 +575,11 @@ func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []
|
|||
for i, tv := range in {
|
||||
fieldName := udc.fieldIndexCache.FieldName(tv.field)
|
||||
tfv := index.TermFieldVector{
|
||||
Field: fieldName,
|
||||
Pos: tv.pos,
|
||||
Start: tv.start,
|
||||
End: tv.end,
|
||||
Field: fieldName,
|
||||
ArrayPositions: tv.arrayPositions,
|
||||
Pos: tv.pos,
|
||||
Start: tv.start,
|
||||
End: tv.end,
|
||||
}
|
||||
rv[i] = &tfv
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue