0
0
Fork 0

update TermVector

This commit is contained in:
dtynn 2015-05-17 13:07:14 +08:00
parent 8f70def63b
commit 89dc2c22bc
4 changed files with 58 additions and 22 deletions

View File

@ -61,10 +61,11 @@ type IndexReader interface {
type FieldTerms map[string][]string
type TermFieldVector struct {
Field string
Pos uint64
Start uint64
End uint64
Field string
ArrayPositions []uint64
Pos uint64
Start uint64
End uint64
}
type TermFieldDoc struct {

View File

@ -259,14 +259,15 @@ func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
// TERM FIELD FREQUENCY
type TermVector struct {
field uint16
pos uint64
start uint64
end uint64
field uint16
arrayPositions []uint64
pos uint64
start uint64
end uint64
}
func (tv *TermVector) String() string {
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end)
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
}
type TermFrequencyRow struct {
@ -319,7 +320,11 @@ func (tfr *TermFrequencyRow) DictionaryRowKey() []byte {
func (tfr *TermFrequencyRow) Value() []byte {
used := 0
buf := make([]byte, 8+8+(len(tfr.vectors)*(8+8+8+8)))
bufLen := 8 + 8
for _, vector := range tfr.vectors {
bufLen += 8 + 8 + 8 + 8 + (1+len(vector.arrayPositions))*8
}
buf := make([]byte, bufLen)
used += binary.PutUvarint(buf[used:used+8], tfr.freq)
@ -332,6 +337,10 @@ func (tfr *TermFrequencyRow) Value() []byte {
used += binary.PutUvarint(buf[used:used+8], vector.pos)
used += binary.PutUvarint(buf[used:used+8], vector.start)
used += binary.PutUvarint(buf[used:used+8], vector.end)
used += binary.PutUvarint(buf[used:used+8], uint64(len(vector.arrayPositions)))
for _, arrayPosition := range vector.arrayPositions {
used += binary.PutUvarint(buf[used:used+8], arrayPosition)
}
}
return buf[0:used]
}
@ -431,6 +440,24 @@ func (tfr *TermFrequencyRow) parseV(value []byte) error {
}
currOffset += bytesRead
var arrayPositionsLen uint64 = 0
arrayPositionsLen, bytesRead = binary.Uvarint(value[currOffset:])
if bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, vector contains no arrayPositionLen")
}
currOffset += bytesRead
if arrayPositionsLen > 0 {
tv.arrayPositions = make([]uint64, arrayPositionsLen)
for i := 0; uint64(i) < arrayPositionsLen; i++ {
tv.arrayPositions[i], bytesRead = binary.Uvarint(value[currOffset:])
if bytesRead <= 0 {
return fmt.Errorf("invalid term frequency value, vector contains no arrayPosition of index %d", i)
}
currOffset += bytesRead
}
}
tfr.vectors = append(tfr.vectors, &tv)
// try to read next record (may not exist)
field, bytesRead = binary.Uvarint(value[currOffset:])

View File

@ -61,13 +61,19 @@ func TestRows(t *testing.T) {
{
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 2, 23, 31, 0, 3, 43, 51},
[]byte{3, 195, 235, 163, 130, 4, 0, 1, 3, 11, 0, 0, 2, 23, 31, 0, 0, 3, 43, 51, 0},
},
// test larger varints
{
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2198, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 150, 17, 23, 31, 0, 3, 43, 51},
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 0, 0, 150, 17, 23, 31, 0, 0, 3, 43, 51, 0},
},
// test vectors with arrayPositions
{
NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 25896, 3.14, []*TermVector{&TermVector{field: 255, pos: 1, start: 3, end: 11, arrayPositions: []uint64{0}}, &TermVector{field: 0, pos: 2198, start: 23, end: 31, arrayPositions: []uint64{1, 2}}, &TermVector{field: 0, pos: 3, start: 43, end: 51, arrayPositions: []uint64{3, 4, 5}}}),
[]byte{'t', 0, 0, 'b', 'e', 'e', 'r', ByteSeparator, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'},
[]byte{168, 202, 1, 195, 235, 163, 130, 4, 255, 1, 1, 3, 11, 1, 0, 0, 150, 17, 23, 31, 2, 1, 2, 0, 3, 43, 51, 3, 3, 4, 5},
},
{
NewBackIndexRow("budweiser", []*BackIndexTermEntry{&BackIndexTermEntry{Term: proto.String("beer"), Field: proto.Uint32(0)}}, nil),

View File

@ -415,7 +415,7 @@ func (udc *UpsideDownCouch) indexField(docID string, field document.Field, field
for _, tf := range tokenFreqs {
var termFreqRow *TermFrequencyRow
if field.Options().IncludeTermVectors() {
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, tf)
tv, newFieldRows := udc.termVectorsFromTokenFreq(fieldIndex, field.ArrayPositions(), tf)
rows = append(rows, newFieldRows...)
termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, fieldIndex, docID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv)
} else {
@ -542,7 +542,7 @@ func frequencyFromTokenFreq(tf *analysis.TokenFreq) int {
return len(tf.Locations)
}
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, arrayPositions []uint64, tf *analysis.TokenFreq) ([]*TermVector, []UpsideDownCouchRow) {
rv := make([]*TermVector, len(tf.Locations))
newFieldRows := make([]UpsideDownCouchRow, 0)
@ -557,10 +557,11 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
}
}
tv := TermVector{
field: fieldIndex,
pos: uint64(l.Position),
start: uint64(l.Start),
end: uint64(l.End),
field: fieldIndex,
arrayPositions: arrayPositions,
pos: uint64(l.Position),
start: uint64(l.Start),
end: uint64(l.End),
}
rv[i] = &tv
}
@ -574,10 +575,11 @@ func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []
for i, tv := range in {
fieldName := udc.fieldIndexCache.FieldName(tv.field)
tfv := index.TermFieldVector{
Field: fieldName,
Pos: tv.pos,
Start: tv.start,
End: tv.end,
Field: fieldName,
ArrayPositions: tv.arrayPositions,
Pos: tv.pos,
Start: tv.start,
End: tv.end,
}
rv[i] = &tfv
}