//  Copyright (c) 2015 Couchbase, Inc.
//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
//  except in compliance with the License. You may obtain a copy of the License at
//    http://www.apache.org/licenses/LICENSE-2.0
//  Unless required by applicable law or agreed to in writing, software distributed under the
//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
//  either express or implied. See the License for the specific language governing permissions
//  and limitations under the License.
|
||
|
|
||
|
package firestorm
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
"encoding/binary"
|
||
|
"fmt"
|
||
|
|
||
|
"github.com/golang/protobuf/proto"
|
||
|
)
|
||
|
|
||
|
// TermFreqKeyPrefix is the one-byte prefix ('t') that term frequency row keys
// start with; Key and the prefix helpers in this file write the same byte at
// offset 0.
var TermFreqKeyPrefix = []byte{'t'}
|
||
|
|
||
|
// TermFreqRow is a single term frequency row. The field, term, docID and
// docNum members are encoded into the row key (see Key); value holds the
// data that is marshaled into the row value (see Value).
type TermFreqRow struct {
	// field is the numeric field identifier, stored little-endian in key bytes 1-2.
	field uint16
	// term is the raw term bytes; in the key it is terminated by ByteSeparator.
	term []byte
	// docID is the raw document identifier; in the key it is terminated by ByteSeparator.
	docID []byte
	// docNum is the document number, stored as a uvarint at the end of the key.
	docNum uint64
	// value carries the frequency, norm and term vectors for this row.
	value TermFreqValue
}
|
||
|
|
||
|
func NewTermVector(field uint16, pos uint64, start uint64, end uint64, arrayPos []uint64) *TermVector {
|
||
|
rv := TermVector{}
|
||
|
|
||
|
rv.Field = proto.Uint32(uint32(field))
|
||
|
rv.Pos = proto.Uint64(pos)
|
||
|
rv.Start = proto.Uint64(start)
|
||
|
rv.End = proto.Uint64(end)
|
||
|
|
||
|
if len(arrayPos) > 0 {
|
||
|
rv.ArrayPositions = make([]uint64, len(arrayPos))
|
||
|
for i, apv := range arrayPos {
|
||
|
rv.ArrayPositions[i] = apv
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return &rv
|
||
|
}
|
||
|
|
||
|
func NewTermFreqRow(field uint16, term []byte, docID []byte, docNum uint64, freq uint64, norm float32, termVectors []*TermVector) *TermFreqRow {
|
||
|
rv := TermFreqRow{
|
||
|
field: field,
|
||
|
term: term,
|
||
|
docID: docID,
|
||
|
docNum: docNum,
|
||
|
}
|
||
|
|
||
|
rv.value.Freq = proto.Uint64(freq)
|
||
|
rv.value.Norm = proto.Float32(norm)
|
||
|
rv.value.Vectors = termVectors
|
||
|
|
||
|
return &rv
|
||
|
}
|
||
|
|
||
|
func NewTermFreqRowKV(key, value []byte) (*TermFreqRow, error) {
|
||
|
rv := TermFreqRow{}
|
||
|
keyLen := len(key)
|
||
|
if keyLen < 3 {
|
||
|
return nil, fmt.Errorf("invalid term frequency key, no valid field")
|
||
|
}
|
||
|
rv.field = binary.LittleEndian.Uint16(key[1:3])
|
||
|
|
||
|
termStartPos := 3
|
||
|
termEndPos := bytes.IndexByte(key[termStartPos:], ByteSeparator)
|
||
|
if termEndPos < 0 {
|
||
|
return nil, fmt.Errorf("invalid term frequency key, no byte separator terminating term")
|
||
|
}
|
||
|
rv.term = key[termStartPos : termStartPos+termEndPos]
|
||
|
|
||
|
docStartPos := termStartPos + termEndPos + 1
|
||
|
docEndPos := bytes.IndexByte(key[docStartPos:], ByteSeparator)
|
||
|
rv.docID = key[docStartPos : docStartPos+docEndPos]
|
||
|
|
||
|
docNumPos := docStartPos + docEndPos + 1
|
||
|
rv.docNum, _ = binary.Uvarint(key[docNumPos:])
|
||
|
|
||
|
err := rv.value.Unmarshal(value)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
return &rv, nil
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) Key() []byte {
|
||
|
buf := make([]byte, 3+len(tfr.term)+1+len(tfr.docID)+1+binary.MaxVarintLen64)
|
||
|
buf[0] = 't'
|
||
|
binary.LittleEndian.PutUint16(buf[1:3], tfr.field)
|
||
|
termLen := copy(buf[3:], tfr.term)
|
||
|
buf[3+termLen] = ByteSeparator
|
||
|
docLen := copy(buf[3+termLen+1:], tfr.docID)
|
||
|
buf[3+termLen+1+docLen] = ByteSeparator
|
||
|
used := binary.PutUvarint(buf[3+termLen+1+docLen+1:], tfr.docNum)
|
||
|
return buf[:3+termLen+1+docLen+1+used]
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) Value() []byte {
|
||
|
rv, _ := tfr.value.Marshal()
|
||
|
return rv
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) String() string {
|
||
|
vectors := ""
|
||
|
for i, v := range tfr.value.GetVectors() {
|
||
|
vectors += fmt.Sprintf("%d - Field: %d Pos: %d Start: %d End: %d ArrayPos: %v - %#v\n", i, v.GetField(), v.GetPos(), v.GetStart(), v.GetEnd(), v.GetArrayPositions(), v.ArrayPositions)
|
||
|
}
|
||
|
return fmt.Sprintf("TermFreqRow - Field: %d\n", tfr.field) +
|
||
|
fmt.Sprintf("Term '%s' - % x\n", tfr.term, tfr.term) +
|
||
|
fmt.Sprintf("DocID '%s' - % x\n", tfr.docID, tfr.docID) +
|
||
|
fmt.Sprintf("DocNum %d\n", tfr.docNum) +
|
||
|
fmt.Sprintf("Freq: %d\n", tfr.value.GetFreq()) +
|
||
|
fmt.Sprintf("Norm: %f\n", tfr.value.GetNorm()) +
|
||
|
fmt.Sprintf("Vectors:\n%s", vectors)
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) Field() uint16 {
|
||
|
return tfr.field
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) Term() []byte {
|
||
|
return tfr.term
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) DocID() []byte {
|
||
|
return tfr.docID
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) DocNum() uint64 {
|
||
|
return tfr.docNum
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) Norm() float32 {
|
||
|
return tfr.value.GetNorm()
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) Freq() uint64 {
|
||
|
return tfr.value.GetFreq()
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) Vectors() []*TermVector {
|
||
|
return tfr.value.GetVectors()
|
||
|
}
|
||
|
|
||
|
func (tfr *TermFreqRow) DictionaryRowKey() []byte {
|
||
|
dr := NewDictionaryRow(tfr.field, tfr.term, 0)
|
||
|
return dr.Key()
|
||
|
}
|
||
|
|
||
|
func TermFreqIteratorStart(field uint16, term []byte) []byte {
|
||
|
buf := make([]byte, 3+len(term)+1)
|
||
|
buf[0] = 't'
|
||
|
binary.LittleEndian.PutUint16(buf[1:3], field)
|
||
|
termLen := copy(buf[3:], term)
|
||
|
buf[3+termLen] = ByteSeparator
|
||
|
return buf
|
||
|
}
|
||
|
|
||
|
func TermFreqPrefixFieldTermDocId(field uint16, term []byte, docID []byte) []byte {
|
||
|
buf := make([]byte, 3+len(term)+1+len(docID)+1)
|
||
|
buf[0] = 't'
|
||
|
binary.LittleEndian.PutUint16(buf[1:3], field)
|
||
|
termLen := copy(buf[3:], term)
|
||
|
buf[3+termLen] = ByteSeparator
|
||
|
docLen := copy(buf[3+termLen+1:], docID)
|
||
|
buf[3+termLen+1+docLen] = ByteSeparator
|
||
|
return buf
|
||
|
}
|