b241242465
The new analyzeField() helper func is used for both regular fields and for composite fields. With this change, all analysis is done up front, for both regular fields and composite fields. After analysis, this change counts up all the row capacity needed and extends the AnalysisResult.Rows in one shot, as opposed to the previous approach of dynamically growing the array as needed during append() calls. Also, in this change, the TermFreqRow for _id is added first, which seems more correct.
166 lines
5.1 KiB
Go
166 lines
5.1 KiB
Go
// Copyright (c) 2015 Couchbase, Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
|
// except in compliance with the License. You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software distributed under the
|
|
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
|
// either express or implied. See the License for the specific language governing permissions
|
|
// and limitations under the License.
|
|
|
|
package firestorm
|
|
|
|
import (
|
|
"math"
|
|
|
|
"github.com/blevesearch/bleve/analysis"
|
|
"github.com/blevesearch/bleve/document"
|
|
"github.com/blevesearch/bleve/index"
|
|
)
|
|
|
|
func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
|
|
|
|
rv := &index.AnalysisResult{
|
|
DocID: d.ID,
|
|
Rows: make([]index.IndexRow, 0, 100),
|
|
}
|
|
|
|
docIDBytes := []byte(d.ID)
|
|
|
|
// add the _id row
|
|
rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
|
|
|
|
// information we collate as we merge fields with same name
|
|
fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
|
|
fieldLengths := make(map[uint16]int)
|
|
fieldIncludeTermVectors := make(map[uint16]bool)
|
|
fieldNames := make(map[uint16]string)
|
|
|
|
analyzeField := func(field document.Field, storable bool) {
|
|
fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
|
|
if newFieldRow != nil {
|
|
rv.Rows = append(rv.Rows, newFieldRow)
|
|
}
|
|
fieldNames[fieldIndex] = field.Name()
|
|
|
|
if field.Options().IsIndexed() {
|
|
fieldLength, tokenFreqs := field.Analyze()
|
|
existingFreqs := fieldTermFreqs[fieldIndex]
|
|
if existingFreqs == nil {
|
|
fieldTermFreqs[fieldIndex] = tokenFreqs
|
|
} else {
|
|
existingFreqs.MergeAll(field.Name(), tokenFreqs)
|
|
fieldTermFreqs[fieldIndex] = existingFreqs
|
|
}
|
|
fieldLengths[fieldIndex] += fieldLength
|
|
fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
|
|
}
|
|
|
|
if storable && field.Options().IsStored() {
|
|
storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex)
|
|
rv.Rows = append(rv.Rows, storeRow)
|
|
}
|
|
}
|
|
|
|
for _, field := range d.Fields {
|
|
analyzeField(field, true)
|
|
}
|
|
|
|
for fieldIndex, tokenFreqs := range fieldTermFreqs {
|
|
// see if any of the composite fields need this
|
|
for _, compositeField := range d.CompositeFields {
|
|
compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
|
|
}
|
|
}
|
|
|
|
for _, compositeField := range d.CompositeFields {
|
|
analyzeField(compositeField, false)
|
|
}
|
|
|
|
rowsCapNeeded := len(rv.Rows)
|
|
for _, tokenFreqs := range fieldTermFreqs {
|
|
rowsCapNeeded += len(tokenFreqs)
|
|
}
|
|
|
|
rows := make([]index.IndexRow, 0, rowsCapNeeded)
|
|
rv.Rows = append(rows, rv.Rows...)
|
|
|
|
// walk through the collated information and proccess
|
|
// once for each indexed field (unique name)
|
|
for fieldIndex, tokenFreqs := range fieldTermFreqs {
|
|
fieldLength := fieldLengths[fieldIndex]
|
|
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
|
|
|
|
rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows)
|
|
}
|
|
|
|
return rv
|
|
}
|
|
|
|
func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow {
|
|
|
|
tfrs := make([]TermFreqRow, len(tokenFreqs))
|
|
|
|
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
|
|
|
|
if !includeTermVectors {
|
|
i := 0
|
|
for _, tf := range tokenFreqs {
|
|
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil))
|
|
i++
|
|
}
|
|
return rows
|
|
}
|
|
|
|
i := 0
|
|
for _, tf := range tokenFreqs {
|
|
var tv []*TermVector
|
|
tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows)
|
|
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv))
|
|
i++
|
|
}
|
|
return rows
|
|
}
|
|
|
|
func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
|
|
rv := make([]*TermVector, len(tf.Locations))
|
|
|
|
for i, l := range tf.Locations {
|
|
var newFieldRow *FieldRow
|
|
fieldIndex := field
|
|
if l.Field != "" {
|
|
// lookup correct field
|
|
fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field)
|
|
if newFieldRow != nil {
|
|
rows = append(rows, newFieldRow)
|
|
}
|
|
}
|
|
tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions)
|
|
rv[i] = tv
|
|
}
|
|
|
|
return rv, rows
|
|
}
|
|
|
|
func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow {
|
|
fieldValue := make([]byte, 1+len(field.Value()))
|
|
fieldValue[0] = encodeFieldType(field)
|
|
copy(fieldValue[1:], field.Value())
|
|
storedRow := NewStoredRow(docID, docNum, fieldIndex, field.ArrayPositions(), fieldValue)
|
|
return storedRow
|
|
}
|
|
|
|
func encodeFieldType(f document.Field) byte {
|
|
fieldType := byte('x')
|
|
switch f.(type) {
|
|
case *document.TextField:
|
|
fieldType = 't'
|
|
case *document.NumericField:
|
|
fieldType = 'n'
|
|
case *document.DateTimeField:
|
|
fieldType = 'd'
|
|
case *document.CompositeField:
|
|
fieldType = 'c'
|
|
}
|
|
return fieldType
|
|
}
|