0
0
Fork 0
bleve/index/scorch/segment/mem/build.go

221 lines
7.1 KiB
Go

package mem
import (
"math"
"sort"
"github.com/RoaringBitmap/roaring"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
)
// NewFromAnalyzedDocs builds a new in-memory Segment from a batch of analysis
// results. Every result is indexed as one document, in the order given, and
// after the last document has been added each field's term list is sorted.
func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
	seg := New()

	// Define "_id" before any document is processed so that it is the first
	// field registered and therefore receives field ID 0.
	seg.getOrDefineField("_id", false)

	// Index the documents one at a time.
	for i := range results {
		seg.processDocument(results[i])
	}

	// Terms are appended to dictKeys as they are first encountered, so the
	// lists are unordered at this point; sort each one in place.
	for fieldIdx := range seg.dictKeys {
		sort.Strings(seg.dictKeys[fieldIdx])
	}

	return seg
}
// processDocument adds a single analyzed document to the segment.
//
// It reserves the next document number, folds the lengths and token
// frequencies of every composite and regular field into per-field totals
// (entries that map to the same field ID are merged), records the values of
// stored fields, and finally updates the postings, frequencies, norms and
// term locations for every field/term pair that occurs in the document.
func (s *Segment) processDocument(result *index.AnalysisResult) {
	// used to collate information across fields, keyed by field ID
	docMap := map[uint16]analysis.TokenFrequencies{}
	fieldLens := map[uint16]int{}

	docNum := uint64(s.addDocument())

	// processField accumulates the length and token frequencies of one field
	// instance into the per-document totals for that field ID.
	processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {
		fieldLens[field] += l
		if existingFreqs, ok := docMap[field]; ok {
			existingFreqs.MergeAll(name, tf)
		} else {
			docMap[field] = tf
		}
	}

	// storeField appends one stored value, together with its type byte and
	// array positions, to the stored data of the given document and field.
	storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
		s.stored[docNum][field] = append(s.stored[docNum][field], val)
		s.storedTypes[docNum][field] = append(s.storedTypes[docNum][field], typ)
		s.storedPos[docNum][field] = append(s.storedPos[docNum][field], pos)
	}

	// walk each composite field; these are analyzed here rather than being
	// taken from the precomputed analysis result
	for _, field := range result.Document.CompositeFields {
		fieldID := uint16(s.getOrDefineField(field.Name(), false))
		l, tf := field.Analyze()
		processField(fieldID, field.Name(), l, tf)
	}

	// walk each field, using the length and frequencies already present in
	// the analysis result at the same index
	for i, field := range result.Document.Fields {
		fieldID := uint16(s.getOrDefineField(field.Name(), field.Options().IncludeTermVectors()))
		l := result.Length[i]
		tf := result.Analyzed[i]
		processField(fieldID, field.Name(), l, tf)
		if field.Options().IsStored() {
			storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions())
		}
	}

	// now that its been rolled up into docMap, walk that
	for fieldID, tokenFrequencies := range docMap {
		// the norm depends only on the total length of the field in this
		// document, so compute it once per field rather than once per term
		norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))

		for term, tokenFreq := range tokenFrequencies {
			// posting IDs are stored 1-based in the dictionary so that the
			// map's zero value means "no posting for this term yet"
			postingID := s.dicts[fieldID][term]
			if postingID == 0 {
				// first occurrence of this term in this field: allocate an
				// empty slot in every parallel slice, then fall through to
				// the shared update path below
				s.postings = append(s.postings, roaring.New())
				s.freqs = append(s.freqs, nil)
				s.norms = append(s.norms, nil)
				s.locfields = append(s.locfields, nil)
				s.locstarts = append(s.locstarts, nil)
				s.locends = append(s.locends, nil)
				s.locpos = append(s.locpos, nil)
				s.locarraypos = append(s.locarraypos, nil)

				postingID = uint64(len(s.postings))
				// record it
				s.dicts[fieldID][term] = postingID
				// this term was new for this field, add it to dictKeys
				s.dictKeys[fieldID] = append(s.dictKeys[fieldID], term)
			}

			// the actual offset is - 1, because 0 is zero value
			idx := postingID - 1

			s.postings[idx].AddInt(int(docNum))
			s.freqs[idx] = append(s.freqs[idx], uint64(tokenFreq.Frequency()))
			s.norms[idx] = append(s.norms[idx], norm)

			for _, loc := range tokenFreq.Locations {
				// a location may name its own field; otherwise it belongs to
				// the field currently being processed
				locf := fieldID
				if loc.Field != "" {
					locf = uint16(s.getOrDefineField(loc.Field, false))
				}
				s.locfields[idx] = append(s.locfields[idx], locf)
				s.locstarts[idx] = append(s.locstarts[idx], uint64(loc.Start))
				s.locends[idx] = append(s.locends[idx], uint64(loc.End))
				s.locpos[idx] = append(s.locpos[idx], uint64(loc.Position))
				// empty array positions are always recorded as nil
				if len(loc.ArrayPositions) > 0 {
					s.locarraypos[idx] = append(s.locarraypos[idx], loc.ArrayPositions)
				} else {
					s.locarraypos[idx] = append(s.locarraypos[idx], nil)
				}
			}
		}
	}
}
// getOrDefineField returns the zero-based ID of the named field, registering
// the field first if the segment has not seen it before.
//
// s.fields holds IDs offset by one (the first field is stored as 1), so the
// stored value is decremented before it is returned. hasLoc is recorded in
// s.fieldsLoc only when the field is first defined; it is ignored for a field
// that already exists.
func (s *Segment) getOrDefineField(name string, hasLoc bool) int {
	if known, found := s.fields[name]; found {
		return int(known - 1)
	}

	// New field: its stored ID is one more than the number of fields so far.
	stored := uint16(len(s.fieldsInv) + 1)
	s.fields[name] = stored
	s.fieldsInv = append(s.fieldsInv, name)
	s.fieldsLoc = append(s.fieldsLoc, hasLoc)
	// Each field gets its own, initially empty, term dictionary and key list.
	s.dicts = append(s.dicts, map[string]uint64{})
	s.dictKeys = append(s.dictKeys, []string{})

	return int(stored - 1)
}
// addDocument reserves stored-field storage for one more document and returns
// the number assigned to it, which is the count of documents that were
// present before the call. An empty map is appended to each of the three
// parallel stored-data slices so that they can be indexed by this number.
func (s *Segment) addDocument() int {
	next := len(s.stored)

	s.stored = append(s.stored, make(map[uint16][][]byte))
	s.storedTypes = append(s.storedTypes, make(map[uint16][]byte))
	s.storedPos = append(s.storedPos, make(map[uint16][][]uint64))

	return next
}
// encodeFieldType maps a document field's concrete type to the single-byte
// code stored alongside its value. Any type not listed below is encoded
// as 'x'.
func encodeFieldType(f document.Field) byte {
	switch f.(type) {
	case *document.TextField:
		return 't'
	case *document.NumericField:
		return 'n'
	case *document.DateTimeField:
		return 'd'
	case *document.BooleanField:
		return 'b'
	case *document.GeoPointField:
		return 'g'
	case *document.CompositeField:
		return 'c'
	default:
		return 'x'
	}
}