fix up issues to get all bleve unit tests passing for scorch

make scorch default
This commit is contained in:
Marty Schoch 2017-12-11 15:47:41 -05:00
parent 00722aa299
commit f13b786609
12 changed files with 172 additions and 77 deletions

View File

@@ -21,8 +21,8 @@ import (
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/scorch"
"github.com/blevesearch/bleve/index/store/gtreap"
"github.com/blevesearch/bleve/index/upsidedown"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/search/highlight/highlighter/html"
)
@@ -69,7 +69,7 @@ func init() {
Config.DefaultMemKVStore = gtreap.Name
// default index
Config.DefaultIndexType = upsidedown.Name
Config.DefaultIndexType = scorch.Name
bootDuration := time.Since(bootStart)
bleveExpVar.Add("bootDuration", int64(bootDuration))
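
With scorch now the package-wide default, any caller that still wants the previous engine has to say so explicitly. A minimal sketch of how that looks against bleve's existing NewUsing entry point (the helper name here is hypothetical):

import (
	"github.com/blevesearch/bleve"
	"github.com/blevesearch/bleve/index/upsidedown"
)

func openUpsidedown(path string) (bleve.Index, error) {
	mapping := bleve.NewIndexMapping()
	// pin the index type rather than relying on Config.DefaultIndexType,
	// which this commit switches from upsidedown.Name to scorch.Name
	return bleve.NewUsing(path, mapping, upsidedown.Name,
		bleve.Config.DefaultKVStore, nil)
}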

View File

@@ -82,15 +82,27 @@ func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
}
func (r *Reader) DumpAll() chan interface{} {
panic("dumpall")
rv := make(chan interface{})
go func() {
close(rv)
}()
return rv
}
func (r *Reader) DumpDoc(id string) chan interface{} {
panic("dumpdoc")
rv := make(chan interface{})
go func() {
close(rv)
}()
return rv
}
func (r *Reader) DumpFields() chan interface{} {
panic("dumpfields")
rv := make(chan interface{})
go func() {
close(rv)
}()
return rv
}
func (r *Reader) Close() error {
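
Rather than panicking, the three Dump methods now return a channel that a goroutine closes immediately, so callers that range over the result observe an empty stream instead of a crash. A small self-contained illustration of why the stub is safe (the helper name is hypothetical):

// countItems drains a dump channel; against the stubbed readers the
// loop body never executes, because the channel is closed without
// any sends
func countItems(ch chan interface{}) int {
	n := 0
	for range ch {
		n++
	}
	return n
}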

View File

@@ -29,7 +29,7 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
s := New()
// ensure that the _id field gets fieldID 0
s.getOrDefineField("_id", false)
s.getOrDefineField("_id")
// walk each doc
for _, result := range results {
@@ -102,14 +102,14 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
// walk each composite field
for _, field := range result.Document.CompositeFields {
fieldID := uint16(s.getOrDefineField(field.Name(), true))
fieldID := uint16(s.getOrDefineField(field.Name()))
l, tf := field.Analyze()
processField(fieldID, field.Name(), l, tf)
}
// walk each field
for i, field := range result.Document.Fields {
fieldID := uint16(s.getOrDefineField(field.Name(), field.Options().IncludeTermVectors()))
fieldID := uint16(s.getOrDefineField(field.Name()))
l := result.Length[i]
tf := result.Analyzed[i]
processField(fieldID, field.Name(), l, tf)
@@ -133,6 +133,9 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
newPostingID := uint64(len(s.Postings) + 1)
// add this new bitset to the postings slice
s.Postings = append(s.Postings, bs)
locationBS := roaring.New()
s.PostingsLocs = append(s.PostingsLocs, locationBS)
// add this to the details slice
s.Freqs = append(s.Freqs, []uint64{uint64(tokenFreq.Frequency())})
s.Norms = append(s.Norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))})
@@ -142,10 +145,13 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
var locends []uint64
var locpos []uint64
var locarraypos [][]uint64
if len(tokenFreq.Locations) > 0 {
locationBS.AddInt(int(docNum))
}
for _, loc := range tokenFreq.Locations {
var locf = fieldID
if loc.Field != "" {
locf = uint16(s.getOrDefineField(loc.Field, false))
locf = uint16(s.getOrDefineField(loc.Field))
}
locfields = append(locfields, locf)
locstarts = append(locstarts, uint64(loc.Start))
@@ -171,12 +177,16 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
// the actual offset is one less, because 0 is the zero value
bs := s.Postings[fieldTermPostings-1]
bs.AddInt(int(docNum))
locationBS := s.PostingsLocs[fieldTermPostings-1]
s.Freqs[fieldTermPostings-1] = append(s.Freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency()))
s.Norms[fieldTermPostings-1] = append(s.Norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
if len(tokenFreq.Locations) > 0 {
locationBS.AddInt(int(docNum))
}
for _, loc := range tokenFreq.Locations {
var locf = fieldID
if loc.Field != "" {
locf = uint16(s.getOrDefineField(loc.Field, false))
locf = uint16(s.getOrDefineField(loc.Field))
}
s.Locfields[fieldTermPostings-1] = append(s.Locfields[fieldTermPostings-1], locf)
s.Locstarts[fieldTermPostings-1] = append(s.Locstarts[fieldTermPostings-1], uint64(loc.Start))
@@ -193,13 +203,12 @@ func (s *Segment) processDocument(result *index.AnalysisResult) {
}
}
func (s *Segment) getOrDefineField(name string, hasLoc bool) int {
func (s *Segment) getOrDefineField(name string) int {
fieldID, ok := s.FieldsMap[name]
if !ok {
fieldID = uint16(len(s.FieldsInv) + 1)
s.FieldsMap[name] = fieldID
s.FieldsInv = append(s.FieldsInv, name)
s.FieldsLoc = append(s.FieldsLoc, hasLoc)
s.Dicts = append(s.Dicts, make(map[string]uint64))
s.DictKeys = append(s.DictKeys, make([]string, 0))
}
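
The thread running through this file: the coarse per-field flag (FieldsLoc, set via the old hasLoc parameter) is replaced by PostingsLocs, one roaring bitmap per postings list recording exactly which documents carry location data. A minimal sketch of the two halves of that bookkeeping, assuming the same RoaringBitmap/roaring package the segment already uses:

import "github.com/RoaringBitmap/roaring"

// index side: mark a doc in the postings list's location bitmap only
// when the token actually produced locations
func recordLocs(locationBS *roaring.Bitmap, docNum uint64, numLocs int) {
	if numLocs > 0 {
		locationBS.AddInt(int(docNum))
	}
}

// query side: a posting has locations iff its doc is in the bitmap,
// replacing the old per-field FieldsLoc lookup
func hasLocs(locationBS *roaring.Bitmap, docNum uint32) bool {
	return locationBS.Contains(docNum)
}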

View File

@@ -51,6 +51,7 @@ func (p *PostingsList) Iterator() segment.PostingsIterator {
}
if p.postingsID > 0 {
allbits := p.dictionary.segment.Postings[p.postingsID-1]
rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1]
rv.all = allbits.Iterator()
if p.except != nil {
allExcept := allbits.Clone()
@@ -68,6 +69,7 @@ func (p *PostingsList) Iterator() segment.PostingsIterator {
type PostingsIterator struct {
postings *PostingsList
all roaring.IntIterable
locations *roaring.Bitmap
offset int
locoffset int
actual roaring.IntIterable
@@ -95,6 +97,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
docNum: uint64(n),
offset: i.offset,
locoffset: i.locoffset,
hasLoc: i.locations.Contains(n),
}
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
@@ -108,6 +111,7 @@ type Posting struct {
docNum uint64
offset int
locoffset int
hasLoc bool
}
// Number returns the document number of this posting in this segment
@@ -127,7 +131,7 @@ func (p *Posting) Norm() float64 {
// Locations returns the location information for each occurrence
func (p *Posting) Locations() []segment.Location {
if !p.iterator.postings.dictionary.segment.FieldsLoc[p.iterator.postings.dictionary.fieldID] {
if !p.hasLoc {
return nil
}
freq := int(p.Frequency())
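
Worth pulling out of the Iterator code above is the deletion-masking idiom it relies on: clone the postings bitmap, subtract the except (deleted docs) bitmap, and iterate the remainder. A hypothetical distillation of that pattern:

import "github.com/RoaringBitmap/roaring"

// liveIterator sketches the except handling: AndNot removes excluded
// docs without mutating the shared postings bitmap
func liveIterator(postings, except *roaring.Bitmap) roaring.IntIterable {
	actual := postings.Clone()
	if except != nil {
		actual.AndNot(except)
	}
	return actual.Iterator()
}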

View File

@@ -44,8 +44,6 @@ type Segment struct {
FieldsMap map[string]uint16
// fields id -> name
FieldsInv []string
// field id -> has location info
FieldsLoc []bool
// term dictionary
// field id -> term -> posting id + 1
@@ -59,6 +57,9 @@ type Segment struct {
// Postings list id -> Postings bitmap
Postings []*roaring.Bitmap
// Postings List has locations
PostingsLocs []*roaring.Bitmap
// term frequencies
// postings list id -> Freqs (one for each hit in bitmap)
Freqs [][]uint64

View File

@@ -27,7 +27,7 @@ import (
"github.com/golang/snappy"
)
var version uint32
const version uint32 = 1
// PersistSegment takes the in-memory segment and persists it to the specified
// path in the zap file format.
@@ -58,8 +58,14 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e
return err
}
var postingsListLocs []uint64
postingsListLocs, err = persistPostingsLocs(memSegment, cr)
if err != nil {
return err
}
var postingsLocs []uint64
postingsLocs, err = persistPostingsLists(memSegment, cr, freqOffsets, locOffsets)
postingsLocs, err = persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
if err != nil {
return err
}
@@ -420,7 +426,43 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac
return freqOffsets, locOfffsets, nil
}
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, freqOffsets, locOffsets []uint64) ([]uint64, error) {
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) ([]uint64, error) {
var rv []uint64
var postingsBuf bytes.Buffer
for postingID := range memSegment.PostingsLocs {
if postingID != 0 {
postingsBuf.Reset()
}
// record where we start this posting loc
rv = append(rv, uint64(w.Count()))
// write out postings locs to memory so we know the len
postingsLocLen, err := memSegment.PostingsLocs[postingID].WriteTo(&postingsBuf)
if err != nil {
return nil, err
}
buf := make([]byte, binary.MaxVarintLen64)
// write out the length of this postings locs bitmap
n := binary.PutUvarint(buf, uint64(postingsLocLen))
_, err = w.Write(buf[:n])
if err != nil {
return nil, err
}
// write out the serialized postings locs bitmap itself
_, err = w.Write(postingsBuf.Bytes())
if err != nil {
return nil, err
}
}
return rv, nil
}
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, postingsListLocs, freqOffsets, locOffsets []uint64) ([]uint64, error) {
var rv []uint64
var postingsBuf bytes.Buffer
@@ -453,6 +495,13 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, freqOffse
return nil, err
}
// write out the start of the loc posting list
n = binary.PutUvarint(buf, postingsListLocs[postingID])
_, err = w.Write(buf[:n])
if err != nil {
return nil, err
}
// write out the length of this postings list
n = binary.PutUvarint(buf, uint64(postingsListLen))
_, err = w.Write(buf[:n])
@@ -534,20 +583,9 @@ func persistFields(memSegment *mem.Segment, w *CountHashWriter, dictLocs []uint6
fieldStarts = append(fieldStarts, uint64(w.Count()))
buf := make([]byte, binary.MaxVarintLen64)
// write out if the field has indexed locs (0 or 1)
var indexedLoc uint64
if memSegment.FieldsLoc[fieldID] {
indexedLoc = 1
}
n := binary.PutUvarint(buf, indexedLoc)
_, err := w.Write(buf[:n])
if err != nil {
return 0, err
}
// write out dict location for this field
n = binary.PutUvarint(buf, dictLocs[fieldID])
_, err = w.Write(buf[:n])
n := binary.PutUvarint(buf, dictLocs[fieldID])
_, err := w.Write(buf[:n])
if err != nil {
return 0, err
}
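
persistPostingsLocs frames each bitmap the same way the other zap sections do: serialize to a scratch buffer first so the byte length is known, write that length as a uvarint, then write the bytes. A self-contained sketch of that framing (names are assumptions, not the zap API):

import (
	"bytes"
	"encoding/binary"
	"io"

	"github.com/RoaringBitmap/roaring"
)

// writeFramedBitmap writes uvarint(len) followed by the roaring
// serialization, mirroring the loop in persistPostingsLocs
func writeFramedBitmap(w io.Writer, bm *roaring.Bitmap) error {
	var body bytes.Buffer
	n, err := bm.WriteTo(&body) // learn the serialized length first
	if err != nil {
		return err
	}
	buf := make([]byte, binary.MaxVarintLen64)
	sz := binary.PutUvarint(buf, uint64(n))
	if _, err = w.Write(buf[:sz]); err != nil {
		return err
	}
	_, err = w.Write(body.Bytes())
	return err
}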

View File

@@ -43,11 +43,7 @@ var fieldsCmd = &cobra.Command{
for fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
addr := binary.BigEndian.Uint64(data[fieldsIndexOffset+(8*fieldID) : fieldsIndexOffset+(8*fieldID)+8])
var n uint64
indexedLoc, read := binary.Uvarint(data[addr:fieldsIndexEnd])
n += uint64(read)
var dictLoc uint64
dictLoc, read = binary.Uvarint(data[addr+n : fieldsIndexEnd])
dictLoc, read := binary.Uvarint(data[addr+n : fieldsIndexEnd])
n += uint64(read)
var nameLen uint64
@@ -56,7 +52,7 @@ var fieldsCmd = &cobra.Command{
name := string(data[addr+n : addr+n+nameLen])
fmt.Printf("field %d '%s' indexedLoc: %t starts at %d (%x)\n", fieldID, name, indexedLoc == 1, dictLoc, dictLoc)
fmt.Printf("field %d '%s' starts at %d (%x)\n", fieldID, name, dictLoc, dictLoc)
fieldID++
}
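
With the indexedLoc byte gone, each entry in the zap fields section now reads as: a uvarint dict location, a uvarint name length, then the name bytes, with the 8-byte big-endian index supplying each entry's address. A hypothetical decoder for a single entry, matching the loop above:

import "encoding/binary"

// decodeFieldEntry parses one fields-section entry starting at addr
// and returns its dictionary location and field name
func decodeFieldEntry(data []byte, addr uint64) (uint64, string) {
	var n uint64
	dictLoc, read := binary.Uvarint(data[addr:])
	n += uint64(read)
	nameLen, read2 := binary.Uvarint(data[addr+n:])
	n += uint64(read2)
	name := string(data[addr+n : addr+n+nameLen])
	return dictLoc, name
}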

View File

@@ -60,6 +60,8 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting
n += uint64(read)
rv.locOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
rv.locBitmapOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)
var postingsLen uint64
postingsLen, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64])
n += uint64(read)

View File

@@ -27,14 +27,15 @@ import (
// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
dictionary *Dictionary
term string
postingsOffset uint64
freqOffset uint64
locOffset uint64
postings *roaring.Bitmap
except *roaring.Bitmap
postingKey []byte
dictionary *Dictionary
term string
postingsOffset uint64
freqOffset uint64
locOffset uint64
locBitmapOffset uint64
postings *roaring.Bitmap
except *roaring.Bitmap
postingKey []byte
}
// Iterator returns an iterator for this postings list
@@ -68,6 +69,18 @@ func (p *PostingsList) Iterator() segment.PostingsIterator {
}
rv.locChunkStart = p.locOffset + n
var locBitmapLen uint64
locBitmapLen, read = binary.Uvarint(p.dictionary.segment.mm[p.locBitmapOffset : p.locBitmapOffset+binary.MaxVarintLen64])
roaringBytes := p.dictionary.segment.mm[p.locBitmapOffset+uint64(read) : p.locBitmapOffset+uint64(read)+locBitmapLen]
bitmap := roaring.NewBitmap()
_, err := bitmap.FromBuffer(roaringBytes)
if err != nil {
// return nil, fmt.Errorf("error loading roaring bitmap: %v", err)
// FIXME: don't break the API yet
panic("error loading roaring bitmap")
}
rv.locBitmap = bitmap
rv.all = p.postings.Iterator()
if p.except != nil {
allExcept := p.postings.Clone()
@@ -86,6 +99,7 @@ func (p *PostingsList) Count() uint64 {
var rv uint64
if p.postings != nil {
rv = p.postings.GetCardinality()
if p.except != nil {
except := p.except.GetCardinality()
if except > rv {
@@ -117,6 +131,8 @@ type PostingsIterator struct {
locChunkLens []uint64
locChunkStart uint64
locBitmap *roaring.Bitmap
}
func (i *PostingsIterator) loadChunk(chunk int) error {
@@ -245,7 +261,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
if err != nil {
return nil, err
}
if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
if i.locBitmap.Contains(allN) {
for j := 0; j < int(freq); j++ {
err := i.readLocation(nil)
if err != nil {
@@ -280,7 +296,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) {
return nil, err
}
rv.norm = math.Float32frombits(uint32(normBits))
if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
if i.locBitmap.Contains(n) {
// read off 'freq' locations
rv.locs = make([]segment.Location, rv.freq)
locs := make([]Location, rv.freq)
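
A detail that makes this cheap: roaring's FromBuffer is documented to reuse the supplied slice rather than copy it, so the location bitmap is decoded directly out of the segment's mmap'd bytes (d.segment.mm above). The flip side is a lifetime constraint, sketched here:

import "github.com/RoaringBitmap/roaring"

// loadBitmapAt decodes a bitmap in place from a mapped region; the
// backing slice must stay valid (mapped) for the bitmap's lifetime
func loadBitmapAt(mm []byte, off, length uint64) (*roaring.Bitmap, error) {
	bm := roaring.NewBitmap()
	if _, err := bm.FromBuffer(mm[off : off+length]); err != nil {
		return nil, err
	}
	return bm, nil
}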

View File

@@ -78,7 +78,6 @@ type Segment struct {
fieldsMap map[string]uint16
fieldsInv []string
fieldsLoc []bool
fieldsOffsets []uint64
}
@@ -112,16 +111,8 @@ func (s *Segment) loadFields() error {
for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
addr := binary.BigEndian.Uint64(s.mm[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8])
var n uint64
hasStoredLoc, read := binary.Uvarint(s.mm[addr:fieldsIndexEnd])
n += uint64(read)
if hasStoredLoc == 1 {
s.fieldsLoc = append(s.fieldsLoc, true)
} else {
s.fieldsLoc = append(s.fieldsLoc, false)
}
var dictLoc uint64
dictLoc, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
dictLoc, read := binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
n += uint64(read)
s.fieldsOffsets = append(s.fieldsOffsets, dictLoc)

View File

@@ -212,6 +212,11 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) {
return nil, err
}
if next == nil {
// no such doc exists
return nil, nil
}
docNum := docInternalToNumber(next.ID)
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
@@ -318,6 +323,7 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
var err error
rv := &IndexSnapshotTermFieldReader{
term: term,
snapshot: i,
postings: make([]segment.PostingsList, len(i.segment)),
iterators: make([]segment.PostingsIterator, len(i.segment)),
@@ -337,7 +343,6 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
if err != nil {
return nil, err
}
return rv, nil
}
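
The added nil check changes Document's contract for a missing id from a downstream failure to a clean (nil, nil), so callers need to test the document pointer, not just the error. Sketched against the generic index.IndexReader interface (the helper name is hypothetical):

import "github.com/blevesearch/bleve/index"

// documentExists distinguishes "lookup failed" from "no such doc"
func documentExists(r index.IndexReader, id string) (bool, error) {
	doc, err := r.Document(id)
	if err != nil {
		return false, err
	}
	return doc != nil, nil
}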

View File

@@ -22,6 +22,7 @@ import (
)
type IndexSnapshotTermFieldReader struct {
term []byte
snapshot *IndexSnapshot
postings []segment.PostingsList
iterators []segment.PostingsIterator
@@ -29,6 +30,8 @@ type IndexSnapshotTermFieldReader struct {
includeFreq bool
includeNorm bool
includeTermVectors bool
currPosting segment.Posting
currID index.IndexInternalID
}
func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
@@ -47,26 +50,11 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in
globalOffset := i.snapshot.offsets[i.segmentOffset]
nnum := next.Number()
rv.ID = docNumberToBytes(nnum + globalOffset)
if i.includeFreq {
rv.Freq = next.Frequency()
}
if i.includeNorm {
rv.Norm = next.Norm()
}
if i.includeTermVectors {
locs := next.Locations()
rv.Vectors = make([]*index.TermFieldVector, len(locs))
for i, loc := range locs {
rv.Vectors[i] = &index.TermFieldVector{
Start: loc.Start(),
End: loc.End(),
Pos: loc.Pos(),
ArrayPositions: loc.ArrayPositions(),
Field: loc.Field(),
}
}
}
i.postingToTermFieldDoc(next, rv)
i.currID = rv.ID
i.currPosting = next
return rv, nil
}
i.segmentOffset++
@@ -74,7 +62,40 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in
return nil, nil
}
func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Posting, rv *index.TermFieldDoc) {
if i.includeFreq {
rv.Freq = next.Frequency()
}
if i.includeNorm {
rv.Norm = next.Norm()
}
if i.includeTermVectors {
locs := next.Locations()
rv.Vectors = make([]*index.TermFieldVector, len(locs))
for i, loc := range locs {
rv.Vectors[i] = &index.TermFieldVector{
Start: loc.Start(),
End: loc.End(),
Pos: loc.Pos(),
ArrayPositions: loc.ArrayPositions(),
Field: loc.Field(),
}
}
}
}
// Advance resets the reader to the specified internal ID, or the first one after it
func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
// first make sure we aren't already pointing at the right thing (due to the way searchers work)
if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
rv := preAlloced
if rv == nil {
rv = &index.TermFieldDoc{}
}
rv.ID = i.currID
i.postingToTermFieldDoc(i.currPosting, rv)
return rv, nil
}
// FIXME do something better
next, err := i.Next(preAlloced)
if err != nil {
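
The hunk is truncated here, but the FIXME spells out the intent: when the reader isn't already at or past the target, something smarter than a linear scan should eventually take over; for now, stepping with Next until the ID is reached is the obvious naive shape. A hypothetical sketch of that fallback loop (not necessarily the committed code):

// naive fallback: advance one posting at a time until we reach or
// pass the requested ID; bytes.Compare is valid here because internal
// IDs are big-endian-encoded doc numbers and so sort bytewise
for next != nil && bytes.Compare(next.ID, ID) < 0 {
	next, err = i.Next(preAlloced)
	if err != nil {
		return nil, err
	}
}
return next, nil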