0
0
Fork 0
bleve/index/index.go

221 lines
6.4 KiB
Go
Raw Normal View History

2014-04-17 22:55:53 +02:00
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
2014-04-17 22:55:53 +02:00
package index
import (
"encoding/json"
"fmt"
"time"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index/store"
2014-04-17 22:55:53 +02:00
)
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
2014-04-17 22:55:53 +02:00
type Index interface {
Open() error
Close() error
2014-04-17 22:55:53 +02:00
DocCount() (uint64, error)
2014-04-17 22:55:53 +02:00
Update(doc *document.Document) error
Delete(id string) error
Batch(batch *Batch) error
2014-04-17 22:55:53 +02:00
SetInternal(key, val []byte) error
DeleteInternal(key []byte) error
DumpAll() chan interface{}
DumpDoc(id string) chan interface{}
DumpFields() chan interface{}
// Reader returns a low-level accessor on the index data. Close it to
// release associated resources.
Reader() (IndexReader, error)
Stats() json.Marshaler
StatsMap() map[string]interface{}
Analyze(d *document.Document) *AnalysisResult
Advanced() (store.KVStore, error)
}
// AsyncIndex is an interface for indexes which perform
// some important operations asynchronously.
type AsyncIndex interface {
// Wait will block until asynchronous operations started
// before this call have finished or until the specified
// timeout has been reached. If the timeout is reached
// an error is returned.
Wait(timeout time.Duration) error
}
type IndexReader interface {
TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (TermFieldReader, error)
// DocIDReader returns an iterator over documents which identifiers are
// greater than or equal to start and smaller than end. Set start to the
// empty string to iterate from the first document, end to the empty string
// to iterate to the last one.
// The caller must close returned instance to release associated resources.
2014-09-04 01:53:59 +02:00
DocIDReader(start, end string) (DocIDReader, error)
2014-04-17 22:55:53 +02:00
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
DocIDReaderOnly(ids []string) (DocIDReader, error)
FieldDict(field string) (FieldDict, error)
// FieldDictRange is currently defined to include the start and end terms
FieldDictRange(field string, startTerm []byte, endTerm []byte) (FieldDict, error)
FieldDictPrefix(field string, termPrefix []byte) (FieldDict, error)
2014-08-07 19:45:39 +02:00
Document(id string) (*document.Document, error)
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
DocumentFieldTerms(id IndexInternalID) (FieldTerms, error)
DocumentFieldTermsForFields(id IndexInternalID, fields []string) (FieldTerms, error)
Fields() ([]string, error)
GetInternal(key []byte) ([]byte, error)
DocCount() uint64
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
FinalizeDocID(id IndexInternalID) (string, error)
Close() error
2014-04-17 22:55:53 +02:00
}
type FieldTerms map[string][]string
2014-04-17 22:55:53 +02:00
type TermFieldVector struct {
2015-05-17 07:07:14 +02:00
Field string
ArrayPositions []uint64
Pos uint64
Start uint64
End uint64
2014-04-17 22:55:53 +02:00
}
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
// IndexInternalID is an opaque document identifier interal to the index impl
// This allows us to delay the conversion to public identifier (string) and
// avoid it completely in other cases. It also servces to hide the underlying
// representation of a document identifer, allow more flexibility.
type IndexInternalID interface {
Equals(other IndexInternalID) bool
Compare(other IndexInternalID) int
}
2014-04-17 22:55:53 +02:00
type TermFieldDoc struct {
2014-08-07 19:45:39 +02:00
Term string
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
ID IndexInternalID
2014-04-17 22:55:53 +02:00
Freq uint64
Norm float64
Vectors []*TermFieldVector
}
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
*tfd = TermFieldDoc{}
return tfd
}
// TermFieldReader is the interface exposing the enumeration of documents
// containing a given term in a given field. Documents are returned in byte
// lexicographic order over their identifiers.
2014-04-17 22:55:53 +02:00
type TermFieldReader interface {
// Next returns the next document containing the term in this field, or nil
// when it reaches the end of the enumeration. The preAlloced TermFieldDoc
// is optional, and when non-nil, will be used instead of allocating memory.
Next(preAlloced *TermFieldDoc) (*TermFieldDoc, error)
// Advance resets the enumeration at specified document or its immediate
// follower.
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
Advance(ID IndexInternalID, preAlloced *TermFieldDoc) (*TermFieldDoc, error)
// Count returns the number of documents contains the term in this field.
2014-04-17 22:55:53 +02:00
Count() uint64
Close() error
2014-04-17 22:55:53 +02:00
}
type DictEntry struct {
Term string
Count uint64
}
type FieldDict interface {
Next() (*DictEntry, error)
Close() error
2014-08-07 19:45:39 +02:00
}
// DocIDReader is the interface exposing enumeration of documents identifiers.
// Close the reader to release associated resources.
2014-09-04 01:53:59 +02:00
type DocIDReader interface {
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
// Next returns the next document internal identifier in the natural
// index order, or io.EOF when the end of the sequence is reached.
Next() (IndexInternalID, error)
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
// Advance resets the iteration to the first internal identifier greater than
// or equal to ID. If ID is smaller than the start of the range, the iteration
// will start there instead. If ID is greater than or equal to the end of
// the range, Next() call will return io.EOF.
major refactor of index/search API index id's are now opaque (until finally returned to top-level user) - the TermFieldDoc's returned by TermFieldReader no longer contain doc id - instead they return an opaque IndexInternalID - items returned are still in the "natural index order" - but that is no longer guaranteed to be "doc id order" - correct behavior requires that they all follow the same order - but not any particular order - new API FinalizeDocID which converts index internal ID's to public string ID - APIs used internally which previously took doc id now take IndexInternalID - that is DocumentFieldTerms() and DocumentFieldTermsForFields() - however, APIs that are used externally do not reflect this change - that is Document() - DocumentIDReader follows the same changes, but this is less obvious - behavior clarified, used to iterate doc ids, BUT NOT in doc id order - method STILL available to iterate doc ids in range - but again, you won't get them in any meaningful order - new method to iterate actual doc ids from list of possible ids - this was introduced to make the DocIDSearcher continue working searchers now work with the new opaque index internal doc ids - they return new DocumentMatchInternal (which does not have string ID) scorerers also work with these opaque index internal doc ids - they return DocumentMatchInternal (which does not have string ID) collectors now also perform a final step of converting the final result - they STILL return traditional DocumentMatch (with string ID) - but they now also require an IndexReader (so that they can do the conversion)
2016-07-31 19:46:18 +02:00
Advance(ID IndexInternalID) (IndexInternalID, error)
Close() error
}
type Batch struct {
IndexOps map[string]*document.Document
InternalOps map[string][]byte
}
func NewBatch() *Batch {
return &Batch{
IndexOps: make(map[string]*document.Document),
InternalOps: make(map[string][]byte),
}
}
func (b *Batch) Update(doc *document.Document) {
b.IndexOps[doc.ID] = doc
}
func (b *Batch) Delete(id string) {
b.IndexOps[id] = nil
}
func (b *Batch) SetInternal(key, val []byte) {
b.InternalOps[string(key)] = val
}
func (b *Batch) DeleteInternal(key []byte) {
b.InternalOps[string(key)] = nil
}
func (b *Batch) String() string {
rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
for k, v := range b.IndexOps {
if v != nil {
rv += fmt.Sprintf("\tINDEX - '%s'\n", k)
} else {
rv += fmt.Sprintf("\tDELETE - '%s'\n", k)
}
}
for k, v := range b.InternalOps {
if v != nil {
rv += fmt.Sprintf("\tSET INTERNAL - '%s'\n", k)
} else {
rv += fmt.Sprintf("\tDELETE INTERNAL - '%s'\n", k)
}
}
return rv
}
func (b *Batch) Reset() {
b.IndexOps = make(map[string]*document.Document)
b.InternalOps = make(map[string][]byte)
}