
Merge branch 'master' into documenting

Ben Campbell 2016-01-29 09:31:30 +13:00
commit 47dbd85551
172 changed files with 11050 additions and 3645 deletions

.gitignore
View File

@@ -16,3 +16,4 @@
/utils/bleve_registry/bleve_registry
/y.output
*.test
tags

View File

@@ -1,16 +1,18 @@
sudo: false
language: go
go:
- 1.4
- 1.5
script:
- go get golang.org/x/tools/cmd/vet
- go get golang.org/x/tools/cmd/cover
- go get github.com/mattn/goveralls
- go get github.com/kisielk/errcheck
- go test -v ./...
- go test -v ./test -indexType=firestorm
- go vet ./...
- errcheck $(go list ./... | grep -v bleve/http/mapping | grep -v bleve/index/store/metrics)
- errcheck ./...
- docs/project-code-coverage.sh
- docs/build_children.sh

View File

@@ -0,0 +1,47 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package web
import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/language/en"
"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
webt "github.com/blevesearch/bleve/analysis/tokenizers/web"
"github.com/blevesearch/bleve/registry"
)
const Name = "web"
func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
tokenizer, err := cache.TokenizerNamed(webt.Name)
if err != nil {
return nil, err
}
toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
if err != nil {
return nil, err
}
stopEnFilter, err := cache.TokenFilterNamed(en.StopName)
if err != nil {
return nil, err
}
rv := analysis.Analyzer{
Tokenizer: tokenizer,
TokenFilters: []analysis.TokenFilter{
toLowerFilter,
stopEnFilter,
},
}
return &rv, nil
}
func init() {
registry.RegisterAnalyzer(Name, AnalyzerConstructor)
}
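
Editor's note: a minimal usage sketch, not part of the commit, showing how the analyzer registered above can be obtained through the registry cache; the sample input is illustrative.

package main

import (
	"fmt"

	_ "github.com/blevesearch/bleve/analysis/analyzers/web" // runs the init above
	"github.com/blevesearch/bleve/registry"
)

func main() {
	cache := registry.NewCache()
	analyzer, err := cache.AnalyzerNamed("web") // the Name constant registered above
	if err != nil {
		panic(err)
	}
	// Tokenize, lowercase, and stop-filter some sample text.
	for _, tok := range analyzer.Analyze([]byte("Visit http://blevesearch.com today")) {
		fmt.Printf("%s [%d,%d)\n", tok.Term, tok.Start, tok.End)
	}
}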

View File

@@ -18,7 +18,7 @@ func BenchmarkAnalysis(b *testing.B) {
}
ts := analyzer.Analyze(bleveWikiArticle)
-freqs := analysis.TokenFrequency(ts, nil)
+freqs := analysis.TokenFrequency(ts, nil, true)
if len(freqs) != 511 {
b.Errorf("expected %d freqs, got %d", 511, len(freqs))
}

View File

@@ -26,6 +26,11 @@ type TokenLocation struct {
type TokenFreq struct {
Term []byte
Locations []*TokenLocation
frequency int
}
func (tf *TokenFreq) Frequency() int {
return tf.frequency
}
// TokenFrequencies maps document terms to their combined frequencies from all
@@ -42,35 +47,57 @@ func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies)
existingTf, exists := tfs[tfk]
if exists {
existingTf.Locations = append(existingTf.Locations, tf.Locations...)
existingTf.frequency = existingTf.frequency + tf.frequency
} else {
tfs[tfk] = tf
tfs[tfk] = &TokenFreq{
Term: tf.Term,
frequency: tf.frequency,
Locations: make([]*TokenLocation, len(tf.Locations)),
}
copy(tfs[tfk].Locations, tf.Locations)
}
}
}
-func TokenFrequency(tokens TokenStream, arrayPositions []uint64) TokenFrequencies {
+func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
rv := make(map[string]*TokenFreq, len(tokens))
for _, token := range tokens {
curr, ok := rv[string(token.Term)]
if ok {
curr.Locations = append(curr.Locations, &TokenLocation{
if includeTermVectors {
tls := make([]TokenLocation, len(tokens))
tlNext := 0
for _, token := range tokens {
tls[tlNext] = TokenLocation{
ArrayPositions: arrayPositions,
Start: token.Start,
End: token.End,
Position: token.Position,
})
} else {
rv[string(token.Term)] = &TokenFreq{
Term: token.Term,
Locations: []*TokenLocation{
&TokenLocation{
ArrayPositions: arrayPositions,
Start: token.Start,
End: token.End,
Position: token.Position,
},
},
}
curr, ok := rv[string(token.Term)]
if ok {
curr.Locations = append(curr.Locations, &tls[tlNext])
curr.frequency++
} else {
rv[string(token.Term)] = &TokenFreq{
Term: token.Term,
Locations: []*TokenLocation{&tls[tlNext]},
frequency: 1,
}
}
tlNext++
}
} else {
for _, token := range tokens {
curr, exists := rv[string(token.Term)]
if exists {
curr.frequency++
} else {
rv[string(token.Term)] = &TokenFreq{
Term: token.Term,
frequency: 1,
}
}
}
}
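
Editor's note: a small sketch, not part of the commit, of the two modes of the new TokenFrequency signature; with includeTermVectors=false only the frequency counter is maintained and no TokenLocation slices are allocated.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/analysis"
)

func main() {
	tokens := analysis.TokenStream{
		&analysis.Token{Term: []byte("go"), Start: 0, End: 2, Position: 1},
		&analysis.Token{Term: []byte("go"), Start: 3, End: 5, Position: 2},
	}
	counts := analysis.TokenFrequency(tokens, nil, false) // counts only
	vectors := analysis.TokenFrequency(tokens, nil, true) // counts plus locations
	fmt.Println(counts.Frequency, len(counts["go"].Locations))   // frequency tracked, 0 locations
	fmt.Println(vectors["go"].Frequency(), len(vectors["go"].Locations)) // 2 2
}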

View File

@@ -44,9 +44,10 @@ func TestTokenFrequency(t *testing.T) {
End: 11,
},
},
frequency: 2,
},
}
-result := TokenFrequency(tokens, nil)
+result := TokenFrequency(tokens, nil, true)
if !reflect.DeepEqual(result, expectedResult) {
t.Errorf("expected %#v, got %#v", expectedResult, result)
}

View File

@@ -26,29 +26,82 @@ func NewUnicodeTokenizer() *UnicodeTokenizer {
}
func (rt *UnicodeTokenizer) Tokenize(input []byte) analysis.TokenStream {
rvx := make([]analysis.TokenStream, 0, 10) // When rv gets full, append to rvx.
rv := make(analysis.TokenStream, 0, 1)
rv := make(analysis.TokenStream, 0)
ta := []analysis.Token(nil)
taNext := 0
segmenter := segment.NewWordSegmenterDirect(input)
start := 0
pos := 1
guessRemaining := func(end int) int {
avgSegmentLen := end / (len(rv) + 1)
if avgSegmentLen < 1 {
avgSegmentLen = 1
}
remainingLen := len(input) - end
return remainingLen / avgSegmentLen
}
for segmenter.Segment() {
segmentBytes := segmenter.Bytes()
end := start + len(segmentBytes)
if segmenter.Type() != segment.None {
token := analysis.Token{
Term: segmentBytes,
Start: start,
End: end,
Position: pos,
Type: convertType(segmenter.Type()),
if taNext >= len(ta) {
remainingSegments := guessRemaining(end)
if remainingSegments > 1000 {
remainingSegments = 1000
}
if remainingSegments < 1 {
remainingSegments = 1
}
ta = make([]analysis.Token, remainingSegments)
taNext = 0
}
rv = append(rv, &token)
token := &ta[taNext]
taNext++
token.Term = segmentBytes
token.Start = start
token.End = end
token.Position = pos
token.Type = convertType(segmenter.Type())
if len(rv) >= cap(rv) { // When rv is full, save it into rvx.
rvx = append(rvx, rv)
rvCap := cap(rv) * 2
if rvCap > 256 {
rvCap = 256
}
rv = make(analysis.TokenStream, 0, rvCap) // Next rv cap is bigger.
}
rv = append(rv, token)
pos++
}
start = end
}
if len(rvx) > 0 {
n := len(rv)
for _, r := range rvx {
n += len(r)
}
rall := make(analysis.TokenStream, 0, n)
for _, r := range rvx {
rall = append(rall, r...)
}
return append(rall, rv...)
}
return rv
}
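
Editor's note: the rewrite above avoids a single, repeatedly-reallocated token slice; tokens are backed by batch-allocated arrays sized by guessRemaining, and the result stream grows in doubling chunks capped at 256 that are concatenated exactly once. A standalone sketch of that chunked-append pattern, with deliberately tiny caps so the growth is visible:

package main

import "fmt"

func main() {
	var full [][]int           // completed chunks (plays the role of rvx)
	chunk := make([]int, 0, 1) // current chunk (plays the role of rv)
	for i := 0; i < 10; i++ {
		if len(chunk) >= cap(chunk) { // chunk is full: stash it, start a bigger one
			full = append(full, chunk)
			next := cap(chunk) * 2
			if next > 4 { // the tokenizer caps chunk growth at 256
				next = 4
			}
			chunk = make([]int, 0, next)
		}
		chunk = append(chunk, i)
	}
	n := len(chunk)
	for _, c := range full {
		n += len(c)
	}
	all := make([]int, 0, n) // single exact-size allocation for the final result
	for _, c := range full {
		all = append(all, c...)
	}
	fmt.Println(append(all, chunk...)) // [0 1 2 3 4 5 6 7 8 9]
}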

View File

@@ -0,0 +1,42 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package web
import (
"regexp"
"strings"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/tokenizers/exception"
"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
"github.com/blevesearch/bleve/registry"
)
const Name = "web"
var email = `(?:[a-z0-9!#$%&'*+/=?^_` + "`" + `{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_` + "`" + `{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])`
var url = `(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s` + "`" + `!()\[\]{};:'".,<>?«»“”‘’]))`
var twitterHandle = `@([a-zA-Z0-9_]){1,15}`
var twitterHashtag = `#([a-zA-Z0-9_])+`
var exceptions = []string{email, url, twitterHandle, twitterHashtag}
var exceptionsRegexp = regexp.MustCompile(strings.Join(exceptions, "|"))
func TokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
remainingTokenizer, err := cache.TokenizerNamed(unicode.Name)
if err != nil {
return nil, err
}
return exception.NewExceptionsTokenizer(exceptionsRegexp, remainingTokenizer), nil
}
func init() {
registry.RegisterTokenizer(Name, TokenizerConstructor)
}
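
Editor's note: a usage sketch, not part of the commit, for the composed tokenizer; spans matching the exceptions pattern (emails, URLs, @handles, #hashtags) pass through as single tokens, and everything else falls to the unicode tokenizer.

package main

import (
	"fmt"

	_ "github.com/blevesearch/bleve/analysis/tokenizers/web" // runs the init above
	"github.com/blevesearch/bleve/registry"
)

func main() {
	cache := registry.NewCache()
	tokenizer, err := cache.TokenizerNamed("web")
	if err != nil {
		panic(err)
	}
	for _, tok := range tokenizer.Tokenize([]byte("mail info@blevesearch.com or see #bleve")) {
		fmt.Printf("%q\n", tok.Term) // "mail" "info@blevesearch.com" "or" "see" "#bleve"
	}
}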

View File

@@ -0,0 +1,143 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package web
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func TestWeb(t *testing.T) {
tests := []struct {
input []byte
output analysis.TokenStream
}{
{
[]byte("Hello info@blevesearch.com"),
analysis.TokenStream{
{
Start: 0,
End: 5,
Term: []byte("Hello"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 6,
End: 26,
Term: []byte("info@blevesearch.com"),
Position: 2,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("That http://blevesearch.com"),
analysis.TokenStream{
{
Start: 0,
End: 4,
Term: []byte("That"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 5,
End: 27,
Term: []byte("http://blevesearch.com"),
Position: 2,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("Hey @blevesearch"),
analysis.TokenStream{
{
Start: 0,
End: 3,
Term: []byte("Hey"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 4,
End: 16,
Term: []byte("@blevesearch"),
Position: 2,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("This #bleve"),
analysis.TokenStream{
{
Start: 0,
End: 4,
Term: []byte("This"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 5,
End: 11,
Term: []byte("#bleve"),
Position: 2,
Type: analysis.AlphaNumeric,
},
},
},
{
[]byte("What about @blevesearch?"),
analysis.TokenStream{
{
Start: 0,
End: 4,
Term: []byte("What"),
Position: 1,
Type: analysis.AlphaNumeric,
},
{
Start: 5,
End: 10,
Term: []byte("about"),
Position: 2,
Type: analysis.AlphaNumeric,
},
{
Start: 11,
End: 23,
Term: []byte("@blevesearch"),
Position: 3,
Type: analysis.AlphaNumeric,
},
},
},
}
cache := registry.NewCache()
tokenizer, err := cache.TokenizerNamed(Name)
if err != nil {
t.Fatal(err)
}
for _, test := range tests {
actual := tokenizer.Tokenize(test.input)
if !reflect.DeepEqual(actual, test.output) {
t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
}
}
}

View File

@@ -28,6 +28,7 @@ const (
Shingle
Single
Double
Boolean
)
// Token represents one occurrence of a term at a particular location in a

View File

@@ -20,6 +20,8 @@ import (
"github.com/blevesearch/bleve/index/upside_down"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/search/highlight/highlighters/html"
_ "github.com/blevesearch/bleve/index/firestorm"
)
var bleveExpVar = expvar.NewMap("bleve")
@@ -64,6 +66,8 @@ func init() {
bootDuration := time.Since(bootStart)
bleveExpVar.Add("bootDuration", int64(bootDuration))
indexStats = NewIndexStats()
bleveExpVar.Set("indexes", indexStats)
}
var logger = log.New(ioutil.Discard, "bleve", log.LstdFlags)
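
Editor's note: with the index stats now published under the bleve expvar map, they are visible through the standard expvar API. A hedged sketch, not part of the commit; the blank import triggers the init above:

package main

import (
	"expvar"
	"fmt"

	_ "github.com/blevesearch/bleve"
)

func main() {
	if v := expvar.Get("bleve"); v != nil {
		fmt.Println(v.String()) // JSON including bootDuration and the new "indexes" entry
	}
}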

View File

@@ -35,6 +35,7 @@ import (
_ "github.com/blevesearch/bleve/analysis/analyzers/keyword_analyzer"
_ "github.com/blevesearch/bleve/analysis/analyzers/simple_analyzer"
_ "github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
_ "github.com/blevesearch/bleve/analysis/analyzers/web"
// token filters
_ "github.com/blevesearch/bleve/analysis/token_filters/apostrophe_filter"
@@ -55,6 +56,7 @@ import (
_ "github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
_ "github.com/blevesearch/bleve/analysis/tokenizers/single_token"
_ "github.com/blevesearch/bleve/analysis/tokenizers/unicode"
_ "github.com/blevesearch/bleve/analysis/tokenizers/web"
_ "github.com/blevesearch/bleve/analysis/tokenizers/whitespace_tokenizer"
// date time parsers
@@ -88,6 +90,7 @@ import (
_ "github.com/blevesearch/bleve/index/store/gtreap"
// index types
_ "github.com/blevesearch/bleve/index/firestorm"
_ "github.com/blevesearch/bleve/index/upside_down"
// byte array converters

View File

@@ -37,6 +37,7 @@ cat acc.out integration-acc.out | go run docs/merge-coverprofile.go > merged.out
if [ -n "$COVERALLS" ]
then
export GIT_BRANCH=$TRAVIS_BRANCH
goveralls -service drone.io -coverprofile=merged.out -repotoken $COVERALLS
fi

View File

@@ -17,6 +17,7 @@ type Document struct {
ID string `json:"id"`
Fields []Field `json:"fields"`
CompositeFields []*CompositeField
Number uint64 `json:"-"`
}
func NewDocument(id string) *Document {

document/field_boolean.go (new file)
View File

@@ -0,0 +1,93 @@
// Copyright (c) 2014 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package document
import (
"fmt"
"github.com/blevesearch/bleve/analysis"
)
const DefaultBooleanIndexingOptions = StoreField | IndexField
type BooleanField struct {
name string
arrayPositions []uint64
options IndexingOptions
value []byte
}
func (b *BooleanField) Name() string {
return b.name
}
func (b *BooleanField) ArrayPositions() []uint64 {
return b.arrayPositions
}
func (b *BooleanField) Options() IndexingOptions {
return b.options
}
func (b *BooleanField) Analyze() (int, analysis.TokenFrequencies) {
tokens := make(analysis.TokenStream, 0)
tokens = append(tokens, &analysis.Token{
Start: 0,
End: len(b.value),
Term: b.value,
Position: 1,
Type: analysis.Boolean,
})
fieldLength := len(tokens)
tokenFreqs := analysis.TokenFrequency(tokens, b.arrayPositions, b.options.IncludeTermVectors())
return fieldLength, tokenFreqs
}
func (b *BooleanField) Value() []byte {
return b.value
}
func (b *BooleanField) Boolean() (bool, error) {
if len(b.value) == 1 {
return b.value[0] == 'T', nil
}
return false, fmt.Errorf("boolean field has %d bytes", len(b.value))
}
func (b *BooleanField) GoString() string {
return fmt.Sprintf("&document.BooleanField{Name:%s, Options: %s, Value: %s}", b.name, b.options, b.value)
}
func NewBooleanFieldFromBytes(name string, arrayPositions []uint64, value []byte) *BooleanField {
return &BooleanField{
name: name,
arrayPositions: arrayPositions,
value: value,
options: DefaultNumericIndexingOptions,
}
}
func NewBooleanField(name string, arrayPositions []uint64, b bool) *BooleanField {
return NewBooleanFieldWithIndexingOptions(name, arrayPositions, b, DefaultNumericIndexingOptions)
}
func NewBooleanFieldWithIndexingOptions(name string, arrayPositions []uint64, b bool, options IndexingOptions) *BooleanField {
v := []byte("F")
if b {
v = []byte("T")
}
return &BooleanField{
name: name,
arrayPositions: arrayPositions,
value: v,
options: options,
}
}
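
Editor's note: a short sketch, not part of the commit, exercising the new field type; a boolean analyzes to a single "T"/"F" token.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/document"
)

func main() {
	f := document.NewBooleanField("published", nil, true)
	length, freqs := f.Analyze()
	fmt.Println(length) // 1: one token per boolean value
	for term, tf := range freqs {
		fmt.Printf("%q x%d\n", term, tf.Frequency()) // "T" x1
	}
	v, err := f.Boolean()
	fmt.Println(v, err) // true <nil>
}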

View File

@@ -75,7 +75,7 @@ func (n *DateTimeField) Analyze() (int, analysis.TokenFrequencies) {
}
fieldLength := len(tokens)
-tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions)
+tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions, n.options.IncludeTermVectors())
return fieldLength, tokenFreqs
}

View File

@@ -71,7 +71,7 @@ func (n *NumericField) Analyze() (int, analysis.TokenFrequencies) {
}
fieldLength := len(tokens)
-tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions)
+tokenFreqs := analysis.TokenFrequency(tokens, n.arrayPositions, n.options.IncludeTermVectors())
return fieldLength, tokenFreqs
}

View File

@@ -60,7 +60,7 @@ func (t *TextField) Analyze() (int, analysis.TokenFrequencies) {
}
}
fieldLength := len(tokens) // number of tokens in this doc field
-tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions)
+tokenFreqs := analysis.TokenFrequency(tokens, t.arrayPositions, t.options.IncludeTermVectors())
return fieldLength, tokenFreqs
}
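
Editor's note: the same one-line change lands in each field type, so a field's own indexing options now decide whether Analyze pays for term-vector bookkeeping. A sketch, not part of the commit, relying on the fallback visible in field_text.go where a text field constructed without an analyzer becomes a single whole-value token:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/document"
)

func main() {
	text := []byte("quick brown fox")
	withTV := document.NewTextFieldWithIndexingOptions(
		"body", nil, text, document.IndexField|document.IncludeTermVectors)
	withoutTV := document.NewTextFieldWithIndexingOptions(
		"body", nil, text, document.IndexField)
	_, f1 := withTV.Analyze()
	_, f2 := withoutTV.Analyze()
	fmt.Println(len(f1[string(text)].Locations)) // 1: location recorded
	fmt.Println(len(f2[string(text)].Locations)) // 0: counts only
}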

View File

@@ -64,6 +64,7 @@ func (h *CreateIndexHandler) ServeHTTP(w http.ResponseWriter, req *http.Request)
showError(w, req, fmt.Sprintf("error creating index: %v", err), 500)
return
}
newIndex.SetName(indexName)
RegisterIndexName(indexName, newIndex)
rv := struct {
Status string `json:"status"`

File diff suppressed because one or more lines are too long

View File

@@ -1,428 +0,0 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0 Unless required by
// applicable law or agreed to in writing, software distributed under
// the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
// OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and
// limitations under the License.
package mapping
//go:generate go-bindata-assetfs -pkg=mapping ./mapping_static/...
//go:generate go fmt .
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"sort"
"github.com/elazarl/go-bindata-assetfs"
"github.com/gorilla/mux"
"github.com/blevesearch/bleve"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
)
func AssetFS() *assetfs.AssetFS {
return assetFS()
}
// RegisterHandlers registers mapping handlers on a router at the
// given pathBase, such as at "/api".
func RegisterHandlers(router *mux.Router, pathBase string) {
router.HandleFunc(pathBase+"/_analyzerNames", ListAnalyzerNames).Methods("POST")
router.HandleFunc(pathBase+"/_datetimeParserNames", ListDateTimeParserNames).Methods("POST")
router.HandleFunc(pathBase+"/_charFilterNames", ListCharFilterNames).Methods("POST")
router.HandleFunc(pathBase+"/_charFilterTypes", ListCharFilterTypes).Methods("GET")
router.HandleFunc(pathBase+"/_tokenizerNames", ListTokenizerNames).Methods("POST")
router.HandleFunc(pathBase+"/_tokenizerTypes", ListTokenizerTypes).Methods("GET")
router.HandleFunc(pathBase+"/_tokenFilterNames", ListTokenFilterNames).Methods("POST")
router.HandleFunc(pathBase+"/_tokenFilterTypes", ListTokenFilterTypes).Methods("GET")
router.HandleFunc(pathBase+"/_tokenMapNames", ListTokenMapNames).Methods("POST")
router.HandleFunc(pathBase+"/_analyze", AnalyzerText).Methods("POST")
router.HandleFunc(pathBase+"/_validateMapping", ValidateMapping).Methods("POST")
}
func ListAnalyzerNames(w http.ResponseWriter, req *http.Request) {
indexMapping := bleve.NewIndexMapping()
// read the request body
requestBody, err := ioutil.ReadAll(req.Body)
if err != nil {
showError(w, req, fmt.Sprintf("error reading request body: %v", err), 400)
return
}
// interpret request body as index mapping
if len(requestBody) > 0 {
err := json.Unmarshal(requestBody, &indexMapping)
if err != nil {
showError(w, req, fmt.Sprintf("error parsing index mapping: %v", err), 400)
return
}
}
// built in analyzer names
_, analyzerNames := registry.AnalyzerTypesAndInstances()
// add custom analyzer names
for name := range indexMapping.CustomAnalysis.Analyzers {
analyzerNames = append(analyzerNames, name)
}
sort.Strings(analyzerNames)
rv := struct {
Status string `json:"status"`
Analyzers []string `json:"analyzers"`
}{
Status: "ok",
Analyzers: analyzerNames,
}
mustEncode(w, rv)
}
func AnalyzerText(w http.ResponseWriter, req *http.Request) {
// read the request body
requestBody, err := ioutil.ReadAll(req.Body)
if err != nil {
showError(w, req, fmt.Sprintf("error reading request body: %v", err), 400)
return
}
mapping := bleve.NewIndexMapping()
var analyzeRequest = struct {
Analyzer string `json:"analyzer"`
Text string `json:"text"`
Mapping *bleve.IndexMapping `json:"mapping"`
}{}
err = json.Unmarshal(requestBody, &analyzeRequest)
if err != nil {
showError(w, req, fmt.Sprintf("error parsing index mapping: %v", err), 400)
return
}
if analyzeRequest.Mapping != nil {
mapping = analyzeRequest.Mapping
}
ts, err := mapping.AnalyzeText(analyzeRequest.Analyzer, []byte(analyzeRequest.Text))
if err != nil {
showError(w, req, fmt.Sprintf("error analyzing text: %v", err), 400)
return
}
rv := struct {
Status string `json:"status"`
Text string `json:"text"`
TokenStream analysis.TokenStream `json:"token_stream"`
}{
Status: "ok",
Text: analyzeRequest.Text,
TokenStream: ts,
}
mustEncode(w, rv)
}
func ListDateTimeParserNames(w http.ResponseWriter, req *http.Request) {
indexMapping := bleve.NewIndexMapping()
// read the request body
requestBody, err := ioutil.ReadAll(req.Body)
if err != nil {
showError(w, req, fmt.Sprintf("error reading request body: %v", err), 400)
return
}
// interpret request body as index mapping
if len(requestBody) > 0 {
err := json.Unmarshal(requestBody, &indexMapping)
if err != nil {
showError(w, req, fmt.Sprintf("error parsing index mapping: %v", err), 400)
return
}
}
// built in char filter names
_, dateTimeParserNames := registry.DateTimeParserTypesAndInstances()
// add custom date time parser names
for name := range indexMapping.CustomAnalysis.DateTimeParsers {
dateTimeParserNames = append(dateTimeParserNames, name)
}
sort.Strings(dateTimeParserNames)
rv := struct {
Status string `json:"status"`
DateTimeParsers []string `json:"datetime_parsers"`
}{
Status: "ok",
DateTimeParsers: dateTimeParserNames,
}
mustEncode(w, rv)
}
func ListCharFilterNames(w http.ResponseWriter, req *http.Request) {
indexMapping := bleve.NewIndexMapping()
// read the request body
requestBody, err := ioutil.ReadAll(req.Body)
if err != nil {
showError(w, req, fmt.Sprintf("error reading request body: %v", err), 400)
return
}
// interpret request body as index mapping
if len(requestBody) > 0 {
err := json.Unmarshal(requestBody, &indexMapping)
if err != nil {
showError(w, req, fmt.Sprintf("error parsing index mapping: %v", err), 400)
return
}
}
// built in char filter names
_, charFilterNames := registry.CharFilterTypesAndInstances()
// add custom char filter names
for name := range indexMapping.CustomAnalysis.CharFilters {
charFilterNames = append(charFilterNames, name)
}
sort.Strings(charFilterNames)
rv := struct {
Status string `json:"status"`
CharFilters []string `json:"char_filters"`
}{
Status: "ok",
CharFilters: charFilterNames,
}
mustEncode(w, rv)
}
func ListCharFilterTypes(w http.ResponseWriter, req *http.Request) {
// built in char filter names
charFilterTypes, _ := registry.CharFilterTypesAndInstances()
sort.Strings(charFilterTypes)
rv := struct {
Status string `json:"status"`
CharFilterTypes []string `json:"char_filter_types"`
}{
Status: "ok",
CharFilterTypes: charFilterTypes,
}
mustEncode(w, rv)
}
func ListTokenizerNames(w http.ResponseWriter, req *http.Request) {
indexMapping := bleve.NewIndexMapping()
// read the request body
requestBody, err := ioutil.ReadAll(req.Body)
if err != nil {
showError(w, req, fmt.Sprintf("error reading request body: %v", err), 400)
return
}
// interpret request body as index mapping
if len(requestBody) > 0 {
err := json.Unmarshal(requestBody, &indexMapping)
if err != nil {
showError(w, req, fmt.Sprintf("error parsing index mapping: %v", err), 400)
return
}
}
// built in char filter names
_, tokenizerNames := registry.TokenizerTypesAndInstances()
// add custom char filter names
for name := range indexMapping.CustomAnalysis.Tokenizers {
tokenizerNames = append(tokenizerNames, name)
}
sort.Strings(tokenizerNames)
rv := struct {
Status string `json:"status"`
Tokenizers []string `json:"tokenizers"`
}{
Status: "ok",
Tokenizers: tokenizerNames,
}
mustEncode(w, rv)
}
func ListTokenizerTypes(w http.ResponseWriter, req *http.Request) {
// built in char filter names
tokenizerTypes, _ := registry.TokenizerTypesAndInstances()
sort.Strings(tokenizerTypes)
rv := struct {
Status string `json:"status"`
TokenizerTypes []string `json:"tokenizer_types"`
}{
Status: "ok",
TokenizerTypes: tokenizerTypes,
}
mustEncode(w, rv)
}
func ListTokenFilterNames(w http.ResponseWriter, req *http.Request) {
indexMapping := bleve.NewIndexMapping()
// read the request body
requestBody, err := ioutil.ReadAll(req.Body)
if err != nil {
showError(w, req, fmt.Sprintf("error reading request body: %v", err), 400)
return
}
// interpret request body as index mapping
if len(requestBody) > 0 {
err := json.Unmarshal(requestBody, &indexMapping)
if err != nil {
showError(w, req, fmt.Sprintf("error parsing index mapping: %v", err), 400)
return
}
}
// built in char filter names
_, tokenFilterNames := registry.TokenFilterTypesAndInstances()
// add custom char filter names
for name := range indexMapping.CustomAnalysis.TokenFilters {
tokenFilterNames = append(tokenFilterNames, name)
}
sort.Strings(tokenFilterNames)
rv := struct {
Status string `json:"status"`
TokenFilters []string `json:"token_filters"`
}{
Status: "ok",
TokenFilters: tokenFilterNames,
}
mustEncode(w, rv)
}
func ListTokenFilterTypes(w http.ResponseWriter, req *http.Request) {
// built in char filter names
tokenFilterTypes, _ := registry.TokenFilterTypesAndInstances()
sort.Strings(tokenFilterTypes)
rv := struct {
Status string `json:"status"`
TokenFilterTypes []string `json:"token_filter_types"`
}{
Status: "ok",
TokenFilterTypes: tokenFilterTypes,
}
mustEncode(w, rv)
}
func ListTokenMapNames(w http.ResponseWriter, req *http.Request) {
indexMapping := bleve.NewIndexMapping()
// read the request body
requestBody, err := ioutil.ReadAll(req.Body)
if err != nil {
showError(w, req, fmt.Sprintf("error reading request body: %v", err), 400)
return
}
// interpret request body as index mapping
if len(requestBody) > 0 {
err := json.Unmarshal(requestBody, &indexMapping)
if err != nil {
showError(w, req, fmt.Sprintf("error parsing index mapping: %v", err), 400)
return
}
}
// built in char filter names
_, tokenMapNames := registry.TokenMapTypesAndInstances()
// add custom char map names
for name := range indexMapping.CustomAnalysis.TokenMaps {
tokenMapNames = append(tokenMapNames, name)
}
sort.Strings(tokenMapNames)
rv := struct {
Status string `json:"status"`
TokenMaps []string `json:"token_maps"`
}{
Status: "ok",
TokenMaps: tokenMapNames,
}
mustEncode(w, rv)
}
func ValidateMapping(w http.ResponseWriter, req *http.Request) {
indexMapping := bleve.NewIndexMapping()
// read the request body
requestBody, err := ioutil.ReadAll(req.Body)
if err != nil {
showError(w, req, fmt.Sprintf("error reading request body: %v", err), 400)
return
}
// interpret request body as index mapping
if len(requestBody) > 0 {
err := json.Unmarshal(requestBody, &indexMapping)
if err != nil {
showError(w, req, fmt.Sprintf("error parsing index mapping: %v", err), 400)
return
}
}
rv := struct {
Status string `json:"status"`
}{
Status: "ok",
}
mustEncode(w, rv)
}
func showError(w http.ResponseWriter, r *http.Request,
msg string, code int) {
log.Printf("Reporting error %v/%v", code, msg)
http.Error(w, msg, code)
}
func mustEncode(w io.Writer, i interface{}) {
if headered, ok := w.(http.ResponseWriter); ok {
headered.Header().Set("Cache-Control", "no-cache")
headered.Header().Set("Content-type", "application/json")
}
e := json.NewEncoder(w)
if err := e.Encode(i); err != nil {
panic(err)
}
}

View File

@@ -1,144 +0,0 @@
var AnalyzerModalCtrl = function ($scope, $modalInstance, $http, name, value, mapping) {
$scope.origName = name;
$scope.name = name;
$scope.errorMessage = "";
$scope.formpath = "";
$scope.mapping = mapping;
$scope.analyzer = {};
// copy in value for editing
for (var k in value) {
// need deeper copy of nested arrays
if (k == "char_filters") {
newcharfilters = [];
for (var cfi in value.char_filters) {
newcharfilters.push(value.char_filters[cfi]);
}
$scope.analyzer.char_filters = newcharfilters;
} else if (k == "token_filters") {
newtokenfilters = [];
for (var tfi in value.token_filters) {
newtokenfilters.push(value.token_filters[tfi]);
}
$scope.analyzer.token_filters = newtokenfilters;
} else {
$scope.analyzer[k] = value[k];
}
}
$scope.tokenizerNames = [];
$scope.loadTokenizerNames = function() {
$http.post('/api/_tokenizerNames',mapping).success(function(data) {
$scope.tokenizerNames = data.tokenizers;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
$scope.loadTokenizerNames();
$scope.charFilterNames = [];
$scope.loadCharFilterNames = function() {
$http.post('/api/_charFilterNames',mapping).success(function(data) {
$scope.charFilterNames = data.char_filters;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
$scope.loadCharFilterNames();
$scope.addCharFilter = function(scope) {
filter = scope.addCharacterFilterName;
if (filter !== undefined && filter !== "") {
$scope.selectedAnalyzer.char_filters.push(filter);
}
console.log($scope.selectedAnalyzer.char_filters);
};
$scope.removeCharFilter = function(index) {
$scope.selectedAnalyzer.char_filters.splice(index, 1);
};
$scope.tokenFilterNames = [];
$scope.loadTokenFilterNames = function() {
$http.post('/api/_tokenFilterNames',mapping).success(function(data) {
$scope.tokenFilterNames = data.token_filters;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
$scope.loadTokenFilterNames();
$scope.addCharFilter = function(scope) {
filter = scope.addCharacterFilterName;
if (filter !== undefined && filter !== "") {
$scope.analyzer.char_filters.push(filter);
}
console.log($scope.analyzer.char_filters);
};
$scope.removeCharFilter = function(index) {
$scope.analyzer.char_filters.splice(index, 1);
};
$scope.addTokenFilter = function(scope) {
filter = scope.addTokenFilterName;
if (filter !== undefined && filter !== "") {
$scope.analyzer.token_filters.push(filter);
}
console.log($scope.analyzer.token_filters);
};
$scope.removeTokenFilter = function(index) {
$scope.analyzer.token_filters.splice(index, 1);
};
$scope.cancel = function () {
$modalInstance.dismiss('cancel');
};
$scope.build = function() {
// must have a name
if (!$scope.name) {
$scope.errorMessage = "Name is required";
return;
}
// name must not already be used
if ($scope.name != $scope.origName && $scope.mapping.analysis.analyzers[$scope.name]) {
$scope.errorMessage = "Analyzer named '" + $scope.name + "' already exists";
return;
}
// ensure that this new mapping component is valid
analysis = {};
for (var ak in $scope.mapping.analysis) {
analysis[ak] = $scope.mapping.analysis[ak];
}
analyzers = {};
analyzers[$scope.name] = $scope.analyzer;
analysis["analyzers"] = analyzers;
testMapping = {
"analysis": analysis
};
$http.post('/api/_validateMapping',testMapping).success(function(data) {
// if it's valid return it
result = {};
result[$scope.name] = $scope.analyzer;
$modalInstance.close(result);
}).
error(function(data, code) {
// otherwise display error
$scope.errorMessage = data;
});
};
};

View File

@@ -1,105 +0,0 @@
var CharFilterModalCtrl = function ($scope, $modalInstance, $http, name, value, mapping) {
$scope.origName = name;
$scope.name = name;
$scope.errorMessage = "";
$scope.formpath = "";
$scope.mapping = mapping;
$scope.charfilter = {};
// copy in value for editing
for (var k in value) {
$scope.charfilter[k] = value[k];
}
$scope.unknownCharFilterTypeTemplate = "/static/partials/analysis/charfilters/generic.html";
$scope.charFilterTypeTemplates = {
"regexp": "/static/partials/analysis/charfilters/regexp.html",
};
$scope.charFilterTypeDefaults = {
"regexp": function() {
return {
"regexp": "",
"replace": ""
};
}
};
$scope.charFilterTypes = [];
updateCharFilterTypes = function() {
$http.get('/api/_charFilterTypes').success(function(data) {
$scope.charFilterTypes = data.char_filter_types;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
updateCharFilterTypes();
if (!$scope.charfilter.type) {
defaultType = "regexp";
if ($scope.charFilterTypeDefaults[defaultType]) {
$scope.charfilter = $scope.charFilterTypeDefaults[defaultType]();
}
else {
$scope.charfilter = {};
}
$scope.charfilter.type = defaultType;
}
$scope.formpath = $scope.charFilterTypeTemplates[$scope.charfilter.type];
$scope.charFilterTypeChange = function() {
newType = $scope.charfilter.type;
if ($scope.charFilterTypeDefaults[$scope.charfilter.type]) {
$scope.charfilter = $scope.charFilterTypeDefaults[$scope.charfilter.type]();
} else {
$scope.charfilter = {};
}
$scope.charfilter.type = newType;
if ($scope.charFilterTypeTemplates[$scope.charfilter.type]) {
$scope.formpath = $scope.charFilterTypeTemplates[$scope.charfilter.type];
} else {
$scope.formpath = $scope.unknownCharFilterTypeTemplate;
}
};
$scope.cancel = function () {
$modalInstance.dismiss('cancel');
};
$scope.build = function() {
// must have a name
if (!$scope.name) {
$scope.errorMessage = "Name is required";
return;
}
// name must not already be used
if ($scope.name != $scope.origName && $scope.mapping.analysis.char_filters[$scope.name]) {
$scope.errorMessage = "Character filter named '" + $scope.name + "' already exists";
return;
}
// ensure that this new mapping component is valid
charFilters = {};
charFilters[$scope.name] = $scope.charfilter;
testMapping = {
"analysis": {
"char_filters": charFilters
}
};
$http.post('/api/_validateMapping',testMapping).success(function(data) {
// if it's valid return it
result = {};
result[$scope.name] = $scope.charfilter;
$modalInstance.close(result);
}).
error(function(data, code) {
// otherwise display error
$scope.errorMessage = data;
});
};
};

View File

@@ -1,179 +0,0 @@
var TokenFilterModalCtrl = function ($scope, $modalInstance, $http, name, value, mapping) {
$scope.origName = name;
$scope.name = name;
$scope.errorMessage = "";
$scope.formpath = "";
$scope.mapping = mapping;
$scope.tokenfilter = {};
// copy in value for editing
for (var k in value) {
$scope.tokenfilter[k] = value[k];
}
$scope.tokenMapNames = [];
$scope.loadTokenMapNames = function() {
$http.post('/api/_tokenMapNames',mapping).success(function(data) {
$scope.tokenMapNames = data.token_maps;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
$scope.loadTokenMapNames();
$scope.unknownTokenFilterTypeTemplate = "/static/partials/analysis/tokenfilters/generic.html";
$scope.tokenFilterTypeTemplates = {
"dict_compound": "/static/partials/analysis/tokenfilters/dict_compound.html",
"edge_ngram": "/static/partials/analysis/tokenfilters/edge_ngram.html",
"elision": "/static/partials/analysis/tokenfilters/elision.html",
"keyword_marker": "/static/partials/analysis/tokenfilters/keyword_marker.html",
"length": "/static/partials/analysis/tokenfilters/length.html",
"ngram": "/static/partials/analysis/tokenfilters/ngram.html",
"normalize_unicode": "/static/partials/analysis/tokenfilters/normalize_unicode.html",
"shingle": "/static/partials/analysis/tokenfilters/shingle.html",
"stop_tokens": "/static/partials/analysis/tokenfilters/stop_tokens.html",
"truncate_token": "/static/partials/analysis/tokenfilters/truncate_token.html",
};
$scope.tokenFilterTypeDefaults = {
"dict_compound": function() {
return {
"dict_token_map": $scope.tokenMapNames[0]
};
},
"edge_ngram": function() {
return {
"edge": "front",
"min": 3,
"max": 3,
};
},
"elision": function() {
return {
"articles_token_map": $scope.tokenMapNames[0]
};
},
"keyword_marker": function() {
return {
"keywords_token_map": $scope.tokenMapNames[0]
};
},
"length": function() {
return {
"min": 3,
"max": 255
};
},
"ngram": function() {
return {
"min": 3,
"max": 3
};
},
"normalize_unicode": function() {
return {
"form": "nfc"
};
},
"shingle": function() {
return {
"min": 2,
"max": 2,
"output_original": false,
"separator": "",
"filler": ""
};
},
"stop_tokens": function() {
return {
"stop_token_map": $scope.tokenMapNames[0]
};
},
"truncate_token": function() {
return {
"length": 25
};
},
};
$scope.tokenFilterTypes = [];
updateTokenFilterTypes = function() {
$http.get('/api/_tokenFilterTypes').success(function(data) {
$scope.tokenFilterTypes = data.token_filter_types;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
updateTokenFilterTypes();
if (!$scope.tokenfilter.type) {
defaultType = "length";
if ($scope.tokenFilterTypeDefaults[defaultType]) {
$scope.tokenfilter = $scope.tokenFilterTypeDefaults[defaultType]();
}
else {
$scope.tokenfilter = {};
}
$scope.tokenfilter.type = defaultType;
}
$scope.formpath = $scope.tokenFilterTypeTemplates[$scope.tokenfilter.type];
$scope.tokenFilterTypeChange = function() {
newType = $scope.tokenfilter.type;
if ($scope.tokenFilterTypeDefaults[$scope.tokenfilter.type]) {
$scope.tokenfilter = $scope.tokenFilterTypeDefaults[$scope.tokenfilter.type]();
} else {
$scope.tokenfilter = {};
}
$scope.tokenfilter.type = newType;
if ($scope.tokenFilterTypeTemplates[$scope.tokenfilter.type]) {
$scope.formpath = $scope.tokenFilterTypeTemplates[$scope.tokenfilter.type];
} else {
$scope.formpath = $scope.unknownTokenFilterTypeTemplate;
}
};
$scope.cancel = function () {
$modalInstance.dismiss('cancel');
};
$scope.build = function() {
// must have a name
if (!$scope.name) {
$scope.errorMessage = "Name is required";
return;
}
// name must not already be used
if ($scope.name != $scope.origName && $scope.mapping.analysis.token_filters[$scope.name]) {
$scope.errorMessage = "Token filter named '" + $scope.name + "' already exists";
return;
}
// ensure that this new mapping component is valid
tokenfilters = {};
tokenfilters[$scope.name] = $scope.tokenfilter;
testMapping = {
"analysis": {
"token_filters": tokenfilters,
"token_maps": $scope.mapping.analysis.token_maps
}
};
$http.post('/api/_validateMapping',testMapping).success(function(data) {
// if it's valid return it
result = {};
result[$scope.name] = $scope.tokenfilter;
$modalInstance.close(result);
}).
error(function(data, code) {
// otherwise display error
$scope.errorMessage = data;
});
};
};

View File

@@ -1,138 +0,0 @@
var TokenizerModalCtrl = function ($scope, $modalInstance, $http, name, value, mapping) {
$scope.origName = name;
$scope.name = name;
$scope.errorMessage = "";
$scope.formpath = "";
$scope.mapping = mapping;
$scope.tokenizer = {};
// copy in value for editing
for (var k in value) {
$scope.tokenizer[k] = value[k];
}
$scope.tokenizerNames = [];
$scope.loadTokenizerNames = function() {
$http.post('/api/_tokenizerNames',mapping).success(function(data) {
$scope.tokenizerNames = data.tokenizers;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
$scope.loadTokenizerNames();
$scope.unknownTokenizerTypeTemplate = "/static/partials/analysis/tokenizers/generic.html";
$scope.tokenizerTypeTemplates = {
"regexp": "/static/partials/analysis/tokenizers/regexp.html",
"exception": "/static/partials/analysis/tokenizers/exception.html"
};
$scope.tokenizerTypeDefaults = {
"regexp": function() {
return {
"regexp": ""
};
},
"exception": function() {
return {
"exceptions": [],
"tokenizer": "unicode"
};
}
};
$scope.tokenizerTypes = [];
updateTokenizerTypes = function() {
$http.get('/api/_tokenizerTypes').success(function(data) {
$scope.tokenizerTypes = data.tokenizer_types;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
updateTokenizerTypes();
if (!$scope.tokenizer.type) {
defaultType = "regexp";
if ($scope.tokenizerTypeDefaults[defaultType]) {
$scope.tokenizer = $scope.tokenizerTypeDefaults[defaultType]();
}
else {
$scope.tokenizer = {};
}
$scope.tokenizer.type = defaultType;
}
$scope.formpath = $scope.tokenizerTypeTemplates[$scope.tokenizer.type];
$scope.tokenizerTypeChange = function() {
newType = $scope.tokenizer.type;
if ($scope.tokenizerTypeDefaults[$scope.tokenizer.type]) {
$scope.tokenizer = $scope.tokenizerTypeDefaults[$scope.tokenizer.type]();
} else {
$scope.tokenizer = {};
}
$scope.tokenizer.type = newType;
if ($scope.tokenizerTypeTemplates[$scope.tokenizer.type]) {
$scope.formpath = $scope.tokenizerTypeTemplates[$scope.tokenizer.type];
} else {
$scope.formpath = $scope.unknownTokenizerTypeTemplate;
}
};
$scope.addException = function(scope) {
if (scope.newregexp) {
$scope.tokenizer.exceptions.push(scope.newregexp);
scope.newregexp = "";
}
};
$scope.removeException = function(index) {
$scope.tokenizer.exceptions.splice(index, 1);
};
$scope.cancel = function () {
$modalInstance.dismiss('cancel');
};
$scope.build = function() {
// must have a name
if (!$scope.name) {
$scope.errorMessage = "Name is required";
return;
}
// name must not already be used
if ($scope.name != $scope.origName && $scope.mapping.analysis.tokenizers[$scope.name]) {
$scope.errorMessage = "Tokenizer named '" + $scope.name + "' already exists";
return;
}
// ensure that this new mapping component is valid
tokenizers = {};
tokenizers[$scope.name] = $scope.tokenizer;
// add in all the existing tokenizers, since we might be referencing them
for (var t in $scope.mapping.analysis.tokenizers) {
tokenizers[t] = $scope.mapping.analysis.tokenizers[t];
}
testMapping = {
"analysis": {
"tokenizers": tokenizers
}
};
$http.post('/api/_validateMapping',testMapping).success(function(data) {
// if it's valid return it
result = {};
result[$scope.name] = $scope.tokenizer;
$modalInstance.close(result);
}).
error(function(data, code) {
// otherwise display error
$scope.errorMessage = data;
});
};
};

View File

@@ -1,51 +0,0 @@
var WordListModalCtrl = function ($scope, $modalInstance, name, words, mapping) {
$scope.name = name;
$scope.origName = name;
$scope.errorMessage = "";
$scope.newWord = "";
$scope.words = words.slice(0); // create copy
$scope.selectedWords = [];
$scope.mapping = mapping;
$scope.cancel = function () {
$modalInstance.dismiss('cancel');
};
$scope.addWord = function() {
if ($scope.newWord) {
$scope.words.push($scope.newWord);
$scope.newWord = "";
}
};
$scope.removeWord = function() {
// sort the selected word indexes into descending order
// so we can delete items without having to adjust indexes
$scope.selectedWords.sort(function(a,b){ return b - a; });
for (var index in $scope.selectedWords) {
$scope.words.splice($scope.selectedWords[index], 1);
}
$scope.selectedWords = [];
};
$scope.build = function() {
// must have a name
if (!$scope.name) {
$scope.errorMessage = "Name is required";
return;
}
// name must not already be used
if ($scope.name != $scope.origName && $scope.mapping.analysis.token_maps[$scope.name]) {
$scope.errorMessage = "Word list named '" + $scope.name + "' already exists";
return;
}
result = {};
result[$scope.name] = {
"type": "custom",
"tokens": $scope.words
};
$modalInstance.close(result);
};
};

View File

@@ -1,379 +0,0 @@
// controller responsible for building a custom analysis components
function AnalysisCtrl($scope, $http, $routeParams, $log, $sce, $location, $modal) {
// analyzers
$scope.newAnalyzer = function () {
return $scope.editAnalyzer("", {
"type": "custom",
"char_filters": [],
"tokenizer": "unicode",
"token_filters": []
});
};
$scope.deleteAnalyzer = function (name) {
used = $scope.isAnalyzerUsed(name);
if (used) {
alert("This analyzer cannot be deleted because it is being used by the " + used + ".");
return;
}
if (confirm("Are you sure you want to delete '" + name + "'?")) {
delete $scope.$parent.mapping.analysis.analyzers[name];
}
};
$scope.isAnalyzerUsed = function(name) {
// analyzers are used in mappings (in various places)
// first check index level default analyzer
if ($scope.$parent.mapping.default_analyzer == name) {
return "index mapping default analyzer";
}
// then check the default document mapping
used = $scope.isAnalyzerUsedInDocMapping(name, $scope.$parent.mapping.default_mapping, "");
if (used) {
return "default document mapping " + used;
}
// then check the document mapping for each type
for (var docType in $scope.$parent.mapping.types) {
docMapping = $scope.$parent.mapping.types[docType];
used = $scope.isAnalyzerUsedInDocMapping(name, docMapping, "");
if (used) {
return "document mapping type '" + docType + "' ";
}
}
return null;
};
// a recursive helper
$scope.isAnalyzerUsedInDocMapping = function(name, docMapping, path) {
// first check the document level default analyzer
if (docMapping.default_analyzer == name) {
if (path) {
return "default analyzer at " + path;
} else {
return "default analyzer";
}
}
// now check fields at this level
for (var fieldIndex in docMapping.fields) {
field = docMapping.fields[fieldIndex];
if (field.analyzer == name) {
if (field.name) {
return "in the field named " + field.name;
}
return "in the field at path " + path;
}
}
// now check each nested property
for (var propertyName in docMapping.properties) {
subDoc = docMapping.properties[propertyName];
if (path) {
return $scope.isAnalyzerUsedInDocMapping(name, subDoc, path+"."+propertyName);
} else {
return $scope.isAnalyzerUsedInDocMapping(name, subDoc, propertyName);
}
}
return null;
};
$scope.editAnalyzer = function (name, value) {
var modalInstance = $modal.open({
animation: $scope.animationsEnabled,
templateUrl: '/static/partials/analysis/analyzer.html',
controller: 'AnalyzerModalCtrl',
resolve: {
name: function () {
return name;
},
value: function () {
return value;
},
mapping: function() {
return $scope.$parent.mapping;
}
}
});
modalInstance.result.then(function (result) {
// add this result to the mapping
for (var resultKey in result) {
if (name !== "" && resultKey != name) {
// remove the old name
delete $scope.$parent.mapping.analysis.analyzers[name];
}
$scope.$parent.mapping.analysis.analyzers[resultKey] = result[resultKey];
// reload parent available analyzers
$scope.$parent.loadAnalyzerNames();
}
}, function () {
$log.info('Modal dismissed at: ' + new Date());
});
};
// word lists
$scope.newWordList = function () {
return $scope.editWordList("", {tokens:[]});
};
$scope.deleteWordList = function (name) {
used = $scope.isWordListUsed(name);
if (used) {
alert("This word list cannot be deleted because it is being used by the " + used + ".");
return;
}
if (confirm("Are you sure you want to delete '" + name + "'?")) {
delete $scope.$parent.mapping.analysis.token_maps[name];
}
};
$scope.isWordListUsed = function(name) {
// word lists are only used by token filters
for (var tokenFilterName in $scope.$parent.mapping.analysis.token_filters) {
tokenFilter = $scope.$parent.mapping.analysis.token_filters[tokenFilterName];
// word lists are embedded in a variety of different field names
if (tokenFilter.dict_token_map == name ||
tokenFilter.articles_token_map == name ||
tokenFilter.keywords_token_map == name ||
tokenFilter.stop_token_map == name) {
return "token filter named '" + tokenFilterName + "'";
}
}
return null;
};
$scope.editWordList = function (name, value) {
var modalInstance = $modal.open({
animation: $scope.animationsEnabled,
templateUrl: '/static/partials/analysis/wordlist.html',
controller: 'WordListModalCtrl',
resolve: {
name: function () {
return name;
},
words: function () {
return value.tokens;
},
mapping: function() {
return $scope.$parent.mapping;
}
}
});
modalInstance.result.then(function (result) {
// add this result to the mapping
for (var resultKey in result) {
if (name !== "" && resultKey != name) {
// remove the old name
delete $scope.$parent.mapping.analysis.token_maps[name];
}
$scope.$parent.mapping.analysis.token_maps[resultKey] = result[resultKey];
}
}, function () {
$log.info('Modal dismissed at: ' + new Date());
});
};
// character filters
$scope.newCharFilter = function() {
return $scope.editCharFilter("", {});
};
$scope.deleteCharFilter = function(name) {
used = $scope.isCharFilterUsed(name);
if (used) {
alert("This character filter cannot be deleted because it is being used by the " + used + ".");
return;
}
if (confirm("Are you sure you want to delete '" + name + "'?")) {
delete $scope.$parent.mapping.analysis.char_filters[name];
}
};
$scope.isCharFilterUsed = function(name) {
// character filters can only be used by analyzers
for (var analyzerName in $scope.$parent.mapping.analysis.analyzers) {
analyzer = $scope.$parent.mapping.analysis.analyzers[analyzerName];
for (var charFilterIndex in analyzer.char_filters) {
charFilterName = analyzer.char_filters[charFilterIndex];
if (charFilterName == name) {
return "analyzer named '" + analyzerName + "'";
}
}
}
return null;
};
$scope.editCharFilter = function (name, value) {
var modalInstance = $modal.open({
animation: $scope.animationsEnabled,
templateUrl: '/static/partials/analysis/charfilter.html',
controller: 'CharFilterModalCtrl',
resolve: {
name: function () {
return name;
},
value: function () {
return value;
},
mapping: function() {
return $scope.$parent.mapping;
}
}
});
modalInstance.result.then(function (result) {
// add this result to the mapping
for (var resultKey in result) {
if (name !== "" && resultKey != name) {
// remove the old name
delete $scope.$parent.mapping.analysis.char_filters[name];
}
$scope.$parent.mapping.analysis.char_filters[resultKey] = result[resultKey];
}
}, function () {
$log.info('Modal dismissed at: ' + new Date());
});
};
// tokenizers
$scope.newTokenizer = function () {
return $scope.editTokenizer("", {});
};
$scope.deleteTokenizer = function (name) {
used = $scope.isTokenizerUsed(name);
if (used) {
alert("This tokenizer cannot be deleted because it is being used by the " + used + ".");
return;
}
if (confirm("Are you sure you want to delete '" + name + "'?")) {
delete $scope.$parent.mapping.analysis.tokenizers[name];
}
};
$scope.isTokenizerUsed = function(name) {
// tokenizers can be used by *other* tokenizers
for (var tokenizerName in $scope.$parent.mapping.analysis.tokenizers) {
tokenizer = $scope.$parent.mapping.analysis.tokenizers[tokenizerName];
if (tokenizer.tokenizer == name) {
return "tokenizer named '" + tokenizerName + "'";
}
}
// tokenizers can be used by analyzers
for (var analyzerName in $scope.$parent.mapping.analysis.analyzers) {
analyzer = $scope.$parent.mapping.analysis.analyzers[analyzerName];
if (analyzer.tokenizer == name) {
return "analyzer named '" + analyzerName + "'";
}
}
return null;
};
$scope.editTokenizer = function (name, value) {
var modalInstance = $modal.open({
animation: $scope.animationsEnabled,
templateUrl: '/static/partials/analysis/tokenizer.html',
controller: 'TokenizerModalCtrl',
resolve: {
name: function () {
return name;
},
value: function () {
return value;
},
mapping: function() {
return $scope.$parent.mapping;
}
}
});
modalInstance.result.then(function (result) {
// add this result to the mapping
for (var resultKey in result) {
if (name !== "" && resultKey != name) {
// remove the old name
delete $scope.$parent.mapping.analysis.tokenizers[name];
}
$scope.$parent.mapping.analysis.tokenizers[resultKey] = result[resultKey];
}
}, function () {
$log.info('Modal dismissed at: ' + new Date());
});
};
// token filters
$scope.newTokenFilter = function () {
return $scope.editTokenFilter("", {});
};
$scope.deleteTokenFilter = function (name) {
used = $scope.isTokenFilterUsed(name);
if (used) {
alert("This token filter cannot be deleted because it is being used by the " + used + ".");
return;
}
if (confirm("Are you sure you want to delete '" + name + "'?")) {
delete $scope.$parent.mapping.analysis.token_filters[name];
}
};
$scope.isTokenFilterUsed = function(name) {
// token filters can only be used by analyzers
for (var analyzerName in $scope.$parent.mapping.analysis.analyzers) {
analyzer = $scope.$parent.mapping.analysis.analyzers[analyzerName];
for (var tokenFilterIndex in analyzer.token_filters) {
tokenFilterName = analyzer.token_filters[tokenFilterIndex];
if (tokenFilterName == name) {
return "analyzer named '" + analyzerName + "'";
}
}
}
return null;
};
$scope.editTokenFilter = function (name, value) {
var modalInstance = $modal.open({
animation: $scope.animationsEnabled,
templateUrl: '/static/partials/analysis/tokenfilter.html',
controller: 'TokenFilterModalCtrl',
resolve: {
name: function () {
return name;
},
value: function () {
return value;
},
mapping: function() {
return $scope.$parent.mapping;
}
}
});
modalInstance.result.then(function (result) {
// add this result to the mapping
for (var resultKey in result) {
if (name !== "" && resultKey != name) {
// remove the old name
delete $scope.$parent.mapping.analysis.token_filters[name];
}
$scope.$parent.mapping.analysis.token_filters[resultKey] = result[resultKey];
}
}, function () {
$log.info('Modal dismissed at: ' + new Date());
});
};
}

View File

@@ -1,110 +0,0 @@
// controller responsible for building a mapping
function MappingCtrl($scope, $http, $routeParams, $log, $sce, $location) {
newFieldSection = function() {
return {
"enabled": true,
"dynamic": true,
"default_analyzer": "",
"properties": {},
"fields": [
{
"type": "",
"index": true,
"store": true,
"include_in_all": true,
"include_term_vectors": true
}
]
};
};
$scope.$parent.mapping = {
"default_mapping": newFieldSection(),
"type_field": "_type",
"default_type": "_default",
"default_analyzer": "standard",
"default_datetime_parser": "dateTimeOptional",
"default_field": "_all",
"byte_array_converter": "json",
"analysis": {
"analyzers": {},
"token_maps": {},
"char_filters": {},
"tokenizers": {},
"token_filters": {}
}
};
$scope.analyzerNames = [];
$scope.loadAnalyzerNames = function() {
$http.post('/api/_analyzerNames',$scope.$parent.mapping).success(function(data) {
$scope.analyzerNames = data.analyzers;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
$scope.loadAnalyzerNames();
$scope.datetimeParserNames = [];
$scope.loadDatetimeParserNames = function() {
$http.post('/api/_datetimeParserNames',$scope.$parent.mapping).success(function(data) {
$scope.datetimeParserNames = data.datetime_parsers;
}).
error(function(data, code) {
$scope.errorMessage = data;
});
};
$scope.loadDatetimeParserNames();
$scope.mappingType = "default";
$scope.selectedItem = null;
$scope.selectedLabel = "";
$scope.fieldTypes = [
{
"name": "text",
"label": "Text",
"description": "a text field"
},
{
"name": "number",
"label": "Number",
"description": "a numerical value, indexed to facilitate range queries"
},
{
"name": "datetime",
"label": "Date/Time",
"description": "a date/time value, indexed to facilitate range queries"
},
{
"name": "disabled",
"label": "Disabled",
"description": "a section of JSON to be completely ignored"
}
];
$scope.clickItem = function(x, y) {
$scope.selectedItem = x;
$scope.selectedLabel = y;
};
$scope.clickItem($scope.$parent.mapping.default_mapping);
$scope.addField = function(scope) {
if (scope.newFieldName) {
$scope.selectedItem.properties[scope.newFieldName] = newFieldSection();
scope.newFieldName = "";
console.log($scope.selectedItem);
}
};
$scope.changeType = function(scope) {
};
}

View File

@@ -1,72 +0,0 @@
<div class="modal-header">
<h3 class="modal-title">Custom Analyzer</h3>
</div>
<div class="modal-body">
<div ng-show="errorMessage" class="alert alert-danger ng-cloak" role="alert"> {{errorMessage}}
</div>
<form class="form" role="form">
<div class="form-group">
<label for="aname">Name</label>
<input ng-model="name" type="text" class="form-control" id="tname" placeholder="Name">
</div>
<div class="form-group">
<label for="exampleInputPassword1">Character Filters</label>
<ul class="list-group" ng-show="analyzer.char_filters.length < 1">
<li class="list-group-item">None</li>
</ul>
<ul class="list-group" ng-show="analyzer.char_filters.length > 0" ui-sortable ng-model="analyzer.char_filters">
<li class="list-group-item" ng-repeat="analyzerCharFilter in analyzer.char_filters track by $index"><span class="glyphicon glyphicon-minus"></span> {{ analyzerCharFilter }}<span ng-click="removeCharFilter($index)" class="glyphicon glyphicon-remove pull-right"></span></li>
</ul>
</div>
<div class="form-group">
<label for="exampleInputPassword2"></label>
<div class="col-sm-10">
<select ng-change="addCharFilterChanged()" ng-model="addCharacterFilterName" class="form-control" id="addCharacterFilters">
<option ng-repeat="charFilter in charFilterNames">{{charFilter}}</option>
</select>
</div>
<div class="col-sm-2">
<button ng-click="addCharFilter(this)" type="button" class="btn btn-default pull-right">Add</button>
</div>
</div>
<div class="form-group">
<label for="analyzerTokenizer">Tokenizer</label>
<select ng-change="tokenizerChanged()" ng-model="analyzer.tokenizer" class="form-control" id="analyzerTokenizer">
<option ng-repeat="tokenizer in tokenizerNames">{{tokenizer}}</option>
</select>
</div>
<div class="form-group">
<label for="exampleInputPassword2">Token Filters</label>
<ul class="list-group" ng-show="analyzer.token_filters.length < 1">
<li class="list-group-item">None</li>
</ul>
<ul class="list-group" ng-show="analyzer.token_filters.length > 0" ui-sortable ng-model="analyzer.token_filters">
<li class="list-group-item" ng-repeat="analyzerTokenFilter in analyzer.token_filters"><span class="glyphicon glyphicon-minus"></span> {{ analyzerTokenFilter }}<span ng-click="removeTokenFilter($index)" class="glyphicon glyphicon-remove pull-right"></li>
</ul>
</div>
<div class="form-group">
<label for="exampleInputPassword2"></label>
<div class="col-sm-10">
<select ng-change="addTokenFilterChanged()" ng-model="addTokenFilterName" class="form-control" id="addTokenFilters">
<option ng-repeat="tokenFilter in tokenFilterNames">{{tokenFilter}}</option>
</select>
</div>
<div class="col-sm-2">
<button ng-click="addTokenFilter(this)" type="button" class="btn btn-default pull-right">Add</button>
</div>
</div>
</form>
</div>
<div class="modal-footer">
<button class="btn btn-default" ng-click="cancel()">Cancel</button>
<button ng-click="build()" type="button" class="btn btn-primary pull-right">Save</button>
</div>
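
The modal above assembles four pieces: a name, an ordered list of character filters, a single tokenizer, and an ordered list of token filters. For reference, a hedged sketch of registering the same shape programmatically through bleve's index mapping; the analyzer name and the specific filter names are illustrative, not values taken from the form:

package main

import "github.com/blevesearch/bleve"

// buildMapping registers a custom analyzer equivalent to what the modal
// produces: char filters, then a tokenizer, then token filters, in order.
// The list values use the JSON-decoded []interface{} form.
func buildMapping() (*bleve.IndexMapping, error) {
	im := bleve.NewIndexMapping()
	err := im.AddCustomAnalyzer("my_analyzer", map[string]interface{}{
		"type":          "custom",
		"char_filters":  []interface{}{"html"},
		"tokenizer":     "unicode",
		"token_filters": []interface{}{"to_lower", "stop_en"},
	})
	if err != nil {
		return nil, err
	}
	return im, nil
}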

View File

@ -1,34 +0,0 @@
<table class="table table-striped">
<thead>
<tr>
<th>Name</th>
<th></th>
</tr>
</thead>
<tbody>
<tr ng-repeat="(aname,aval) in mapping.analysis.analyzers">
<td>{{aname}}</td>
<td>
<div class="btn-group btn-group-xs" role="group">
<button ng-click="editAnalyzer(aname, aval)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-edit" aria-hidden="true"></span> Edit
</button>
<button ng-click="deleteAnalyzer(aname)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-trash" aria-hidden="true"></span> Delete
</button>
</div>
</td>
</tr>
<tr ng-show="Utils.keys(mapping.analysis.analyzers).length < 1">
<td colspan="2">None</td>
</tr>
</tbody>
<tfoot>
<tr>
<td colspan="2">
<button ng-click="newAnalyzer()" type="button" class="btn btn-sm btn-default pull-right">New Analyzer</button>
</td>
</tr>
</tfoot>
</table>

View File

@ -1,34 +0,0 @@
<div class="modal-header">
<h3 class="modal-title">Custom Char Filter</h3>
</div>
<div class="modal-body">
<div ng-show="errorMessage" class="alert alert-danger ng-cloak" role="alert"> {{errorMessage}}
</div>
<form class="form" role="form">
<div class="form-group">
<label for="tname">Name</label>
<input ng-model="name" type="text" class="form-control" id="tname" placeholder="Name">
</div>
<div class="form-group">
<label for="charfiltertype">Type</label>
<div class="col-sm-12 input-group">
<select ng-change="charFilterTypeChange()" ng-model="charfilter.type" class="form-control" id="charfiltertype">
<option ng-repeat="charFilterTyp in charFilterTypes">{{charFilterTyp}}</option>
</select>
</div>
</div>
<div ng-show="charfilter.type" ng-include src="formpath"/>
</form>
</div>
<div class="modal-footer">
<button class="btn btn-default" ng-click="cancel()">Cancel</button>
<button ng-click="build()" type="button" class="btn btn-primary pull-right">Save</button>
</div>

View File

@ -1,36 +0,0 @@
<table class="table table-striped">
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th></th>
</tr>
</thead>
<tbody>
<tr ng-repeat="(cfname,cfval) in mapping.analysis.char_filters">
<td>{{cfname}}</td>
<td>{{cfval.type}}</td>
<td>
<div class="btn-group btn-group-xs" role="group">
<button ng-click="editCharFilter(cfname, cfval)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-edit" aria-hidden="true"></span> Edit
</button>
<button ng-click="deleteCharFilter(cfname)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-trash" aria-hidden="true"></span> Delete
</button>
</div>
</td>
</tr>
<tr ng-show="Utils.keys(mapping.analysis.char_filters).length < 1">
<td colspan="3">None</td>
</tr>
</tbody>
<tfoot>
<tr>
<td colspan="3">
<button ng-click="newCharFilter()" type="button" class="btn btn-sm btn-default pull-right">New Character Filter</button>
</td>
</tr>
</tfoot>
</table>

View File

@ -1,9 +0,0 @@
<div class="form-group">
<label for="charfilterRegexp">Regular Expression</label>
<input ng-model="charfilter.regexp" type="text" class="form-control" id="charfilterRegexp" placeholder="">
</div>
<div class="form-group">
<label for="charfilterReplace">Replacement</label>
<input ng-model="charfilter.replace" type="text" class="form-control" id="charfilterReplace" placeholder="">
</div>

View File

@ -1,34 +0,0 @@
<div class="modal-header">
<h3 class="modal-title">Custom Token Filter</h3>
</div>
<div class="modal-body">
<div ng-show="errorMessage" class="alert alert-danger ng-cloak" role="alert"> {{errorMessage}}
</div>
<form class="form" role="form">
<div class="form-group">
<label for="tname">Name</label>
<input ng-model="name" type="text" class="form-control" id="tname" placeholder="Name">
</div>
<div class="form-group">
<label for="tokenfiltertype">Type</label>
<div class="col-sm-12 input-group">
<select ng-change="tokenFilterTypeChange()" ng-model="tokenfilter.type" class="form-control" id="tokenfiltertype">
<option ng-repeat="tokenFilterTyp in tokenFilterTypes">{{tokenFilterTyp}}</option>
</select>
</div>
</div>
<div ng-show="tokenfilter.type" ng-include src="formpath"/>
</form>
</div>
<div class="modal-footer">
<button class="btn btn-default" ng-click="cancel()">Cancel</button>
<button ng-click="build()" type="button" class="btn btn-primary pull-right">Save</button>
</div>

View File

@ -1,36 +0,0 @@
<table class="table table-striped">
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th></th>
</tr>
</thead>
<tbody>
<tr ng-repeat="(tfname,tfval) in mapping.analysis.token_filters">
<td>{{tfname}}</td>
<td>{{tfval.type}}</td>
<td>
<div class="btn-group btn-group-xs" role="group">
<button ng-click="editTokenFilter(tfname, tfval)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-edit" aria-hidden="true"></span> Edit
</button>
<button ng-click="deleteTokenFilter(tfname)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-trash" aria-hidden="true"></span> Delete
</button>
</div>
</td>
</tr>
<tr ng-show="Utils.keys(mapping.analysis.token_filters).length < 1">
<td colspan="3">None</td>
</tr>
</tbody>
<tfoot>
<tr>
<td colspan="3">
<button ng-click="newTokenFilter()" type="button" class="btn btn-sm btn-default pull-right">New Token Filter</button>
</td>
</tr>
</tfoot>
</table>

View File

@ -1,6 +0,0 @@
<div class="form-group">
<label for="tokenfilterTokenMaps">Sub Words</label>
<select ng-model="tokenfilter.dict_token_map" class="form-control" id="tokenfilterTokenMaps">
<option ng-repeat="tokenMap in tokenMapNames">{{tokenMap}}</option>
</select>
</div>

View File

@ -1,17 +0,0 @@
<div class="form-group">
<label for="tokenfilterEdge">Edge</label>
<select class="form-control" id="tokenfilterEdge" ng-model="tokenfilter.edge">
<option>front</option>
<option>back</option>
</select>
</div>
<div class="form-group">
<label for="tokenfilterMin">Min</label>
<input ng-model="tokenfilter.min" type="number" class="form-control" id="tokenfilterMin" placeholder="">
</div>
<div class="form-group">
<label for="tokenfilterMax">Max</label>
<input ng-model="tokenfilter.max" type="number" class="form-control" id="tokenfilterMax" placeholder="">
</div>

View File

@ -1,6 +0,0 @@
<div class="form-group">
<label for="tokenfilterTokenMaps">Articles</label>
<select ng-model="tokenfilter.articles_token_map" class="form-control" id="tokenfilterTokenMaps">
<option ng-repeat="tokenMap in tokenMapNames">{{tokenMap}}</option>
</select>
</div>

View File

@ -1,6 +0,0 @@
<div class="form-group">
<label for="tokenfilterTokenMaps">Keywords</label>
<select ng-model="tokenfilter.keywords_token_map" class="form-control" id="tokenfilterTokenMaps">
<option ng-repeat="tokenMap in tokenMapNames">{{tokenMap}}</option>
</select>
</div>

View File

@ -1,9 +0,0 @@
<div class="form-group">
<label for="tokenfilterMin">Min</label>
<input ng-model="tokenfilter.min" type="number" class="form-control" id="tokenfilterMin" placeholder="">
</div>
<div class="form-group">
<label for="tokenfilterMax">Max</label>
<input ng-model="tokenfilter.max" type="number" class="form-control" id="tokenfilterMax" placeholder="">
</div>

View File

@ -1,9 +0,0 @@
<div class="form-group">
<label for="tokenfilterMin">Min</label>
<input ng-model="tokenfilter.min" type="number" class="form-control" id="tokenfilterMin" placeholder="">
</div>
<div class="form-group">
<label for="tokenfilterMax">Max</label>
<input ng-model="tokenfilter.max" type="number" class="form-control" id="tokenfilterMax" placeholder="">
</div>

View File

@ -1,9 +0,0 @@
<div class="form-group">
<label for="tokenfilterNormalizeUnicode">Form</label>
<select class="form-control" id="tokenfilterNormalizeUnicode" ng-model="tokenfilter.form">
<option>nfc</option>
<option>nfd</option>
<option>nfkc</option>
<option>nfkd</option>
</select>
</div>

View File

@ -1,24 +0,0 @@
<div class="form-group">
<label for="tokenfilterMin">Min</label>
<input ng-model="tokenfilter.min" type="number" class="form-control" id="tokenfilterMin" placeholder="">
</div>
<div class="form-group">
<label for="tokenfilterMax">Max</label>
<input ng-model="tokenfilter.max" type="number" class="form-control" id="tokenfilterMax" placeholder="">
</div>
<div class="form-group">
<label for="tokenfilterInclude">Include Original Token</label>
<input ng-model="tokenfilter.output_original" type="checkbox" class="form-control" id="tokenfilterInclude">
</div>
<div class="form-group">
<label for="tokenfilterSep">Separator</label>
<input ng-model="tokenfilter.separator" type="text" class="form-control" id="tokenfilterSep" placeholder="">
</div>
<div class="form-group">
<label for="tokenfilterFiller">Filler</label>
<input ng-model="tokenfilter.filler" type="text" class="form-control" id="tokenfilterFiller" placeholder="">
</div>

View File

@ -1,6 +0,0 @@
<div class="form-group">
<label for="tokenfilterTokenMaps">Stop Words</label>
<select ng-model="tokenfilter.stop_token_map" class="form-control" id="tokenfilterTokenMaps">
<option ng-repeat="tokenMap in tokenMapNames">{{tokenMap}}</option>
</select>
</div>

View File

@ -1,4 +0,0 @@
<div class="form-group">
<label for="tokenfilterLen">Length</label>
<input ng-model="tokenfilter.length" type="number" class="form-control" id="tokenfilterLen" placeholder="">
</div>

View File

@ -1,3 +0,0 @@
<select ng-model="tokenfilter.word_map" class="form-control" id="tokenfilterTokenMaps">
<option ng-repeat="tokenMap in tokenMapNames">{{tokenMap}}</option>
</select>

View File

@ -1,34 +0,0 @@
<div class="modal-header">
<h3 class="modal-title">Custom Tokenizer</h3>
</div>
<div class="modal-body">
<div ng-show="errorMessage" class="alert alert-danger ng-cloak" role="alert"> {{errorMessage}}
</div>
<form class="form" role="form">
<div class="form-group">
<label for="tname">Name</label>
<input ng-model="name" type="text" class="form-control" id="tname" placeholder="Name">
</div>
<div class="form-group">
<label for="tokenizertype">Type</label>
<div class="col-sm-12 input-group">
<select ng-change="tokenizerTypeChange()" ng-model="tokenizer.type" class="form-control" id="tokenizertype">
<option ng-repeat="tokenizerTyp in tokenizerTypes">{{tokenizerTyp}}</option>
</select>
</div>
</div>
<div ng-show="tokenizer.type" ng-include src="formpath"/>
</form>
</div>
<div class="modal-footer">
<button class="btn btn-default" ng-click="cancel()">Cancel</button>
<button ng-click="build()" type="button" class="btn btn-primary pull-right">Save</button>
</div>

View File

@ -1,36 +0,0 @@
<table class="table table-striped">
<thead>
<tr>
<th>Name</th>
<th>Type</th>
<th></th>
</tr>
</thead>
<tbody>
<tr ng-repeat="(tname,tval) in mapping.analysis.tokenizers">
<td>{{tname}}</td>
<td>{{tval.type}}</td>
<td>
<div class="btn-group btn-group-xs" role="group">
<button ng-click="editTokenizer(tname, tval)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-edit" aria-hidden="true"></span> Edit
</button>
<button ng-click="deleteTokenizer(tname)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-trash" aria-hidden="true"></span> Delete
</button>
</div>
</td>
</tr>
<tr ng-show="Utils.keys(mapping.analysis.tokenizers).length < 1">
<td colspan="3">None</td>
</tr>
</tbody>
<tfoot>
<tr>
<td colspan="3">
<button ng-click="newTokenizer()" type="button" class="btn btn-sm btn-default pull-right">New Tokenizer</button>
</td>
</tr>
</tfoot>
</table>

View File

@ -1,24 +0,0 @@
<div class="form-group">
<label for="exampleInputPassword1">Exception Patterns</label>
<ul class="list-group" ng-show="tokenizer.exceptions.length < 1">
<li class="list-group-item">None</li>
</ul>
<ul class="list-group" ng-show="tokenizer.exceptions.length > 0" ui-sortable ng-model="tokenizer.exceptions">
<li class="list-group-item" ng-repeat="e in tokenizer.exceptions track by $index"><span class="glyphicon glyphicon-minus"></span> {{ e }}<span ng-click="removeException($index)" class="glyphicon glyphicon-remove pull-right"></span></li>
</ul>
</div>
<div class="form-group">
<label for="what"></label>
<div class="col-sm-10">
<input ng-model="newregexp" type="text" class="form-control" id="exceptionRegexp" placeholder="">
</div>
<div class="col-sm-2">
<button ng-click="addException(this)" type="button" class="btn btn-default pull-right">Add</button>
</div>
</div>
<div class="form-group">
<label for="analyzerTokenizer">Tokenizer for Remaining Input</label>
<select ng-change="tokenizerChanged()" ng-model="tokenizer.tokenizer" class="form-control" id="tokenizer">
<option ng-repeat="tokenizer in tokenizerNames">{{tokenizer}}</option>
</select>
</div>
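
This form drives the "exception" tokenizer: each regular expression carves matching spans out as single tokens, and everything in between is handed to the fallback tokenizer selected at the bottom. A hedged programmatic equivalent; the tokenizer name and the example pattern are ours, and the config keys are assumed to mirror the form's exceptions list and tokenizer fields:

package main

import "github.com/blevesearch/bleve"

// newExceptionMapping sketches registering an exception tokenizer that keeps
// URL-like spans whole and tokenizes the rest with the unicode tokenizer.
func newExceptionMapping() (*bleve.IndexMapping, error) {
	im := bleve.NewIndexMapping()
	err := im.AddCustomTokenizer("web_ish", map[string]interface{}{
		"type":       "exception",
		"exceptions": []interface{}{`[hH][tT][tT][pP][sS]?://\S+`},
		"tokenizer":  "unicode",
	})
	if err != nil {
		return nil, err
	}
	return im, nil
}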

View File

@ -1,4 +0,0 @@
<div class="form-group">
<label for="tokenizerRegexp">Regular Expression</label>
<input ng-model="tokenizer.regexp" type="text" class="form-control" id="tokenizerRegexp" placeholder="">
</div>

View File

@ -1,39 +0,0 @@
<div class="modal-header">
<h3 class="modal-title">Custom Word List</h3>
</div>
<div class="modal-body">
<div ng-show="errorMessage" class="alert alert-danger ng-cloak" role="alert"> {{errorMessage}}
</div>
<form class="form" role="form">
<div class="form-group">
<label for="tname">Name</label>
<input ng-model="name" type="text" class="form-control" id="tname" placeholder="Name">
</div>
<div class="form-group">
<label for="words">Words</label>
<select ng-model="selectedWords" multiple ng-multiple="true" id="words" size="5" class="form-control" ng-options="idx as word for (idx, word) in words">
</select>
</div>
<div class="form-group">
<label for="what"></label>
<div class="col-sm-8">
<input ng-model="newWord" type="text" class="form-control" id="newWord" placeholder="word">
</div>
<div class="col-sm-4">
<button ng-click="addWord()" type="button" class="btn btn-sm btn-default">Add</button>
<button ng-click="removeWord()" ng-disabled="selectedWords.length < 1" type="button" class="btn btn-sm btn-default pull-right">Remove</button>
</div>
</div>
</form>
</div>
<div class="modal-footer">
<button class="btn btn-default" ng-click="cancel()">Cancel</button>
<button ng-click="build()" type="button" class="btn btn-primary pull-right">Save</button>
</div>

View File

@ -1,34 +0,0 @@
<table class="table table-striped">
<thead>
<tr>
<th>Name</th>
<th></th>
</tr>
</thead>
<tbody>
<tr ng-repeat="(tmname,tmval) in mapping.analysis.token_maps">
<td>{{tmname}}</td>
<td>
<div class="btn-group btn-group-xs" role="group">
<button ng-click="editWordList(tmname, tmval)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-edit" aria-hidden="true"></span> Edit
</button>
<button ng-click="deleteWordList(tmname)" type="button" class="btn btn-default btn-xs">
<span class="glyphicon glyphicon-trash" aria-hidden="true"></span> Delete
</button>
</div>
</td>
</tr>
<tr ng-show="Utils.keys(mapping.analysis.token_maps).length < 1">
<td colspan="2">None</td>
</tr>
</tbody>
<tfoot>
<tr>
<td colspan="2">
<button ng-click="newWordList()" type="button" class="btn btn-sm btn-default pull-right">New Word List</button>
</td>
</tr>
</tfoot>
</table>

View File

@ -1,119 +0,0 @@
<div class="row">
<div class="col-md-6">
<div class="panel panel-default">
<div class="panel-heading">
<h3 class="panel-title"><span class="glyphicon glyphicon-file" aria-hidden="true"></span> Document Structure</h3>
</div>
<div class="panel-body">
<ul class="list-custom">
<span class="list-item" ng-click="clickItem(mapping.default_mapping, '<document root>')" ng-class="{selected: mapping.default_mapping==selectedItem}">
&bull; &lt;document root&gt;
</span>
<ul class="list-custom">
<li ng-repeat="(pname,pval) in mapping.default_mapping.properties" ng-include="'/static/partials/mapping/mapping-node.html'" ng-init="parent = pname"></li>
</ul>
</ul>
<div class="form-group form-group-sm">
<div class="col-sm-10">
<input ng-model="newFieldName" type="text" class="form-control" id="fieldName" placeholder="field name">
</div>
<button ng-click="addField(this)" type="button" class="btn btn-sm btn-default">Add</button>
</div>
</div>
</div>
</div>
<div class="col-md-6">
<div class="panel panel-default">
<div class="panel-heading">
<h3 class="panel-title"><span class="glyphicon glyphicon-list" aria-hidden="true"></span> Indexing Behavior <small>{{selectedLabel}}</small></h3>
</div>
<div class="panel-body">
<div ng-show="selectedItem == null">Select an item in the document structure.</div>
<div ng-hide="selectedItem == null">
<div class="form-group form-group-sm">
<div class="col-sm-10">
<label>Type
<select ng-change="changeType(this)" ng-model="selectedItem.fields[0].type" ng-options="t.name as t.label for t in fieldTypes">
<option value="">Object</option>
</select>
</label>
</div>
</div>
<div ng-switch="selectedItem.fields[0].type">
<div ng-switch-when="text">
<div class="form-group form-group-sm">
<div class="col-sm-10">
<label>Analyzer
<select ng-change="changeType(this)" ng-model="selectedItem.fields[0].analyzer" ng-options="t as t for t in analyzerNames">
<option value="">Inherit</option>
</select>
</label>
</div>
</div>
</div>
<div ng-switch-when="datetime">
<div class="form-group form-group-sm">
<div class="col-sm-10">
<label>Date/Time Parser
<select ng-change="changeType(this)" ng-model="selectedItem.fields[0].date_format" ng-options="t as t for t in datetimeParserNames">
<option value="">Inherit</option>
</select>
</label>
</div>
</div>
</div>
</div>
</div>
<div ng-if="selectedItem.fields[0].type != '' && selectedItem.fields[0].type != 'disabled'">
<div class="checkbox">
<label>
<input ng-model="selectedItem.fields[0].index" type="checkbox"> Index
</label>
</div>
<div class="checkbox">
<label>
<input ng-model="selectedItem.fields[0].store" type="checkbox"> Store
</label>
</div>
<div class="checkbox">
<label>
<input ng-model="selectedItem.fields[0].include_in_all" type="checkbox"> Include in 'All' Field
</label>
</div>
<div ng-if="selectedItem.fields[0].type == 'text'">
<div class="checkbox">
<label>
<input ng-model="selectedItem.fields[0].include_term_vectors" type="checkbox"> Include Term Vectors
</label>
</div>
</div>
</div>
</div>
</div>
</div>
</div>

View File

@ -1,6 +0,0 @@
<span class="list-item"ng-click="clickItem(pval,pname)" ng-class="{selected: pval==selectedItem}">&bull; {{pname}}</span>
<ul class="list-custom">
<li ng-repeat="(pname,pval) in pval.properties" ng-init="parent = parent + '.' + pname">
<span class="list-item" ng-click="clickItem(pval,parent)" ng-class="{selected: pval==selectedItem}">&bull; {{pname}}</span>
</li>
</ul>

View File

@ -1,59 +0,0 @@
<div ng-show="errorMessage" class="alert alert-danger ng-cloak" role="alert"> {{errorMessage}}
</div>
<form class="form-horizontal" role="form">
<div class="form-group">
<label for="inputDoc" class="col-sm-2 control-label">Index Mapping</label>
<div class="col-sm-10">
<div class="radio">
<label>
<input ng-model="mappingType" type="radio" name="mappingType" value="default" checked>
Default
</label>
</div>
<div class="radio">
<label>
<input ng-model="mappingType" type="radio" name="mappingType" value="custom">
Custom
</label>
</div>
</div>
</div>
<div class="form-group" ng-show="mappingType == 'custom'">
<label for="inputDoc" class="col-sm-2 control-label">&nbsp;</label>
<div class="col-sm-10">
<div ng-include src="'/static/partials/mapping/mapping-custom.html'"/>
</div>
</div>
<div class="form-group" ng-controller="AnalysisCtrl" ng-show="mappingType == 'custom'">
<label for="inputDoc" class="col-sm-2 control-label">Custom Analysis</label>
<div class="col-sm-10">
<tabset>
<tab heading="Analyzers">
<div ng-include src="'/static/partials/analysis/analyzers.html'"/>
</tab>
<tab heading="Character Filters">
<div ng-include src="'/static/partials/analysis/charfilters.html'"/>
</tab>
<tab heading="Tokenizers">
<div ng-include src="'/static/partials/analysis/tokenizers.html'"/>
</tab>
<tab heading="Token Filters">
<div ng-include src="'/static/partials/analysis/tokenfilters.html'"/>
</tab>
<tab heading="Word Lists">
<div ng-include src="'/static/partials/analysis/wordlists.html'"/>
</tab>
</tabset>
</div>
</div>
</form>

View File

@ -18,7 +18,6 @@ import (
var indexNameMapping map[string]bleve.Index
var indexNameMappingLock sync.RWMutex
var indexStats = bleve.IndexStats{}
func RegisterIndexName(name string, idx bleve.Index) {
indexNameMappingLock.Lock()
@ -28,7 +27,6 @@ func RegisterIndexName(name string, idx bleve.Index) {
indexNameMapping = make(map[string]bleve.Index)
}
indexNameMapping[name] = idx
indexStats[name] = idx.Stats()
}
func UnregisterIndexByName(name string) bleve.Index {
@ -42,7 +40,6 @@ func UnregisterIndexByName(name string) bleve.Index {
if rv != nil {
delete(indexNameMapping, name)
}
delete(indexStats, name)
return rv
}
@ -66,10 +63,6 @@ func IndexNames() []string {
return rv
}
func IndexStats() bleve.IndexStats {
return indexStats
}
func UpdateAlias(alias string, add, remove []string) error {
indexNameMappingLock.Lock()
defer indexNameMappingLock.Unlock()

View File

@ -71,7 +71,7 @@ func (b *Batch) Size() int {
return len(b.internal.IndexOps) + len(b.internal.InternalOps)
}
// String prints a user friendly string represenation of what
// String prints a user friendly string representation of what
// is inside this batch.
func (b *Batch) String() string {
return b.internal.String()
@ -174,8 +174,22 @@ type Index interface {
FieldDictRange(field string, startTerm []byte, endTerm []byte) (index.FieldDict, error)
FieldDictPrefix(field string, termPrefix []byte) (index.FieldDict, error)
// DumpAll returns a channel receiving all index rows as
// UpsideDownCouchRow, in lexicographic byte order. If the enumeration
// fails, an error is sent. The channel is closed once the enumeration
// completes or an error is encountered. The caller must consume all
// channel entries until the channel is closed to ensure the transaction
// and other resources associated with the enumeration are released.
//
// DumpAll exists for debugging and tooling purpose and may change in the
// future.
DumpAll() chan interface{}
// DumpDoc works like DumpAll but returns only StoredRows and
// TermFrequencyRows related to a document.
DumpDoc(id string) chan interface{}
// DumpFields works like DumpAll but returns only FieldRows.
DumpFields() chan interface{}
Close() error
@ -188,6 +202,11 @@ type Index interface {
SetInternal(key, val []byte) error
DeleteInternal(key []byte) error
// Name returns the name of the index (by default this is the path)
Name() string
// SetName lets you assign your own logical name to this index
SetName(string)
// Advanced returns the indexer and data store, exposing lower level
// methods to enumerate records and access data.
Advanced() (index.Index, store.KVStore, error)
@ -211,7 +230,7 @@ func New(path string, mapping *IndexMapping) (Index, error) {
// The provided mapping will be used for all
// Index/Search operations.
// The specified index type will be used
// The specified kvstore implemenation will be used
// The specified kvstore implementation will be used
// and the provided kvconfig will be passed to its
// constructor.
func NewUsing(path string, mapping *IndexMapping, indexType string, kvstore string, kvconfig map[string]interface{}) (Index, error) {
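
The DumpAll family documented above pushes rows over a channel and requires the caller to drain it fully so the underlying transaction is released. A minimal sketch of a debug dump that honors that contract; it assumes an already-open bleve.Index and that a failed enumeration surfaces as an error value on the same channel, per the comment:

import (
	"fmt"
	"log"

	"github.com/blevesearch/bleve"
)

// debugDump prints every index row, continuing to drain the channel even
// after an error so the enumeration's resources are released.
func debugDump(idx bleve.Index) {
	for row := range idx.DumpAll() {
		if err, ok := row.(error); ok {
			log.Printf("dump failed: %v", err)
			continue
		}
		fmt.Printf("%v\n", row)
	}
}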

View File

@ -50,6 +50,11 @@ func (f *FieldCache) FieldNamed(field string, createIfMissing bool) (uint16, boo
// trade read lock for write lock
f.mutex.RUnlock()
f.mutex.Lock()
// need to check again with write lock
if index, ok := f.fieldIndexes[field]; ok {
f.mutex.Unlock()
return index, true
}
// assign next field id
index := uint16(f.lastFieldIndex + 1)
f.fieldIndexes[field] = index
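
The re-check under the write lock added above is the essential step: between RUnlock and Lock another goroutine can register the same field, and skipping the second lookup would hand out two ids for one name. A stand-alone sketch of the same read-lock/write-lock upgrade pattern:

package main

import "sync"

type fieldIDs struct {
	mu   sync.RWMutex
	ids  map[string]uint16
	last uint16
}

// idFor returns a stable id for name, assigning the next id on first use.
func (f *fieldIDs) idFor(name string) uint16 {
	f.mu.RLock()
	if id, ok := f.ids[name]; ok {
		f.mu.RUnlock()
		return id
	}
	// trade the read lock for a write lock
	f.mu.RUnlock()
	f.mu.Lock()
	defer f.mu.Unlock()
	// re-check: another goroutine may have inserted name in the gap
	if id, ok := f.ids[name]; ok {
		return id
	}
	f.last++
	f.ids[name] = f.last
	return f.last
}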

169
index/firestorm/analysis.go Normal file
View File

@ -0,0 +1,169 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"math"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
)
func (f *Firestorm) Analyze(d *document.Document) *index.AnalysisResult {
rv := &index.AnalysisResult{
DocID: d.ID,
Rows: make([]index.IndexRow, 0, 100),
}
docIDBytes := []byte(d.ID)
// add the _id row
rv.Rows = append(rv.Rows, NewTermFreqRow(0, nil, docIDBytes, d.Number, 0, 0, nil))
// information we collate as we merge fields with same name
fieldTermFreqs := make(map[uint16]analysis.TokenFrequencies)
fieldLengths := make(map[uint16]int)
fieldIncludeTermVectors := make(map[uint16]bool)
fieldNames := make(map[uint16]string)
analyzeField := func(field document.Field, storable bool) {
fieldIndex, newFieldRow := f.fieldIndexOrNewRow(field.Name())
if newFieldRow != nil {
rv.Rows = append(rv.Rows, newFieldRow)
}
fieldNames[fieldIndex] = field.Name()
if field.Options().IsIndexed() {
fieldLength, tokenFreqs := field.Analyze()
existingFreqs := fieldTermFreqs[fieldIndex]
if existingFreqs == nil {
fieldTermFreqs[fieldIndex] = tokenFreqs
} else {
existingFreqs.MergeAll(field.Name(), tokenFreqs)
fieldTermFreqs[fieldIndex] = existingFreqs
}
fieldLengths[fieldIndex] += fieldLength
fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
}
if storable && field.Options().IsStored() {
storeRow := f.storeField(docIDBytes, d.Number, field, fieldIndex)
rv.Rows = append(rv.Rows, storeRow)
}
}
for _, field := range d.Fields {
analyzeField(field, true)
}
if len(d.CompositeFields) > 0 {
for fieldIndex, tokenFreqs := range fieldTermFreqs {
// see if any of the composite fields need this
for _, compositeField := range d.CompositeFields {
compositeField.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
}
}
for _, compositeField := range d.CompositeFields {
analyzeField(compositeField, false)
}
}
rowsCapNeeded := len(rv.Rows)
for _, tokenFreqs := range fieldTermFreqs {
rowsCapNeeded += len(tokenFreqs)
}
rows := make([]index.IndexRow, 0, rowsCapNeeded)
rv.Rows = append(rows, rv.Rows...)
// walk through the collated information and process
// once for each indexed field (unique name)
for fieldIndex, tokenFreqs := range fieldTermFreqs {
fieldLength := fieldLengths[fieldIndex]
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
rv.Rows = f.indexField(docIDBytes, d.Number, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows)
}
return rv
}
func (f *Firestorm) indexField(docID []byte, docNum uint64, includeTermVectors bool, fieldIndex uint16, fieldLength int, tokenFreqs analysis.TokenFrequencies, rows []index.IndexRow) []index.IndexRow {
tfrs := make([]TermFreqRow, len(tokenFreqs))
fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength)))
if !includeTermVectors {
i := 0
for _, tf := range tokenFreqs {
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, nil))
i++
}
return rows
}
i := 0
for _, tf := range tokenFreqs {
var tv []*TermVector
tv, rows = f.termVectorsFromTokenFreq(fieldIndex, tf, rows)
rows = append(rows, InitTermFreqRow(&tfrs[i], fieldIndex, tf.Term, docID, docNum, uint64(tf.Frequency()), fieldNorm, tv))
i++
}
return rows
}
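// Worked instance of the fieldNorm computation above, assuming a 4-token
// field (the length is illustrative, not taken from the code):
//
//	fieldLength := 4
//	fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength))) // 1/√4 = 0.5
//
// Terms in longer fields therefore carry a proportionally smaller norm on
// each of their term frequency rows.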
func (f *Firestorm) termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq, rows []index.IndexRow) ([]*TermVector, []index.IndexRow) {
rv := make([]*TermVector, len(tf.Locations))
for i, l := range tf.Locations {
var newFieldRow *FieldRow
fieldIndex := field
if l.Field != "" {
// lookup correct field
fieldIndex, newFieldRow = f.fieldIndexOrNewRow(l.Field)
if newFieldRow != nil {
rows = append(rows, newFieldRow)
}
}
tv := NewTermVector(fieldIndex, uint64(l.Position), uint64(l.Start), uint64(l.End), l.ArrayPositions)
rv[i] = tv
}
return rv, rows
}
func (f *Firestorm) storeField(docID []byte, docNum uint64, field document.Field, fieldIndex uint16) index.IndexRow {
fieldValue := make([]byte, 1+len(field.Value()))
fieldValue[0] = encodeFieldType(field)
copy(fieldValue[1:], field.Value())
storedRow := NewStoredRow(docID, docNum, fieldIndex, field.ArrayPositions(), fieldValue)
return storedRow
}
func encodeFieldType(f document.Field) byte {
fieldType := byte('x')
switch f.(type) {
case *document.TextField:
fieldType = 't'
case *document.NumericField:
fieldType = 'n'
case *document.DateTimeField:
fieldType = 'd'
case *document.BooleanField:
fieldType = 'b'
case *document.CompositeField:
fieldType = 'c'
}
return fieldType
}
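
storeField above prepends the encodeFieldType byte to the stored value, so a reader can recover the field kind from the first byte and the raw value from the rest. A hedged reader-side sketch; the helper name is ours, not part of the package:

// decodeFieldType is a hypothetical inverse of encodeFieldType: it splits
// a stored value into its type marker and payload.
func decodeFieldType(value []byte) (fieldType byte, payload []byte) {
	if len(value) == 0 {
		return 'x', nil // 'x' mirrors encodeFieldType's unknown marker
	}
	return value[0], value[1:]
}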

View File

@ -0,0 +1,192 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
"github.com/blevesearch/bleve/index/store/null"
"github.com/blevesearch/bleve/registry"
)
func TestAnalysis(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err := kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
// warmup to load field cache and set maxRead correctly
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
tests := []struct {
d *document.Document
r *index.AnalysisResult
}{
{
d: document.NewDocument("a").
AddField(
document.NewTextFieldWithIndexingOptions("name", nil, []byte("test"), document.IndexField|document.StoreField|document.IncludeTermVectors)),
r: &index.AnalysisResult{
DocID: "a",
Rows: []index.IndexRow{
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewFieldRow(1, "name"),
NewStoredRow([]byte("a"), 1, 1, nil, []byte("ttest")),
NewTermFreqRow(1, []byte("test"), []byte("a"), 1, 1, 1.0, []*TermVector{NewTermVector(1, 1, 0, 4, nil)}),
},
},
},
}
for _, test := range tests {
test.d.Number = 1
actual := f.Analyze(test.d)
if !reflect.DeepEqual(actual, test.r) {
t.Errorf("expected: %v got %v", test.r, actual)
}
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
}
func TestAnalysisBug328(t *testing.T) {
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
if err != nil {
t.Fatal(err)
}
analysisQueue := index.NewAnalysisQueue(1)
idx, err := NewFirestorm(gtreap.Name, nil, analysisQueue)
if err != nil {
t.Fatal(err)
}
d := document.NewDocument("1")
f := document.NewTextFieldCustom("title", nil, []byte("bleve"), document.IndexField|document.IncludeTermVectors, analyzer)
d.AddField(f)
f = document.NewTextFieldCustom("body", nil, []byte("bleve"), document.IndexField|document.IncludeTermVectors, analyzer)
d.AddField(f)
cf := document.NewCompositeFieldWithIndexingOptions("_all", true, []string{}, []string{}, document.IndexField|document.IncludeTermVectors)
d.AddField(cf)
rv := idx.Analyze(d)
fieldIndexes := make(map[uint16]string)
for _, row := range rv.Rows {
if row, ok := row.(*FieldRow); ok {
fieldIndexes[row.index] = row.Name()
}
if row, ok := row.(*TermFreqRow); ok && string(row.term) == "bleve" {
for _, vec := range row.Vectors() {
if vec.GetField() != uint32(row.field) {
if fieldIndexes[row.field] != "_all" {
t.Errorf("row named %s field %d - vector field %d", fieldIndexes[row.field], row.field, vec.GetField())
}
}
}
}
}
}
func BenchmarkAnalyze(b *testing.B) {
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed(standard_analyzer.Name)
if err != nil {
b.Fatal(err)
}
analysisQueue := index.NewAnalysisQueue(1)
idx, err := NewFirestorm(null.Name, nil, analysisQueue)
if err != nil {
b.Fatal(err)
}
d := document.NewDocument("1")
f := document.NewTextFieldWithAnalyzer("desc", nil, bleveWikiArticle1K, analyzer)
d.AddField(f)
b.ResetTimer()
for i := 0; i < b.N; i++ {
rv := idx.Analyze(d)
if len(rv.Rows) < 92 || len(rv.Rows) > 93 {
b.Fatalf("expected 512-13 rows, got %d", len(rv.Rows))
}
}
}
var bleveWikiArticle1K = []byte(`Boiling liquid expanding vapor explosion
From Wikipedia, the free encyclopedia
See also: Boiler explosion and Steam explosion
Flames subsequent to a flammable liquid BLEVE from a tanker. BLEVEs do not necessarily involve fire.
This article's tone or style may not reflect the encyclopedic tone used on Wikipedia. See Wikipedia's guide to writing better articles for suggestions. (July 2013)
A boiling liquid expanding vapor explosion (BLEVE, /ˈblɛviː/ blev-ee) is an explosion caused by the rupture of a vessel containing a pressurized liquid above its boiling point.[1]
Contents [hide]
1 Mechanism
1.1 Water example
1.2 BLEVEs without chemical reactions
2 Fires
3 Incidents
4 Safety measures
5 See also
6 References
7 External links
Mechanism[edit]
This section needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. (July 2013)
There are three characteristics of liquids which are relevant to the discussion of a BLEVE:`)

View File

@ -0,0 +1,70 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index/store/boltdb"
)
var boltTestConfig = map[string]interface{}{
"path": "test",
}
func BenchmarkBoltDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, boltdb.Name, boltTestConfig, DestroyTest, 1)
}
func BenchmarkBoltDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, boltdb.Name, boltTestConfig, DestroyTest, 2)
}
func BenchmarkBoltDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, boltdb.Name, boltTestConfig, DestroyTest, 4)
}
// batches
func BenchmarkBoltDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 1, 10)
}
func BenchmarkBoltDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 2, 10)
}
func BenchmarkBoltDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 4, 10)
}
func BenchmarkBoltDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 1, 100)
}
func BenchmarkBoltDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 2, 100)
}
func BenchmarkBoltDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 4, 100)
}
func BenchmarkBoltBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 1, 1000)
}
func BenchmarkBoltBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 2, 1000)
}
func BenchmarkBoltBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, boltdb.Name, boltTestConfig, DestroyTest, 4, 1000)
}

View File

@ -0,0 +1,144 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"os"
"strconv"
"testing"
_ "github.com/blevesearch/bleve/analysis/analyzers/standard_analyzer"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/registry"
)
var benchmarkDocBodies = []string{
"A boiling liquid expanding vapor explosion (BLEVE, /ˈblɛviː/ blev-ee) is an explosion caused by the rupture of a vessel containing a pressurized liquid above its boiling point.",
"A boiler explosion is a catastrophic failure of a boiler. As seen today, boiler explosions are of two kinds. One kind is a failure of the pressure parts of the steam and water sides. There can be many different causes, such as failure of the safety valve, corrosion of critical parts of the boiler, or low water level. Corrosion along the edges of lap joints was a common cause of early boiler explosions.",
"A boiler is a closed vessel in which water or other fluid is heated. The fluid does not necessarily boil. (In North America the term \"furnace\" is normally used if the purpose is not actually to boil the fluid.) The heated or vaporized fluid exits the boiler for use in various processes or heating applications,[1][2] including central heating, boiler-based power generation, cooking, and sanitation.",
"A pressure vessel is a closed container designed to hold gases or liquids at a pressure substantially different from the ambient pressure.",
"Pressure (symbol: p or P) is the ratio of force to the area over which that force is distributed.",
"Liquid is one of the four fundamental states of matter (the others being solid, gas, and plasma), and is the only state with a definite volume but no fixed shape.",
"The boiling point of a substance is the temperature at which the vapor pressure of the liquid equals the pressure surrounding the liquid[1][2] and the liquid changes into a vapor.",
"Vapor pressure or equilibrium vapor pressure is defined as the pressure exerted by a vapor in thermodynamic equilibrium with its condensed phases (solid or liquid) at a given temperature in a closed system.",
"Industrial gases are a group of gases that are specifically manufactured for use in a wide range of industries, which include oil and gas, petrochemicals, chemicals, power, mining, steelmaking, metals, environmental protection, medicine, pharmaceuticals, biotechnology, food, water, fertilizers, nuclear power, electronics and aerospace.",
"The expansion ratio of a liquefied and cryogenic substance is the volume of a given amount of that substance in liquid form compared to the volume of the same amount of substance in gaseous form, at room temperature and normal atmospheric pressure.",
}
type KVStoreDestroy func() error
func DestroyTest() error {
return os.RemoveAll("test")
}
func CommonBenchmarkIndex(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers int) {
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed("standard")
if err != nil {
b.Fatal(err)
}
indexDocument := document.NewDocument("").
AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[0]), analyzer))
b.ResetTimer()
b.StopTimer()
for i := 0; i < b.N; i++ {
analysisQueue := index.NewAnalysisQueue(analysisWorkers)
idx, err := NewFirestorm(storeName, storeConfig, analysisQueue)
if err != nil {
b.Fatal(err)
}
err = idx.Open()
if err != nil {
b.Fatal(err)
}
indexDocument.ID = strconv.Itoa(i)
// just time the indexing portion
b.StartTimer()
err = idx.Update(indexDocument)
if err != nil {
b.Fatal(err)
}
b.StopTimer()
err = idx.Close()
if err != nil {
b.Fatal(err)
}
err = destroy()
if err != nil {
b.Fatal(err)
}
analysisQueue.Close()
}
}
func CommonBenchmarkIndexBatch(b *testing.B, storeName string, storeConfig map[string]interface{}, destroy KVStoreDestroy, analysisWorkers, batchSize int) {
cache := registry.NewCache()
analyzer, err := cache.AnalyzerNamed("standard")
if err != nil {
b.Fatal(err)
}
b.ResetTimer()
b.StopTimer()
for i := 0; i < b.N; i++ {
analysisQueue := index.NewAnalysisQueue(analysisWorkers)
idx, err := NewFirestorm(storeName, storeConfig, analysisQueue)
if err != nil {
b.Fatal(err)
}
err = idx.Open()
if err != nil {
b.Fatal(err)
}
b.StartTimer()
batch := index.NewBatch()
for j := 0; j < 1000; j++ {
if j%batchSize == 0 {
if len(batch.IndexOps) > 0 {
err := idx.Batch(batch)
if err != nil {
b.Fatal(err)
}
}
batch = index.NewBatch()
}
indexDocument := document.NewDocument("").
AddField(document.NewTextFieldWithAnalyzer("body", []uint64{}, []byte(benchmarkDocBodies[j%10]), analyzer))
indexDocument.ID = strconv.Itoa(i) + "-" + strconv.Itoa(j)
batch.Update(indexDocument)
}
// close last batch
if len(batch.IndexOps) > 0 {
err := idx.Batch(batch)
if err != nil {
b.Fatal(err)
}
}
b.StopTimer()
err = idx.Close()
if err != nil {
b.Fatal(err)
}
err = destroy()
if err != nil {
b.Fatal(err)
}
analysisQueue.Close()
}
}

View File

@ -0,0 +1,70 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/blevex/cznicb"
)
func DestroyCznicB() error {
return nil
}
func BenchmarkCznicBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, cznicb.Name, nil, DestroyCznicB, 1)
}
func BenchmarkCznicBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, cznicb.Name, nil, DestroyCznicB, 2)
}
func BenchmarkCznicBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, cznicb.Name, nil, DestroyCznicB, 4)
}
// batches
func BenchmarkCznicBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 1, 10)
}
func BenchmarkCznicBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 2, 10)
}
func BenchmarkCznicBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 4, 10)
}
func BenchmarkCznicBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 1, 100)
}
func BenchmarkCznicBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 2, 100)
}
func BenchmarkCznicBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 4, 100)
}
func BenchmarkCznicBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 1, 1000)
}
func BenchmarkCznicBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 2, 1000)
}
func BenchmarkCznicBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, cznicb.Name, nil, DestroyCznicB, 4, 1000)
}

View File

@ -0,0 +1,86 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build forestdb
package firestorm
import (
"os"
"testing"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/index/store/forestdb"
)
func CreateForestDB() (store.KVStore, error) {
err := os.MkdirAll("testdir", 0700)
if err != nil {
return nil, err
}
s, err := forestdb.New("testdir/test", true, nil)
if err != nil {
return nil, err
}
return s, nil
}
func DestroyForestDB() error {
return os.RemoveAll("testdir")
}
func BenchmarkForestDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateForestDB, DestroyForestDB, 1)
}
func BenchmarkForestDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateForestDB, DestroyForestDB, 2)
}
func BenchmarkForestDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateForestDB, DestroyForestDB, 4)
}
// batches
func BenchmarkForestDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 1, 10)
}
func BenchmarkForestDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 2, 10)
}
func BenchmarkForestDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 4, 10)
}
func BenchmarkForestDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 1, 100)
}
func BenchmarkForestDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 2, 100)
}
func BenchmarkForestDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 4, 100)
}
func BenchmarkForestDBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 1, 1000)
}
func BenchmarkForestDBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 2, 1000)
}
func BenchmarkForestDBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateForestDB, DestroyForestDB, 4, 1000)
}

View File

@ -0,0 +1,71 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index/store/goleveldb"
)
var goLevelDBTestOptions = map[string]interface{}{
"create_if_missing": true,
"path": "test",
}
func BenchmarkGoLevelDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1)
}
func BenchmarkGoLevelDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2)
}
func BenchmarkGoLevelDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4)
}
// batches
func BenchmarkGoLevelDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1, 10)
}
func BenchmarkGoLevelDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2, 10)
}
func BenchmarkGoLevelDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4, 10)
}
func BenchmarkGoLevelDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1, 100)
}
func BenchmarkGoLevelDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2, 100)
}
func BenchmarkGoLevelDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4, 100)
}
func BenchmarkGoLevelDBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 1, 1000)
}
func BenchmarkGoLevelDBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 2, 1000)
}
func BenchmarkGoLevelDBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, goleveldb.Name, goLevelDBTestOptions, DestroyTest, 4, 1000)
}

View File

@ -0,0 +1,81 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build rocksdb
package firestorm
import (
"os"
"testing"
"github.com/blevesearch/bleve/index/store"
)
var rocksdbTestOptions = map[string]interface{}{
"create_if_missing": true,
}
func CreateGoRocksDB() (store.KVStore, error) {
return rocksdb.New("test", rocksdbTestOptions)
}
func DestroyGoRocksDB() error {
return os.RemoveAll("test")
}
func BenchmarkRocksDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateGoRocksDB, DestroyGoRocksDB, 1)
}
func BenchmarkRocksDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateGoRocksDB, DestroyGoRocksDB, 2)
}
func BenchmarkRocksDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, CreateGoRocksDB, DestroyGoRocksDB, 4)
}
// batches
func BenchmarkRocksDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 1, 10)
}
func BenchmarkRocksDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 2, 10)
}
func BenchmarkRocksDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 4, 10)
}
func BenchmarkRocksDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 1, 100)
}
func BenchmarkRocksDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 2, 100)
}
func BenchmarkRocksDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 4, 100)
}
func BenchmarkRocksDBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 1, 1000)
}
func BenchmarkRocksDBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 2, 1000)
}
func BenchmarkRocksDBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, CreateGoRocksDB, DestroyGoRocksDB, 4, 1000)
}

View File

@ -0,0 +1,70 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func DestroyGTreap() error {
return nil
}
func BenchmarkGTreapIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, gtreap.Name, nil, DestroyGTreap, 1)
}
func BenchmarkGTreapIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, gtreap.Name, nil, DestroyGTreap, 2)
}
func BenchmarkGTreapIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, gtreap.Name, nil, DestroyGTreap, 4)
}
// batches
func BenchmarkGTreapIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 1, 10)
}
func BenchmarkGTreapIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 2, 10)
}
func BenchmarkGTreapIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 4, 10)
}
func BenchmarkGTreapIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 1, 100)
}
func BenchmarkGTreapIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 2, 100)
}
func BenchmarkGTreapIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 4, 100)
}
func BenchmarkGTreapIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 1, 1000)
}
func BenchmarkGTreapIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 2, 1000)
}
func BenchmarkGTreapIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, gtreap.Name, nil, DestroyGTreap, 4, 1000)
}

View File

@ -0,0 +1,82 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
// +build leveldb full
package firestorm
import (
"os"
"testing"
"github.com/blevesearch/bleve/index/store/leveldb"
)
var leveldbTestOptions = map[string]interface{}{
"path": "test",
"create_if_missing": true,
}
func DestroyLevelDB() error {
return os.RemoveAll("test")
}
func BenchmarkLevelDBIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 1)
}
func BenchmarkLevelDBIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 2)
}
func BenchmarkLevelDBIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 4)
}
// batches
func BenchmarkLevelDBIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 1, 10)
}
func BenchmarkLevelDBIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 2, 10)
}
func BenchmarkLevelDBIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 4, 10)
}
func BenchmarkLevelDBIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 1, 100)
}
func BenchmarkLevelDBIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 2, 100)
}
func BenchmarkLevelDBIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 4, 100)
}
func BenchmarkLevelDBIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 1, 1000)
}
func BenchmarkLevelDBIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 2, 1000)
}
func BenchmarkLevelDBIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, leveldb.Name, leveldbTestOptions, DestroyLevelDB, 4, 1000)
}


@ -0,0 +1,70 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index/store/null"
)
func DestroyNull() error {
return nil
}
func BenchmarkNullIndexing1Workers(b *testing.B) {
CommonBenchmarkIndex(b, null.Name, nil, DestroyNull, 1)
}
func BenchmarkNullIndexing2Workers(b *testing.B) {
CommonBenchmarkIndex(b, null.Name, nil, DestroyNull, 2)
}
func BenchmarkNullIndexing4Workers(b *testing.B) {
CommonBenchmarkIndex(b, null.Name, nil, DestroyNull, 4)
}
// batches
func BenchmarkNullIndexing1Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 1, 10)
}
func BenchmarkNullIndexing2Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 2, 10)
}
func BenchmarkNullIndexing4Workers10Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 4, 10)
}
func BenchmarkNullIndexing1Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 1, 100)
}
func BenchmarkNullIndexing2Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 2, 100)
}
func BenchmarkNullIndexing4Workers100Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 4, 100)
}
func BenchmarkNullIndexing1Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 1, 1000)
}
func BenchmarkNullIndexing2Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 2, 1000)
}
func BenchmarkNullIndexing4Workers1000Batch(b *testing.B) {
CommonBenchmarkIndexBatch(b, null.Name, nil, DestroyNull, 4, 1000)
}

156
index/firestorm/comp.go Normal file

@ -0,0 +1,156 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"math/rand"
"sort"
"sync"
"github.com/steveyen/gtreap"
"github.com/willf/bitset"
)
type Compensator struct {
inFlightMutex sync.RWMutex
maxRead uint64
inFlight *gtreap.Treap
deletedMutex sync.RWMutex
deletedDocNumbers *bitset.BitSet
}
func NewCompensator() *Compensator {
rv := Compensator{
inFlight: gtreap.NewTreap(inFlightItemCompare),
deletedDocNumbers: bitset.New(1000000),
}
return &rv
}
type Snapshot struct {
maxRead uint64
inFlight *gtreap.Treap
deletedDocNumbers *bitset.BitSet
}
// returns which doc number is valid
// if none, then 0
func (s *Snapshot) Which(docID []byte, docNumList DocNumberList) uint64 {
inFlightVal := s.inFlight.Get(&InFlightItem{docID: docID})
sort.Sort(docNumList) // Descending ordering.
for _, docNum := range docNumList {
if docNum > 0 && docNum <= s.maxRead &&
(inFlightVal == nil || inFlightVal.(*InFlightItem).docNum == docNum) &&
!s.deletedDocNumbers.Test(uint(docNum)) {
return docNum
}
}
return 0
}
func (s *Snapshot) Valid(docID []byte, docNum uint64) bool {
logger.Printf("checking validity of: '%s' - % x - %d", docID, docID, docNum)
if docNum > s.maxRead {
return false
}
logger.Printf("<= maxRead")
inFlightVal := s.inFlight.Get(&InFlightItem{docID: docID})
if inFlightVal != nil && inFlightVal.(*InFlightItem).docNum != docNum {
return false
}
logger.Printf("not in flight")
if s.deletedDocNumbers.Test(uint(docNum)) {
return false
}
logger.Printf("not deleted")
return true
}
func (c *Compensator) Mutate(docID []byte, docNum uint64) {
c.inFlightMutex.Lock()
defer c.inFlightMutex.Unlock()
c.inFlight = c.inFlight.Upsert(&InFlightItem{docID: docID, docNum: docNum}, rand.Int())
if docNum != 0 {
c.maxRead = docNum
}
}
func (c *Compensator) MutateBatch(inflightItems []*InFlightItem, lastDocNum uint64) {
c.inFlightMutex.Lock()
defer c.inFlightMutex.Unlock()
for _, item := range inflightItems {
c.inFlight = c.inFlight.Upsert(item, rand.Int())
}
c.maxRead = lastDocNum
}
func (c *Compensator) Migrate(docID []byte, docNum uint64, oldDocNums []uint64) {
c.inFlightMutex.Lock()
defer c.inFlightMutex.Unlock()
c.deletedMutex.Lock()
defer c.deletedMutex.Unlock()
// clone deleted doc numbers and mutate
if len(oldDocNums) > 0 {
newDeletedDocNumbers := c.deletedDocNumbers.Clone()
for _, oldDocNum := range oldDocNums {
newDeletedDocNumbers.Set(uint(oldDocNum))
}
// update pointer
c.deletedDocNumbers = newDeletedDocNumbers
}
// remove entry from in-flight if it still has same doc num
val := c.inFlight.Get(&InFlightItem{docID: docID})
if val != nil && val.(*InFlightItem).docNum == docNum {
c.inFlight = c.inFlight.Delete(&InFlightItem{docID: docID})
}
}
func (c *Compensator) GarbageCollect(docNums []uint64) {
c.deletedMutex.Lock()
defer c.deletedMutex.Unlock()
for _, docNum := range docNums {
c.deletedDocNumbers.Clear(uint(docNum))
}
}
func (c *Compensator) Snapshot() *Snapshot {
c.inFlightMutex.RLock()
defer c.inFlightMutex.RUnlock()
c.deletedMutex.RLock()
defer c.deletedMutex.RUnlock()
rv := Snapshot{
maxRead: c.maxRead,
inFlight: c.inFlight,
deletedDocNumbers: c.deletedDocNumbers,
}
return &rv
}
func (c *Compensator) GarbageCount() uint64 {
return uint64(c.deletedDocNumbers.Count())
}
//**************
type InFlightItem struct {
docID []byte
docNum uint64
}
func inFlightItemCompare(a, b interface{}) int {
return bytes.Compare(a.(*InFlightItem).docID, b.(*InFlightItem).docID)
}
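To make the compensation flow concrete, here is a minimal sketch (written as if it lived inside this package, using only the Compensator API defined above) of how an update of document "a" moves through the in-flight and deleted sets:

func compensatorLifecycleSketch() {
c := NewCompensator()
// Index() wrote rows for doc "a" under doc number 1
c.Mutate([]byte("a"), 1)
s := c.Snapshot()
_ = s.Valid([]byte("a"), 1) // true: in flight with a matching number
// an update rewrote doc "a" under doc number 2
c.Mutate([]byte("a"), 2)
// the lookuper then located old number 1 and retired it
c.Migrate([]byte("a"), 2, []uint64{1})
s = c.Snapshot()
_ = s.Valid([]byte("a"), 1) // false: number 1 is now on the deleted set
_ = s.Valid([]byte("a"), 2) // true: the live version
// the garbage collector eventually reclaims number 1's rows
c.GarbageCollect([]uint64{1})
}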


@ -0,0 +1,160 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"encoding/binary"
"fmt"
"sync"
"sync/atomic"
"time"
)
const DefaultDictUpdateThreshold = 10
var DefaultDictUpdateSleep = 1 * time.Second
type DictUpdater struct {
f *Firestorm
dictUpdateSleep time.Duration
quit chan struct{}
incoming chan map[string]int64
mutex sync.RWMutex
workingSet map[string]int64
closeWait sync.WaitGroup
batchesStarted uint64
batchesFlushed uint64
}
func NewDictUpdater(f *Firestorm) *DictUpdater {
rv := DictUpdater{
f: f,
dictUpdateSleep: DefaultDictUpdateSleep,
workingSet: make(map[string]int64),
batchesStarted: 1,
quit: make(chan struct{}),
incoming: make(chan map[string]int64, 8),
}
return &rv
}
func (d *DictUpdater) Notify(term string, usage int64) {
d.mutex.Lock()
defer d.mutex.Unlock()
d.workingSet[term] += usage
}
func (d *DictUpdater) NotifyBatch(termUsages map[string]int64) {
d.incoming <- termUsages
}
func (d *DictUpdater) Start() {
d.closeWait.Add(1)
go d.runIncoming()
go d.run()
}
func (d *DictUpdater) Stop() {
close(d.quit)
d.closeWait.Wait()
}
func (d *DictUpdater) runIncoming() {
for {
select {
case <-d.quit:
return
case termUsages, ok := <-d.incoming:
if !ok {
return
}
d.mutex.Lock()
for term, usage := range termUsages {
d.workingSet[term] += usage
}
d.mutex.Unlock()
}
}
}
func (d *DictUpdater) run() {
tick := time.Tick(d.dictUpdateSleep)
for {
select {
case <-d.quit:
logger.Printf("dictionary updater asked to quit")
d.closeWait.Done()
return
case <-tick:
logger.Printf("dictionary updater ticked")
d.update()
}
}
}
func (d *DictUpdater) update() {
d.mutex.Lock()
oldWorkingSet := d.workingSet
d.workingSet = make(map[string]int64)
atomic.AddUint64(&d.batchesStarted, 1)
d.mutex.Unlock()
// open a writer
writer, err := d.f.store.Writer()
if err != nil {
logger.Printf("dict updater fatal: %v", err)
return
}
// prepare batch
wb := writer.NewBatch()
dictionaryTermDelta := make([]byte, 8)
for term, delta := range oldWorkingSet {
binary.LittleEndian.PutUint64(dictionaryTermDelta, uint64(delta))
wb.Merge([]byte(term), dictionaryTermDelta)
}
err = writer.ExecuteBatch(wb)
if err != nil {
_ = writer.Close()
logger.Printf("dict updater fatal: %v", err)
return
}
atomic.AddUint64(&d.batchesFlushed, 1)
err = writer.Close()
if err != nil {
logger.Printf("dict updater error closing writer: %v", err)
}
}
// this is not intended to be used publicly, only for unit tests
// which depend on consistency we no longer provide
func (d *DictUpdater) waitTasksDone(dur time.Duration) error {
initial := atomic.LoadUint64(&d.batchesStarted)
timeout := time.After(dur)
tick := time.Tick(100 * time.Millisecond)
for {
select {
// Got a timeout! fail with a timeout error
case <-timeout:
flushed := atomic.LoadUint64(&d.batchesFlushed)
return fmt.Errorf("timeout, %d/%d", initial, flushed)
// Got a tick, check whether another batch has been flushed
case <-tick:
flushed := atomic.LoadUint64(&d.batchesFlushed)
if flushed > initial {
return nil
}
}
}
}
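The deltas queued above are folded into the dictionary rows by the KV store's merge operator (the mergeOperator handed to the store constructor in firestorm.go, not shown here). Below is a minimal sketch of such a summing operator, assuming bleve's RocksDB-style store.MergeOperator interface (FullMerge/PartialMerge/Name) and the DictionaryRow helpers from dict.go; the operator actually used by this package may differ:

type dictionaryMergeSketch struct{}
func (m *dictionaryMergeSketch) FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) {
var count int64
if existingValue != nil {
dr, err := NewDictionaryRowKV(key, existingValue)
if err != nil {
return nil, false
}
count = int64(dr.Count())
}
// each operand is the little-endian delta written by update();
// negative deltas wrap correctly under two's complement addition
for _, op := range operands {
count += int64(binary.LittleEndian.Uint64(op))
}
dr, err := NewDictionaryRowK(key)
if err != nil {
return nil, false
}
dr.SetCount(uint64(count))
return dr.Value(), true
}
func (m *dictionaryMergeSketch) PartialMerge(key, leftOperand, rightOperand []byte) ([]byte, bool) {
rv := make([]byte, 8)
binary.LittleEndian.PutUint64(rv, binary.LittleEndian.Uint64(leftOperand)+binary.LittleEndian.Uint64(rightOperand))
return rv, true
}
func (m *dictionaryMergeSketch) Name() string { return "dictionaryMergeSketch" }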


@ -0,0 +1,163 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"runtime"
"testing"
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestDictUpdater(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
dictBatch := map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 3,
}
dictExpect := map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 3,
}
f.(*Firestorm).dictUpdater.NotifyBatch(dictBatch)
// invoke updater manually
for len(f.(*Firestorm).dictUpdater.incoming) > 0 {
runtime.Gosched()
}
err = f.(*Firestorm).dictUpdater.waitTasksDone(5 * time.Second)
if err != nil {
t.Fatal(err)
}
// assert that dictionary rows are correct
reader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
for key := range dictBatch {
v, err := reader.Get([]byte(key))
if err != nil {
t.Fatal(err)
}
if v == nil {
t.Fatal("unexpected dictionary value missing")
}
dr, err := NewDictionaryRowKV([]byte(key), v)
if err != nil {
t.Fatal(err)
}
expect := dictExpect[key]
if int64(dr.Count()) != expect {
t.Errorf("expected %d, got %d", expect, dr.Count())
}
}
err = reader.Close()
if err != nil {
t.Fatal(err)
}
// update it again
dictBatch = map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 1,
}
dictExpect = map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 4,
}
f.(*Firestorm).dictUpdater.NotifyBatch(dictBatch)
// invoke updater manually
for len(f.(*Firestorm).dictUpdater.incoming) > 0 {
runtime.Gosched()
}
f.(*Firestorm).dictUpdater.update()
// assert that dictionary rows are correct
reader, err = f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
for key := range dictBatch {
v, err := reader.Get([]byte(key))
if err != nil {
t.Fatal(err)
}
dr, err := NewDictionaryRowKV([]byte(key), v)
if err != nil {
t.Fatal(err)
}
expect := dictExpect[key]
if int64(dr.Count()) != expect {
t.Errorf("expected %d, got %d", expect, dr.Count())
}
}
err = reader.Close()
if err != nil {
t.Fatal(err)
}
// update it again (decrement this time)
dictBatch = map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): -2,
}
dictExpect = map[string]int64{
string([]byte{'d', 1, 0, 'c', 'a', 't'}): 2,
}
f.(*Firestorm).dictUpdater.NotifyBatch(dictBatch)
// invoke updater manually
for len(f.(*Firestorm).dictUpdater.incoming) > 0 {
runtime.Gosched()
}
f.(*Firestorm).dictUpdater.update()
// assert that dictionary rows are correct
reader, err = f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
for key := range dictBatch {
v, err := reader.Get([]byte(key))
if err != nil {
t.Fatal(err)
}
dr, err := NewDictionaryRowKV([]byte(key), v)
if err != nil {
t.Fatal(err)
}
expect := dictExpect[key]
if int64(dr.Count()) != expect {
t.Errorf("expected %d, got %d", expect, dr.Count())
}
}
err = reader.Close()
if err != nil {
t.Fatal(err)
}
}


@ -0,0 +1,128 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"encoding/binary"
"io"
"github.com/golang/protobuf/proto"
)
const ByteSeparator byte = 0xff
var DictionaryKeyPrefix = []byte{'d'}
type DictionaryRow struct {
field uint16
term []byte
value DictionaryValue
}
func NewDictionaryRow(field uint16, term []byte, count uint64) *DictionaryRow {
rv := DictionaryRow{
field: field,
term: term,
}
rv.value.Count = proto.Uint64(count)
return &rv
}
func NewDictionaryRowK(key []byte) (*DictionaryRow, error) {
rv := DictionaryRow{}
buf := bytes.NewBuffer(key)
_, err := buf.ReadByte() // type
if err != nil {
return nil, err
}
err = binary.Read(buf, binary.LittleEndian, &rv.field)
if err != nil {
return nil, err
}
rv.term, err = buf.ReadBytes(ByteSeparator)
// there is no separator expected here, should get EOF
if err != io.EOF {
return nil, err
}
return &rv, nil
}
func (dr *DictionaryRow) parseDictionaryV(value []byte) error {
err := dr.value.Unmarshal(value)
if err != nil {
return err
}
return nil
}
func NewDictionaryRowKV(key, value []byte) (*DictionaryRow, error) {
rv, err := NewDictionaryRowK(key)
if err != nil {
return nil, err
}
err = rv.parseDictionaryV(value)
if err != nil {
return nil, err
}
return rv, nil
}
func (dr *DictionaryRow) Count() uint64 {
return dr.value.GetCount()
}
func (dr *DictionaryRow) SetCount(count uint64) {
dr.value.Count = proto.Uint64(count)
}
func (dr *DictionaryRow) KeySize() int {
return 3 + len(dr.term)
}
func (dr *DictionaryRow) KeyTo(buf []byte) (int, error) {
copy(buf[0:], DictionaryKeyPrefix)
binary.LittleEndian.PutUint16(buf[1:3], dr.field)
copy(buf[3:], dr.term)
return 3 + len(dr.term), nil
}
func (dr *DictionaryRow) Key() []byte {
buf := make([]byte, dr.KeySize())
n, _ := dr.KeyTo(buf)
return buf[:n]
}
func (dr *DictionaryRow) ValueSize() int {
return dr.value.Size()
}
func (dr *DictionaryRow) ValueTo(buf []byte) (int, error) {
return dr.value.MarshalTo(buf)
}
func (dr *DictionaryRow) Value() []byte {
buf := make([]byte, dr.ValueSize())
n, _ := dr.ValueTo(buf)
return buf[:n]
}
func DictionaryRowKey(field uint16, term []byte) []byte {
buf := make([]byte, 3+len(term))
copy(buf[0:], DictionaryKeyPrefix)
binary.LittleEndian.PutUint16(buf[1:3], field)
copy(buf[3:], term)
return buf
}


@ -0,0 +1,59 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestDictionaryRows(t *testing.T) {
tests := []struct {
input index.IndexRow
outKey []byte
outVal []byte
}{
{
NewDictionaryRow(0, []byte("test"), 3),
[]byte{DictionaryKeyPrefix[0], 0, 0, 't', 'e', 's', 't'},
[]byte{8, 3},
},
{
NewDictionaryRow(3, []byte("dictionary"), 734),
[]byte{DictionaryKeyPrefix[0], 3, 0, 'd', 'i', 'c', 't', 'i', 'o', 'n', 'a', 'r', 'y'},
[]byte{8, 222, 5},
},
}
// test going from struct to k/v bytes
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
// now test going back from k/v bytes to struct
for i, test := range tests {
row, err := NewDictionaryRowKV(test.outKey, test.outVal)
if err != nil {
t.Errorf("error parsking key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
}
}
}

92
index/firestorm/dump.go Normal file

@ -0,0 +1,92 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"fmt"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
)
// the functions in this file are only intended to be used by
// the bleve_dump utility and the debug http handlers
// if your application relies on them, you're doing something wrong
// they may change or be removed at any time
func (f *Firestorm) dumpPrefix(kvreader store.KVReader, rv chan interface{}, prefix []byte) error {
return visitPrefix(kvreader, prefix, func(key, val []byte) (bool, error) {
row, err := parseFromKeyValue(key, val)
if err != nil {
rv <- err
return false, err
}
rv <- row
return true, nil
})
}
func (f *Firestorm) dumpDoc(kvreader store.KVReader, rv chan interface{}, docID []byte) error {
// without a back index we have no choice but to walk the term freq and stored rows
// walk the term freqs
err := visitPrefix(kvreader, TermFreqKeyPrefix, func(key, val []byte) (bool, error) {
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
rv <- err
return false, err
}
if bytes.Equal(tfr.DocID(), docID) {
rv <- tfr
}
return true, nil
})
if err != nil {
return err
}
// now walk the stored
err = visitPrefix(kvreader, StoredKeyPrefix, func(key, val []byte) (bool, error) {
sr, err := NewStoredRowKV(key, val)
if err != nil {
rv <- err
return false, err
}
if bytes.Equal(sr.DocID(), docID) {
rv <- sr
}
return true, nil
})
return err
}
func parseFromKeyValue(key, value []byte) (index.IndexRow, error) {
if len(key) > 0 {
switch key[0] {
case VersionKey[0]:
return NewVersionRowV(value)
case FieldKeyPrefix[0]:
return NewFieldRowKV(key, value)
case DictionaryKeyPrefix[0]:
return NewDictionaryRowKV(key, value)
case TermFreqKeyPrefix[0]:
return NewTermFreqRowKV(key, value)
case StoredKeyPrefix[0]:
return NewStoredRowKV(key, value)
case InternalKeyPrefix[0]:
return NewInternalRowKV(key, value)
}
return nil, fmt.Errorf("Unknown field type '%s'", string(key[0]))
}
return nil, fmt.Errorf("Invalid empty key")
}


@ -0,0 +1,129 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"time"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
var dictWaitDuration = 5 * time.Second
func TestDump(t *testing.T) {
analysisQueue := index.NewAnalysisQueue(1)
idx, err := NewFirestorm(gtreap.Name, nil, analysisQueue)
if err != nil {
t.Fatal(err)
}
err = idx.Open()
if err != nil {
t.Fatalf("error opening index: %v", err)
}
defer func() {
err := idx.Close()
if err != nil {
t.Fatal(err)
}
}()
var expectedCount uint64
docCount, err := idx.DocCount()
if err != nil {
t.Error(err)
}
if docCount != expectedCount {
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
}
doc := document.NewDocument("1")
doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField))
doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, document.IndexField|document.StoreField))
dateField, err := document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField)
if err != nil {
t.Error(err)
}
doc.AddField(dateField)
err = idx.Update(doc)
if err != nil {
t.Errorf("Error updating index: %v", err)
}
doc = document.NewDocument("2")
doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test2"), document.IndexField|document.StoreField))
doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, document.IndexField|document.StoreField))
dateField, err = document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField)
if err != nil {
t.Error(err)
}
doc.AddField(dateField)
err = idx.Update(doc)
if err != nil {
t.Errorf("Error updating index: %v", err)
}
fieldsCount := 0
fieldsRows := idx.DumpFields()
for range fieldsRows {
fieldsCount++
}
if fieldsCount != 4 { // _id field is automatic
t.Errorf("expected 4 fields, got %d", fieldsCount)
}
// 1 id term
// 1 text term
// 16 numeric terms
// 16 date terms
// 3 stored fields
expectedDocRowCount := int(1 + 1 + (2 * (64 / document.DefaultPrecisionStep)) + 3)
docRowCount := 0
docRows := idx.DumpDoc("1")
for range docRows {
docRowCount++
}
if docRowCount != expectedDocRowCount {
t.Errorf("expected %d rows for document, got %d", expectedDocRowCount, docRowCount)
}
docRowCount = 0
docRows = idx.DumpDoc("2")
for range docRows {
docRowCount++
}
if docRowCount != expectedDocRowCount {
t.Errorf("expected %d rows for document, got %d", expectedDocRowCount, docRowCount)
}
err = idx.(*Firestorm).dictUpdater.waitTasksDone(dictWaitDuration)
if err != nil {
t.Fatal(err)
}
// 1 version
// fieldsCount field rows
// 2 docs * expectedDocRowCount
// 2 text term row count (2 different text terms)
// 16 numeric term row counts (shared for both docs, same numeric value)
// 16 date term row counts (shared for both docs, same date value)
//
expectedAllRowCount := int(1 + fieldsCount + (2 * expectedDocRowCount) + 2 + int((2 * (64 / document.DefaultPrecisionStep))))
allRowCount := 0
allRows := idx.DumpAll()
for range allRows {
allRowCount++
}
if allRowCount != expectedAllRowCount {
t.Errorf("expected %d rows for all, got %d", expectedAllRowCount, allRowCount)
}
}

119
index/firestorm/field.go Normal file

@ -0,0 +1,119 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"encoding/binary"
"fmt"
"github.com/blevesearch/bleve/index/store"
"github.com/golang/protobuf/proto"
)
var FieldKeyPrefix = []byte{'f'}
func (f *Firestorm) fieldIndexOrNewRow(name string) (uint16, *FieldRow) {
index, existed := f.fieldCache.FieldNamed(name, true)
if !existed {
return index, NewFieldRow(uint16(index), name)
}
return index, nil
}
func (f *Firestorm) loadFields(reader store.KVReader) (err error) {
err = visitPrefix(reader, FieldKeyPrefix, func(key, val []byte) (bool, error) {
fieldRow, err := NewFieldRowKV(key, val)
if err != nil {
return false, err
}
f.fieldCache.AddExisting(fieldRow.Name(), fieldRow.Index())
return true, nil
})
return
}
type FieldRow struct {
index uint16
value FieldValue
}
func NewFieldRow(i uint16, name string) *FieldRow {
rv := FieldRow{
index: i,
}
rv.value.Name = proto.String(name)
return &rv
}
func NewFieldRowKV(key, value []byte) (*FieldRow, error) {
rv := FieldRow{}
buf := bytes.NewBuffer(key)
_, err := buf.ReadByte() // type
if err != nil {
return nil, err
}
err = binary.Read(buf, binary.LittleEndian, &rv.index)
if err != nil {
return nil, err
}
err = rv.value.Unmarshal(value)
if err != nil {
return nil, err
}
return &rv, nil
}
func (fr *FieldRow) KeySize() int {
return 3
}
func (fr *FieldRow) KeyTo(buf []byte) (int, error) {
buf[0] = 'f'
binary.LittleEndian.PutUint16(buf[1:3], fr.index)
return 3, nil
}
func (fr *FieldRow) Key() []byte {
buf := make([]byte, fr.KeySize())
n, _ := fr.KeyTo(buf)
return buf[:n]
}
func (fr *FieldRow) ValueSize() int {
return fr.value.Size()
}
func (fr *FieldRow) ValueTo(buf []byte) (int, error) {
return fr.value.MarshalTo(buf)
}
func (fr *FieldRow) Value() []byte {
buf := make([]byte, fr.ValueSize())
n, _ := fr.ValueTo(buf)
return buf[:n]
}
func (fr *FieldRow) Index() uint16 {
return fr.index
}
func (fr *FieldRow) Name() string {
return fr.value.GetName()
}
func (fr *FieldRow) String() string {
return fmt.Sprintf("FieldRow - Field: %d - Name: %s\n", fr.index, fr.Name())
}


@ -0,0 +1,59 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestFieldRows(t *testing.T) {
tests := []struct {
input index.IndexRow
outKey []byte
outVal []byte
}{
{
NewFieldRow(0, "_id"),
[]byte{FieldKeyPrefix[0], 0, 0},
[]byte{10, 3, '_', 'i', 'd'},
},
{
NewFieldRow(1, "name"),
[]byte{FieldKeyPrefix[0], 1, 0},
[]byte{10, 4, 'n', 'a', 'm', 'e'},
},
}
// test going from struct to k/v bytes
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
// now test going back from k/v bytes to struct
for i, test := range tests {
row, err := NewFieldRowKV(test.outKey, test.outVal)
if err != nil {
t.Errorf("error parsking key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
}
}
}


@ -0,0 +1,551 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"encoding/json"
"fmt"
"sync"
"sync/atomic"
"time"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
"github.com/blevesearch/bleve/registry"
)
const Name = "firestorm"
var UnsafeBatchUseDetected = fmt.Errorf("bleve.Batch is NOT thread-safe, modification after execution detected")
type Firestorm struct {
storeName string
storeConfig map[string]interface{}
store store.KVStore
compensator *Compensator
analysisQueue *index.AnalysisQueue
fieldCache *index.FieldCache
highDocNumber uint64
docCount *uint64
garbageCollector *GarbageCollector
lookuper *Lookuper
dictUpdater *DictUpdater
stats *indexStat
}
func NewFirestorm(storeName string, storeConfig map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) {
initialCount := uint64(0)
rv := Firestorm{
storeName: storeName,
storeConfig: storeConfig,
compensator: NewCompensator(),
analysisQueue: analysisQueue,
fieldCache: index.NewFieldCache(),
docCount: &initialCount,
highDocNumber: 0,
stats: &indexStat{},
}
rv.stats.f = &rv
rv.garbageCollector = NewGarbageCollector(&rv)
rv.lookuper = NewLookuper(&rv)
rv.dictUpdater = NewDictUpdater(&rv)
return &rv, nil
}
func (f *Firestorm) Open() (err error) {
// open the kv store
storeConstructor := registry.KVStoreConstructorByName(f.storeName)
if storeConstructor == nil {
err = index.ErrorUnknownStorageType
return
}
// now open the store
f.store, err = storeConstructor(&mergeOperator, f.storeConfig)
if err != nil {
return
}
// start a reader
var kvreader store.KVReader
kvreader, err = f.store.Reader()
if err != nil {
return
}
// assert correct version, and find out if this is new index
var newIndex bool
newIndex, err = f.checkVersion(kvreader)
if err != nil {
return
}
if !newIndex {
// process existing index before opening
err = f.warmup(kvreader)
if err != nil {
return
}
}
err = kvreader.Close()
if err != nil {
return
}
if newIndex {
// prepare a new index
err = f.bootstrap()
if err != nil {
return
}
}
// start the garbage collector
f.garbageCollector.Start()
// start the lookuper
f.lookuper.Start()
// start the dict updater
f.dictUpdater.Start()
return
}
func (f *Firestorm) Close() error {
f.garbageCollector.Stop()
f.lookuper.Stop()
f.dictUpdater.Stop()
return f.store.Close()
}
func (f *Firestorm) DocCount() (uint64, error) {
count := atomic.LoadUint64(f.docCount)
return count, nil
}
func (f *Firestorm) Update(doc *document.Document) (err error) {
// assign this document a number
doc.Number = atomic.AddUint64(&f.highDocNumber, 1)
// do analysis before acquiring write lock
analysisStart := time.Now()
resultChan := make(chan *index.AnalysisResult)
aw := index.NewAnalysisWork(f, doc, resultChan)
// put the work on the queue
f.analysisQueue.Queue(aw)
// wait for the result
result := <-resultChan
close(resultChan)
atomic.AddUint64(&f.stats.analysisTime, uint64(time.Since(analysisStart)))
// start a writer for this update
indexStart := time.Now()
var kvwriter store.KVWriter
kvwriter, err = f.store.Writer()
if err != nil {
return
}
defer func() {
if cerr := kvwriter.Close(); err == nil && cerr != nil {
err = cerr
}
}()
var dictionaryDeltas map[string]int64
dictionaryDeltas, err = f.batchRows(kvwriter, [][]index.IndexRow{result.Rows}, nil)
if err != nil {
_ = kvwriter.Close()
atomic.AddUint64(&f.stats.errors, 1)
return
}
f.compensator.Mutate([]byte(doc.ID), doc.Number)
f.lookuper.NotifyBatch([]*InFlightItem{&InFlightItem{[]byte(doc.ID), doc.Number}})
f.dictUpdater.NotifyBatch(dictionaryDeltas)
atomic.AddUint64(&f.stats.indexTime, uint64(time.Since(indexStart)))
return
}
func (f *Firestorm) Delete(id string) error {
indexStart := time.Now()
f.compensator.Mutate([]byte(id), 0)
f.lookuper.NotifyBatch([]*InFlightItem{&InFlightItem{[]byte(id), 0}})
atomic.AddUint64(&f.stats.indexTime, uint64(time.Since(indexStart)))
return nil
}
func (f *Firestorm) batchRows(writer store.KVWriter, rowsOfRows [][]index.IndexRow, deleteKeys [][]byte) (map[string]int64, error) {
dictionaryDeltas := make(map[string]int64)
// count up bytes needed for buffering.
addNum := 0
addKeyBytes := 0
addValBytes := 0
deleteNum := 0
deleteKeyBytes := 0
var kbuf []byte
prepareBuf := func(buf []byte, sizeNeeded int) []byte {
if cap(buf) < sizeNeeded {
return make([]byte, sizeNeeded, sizeNeeded+128)
}
return buf[0:sizeNeeded]
}
for _, rows := range rowsOfRows {
for _, row := range rows {
tfr, ok := row.(*TermFreqRow)
if ok {
if tfr.Field() != 0 {
kbuf = prepareBuf(kbuf, tfr.DictionaryRowKeySize())
klen, err := tfr.DictionaryRowKeyTo(kbuf)
if err != nil {
return nil, err
}
dictionaryDeltas[string(kbuf[0:klen])] += 1
}
}
addKeyBytes += row.KeySize()
addValBytes += row.ValueSize()
}
addNum += len(rows)
}
for _, dk := range deleteKeys {
deleteKeyBytes += len(dk)
}
deleteNum += len(deleteKeys)
// prepare batch
totBytes := addKeyBytes + addValBytes + deleteKeyBytes
buf, wb, err := writer.NewBatchEx(store.KVBatchOptions{
TotalBytes: totBytes,
NumSets: addNum,
NumDeletes: deleteNum,
NumMerges: 0,
})
if err != nil {
return nil, err
}
defer func() {
_ = wb.Close()
}()
for _, rows := range rowsOfRows {
for _, row := range rows {
klen, err := row.KeyTo(buf)
if err != nil {
return nil, err
}
vlen, err := row.ValueTo(buf[klen:])
if err != nil {
return nil, err
}
wb.Set(buf[0:klen], buf[klen:klen+vlen])
buf = buf[klen+vlen:]
}
}
for _, dk := range deleteKeys {
dklen := copy(buf, dk)
wb.Delete(buf[0:dklen])
buf = buf[dklen:]
}
// write out the batch
err = writer.ExecuteBatch(wb)
if err != nil {
return nil, err
}
return dictionaryDeltas, nil
}
func (f *Firestorm) Batch(batch *index.Batch) (err error) {
// acquire enough doc numbers for all updates in the batch
// FIXME we actually waste doc numbers because deletes are in the
// same map and we don't need numbers for them
lastDocNumber := atomic.AddUint64(&f.highDocNumber, uint64(len(batch.IndexOps)))
firstDocNumber := lastDocNumber - uint64(len(batch.IndexOps)) + 1
analysisStart := time.Now()
resultChan := make(chan *index.AnalysisResult)
var docsUpdated uint64
var docsDeleted uint64
for _, doc := range batch.IndexOps {
if doc != nil {
doc.Number = firstDocNumber // actually assign doc numbers here
firstDocNumber++
docsUpdated++
} else {
docsDeleted++
}
}
var detectedUnsafeMutex sync.RWMutex
detectedUnsafe := false
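// analyze in a goroutine; if the caller mutates the batch map while we
// iterate, we may observe more non-nil docs than were counted above, and
// we flag the batch as unsafe instead of queueing the extra work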
go func() {
sofar := uint64(0)
for _, doc := range batch.IndexOps {
if doc != nil {
sofar++
if sofar > docsUpdated {
detectedUnsafeMutex.Lock()
detectedUnsafe = true
detectedUnsafeMutex.Unlock()
return
}
aw := index.NewAnalysisWork(f, doc, resultChan)
// put the work on the queue
f.analysisQueue.Queue(aw)
}
}
}()
// extra 1 capacity for internal updates.
collectRows := make([][]index.IndexRow, 0, docsUpdated+1)
// wait for the result
var itemsDeQueued uint64
for itemsDeQueued < docsUpdated {
result := <-resultChan
collectRows = append(collectRows, result.Rows)
itemsDeQueued++
}
close(resultChan)
detectedUnsafeMutex.RLock()
defer detectedUnsafeMutex.RUnlock()
if detectedUnsafe {
return UnsafeBatchUseDetected
}
atomic.AddUint64(&f.stats.analysisTime, uint64(time.Since(analysisStart)))
var deleteKeys [][]byte
if len(batch.InternalOps) > 0 {
// add the internal ops
updateInternalRows := make([]index.IndexRow, 0, len(batch.InternalOps))
for internalKey, internalValue := range batch.InternalOps {
if internalValue == nil {
// delete
deleteInternalRow := NewInternalRow([]byte(internalKey), nil)
deleteKeys = append(deleteKeys, deleteInternalRow.Key())
} else {
updateInternalRow := NewInternalRow([]byte(internalKey), internalValue)
updateInternalRows = append(updateInternalRows, updateInternalRow)
}
}
collectRows = append(collectRows, updateInternalRows)
}
inflightItems := make([]*InFlightItem, 0, len(batch.IndexOps))
for docID, doc := range batch.IndexOps {
if doc != nil {
inflightItems = append(inflightItems,
&InFlightItem{[]byte(docID), doc.Number})
} else {
inflightItems = append(inflightItems,
&InFlightItem{[]byte(docID), 0})
}
}
indexStart := time.Now()
// start a writer for this batch
var kvwriter store.KVWriter
kvwriter, err = f.store.Writer()
if err != nil {
return
}
var dictionaryDeltas map[string]int64
dictionaryDeltas, err = f.batchRows(kvwriter, collectRows, deleteKeys)
if err != nil {
_ = kvwriter.Close()
atomic.AddUint64(&f.stats.errors, 1)
return
}
f.compensator.MutateBatch(inflightItems, lastDocNumber)
err = kvwriter.Close()
f.lookuper.NotifyBatch(inflightItems)
f.dictUpdater.NotifyBatch(dictionaryDeltas)
atomic.AddUint64(&f.stats.indexTime, uint64(time.Since(indexStart)))
if err == nil {
atomic.AddUint64(&f.stats.updates, docsUpdated)
atomic.AddUint64(&f.stats.deletes, docsDeleted)
atomic.AddUint64(&f.stats.batches, 1)
} else {
atomic.AddUint64(&f.stats.errors, 1)
}
return
}
func (f *Firestorm) SetInternal(key, val []byte) (err error) {
internalRow := NewInternalRow(key, val)
var writer store.KVWriter
writer, err = f.store.Writer()
if err != nil {
return
}
defer func() {
if cerr := writer.Close(); err == nil && cerr != nil {
err = cerr
}
}()
wb := writer.NewBatch()
wb.Set(internalRow.Key(), internalRow.Value())
return writer.ExecuteBatch(wb)
}
func (f *Firestorm) DeleteInternal(key []byte) (err error) {
internalRow := NewInternalRow(key, nil)
var writer store.KVWriter
writer, err = f.store.Writer()
if err != nil {
return
}
defer func() {
if cerr := writer.Close(); err == nil && cerr != nil {
err = cerr
}
}()
wb := writer.NewBatch()
wb.Delete(internalRow.Key())
return writer.ExecuteBatch(wb)
}
func (f *Firestorm) DumpAll() chan interface{} {
rv := make(chan interface{})
go func() {
defer close(rv)
// start an isolated reader for use during the dump
kvreader, err := f.store.Reader()
if err != nil {
rv <- err
return
}
defer func() {
cerr := kvreader.Close()
if cerr != nil {
rv <- cerr
}
}()
err = f.dumpPrefix(kvreader, rv, nil)
if err != nil {
rv <- err
return
}
}()
return rv
}
func (f *Firestorm) DumpDoc(docID string) chan interface{} {
rv := make(chan interface{})
go func() {
defer close(rv)
// start an isolated reader for use during the dump
kvreader, err := f.store.Reader()
if err != nil {
rv <- err
return
}
defer func() {
cerr := kvreader.Close()
if cerr != nil {
rv <- cerr
}
}()
err = f.dumpDoc(kvreader, rv, []byte(docID))
if err != nil {
rv <- err
return
}
}()
return rv
}
func (f *Firestorm) DumpFields() chan interface{} {
rv := make(chan interface{})
go func() {
defer close(rv)
// start an isolated reader for use during the dump
kvreader, err := f.store.Reader()
if err != nil {
rv <- err
return
}
defer func() {
cerr := kvreader.Close()
if cerr != nil {
rv <- cerr
}
}()
err = f.dumpPrefix(kvreader, rv, FieldKeyPrefix)
if err != nil {
rv <- err
return
}
}()
return rv
}
func (f *Firestorm) Reader() (index.IndexReader, error) {
return newFirestormReader(f)
}
func (f *Firestorm) Stats() json.Marshaler {
return f.stats
}
func (f *Firestorm) Wait(timeout time.Duration) error {
return f.dictUpdater.waitTasksDone(timeout)
}
func init() {
registry.RegisterIndexType(Name, NewFirestorm)
}


@ -0,0 +1,382 @@
# Firestorm
A new indexing scheme for Bleve.
## Background
### Goals
- Avoid a single writer that must pause writing to perform computation
- either by allowing multiple writers, if computation cannot be avoided
- or by having a single writer which can insert rows uninterrupted
- Avoid the need for a back index
- the back index is expensive from a space perspective
- by not writing it out, we should be able to obtain a higher indexing throughput
- consulting the back index is one of the read/think/update cycles we are trying to avoid
### Considerations
- The cost for not maintaining a back index is paid in two places
- Searches may need to read more rows, because old/deleted rows may still exist
- These rows can be excluded, so correctness is not affected, but searches that touch them will be slower
- Old/Deleted rows need to be cleaned up at some point
- This could either be through an explicit cleanup thread, the job of which is to constantly walk the kvstore looking for rows to delete
- Or, it could be integrated with a KV store's natural merge/compaction process (as in RocksDB)
### Semantics
It is helpful to review the desired semantics between the Index/Delete operations and Term Searches.
#### Index(doc_id, doc)
- Empty Index
- Term Search for "cat" = empty result set
The Index operation should update the index such that after the operation returns, a matching search would return the document.
- Index("a", "small cat")
- Term Search for "cat" = {"a"}
Calling the Index operation again for the same doc_id should update the index such that after the operation returns, only searches matching the newest version return the document.
- Index("a", "big dog")
- Term Search for "cat" = empty result set
- Term Search for "dog" = {"a"}
NOTE:
- At no point during the second index operation would concurrent searches for "cat" and "dog" both return 0 results.
- At no point during the second index operation would concurrent searches for "cat" and "dog" both return 1 result.
#### Delete(doc_id)
- Index("a", "small cat")
- Term Search for "cat" = {"a"}
- Delete("a")
- Term Search for "cat" = empty result set
Once the Delete operation returns, the document should no longer be returned by any search.
## Details
### Terminology
Document ID (`doc_id`)
: The user-specified identifier (utf8 string). This never changes for a document.
Document Number (`doc_number`)
: The Bleve internal identifier (uint64). These numbers are generated from an atomic counter.
DocIdNumber
: Concatenation of `<doc_id> 0xff <doc_number>`
### Theory of Operation
By including a new unique identifier as a part of every row generated, the index operation no longer concerns itself with updating existing values or deleting previous values.
Removal of old rows is handled independently by separate threads.
Correct semantics with respect to added/updated/deleted documents are maintained through synchronized in-memory data structures, which compensate for the decoupling of these other operations.
The Dictionary becomes a best-effort data element. In kill -9 scenarios it could become incorrect, but it is believed that this will generally only affect scoring, not correctness, and we can pursue read-repair operations.
### Index State
The following pseudo-structure will be used to explain changes to the internal state. Keep in mind the datatypes shown represent the logical structure required for correct behavior. The actual implementation may be different to achieve performance goals.
indexState {
docCount uint64
fieldCache map[string]uint16
nextDocNumber uint64
docIdNumberMutex sync.RWMutex // for protecting fields below
maxReadDocNumber uint64
inFlightDocIds map[string]uint64
deletedDocIdNumbers [][]byte
}
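In the implementation, these fields are spread across the structures in this change: docCount, fieldCache, and nextDocNumber (highDocNumber) live on the Firestorm struct, while maxReadDocNumber, inFlightDocIds, and deletedDocIdNumbers correspond to the Compensator's maxRead, inFlight treap, and deletedDocNumbers bitset.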
### Operation
#### Creating New Index
- New KV Batch
- SET VersionRow{version=X}
- SET FieldRow{field_id=0 field_name="_id"}
- Execute Batch
- Index State initialized to:
{
docCount = 0
fieldCache = {
"_id": 0
}
nextDocNumber = 1
maxReadDocNumber = 0
inFlightDocIds = {}
deletedDocIdNumbers = {}
}
- Garbage Collector Thread is started
- Old Doc Number Lookup Thread is started
- Index marked open
#### Opening an Existing Index
- GET VersionRow, assert current version or exit
- ITERATE all FieldRows{}
- ITERATE all TermFrequencyRow{ where field_id = 0 }
- Identify consecutive rows with same doc_id but different doc_number
- Lower document numbers are added to the deletedDocIdNumbers list
- Count all non-duplicate rows, seed the docCount
- Observe highest document number seen, seed nextDocNumber
- Index State initialized to:
{
docCount = <as counted above>
fieldCache = {
"_id": 0
<as scanned above>
}
nextDocNumber = <as scanned above> + 1
maxReadDocNumber = <same as nextDocNumber>
inFlightDocIds = {}
deletedDocIdNumbers = {<as scanned above>}
}
- Garbage Collector Thread is started
- Old Doc Number Lookup Thread is started
- Index marked open
#### Garbage Collector Thread
The role of the Garbage Collector thread is to clean up rows referring to document numbers that are no longer relevant (document was deleted or updated).
Currently, only two types of rows include document numbers:
- Term Frequency Rows
- Stored Rows
The current thought is that the garbage collector thread will use a single iterator to iterate the following key spaces:
- TermFrequencyRow { where field_id > 0}
- StoredRow {all}
For any row referring to a document number on the deletedDocNumbers list, that key will be DELETED.
The garbage collector will track loop iterations or the start key for each deletedDocNumber so that it knows when it has walked a full circle for a given doc number. At that point, the following happen in order:
- docNumber is removed from the deletedDocNumbers list
- DELETE is issued on TermFreqRow{ field_id=0, term=doc_id, doc_id=doc_id_number }
The last thing we do is delete the TermFreqRow for field 0. If anything crashes at any point prior to this, we will again read this record on our next warmup and that doc_id_number will again go through the garbage collection process.
#### Old Doc Number Lookup Thread
The role of the Old Doc Number Lookup thread is to asynchronously look up the old document numbers still in use for a given document id.
It waits in a select loop reading from a channel. Through this channel it is notified of a doc_id where work is to be done. When a doc_id comes in, the following is performed:
- Acquire indexState.docIdNumberMutex for reading:
- Read maxReadDocNumber
- Find doc_id/doc_number k/v pair in the inFlightDocIds map
- Release indexState.docIdNumberMutex
- Start Iterator at TermFrequency{ field_id=0 term=doc_id}
- Iterator until term != doc_id
All doc_numbers found that are less than maxReadDocNumber and != doc_number in the inFlightDocIds map are now scheduled for deletion.
- Acquire indexState.docIdNumberMutex for writing:
- add doc numbers to deletedDocIdNumbers
- check if doc_number in inFlightDocIds is still the same
- if so delete it
- if not, it was updated again, so we must leave it
- Release indexState.docIdNumberMutex
Notify Garbage Collector Thread directly of new doc_numbers.
#### Term Dictionary Updater Thread
The role of the Term Dictionary Updater thread is to asynchronously perform best-effort updates to the Term Dictionary. Note the contents of the Term Dictionary only affect scoring, and not correctness of query results.
NOTE: one case where correctness could be affected is if the dictionary is completely missing a term which has non-zero usage. Since the garbage collector thread is continually looking at these rows, its help could be enlisted to detect/repair this situation.
It is notified via a channel of increased term usage (by index ops) and of decreased term usage (by the garbage collector cleaning up old usage).
#### Indexing a Document
- Perform all analysis on the document.
- new_doc_number = indexState.nextDocNumber++
- Create New Batch
- Batch will contain SET operations for:
- any new Fields
- Term Frequency Rows for indexed fields terms
- Stored Rows for stored fields
- Execute Batch
- Acquire indexState.docIdNumberMutex for writing:
- set maxReadDocNumber = new_doc_number
- set inFlightDocIds{ docId = new_doc_number }
- Release indexState.docIdNumberMutex
- Notify the Term Dictionary Updater thread of increased term usage.
- Notify Old Doc Number Lookup Thread of doc_id.
The key property is that a search matching the updated document *SHOULD* return the document once this method returns. If the document was an update, it should return the previous document until this method returns. There should be no period of time where neither document matches.
#### Deleting a Document
- Acquire indexState.docIdNumberMutex for writing:
- set inFlightDocIds{ docId = 0 } // 0 is a doc number we never use; it indicates a pending deletion of docId
- Release indexState.docIdNumberMutex
- Notify Old Doc Number Lookup Thread of doc_id.
#### Batch Operations
Batch operations look largely just like the indexing/deleting operations. Two other optimizations come into play.
- More SET operations in the underlying batch
- Larger aggregated updates can be passed to the Term Dictionary Updater Thread
#### Term Field Iteration
Term Field Iteration is used by the basic term search. It produces the set of documents (and related info like term vectors) which used the specified term in the specified field.
Before iterating, capture a consistent view of the compensation state:
- Acquire indexState.docIdNumberMutex for reading:
- Get a copy of (it is assumed some COW data structure is used, or MVCC is accommodated in some way by the implementation):
- maxReadDocNumber
- inFlightDocIds
- deletedDocIdNumbers
- Release indexState.docIdNumberMutex
Iterator starts at key:
```'t' <field id uint16> <term utf8> 0xff```
Iterator ends when the term does not match.
- Any row with doc_number > maxReadDocNumber MUST be ignored.
- Any row with doc_id_number on the deletedDocIdNumber list MUST be ignored.
- Any row with the same doc_id as an entry in the inFlightDocIds map, MUST have the same number.
Any row satisfying the above conditions is a candidate document.
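Below is a sketch of this filtering in Go, using the visitPrefix helper and row types from this change; it assumes TermFreqRow exposes a DocNum() accessor alongside the DocID() used in dump.go, and relies on the Snapshot's Valid method, which encodes exactly the three rules above:

func termFieldCandidates(reader store.KVReader, s *Snapshot, field uint16, term []byte) error {
// start key: 't' <field id uint16> <term utf8> 0xff
prefix := make([]byte, 0, 4+len(term))
prefix = append(prefix, 't')
var fieldBuf [2]byte
binary.LittleEndian.PutUint16(fieldBuf[:], field)
prefix = append(prefix, fieldBuf[:]...)
prefix = append(prefix, term...)
prefix = append(prefix, ByteSeparator)
return visitPrefix(reader, prefix, func(key, val []byte) (bool, error) {
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return false, err
}
if s.Valid(tfr.DocID(), tfr.DocNum()) {
// candidate document: it passed all three rules
}
return true, nil
})
}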
### Row Encoding
All keys are manually encoded to ensure a precise row ordering.
Internal Row values are opaque byte arrays.
All other values are encoded using protobuf for a balance of efficiency and flexibility. Dictionary and TermFrequency rows are the most likely to take advantage of this flexibility, but other rows are read/written infrequently enough that the flexibility outweighs any overhead.
#### Version
There is a single version row which records which version of the firestorm indexing scheme is in use.
| Key | Value |
|---------|------------|
|```'v'```|```<VersionValue protobuf>```|
message VersionValue {
required uint64 version = 1;
}
#### Field
Field rows map field names to numeric values
| Key | Value |
|---------|------------|
|```'f' <field id uint16>```|```<FieldValue protobuf>```|
message FieldValue {
required string name = 1;
}
#### Dictionary
Dictionary rows record which terms are used in a particular field. The value can be used to store additional information about the term usage. The value will be encoded using protobuf so that future versions can add data to this structure.
| Key | Value |
|---------|------------|
|```'d' <field id uint16> <term utf8>```|```<DictionaryValue protobuf>```|
message DictionaryValue {
optional uint64 count = 1; // number of documents using this term in this field
}
#### Term Frequency
Term Frequency rows record which documents use a term in a particular field. The value must record how often the term occurs. It may optionally include other details such as a normalization value (precomputed scoring adjustment for the length of the field) and term vectors (where the term occurred within the field). The value will be encoded using protobuf so that future versions can add data to this structure.
| Key | Value |
|---------|------------|
|```'t' <field id uint16> <term utf8> 0xff <doc_id utf8 > 0xff <doc number uint64>```|```<TermFreqValue protobuf>```|
message TermVectorEntry {
optional uint32 field = 1; // field optional if redundant, required for composite fields
optional uint64 pos = 2; // positional offset within the field
optional uint64 start = 3; // start byte offset
optional uint64 end = 4; // end byte offset
repeated uint64 arrayPositions = 5; // array positions
}
message TermFrequencyValue {
required uint64 freq = 1; // frequency of the term occurrence within this field
optional float norm = 2; // normalization factor
repeated TermVectorEntry vectors = 3; // term vectors
}
#### Stored
Stored rows record the original values used to produce the index. At the row encoding level this is an opaque sequence of bytes.
| Key | Value |
|---------------------------|-------------------------|
|```'s' <doc id utf8> 0xff <doc number uint64> <field id uint16>```|```<StoredValue protobuf>```|
message StoredValue {
optional bytes raw = 1; // raw bytes
}
NOTE: we currently encode stored values as raw bytes, however we have other proposals in flight to do something better than this. By using protobuf here as well, we can support existing functionality through the raw field, but allow for more strongly typed information in the future.
#### Internal
Internal rows are a reserved keyspace which the layer above can use for anything it wants.
| Key | Value |
|---------------------------|-------------------------|
|```'i' <application key []byte>```|```<application value []byte>```|
### FAQ
1. How do you ensure correct semantics while updating a document in the index?
Let us consider 5 possible states:
a. Document X#1 is in the index, maxReadDocNumber=1, inFlightDocIds{}, deletedDocIdNumbers{}
b. Document X#1 and X#2 are in the index, maxReadDocNumber=1, inFlightDocIds{}, deletedDocIdNumbers{}
c. Document X#1 and X#2 are in the index, maxReadDocNumber=2, inFlightDocIds{X:2}, deletedDocIdNumbers{}
d. Document X#1 and X#2 are in the index, maxReadDocNumber=2, inFlightDocIds{}, deletedDocIdNumbers{X#1}
e. Document X#2 is in the index, maxReadDocNumber=2, inFlightDocIds{}, deletedDocIdNumbers{}
In state a, we have a steady state where one document has been indexed with id X.
In state b, we have executed the batch that writes the new rows corresponding to the new version of X, but we have not yet updated our in-memory compensation data structures. This is OK, because maxReadDocNumber is still 1, so all readers will ignore the new rows we just wrote. This is also OK because we are still inside the Index() method, so there is not yet any expectation to see the updated document.
In state c, we have updated both the maxReadDocNumber to 2 and added X:2 to the inFlightDocIds map. This means that searchers could find rows corresponding to X#1 and X#2. However, they are forced to disregard any row for X where the document number is not 2.
In state d, we have completed the lookup for the old document numbers of X, and found 1. Now deletedDocIdNumbers contains X#1. Now readers that encounter this doc_id_number will ignore it.
In state e, the garbage collector has removed all record of X#1.
The Index method returns after it has transitioned to state c, which maintains the semantics we desire.
2\. Wait, what happens if I kill -9 the process, won't you forget about the deleted documents?
No, our proposal is for a warmup process to walk a subset of the keyspace (TermFreq{ where field_id=0 }). This warmup process will identify all not-yet cleaned up document numbers, and seed the deletedDocIdNumbers state as well as the Garbage Collector Thread.
3\. Wait, but what will happen to the inFlightDocIds in a kill -9 scenario?
It turns out they actually don't matter. That list was just an optimization to get us through the window of time while we hadn't yet looked up the old document numbers for a given document id. But, during the warmup phase we still identify all those keys and they go directly onto deletedDocIdNumbers list.

File diff suppressed because it is too large


@ -0,0 +1,31 @@
package firestorm;
message VersionValue {
required uint64 version = 1;
}
message FieldValue {
required string name = 1;
}
message DictionaryValue {
optional uint64 count = 1; // number of documents using this term in this field
}
message TermVector {
optional uint32 field = 1; // field optional if redundant, required for composite fields
optional uint64 pos = 2; // positional offset within the field
optional uint64 start = 3; // start byte offset
optional uint64 end = 4; // end byte offset
repeated uint64 arrayPositions = 5; // array positions
}
message TermFreqValue {
required uint64 freq = 1; // frequency of the term occurrence within this field
optional float norm = 2; // normalization factor
repeated TermVector vectors = 3; // term vectors
}
message StoredValue {
optional bytes raw = 1; // raw bytes
}

File diff suppressed because it is too large.

index/firestorm/garbage.go Normal file

@ -0,0 +1,235 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"math"
"sync"
"time"
)
const DefaultGarbageThreshold = 10
const DefaultMaxDocsPerPass = 1000
var DefaultGarbageSleep = 15 * time.Second
type GarbageCollector struct {
f *Firestorm
garbageThreshold int
garbageSleep time.Duration
maxDocsPerPass int
quit chan struct{}
mutex sync.RWMutex
workingSet map[uint64][]byte
closeWait sync.WaitGroup
}
func NewGarbageCollector(f *Firestorm) *GarbageCollector {
rv := GarbageCollector{
f: f,
garbageThreshold: DefaultGarbageThreshold,
garbageSleep: DefaultGarbageSleep,
maxDocsPerPass: DefaultMaxDocsPerPass,
quit: make(chan struct{}),
workingSet: make(map[uint64][]byte),
}
return &rv
}
func (gc *GarbageCollector) Notify(docNum uint64, docId []byte) {
gc.mutex.Lock()
defer gc.mutex.Unlock()
gc.workingSet[docNum] = docId
}
func (gc *GarbageCollector) Start() {
gc.closeWait.Add(1)
go gc.run()
}
func (gc *GarbageCollector) Stop() {
close(gc.quit)
gc.closeWait.Wait()
}
func (gc *GarbageCollector) run() {
tick := time.Tick(gc.garbageSleep)
for {
select {
case <-gc.quit:
logger.Printf("garbage collector asked to quit")
gc.closeWait.Done()
return
case <-tick:
logger.Printf("garbage collector ticked")
garbageSize := gc.f.compensator.GarbageCount()
docSize, err := gc.f.DocCount()
if err != nil {
logger.Printf("garbage collector error getting doc count: %v", err)
continue
}
if docSize == 0 {
continue
}
// interpret the threshold as a percentage of the current doc count
garbageRatio := int(uint64(garbageSize) * 100 / docSize)
if garbageRatio > gc.garbageThreshold {
gc.cleanup()
} else {
logger.Printf("garbage ratio only %d, waiting", garbageRatio)
}
}
}
}
func (gc *GarbageCollector) NextBatch(n int) []uint64 {
gc.mutex.RLock()
defer gc.mutex.RUnlock()
rv := make([]uint64, 0, n)
i := 0
for k := range gc.workingSet {
rv = append(rv, k)
i++
if i >= n {
break
}
}
return rv
}
func (gc *GarbageCollector) cleanup() {
logger.Printf("garbage collector starting")
// get list of deleted doc numbers to work on this pass
deletedDocNumsList := gc.NextBatch(gc.maxDocsPerPass) //gc.f.deletedDocNumbers.Keys(gc.maxDocsPerPass)
logger.Printf("found %d doc numbers to cleanup", len(deletedDocNumsList))
// put these documents numbers in a map, for faster checking
// and for organized keys to be deleted
deletedDocNums := make(map[uint64][][]byte)
for _, deletedDocNum := range deletedDocNumsList {
deletedDocNums[deletedDocNum] = make([][]byte, 0)
}
reader, err := gc.f.store.Reader()
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
defer func() {
if cerr := reader.Close(); cerr != nil {
logger.Printf("garbage collector error closing reader: %v", cerr)
}
}()
// walk all the term freq rows (where field > 0)
termFreqStart := TermFreqIteratorStart(0, []byte{ByteSeparator})
termFreqEnd := TermFreqIteratorStart(math.MaxUint16, []byte{ByteSeparator})
var tfr TermFreqRow
dictionaryDeltas := make(map[string]int64)
err = visitRange(reader, termFreqStart, termFreqEnd, func(key, val []byte) (bool, error) {
err := tfr.ParseKey(key)
if err != nil {
return false, err
}
docNum := tfr.DocNum()
if docNumKeys, deleted := deletedDocNums[docNum]; deleted {
// this doc number has been deleted, place key into map
deletedDocNums[docNum] = append(docNumKeys, key)
if tfr.Field() != 0 {
drk := tfr.DictionaryRowKey()
dictionaryDeltas[string(drk)] -= 1
}
}
return true, nil
})
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
// walk all the stored rows
var sr StoredRow
err = visitPrefix(reader, StoredKeyPrefix, func(key, val []byte) (bool, error) {
err := sr.ParseKey(key)
if err != nil {
return false, err
}
docNum := sr.DocNum()
if docNumKeys, deleted := deletedDocNums[docNum]; deleted {
// this doc number has been deleted, place key into map
deletedDocNums[docNum] = append(docNumKeys, key)
}
return true, nil
})
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
// now process each doc one at a time
for docNum, docKeys := range deletedDocNums {
// delete keys for a doc number
logger.Printf("deleting keys for %d", docNum)
// open a writer
writer, err := gc.f.store.Writer()
if err != nil {
// writer is nil on error, so there is nothing to close here
logger.Printf("garbage collector fatal: %v", err)
return
}
// prepare batch
wb := writer.NewBatch()
for _, k := range docKeys {
wb.Delete(k)
}
err = writer.ExecuteBatch(wb)
if err != nil {
_ = writer.Close()
logger.Printf("garbage collector fatal: %v", err)
return
}
logger.Printf("deleted %d keys", len(docKeys))
// remove it from delete keys list
docID := gc.workingSet[docNum]
delete(gc.workingSet, docNum)
gc.f.compensator.GarbageCollect([]uint64{docNum})
// now delete the original marker row (field 0)
tfidrow := NewTermFreqRow(0, nil, docID, docNum, 0, 0, nil)
markerRowKey := tfidrow.Key()
markerBatch := writer.NewBatch()
markerBatch.Delete(markerRowKey)
err = writer.ExecuteBatch(markerBatch)
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
err = writer.Close()
if err != nil {
logger.Printf("garbage collector fatal: %v", err)
return
}
}
// updating dictionary in one batch
gc.f.dictUpdater.NotifyBatch(dictionaryDeltas)
logger.Printf("garbage collector finished")
}


@ -0,0 +1,132 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"time"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestGarbageCleanup(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []struct {
row index.IndexRow
garbage bool
}{
// needed for warmup to work
{NewFieldRow(0, IDFieldName), false},
// 3 documents, with 2 older versions
{NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil), true},
{NewTermFreqRow(0, nil, []byte("a"), 2, 0, 0.0, nil), false},
{NewTermFreqRow(0, nil, []byte("b"), 3, 0, 0.0, nil), false},
{NewTermFreqRow(0, nil, []byte("c"), 4, 0, 0.0, nil), true},
{NewTermFreqRow(0, nil, []byte("c"), 5, 0, 0.0, nil), false},
// additional records for these docs which should be removed
{NewTermFreqRow(1, []byte("cat"), []byte("a"), 1, 3, 2.0, nil), true},
{NewTermFreqRow(1, []byte("cat"), []byte("c"), 4, 1, 1.0, nil), true},
{NewStoredRow([]byte("a"), 1, 1, nil, []byte("tcat")), true},
{NewStoredRow([]byte("c"), 4, 1, nil, []byte("tcat")), true},
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.row.Key(), row.row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
// warmup ensures that deletedDocNums is seeded correctly
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
// now invoke garbage collector cleanup manually
f.(*Firestorm).garbageCollector.cleanup()
// assert that garbage rows are gone
reader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
for _, row := range rows {
v, err := reader.Get(row.row.Key())
if err != nil {
t.Fatal(err)
}
if v != nil && row.garbage {
t.Errorf("garbage row not deleted, key: %s", row.row.Key())
}
if v == nil && !row.garbage {
t.Errorf("non-garbage row deleted, key: %s", row.row.Key())
}
}
err = reader.Close()
if err != nil {
t.Fatal(err)
}
// assert that deletedDocsNumbers size is 0
if f.(*Firestorm).compensator.GarbageCount() != 0 {
t.Errorf("expected deletedDocsNumbers size to be 0, got %d", f.(*Firestorm).compensator.GarbageCount())
}
}
func TestGarbageDontPanicOnEmptyDocs(t *testing.T) {
idx, err := NewFirestorm("", nil, index.NewAnalysisQueue(1))
if err != nil {
t.Fatal(err)
}
f := idx.(*Firestorm)
gc := NewGarbageCollector(f)
gc.garbageSleep = 30 * time.Millisecond
gc.Start()
time.Sleep(40 * time.Millisecond)
gc.Stop()
}


@ -0,0 +1,67 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import "fmt"
var InternalKeyPrefix = []byte{'i'}
type InternalRow struct {
key []byte
val []byte
}
func NewInternalRow(key, val []byte) *InternalRow {
rv := InternalRow{
key: key,
val: val,
}
return &rv
}
func NewInternalRowKV(key, value []byte) (*InternalRow, error) {
rv := InternalRow{}
rv.key = key[1:]
rv.val = value
return &rv, nil
}
func (ir *InternalRow) KeySize() int {
return 1 + len(ir.key)
}
func (ir *InternalRow) KeyTo(buf []byte) (int, error) {
buf[0] = 'i'
copy(buf[1:], ir.key)
return 1 + len(ir.key), nil
}
func (ir *InternalRow) Key() []byte {
buf := make([]byte, ir.KeySize())
n, _ := ir.KeyTo(buf)
return buf[:n]
}
func (ir *InternalRow) ValueSize() int {
return len(ir.val)
}
func (ir *InternalRow) ValueTo(buf []byte) (int, error) {
copy(buf, ir.val)
return len(ir.val), nil
}
func (ir *InternalRow) Value() []byte {
return ir.val
}
func (ir *InternalRow) String() string {
return fmt.Sprintf("InternalStore - Key: %s (% x) Val: %s (% x)", ir.key, ir.key, ir.val, ir.val)
}


@ -0,0 +1,54 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestInternalRows(t *testing.T) {
tests := []struct {
input index.IndexRow
outKey []byte
outVal []byte
}{
{
NewInternalRow([]byte("key"), []byte("val")),
[]byte{'i', 'k', 'e', 'y'},
[]byte{'v', 'a', 'l'},
},
}
// test going from struct to k/v bytes
for i, test := range tests {
rk := test.input.Key()
if !reflect.DeepEqual(rk, test.outKey) {
t.Errorf("Expected key to be %v got: %v", test.outKey, rk)
}
rv := test.input.Value()
if !reflect.DeepEqual(rv, test.outVal) {
t.Errorf("Expected value to be %v got: %v for %d", test.outVal, rv, i)
}
}
// now test going back from k/v bytes to struct
for i, test := range tests {
row, err := NewInternalRowKV(test.outKey, test.outVal)
if err != nil {
t.Errorf("error parsking key/value: %v", err)
}
if !reflect.DeepEqual(row, test.input) {
t.Errorf("Expected: %#v got: %#v for %d", test.input, row, i)
}
}
}

index/firestorm/lookup.go Normal file

@ -0,0 +1,146 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"fmt"
"sync"
"sync/atomic"
"time"
)
const channelBufferSize = 1000
type Lookuper struct {
f *Firestorm
workChan chan []*InFlightItem
quit chan struct{}
closeWait sync.WaitGroup
tasksQueued uint64
tasksDone uint64
}
func NewLookuper(f *Firestorm) *Lookuper {
rv := Lookuper{
f: f,
workChan: make(chan []*InFlightItem, channelBufferSize),
quit: make(chan struct{}),
}
return &rv
}
func (l *Lookuper) NotifyBatch(items []*InFlightItem) {
atomic.AddUint64(&l.tasksQueued, 1)
l.workChan <- items
}
func (l *Lookuper) Start() {
l.closeWait.Add(1)
go l.run()
}
func (l *Lookuper) Stop() {
close(l.quit)
l.closeWait.Wait()
}
func (l *Lookuper) run() {
for {
select {
case <-l.quit:
logger.Printf("lookuper asked to quit")
l.closeWait.Done()
return
case items, ok := <-l.workChan:
if !ok {
logger.Printf("lookuper work channel closed unexpectedly, stopping")
return
}
l.lookupItems(items)
}
}
}
func (l *Lookuper) lookupItems(items []*InFlightItem) {
for _, item := range items {
l.lookup(item)
}
atomic.AddUint64(&l.tasksDone, 1)
}
func (l *Lookuper) lookup(item *InFlightItem) {
reader, err := l.f.store.Reader()
if err != nil {
logger.Printf("lookuper fatal: %v", err)
return
}
defer func() {
if cerr := reader.Close(); cerr != nil {
logger.Printf("lookuper error closing reader: %v", cerr)
}
}()
prefix := TermFreqPrefixFieldTermDocId(0, nil, item.docID)
logger.Printf("lookuper prefix - % x", prefix)
var tfk TermFreqRow
docNums := make(DocNumberList, 0)
err = visitPrefix(reader, prefix, func(key, val []byte) (bool, error) {
logger.Printf("lookuper sees key % x", key)
err := tfk.ParseKey(key)
if err != nil {
return false, err
}
docNum := tfk.DocNum()
docNums = append(docNums, docNum)
return true, nil
})
if err != nil {
logger.Printf("lookuper fatal: %v", err)
return
}
oldDocNums := make(DocNumberList, 0, len(docNums))
for _, docNum := range docNums {
if item.docNum == 0 || docNum < item.docNum {
oldDocNums = append(oldDocNums, docNum)
}
}
logger.Printf("lookup migrating '%s' - %d - oldDocNums: %v", item.docID, item.docNum, oldDocNums)
l.f.compensator.Migrate(item.docID, item.docNum, oldDocNums)
if len(oldDocNums) == 0 && item.docNum != 0 {
// this was an add, not an update
atomic.AddUint64(l.f.docCount, 1)
} else if len(oldDocNums) > 0 && item.docNum == 0 {
// this was a delete (and it previously existed)
atomic.AddUint64(l.f.docCount, ^uint64(0))
}
}
// this is not intended to be used publicly, only for unit tests
// which depend on a consistency guarantee we no longer provide
func (l *Lookuper) waitTasksDone(d time.Duration) error {
timeout := time.After(d)
tick := time.Tick(100 * time.Millisecond)
for {
select {
// Got a timeout! fail with a timeout error
case <-timeout:
return fmt.Errorf("timeout")
// Got a tick, check whether all queued tasks are done
case <-tick:
queued := atomic.LoadUint64(&l.tasksQueued)
done := atomic.LoadUint64(&l.tasksDone)
if queued == done {
return nil
}
}
}
}


@ -0,0 +1,83 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"testing"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestLookups(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []struct {
row index.IndexRow
garbage bool
}{
// needed for warmup to work
{NewFieldRow(0, IDFieldName), false},
// 3 documents, with 2 older versions
{NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil), true},
{NewTermFreqRow(0, nil, []byte("a"), 2, 0, 0.0, nil), false},
{NewTermFreqRow(0, nil, []byte("b"), 3, 0, 0.0, nil), false},
{NewTermFreqRow(0, nil, []byte("c"), 4, 0, 0.0, nil), true},
{NewTermFreqRow(0, nil, []byte("c"), 5, 0, 0.0, nil), false},
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.row.Key(), row.row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
// also see the compensator
if tfr, ok := row.row.(*TermFreqRow); ok {
f.(*Firestorm).compensator.Mutate(tfr.DocID(), tfr.DocNum())
// expect this mutation to be in the in-flight list
val := f.(*Firestorm).compensator.inFlight.Get(&InFlightItem{docID: tfr.DocID()})
if val == nil {
t.Errorf("expected key: % x to be in the inflight list", tfr.DocID())
}
f.(*Firestorm).lookuper.lookup(&InFlightItem{docID: tfr.DocID(), docNum: tfr.DocNum()})
// now expect this mutation to NOT be in the in-flight list
val = f.(*Firestorm).compensator.inFlight.Get(&InFlightItem{docID: tfr.DocID()})
if val != nil {
t.Errorf("expected key: % x to NOT be in the inflight list, got %v", tfr.DocID(), val)
}
}
}
// check that doc count is 3 at the end of this
docCount, err := f.DocCount()
if err != nil {
t.Fatal(err)
}
if docCount != 3 {
t.Errorf("expected doc count 3, got %d", docCount)
}
}

index/firestorm/merge.go Normal file

@ -0,0 +1,71 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"encoding/binary"
)
var mergeOperator firestormMerge
var dictionaryTermIncr []byte
var dictionaryTermDecr []byte
func init() {
dictionaryTermIncr = make([]byte, 8)
binary.LittleEndian.PutUint64(dictionaryTermIncr, uint64(1))
dictionaryTermDecr = make([]byte, 8)
var negOne = int64(-1)
binary.LittleEndian.PutUint64(dictionaryTermDecr, uint64(negOne))
}
type firestormMerge struct{}
func (m *firestormMerge) FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) {
// set up record based on key
dr, err := NewDictionaryRowK(key)
if err != nil {
return nil, false
}
if len(existingValue) > 0 {
// if existing value, parse it
err = dr.parseDictionaryV(existingValue)
if err != nil {
return nil, false
}
}
// now process operands
for _, operand := range operands {
next := int64(binary.LittleEndian.Uint64(operand))
if next < 0 && uint64(-next) > dr.Count() {
// subtracting next from existing would overflow
dr.SetCount(0)
} else if next < 0 {
dr.SetCount(dr.Count() - uint64(-next))
} else {
dr.SetCount(dr.Count() + uint64(next))
}
}
return dr.Value(), true
}
func (m *firestormMerge) PartialMerge(key, leftOperand, rightOperand []byte) ([]byte, bool) {
left := int64(binary.LittleEndian.Uint64(leftOperand))
right := int64(binary.LittleEndian.Uint64(rightOperand))
rv := make([]byte, 8)
binary.LittleEndian.PutUint64(rv, uint64(left+right))
return rv, true
}
func (m *firestormMerge) Name() string {
return "firestormMerge"
}


@ -0,0 +1,93 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"encoding/binary"
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
)
func TestPartialMerge(t *testing.T) {
tests := []struct {
in [][]byte
out uint64
}{
{
in: [][]byte{dictionaryTermIncr, dictionaryTermIncr, dictionaryTermIncr, dictionaryTermIncr, dictionaryTermIncr},
out: 5,
},
}
mo := &firestormMerge{}
for _, test := range tests {
curr := test.in[0]
for _, next := range test.in[1:] {
var ok bool
curr, ok = mo.PartialMerge([]byte("key"), curr, next)
if !ok {
t.Errorf("expected partial merge ok")
}
}
actual := decodeCount(curr)
if actual != test.out {
t.Errorf("expected %d, got %d", test.out, actual)
}
}
}
func decodeCount(in []byte) uint64 {
buf := bytes.NewBuffer(in)
count, _ := binary.ReadUvarint(buf)
return count
}
func TestFullMerge(t *testing.T) {
tests := []struct {
existing index.IndexRow
operands [][]byte
result index.IndexRow
success bool
}{
{
existing: NewDictionaryRow(1, []byte("term"), 3),
operands: [][]byte{dictionaryTermIncr, dictionaryTermIncr},
result: NewDictionaryRow(1, []byte("term"), 5),
success: true,
},
{
existing: NewDictionaryRow(1, []byte("term"), 3),
operands: [][]byte{dictionaryTermDecr, dictionaryTermDecr},
result: NewDictionaryRow(1, []byte("term"), 1),
success: true,
},
}
mo := &firestormMerge{}
for _, test := range tests {
existingVal := test.existing.Value()
actual, success := mo.FullMerge([]byte("key"), existingVal, test.operands)
if success != test.success {
t.Errorf("expected error %t, got %t", test.success, success)
}
expectedVal := test.result.Value()
if !reflect.DeepEqual(expectedVal, actual) {
t.Errorf("expected result %v, got %v", expectedVal, actual)
}
}
}

index/firestorm/reader.go Normal file

@ -0,0 +1,220 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"fmt"
"sort"
"github.com/blevesearch/bleve/document"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
)
type firestormReader struct {
f *Firestorm
r store.KVReader
s *Snapshot
docCount uint64
}
func newFirestormReader(f *Firestorm) (index.IndexReader, error) {
r, err := f.store.Reader()
if err != nil {
return nil, fmt.Errorf("error opening store reader: %v", err)
}
docCount, err := f.DocCount()
if err != nil {
return nil, fmt.Errorf("error opening store reader: %v", err)
}
rv := firestormReader{
f: f,
r: r,
s: f.compensator.Snapshot(),
docCount: docCount,
}
return &rv, nil
}
func (r *firestormReader) TermFieldReader(term []byte, field string) (index.TermFieldReader, error) {
fieldIndex, fieldExists := r.f.fieldCache.FieldNamed(field, false)
if fieldExists {
return newFirestormTermFieldReader(r, uint16(fieldIndex), term)
}
return newFirestormTermFieldReader(r, ^uint16(0), []byte{ByteSeparator})
}
func (r *firestormReader) DocIDReader(start, end string) (index.DocIDReader, error) {
return newFirestormDocIDReader(r, start, end)
}
func (r *firestormReader) FieldDict(field string) (index.FieldDict, error) {
return r.FieldDictRange(field, nil, nil)
}
func (r *firestormReader) FieldDictRange(field string, startTerm []byte, endTerm []byte) (index.FieldDict, error) {
fieldIndex, fieldExists := r.f.fieldCache.FieldNamed(field, false)
if fieldExists {
return newFirestormDictionaryReader(r, uint16(fieldIndex), startTerm, endTerm)
}
return newFirestormDictionaryReader(r, ^uint16(0), []byte{ByteSeparator}, []byte{})
}
func (r *firestormReader) FieldDictPrefix(field string, termPrefix []byte) (index.FieldDict, error) {
return r.FieldDictRange(field, termPrefix, incrementBytes(termPrefix))
}
func (r *firestormReader) Document(id string) (*document.Document, error) {
docID := []byte(id)
docNum, err := r.currDocNumForId(docID)
if err != nil {
return nil, err
} else if docNum == 0 {
return nil, nil
}
rv := document.NewDocument(id)
prefix := StoredPrefixDocIDNum(docID, docNum)
err = visitPrefix(r.r, prefix, func(key, val []byte) (bool, error) {
safeVal := make([]byte, len(val))
copy(safeVal, val)
row, err := NewStoredRowKV(key, safeVal)
if err != nil {
return false, err
}
if row != nil {
fieldName := r.f.fieldCache.FieldIndexed(row.field)
field := r.decodeFieldType(fieldName, row.arrayPositions, row.value.GetRaw())
if field != nil {
rv.AddField(field)
}
}
return true, nil
})
if err != nil {
return nil, err
}
return rv, nil
}
func (r *firestormReader) decodeFieldType(name string, pos []uint64, value []byte) document.Field {
switch value[0] {
case 't':
return document.NewTextField(name, pos, value[1:])
case 'n':
return document.NewNumericFieldFromBytes(name, pos, value[1:])
case 'd':
return document.NewDateTimeFieldFromBytes(name, pos, value[1:])
case 'b':
return document.NewBooleanFieldFromBytes(name, pos, value[1:])
}
return nil
}
func (r *firestormReader) currDocNumForId(docID []byte) (uint64, error) {
prefix := TermFreqPrefixFieldTermDocId(0, nil, docID)
docNums := make(DocNumberList, 0)
err := visitPrefix(r.r, prefix, func(key, val []byte) (bool, error) {
tfk, err := NewTermFreqRowKV(key, val)
if err != nil {
return false, err
}
docNum := tfk.DocNum()
docNums = append(docNums, docNum)
return true, nil
})
if err != nil {
return 0, err
}
if len(docNums) > 0 {
sort.Sort(docNums)
return docNums[0], nil
}
return 0, nil
}
func (r *firestormReader) DocumentFieldTerms(id string) (index.FieldTerms, error) {
docID := []byte(id)
docNum, err := r.currDocNumForId(docID)
if err != nil {
return nil, err
} else if docNum == 0 {
return nil, nil
}
rv := make(index.FieldTerms, 0)
// walk the term freqs
err = visitPrefix(r.r, TermFreqKeyPrefix, func(key, val []byte) (bool, error) {
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return false, err
}
if bytes.Equal(tfr.DocID(), docID) && tfr.DocNum() == docNum && tfr.Field() != 0 {
fieldName := r.f.fieldCache.FieldIndexed(uint16(tfr.Field()))
terms, ok := rv[fieldName]
if !ok {
terms = make([]string, 0, 1)
}
terms = append(terms, string(tfr.Term()))
rv[fieldName] = terms
}
return true, nil
})
if err != nil {
return nil, err
}
return rv, nil
}
func (r *firestormReader) Fields() ([]string, error) {
fields := make([]string, 0)
err := visitPrefix(r.r, FieldKeyPrefix, func(key, val []byte) (bool, error) {
fieldRow, err := NewFieldRowKV(key, val)
if err != nil {
return false, err
}
fields = append(fields, fieldRow.Name())
return true, nil
})
if err != nil {
return nil, err
}
return fields, nil
}
func (r *firestormReader) GetInternal(key []byte) ([]byte, error) {
internalRow := NewInternalRow(key, nil)
return r.r.Get(internalRow.Key())
}
func (r *firestormReader) DocCount() uint64 {
return r.docCount
}
func (r *firestormReader) Close() error {
return r.r.Close()
}
func incrementBytes(in []byte) []byte {
rv := make([]byte, len(in))
copy(rv, in)
for i := len(rv) - 1; i >= 0; i-- {
rv[i] = rv[i] + 1
if rv[i] != 0 {
// didn't overflow, so stop
break
}
}
return rv
}
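For illustration, FieldDictPrefix above relies on incrementBytes to turn a prefix into an exclusive range end; this assumed snippet (same package as the code above) shows the carry behavior:

```go
package firestorm

import "fmt"

// exampleIncrementBytes is illustrative only: incrementBytes treats the slice
// as a big-endian counter, yielding the smallest byte string that sorts after
// every key sharing the original prefix.
func exampleIncrementBytes() {
	fmt.Printf("%q\n", incrementBytes([]byte("cat")))       // "cau"
	fmt.Printf("% x\n", incrementBytes([]byte{0x01, 0xff})) // 02 00
}
```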


@ -0,0 +1,70 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"fmt"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
)
type firestormDictionaryReader struct {
r *firestormReader
field uint16
start []byte
i store.KVIterator
}
func newFirestormDictionaryReader(r *firestormReader, field uint16, start, end []byte) (*firestormDictionaryReader, error) {
startKey := DictionaryRowKey(field, start)
logger.Printf("start key '%s' - % x", startKey, startKey)
if end == nil {
end = []byte{ByteSeparator}
}
endKey := DictionaryRowKey(field, end)
logger.Printf("end key '%s' - % x", endKey, endKey)
i := r.r.RangeIterator(startKey, endKey)
rv := firestormDictionaryReader{
r: r,
field: field,
start: startKey,
i: i,
}
return &rv, nil
}
func (r *firestormDictionaryReader) Next() (*index.DictEntry, error) {
key, val, valid := r.i.Current()
if !valid {
return nil, nil
}
logger.Printf("see key '%s' - % x", key, key)
currRow, err := NewDictionaryRowKV(key, val)
if err != nil {
return nil, fmt.Errorf("unexpected error parsing dictionary row kv: %v", err)
}
rv := index.DictEntry{
Term: string(currRow.term),
Count: currRow.Count(),
}
// advance the iterator to the next term
r.i.Next()
return &rv, nil
}
func (r *firestormDictionaryReader) Close() error {
if r.i != nil {
return r.i.Close()
}
return nil
}


@ -0,0 +1,225 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"reflect"
"regexp"
"testing"
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/tokenizers/regexp_tokenizer"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
var testAnalyzer = &analysis.Analyzer{
Tokenizer: regexp_tokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)),
}
func TestDictionaryReader(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
NewFieldRow(1, "name"),
NewFieldRow(2, "desc"),
NewFieldRow(3, "prefix"),
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
kvwriter, err = f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows = []index.IndexRow{
// dictionary entries
NewDictionaryRow(1, []byte("test"), 4),
NewDictionaryRow(2, []byte("eat"), 1),
NewDictionaryRow(2, []byte("more"), 1),
NewDictionaryRow(2, []byte("rice"), 1),
NewDictionaryRow(3, []byte("bob"), 1),
NewDictionaryRow(3, []byte("cat"), 1),
NewDictionaryRow(3, []byte("cats"), 1),
NewDictionaryRow(3, []byte("catting"), 1),
NewDictionaryRow(3, []byte("dog"), 1),
NewDictionaryRow(3, []byte("doggy"), 1),
NewDictionaryRow(3, []byte("zoo"), 1),
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
// now try it
r, err := f.Reader()
if err != nil {
t.Fatal(err)
}
dict, err := r.FieldDict("name")
if err != nil {
t.Errorf("error creating reader: %v", err)
}
termCount := 0
curr, err := dict.Next()
for err == nil && curr != nil {
termCount++
if curr.Term != "test" {
t.Errorf("expected term to be 'test', got '%s'", curr.Term)
}
curr, err = dict.Next()
}
if termCount != 1 {
t.Errorf("expected 1 term for this field, got %d", termCount)
}
err = dict.Close()
if err != nil {
t.Fatal(err)
}
dict, err = r.FieldDict("desc")
if err != nil {
t.Errorf("error creating reader: %v", err)
}
termCount = 0
terms := make([]string, 0)
curr, err = dict.Next()
for err == nil && curr != nil {
termCount++
terms = append(terms, curr.Term)
curr, err = dict.Next()
}
if termCount != 3 {
t.Errorf("expected 3 term for this field, got %d", termCount)
}
expectedTerms := []string{"eat", "more", "rice"}
if !reflect.DeepEqual(expectedTerms, terms) {
t.Errorf("expected %#v, got %#v", expectedTerms, terms)
}
err = dict.Close()
if err != nil {
t.Fatal(err)
}
// test start and end range
dict, err = r.FieldDictRange("desc", []byte("fun"), []byte("nice"))
if err != nil {
t.Errorf("error creating reader: %v", err)
}
termCount = 0
terms = make([]string, 0)
curr, err = dict.Next()
for err == nil && curr != nil {
termCount++
terms = append(terms, curr.Term)
curr, err = dict.Next()
}
if termCount != 1 {
t.Errorf("expected 1 term for this field, got %d", termCount)
}
expectedTerms = []string{"more"}
if !reflect.DeepEqual(expectedTerms, terms) {
t.Errorf("expected %#v, got %#v", expectedTerms, terms)
}
err = dict.Close()
if err != nil {
t.Fatal(err)
}
// test use case for prefix
dict, err = r.FieldDictPrefix("prefix", []byte("cat"))
if err != nil {
t.Errorf("error creating reader: %v", err)
}
termCount = 0
terms = make([]string, 0)
curr, err = dict.Next()
for err == nil && curr != nil {
termCount++
terms = append(terms, curr.Term)
curr, err = dict.Next()
}
if termCount != 3 {
t.Errorf("expected 3 term for this field, got %d", termCount)
}
expectedTerms = []string{"cat", "cats", "catting"}
if !reflect.DeepEqual(expectedTerms, terms) {
t.Errorf("expected %#v, got %#v", expectedTerms, terms)
}
err = dict.Close()
if err != nil {
t.Fatal(err)
}
err = r.Close()
if err != nil {
t.Fatal(err)
}
}


@ -0,0 +1,120 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"github.com/blevesearch/bleve/index/store"
)
type firestormDocIDReader struct {
r *firestormReader
start []byte
i store.KVIterator
}
func newFirestormDocIDReader(r *firestormReader, start, end string) (*firestormDocIDReader, error) {
startKey := TermFreqIteratorStart(0, nil)
if start != "" {
startKey = TermFreqPrefixFieldTermDocId(0, nil, []byte(start))
}
logger.Printf("start key '%s' - % x", startKey, startKey)
endKey := TermFreqIteratorStart(0, []byte{ByteSeparator})
if end != "" {
endKey = TermFreqPrefixFieldTermDocId(0, nil, []byte(end))
}
logger.Printf("end key '%s' - % x", endKey, endKey)
i := r.r.RangeIterator(startKey, endKey)
rv := firestormDocIDReader{
r: r,
start: startKey,
i: i,
}
return &rv, nil
}
func (r *firestormDocIDReader) Next() (string, error) {
if r.i != nil {
key, val, valid := r.i.Current()
for valid {
logger.Printf("see key: '%s' - % x", key, key)
tfrsByDocNum := make(map[uint64]*TermFreqRow)
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return "", err
}
tfrsByDocNum[tfr.DocNum()] = tfr
// now we have a possible row, but there may be more rows for the same docid
// find these now
err = r.findNextTfrsWithSameDocId(tfrsByDocNum, tfr.DocID())
if err != nil {
return "", err
}
docNumList := make(DocNumberList, 0, len(tfrsByDocNum))
for dn := range tfrsByDocNum {
docNumList = append(docNumList, dn)
}
logger.Printf("docNumList: %v", docNumList)
highestValidDocNum := r.r.s.Which(tfr.docID, docNumList)
if highestValidDocNum == 0 {
// no valid doc number
key, val, valid = r.i.Current()
continue
}
logger.Printf("highest valid: %d", highestValidDocNum)
tfr = tfrsByDocNum[highestValidDocNum]
return string(tfr.DocID()), nil
}
}
return "", nil
}
// FIXME this is identical to the one in reader_terms.go
func (r *firestormDocIDReader) findNextTfrsWithSameDocId(tfrsByDocNum map[uint64]*TermFreqRow, docID []byte) error {
tfrDocIdPrefix := TermFreqPrefixFieldTermDocId(0, nil, docID)
r.i.Next()
key, val, valid := r.i.Current()
for valid && bytes.HasPrefix(key, tfrDocIdPrefix) {
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return err
}
tfrsByDocNum[tfr.DocNum()] = tfr
r.i.Next()
key, val, valid = r.i.Current()
}
return nil
}
func (r *firestormDocIDReader) Advance(docID string) (string, error) {
if r.i != nil {
tfrDocIdPrefix := TermFreqPrefixFieldTermDocId(0, nil, []byte(docID))
r.i.Seek(tfrDocIdPrefix)
return r.Next()
}
return "", nil
}
func (r *firestormDocIDReader) Close() error {
if r.i != nil {
return r.i.Close()
}
return nil
}


@ -0,0 +1,187 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"math/rand"
"reflect"
"testing"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store/gtreap"
)
func TestDocIDReaderSomeGarbage(t *testing.T) {
aq := index.NewAnalysisQueue(1)
f, err := NewFirestorm(gtreap.Name, nil, aq)
if err != nil {
t.Fatal(err)
}
err = f.Open()
if err != nil {
t.Fatal(err)
}
kvwriter, err := f.(*Firestorm).store.Writer()
if err != nil {
t.Fatal(err)
}
rows := []index.IndexRow{
NewFieldRow(0, IDFieldName),
NewFieldRow(1, "desc"),
NewTermFreqRow(0, nil, []byte("a"), 1, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("b"), 2, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("c"), 3, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("d"), 4, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("a"), 5, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("b"), 6, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("e"), 7, 0, 0.0, nil),
NewTermFreqRow(0, nil, []byte("g"), 8, 0, 0.0, nil),
// first version of all docs have cat
NewTermFreqRow(1, []byte("cat"), []byte("a"), 1, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("b"), 2, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("c"), 3, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("d"), 4, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("e"), 7, 1, 1.0, nil),
NewTermFreqRow(1, []byte("cat"), []byte("g"), 8, 1, 1.0, nil),
// updated version of a still has cat
NewTermFreqRow(1, []byte("cat"), []byte("a"), 5, 1, 1.0, nil),
// updated version of b does NOT have cat
// c has delete in-flight
// d has delete not-yet-garbage-collected
}
for _, row := range rows {
wb := kvwriter.NewBatch()
wb.Set(row.Key(), row.Value())
err = kvwriter.ExecuteBatch(wb)
if err != nil {
t.Fatal(err)
}
}
f.(*Firestorm).compensator.inFlight = f.(*Firestorm).compensator.inFlight.Upsert(&InFlightItem{docID: []byte("c"), docNum: 0}, rand.Int())
f.(*Firestorm).compensator.deletedDocNumbers.Set(4)
err = kvwriter.Close()
if err != nil {
t.Fatal(err)
}
kvreader, err := f.(*Firestorm).store.Reader()
if err != nil {
t.Fatal(err)
}
// warmup to load field cache and set maxRead correctly
err = f.(*Firestorm).warmup(kvreader)
if err != nil {
t.Fatal(err)
}
err = kvreader.Close()
if err != nil {
t.Fatal(err)
}
r, err := f.Reader()
if err != nil {
t.Fatal(err)
}
dr, err := r.DocIDReader("", "")
if err != nil {
t.Fatal(err)
}
expectedDocIds := []string{"a", "b", "e", "g"}
foundDocIds := make([]string, 0)
next, err := dr.Next()
for next != "" && err == nil {
foundDocIds = append(foundDocIds, next)
next, err = dr.Next()
}
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(expectedDocIds, foundDocIds) {
t.Errorf("expected: %v, got %v", expectedDocIds, foundDocIds)
}
err = dr.Close()
if err != nil {
t.Fatal(err)
}
// now test with some doc id ranges
dr, err = r.DocIDReader("b", "f")
if err != nil {
t.Fatal(err)
}
expectedDocIds = []string{"b", "e"}
foundDocIds = make([]string, 0)
next, err = dr.Next()
for next != "" && err == nil {
foundDocIds = append(foundDocIds, next)
next, err = dr.Next()
}
if err != nil {
t.Fatal(err)
}
if !reflect.DeepEqual(expectedDocIds, foundDocIds) {
t.Errorf("expected: %v, got %v", expectedDocIds, foundDocIds)
}
err = dr.Close()
if err != nil {
t.Fatal(err)
}
// now try again and use Advance to skip over "e"
dr, err = r.DocIDReader("b", "")
if err != nil {
t.Fatal(err)
}
expectedDocIds = []string{"b", "g"}
foundDocIds = make([]string, 0)
next, err = dr.Next()
if err != nil {
t.Fatal(err)
} else {
foundDocIds = append(foundDocIds, next)
}
next, err = dr.Advance("f")
if err != nil {
t.Fatal(err)
} else {
foundDocIds = append(foundDocIds, next)
}
if !reflect.DeepEqual(expectedDocIds, foundDocIds) {
t.Errorf("expected: %v, got %v", expectedDocIds, foundDocIds)
}
err = dr.Close()
if err != nil {
t.Fatal(err)
}
err = r.Close()
if err != nil {
t.Fatal(err)
}
}


@ -0,0 +1,159 @@
// Copyright (c) 2015 Couchbase, Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
// except in compliance with the License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
package firestorm
import (
"bytes"
"github.com/blevesearch/bleve/index"
"github.com/blevesearch/bleve/index/store"
)
type firestormTermFieldReader struct {
r *firestormReader
field uint16
term []byte
prefix []byte
count uint64
i store.KVIterator
}
func newFirestormTermFieldReader(r *firestormReader, field uint16, term []byte) (index.TermFieldReader, error) {
dictionaryKey := DictionaryRowKey(field, term)
dictionaryValue, err := r.r.Get(dictionaryKey)
if err != nil {
return nil, err
}
prefix := TermFreqIteratorStart(field, term)
logger.Printf("starting term freq iterator at: '%s' - % x", prefix, prefix)
i := r.r.PrefixIterator(prefix)
rv := firestormTermFieldReader{
r: r,
field: field,
term: term,
prefix: prefix,
i: i,
}
// NOTE: in firestorm the dictionary row is advisory in nature
// it *may* tell us the correct count
// if this record does not exist, it does NOT mean that there is no
// usage; we must scan the term frequencies to be sure
if dictionaryValue != nil {
dictionaryRow, err := NewDictionaryRowKV(dictionaryKey, dictionaryValue)
if err != nil {
return nil, err
}
rv.count = dictionaryRow.Count()
}
return &rv, nil
}
func (r *firestormTermFieldReader) Next() (*index.TermFieldDoc, error) {
if r.i != nil {
key, val, valid := r.i.Current()
for valid {
logger.Printf("see key: '%s' - % x", key, key)
tfrsByDocNum := make(map[uint64]*TermFreqRow)
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return nil, err
}
tfrsByDocNum[tfr.DocNum()] = tfr
// now we have a possible row, but there may be more rows for the same docid
// find these now
err = r.findNextTfrsWithSameDocId(tfrsByDocNum, tfr.DocID())
if err != nil {
return nil, err
}
docNumList := make(DocNumberList, 0, len(tfrsByDocNum))
for dn := range tfrsByDocNum {
docNumList = append(docNumList, dn)
}
logger.Printf("docNumList: %v", docNumList)
highestValidDocNum := r.r.s.Which(tfr.docID, docNumList)
if highestValidDocNum == 0 {
// no valid doc number
key, val, valid = r.i.Current()
continue
}
logger.Printf("highest valid: %d", highestValidDocNum)
tfr = tfrsByDocNum[highestValidDocNum]
return &index.TermFieldDoc{
ID: string(tfr.DocID()),
Freq: tfr.Freq(),
Norm: float64(tfr.Norm()),
Vectors: r.termFieldVectorsFromTermVectors(tfr.Vectors()),
}, nil
}
}
return nil, nil
}
func (r *firestormTermFieldReader) findNextTfrsWithSameDocId(tfrsByDocNum map[uint64]*TermFreqRow, docID []byte) error {
tfrDocIdPrefix := TermFreqPrefixFieldTermDocId(r.field, r.term, docID)
r.i.Next()
key, val, valid := r.i.Current()
for valid && bytes.HasPrefix(key, tfrDocIdPrefix) {
tfr, err := NewTermFreqRowKV(key, val)
if err != nil {
return err
}
tfrsByDocNum[tfr.DocNum()] = tfr
r.i.Next()
key, val, valid = r.i.Current()
}
return nil
}
func (r *firestormTermFieldReader) Advance(docID string) (*index.TermFieldDoc, error) {
if r.i != nil {
tfrDocIdPrefix := TermFreqPrefixFieldTermDocId(r.field, r.term, []byte(docID))
r.i.Seek(tfrDocIdPrefix)
return r.Next()
}
return nil, nil
}
func (r *firestormTermFieldReader) Count() uint64 {
return r.count
}
func (r *firestormTermFieldReader) Close() error {
if r.i != nil {
return r.i.Close()
}
return nil
}
func (r *firestormTermFieldReader) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
rv := make([]*index.TermFieldVector, len(in))
for i, tv := range in {
fieldName := r.r.f.fieldCache.FieldIndexed(uint16(tv.GetField()))
tfv := index.TermFieldVector{
Field: fieldName,
ArrayPositions: tv.GetArrayPositions(),
Pos: tv.GetPos(),
Start: tv.GetStart(),
End: tv.GetEnd(),
}
rv[i] = &tfv
}
return rv
}

Some files were not shown because too many files have changed in this diff.