From 3d842dfaf2068ab34199941a7c29dcbb724ef35d Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 17 Apr 2014 16:55:53 -0400 Subject: [PATCH] initial commit --- .gitignore | 10 + README.md | 3 + .../analyzers/keyword_analyzer/keyword.go | 24 + .../analyzers/standard_analyzer/standard.go | 39 ++ .../html_char_filter/html_char_filter.go | 32 ++ .../html_char_filter/html_char_filter_test.go | 52 ++ .../regexp_char_filter/regexp_char_filter.go | 30 ++ analysis/freq.go | 55 +++ .../length_filter/length_filter.go | 44 ++ .../length_filter/length_filter_test.go | 102 ++++ .../lower_case_filter/lower_case_filter.go | 35 ++ .../lower_case_filter_test.go | 52 ++ .../stemmer_filter/stemmer_filter.go | 46 ++ .../stemmer_filter/stemmer_filter_test.go | 52 ++ .../stop_words_filter/stop_words_filter.go | 53 ++ .../stop_words_filter_test.go | 55 +++ .../regexp_tokenizer/regexp_tokenizer.go | 40 ++ .../simple_word_boundary.go | 29 ++ .../simple_word_boundary_test.go | 51 ++ .../tokenizers/single_token/single_token.go | 31 ++ .../single_token/single_token_test.go | 67 +++ .../unicode_word_boundary/boundary.go | 114 +++++ .../unicode_word_boundary/boundary_test.go | 125 +++++ analysis/type.go | 59 +++ document/document.go | 34 ++ document/field.go | 29 ++ document/field_text.go | 41 ++ document/indexing_options.go | 27 + document/indexing_options_test.go | 68 +++ examples/bleve_index_json/main.go | 63 +++ examples/bleve_query/main.go | 70 +++ index/index.go | 48 ++ index/mock/mock.go | 227 +++++++++ index/mock/mock_test.go | 124 +++++ index/upside_down/reader.go | 101 ++++ index/upside_down/reader_test.go | 111 +++++ index/upside_down/row.go | 412 ++++++++++++++++ index/upside_down/row_test.go | 89 ++++ index/upside_down/upside_down.go | 466 ++++++++++++++++++ index/upside_down/upside_down_test.go | 221 +++++++++ search/collector.go | 21 + search/collector_top_score.go | 96 ++++ search/collector_top_score_test.go | 107 ++++ search/explanation.go | 28 ++ search/query.go | 19 + 
search/query_term.go | 32 ++ search/scorer_term.go | 172 +++++++ search/search.go | 39 ++ search/search_term.go | 84 ++++ search/search_test.go | 50 ++ search/sqrt_cache.go | 24 + shredder/json_shredder.go | 64 +++ shredder/jsonpointer_shredder.go | 55 +++ shredder/shredder.go | 17 + utils/bleve_dump/main.go | 31 ++ 55 files changed, 4170 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 analysis/analyzers/keyword_analyzer/keyword.go create mode 100644 analysis/analyzers/standard_analyzer/standard.go create mode 100644 analysis/char_filters/html_char_filter/html_char_filter.go create mode 100644 analysis/char_filters/html_char_filter/html_char_filter_test.go create mode 100644 analysis/char_filters/regexp_char_filter/regexp_char_filter.go create mode 100644 analysis/freq.go create mode 100644 analysis/token_filters/length_filter/length_filter.go create mode 100644 analysis/token_filters/length_filter/length_filter_test.go create mode 100644 analysis/token_filters/lower_case_filter/lower_case_filter.go create mode 100644 analysis/token_filters/lower_case_filter/lower_case_filter_test.go create mode 100644 analysis/token_filters/stemmer_filter/stemmer_filter.go create mode 100644 analysis/token_filters/stemmer_filter/stemmer_filter_test.go create mode 100644 analysis/token_filters/stop_words_filter/stop_words_filter.go create mode 100644 analysis/token_filters/stop_words_filter/stop_words_filter_test.go create mode 100644 analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go create mode 100644 analysis/tokenizers/simple_word_boundary/simple_word_boundary.go create mode 100644 analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go create mode 100644 analysis/tokenizers/single_token/single_token.go create mode 100644 analysis/tokenizers/single_token/single_token_test.go create mode 100644 analysis/tokenizers/unicode_word_boundary/boundary.go create mode 100644 
analysis/tokenizers/unicode_word_boundary/boundary_test.go create mode 100644 analysis/type.go create mode 100644 document/document.go create mode 100644 document/field.go create mode 100644 document/field_text.go create mode 100644 document/indexing_options.go create mode 100644 document/indexing_options_test.go create mode 100644 examples/bleve_index_json/main.go create mode 100644 examples/bleve_query/main.go create mode 100644 index/index.go create mode 100644 index/mock/mock.go create mode 100644 index/mock/mock_test.go create mode 100644 index/upside_down/reader.go create mode 100644 index/upside_down/reader_test.go create mode 100644 index/upside_down/row.go create mode 100644 index/upside_down/row_test.go create mode 100644 index/upside_down/upside_down.go create mode 100644 index/upside_down/upside_down_test.go create mode 100644 search/collector.go create mode 100644 search/collector_top_score.go create mode 100644 search/collector_top_score_test.go create mode 100644 search/explanation.go create mode 100644 search/query.go create mode 100644 search/query_term.go create mode 100644 search/scorer_term.go create mode 100644 search/search.go create mode 100644 search/search_term.go create mode 100644 search/search_test.go create mode 100644 search/sqrt_cache.go create mode 100644 shredder/json_shredder.go create mode 100644 shredder/jsonpointer_shredder.go create mode 100644 shredder/shredder.go create mode 100644 utils/bleve_dump/main.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..7dd33b7c --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +#* +*.sublime-* +*~ +.#* +.project +.settings +.DS_Store +/examples/bleve_index_json/bleve_index_json +/examples/bleve_query/bleve_query +/utils/bleve_dump/bleve_dump diff --git a/README.md b/README.md new file mode 100644 index 00000000..3c889c18 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +## bleve + +A modern text indexing library for go. 
diff --git a/analysis/analyzers/keyword_analyzer/keyword.go b/analysis/analyzers/keyword_analyzer/keyword.go new file mode 100644 index 00000000..aa52a3b0 --- /dev/null +++ b/analysis/analyzers/keyword_analyzer/keyword.go @@ -0,0 +1,24 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package keyword_analyzer + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/tokenizers/single_token" +) + +func NewKeywordAnalyzer() (*analysis.Analyzer, error) { + keyword := analysis.Analyzer{ + CharFilters: []analysis.CharFilter{}, + Tokenizer: single_token.NewSingleTokenTokenizer(), + Filters: []analysis.TokenFilter{}, + } + + return &keyword, nil +} diff --git a/analysis/analyzers/standard_analyzer/standard.go b/analysis/analyzers/standard_analyzer/standard.go new file mode 100644 index 00000000..95e809f9 --- /dev/null +++ b/analysis/analyzers/standard_analyzer/standard.go @@ -0,0 +1,39 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. 
See the License for the specific language governing permissions +// and limitations under the License. +package standard_analyzer + +import ( + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter" + "github.com/couchbaselabs/bleve/analysis/token_filters/stop_words_filter" + "github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary" +) + +func NewStandardAnalyzer() (*analysis.Analyzer, error) { + lower_case_filter, err := lower_case_filter.NewLowerCaseFilter() + if err != nil { + return nil, err + } + + stop_words_filter, err := stop_words_filter.NewStopWordsFilter() + if err != nil { + return nil, err + } + + standard := analysis.Analyzer{ + CharFilters: []analysis.CharFilter{}, + Tokenizer: unicode_word_boundary.NewUnicodeWordBoundaryTokenizer(), + TokenFilters: []analysis.TokenFilter{ + lower_case_filter, + stop_words_filter, + }, + } + + return &standard, nil +} diff --git a/analysis/char_filters/html_char_filter/html_char_filter.go b/analysis/char_filters/html_char_filter/html_char_filter.go new file mode 100644 index 00000000..a564c064 --- /dev/null +++ b/analysis/char_filters/html_char_filter/html_char_filter.go @@ -0,0 +1,32 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package html_char_filter + +import ( + "regexp" + + "github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter" +) + +// the origin of this regex is here: +// http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/ +// slightly modified by me to also match the DOCTYPE +const htmlTagPattern = `\s]+))?)+\s*|\s*)/?>` + +var htmlRegex = regexp.MustCompile(htmlTagPattern) + +type HtmlCharFilter struct { + *regexp_char_filter.RegexpCharFilter +} + +func NewHtmlCharFilter() *HtmlCharFilter { + return &HtmlCharFilter{ + regexp_char_filter.NewRegexpCharFilter(htmlRegex, []byte{' '}), + } +} diff --git a/analysis/char_filters/html_char_filter/html_char_filter_test.go b/analysis/char_filters/html_char_filter/html_char_filter_test.go new file mode 100644 index 00000000..eb01f8c1 --- /dev/null +++ b/analysis/char_filters/html_char_filter/html_char_filter_test.go @@ -0,0 +1,52 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package html_char_filter + +import ( + "reflect" + "testing" +) + +func TestHtmlCharFilter(t *testing.T) { + tests := []struct { + input []byte + output []byte + }{ + { + input: []byte(` + + + +

My First Heading

+ +

My first paragraph.

+ + +`), + output: []byte(` + + + + My First Heading + + My first paragraph. + + + `), + }, + } + + for _, test := range tests { + filter := NewHtmlCharFilter() + output := filter.Filter(test.input) + if !reflect.DeepEqual(output, test.output) { + t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input)) + } + } +} diff --git a/analysis/char_filters/regexp_char_filter/regexp_char_filter.go b/analysis/char_filters/regexp_char_filter/regexp_char_filter.go new file mode 100644 index 00000000..6f84a37c --- /dev/null +++ b/analysis/char_filters/regexp_char_filter/regexp_char_filter.go @@ -0,0 +1,30 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package regexp_char_filter + +import ( + "bytes" + "regexp" +) + +type RegexpCharFilter struct { + r *regexp.Regexp + replacement []byte +} + +func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter { + return &RegexpCharFilter{ + r: r, + replacement: replacement, + } +} + +func (s *RegexpCharFilter) Filter(input []byte) []byte { + return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) }) +} diff --git a/analysis/freq.go b/analysis/freq.go new file mode 100644 index 00000000..dbeea90a --- /dev/null +++ b/analysis/freq.go @@ -0,0 +1,55 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package analysis + +type TokenLocation struct { + Start int + End int + Position int +} + +type TokenFreq struct { + Term []byte + Locations []*TokenLocation +} + +func TokenFrequency(tokens TokenStream) []*TokenFreq { + index := make(map[string]*TokenFreq) + + for _, token := range tokens { + curr, ok := index[string(token.Term)] + if ok { + curr.Locations = append(curr.Locations, &TokenLocation{ + Start: token.Start, + End: token.End, + Position: token.Position, + }) + } else { + index[string(token.Term)] = &TokenFreq{ + Term: token.Term, + Locations: []*TokenLocation{ + &TokenLocation{ + Start: token.Start, + End: token.End, + Position: token.Position, + }, + }, + } + } + } + + rv := make([]*TokenFreq, len(index)) + i := 0 + for _, tf := range index { + rv[i] = tf + i += 1 + } + + return rv +} diff --git a/analysis/token_filters/length_filter/length_filter.go b/analysis/token_filters/length_filter/length_filter.go new file mode 100644 index 00000000..b2ccdef7 --- /dev/null +++ b/analysis/token_filters/length_filter/length_filter.go @@ -0,0 +1,44 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package length_filter + +import ( + "unicode/utf8" + + "github.com/couchbaselabs/bleve/analysis" +) + +type LengthFilter struct { + min int + max int +} + +func NewLengthFilter(min, max int) (*LengthFilter, error) { + return &LengthFilter{ + min: min, + max: max, + }, nil +} + +func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + rv := make(analysis.TokenStream, 0) + + for _, token := range input { + wordLen := utf8.RuneCount(token.Term) + if f.min > 0 && f.min > wordLen { + continue + } + if f.max > 0 && f.max < wordLen { + continue + } + rv = append(rv, token) + } + + return rv +} diff --git a/analysis/token_filters/length_filter/length_filter_test.go b/analysis/token_filters/length_filter/length_filter_test.go new file mode 100644 index 00000000..3f4d3923 --- /dev/null +++ b/analysis/token_filters/length_filter/length_filter_test.go @@ -0,0 +1,102 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package length_filter + +import ( + "testing" + + "github.com/couchbaselabs/bleve/analysis" +) + +func TestLengthFilter(t *testing.T) { + + inputTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("1"), + }, + &analysis.Token{ + Term: []byte("two"), + }, + &analysis.Token{ + Term: []byte("three"), + }, + } + + lengthFilter, err := NewLengthFilter(3, 4) + if err != nil { + t.Fatal(err) + } + ouputTokenStream := lengthFilter.Filter(inputTokenStream) + if len(ouputTokenStream) != 1 { + t.Fatalf("expected 1 output token") + } + if string(ouputTokenStream[0].Term) != "two" { + t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term) + } +} + +func TestLengthFilterNoMax(t *testing.T) { + + inputTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("1"), + }, + &analysis.Token{ + Term: []byte("two"), + }, + &analysis.Token{ + Term: []byte("three"), + }, + } + + lengthFilter, err := NewLengthFilter(3, -1) + if err != nil { + t.Fatal(err) + } + ouputTokenStream := lengthFilter.Filter(inputTokenStream) + if len(ouputTokenStream) != 2 { + t.Fatalf("expected 2 output token") + } + if string(ouputTokenStream[0].Term) != "two" { + t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term) + } + if string(ouputTokenStream[1].Term) != "three" { + t.Errorf("expected term `three`, got `%s`", ouputTokenStream[0].Term) + } +} + +func TestLengthFilterNoMin(t *testing.T) { + + inputTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("1"), + }, + &analysis.Token{ + Term: []byte("two"), + }, + &analysis.Token{ + Term: []byte("three"), + }, + } + + lengthFilter, err := NewLengthFilter(-1, 4) + if err != nil { + t.Fatal(err) + } + ouputTokenStream := lengthFilter.Filter(inputTokenStream) + if len(ouputTokenStream) != 2 { + t.Fatalf("expected 2 output token") + } + if string(ouputTokenStream[0].Term) != "1" { + t.Errorf("expected term `1`, got `%s`", ouputTokenStream[0].Term) + } + if string(ouputTokenStream[1].Term) 
!= "two" { + t.Errorf("expected term `two`, got `%s`", ouputTokenStream[0].Term) + } +} diff --git a/analysis/token_filters/lower_case_filter/lower_case_filter.go b/analysis/token_filters/lower_case_filter/lower_case_filter.go new file mode 100644 index 00000000..572a6d5a --- /dev/null +++ b/analysis/token_filters/lower_case_filter/lower_case_filter.go @@ -0,0 +1,35 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package lower_case_filter + +import ( + "strings" + + "github.com/couchbaselabs/bleve/analysis" +) + +type LowerCaseFilter struct { +} + +func NewLowerCaseFilter() (*LowerCaseFilter, error) { + return &LowerCaseFilter{}, nil +} + +func (f *LowerCaseFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + rv := make(analysis.TokenStream, 0) + + for _, token := range input { + word := string(token.Term) + wordLowerCase := strings.ToLower(word) + token.Term = []byte(wordLowerCase) + rv = append(rv, token) + } + + return rv +} diff --git a/analysis/token_filters/lower_case_filter/lower_case_filter_test.go b/analysis/token_filters/lower_case_filter/lower_case_filter_test.go new file mode 100644 index 00000000..3c4cbf12 --- /dev/null +++ b/analysis/token_filters/lower_case_filter/lower_case_filter_test.go @@ -0,0 +1,52 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package lower_case_filter + +import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" +) + +func TestLowerCaseFilter(t *testing.T) { + + inputTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("ONE"), + }, + &analysis.Token{ + Term: []byte("two"), + }, + &analysis.Token{ + Term: []byte("ThReE"), + }, + } + + expectedTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("one"), + }, + &analysis.Token{ + Term: []byte("two"), + }, + &analysis.Token{ + Term: []byte("three"), + }, + } + + filter, err := NewLowerCaseFilter() + if err != nil { + t.Fatal(err) + } + ouputTokenStream := filter.Filter(inputTokenStream) + if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) { + t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream) + } +} diff --git a/analysis/token_filters/stemmer_filter/stemmer_filter.go b/analysis/token_filters/stemmer_filter/stemmer_filter.go new file mode 100644 index 00000000..f9470b58 --- /dev/null +++ b/analysis/token_filters/stemmer_filter/stemmer_filter.go @@ -0,0 +1,46 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. 
See the License for the specific language governing permissions +// and limitations under the License. +package stemmer_filter + +import ( + "bitbucket.org/tebeka/snowball" + "github.com/couchbaselabs/bleve/analysis" +) + +type StemmerFilter struct { + lang string + stemmer *snowball.Stemmer +} + +func NewStemmerFilter(lang string) (*StemmerFilter, error) { + stemmer, err := snowball.New(lang) + if err != nil { + return nil, err + } + return &StemmerFilter{ + lang: lang, + stemmer: stemmer, + }, nil +} + +func (s *StemmerFilter) List() []string { + return snowball.LangList() +} + +func (s *StemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + rv := make(analysis.TokenStream, 0) + + for _, token := range input { + stemmed := s.stemmer.Stem(string(token.Term)) + token.Term = []byte(stemmed) + rv = append(rv, token) + } + + return rv +} diff --git a/analysis/token_filters/stemmer_filter/stemmer_filter_test.go b/analysis/token_filters/stemmer_filter/stemmer_filter_test.go new file mode 100644 index 00000000..645efa31 --- /dev/null +++ b/analysis/token_filters/stemmer_filter/stemmer_filter_test.go @@ -0,0 +1,52 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package stemmer_filter + +import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" +) + +func TestStemmerFilter(t *testing.T) { + + inputTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("walking"), + }, + &analysis.Token{ + Term: []byte("talked"), + }, + &analysis.Token{ + Term: []byte("business"), + }, + } + + expectedTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("walk"), + }, + &analysis.Token{ + Term: []byte("talk"), + }, + &analysis.Token{ + Term: []byte("busi"), + }, + } + + filter, err := NewStemmerFilter("english") + if err != nil { + t.Fatal(err) + } + ouputTokenStream := filter.Filter(inputTokenStream) + if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) { + t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream) + } +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_filter.go b/analysis/token_filters/stop_words_filter/stop_words_filter.go new file mode 100644 index 00000000..8d3d2873 --- /dev/null +++ b/analysis/token_filters/stop_words_filter/stop_words_filter.go @@ -0,0 +1,53 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package stop_words_filter + +import ( + "github.com/couchbaselabs/bleve/analysis" +) + +var DEFAULT_STOP_WORDS []string = []string{ + "a", "an", "and", "are", "as", "at", "be", "but", "by", + "for", "if", "in", "into", "is", "it", + "no", "not", "of", "on", "or", "such", + "that", "the", "their", "then", "there", "these", + "they", "this", "to", "was", "will", "with", +} + +type StopWordsFilter struct { + stopWords map[string]bool +} + +func NewStopWordsFilter() (*StopWordsFilter, error) { + return &StopWordsFilter{ + stopWords: buildStopWordMap(DEFAULT_STOP_WORDS), + }, nil +} + +func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + rv := make(analysis.TokenStream, 0) + + for _, token := range input { + word := string(token.Term) + _, isStopWord := f.stopWords[word] + if !isStopWord { + rv = append(rv, token) + } + } + + return rv +} + +func buildStopWordMap(words []string) map[string]bool { + rv := make(map[string]bool, len(words)) + for _, word := range words { + rv[word] = true + } + return rv +} diff --git a/analysis/token_filters/stop_words_filter/stop_words_filter_test.go b/analysis/token_filters/stop_words_filter/stop_words_filter_test.go new file mode 100644 index 00000000..fb120fa6 --- /dev/null +++ b/analysis/token_filters/stop_words_filter/stop_words_filter_test.go @@ -0,0 +1,55 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package stop_words_filter + +import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" +) + +func TestStopWordsFilter(t *testing.T) { + + inputTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("a"), + }, + &analysis.Token{ + Term: []byte("walk"), + }, + &analysis.Token{ + Term: []byte("in"), + }, + &analysis.Token{ + Term: []byte("the"), + }, + &analysis.Token{ + Term: []byte("park"), + }, + } + + expectedTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("walk"), + }, + &analysis.Token{ + Term: []byte("park"), + }, + } + + filter, err := NewStopWordsFilter() + if err != nil { + t.Fatal(err) + } + ouputTokenStream := filter.Filter(inputTokenStream) + if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) { + t.Errorf("expected %#v got %#v", expectedTokenStream, ouputTokenStream) + } +} diff --git a/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go new file mode 100644 index 00000000..b9286cf3 --- /dev/null +++ b/analysis/tokenizers/regexp_tokenizer/regexp_tokenizer.go @@ -0,0 +1,40 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package regexp_tokenizer + +import ( + "regexp" + + "github.com/couchbaselabs/bleve/analysis" +) + +type RegexpTokenizer struct { + r *regexp.Regexp +} + +func NewRegexpTokenizer(r *regexp.Regexp) *RegexpTokenizer { + return &RegexpTokenizer{ + r: r, + } +} + +func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream { + matches := rt.r.FindAllIndex(input, -1) + rv := make(analysis.TokenStream, len(matches)) + for i, match := range matches { + token := analysis.Token{ + Term: input[match[0]:match[1]], + Start: match[0], + End: match[1], + Position: i + 1, + } + rv[i] = &token + } + return rv +} diff --git a/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go b/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go new file mode 100644 index 00000000..dea53856 --- /dev/null +++ b/analysis/tokenizers/simple_word_boundary/simple_word_boundary.go @@ -0,0 +1,29 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package simple_word_boundary + +import ( + "regexp" + + "github.com/couchbaselabs/bleve/analysis/tokenizers/regexp_tokenizer" +) + +const wordPattern = `\w+` + +var wordRegex = regexp.MustCompile(wordPattern) + +type SimpleWordBoundaryTokenizer struct { + *regexp_tokenizer.RegexpTokenizer +} + +func NewSimpleWordBoundaryTokenizer() *SimpleWordBoundaryTokenizer { + return &SimpleWordBoundaryTokenizer{ + regexp_tokenizer.NewRegexpTokenizer(wordRegex), + } +} diff --git a/analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go b/analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go new file mode 100644 index 00000000..73f2af11 --- /dev/null +++ b/analysis/tokenizers/simple_word_boundary/simple_word_boundary_test.go @@ -0,0 +1,51 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package simple_word_boundary + +import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" +) + +func TestBoundary(t *testing.T) { + + tests := []struct { + input []byte + output analysis.TokenStream + }{ + { + []byte("Hello World."), + analysis.TokenStream{ + { + 0, + 5, + []byte("Hello"), + 1, + }, + { + 6, + 11, + []byte("World"), + 2, + }, + }, + }, + } + + for _, test := range tests { + tokenizer := NewSimpleWordBoundaryTokenizer() + actual := tokenizer.Tokenize(test.input) + + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) + } + } +} diff --git a/analysis/tokenizers/single_token/single_token.go b/analysis/tokenizers/single_token/single_token.go new file mode 100644 index 00000000..0f73bcce --- /dev/null +++ b/analysis/tokenizers/single_token/single_token.go @@ -0,0 +1,31 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package single_token
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+)
+
+type SingleTokenTokenizer struct { // tokenizer that emits the entire input as one token
+}
+
+func NewSingleTokenTokenizer() *SingleTokenTokenizer {
+	return &SingleTokenTokenizer{}
+}
+
+func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream { // whole input as a single token at Position 1
+	return analysis.TokenStream{
+		&analysis.Token{
+			Term:     input, // aliases input, not a copy
+			Position: 1,
+			Start:    0,
+			End:      len(input), // byte length, not rune count
+		},
+	}
+}
diff --git a/analysis/tokenizers/single_token/single_token_test.go b/analysis/tokenizers/single_token/single_token_test.go
new file mode 100644
index 00000000..8f29dc85
--- /dev/null
+++ b/analysis/tokenizers/single_token/single_token_test.go
@@ -0,0 +1,67 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License. 
+package single_token + +import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" +) + +func TestSingleTokenTokenizer(t *testing.T) { + + tests := []struct { + input []byte + output analysis.TokenStream + }{ + { + []byte("Hello World"), + analysis.TokenStream{ + { + 0, + 11, + []byte("Hello World"), + 1, + }, + }, + }, + { + []byte("こんにちは世界"), + analysis.TokenStream{ + { + 0, + 21, + []byte("こんにちは世界"), + 1, + }, + }, + }, + { + []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), + analysis.TokenStream{ + { + 0, + 72, + []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), + 1, + }, + }, + }, + } + + for _, test := range tests { + tokenizer := NewSingleTokenTokenizer() + actual := tokenizer.Tokenize(test.input) + + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) + } + } +} diff --git a/analysis/tokenizers/unicode_word_boundary/boundary.go b/analysis/tokenizers/unicode_word_boundary/boundary.go new file mode 100644 index 00000000..7dc12e03 --- /dev/null +++ b/analysis/tokenizers/unicode_word_boundary/boundary.go @@ -0,0 +1,114 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package unicode_word_boundary + +// #cgo pkg-config: icu-uc +// #include +// #include +// #include "unicode/utypes.h" +// #include "unicode/uchar.h" +// #include "unicode/ubrk.h" +// #include "unicode/ustring.h" +import "C" + +import "log" +import "unsafe" +import "github.com/couchbaselabs/bleve/analysis" + +type UnicodeWordBoundaryTokenizer struct { + locale *C.char +} + +func NewUnicodeWordBoundaryTokenizer() *UnicodeWordBoundaryTokenizer { + return &UnicodeWordBoundaryTokenizer{} +} + +func NewUnicodeWordBoundaryCustomLocaleTokenizer(locale string) *UnicodeWordBoundaryTokenizer { + return &UnicodeWordBoundaryTokenizer{ + locale: C.CString(locale), + } +} + +func (t *UnicodeWordBoundaryTokenizer) Tokenize(input []byte) analysis.TokenStream { + // var bi *C.UBreakIterator + rv := make(analysis.TokenStream, 0) + defer C.free(unsafe.Pointer(t.locale)) + + if len(input) < 1 { + return rv + } + + // works + var myUnsafePointer = unsafe.Pointer(&(input[0])) + var myCCharPointer *C.char = (*C.char)(myUnsafePointer) + + var inlen C.int32_t = C.int32_t(len(input)) + var buflen C.int32_t = C.int32_t(2*len(input) + 1) // worse case each byte becomes 2 + var stringToExamine []C.UChar = make([]C.UChar, buflen) + //log.Printf("new buff is: %v", stringToExamine) + var myUnsafePointerToExamine = unsafe.Pointer(&(stringToExamine[0])) + var myUCharPointer *C.UChar = (*C.UChar)(myUnsafePointerToExamine) + C.u_uastrncpy(myUCharPointer, myCCharPointer, inlen) + + //log.Printf("after copy new buff is: %v", stringToExamine) + + var err C.UErrorCode = C.U_ZERO_ERROR + bi := C.ubrk_open(C.UBRK_WORD, t.locale, myUCharPointer, -1, &err) + + if err > C.U_ZERO_ERROR { + log.Printf("error opening boundary iterator") + return rv + } + + defer C.ubrk_close(bi) + + position := 0 + var prev C.int32_t + p := C.ubrk_first(bi) + for p != C.UBRK_DONE { + + q := C.ubrk_getRuleStatus(bi) + + // convert boundaries back to utf8 positions + var nilCString *C.char + var indexA C.int32_t + + 
C.u_strToUTF8(nilCString, 0, &indexA, myUCharPointer, prev, &err) + if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR { + log.Printf("error converting boundary %d", err) + return rv + } else { + err = C.U_ZERO_ERROR + } + + var indexB C.int32_t + C.u_strToUTF8(nilCString, 0, &indexB, myUCharPointer, p, &err) + if err > C.U_ZERO_ERROR && err != C.U_BUFFER_OVERFLOW_ERROR { + log.Printf("error converting boundary %d", err) + return rv + } else { + err = C.U_ZERO_ERROR + } + + if q != 0 { + position += 1 + token := analysis.Token{ + Start: int(indexA), + End: int(indexB), + Term: input[indexA:indexB], + Position: position, + } + rv = append(rv, &token) + } + prev = p + p = C.ubrk_next(bi) + } + + return rv +} diff --git a/analysis/tokenizers/unicode_word_boundary/boundary_test.go b/analysis/tokenizers/unicode_word_boundary/boundary_test.go new file mode 100644 index 00000000..d8ffcecd --- /dev/null +++ b/analysis/tokenizers/unicode_word_boundary/boundary_test.go @@ -0,0 +1,125 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package unicode_word_boundary + +import ( + "reflect" + "testing" + + "github.com/couchbaselabs/bleve/analysis" +) + +func TestBoundary(t *testing.T) { + + tests := []struct { + input []byte + locale string + output analysis.TokenStream + }{ + { + []byte("Hello World"), + "en_US", + analysis.TokenStream{ + { + 0, + 5, + []byte("Hello"), + 1, + }, + { + 6, + 11, + []byte("World"), + 2, + }, + }, + }, + { + []byte("こんにちは世界"), + "en_US", + analysis.TokenStream{ + { + 0, + 15, + []byte("こんにちは"), + 1, + }, + { + 15, + 21, + []byte("世界"), + 2, + }, + }, + }, + { + []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"), + "th_TH", + analysis.TokenStream{ + { + 0, + 9, + []byte("แยก"), + 1, + }, + { + 9, + 15, + []byte("คำ"), + 2, + }, + { + 15, + 27, + []byte("ภาษา"), + 3, + }, + { + 27, + 36, + []byte("ไทย"), + 4, + }, + { + 36, + 42, + []byte("ก็"), + 5, + }, + { + 42, + 57, + []byte("ทำได้"), + 6, + }, + { + 57, + 63, + []byte("นะ"), + 7, + }, + { + 63, + 72, + []byte("จ้ะ"), + 8, + }, + }, + }, + } + + for _, test := range tests { + tokenizer := NewUnicodeWordBoundaryCustomLocaleTokenizer(test.locale) + actual := tokenizer.Tokenize(test.input) + + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input)) + } + } +} diff --git a/analysis/type.go b/analysis/type.go new file mode 100644 index 00000000..daa48cee --- /dev/null +++ b/analysis/type.go @@ -0,0 +1,59 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. 
See the License for the specific language governing permissions
+// and limitations under the License.
+package analysis
+
+import (
+	"fmt"
+)
+
+type CharFilter interface { // transforms raw input bytes before tokenization
+	Filter([]byte) []byte
+}
+
+type Token struct {
+	Start    int // byte offset of the token's first byte in the input
+	End      int // byte offset just past the token's last byte
+	Term     []byte // the token text; may alias the analyzed input
+	Position int // 1-based ordinal of this token in the stream
+}
+
+func (t *Token) String() string {
+	return fmt.Sprintf("Start: %d End: %d Position: %d Token: %s", t.Start, t.End, t.Position, string(t.Term))
+}
+
+type TokenStream []*Token
+
+type Tokenizer interface { // splits input bytes into a stream of tokens
+	Tokenize([]byte) TokenStream
+}
+
+type TokenFilter interface { // rewrites/removes tokens in a stream
+	Filter(TokenStream) TokenStream
+}
+
+type Analyzer struct { // pipeline: char filters -> tokenizer -> token filters
+	CharFilters  []CharFilter
+	Tokenizer    Tokenizer
+	TokenFilters []TokenFilter
+}
+
+func (a *Analyzer) Analyze(input []byte) TokenStream { // runs the pipeline stages in declaration order
+	if a.CharFilters != nil {
+		for _, cf := range a.CharFilters {
+			input = cf.Filter(input)
+		}
+	}
+	tokens := a.Tokenizer.Tokenize(input)
+	if a.TokenFilters != nil {
+		for _, tf := range a.TokenFilters {
+			tokens = tf.Filter(tokens)
+		}
+	}
+	return tokens
+}
diff --git a/document/document.go b/document/document.go
new file mode 100644
index 00000000..49199993
--- /dev/null
+++ b/document/document.go
@@ -0,0 +1,34 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License. 
+package document + +import ( + "encoding/json" +) + +type Document struct { + ID string `json:"id"` + Fields []*Field `json:"fields"` +} + +func NewDocument(id string) *Document { + return &Document{ + ID: id, + Fields: make([]*Field, 0), + } +} + +func (d *Document) AddField(f *Field) { + d.Fields = append(d.Fields, f) +} + +func (d *Document) String() string { + bytes, _ := json.MarshalIndent(d, "", " ") + return string(bytes) +} diff --git a/document/field.go b/document/field.go new file mode 100644 index 00000000..3f6dc1ef --- /dev/null +++ b/document/field.go @@ -0,0 +1,29 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package document + +import ( + "github.com/couchbaselabs/bleve/analysis" +) + +type Field struct { + Name string + IndexingOptions int + Analyzer *analysis.Analyzer + Value []byte +} + +func NewField(name string, value []byte, indexingOptions int, analyzer *analysis.Analyzer) *Field { + return &Field{ + Name: name, + IndexingOptions: indexingOptions, + Analyzer: analyzer, + Value: value, + } +} diff --git a/document/field_text.go b/document/field_text.go new file mode 100644 index 00000000..f9c3f56e --- /dev/null +++ b/document/field_text.go @@ -0,0 +1,41 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package document + +import ( + "log" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer" +) + +var standardAnalyzer *analysis.Analyzer + +func init() { + var err error + standardAnalyzer, err = standard_analyzer.NewStandardAnalyzer() + if err != nil { + log.Fatal(err) + } +} + +const DEFAULT_TEXT_INDEXING_OPTIONS = INDEX_FIELD + +func NewTextField(name string, value []byte) *Field { + return NewTextFieldWithIndexingOptions(name, value, DEFAULT_TEXT_INDEXING_OPTIONS) +} + +func NewTextFieldWithIndexingOptions(name string, value []byte, indexingOptions int) *Field { + return &Field{ + Name: name, + IndexingOptions: indexingOptions, + Analyzer: standardAnalyzer, + Value: value, + } +} diff --git a/document/indexing_options.go b/document/indexing_options.go new file mode 100644 index 00000000..fe7dbd60 --- /dev/null +++ b/document/indexing_options.go @@ -0,0 +1,27 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package document
+
+const (
+	INDEX_FIELD = 1 << iota // analyze the field's value and index its terms
+	STORE_FIELD // store the raw field value for later retrieval
+	INCLUDE_TERM_VECTORS // record per-term positions/offsets in the index
+)
+
+func IsIndexedField(arg int) bool { // reports whether the INDEX_FIELD flag is set in arg
+	return arg&INDEX_FIELD != 0
+}
+
+func IsStoredField(arg int) bool { // reports whether the STORE_FIELD flag is set in arg
+	return arg&STORE_FIELD != 0
+}
+
+func IncludeTermVectors(arg int) bool { // reports whether the INCLUDE_TERM_VECTORS flag is set in arg
+	return arg&INCLUDE_TERM_VECTORS != 0
+}
diff --git a/document/indexing_options_test.go b/document/indexing_options_test.go
new file mode 100644
index 00000000..ce8ce852
--- /dev/null
+++ b/document/indexing_options_test.go
@@ -0,0 +1,68 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License. 
+package document + +import ( + "testing" +) + +func TestIndexingOptions(t *testing.T) { + tests := []struct { + indexingOptions int + isIndexed bool + isStored bool + includeTermVectors bool + }{ + { + indexingOptions: INDEX_FIELD | STORE_FIELD | INCLUDE_TERM_VECTORS, + isIndexed: true, + isStored: true, + includeTermVectors: true, + }, + { + indexingOptions: INDEX_FIELD | INCLUDE_TERM_VECTORS, + isIndexed: true, + isStored: false, + includeTermVectors: true, + }, + { + indexingOptions: STORE_FIELD | INCLUDE_TERM_VECTORS, + isIndexed: false, + isStored: true, + includeTermVectors: true, + }, + { + indexingOptions: INDEX_FIELD, + isIndexed: true, + isStored: false, + includeTermVectors: false, + }, + { + indexingOptions: STORE_FIELD, + isIndexed: false, + isStored: true, + includeTermVectors: false, + }, + } + + for _, test := range tests { + actuallyIndexed := IsIndexedField(test.indexingOptions) + if actuallyIndexed != test.isIndexed { + t.Errorf("expected indexed to be %v, got %v for %d", test.isIndexed, actuallyIndexed, test.indexingOptions) + } + actuallyStored := IsStoredField(test.indexingOptions) + if actuallyStored != test.isStored { + t.Errorf("expected stored to be %v, got %v for %d", test.isStored, actuallyStored, test.indexingOptions) + } + actuallyIncludeTermVectors := IncludeTermVectors(test.indexingOptions) + if actuallyIncludeTermVectors != test.includeTermVectors { + t.Errorf("expected includeTermVectors to be %v, got %v for %d", test.includeTermVectors, actuallyIncludeTermVectors, test.indexingOptions) + } + } +} diff --git a/examples/bleve_index_json/main.go b/examples/bleve_index_json/main.go new file mode 100644 index 00000000..a76452ff --- /dev/null +++ b/examples/bleve_index_json/main.go @@ -0,0 +1,63 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package main + +import ( + "flag" + "io/ioutil" + "log" + + "github.com/couchbaselabs/bleve/index/upside_down" + "github.com/couchbaselabs/bleve/shredder" +) + +var jsonDir = flag.String("jsonDir", "json", "json directory") +var indexDir = flag.String("indexDir", "index", "index directory") + +func main() { + + flag.Parse() + + // create a automatic JSON document shredder + jsonShredder := shredder.NewAutoJsonShredder() + + // create a new index + index := upside_down.NewUpsideDownCouch(*indexDir) + err := index.Open() + if err != nil { + log.Fatal(err) + } + defer index.Close() + + // open the directory + dirEntries, err := ioutil.ReadDir(*jsonDir) + if err != nil { + log.Fatal(err) + } + + // walk the directory entries + for _, dirEntry := range dirEntries { + // read the bytes + jsonBytes, err := ioutil.ReadFile(*jsonDir + "/" + dirEntry.Name()) + if err != nil { + log.Fatal(err) + } + // shred them into a document + doc, err := jsonShredder.Shred(dirEntry.Name(), jsonBytes) + if err != nil { + log.Fatal(err) + } + //log.Printf("%+v", doc) + // update the index + err = index.Update(doc) + if err != nil { + log.Fatal(err) + } + } +} diff --git a/examples/bleve_query/main.go b/examples/bleve_query/main.go new file mode 100644 index 00000000..a8b37a65 --- /dev/null +++ b/examples/bleve_query/main.go @@ -0,0 +1,70 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package main + +import ( + "flag" + "fmt" + "log" + + "github.com/couchbaselabs/bleve/index/upside_down" + "github.com/couchbaselabs/bleve/search" +) + +var field = flag.String("field", "description", "field to query") +var indexDir = flag.String("indexDir", "index", "index directory") +var limit = flag.Int("limit", 10, "limit to first N results") + +func main() { + + flag.Parse() + + if flag.NArg() < 1 { + log.Fatal("Specify search term") + } + + // open index + index := upside_down.NewUpsideDownCouch(*indexDir) + err := index.Open() + if err != nil { + log.Fatal(err) + } + defer index.Close() + + tq := search.TermQuery{ + Term: flag.Arg(0), + Field: *field, + BoostVal: 1.0, + Explain: true, + } + collector := search.NewTopScorerCollector(*limit) + searcher, err := tq.Searcher(index) + if err != nil { + log.Fatalf("searcher error: %v", err) + return + } + err = collector.Collect(searcher) + if err != nil { + log.Fatalf("search error: %v", err) + return + } + results := collector.Results() + if len(results) == 0 { + fmt.Printf("No matches\n") + } else { + last := uint64(*limit) + if searcher.Count() < last { + last = searcher.Count() + } + fmt.Printf("%d matches, showing %d through %d\n", searcher.Count(), 1, last) + for i, result := range results { + fmt.Printf("%2d. %s (%f)\n", i+1, result.ID, result.Score) + } + } +} diff --git a/index/index.go b/index/index.go new file mode 100644 index 00000000..1a6a2c40 --- /dev/null +++ b/index/index.go @@ -0,0 +1,48 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package index + +import ( + "github.com/couchbaselabs/bleve/document" +) + +type Index interface { + Open() error + Close() + + Update(doc *document.Document) error + Delete(id string) error + + TermFieldReader(term []byte, field string) (TermFieldReader, error) + + DocCount() uint64 + + Dump() +} + +type TermFieldVector struct { + Field string + Pos uint64 + Start uint64 + End uint64 +} + +type TermFieldDoc struct { + ID string + Freq uint64 + Norm float64 + Vectors []*TermFieldVector +} + +type TermFieldReader interface { + Next() (*TermFieldDoc, error) + Advance(ID string) (*TermFieldDoc, error) + Count() uint64 + Close() +} diff --git a/index/mock/mock.go b/index/mock/mock.go new file mode 100644 index 00000000..01b5abab --- /dev/null +++ b/index/mock/mock.go @@ -0,0 +1,227 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package mock + +import ( + "fmt" + "math" + "sort" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/couchbaselabs/bleve/document" + "github.com/couchbaselabs/bleve/index" +) + +type mockFreq struct { + freq uint64 + norm float64 + vectors []*index.TermFieldVector +} + +// key doc id +type mockDocFreq map[string]*mockFreq + +//key field +type mockFieldDocFreq map[string]mockDocFreq + +// 2 dim array +// inner level are always pairs (field name, term) +type mockBackIndexEntry [][]string + +type MockIndex struct { + + //this level of the map, the key is the term + termIndex map[string]mockFieldDocFreq + + // key is docid + backIndex map[string]mockBackIndexEntry + + docCount uint64 + analyzer map[string]*analysis.Analyzer +} + +func NewMockIndexWithDocs(docs []*document.Document) *MockIndex { + rv := NewMockIndex() + for _, doc := range docs { + rv.Update(doc) + } + return rv +} + +func NewMockIndex() *MockIndex { + mi := MockIndex{ + termIndex: make(map[string]mockFieldDocFreq), + backIndex: make(map[string]mockBackIndexEntry), + analyzer: make(map[string]*analysis.Analyzer), + } + + return &mi +} + +func (index *MockIndex) Open() error { + return nil +} + +func (index *MockIndex) Close() {} + +// for this implementation we dont care about performance +// update is simply delete then add +func (index *MockIndex) Update(doc *document.Document) error { + index.Delete(doc.ID) + + backIndexEntry := make(mockBackIndexEntry, 0) + for _, field := range doc.Fields { + + analyzer := field.Analyzer + tokens := analyzer.Analyze(field.Value) + fieldLength := len(tokens) // number of tokens in this doc field + fieldNorm := 1.0 / math.Sqrt(float64(fieldLength)) + tokenFreqs := analysis.TokenFrequency(tokens) + for _, tf := range tokenFreqs { + mf := mockFreq{ + freq: uint64(len(tf.Locations)), + norm: fieldNorm, + } + if document.IncludeTermVectors(field.IndexingOptions) { + mf.vectors = index.mockVectorsFromTokenFreq(field.Name, tf) + } + termString := string(tf.Term) 
+ fieldMap, ok := index.termIndex[termString] + if !ok { + fieldMap = make(map[string]mockDocFreq) + index.termIndex[termString] = fieldMap + } + docMap, ok := fieldMap[field.Name] + if !ok { + docMap = make(map[string]*mockFreq) + fieldMap[field.Name] = docMap + } + docMap[doc.ID] = &mf + backIndexInnerEntry := []string{field.Name, termString} + backIndexEntry = append(backIndexEntry, backIndexInnerEntry) + } + } + index.backIndex[doc.ID] = backIndexEntry + index.docCount += 1 + return nil +} + +func (index *MockIndex) Delete(id string) error { + backIndexEntry, existed := index.backIndex[id] + if existed { + for _, backIndexPair := range backIndexEntry { + if len(backIndexPair) == 2 { + field := backIndexPair[0] + term := backIndexPair[1] + delete(index.termIndex[term][field], id) + if len(index.termIndex[term][field]) == 0 { + delete(index.termIndex[term], field) + if len(index.termIndex[term]) == 0 { + delete(index.termIndex, term) + } + } + } + } + delete(index.backIndex, id) + index.docCount -= 1 + } + + return nil +} + +func (index *MockIndex) TermFieldReader(term []byte, field string) (index.TermFieldReader, error) { + + fdf, ok := index.termIndex[string(term)] + if !ok { + fdf = make(mockFieldDocFreq) + } + docFreqs, ok := fdf[field] + if !ok { + docFreqs = make(mockDocFreq) + } + mtfr := mockTermFieldReader{ + index: docFreqs, + sortedDocIds: make(sort.StringSlice, len(docFreqs)), + curr: -1, + } + i := 0 + for k, _ := range docFreqs { + mtfr.sortedDocIds[i] = k + i += 1 + } + sort.Sort(mtfr.sortedDocIds) + + return &mtfr, nil +} + +func (index *MockIndex) DocCount() uint64 { + return index.docCount +} + +type mockTermFieldReader struct { + index mockDocFreq + sortedDocIds sort.StringSlice + curr int +} + +func (reader *mockTermFieldReader) Next() (*index.TermFieldDoc, error) { + next := reader.curr + 1 + if next < len(reader.sortedDocIds) { + nextTermKey := reader.sortedDocIds[next] + nextTerm := reader.index[nextTermKey] + reader.curr = next + return 
&index.TermFieldDoc{ID: nextTermKey, Freq: nextTerm.freq, Norm: nextTerm.norm, Vectors: nextTerm.vectors}, nil
+	}
+	return nil, nil
+}
+
+// Advance positions the reader at the first doc whose id is >= ID, scanning
+// forward from the current position, and returns that entry (nil when none).
+func (reader *mockTermFieldReader) Advance(ID string) (*index.TermFieldDoc, error) {
+	next := reader.curr
+	if next < 0 {
+		next = 0 // curr starts at -1; Advance may legally be called before Next
+	}
+	for next < len(reader.sortedDocIds) && reader.sortedDocIds[next] < ID {
+		next++
+	}
+	reader.curr = next
+	if next >= len(reader.sortedDocIds) {
+		return nil, nil
+	}
+	nextTermKey := reader.sortedDocIds[next]
+	nextTerm := reader.index[nextTermKey]
+	return &index.TermFieldDoc{ID: nextTermKey, Freq: nextTerm.freq, Norm: nextTerm.norm, Vectors: nextTerm.vectors}, nil
+}
+
+func (reader *mockTermFieldReader) Count() uint64 {
+	return uint64(len(reader.sortedDocIds))
+}
+
+func (reader *mockTermFieldReader) Close() {}
+
+func (mi *MockIndex) mockVectorsFromTokenFreq(field string, tf *analysis.TokenFreq) []*index.TermFieldVector {
+	rv := make([]*index.TermFieldVector, len(tf.Locations))
+
+	for i, l := range tf.Locations {
+		mv := index.TermFieldVector{
+			Field: field,
+			Pos:   uint64(l.Position),
+			Start: uint64(l.Start),
+			End:   uint64(l.End),
+		}
+		rv[i] = &mv
+	}
+
+	return rv
+}
+
+func (mi *MockIndex) Dump() {
+	fmt.Println("dump not implemented")
+}
diff --git a/index/mock/mock_test.go b/index/mock/mock_test.go
new file mode 100644
index 00000000..628760df
--- /dev/null
+++ b/index/mock/mock_test.go
@@ -0,0 +1,124 @@
+// Copyright (c) 2014 Couchbase, Inc.
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package mock + +import ( + "reflect" + "testing" + + _ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer" + "github.com/couchbaselabs/bleve/document" + "github.com/couchbaselabs/bleve/index" +) + +func TestCRUD(t *testing.T) { + i := NewMockIndex() + + // create doc, assert doc count goes up + doc1 := document.NewDocument("1") + doc1.AddField(document.NewTextField("name", []byte("marty"))) + i.Update(doc1) + count := i.DocCount() + if count != 1 { + t.Errorf("expected document count to be 1, was: %d", count) + } + + // add another doc, assert doc count goes up again + doc2 := document.NewDocument("2") + doc2.AddField(document.NewTextField("name", []byte("bob"))) + i.Update(doc2) + count = i.DocCount() + if count != 2 { + t.Errorf("expected document count to be 2, was: %d", count) + } + + // search for doc with term that should exist + expectedMatch := &index.TermFieldDoc{ + ID: "1", + Freq: 1, + Norm: 1, + } + tfr, err := i.TermFieldReader([]byte("marty"), "name") + if err != nil { + t.Errorf("unexpected error: %v", err) + } + match, err := tfr.Next() + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !reflect.DeepEqual(expectedMatch, match) { + t.Errorf("got %v, expected %v", match, expectedMatch) + } + nomatch, err := tfr.Next() + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if nomatch != nil { + t.Errorf("expected nil after last match") + } + + // update doc, assert doc count doesn't go up + doc1 = document.NewDocument("1") + doc1.AddField(document.NewTextField("name", []byte("salad"))) + 
doc1.AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("eat more rice"), document.INDEX_FIELD|document.INCLUDE_TERM_VECTORS)) + i.Update(doc1) + count = i.DocCount() + if count != 2 { + t.Errorf("expected document count to be 2, was: %d", count) + } + + // perform the original search again, should NOT find anything this time + tfr, err = i.TermFieldReader([]byte("marty"), "name") + if err != nil { + t.Errorf("unexpected error: %v", err) + } + nomatch, err = tfr.Next() + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if nomatch != nil { + t.Errorf("expected no matches, found one") + t.Logf("%v", i) + } + + // delete a doc, ensure the count is 1 + err = i.Delete("2") + if err != nil { + t.Errorf("unexpected error: %v", err) + } + count = i.DocCount() + if count != 1 { + t.Errorf("expected document count to be 1, was: %d", count) + } + + expectedMatch = &index.TermFieldDoc{ + ID: "1", + Freq: 1, + Norm: 0.5773502691896258, + Vectors: []*index.TermFieldVector{ + &index.TermFieldVector{ + Field: "desc", + Pos: 3, + Start: 9, + End: 13, + }, + }, + } + tfr, err = i.TermFieldReader([]byte("rice"), "desc") + if err != nil { + t.Errorf("unexpected error: %v", err) + } + match, err = tfr.Next() + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !reflect.DeepEqual(expectedMatch, match) { + t.Errorf("got %#v, expected %#v", match, expectedMatch) + } +} diff --git a/index/upside_down/reader.go b/index/upside_down/reader.go new file mode 100644 index 00000000..68af0627 --- /dev/null +++ b/index/upside_down/reader.go @@ -0,0 +1,101 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software distributed under the
+// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+// either express or implied. See the License for the specific language governing permissions
+// and limitations under the License.
+package upside_down
+
+import (
+	"bytes"
+
+	"github.com/jmhodges/levigo"
+
+	"github.com/couchbaselabs/bleve/index"
+)
+
+// UpsideDownCouchTermFieldReader iterates the term frequency rows for a single
+// (term, field) pair; the embedded iterator is positioned on the term's count
+// row and advanced lazily by Next/Advance.
+type UpsideDownCouchTermFieldReader struct {
+	index    *UpsideDownCouch
+	iterator *levigo.Iterator
+	count    uint64
+	term     []byte
+	field    uint16
+}
+
+func newUpsideDownCouchTermFieldReader(index *UpsideDownCouch, term []byte, field uint16) (*UpsideDownCouchTermFieldReader, error) {
+	ro := defaultReadOptions()
+	it := index.db.NewIterator(ro)
+
+	tfr := NewTermFrequencyRow(term, field, "", 0, 0)
+	it.Seek(tfr.Key())
+
+	var count uint64 = 0
+	if it.Valid() {
+		if bytes.Equal(it.Key(), tfr.Key()) {
+			tfr = ParseFromKeyValue(it.Key(), it.Value()).(*TermFrequencyRow)
+			count = tfr.freq
+		}
+	} else if err := it.GetError(); err != nil {
+		// don't leak the iterator on the error path
+		it.Close()
+		return nil, err
+	}
+	// NOTE: an invalid iterator with no error (seek past the end of the db)
+	// previously returned (nil, nil), handing callers a nil reader that they
+	// would immediately dereference; it now yields a usable reader with a
+	// count of 0.
+
+	return &UpsideDownCouchTermFieldReader{
+		index:    index,
+		iterator: it,
+		count:    count,
+		term:     term,
+		field:    field,
+	}, nil
+}
+
+// Count reports the term frequency recorded on the term's count row.
+func (r *UpsideDownCouchTermFieldReader) Count() uint64 {
+	return r.count
+}
+
+func (r *UpsideDownCouchTermFieldReader) Next() (*index.TermFieldDoc, error) {
+	r.iterator.Next()
+	if r.iterator.Valid() {
+		tfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0)
+		if !bytes.HasPrefix(r.iterator.Key(), tfr.Key()) {
+			// end of the line
+			return nil, nil
+		}
+		tfr = ParseFromKeyValue(r.iterator.Key(), r.iterator.Value()).(*TermFrequencyRow)
+		return &index.TermFieldDoc{
+			ID:      string(tfr.doc),
+			Freq:    tfr.freq,
+			Norm:    float64(tfr.norm),
+			Vectors: r.index.termFieldVectorsFromTermVectors(tfr.vectors),
+		}, nil
+	} else {
+		return nil, r.iterator.GetError()
+	}
+}
+
+func (r
*UpsideDownCouchTermFieldReader) Advance(docId string) (*index.TermFieldDoc, error) { + tfr := NewTermFrequencyRow(r.term, r.field, docId, 0, 0) + r.iterator.Seek(tfr.Key()) + if r.iterator.Valid() { + tfr := NewTermFrequencyRow(r.term, r.field, "", 0, 0) + if !bytes.HasPrefix(r.iterator.Key(), tfr.Key()) { + // end of the line + return nil, nil + } + tfr = ParseFromKeyValue(r.iterator.Key(), r.iterator.Value()).(*TermFrequencyRow) + return &index.TermFieldDoc{ + ID: string(tfr.doc), + Freq: tfr.freq, + Norm: float64(tfr.norm), + Vectors: r.index.termFieldVectorsFromTermVectors(tfr.vectors), + }, nil + } else { + return nil, r.iterator.GetError() + } +} + +func (r *UpsideDownCouchTermFieldReader) Close() { + r.iterator.Close() +} diff --git a/index/upside_down/reader_test.go b/index/upside_down/reader_test.go new file mode 100644 index 00000000..77ec5793 --- /dev/null +++ b/index/upside_down/reader_test.go @@ -0,0 +1,111 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package upside_down + +import ( + "os" + "reflect" + "testing" + + _ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer" + "github.com/couchbaselabs/bleve/document" + "github.com/couchbaselabs/bleve/index" +) + +func TestIndexReader(t *testing.T) { + defer os.RemoveAll("test") + + idx := NewUpsideDownCouch("test") + + err := idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer idx.Close() + + var expectedCount uint64 = 0 + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount += 1 + + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []byte("test test test"))) + doc.AddField(document.NewTextFieldWithIndexingOptions("desc", []byte("eat more rice"), document.INDEX_FIELD|document.INCLUDE_TERM_VECTORS)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount += 1 + + // first look for a term that doesnt exist + reader, err := idx.TermFieldReader([]byte("nope"), "name") + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + count := reader.Count() + if count != 0 { + t.Errorf("Expected doc count to be: %d got: %d", 0, count) + } + reader.Close() + + reader, err = idx.TermFieldReader([]byte("test"), "name") + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + defer reader.Close() + + expectedCount = 2 + count = reader.Count() + if count != expectedCount { + t.Errorf("Exptected doc count to be: %d got: %d", expectedCount, count) + } + + var match *index.TermFieldDoc + var actualCount uint64 + match, err = reader.Next() + for err == nil && match != nil { + match, err = reader.Next() + if err != nil { + t.Errorf("unexpected error reading next") + } + actualCount += 1 + } + if actualCount != count { + t.Errorf("count was 2, but only saw %d", 
actualCount) + } + + expectedMatch := &index.TermFieldDoc{ + ID: "2", + Freq: 1, + Norm: 0.5773502588272095, + Vectors: []*index.TermFieldVector{ + &index.TermFieldVector{ + Field: "desc", + Pos: 3, + Start: 9, + End: 13, + }, + }, + } + tfr, err := idx.TermFieldReader([]byte("rice"), "desc") + if err != nil { + t.Errorf("unexpected error: %v", err) + } + match, err = tfr.Next() + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !reflect.DeepEqual(expectedMatch, match) { + t.Errorf("got %#v, expected %#v", match, expectedMatch) + } +} diff --git a/index/upside_down/row.go b/index/upside_down/row.go new file mode 100644 index 00000000..0a8349f4 --- /dev/null +++ b/index/upside_down/row.go @@ -0,0 +1,412 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package upside_down + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" +) + +const BYTE_SEPARATOR byte = 0xff + +type UpsideDownCouchRowStream chan UpsideDownCouchRow + +type UpsideDownCouchRow interface { + Key() []byte + Value() []byte +} + +func ParseFromKeyValue(key, value []byte) UpsideDownCouchRow { + switch key[0] { + case 'v': + return NewVersionRowKV(key, value) + case 'f': + return NewFieldRowKV(key, value) + case 't': + return NewTermFrequencyRowKV(key, value) + case 'b': + return NewBackIndexRowKV(key, value) + } + return nil +} + +// VERSION + +type VersionRow struct { + version uint8 +} + +func (v *VersionRow) Key() []byte { + return []byte{'v'} +} + +func (v *VersionRow) Value() []byte { + buf := new(bytes.Buffer) + err := binary.Write(buf, binary.LittleEndian, v.version) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + return buf.Bytes() +} + +func (v *VersionRow) String() string { + return fmt.Sprintf("Version: %d", v.version) +} + +func NewVersionRow(version uint8) *VersionRow { + return &VersionRow{ + version: version, + } +} + +func NewVersionRowKV(key, value []byte) *VersionRow { + rv := VersionRow{} + buf := bytes.NewBuffer(value) + err := binary.Read(buf, binary.LittleEndian, &rv.version) + if err != nil { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + return &rv +} + +// FIELD definition + +type FieldRow struct { + index uint16 + name string +} + +func (f *FieldRow) Key() []byte { + buf := new(bytes.Buffer) + err := buf.WriteByte('f') + if err != nil { + panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err)) + } + err = binary.Write(buf, binary.LittleEndian, f.index) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + return buf.Bytes() +} + +func (f *FieldRow) Value() []byte { + buf := new(bytes.Buffer) + _, err := buf.WriteString(f.name) + if err != nil { + panic(fmt.Sprintf("Buffer.WriteString failed: %v", err)) + } + err = buf.WriteByte(BYTE_SEPARATOR) + if err != nil 
{ + panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err)) + } + return buf.Bytes() +} + +func (f *FieldRow) String() string { + return fmt.Sprintf("Field: %d Name: %s", f.index, f.name) +} + +func NewFieldRow(index uint16, name string) *FieldRow { + return &FieldRow{ + index: index, + name: name, + } +} + +func NewFieldRowKV(key, value []byte) *FieldRow { + rv := FieldRow{} + + buf := bytes.NewBuffer(key) + buf.ReadByte() // type + err := binary.Read(buf, binary.LittleEndian, &rv.index) + if err != nil { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + + buf = bytes.NewBuffer(value) + rv.name, err = buf.ReadString(BYTE_SEPARATOR) + if err != nil { + panic(fmt.Sprintf("Buffer.ReadString failed: %v", err)) + } + rv.name = rv.name[:len(rv.name)-1] // trim off separator byte + + return &rv +} + +// TERM FIELD FREQUENCY + +type TermVector struct { + field uint16 + pos uint64 + start uint64 + end uint64 +} + +func (tv *TermVector) String() string { + return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d", tv.field, tv.pos, tv.start, tv.end) +} + +type TermFrequencyRow struct { + term []byte + field uint16 + doc []byte + freq uint64 + norm float32 + vectors []*TermVector +} + +func (tfr *TermFrequencyRow) Key() []byte { + buf := new(bytes.Buffer) + err := buf.WriteByte('t') + if err != nil { + panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err)) + } + _, err = buf.Write(tfr.term) + if err != nil { + panic(fmt.Sprintf("Buffer.Write failed: %v", err)) + } + err = buf.WriteByte(BYTE_SEPARATOR) + if err != nil { + panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err)) + } + err = binary.Write(buf, binary.LittleEndian, tfr.field) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + _, err = buf.Write(tfr.doc) + if err != nil { + panic(fmt.Sprintf("Buffer.Write failed: %v", err)) + } + return buf.Bytes() +} + +func (tfr *TermFrequencyRow) Value() []byte { + buf := new(bytes.Buffer) + err := binary.Write(buf, binary.LittleEndian, tfr.freq) + 
if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + err = binary.Write(buf, binary.LittleEndian, tfr.norm) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + for _, vector := range tfr.vectors { + err = binary.Write(buf, binary.LittleEndian, vector.field) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + err = binary.Write(buf, binary.LittleEndian, vector.pos) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + err = binary.Write(buf, binary.LittleEndian, vector.start) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + err = binary.Write(buf, binary.LittleEndian, vector.end) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + } + return buf.Bytes() +} + +func (tfr *TermFrequencyRow) String() string { + return fmt.Sprintf("Term: `%s` Field: %d DocId: `%s` Frequency: %d Norm: %f Vectors: %v", string(tfr.term), tfr.field, string(tfr.doc), tfr.freq, tfr.norm, tfr.vectors) +} + +func NewTermFrequencyRow(term []byte, field uint16, doc string, freq uint64, norm float32) *TermFrequencyRow { + return &TermFrequencyRow{ + term: term, + field: field, + doc: []byte(doc), + freq: freq, + norm: norm, + } +} + +func NewTermFrequencyRowWithTermVectors(term []byte, field uint16, doc string, freq uint64, norm float32, vectors []*TermVector) *TermFrequencyRow { + return &TermFrequencyRow{ + term: term, + field: field, + doc: []byte(doc), + freq: freq, + norm: norm, + vectors: vectors, + } +} + +func NewTermFrequencyRowKV(key, value []byte) *TermFrequencyRow { + rv := TermFrequencyRow{ + doc: []byte(""), + } + buf := bytes.NewBuffer(key) + buf.ReadByte() // type + + var err error + rv.term, err = buf.ReadBytes(BYTE_SEPARATOR) + if err != nil { + panic(fmt.Sprintf("Buffer.ReadString failed: %v", err)) + } + rv.term = rv.term[:len(rv.term)-1] // trim off separator byte + + err = binary.Read(buf, binary.LittleEndian, &rv.field) + if err 
!= nil { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + + doc, err := buf.ReadBytes(BYTE_SEPARATOR) + if err != io.EOF { + panic(fmt.Sprintf("expected binary.ReadString to end in EOF: %v", err)) + } + if doc != nil { + rv.doc = doc + } + + buf = bytes.NewBuffer((value)) + err = binary.Read(buf, binary.LittleEndian, &rv.freq) + if err != nil { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + err = binary.Read(buf, binary.LittleEndian, &rv.norm) + if err != nil { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + + var field uint16 + err = binary.Read(buf, binary.LittleEndian, &field) + if err != nil && err != io.EOF { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + for err != io.EOF { + tv := TermVector{} + tv.field = field + // at this point we expect at least one term vector + if rv.vectors == nil { + rv.vectors = make([]*TermVector, 0) + } + + err = binary.Read(buf, binary.LittleEndian, &tv.pos) + if err != nil { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + err = binary.Read(buf, binary.LittleEndian, &tv.start) + if err != nil { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + err = binary.Read(buf, binary.LittleEndian, &tv.end) + if err != nil { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + rv.vectors = append(rv.vectors, &tv) + // try to read next record (may not exist) + err = binary.Read(buf, binary.LittleEndian, &field) + } + + return &rv + +} + +type BackIndexEntry struct { + term []byte + field uint16 +} + +func (bie *BackIndexEntry) String() string { + return fmt.Sprintf("Term: `%s` Field: %d", string(bie.term), bie.field) +} + +type BackIndexRow struct { + doc []byte + entries []*BackIndexEntry +} + +func (br *BackIndexRow) Key() []byte { + buf := new(bytes.Buffer) + err := buf.WriteByte('b') + if err != nil { + panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err)) + } + err = binary.Write(buf, binary.LittleEndian, br.doc) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: 
%v", err)) + } + return buf.Bytes() +} + +func (br *BackIndexRow) Value() []byte { + buf := new(bytes.Buffer) + for _, e := range br.entries { + _, err := buf.Write(e.term) + if err != nil { + panic(fmt.Sprintf("Buffer.Write failed: %v", err)) + } + err = buf.WriteByte(BYTE_SEPARATOR) + if err != nil { + panic(fmt.Sprintf("Buffer.WriteByte failed: %v", err)) + } + err = binary.Write(buf, binary.LittleEndian, e.field) + if err != nil { + panic(fmt.Sprintf("binary.Write failed: %v", err)) + } + } + return buf.Bytes() +} + +func (br *BackIndexRow) String() string { + return fmt.Sprintf("Backindex DocId: `%s` Entries: %v", string(br.doc), br.entries) +} + +func NewBackIndexRow(doc string, entries []*BackIndexEntry) *BackIndexRow { + return &BackIndexRow{ + doc: []byte(doc), + entries: entries, + } +} + +func NewBackIndexRowKV(key, value []byte) *BackIndexRow { + rv := BackIndexRow{} + + buf := bytes.NewBuffer(key) + buf.ReadByte() // type + + var err error + rv.doc, err = buf.ReadBytes(BYTE_SEPARATOR) + if err != io.EOF { + panic(fmt.Sprintf("expected binary.ReadString to end in EOF: %v", err)) + } + + buf = bytes.NewBuffer(value) + rv.entries = make([]*BackIndexEntry, 0) + + var term []byte + term, err = buf.ReadBytes(BYTE_SEPARATOR) + if err != nil && err != io.EOF { + panic(fmt.Sprintf("Buffer.ReadString failed: %v", err)) + } + for err != io.EOF { + ent := BackIndexEntry{} + ent.term = term[:len(term)-1] // trim off separator byte + + err = binary.Read(buf, binary.LittleEndian, &ent.field) + if err != nil { + panic(fmt.Sprintf("binary.Read failed: %v", err)) + } + rv.entries = append(rv.entries, &ent) + + term, err = buf.ReadBytes(BYTE_SEPARATOR) + if err != nil && err != io.EOF { + panic(fmt.Sprintf("Buffer.ReadString failed: %v", err)) + } + } + + return &rv +} diff --git a/index/upside_down/row_test.go b/index/upside_down/row_test.go new file mode 100644 index 00000000..94657eb4 --- /dev/null +++ b/index/upside_down/row_test.go @@ -0,0 +1,89 @@ +// Copyright (c) 
2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package upside_down + +import ( + "reflect" + "testing" +) + +func TestRows(t *testing.T) { + tests := []struct { + input UpsideDownCouchRow + outKey []byte + outVal []byte + }{ + { + NewVersionRow(1), + []byte{'v'}, + []byte{0x1}, + }, + { + NewFieldRow(0, "name"), + []byte{'f', 0, 0}, + []byte{'n', 'a', 'm', 'e', BYTE_SEPARATOR}, + }, + { + NewFieldRow(1, "desc"), + []byte{'f', 1, 0}, + []byte{'d', 'e', 's', 'c', BYTE_SEPARATOR}, + }, + { + NewFieldRow(513, "style"), + []byte{'f', 1, 2}, + []byte{'s', 't', 'y', 'l', 'e', BYTE_SEPARATOR}, + }, + { + NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "", 3, 3.14), + []byte{'t', 'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0}, + []byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64}, + }, + { + NewTermFrequencyRow([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14), + []byte{'t', 'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64}, + }, + { + NewTermFrequencyRowWithTermVectors([]byte{'b', 'e', 'e', 'r'}, 0, "budweiser", 3, 3.14, []*TermVector{&TermVector{field: 0, pos: 1, start: 3, end: 11}, &TermVector{field: 0, pos: 2, start: 23, end: 31}, &TermVector{field: 0, pos: 3, start: 43, end: 51}}), + []byte{'t', 'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0, 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{3, 0, 0, 0, 0, 0, 0, 0, 195, 245, 72, 64, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 
0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 51, 0, 0, 0, 0, 0, 0, 0}, + }, + { + NewBackIndexRow("budweiser", []*BackIndexEntry{&BackIndexEntry{[]byte{'b', 'e', 'e', 'r'}, 0}}), + []byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0}, + }, + { + NewBackIndexRow("budweiser", []*BackIndexEntry{&BackIndexEntry{[]byte{'b', 'e', 'e', 'r'}, 0}, &BackIndexEntry{[]byte{'b', 'e', 'a', 't'}, 1}}), + []byte{'b', 'b', 'u', 'd', 'w', 'e', 'i', 's', 'e', 'r'}, + []byte{'b', 'e', 'e', 'r', BYTE_SEPARATOR, 0, 0, 'b', 'e', 'a', 't', BYTE_SEPARATOR, 1, 0}, + }, + } + + // test going from struct to k/v bytes + for _, test := range tests { + rk := test.input.Key() + if !reflect.DeepEqual(rk, test.outKey) { + t.Errorf("Expected key to be %v got: %v", test.outKey, rk) + } + rv := test.input.Value() + if !reflect.DeepEqual(rv, test.outVal) { + t.Errorf("Expected value to be %v got: %v", test.outVal, rv) + } + } + + // now test going back from k/v bytes to struct + for _, test := range tests { + row := ParseFromKeyValue(test.outKey, test.outVal) + if !reflect.DeepEqual(row, test.input) { + t.Fatalf("Expected: %#v got: %#v", test.input, row) + } + } + +} diff --git a/index/upside_down/upside_down.go b/index/upside_down/upside_down.go new file mode 100644 index 00000000..24e1ebb8 --- /dev/null +++ b/index/upside_down/upside_down.go @@ -0,0 +1,466 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package upside_down + +import ( + "bytes" + "fmt" + "log" + "math" + + "github.com/couchbaselabs/bleve/analysis" + "github.com/jmhodges/levigo" + + "github.com/couchbaselabs/bleve/document" + "github.com/couchbaselabs/bleve/index" +) + +var VERSION_KEY []byte = []byte{'v'} + +const VERSION uint8 = 1 + +type UpsideDownCouch struct { + version uint8 + path string + opts *levigo.Options + db *levigo.DB + fieldIndexes map[string]uint16 + lastFieldIndex int + analyzer map[string]*analysis.Analyzer + docCount uint64 +} + +func NewUpsideDownCouch(path string) *UpsideDownCouch { + opts := levigo.NewOptions() + opts.SetCreateIfMissing(true) + + return &UpsideDownCouch{ + version: VERSION, + path: path, + opts: opts, + analyzer: make(map[string]*analysis.Analyzer), + fieldIndexes: make(map[string]uint16), + } +} + +func (udc *UpsideDownCouch) init() (err error) { + // prepare a list of rows + rows := make([]UpsideDownCouchRow, 0) + + // version marker + rows = append(rows, NewVersionRow(udc.version)) + + return udc.batchRows(nil, rows, nil) +} + +func (udc *UpsideDownCouch) loadSchema() (err error) { + // schema := make([]*index.Field, 0) + + ro := defaultReadOptions() + it := udc.db.NewIterator(ro) + defer it.Close() + + keyPrefix := []byte{'f'} + it.Seek(keyPrefix) + for it = it; it.Valid(); it.Next() { + // stop when + if !bytes.HasPrefix(it.Key(), keyPrefix) { + break + } + fieldRow := NewFieldRowKV(it.Key(), it.Value()) + udc.fieldIndexes[fieldRow.name] = fieldRow.index + if int(fieldRow.index) > udc.lastFieldIndex { + udc.lastFieldIndex = int(fieldRow.index) + } + } 
+	err = it.GetError()
+	if err != nil {
+		return
+	}
+
+	return
+}
+
+// batchRows applies adds, updates, and deletes atomically in a single leveldb
+// write batch, maintaining the per-term count row (doc "" key) alongside each
+// TermFrequencyRow add/delete.
+func (udc *UpsideDownCouch) batchRows(addRows []UpsideDownCouchRow, updateRows []UpsideDownCouchRow, deleteRows []UpsideDownCouchRow) (err error) {
+	ro := defaultReadOptions()
+
+	// prepare batch
+	wb := levigo.NewWriteBatch()
+	// release the batch's underlying C allocation; levigo write batches are
+	// not garbage collected, so omitting this leaks on every call
+	defer wb.Close()
+
+	// add
+	for _, row := range addRows {
+		tfr, ok := row.(*TermFrequencyRow)
+		if ok {
+			// need to increment counter
+			tr := NewTermFrequencyRow(tfr.term, tfr.field, "", 0, 0)
+			val, err := udc.db.Get(ro, tr.Key())
+			if err != nil {
+				return err
+			}
+			if val != nil {
+				tr = ParseFromKeyValue(tr.Key(), val).(*TermFrequencyRow)
+				tr.freq += 1 // incr
+			} else {
+				tr = NewTermFrequencyRow(tfr.term, tfr.field, "", 1, 0)
+			}
+
+			// now add this to the batch
+			wb.Put(tr.Key(), tr.Value())
+		}
+		wb.Put(row.Key(), row.Value())
+	}
+
+	// update
+	for _, row := range updateRows {
+		wb.Put(row.Key(), row.Value())
+	}
+
+	// delete
+	for _, row := range deleteRows {
+		tfr, ok := row.(*TermFrequencyRow)
+		if ok {
+			// need to decrement counter
+			tr := NewTermFrequencyRow(tfr.term, tfr.field, "", 0, 0)
+			val, err := udc.db.Get(ro, tr.Key())
+			if err != nil {
+				return err
+			}
+			if val != nil {
+				tr = ParseFromKeyValue(tr.Key(), val).(*TermFrequencyRow)
+				tr.freq -= 1 // decr
+			} else {
+				log.Panic(fmt.Sprintf("unexpected missing row, deleting term, expected count row to exist: %v", tr.Key()))
+			}
+
+			if tr.freq == 0 {
+				wb.Delete(tr.Key())
+			} else {
+				// now add this to the batch
+				wb.Put(tr.Key(), tr.Value())
+			}
+
+		}
+		wb.Delete(row.Key())
+	}
+
+	// write out the batch
+	wo := defaultWriteOptions()
+	err = udc.db.Write(wo, wb)
+	return
+}
+
+// DocCount reports the cached number of documents in the index.
+func (udc *UpsideDownCouch) DocCount() uint64 {
+	return udc.docCount
+}
+
+func (udc *UpsideDownCouch) Open() (err error) {
+	udc.db, err = levigo.Open(udc.path, udc.opts)
+	if err != nil {
+		return
+	}
+
+	ro := defaultReadOptions()
+	var value []byte
+	value, err = udc.db.Get(ro, VERSION_KEY)
+	if err != nil {
+		return
+	}
+
+	// init 
new index OR load schema + if value == nil { + err = udc.init() + if err != nil { + return + } + } else { + err = udc.loadSchema() + if err != nil { + return + } + } + // set doc count + udc.docCount = udc.countDocs() + return +} + +func (udc *UpsideDownCouch) countDocs() uint64 { + ro := defaultReadOptions() + ro.SetFillCache(false) // dont fill the cache with this + it := udc.db.NewIterator(ro) + defer it.Close() + + // begining of back index + it.Seek([]byte{'b'}) + + var rv uint64 = 0 + for it = it; it.Valid(); it.Next() { + if !bytes.HasPrefix(it.Key(), []byte{'b'}) { + break + } + rv += 1 + } + return rv +} + +func (udc *UpsideDownCouch) rowCount() uint64 { + ro := defaultReadOptions() + ro.SetFillCache(false) // dont fill the cache with this + it := udc.db.NewIterator(ro) + defer it.Close() + + it.Seek([]byte{0}) + + var rv uint64 = 0 + for it = it; it.Valid(); it.Next() { + rv += 1 + } + return rv +} + +func (udc *UpsideDownCouch) Close() { + udc.db.Close() +} + +func (udc *UpsideDownCouch) Update(doc *document.Document) error { + // first we lookup the backindex row for the doc id if it exists + // lookup the back index row + backIndexRow, err := udc.backIndexRowForDoc(doc.ID) + if err != nil { + return err + } + + var isAdd = true + // a map for each field, map key is term (string) bool true for existence + // FIMXE hard-coded to max of 256 fields + existingTermFieldMaps := make([]map[string]bool, 256) + if backIndexRow != nil { + isAdd = false + for _, entry := range backIndexRow.entries { + existingTermFieldMap := existingTermFieldMaps[entry.field] + if existingTermFieldMap == nil { + existingTermFieldMap = make(map[string]bool, 0) + existingTermFieldMaps[entry.field] = existingTermFieldMap + } + existingTermFieldMap[string(entry.term)] = true + } + } + + // prepare a list of rows + updateRows := make([]UpsideDownCouchRow, 0) + addRows := make([]UpsideDownCouchRow, 0) + + // track our back index entries + backIndexEntries := make([]*BackIndexEntry, 0) + 
+ for _, field := range doc.Fields { + fieldIndex, fieldExists := udc.fieldIndexes[field.Name] + if !fieldExists { + // assign next field id + fieldIndex = uint16(udc.lastFieldIndex + 1) + udc.fieldIndexes[field.Name] = fieldIndex + // ensure this batch adds a row for this field + row := NewFieldRow(uint16(fieldIndex), field.Name) + updateRows = append(updateRows, row) + udc.lastFieldIndex = int(fieldIndex) + } + + existingTermFieldMap := existingTermFieldMaps[fieldIndex] + + analyzer := field.Analyzer + tokens := analyzer.Analyze(field.Value) + fieldLength := len(tokens) // number of tokens in this doc field + fieldNorm := float32(1.0 / math.Sqrt(float64(fieldLength))) + tokenFreqs := analysis.TokenFrequency(tokens) + for _, tf := range tokenFreqs { + var termFreqRow *TermFrequencyRow + if document.IncludeTermVectors(field.IndexingOptions) { + tv := termVectorsFromTokenFreq(uint16(fieldIndex), tf) + termFreqRow = NewTermFrequencyRowWithTermVectors(tf.Term, uint16(fieldIndex), doc.ID, uint64(frequencyFromTokenFreq(tf)), fieldNorm, tv) + } else { + termFreqRow = NewTermFrequencyRow(tf.Term, uint16(fieldIndex), doc.ID, uint64(frequencyFromTokenFreq(tf)), fieldNorm) + } + + // record the back index entry + backIndexEntry := BackIndexEntry{tf.Term, uint16(fieldIndex)} + backIndexEntries = append(backIndexEntries, &backIndexEntry) + + // remove the entry from the map of existing term fields if it exists + if existingTermFieldMap != nil { + termString := string(tf.Term) + _, ok := existingTermFieldMap[termString] + if ok { + // this is an update + updateRows = append(updateRows, termFreqRow) + // this term existed last time, delete it from that map + delete(existingTermFieldMap, termString) + } else { + // this is an add + addRows = append(addRows, termFreqRow) + } + } else { + // this is an add + addRows = append(addRows, termFreqRow) + } + } + + } + + // build the back index row + backIndexRow = NewBackIndexRow(doc.ID, backIndexEntries) + updateRows = 
append(updateRows, backIndexRow) + + // any of the existing rows that weren't updated need to be deleted + deleteRows := make([]UpsideDownCouchRow, 0) + for fieldIndex, existingTermFieldMap := range existingTermFieldMaps { + if existingTermFieldMap != nil { + for termString, _ := range existingTermFieldMap { + termFreqRow := NewTermFrequencyRow([]byte(termString), uint16(fieldIndex), doc.ID, 0, 0) + deleteRows = append(deleteRows, termFreqRow) + } + } + } + + err = udc.batchRows(addRows, updateRows, deleteRows) + if err == nil && isAdd { + udc.docCount += 1 + } + return err +} + +func (udc *UpsideDownCouch) Delete(id string) error { + // lookup the back index row + backIndexRow, err := udc.backIndexRowForDoc(id) + if err != nil { + return err + } + if backIndexRow == nil { + return nil + } + + // prepare a list of rows to delete + rows := make([]UpsideDownCouchRow, 0) + for _, backIndexEntry := range backIndexRow.entries { + tfr := NewTermFrequencyRow(backIndexEntry.term, backIndexEntry.field, id, 0, 0) + rows = append(rows, tfr) + } + + // also delete the back entry itself + rows = append(rows, backIndexRow) + + err = udc.batchRows(nil, nil, rows) + if err == nil { + udc.docCount -= 1 + } + return err +} + +func (udc *UpsideDownCouch) backIndexRowForDoc(docId string) (*BackIndexRow, error) { + ro := defaultReadOptions() + + // use a temporary row structure to build key + tempRow := &BackIndexRow{ + doc: []byte(docId), + } + key := tempRow.Key() + value, err := udc.db.Get(ro, key) + if err != nil { + return nil, err + } + if value == nil { + return nil, nil + } + backIndexRow := ParseFromKeyValue(key, value).(*BackIndexRow) + return backIndexRow, nil +} + +func (udc *UpsideDownCouch) Dump() { + ro := defaultReadOptions() + ro.SetFillCache(false) + it := udc.db.NewIterator(ro) + defer it.Close() + it.SeekToFirst() + for it = it; it.Valid(); it.Next() { + //fmt.Printf("Key: `%v` Value: `%v`\n", string(it.Key()), string(it.Value())) + row := 
ParseFromKeyValue(it.Key(), it.Value()) + if row != nil { + fmt.Printf("%v\n", row) + fmt.Printf("Key: % -100x\nValue: % -100x\n\n", it.Key(), it.Value()) + } + } + err := it.GetError() + if err != nil { + fmt.Printf("Error reading iterator: %v", err) + } +} + +func (udc *UpsideDownCouch) TermFieldReader(term []byte, fieldName string) (index.TermFieldReader, error) { + fieldIndex, fieldExists := udc.fieldIndexes[fieldName] + if fieldExists { + return newUpsideDownCouchTermFieldReader(udc, term, uint16(fieldIndex)) + } + log.Printf("fields: %v", udc.fieldIndexes) + return nil, fmt.Errorf("No field named `%s` in the schema", fieldName) +} + +func defaultWriteOptions() *levigo.WriteOptions { + wo := levigo.NewWriteOptions() + // request fsync on write for safety + wo.SetSync(true) + return wo +} + +func defaultReadOptions() *levigo.ReadOptions { + ro := levigo.NewReadOptions() + return ro +} + +func frequencyFromTokenFreq(tf *analysis.TokenFreq) int { + return len(tf.Locations) +} + +func termVectorsFromTokenFreq(field uint16, tf *analysis.TokenFreq) []*TermVector { + rv := make([]*TermVector, len(tf.Locations)) + + for i, l := range tf.Locations { + tv := TermVector{ + field: field, + pos: uint64(l.Position), + start: uint64(l.Start), + end: uint64(l.End), + } + rv[i] = &tv + } + + return rv +} + +func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector { + rv := make([]*index.TermFieldVector, len(in)) + + for i, tv := range in { + fieldName := udc.fieldIndexToName(tv.field) + tfv := index.TermFieldVector{ + Field: fieldName, + Pos: tv.pos, + Start: tv.start, + End: tv.end, + } + rv[i] = &tfv + } + return rv +} + +func (udc *UpsideDownCouch) fieldIndexToName(i uint16) string { + for fieldName, fieldIndex := range udc.fieldIndexes { + if i == fieldIndex { + return fieldName + } + } + return "" +} diff --git a/index/upside_down/upside_down_test.go b/index/upside_down/upside_down_test.go new file mode 100644 index 
00000000..0c016636 --- /dev/null +++ b/index/upside_down/upside_down_test.go @@ -0,0 +1,221 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package upside_down + +import ( + "os" + "testing" + + _ "github.com/couchbaselabs/bleve/analysis/analyzers/standard_analyzer" + "github.com/couchbaselabs/bleve/document" +) + +func TestIndexOpenReopen(t *testing.T) { + defer os.RemoveAll("test") + + idx := NewUpsideDownCouch("test") + err := idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + + var expectedCount uint64 = 0 + docCount := idx.DocCount() + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + // opening database should have inserted version + expectedLength := uint64(1) + rowCount := idx.rowCount() + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } + + // now close it + idx.Close() + + idx = NewUpsideDownCouch("test") + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + + // now close it + idx.Close() +} + +func TestIndexInsert(t *testing.T) { + defer os.RemoveAll("test") + + idx := NewUpsideDownCouch("test") + + err := idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer idx.Close() + + var expectedCount uint64 = 0 + docCount := idx.DocCount() + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", 
expectedCount, docCount) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount += 1 + + docCount = idx.DocCount() + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + // should have 4 rows (1 for version, 1 for schema field, and 1 for single term, and 1 for the term count, and 1 for the back index entry) + expectedLength := uint64(1 + 1 + 1 + 1 + 1) + rowCount := idx.rowCount() + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } +} + +func TestIndexInsertThenDelete(t *testing.T) { + defer os.RemoveAll("test") + + idx := NewUpsideDownCouch("test") + + err := idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer idx.Close() + + var expectedCount uint64 = 0 + docCount := idx.DocCount() + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount += 1 + + docCount = idx.DocCount() + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + err = idx.Delete("1") + if err != nil { + t.Errorf("Error deleting entry from index: %v", err) + } + expectedCount -= 1 + + docCount = idx.DocCount() + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + // should have 2 row (1 for version, 1 for schema field) + expectedLength := uint64(1 + 1) + rowCount := idx.rowCount() + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } +} + +func 
TestIndexInsertThenUpdate(t *testing.T) { + defer os.RemoveAll("test") + + idx := NewUpsideDownCouch("test") + + err := idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer idx.Close() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + // this update should overwrite one term, and introduce one new one + doc = document.NewDocument("1") + doc.AddField(document.NewTextField("name", []byte("test fail"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error deleting entry from index: %v", err) + } + + // should have 2 row (1 for version, 1 for schema field, and 2 for the two term, and 2 for the term counts, and 1 for the back index entry) + expectedLength := uint64(1 + 1 + 2 + 2 + 1) + rowCount := idx.rowCount() + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } + + // now do another update that should remove one of term + doc = document.NewDocument("1") + doc.AddField(document.NewTextField("name", []byte("fail"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error deleting entry from index: %v", err) + } + + // should have 2 row (1 for version, 1 for schema field, and 1 for the remaining term, and 1 for the term count, and 1 for the back index entry) + expectedLength = uint64(1 + 1 + 1 + 1 + 1) + rowCount = idx.rowCount() + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } +} + +func TestIndexInsertMultiple(t *testing.T) { + defer os.RemoveAll("test") + + idx := NewUpsideDownCouch("test") + + err := idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer idx.Close() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } 
+ + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + // should have 4 rows (1 for version, 1 for schema field, and 2 for single term, and 1 for the term count, and 2 for the back index entries) + expectedLength := uint64(1 + 1 + 2 + 1 + 2) + rowCount := idx.rowCount() + if rowCount != expectedLength { + t.Errorf("expected %d rows, got: %d", expectedLength, rowCount) + } +} diff --git a/search/collector.go b/search/collector.go new file mode 100644 index 00000000..3e5dc6ce --- /dev/null +++ b/search/collector.go @@ -0,0 +1,21 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package search + +import ( + "time" +) + +type Collector interface { + Collect(searcher Searcher) error + Results() DocumentMatchCollection + Total() uint64 + MaxScore() float64 + Took() time.Duration +} diff --git a/search/collector_top_score.go b/search/collector_top_score.go new file mode 100644 index 00000000..01acfbea --- /dev/null +++ b/search/collector_top_score.go @@ -0,0 +1,96 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package search + +import ( + "container/list" + "time" +) + +type TopScoreCollector struct { + k int + results *list.List + took time.Duration + maxScore float64 + total uint64 +} + +func NewTopScorerCollector(k int) *TopScoreCollector { + return &TopScoreCollector{ + k: k, + results: list.New(), + } +} + +func (tksc *TopScoreCollector) Total() uint64 { + return tksc.total +} + +func (tksc *TopScoreCollector) MaxScore() float64 { + return tksc.maxScore +} + +func (tksc *TopScoreCollector) Took() time.Duration { + return tksc.took +} + +func (tksc *TopScoreCollector) Collect(searcher Searcher) error { + startTime := time.Now() + next, err := searcher.Next() + for err == nil && next != nil { + tksc.collectSingle(next) + next, err = searcher.Next() + } + // compute search duration + tksc.took = time.Since(startTime) + if err != nil { + return err + } + return nil +} + +func (tksc *TopScoreCollector) collectSingle(dm *DocumentMatch) { + // increment total hits + tksc.total += 1 + + // update max score + if dm.Score > tksc.maxScore { + tksc.maxScore = dm.Score + } + + for e := tksc.results.Front(); e != nil; e = e.Next() { + curr := e.Value.(*DocumentMatch) + if dm.Score < curr.Score { + + tksc.results.InsertBefore(dm, e) + // if we just made the list too long + if tksc.results.Len() > tksc.k { + // remove the head + tksc.results.Remove(tksc.results.Front()) + } + return + } + } + // if we got to the end, we still have to add it + tksc.results.PushBack(dm) + if tksc.results.Len() > tksc.k { + // remove the head + tksc.results.Remove(tksc.results.Front()) + } +} + 
+func (tksc *TopScoreCollector) Results() DocumentMatchCollection { + rv := make(DocumentMatchCollection, tksc.results.Len()) + i := 0 + for e := tksc.results.Back(); e != nil; e = e.Prev() { + rv[i] = e.Value.(*DocumentMatch) + i++ + } + return rv +} diff --git a/search/collector_top_score_test.go b/search/collector_top_score_test.go new file mode 100644 index 00000000..992ea792 --- /dev/null +++ b/search/collector_top_score_test.go @@ -0,0 +1,107 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package search + +import ( + "testing" +) + +func TestTop10Scores(t *testing.T) { + + // a stub search with more than 10 matches + // the top-10 scores are > 10 + // everything else is less than 10 + searcher := &stubSearcher{ + matches: DocumentMatchCollection{ + &DocumentMatch{ + ID: "a", + Score: 11, + }, + &DocumentMatch{ + ID: "b", + Score: 9, + }, + &DocumentMatch{ + ID: "c", + Score: 11, + }, + &DocumentMatch{ + ID: "d", + Score: 9, + }, + &DocumentMatch{ + ID: "e", + Score: 11, + }, + &DocumentMatch{ + ID: "f", + Score: 9, + }, + &DocumentMatch{ + ID: "g", + Score: 11, + }, + &DocumentMatch{ + ID: "h", + Score: 9, + }, + &DocumentMatch{ + ID: "i", + Score: 11, + }, + &DocumentMatch{ + ID: "j", + Score: 11, + }, + &DocumentMatch{ + ID: "k", + Score: 11, + }, + &DocumentMatch{ + ID: "l", + Score: 99, + }, + &DocumentMatch{ + ID: "m", + Score: 11, + }, + &DocumentMatch{ + ID: "n", + Score: 11, + }, + }, + } + + collector := NewTopScorerCollector(10) + collector.Collect(searcher) + results := collector.Results() + + if len(results) != 10 { + t.Fatalf("expected 10 results, got %d", len(results)) + } + + if results[0].ID != "l" { + t.Errorf("expected first result to have ID 'l', got %s", results[0].ID) + } + + if results[0].Score != 99.0 { + t.Errorf("expected highest score to be 99.0, got %f", results[0].Score) + } + + minScore := 1000.0 + for _, result := range results { + if result.Score < minScore { + minScore = result.Score + } + } + + if minScore < 10 { + t.Errorf("expected minimum score to be higher than 10, got %f", minScore) + } +} diff --git a/search/explanation.go b/search/explanation.go new file mode 100644 index 00000000..6676bb2f --- /dev/null +++ b/search/explanation.go @@ -0,0 +1,28 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package search + +import ( + "encoding/json" + "fmt" +) + +type Explanation struct { + Value float64 `json:"value"` + Message string `json:"message"` + Children []*Explanation `json:"children,omitempty"` +} + +func (expl *Explanation) String() string { + js, err := json.MarshalIndent(expl, "", " ") + if err != nil { + return fmt.Sprintf("error serializing explation to json: %v", err) + } + return string(js) +} diff --git a/search/query.go b/search/query.go new file mode 100644 index 00000000..837a9f22 --- /dev/null +++ b/search/query.go @@ -0,0 +1,19 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package search + +import ( + "github.com/couchbaselabs/bleve/index" +) + +type Query interface { + Boost() float64 + Searcher(index index.Index) (Searcher, error) + Validate() error +} diff --git a/search/query_term.go b/search/query_term.go new file mode 100644 index 00000000..3f3dc067 --- /dev/null +++ b/search/query_term.go @@ -0,0 +1,32 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package search + +import ( + "github.com/couchbaselabs/bleve/index" +) + +type TermQuery struct { + Term string `json:"term"` + Field string `json:"field,omitempty"` + BoostVal float64 `json:"boost,omitempty"` + Explain bool `json:"explain,omitempty"` +} + +func (q *TermQuery) Boost() float64 { + return q.BoostVal +} + +func (q *TermQuery) Searcher(index index.Index) (Searcher, error) { + return NewTermSearcher(index, q) +} + +func (q *TermQuery) Validate() error { + return nil +} diff --git a/search/scorer_term.go b/search/scorer_term.go new file mode 100644 index 00000000..590549b2 --- /dev/null +++ b/search/scorer_term.go @@ -0,0 +1,172 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package search + +import ( + "fmt" + "math" + + "github.com/couchbaselabs/bleve/index" +) + +const MAX_SCORE_CACHE = 64 + +type TermQueryScorer struct { + query *TermQuery + docTerm uint64 + docTotal uint64 + idf float64 + explain bool + idfExplanation *Explanation + scoreCache map[int]float64 + scoreExplanationCache map[int]*Explanation + queryNorm float64 + queryWeight float64 + queryWeightExplanation *Explanation +} + +func NewTermQueryScorer(query *TermQuery, docTotal, docTerm uint64, explain bool) *TermQueryScorer { + rv := TermQueryScorer{ + query: query, + docTerm: docTerm, + docTotal: docTotal, + idf: 1.0 + math.Log(float64(docTotal)/float64(docTerm+1.0)), + explain: explain, + scoreCache: make(map[int]float64, MAX_SCORE_CACHE), + scoreExplanationCache: make(map[int]*Explanation, MAX_SCORE_CACHE), + queryWeight: 1.0, + } + + if explain { + rv.idfExplanation = &Explanation{ + Value: rv.idf, + Message: fmt.Sprintf("idf(docFreq=%d, maxDocs=%d)", docTerm, docTotal), + } + } + + return &rv +} + +func (s *TermQueryScorer) Weight() float64 { + sum := s.query.Boost() * s.idf + return sum * sum +} + +func (s *TermQueryScorer) SetQueryNorm(qnorm float64) { + s.queryNorm = qnorm + + // update the query weight + s.queryWeight = s.query.Boost() * s.idf * s.queryNorm + + if s.explain { + childrenExplanations := make([]*Explanation, 3) + childrenExplanations[0] = &Explanation{ + Value: s.query.Boost(), + Message: "boost", + } + childrenExplanations[1] = s.idfExplanation + childrenExplanations[2] = &Explanation{ + Value: s.queryNorm, + Message: "queryNorm", + } + s.queryWeightExplanation = &Explanation{ + Value: s.queryWeight, + Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.query.Field, string(s.query.Term), s.query.Boost()), + Children: childrenExplanations, + } + } +} + +func (s *TermQueryScorer) Score(termMatch *index.TermFieldDoc) *DocumentMatch { + + var scoreExplanation *Explanation + // see if the score was cached + score, ok := 
s.scoreCache[int(termMatch.Freq)] + if !ok { + // need to compute score + var tf float64 + if termMatch.Freq < MAX_SQRT_CACHE { + tf = SQRT_CACHE[int(termMatch.Freq)] + } else { + tf = math.Sqrt(float64(termMatch.Freq)) + } + + score = tf * termMatch.Norm * s.idf + + if s.explain { + childrenExplanations := make([]*Explanation, 3) + childrenExplanations[0] = &Explanation{ + Value: tf, + Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.query.Field, string(s.query.Term), termMatch.Freq), + } + childrenExplanations[1] = &Explanation{ + Value: termMatch.Norm, + Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.query.Field, termMatch.ID), + } + childrenExplanations[2] = s.idfExplanation + scoreExplanation = &Explanation{ + Value: score, + Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.query.Field, string(s.query.Term), termMatch.ID), + Children: childrenExplanations, + } + } + + // if the query weight isn't 1, multiply + if s.queryWeight != 1.0 { + score = score * s.queryWeight + if s.explain { + childExplanations := make([]*Explanation, 2) + childExplanations[0] = s.queryWeightExplanation + childExplanations[1] = scoreExplanation + scoreExplanation = &Explanation{ + Value: score, + Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.query.Field, string(s.query.Term), s.query.Boost(), termMatch.ID), + Children: childExplanations, + } + } + } + + if termMatch.Freq < MAX_SCORE_CACHE { + s.scoreCache[int(termMatch.Freq)] = score + if s.explain { + s.scoreExplanationCache[int(termMatch.Freq)] = scoreExplanation + } + } + } + + if ok && s.explain { + scoreExplanation = s.scoreExplanationCache[int(termMatch.Freq)] + } + + rv := DocumentMatch{ + ID: termMatch.ID, + Score: score, + } + if s.explain { + rv.Expl = scoreExplanation + } + + if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 { + locations := make(Locations, len(termMatch.Vectors)) + for i, v := range termMatch.Vectors { + loc := Location{ + Pos: float64(v.Pos), + Start: 
float64(v.Start), + End: float64(v.End), + } + locations[i] = &loc + } + tlm := make(TermLocationMap) + tlm[s.query.Term] = locations + rv.Locations = make(FieldTermLocationMap) + rv.Locations[s.query.Field] = tlm + } + + return &rv +} diff --git a/search/search.go b/search/search.go new file mode 100644 index 00000000..574c3e85 --- /dev/null +++ b/search/search.go @@ -0,0 +1,39 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package search + +type Location struct { + Pos float64 `json:"pos"` + Start float64 `json:"start"` + End float64 `json:"end"` +} + +type Locations []*Location + +type TermLocationMap map[string]Locations + +type FieldTermLocationMap map[string]TermLocationMap + +type DocumentMatch struct { + ID string `json:"id"` + Score float64 `json:"score"` + Expl *Explanation `json:"explanation,omitempty"` + Locations FieldTermLocationMap `json:"locations,omitempty"` +} + +type DocumentMatchCollection []*DocumentMatch + +type Searcher interface { + Next() (*DocumentMatch, error) + Advance(ID string) (*DocumentMatch, error) + Close() + Weight() float64 + SetQueryNorm(float64) + Count() uint64 +} diff --git a/search/search_term.go b/search/search_term.go new file mode 100644 index 00000000..119068f6 --- /dev/null +++ b/search/search_term.go @@ -0,0 +1,84 @@ +// Copyright (c) 2014 Couchbase, Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package search + +import ( + "github.com/couchbaselabs/bleve/index" +) + +type TermSearcher struct { + index index.Index + query *TermQuery + reader index.TermFieldReader + scorer *TermQueryScorer +} + +func NewTermSearcher(index index.Index, query *TermQuery) (*TermSearcher, error) { + reader, err := index.TermFieldReader([]byte(query.Term), query.Field) + if err != nil { + return nil, err + } + scorer := NewTermQueryScorer(query, index.DocCount(), reader.Count(), query.Explain) + return &TermSearcher{ + index: index, + query: query, + reader: reader, + scorer: scorer, + }, nil +} + +func (s *TermSearcher) Count() uint64 { + return s.reader.Count() +} + +func (s *TermSearcher) Weight() float64 { + return s.scorer.Weight() +} + +func (s *TermSearcher) SetQueryNorm(qnorm float64) { + s.scorer.SetQueryNorm(qnorm) +} + +func (s *TermSearcher) Next() (*DocumentMatch, error) { + termMatch, err := s.reader.Next() + if err != nil { + return nil, err + } + + if termMatch == nil { + return nil, nil + } + + // score match + docMatch := s.scorer.Score(termMatch) + // return doc match + return docMatch, nil + +} + +func (s *TermSearcher) Advance(ID string) (*DocumentMatch, error) { + termMatch, err := s.reader.Advance(ID) + if err != nil { + return nil, err + } + + if termMatch == nil { + return nil, nil + } + + // score match + docMatch := s.scorer.Score(termMatch) + + // return doc match + return docMatch, nil +} + +func (s *TermSearcher) 
Close() { + s.reader.Close() +} diff --git a/search/search_test.go b/search/search_test.go new file mode 100644 index 00000000..e59cb5c0 --- /dev/null +++ b/search/search_test.go @@ -0,0 +1,50 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package search + +type stubSearcher struct { + index int + matches DocumentMatchCollection +} + +func (ss *stubSearcher) Next() (*DocumentMatch, error) { + if ss.index < len(ss.matches) { + rv := ss.matches[ss.index] + ss.index++ + return rv, nil + } + return nil, nil +} + +func (ss *stubSearcher) Advance(ID string) (*DocumentMatch, error) { + + for ss.index < len(ss.matches) && ss.matches[ss.index].ID < ID { + ss.index++ + } + if ss.index < len(ss.matches) { + rv := ss.matches[ss.index] + ss.index++ + return rv, nil + } + return nil, nil +} + +func (ss *stubSearcher) Close() { +} + +func (ss *stubSearcher) Weight() float64 { + return 0.0 +} + +func (ss *stubSearcher) SetQueryNorm(float64) { +} + +func (ss *stubSearcher) Count() uint64 { + return uint64(len(ss.matches)) +} diff --git a/search/sqrt_cache.go b/search/sqrt_cache.go new file mode 100644 index 00000000..1e4e57ab --- /dev/null +++ b/search/sqrt_cache.go @@ -0,0 +1,24 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. 
// SQRT_CACHE memoizes the square roots of the small term frequencies so the
// hot scoring path can avoid repeated math.Sqrt calls.
var SQRT_CACHE map[int]float64

// MAX_SQRT_CACHE is the number of precomputed entries (frequencies 0..63).
const MAX_SQRT_CACHE = 64

// init populates the cache once at package load.
func init() {
	SQRT_CACHE = make(map[int]float64, MAX_SQRT_CACHE)
	for i := MAX_SQRT_CACHE - 1; i >= 0; i-- {
		SQRT_CACHE[i] = math.Sqrt(float64(i))
	}
}
+// Any strings found in the JSON are added as text fields + +type AutoJsonShredder struct { +} + +func NewAutoJsonShredder() *AutoJsonShredder { + return &AutoJsonShredder{} +} + +func (s *AutoJsonShredder) Shred(id string, body []byte) (*document.Document, error) { + rv := document.NewDocument(id) + + var section interface{} + err := json.Unmarshal(body, §ion) + if err != nil { + return nil, err + } + + shredSection(rv, section, "") + + return rv, nil +} + +func shredSection(doc *document.Document, section interface{}, parent string) { + nextParent := parent + if nextParent != "" { + nextParent = nextParent + "." + } + switch section := section.(type) { + + case string: + f := document.NewTextField(parent, []byte(section)) + doc.AddField(f) + + case []interface{}: + for i, sub := range section { + shredSection(doc, sub, nextParent+strconv.Itoa(i)) + } + + case map[string]interface{}: + for k, sub := range section { + shredSection(doc, sub, nextParent+k) + } + } +} diff --git a/shredder/jsonpointer_shredder.go b/shredder/jsonpointer_shredder.go new file mode 100644 index 00000000..687c43b9 --- /dev/null +++ b/shredder/jsonpointer_shredder.go @@ -0,0 +1,55 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package shredder + +import ( + "github.com/couchbaselabs/bleve/document" + "github.com/dustin/go-jsonpointer" +) + +// A simple automatic JSON shredder which parses the whole document body. 
+// Any strings found in the JSON are added as text fields + +type JsonPointerShredder struct { + fieldPaths map[string]string + paths []string +} + +func NewJsonPointerShredder() *JsonPointerShredder { + return &JsonPointerShredder{ + fieldPaths: make(map[string]string), + paths: make([]string, 0), + } +} + +func (s *JsonPointerShredder) AddTextField(name string, path string) { + s.fieldPaths[name] = path + s.paths = append(s.paths, path) +} + +func (s *JsonPointerShredder) AddField(name string, path string) { + s.fieldPaths[name] = path + s.paths = append(s.paths, path) +} + +func (s *JsonPointerShredder) Shred(id string, body []byte) (*document.Document, error) { + rv := document.NewDocument(id) + + values, err := jsonpointer.FindMany(body, s.paths) + if err != nil { + return nil, err + } + + for fieldName, fieldPath := range s.fieldPaths { + field := document.NewTextField(fieldName, values[fieldPath]) + rv.AddField(field) + } + + return rv, nil +} diff --git a/shredder/shredder.go b/shredder/shredder.go new file mode 100644 index 00000000..49320f26 --- /dev/null +++ b/shredder/shredder.go @@ -0,0 +1,17 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. 
+package shredder + +import ( + "github.com/couchbaselabs/bleve/document" +) + +type Shredder interface { + Shred(id string, body []byte) (document.Document, error) +} diff --git a/utils/bleve_dump/main.go b/utils/bleve_dump/main.go new file mode 100644 index 00000000..fa422f26 --- /dev/null +++ b/utils/bleve_dump/main.go @@ -0,0 +1,31 @@ +// Copyright (c) 2014 Couchbase, Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file +// except in compliance with the License. You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software distributed under the +// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +package main + +import ( + "flag" + "log" + + "github.com/couchbaselabs/bleve/index/upside_down" +) + +var indexDir = flag.String("indexDir", "index", "index directory") + +func main() { + flag.Parse() + + index := upside_down.NewUpsideDownCouch(*indexDir) + err := index.Open() + if err != nil { + log.Fatal(err) + } + defer index.Close() + + index.Dump() +}