new tokenizer which uses cld2 to guess the field's language
parent: aade85a6e8
commit: b629636424
@@ -5,6 +5,7 @@
.project
.settings
.DS_Store
/analysis/tokenizers/cld2/cld2-read-only
/examples/bleve_index_json/bleve_index_json
/examples/bleve_index_json/index/
/examples/bleve_query/bleve_query
@@ -0,0 +1,28 @@
# cld2 tokenizer

A bleve tokenizer that passes the input text to the cld2 library, which determines the language the text is most likely written in. The ISO-639 code for that language is returned as the single token resulting from the analysis.
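For example, once the cld2 shared libraries are built and installed (see Building below), the tokenizer can be used on its own. A minimal sketch of a standalone program (not part of this commit), assuming the import path shown by the test run below:

    package main

    import (
        "fmt"

        "github.com/couchbaselabs/bleve/analysis/tokenizers/cld2"
    )

    func main() {
        // The tokenizer emits exactly one token: the ISO-639 code of the
        // language cld2 considers most likely for the input text.
        tokenizer := cld2.NewCld2Tokenizer()
        tokens := tokenizer.Tokenize([]byte("the quick brown fox"))
        for _, token := range tokens {
            fmt.Printf("detected language: %s\n", token.Term)
        }
        // per the unit tests, this prints: detected language: en
    }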
# Building

1. Acquire the cld2 source in this directory.

    $ svn checkout http://cld2.googlecode.com/svn/trunk/ cld2-read-only

2. Build cld2.

    $ cd cld2-read-only/internal/
    $ ./compile_libs.sh

3. Put the resulting shared libraries somewhere your dynamic linker can find them.

    $ cp *.so /usr/local/lib

4. Run the unit tests.

    $ cd ../..
    $ go test -v
    === RUN TestCld2Tokenizer
    --- PASS: TestCld2Tokenizer (0.03 seconds)
    PASS
    ok      github.com/couchbaselabs/bleve/analysis/tokenizers/cld2    0.067s
@@ -0,0 +1,32 @@
#include "cld2_tokenizer.h"
#include "cld2-read-only/public/compact_lang_det.h"
#include <string.h>

// DetectLang runs cld2 language detection over the NUL-terminated buffer
// and returns the ISO-639 code of the language it considers most likely.
const char* DetectLang(const char *buffer) {

    bool is_plain_text = true;
    CLD2::CLDHints cldhints = {NULL, NULL, 0, CLD2::UNKNOWN_LANGUAGE};
    int flags = 0;
    CLD2::Language language3[3];
    int percent3[3];
    double normalized_score3[3];
    CLD2::ResultChunkVector resultchunkvector;
    int text_bytes;
    bool is_reliable;

    CLD2::Language summary_lang = CLD2::UNKNOWN_LANGUAGE;

    summary_lang = CLD2::ExtDetectLanguageSummary(buffer,
                                                  strlen(buffer),
                                                  is_plain_text,
                                                  &cldhints,
                                                  flags,
                                                  language3,
                                                  percent3,
                                                  normalized_score3,
                                                  &resultchunkvector,
                                                  &text_bytes,
                                                  &is_reliable);

    return CLD2::LanguageCode(summary_lang);
}
@@ -0,0 +1,41 @@
package cld2

// #cgo LDFLAGS: -Lcld2-read-only/internal/ -lcld2_full
// #include "cld2_tokenizer.h"
// #include <string.h>
// #include <stdlib.h>
import "C"

import (
    "unsafe"

    "github.com/couchbaselabs/bleve/analysis"
)

type Cld2Tokenizer struct {
}

func NewCld2Tokenizer() *Cld2Tokenizer {
    return &Cld2Tokenizer{}
}

// Tokenize returns a stream containing a single token whose term is the
// ISO-639 code of the language cld2 detects for the input.
func (rt *Cld2Tokenizer) Tokenize(input []byte) analysis.TokenStream {
    rv := make(analysis.TokenStream, 0)
    lang, err := rt.detectLanguage(input)
    if err != nil {
        return rv
    }
    token := analysis.Token{
        Term:     lang,
        Start:    0,
        End:      len(lang),
        Position: 1,
    }
    rv = append(rv, &token)
    return rv
}

func (rt *Cld2Tokenizer) detectLanguage(input []byte) ([]byte, error) {
    cstr := C.CString(string(input))
    // C.CString copies the input into C-allocated memory; free it when done
    defer C.free(unsafe.Pointer(cstr))
    res := C.DetectLang(cstr)
    return C.GoBytes(unsafe.Pointer(res), C.int(C.strlen(res))), nil
}
@@ -0,0 +1,10 @@
#ifdef __cplusplus
extern "C" {
#endif

const char* DetectLang(const char *buffer);

#ifdef __cplusplus
} /* extern "C" */
#endif
@@ -0,0 +1,76 @@
package cld2

import (
    "reflect"
    "testing"

    "github.com/couchbaselabs/bleve/analysis"
)

func TestCld2Tokenizer(t *testing.T) {

    tests := []struct {
        input  []byte
        output analysis.TokenStream
    }{
        {
            input: []byte("the quick brown fox"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term:     []byte("en"),
                    Start:    0,
                    End:      2,
                    Position: 1,
                },
            },
        },
        {
            input: []byte("こんにちは世界"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term:     []byte("ja"),
                    Start:    0,
                    End:      2,
                    Position: 1,
                },
            },
        },
        {
            input: []byte("แยกคำภาษาไทยก็ทำได้นะจ้ะ"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term:     []byte("th"),
                    Start:    0,
                    End:      2,
                    Position: 1,
                },
            },
        },
        {
            input: []byte("مرحبا، العالم!"),
            output: analysis.TokenStream{
                &analysis.Token{
                    Term:     []byte("ar"),
                    Start:    0,
                    End:      2,
                    Position: 1,
                },
            },
        },
    }

    tokenizer := NewCld2Tokenizer()
    for _, test := range tests {
        res := tokenizer.Tokenize(test.input)
        if !reflect.DeepEqual(res, test.output) {
            t.Errorf("expected:")
            for _, token := range test.output {
                t.Errorf("%#v - %s", token, token.Term)
            }
            t.Errorf("got:")
            for _, token := range res {
                t.Errorf("%#v - %s", token, token.Term)
            }
        }
    }
}