package blevex
|
|
import (
|
"errors"
|
"github.com/huichen/sego"
|
|
"github.com/blevesearch/bleve/v2/analysis"
|
"github.com/blevesearch/bleve/v2/registry"
|
)
|
|
// SegoTokenizer is a bleve analysis.Tokenizer backed by the sego
// Chinese word-segmentation library. It is safe to share a single
// instance across analyzers once the dictionary has been loaded.
type SegoTokenizer struct {

	// segmenter holds the sego dictionary and performs the actual
	// segmentation in Tokenize.
	segmenter sego.Segmenter

}

// Compile-time check that SegoTokenizer satisfies analysis.Tokenizer.
var _ analysis.Tokenizer = &SegoTokenizer{}
|
|
func NewSegoTokenizer(dictpath string) *SegoTokenizer {
|
segmenter := sego.Segmenter{}
|
segmenter.LoadDictionary(dictpath)
|
return &SegoTokenizer{segmenter: segmenter}
|
}
|
|
func (st *SegoTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
|
result := make(analysis.TokenStream, 0)
|
pos := 1
|
segments := st.segmenter.Segment(sentence)
|
for _, segment := range segments {
|
token := analysis.Token{
|
Term: []byte(segment.Token().Text()),
|
Start: segment.Start(),
|
End: segment.End(),
|
Position: pos,
|
Type: analysis.Ideographic,
|
}
|
result = append(result, &token)
|
pos++
|
}
|
return result
|
}
|
|
func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
|
dictpath, ok := config["dictpath"].(string)
|
if !ok {
|
return nil, errors.New("config dictpath not found")
|
}
|
return NewSegoTokenizer(dictpath), nil
|
}
|
|
func init() {
|
registry.RegisterTokenizer("sego", tokenizerConstructor)
|
}
|