// Package blevex registers a bleve tokenizer named "sego" that delegates
// word segmentation to the sego segmenter.
package blevex

import (
	"errors"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
	"github.com/huichen/sego"
)

// SegoTokenizer is an analysis.Tokenizer backed by a sego.Segmenter.
type SegoTokenizer struct {
	segmenter sego.Segmenter
}

// Compile-time check that *SegoTokenizer satisfies analysis.Tokenizer.
var _ analysis.Tokenizer = (*SegoTokenizer)(nil)

// NewSegoTokenizer returns a tokenizer whose segmenter has been asked to load
// the dictionary identified by dictpath.
//
// NOTE(review): no result of LoadDictionary is inspected here, so a load
// failure is not reported to the caller as an error; confirm how sego
// signals a bad path before relying on this in long-running processes.
func NewSegoTokenizer(dictpath string) *SegoTokenizer {
	segmenter := sego.Segmenter{}
	segmenter.LoadDictionary(dictpath)
	return &SegoTokenizer{segmenter: segmenter}
}

// Tokenize segments sentence and returns one token per segment, in the order
// the segmenter produced them. Each token carries the segment's text, the
// Start/End offsets reported by the segment, a position counted from 1, and
// the Ideographic token type.
func (st *SegoTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
	segments := st.segmenter.Segment(sentence)

	// The number of tokens equals the number of segments, so allocate once.
	result := make(analysis.TokenStream, 0, len(segments))
	for i, segment := range segments {
		result = append(result, &analysis.Token{
			Term:     []byte(segment.Token().Text()),
			Start:    segment.Start(),
			End:      segment.End(),
			Position: i + 1, // positions are 1-based
			Type:     analysis.Ideographic,
		})
	}
	return result
}

// tokenizerConstructor builds a SegoTokenizer from a registry config map.
// The config must contain a non-empty string under the key "dictpath";
// otherwise an error is returned and no tokenizer is created.
func tokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	dictpath, ok := config["dictpath"].(string)
	if !ok {
		return nil, errors.New("config dictpath not found")
	}
	// An empty path cannot name a dictionary file; reject it here instead of
	// handing it to the segmenter.
	if dictpath == "" {
		return nil, errors.New("config dictpath is empty")
	}
	return NewSegoTokenizer(dictpath), nil
}

// init registers the tokenizer constructor with bleve under the name "sego".
func init() {
	registry.RegisterTokenizer("sego", tokenizerConstructor)
}