// Package tokenizer wires two Chinese word segmenters, gojieba (cgo-backed)
// and sego (pure Go), into Bleve as analysis.Tokenizer implementations.
// (The package name is assumed; it was not preserved in the excerpt.)
package tokenizer

import (
	"errors"

	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/registry"
	"github.com/huichen/sego"
	"github.com/yanyiwu/gojieba"
)

// JiebaTokenizer wraps the cgo-backed gojieba segmenter.
type JiebaTokenizer struct {
	handle *gojieba.Jieba
}

// SegoTokenizer wraps the pure-Go sego segmenter.
type SegoTokenizer struct {
	segmenter sego.Segmenter
}

// Compile-time checks that both types implement analysis.Tokenizer.
var _ analysis.Tokenizer = &JiebaTokenizer{}
var _ analysis.Tokenizer = &SegoTokenizer{}

func NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stopWords string) *JiebaTokenizer {
	return &JiebaTokenizer{handle: gojieba.NewJieba(dictpath, hmmpath, userdictpath, idf, stopWords)}
}

func NewSegoTokenizer(dictpath string) *SegoTokenizer {
	segmenter := sego.Segmenter{}
	segmenter.LoadDictionary(dictpath)
	return &SegoTokenizer{segmenter: segmenter}
}

// Free releases the memory held by the underlying cgo jieba handle.
// Bleve never calls this through the Tokenizer interface; the owner of the
// tokenizer must.
func (x *JiebaTokenizer) Free() {
	x.handle.Free()
}

// Tokenize implements analysis.Tokenizer, turning a byte slice into a
// TokenStream via gojieba's search-mode segmentation.
func (x *JiebaTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
	result := make(analysis.TokenStream, 0)
	pos := 1
	words := x.handle.Tokenize(string(sentence), gojieba.SearchMode, true)
	for _, word := range words {
		token := analysis.Token{
			Term:     []byte(word.Str),
			Start:    word.Start,
			End:      word.End,
			Position: pos,
			Type:     analysis.Ideographic,
		}
		result = append(result, &token)
		pos++
	}
	return result
}

// Tokenize implements analysis.Tokenizer on top of sego's Segment.
func (st *SegoTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
	result := make(analysis.TokenStream, 0)
	pos := 1
	segments := st.segmenter.Segment(sentence)
	for _, segment := range segments {
		token := analysis.Token{
			Term:     []byte(segment.Token().Text()),
			Start:    segment.Start(),
			End:      segment.End(),
			Position: pos,
			Type:     analysis.Ideographic,
		}
		result = append(result, &token)
		pos++
	}
	return result
}

func jiebaTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	dictpath, ok := config["dictpath"].(string)
	if !ok {
		return nil, errors.New("config dictpath not found")
	}
	hmmpath, ok := config["hmmpath"].(string)
	if !ok {
		return nil, errors.New("config hmmpath not found")
	}
	userdictpath, ok := config["userdictpath"].(string)
	if !ok {
		return nil, errors.New("config userdictpath not found")
	}
	idf, ok := config["idf"].(string)
	if !ok {
		return nil, errors.New("config idf not found")
	}
	stopWords, ok := config["stop_words"].(string)
	if !ok {
		return nil, errors.New("config stop_words not found")
	}
	return NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stopWords), nil
}

func segoTokenizerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Tokenizer, error) {
	dictpath, ok := config["dictpath"].(string)
	if !ok {
		return nil, errors.New("config dictpath not found")
	}
	return NewSegoTokenizer(dictpath), nil
}

// init registers each tokenizer under its own name and constructor so Bleve
// index mappings can refer to them by "type".
func init() {
	registry.RegisterTokenizer("gojieba", jiebaTokenizerConstructor)
	registry.RegisterTokenizer("sego", segoTokenizerConstructor)
}
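
// Usage sketch (not part of the original file; the dictionary paths below
// use the constants that gojieba ships with its bundled dict files):
// instantiate the registered tokenizer in an index mapping, wrap it in a
// custom analyzer, and make that analyzer the default.
//
//	indexMapping := bleve.NewIndexMapping()
//	_ = indexMapping.AddCustomTokenizer("gojieba", map[string]interface{}{
//		"type":         "gojieba",
//		"dictpath":     gojieba.DICT_PATH,
//		"hmmpath":      gojieba.HMM_PATH,
//		"userdictpath": gojieba.USER_DICT_PATH,
//		"idf":          gojieba.IDF_PATH,
//		"stop_words":   gojieba.STOP_WORDS_PATH,
//	})
//	_ = indexMapping.AddCustomAnalyzer("cn", map[string]interface{}{
//		"type":      "custom",
//		"tokenizer": "gojieba",
//	})
//	indexMapping.DefaultAnalyzer = "cn"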