From deb1fb612b471789f5452f9536421ebee0f36155 Mon Sep 17 00:00:00 2001
From: zhangqian <zhangqian@123.com>
Date: Wed, 22 Nov 2023 18:16:14 +0800
Subject: [PATCH] Switch to a pure-Go Chinese word segmentation package

---
 pkg/blevex/tokenizer.go | 52 ++++++++++++++++------------------------------------
 1 file changed, 16 insertions(+), 36 deletions(-)

diff --git a/pkg/blevex/tokenizer.go b/pkg/blevex/tokenizer.go
index 1522308..e81a59a 100644
--- a/pkg/blevex/tokenizer.go
+++ b/pkg/blevex/tokenizer.go
@@ -2,37 +2,33 @@
 
 import (
 	"errors"
+	"github.com/huichen/sego"
 
 	"github.com/blevesearch/bleve/v2/analysis"
 	"github.com/blevesearch/bleve/v2/registry"
-	"github.com/yanyiwu/gojieba"
 )
 
-type JiebaTokenizer struct {
-	handle *gojieba.Jieba
+type SegoTokenizer struct {
+	segmenter sego.Segmenter
 }
 
-var _ analysis.Tokenizer = &JiebaTokenizer{}
+var _ analysis.Tokenizer = &SegoTokenizer{}
 
-func NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words string) *JiebaTokenizer {
-	x := gojieba.NewJieba(dictpath, hmmpath, userdictpath, idf, stop_words)
-	return &JiebaTokenizer{x}
+func NewSegoTokenizer(dictpath string) *SegoTokenizer {
+	segmenter := sego.Segmenter{}
+	segmenter.LoadDictionary(dictpath)
+	return &SegoTokenizer{segmenter: segmenter}
 }
 
-func (x *JiebaTokenizer) Free() {
-	x.handle.Free()
-}
-
-// Analyze([]byte) TokenStream
-func (x *JiebaTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
+func (st *SegoTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
 	result := make(analysis.TokenStream, 0)
 	pos := 1
-	words := x.handle.Tokenize(string(sentence), gojieba.SearchMode, true)
-	for _, word := range words {
+	segments := st.segmenter.Segment(sentence)
+	for _, segment := range segments {
 		token := analysis.Token{
-			Term:     []byte(word.Str),
-			Start:    word.Start,
-			End:      word.End,
+			Term:     []byte(segment.Token().Text()),
+			Start:    segment.Start(),
+			End:      segment.End(),
 			Position: pos,
 			Type:     analysis.Ideographic,
 		}
@@ -47,25 +43,9 @@
 	if !ok {
 		return nil, errors.New("config dictpath not found")
 	}
-	hmmpath, ok := config["hmmpath"].(string)
-	if !ok {
-		return nil, errors.New("config hmmpath not found")
-	}
-	userdictpath, ok := config["userdictpath"].(string)
-	if !ok {
-		return nil, errors.New("config userdictpath not found")
-	}
-	idf, ok := config["idf"].(string)
-	if !ok {
-		return nil, errors.New("config idf not found")
-	}
-	stop_words, ok := config["stop_words"].(string)
-	if !ok {
-		return nil, errors.New("config stop_words not found")
-	}
-	return NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words), nil
+	return NewSegoTokenizer(dictpath), nil
}
 
 func init() {
-	registry.RegisterTokenizer("gojieba", tokenizerConstructor)
+	registry.RegisterTokenizer("sego", tokenizerConstructor)
 }
--
Gitblit v1.8.0
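
For reference, the sego calls the new Tokenize method depends on can be exercised on their own. The following is a minimal sketch, not part of the patch; the dictionary path "data/dictionary.txt" is a placeholder for wherever the sego dictionary file actually lives.

package main

import (
	"fmt"

	"github.com/huichen/sego"
)

func main() {
	// Load the segmentation dictionary. The path is a placeholder;
	// sego ships a dictionary file in its repository.
	var segmenter sego.Segmenter
	segmenter.LoadDictionary("data/dictionary.txt")

	// Segment returns byte-offset segments over the input. These are
	// the Start()/End()/Token().Text() values that the patched
	// Tokenize method copies into bleve's analysis.Token.
	segments := segmenter.Segment([]byte("中文分词测试"))
	for _, segment := range segments {
		fmt.Printf("%q [%d:%d)\n",
			segment.Token().Text(), segment.Start(), segment.End())
	}
}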
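
Note that the registry name changes from "gojieba" to "sego", so any index mapping that referred to the old tokenizer name must be updated. Below is a sketch of the downstream wiring, assuming a hypothetical module path for this package and a placeholder dictionary path:

package main

import (
	"log"

	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis/analyzer/custom"

	// Blank-import the patched package so its init() registers the
	// "sego" tokenizer; the module path here is hypothetical.
	_ "example.com/yourmodule/pkg/blevex"
)

func main() {
	im := bleve.NewIndexMapping()

	// Instantiate the registered "sego" tokenizer with the dictpath
	// config key that tokenizerConstructor requires (placeholder path).
	if err := im.AddCustomTokenizer("sego_zh", map[string]interface{}{
		"type":     "sego",
		"dictpath": "data/dictionary.txt",
	}); err != nil {
		log.Fatal(err)
	}

	// Wrap the tokenizer in a custom analyzer so field mappings can use it.
	if err := im.AddCustomAnalyzer("zh", map[string]interface{}{
		"type":      custom.Name,
		"tokenizer": "sego_zh",
	}); err != nil {
		log.Fatal(err)
	}
	im.DefaultAnalyzer = "zh"

	idx, err := bleve.New("example.bleve", im)
	if err != nil {
		log.Fatal(err)
	}
	defer idx.Close()

	if err := idx.Index("doc1", map[string]interface{}{
		"body": "中文分词测试",
	}); err != nil {
		log.Fatal(err)
	}
}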