From deb1fb612b471789f5452f9536421ebee0f36155 Mon Sep 17 00:00:00 2001
From: zhangqian <zhangqian@123.com>
Date: 星期三, 22 十一月 2023 18:16:14 +0800
Subject: [PATCH] 换一个纯go实现的中文分词包

---
 pkg/blevex/analyzer.go | 58 +++++++++++++++++++++++++---------------------------------
 1 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/pkg/blevex/analyzer.go b/pkg/blevex/analyzer.go
index 29cb8a4..2839517 100644
--- a/pkg/blevex/analyzer.go
+++ b/pkg/blevex/analyzer.go
@@ -7,38 +7,14 @@
 	"github.com/blevesearch/bleve/v2/registry"
 )
 
-func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
-	tokenizerName, ok := config["tokenizer"].(string)
-	if !ok {
-		return nil, errors.New("must specify tokenizer")
-	}
-	tokenizer, err := cache.TokenizerNamed(tokenizerName)
-	if err != nil {
-		return nil, err
-	}
-
-	jbtk, ok := tokenizer.(*JiebaTokenizer)
-	if !ok {
-		return nil, errors.New("tokenizer must be of type jieba")
-	}
-	alz := &JiebaAnalyzer{
-		Tokenizer: jbtk,
-	}
-	return alz, nil
-}
-
-func init() {
-	registry.RegisterAnalyzer("gojieba", analyzerConstructor)
-}
-
-// JiebaAnalyzer from analysis.DefaultAnalyzer
-type JiebaAnalyzer struct {
+// SegoAnalyzer from analysis.DefaultAnalyzer
+type SegoAnalyzer struct {
 	CharFilters  []analysis.CharFilter
-	Tokenizer    *JiebaTokenizer
+	Tokenizer    *SegoTokenizer
 	TokenFilters []analysis.TokenFilter
 }
 
-func (a *JiebaAnalyzer) Analyze(input []byte) analysis.TokenStream {
+func (a *SegoAnalyzer) Analyze(input []byte) analysis.TokenStream {
 	if a.CharFilters != nil {
 		for _, cf := range a.CharFilters {
 			input = cf.Filter(input)
@@ -53,10 +29,26 @@
 	return tokens
 }
 
-func (a *JiebaAnalyzer) Free() {
-	if a.Tokenizer != nil {
-		a.Tokenizer.Free()
-	} else {
-		panic("JiebaAnalyzer.Tokenizer is nil, this should not happen")
+func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
+	tokenizerName, ok := config["tokenizer"].(string)
+	if !ok {
+		return nil, errors.New("must specify tokenizer")
 	}
+	tokenizer, err := cache.TokenizerNamed(tokenizerName)
+	if err != nil {
+		return nil, err
+	}
+
+	segoTokenizer, ok := tokenizer.(*SegoTokenizer)
+	if !ok {
+		return nil, errors.New("tokenizer must be of type sego")
+	}
+	alz := &SegoAnalyzer{
+		Tokenizer: segoTokenizer,
+	}
+	return alz, nil
+}
+
+func init() {
+	registry.RegisterAnalyzer("sego", analyzerConstructor)
 }
-- 
Gitblit v1.8.0