From deb1fb612b471789f5452f9536421ebee0f36155 Mon Sep 17 00:00:00 2001
From: zhangqian <zhangqian@123.com>
Date: Wed, 22 Nov 2023 18:16:14 +0800
Subject: [PATCH] Switch to a pure-Go Chinese word segmentation package

---
 pkg/blevex/tokenizer.go |   52 ++++++++++++++++------------------------------------
 1 file changed, 16 insertions(+), 36 deletions(-)
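Note: gojieba wraps the C++ CppJieba library via cgo and needs separate dict,
hmm, user-dict, idf, and stop-word files; sego is pure Go and loads a single
dictionary file, which is why the constructor and the tokenizer config below
shrink to a single "dictpath" key. A minimal sketch of wiring the renamed
tokenizer into a bleve index mapping follows; the analyzer name, dictionary
path, field name, and sample document are illustrative assumptions, not part
of this patch:

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2"
)

func main() {
	mapping := bleve.NewIndexMapping()

	// Register the tokenizer under the name this patch uses ("sego").
	// "dictpath" is the only remaining config key; the file path here
	// is a placeholder, not something this patch ships.
	err := mapping.AddCustomTokenizer("sego", map[string]interface{}{
		"type":     "sego",
		"dictpath": "data/dictionary.txt",
	})
	if err != nil {
		panic(err)
	}

	// Wrap the tokenizer in a custom analyzer so fields can use it.
	err = mapping.AddCustomAnalyzer("sego", map[string]interface{}{
		"type":      "custom",
		"tokenizer": "sego",
	})
	if err != nil {
		panic(err)
	}
	mapping.DefaultAnalyzer = "sego"

	// In-memory index, just to exercise the analyzer end to end.
	index, err := bleve.NewMemOnly(mapping)
	if err != nil {
		panic(err)
	}
	defer index.Close()

	_ = index.Index("1", map[string]string{"body": "中文分词测试"})
	res, _ := index.Search(bleve.NewSearchRequest(bleve.NewMatchQuery("分词")))
	fmt.Println(res.Total)
}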

diff --git a/pkg/blevex/tokenizer.go b/pkg/blevex/tokenizer.go
index 1522308..e81a59a 100644
--- a/pkg/blevex/tokenizer.go
+++ b/pkg/blevex/tokenizer.go
@@ -2,37 +2,33 @@
 
 import (
 	"errors"
+	"github.com/huichen/sego"
 
 	"github.com/blevesearch/bleve/v2/analysis"
 	"github.com/blevesearch/bleve/v2/registry"
-	"github.com/yanyiwu/gojieba"
 )
 
-type JiebaTokenizer struct {
-	handle *gojieba.Jieba
+type SegoTokenizer struct {
+	segmenter sego.Segmenter
 }
 
-var _ analysis.Tokenizer = &JiebaTokenizer{}
+var _ analysis.Tokenizer = &SegoTokenizer{}
 
-func NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words string) *JiebaTokenizer {
-	x := gojieba.NewJieba(dictpath, hmmpath, userdictpath, idf, stop_words)
-	return &JiebaTokenizer{x}
+func NewSegoTokenizer(dictpath string) *SegoTokenizer {
+	segmenter := sego.Segmenter{}
+	segmenter.LoadDictionary(dictpath)
+	return &SegoTokenizer{segmenter: segmenter}
 }
 
-func (x *JiebaTokenizer) Free() {
-	x.handle.Free()
-}
-
-// Analyze([]byte) TokenStream
-func (x *JiebaTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
+func (st *SegoTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
 	result := make(analysis.TokenStream, 0)
 	pos := 1
-	words := x.handle.Tokenize(string(sentence), gojieba.SearchMode, true)
-	for _, word := range words {
+	segments := st.segmenter.Segment(sentence)
+	for _, segment := range segments {
 		token := analysis.Token{
-			Term:     []byte(word.Str),
-			Start:    word.Start,
-			End:      word.End,
+			Term:     []byte(segment.Token().Text()),
+			Start:    segment.Start(),
+			End:      segment.End(),
 			Position: pos,
 			Type:     analysis.Ideographic,
 		}
@@ -47,25 +43,9 @@
 	if !ok {
 		return nil, errors.New("config dictpath not found")
 	}
-	hmmpath, ok := config["hmmpath"].(string)
-	if !ok {
-		return nil, errors.New("config hmmpath not found")
-	}
-	userdictpath, ok := config["userdictpath"].(string)
-	if !ok {
-		return nil, errors.New("config userdictpath not found")
-	}
-	idf, ok := config["idf"].(string)
-	if !ok {
-		return nil, errors.New("config idf not found")
-	}
-	stop_words, ok := config["stop_words"].(string)
-	if !ok {
-		return nil, errors.New("config stop_words not found")
-	}
-	return NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words), nil
+	return NewSegoTokenizer(dictpath), nil
 }
 
 func init() {
-	registry.RegisterTokenizer("gojieba", tokenizerConstructor)
+	registry.RegisterTokenizer("sego", tokenizerConstructor)
 }
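For reference, the sego calls the new Tokenize method relies on (Segment,
Token().Text(), Start(), End()) can be exercised directly. A minimal sketch,
assuming a placeholder dictionary path and sample input:

package main

import (
	"fmt"

	"github.com/huichen/sego"
)

func main() {
	var seg sego.Segmenter
	// LoadDictionary takes a comma-separated list of dictionary files;
	// this path is a placeholder.
	seg.LoadDictionary("data/dictionary.txt")

	text := []byte("中华人民共和国中央人民政府")
	for _, segment := range seg.Segment(text) {
		// Token().Text() is the term; Start()/End() are byte offsets
		// into the original input, matching what the tokenizer stores
		// in analysis.Token.
		fmt.Printf("%s [%d,%d)\n",
			segment.Token().Text(), segment.Start(), segment.End())
	}
}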

--
Gitblit v1.8.0