From 6dec2342316aecf4084c8f4efb43f33fbb72892f Mon Sep 17 00:00:00 2001
From: liujiandao <274878379@qq.com>
Date: Tue, 09 Apr 2024 14:07:43 +0800
Subject: [PATCH] Update the measurement unit dictionary
---
pkg/blevex/tokenizer.go | 52 ++++++++++++++++------------------------------------
1 file changed, 16 insertions(+), 36 deletions(-)
diff --git a/pkg/blevex/tokenizer.go b/pkg/blevex/tokenizer.go
index 1522308..e81a59a 100644
--- a/pkg/blevex/tokenizer.go
+++ b/pkg/blevex/tokenizer.go
@@ -2,37 +2,33 @@

import (
"errors"
+ "github.com/huichen/sego"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
- "github.com/yanyiwu/gojieba"
)

-type JiebaTokenizer struct {
- handle *gojieba.Jieba
+type SegoTokenizer struct {
+ segmenter sego.Segmenter
}

-var _ analysis.Tokenizer = &JiebaTokenizer{}
+var _ analysis.Tokenizer = &SegoTokenizer{}

-func NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words string) *JiebaTokenizer {
- x := gojieba.NewJieba(dictpath, hmmpath, userdictpath, idf, stop_words)
- return &JiebaTokenizer{x}
+func NewSegoTokenizer(dictpath string) *SegoTokenizer {
+ segmenter := sego.Segmenter{}
+ segmenter.LoadDictionary(dictpath)
+ return &SegoTokenizer{segmenter: segmenter}
}

-func (x *JiebaTokenizer) Free() {
- x.handle.Free()
-}
-
-// Analyze([]byte) TokenStream
-func (x *JiebaTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
+func (st *SegoTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
result := make(analysis.TokenStream, 0)
pos := 1
- words := x.handle.Tokenize(string(sentence), gojieba.SearchMode, true)
- for _, word := range words {
+ segments := st.segmenter.Segment(sentence)
+ for _, segment := range segments {
token := analysis.Token{
- Term: []byte(word.Str),
- Start: word.Start,
- End: word.End,
+ Term: []byte(segment.Token().Text()),
+ Start: segment.Start(),
+ End: segment.End(),
Position: pos,
Type: analysis.Ideographic,
}
@@ -47,25 +43,9 @@
if !ok {
return nil, errors.New("config dictpath not found")
}
- hmmpath, ok := config["hmmpath"].(string)
- if !ok {
- return nil, errors.New("config hmmpath not found")
- }
- userdictpath, ok := config["userdictpath"].(string)
- if !ok {
- return nil, errors.New("config userdictpath not found")
- }
- idf, ok := config["idf"].(string)
- if !ok {
- return nil, errors.New("config idf not found")
- }
- stop_words, ok := config["stop_words"].(string)
- if !ok {
- return nil, errors.New("config stop_words not found")
- }
- return NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words), nil
+ return NewSegoTokenizer(dictpath), nil
}

func init() {
- registry.RegisterTokenizer("gojieba", tokenizerConstructor)
+ registry.RegisterTokenizer("sego", tokenizerConstructor)
}
--
Gitblit v1.8.0
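
Usage sketch (not part of the patch): after this change, a bleve index needs only a single "dictpath" setting for the "sego" tokenizer, instead of the five config keys (dictpath, hmmpath, userdictpath, idf, stop_words) the gojieba version required. The snippet below shows one way the registered tokenizer could be wired into an index mapping; the dictionary path "data/dictionary.txt" and the analyzer name "sego_cn" are placeholders, not values from this repository.

package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2"
)

func main() {
	// Assumed location of a sego dictionary file; adjust to your deployment.
	dictPath := "data/dictionary.txt"

	m := bleve.NewIndexMapping()

	// Instantiate the "sego" tokenizer type registered by this package's
	// init(); only "dictpath" is required after this patch.
	if err := m.AddCustomTokenizer("sego", map[string]interface{}{
		"type":     "sego",
		"dictpath": dictPath,
	}); err != nil {
		panic(err)
	}

	// Wrap the tokenizer in a custom analyzer and make it the default.
	if err := m.AddCustomAnalyzer("sego_cn", map[string]interface{}{
		"type":      "custom",
		"tokenizer": "sego",
	}); err != nil {
		panic(err)
	}
	m.DefaultAnalyzer = "sego_cn"

	idx, err := bleve.NewMemOnly(m)
	if err != nil {
		panic(err)
	}
	defer idx.Close()

	if err := idx.Index("doc1", map[string]string{"body": "全文检索测试"}); err != nil {
		panic(err)
	}
	fmt.Println("indexed one document with the sego tokenizer")
}

Since sego's Segment() reports byte offsets through Start() and End(), the Token Start/End values produced by Tokenize stay byte-based, which is what bleve expects for term highlighting.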