zhangqian
2023-11-22 deb1fb612b471789f5452f9536421ebee0f36155
换一个纯 Go 实现的中文分词包
1个文件已添加
5个文件已修改
589179 ■■■■■ 已修改文件
conf/dictionary.txt 589032 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
go.mod 3 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
go.sum 9 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
pkg/blevex/analyzer.go 58 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
pkg/blevex/bleve.go 25 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
pkg/blevex/tokenizer.go 52 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
conf/dictionary.txt
New file
Diff too large
go.mod
@@ -10,6 +10,7 @@
    github.com/gin-gonic/gin v1.9.0
    github.com/golang-jwt/jwt/v4 v4.5.0
    github.com/google/uuid v1.3.1
    github.com/huichen/sego v0.0.0-20210824061530-c87651ea5c76
    github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646
    github.com/nsqio/go-nsq v1.1.0
    github.com/open-policy-agent/opa v0.57.1
@@ -21,7 +22,6 @@
    github.com/swaggo/gin-swagger v1.6.0
    github.com/swaggo/swag v1.16.1
    github.com/xuri/excelize/v2 v2.8.0
    github.com/yanyiwu/gojieba v1.3.0
    go.uber.org/zap v1.24.0
    golang.org/x/crypto v0.15.0
    google.golang.org/genproto v0.0.0-20230711160842-782d3b101e98
@@ -37,6 +37,7 @@
    github.com/KyleBanks/depth v1.2.1 // indirect
    github.com/OneOfOne/xxhash v1.2.8 // indirect
    github.com/RoaringBitmap/roaring v1.2.3 // indirect
    github.com/adamzy/cedar-go v0.0.0-20170805034717-80a9c64b256d // indirect
    github.com/agnivade/levenshtein v1.1.1 // indirect
    github.com/beorn7/perks v1.0.1 // indirect
    github.com/bits-and-blooms/bitset v1.2.0 // indirect
go.sum
@@ -48,6 +48,9 @@
github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q=
github.com/RoaringBitmap/roaring v1.2.3 h1:yqreLINqIrX22ErkKI0vY47/ivtJr6n+kMhVOVmhWBY=
github.com/RoaringBitmap/roaring v1.2.3/go.mod h1:plvDsJQpxOC5bw8LRteu/MLWHsHez/3y6cubLI4/1yE=
github.com/adamzy/cedar-go v0.0.0-20170805034717-80a9c64b256d h1:ir/IFJU5xbja5UaBEQLjcvn7aAU01nqU/NUyOBEU+ew=
github.com/adamzy/cedar-go v0.0.0-20170805034717-80a9c64b256d/go.mod h1:PRWNwWq0yifz6XDPZu48aSld8BWwBfr2JKB2bGWiEd4=
github.com/adamzy/sego v0.0.0-20151004184924-5eab9a44f8e8/go.mod h1:KQxo+Xesl2wLJ3yJcX443KaoWzXpbPzU1GNRyE8kNEY=
github.com/agnivade/levenshtein v1.1.1 h1:QY8M92nrzkmr798gCo3kmMyqXFzdQVpxLlGPRBij0P8=
github.com/agnivade/levenshtein v1.1.1/go.mod h1:veldBMzWxcCG2ZvUTKD2kJNRdCk5hVbJomOvKkmgYbo=
github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q=
@@ -257,8 +260,12 @@
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
github.com/huichen/sego v0.0.0-20210824061530-c87651ea5c76 h1:qNQ2+1IQT9Mor/vfEHePOQSbiapLoNI7sQmpxM7l1Ew=
github.com/huichen/sego v0.0.0-20210824061530-c87651ea5c76/go.mod h1:Fymg8+khR/cKSuIwqRxy/jmZg7PIPLk7CauXzrbcMUM=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/issue9/assert v1.4.1 h1:gUtOpMTeaE4JTe9kACma5foOHBvVt1p5XTFrULDwdXI=
github.com/issue9/assert v1.4.1/go.mod h1:Yktk83hAVl1SPSYtd9kjhBizuiBIqUQyj+D5SE2yjVY=
github.com/jinzhu/inflection v1.0.0 h1:K317FqzuhWc8YvSVlFMCCUb36O/S9MCKRDI7QkRKD/E=
github.com/jinzhu/inflection v1.0.0/go.mod h1:h+uFLlag+Qp1Va5pdKtLDYj+kHp5pxUVkryuEj+Srlc=
github.com/jinzhu/now v1.1.4/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8=
@@ -400,8 +407,6 @@
github.com/xuri/nfp v0.0.0-20230819163627-dc951e3ffe1a/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ=
github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05 h1:qhbILQo1K3mphbwKh1vNm4oGezE1eF9fQWmNiIpSfI4=
github.com/xuri/nfp v0.0.0-20230919160717-d98342af3f05/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ=
github.com/yanyiwu/gojieba v1.3.0 h1:6VeaPOR+MawnImdeSvWNr7rP4tvUfnGlEKaoBnR33Ds=
github.com/yanyiwu/gojieba v1.3.0/go.mod h1:54wkP7sMJ6bklf7yPl6F+JG71dzVUU1WigZbR47nGdY=
github.com/yashtewari/glob-intersection v0.2.0 h1:8iuHdN88yYuCzCdjt0gDe+6bAhUwBeEWqThExu54RFg=
github.com/yashtewari/glob-intersection v0.2.0/go.mod h1:LK7pIC3piUjovexikBbJ26Yml7g8xa5bsjfx2v1fwok=
github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
pkg/blevex/analyzer.go
@@ -7,38 +7,14 @@
    "github.com/blevesearch/bleve/v2/registry"
)
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
    tokenizerName, ok := config["tokenizer"].(string)
    if !ok {
        return nil, errors.New("must specify tokenizer")
    }
    tokenizer, err := cache.TokenizerNamed(tokenizerName)
    if err != nil {
        return nil, err
    }
    jbtk, ok := tokenizer.(*JiebaTokenizer)
    if !ok {
        return nil, errors.New("tokenizer must be of type jieba")
    }
    alz := &JiebaAnalyzer{
        Tokenizer: jbtk,
    }
    return alz, nil
}
// init registers analyzerConstructor with the bleve registry under the
// analyzer name "gojieba".
func init() {
    registry.RegisterAnalyzer("gojieba", analyzerConstructor)
}
// JiebaAnalyzer from analysis.DefaultAnalyzer
type JiebaAnalyzer struct {
// SegoAnalyzer from analysis.DefaultAnalyzer
type SegoAnalyzer struct {
    CharFilters  []analysis.CharFilter
    Tokenizer    *JiebaTokenizer
    Tokenizer    *SegoTokenizer
    TokenFilters []analysis.TokenFilter
}
func (a *JiebaAnalyzer) Analyze(input []byte) analysis.TokenStream {
func (a *SegoAnalyzer) Analyze(input []byte) analysis.TokenStream {
    if a.CharFilters != nil {
        for _, cf := range a.CharFilters {
            input = cf.Filter(input)
@@ -53,10 +29,26 @@
    return tokens
}
func (a *JiebaAnalyzer) Free() {
    if a.Tokenizer != nil {
        a.Tokenizer.Free()
    } else {
        panic("JiebaAnalyzer.Tokenizer is nil, this should not happen")
func analyzerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.Analyzer, error) {
    tokenizerName, ok := config["tokenizer"].(string)
    if !ok {
        return nil, errors.New("must specify tokenizer")
    }
    tokenizer, err := cache.TokenizerNamed(tokenizerName)
    if err != nil {
        return nil, err
    }
    segoTokenizer, ok := tokenizer.(*SegoTokenizer)
    if !ok {
        return nil, errors.New("tokenizer must be of type sego")
    }
    alz := &SegoAnalyzer{
        Tokenizer: segoTokenizer,
    }
    return alz, nil
}
// init registers analyzerConstructor with the bleve registry under the
// analyzer name "sego".
func init() {
    registry.RegisterAnalyzer("sego", analyzerConstructor)
}
pkg/blevex/bleve.go
@@ -4,43 +4,34 @@
    "fmt"
    "github.com/blevesearch/bleve/v2"
    "github.com/blevesearch/bleve/v2/mapping"
    "github.com/yanyiwu/gojieba"
    "sync"
)
// InitAnalyzer 加载自定义分词器(结巴分词)
// InitAnalyzer 加载自定义分词器(sego)
var defaultAnalyzer *mapping.IndexMappingImpl
func InitAnalyzer() {
    indexMapping := bleve.NewIndexMapping()
    //os.RemoveAll(IndexDir)
    //// clean index when example finished
    //defer os.RemoveAll(IndexDir)
    err := indexMapping.AddCustomTokenizer("gojieba",
    err := indexMapping.AddCustomTokenizer("sego",
        map[string]interface{}{
            "dictpath":     gojieba.DICT_PATH,
            "hmmpath":      gojieba.HMM_PATH,
            "userdictpath": gojieba.USER_DICT_PATH,
            "idf":          gojieba.IDF_PATH,
            "stop_words":   gojieba.STOP_WORDS_PATH,
            "type":         "gojieba",
            "dictpath": "conf/dictionary.txt", // 替换为实际的字典路径
            "type":     "sego",
        },
    )
    if err != nil {
        panic(err)
    }
    err = indexMapping.AddCustomAnalyzer("gojieba",
    err = indexMapping.AddCustomAnalyzer("sego",
        map[string]interface{}{
            "type":      "gojieba",
            "tokenizer": "gojieba",
            "type":      "sego",
            "tokenizer": "sego",
        },
    )
    if err != nil {
        panic(err)
    }
    indexMapping.DefaultAnalyzer = "gojieba"
    indexMapping.DefaultAnalyzer = "sego"
    defaultAnalyzer = indexMapping
}
pkg/blevex/tokenizer.go
@@ -2,37 +2,33 @@
import (
    "errors"
    "github.com/huichen/sego"
    "github.com/blevesearch/bleve/v2/analysis"
    "github.com/blevesearch/bleve/v2/registry"
    "github.com/yanyiwu/gojieba"
)
type JiebaTokenizer struct {
    handle *gojieba.Jieba
type SegoTokenizer struct {
    segmenter sego.Segmenter
}
var _ analysis.Tokenizer = &JiebaTokenizer{}
var _ analysis.Tokenizer = &SegoTokenizer{}
func NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words string) *JiebaTokenizer {
    x := gojieba.NewJieba(dictpath, hmmpath, userdictpath, idf, stop_words)
    return &JiebaTokenizer{x}
func NewSegoTokenizer(dictpath string) *SegoTokenizer {
    segmenter := sego.Segmenter{}
    segmenter.LoadDictionary(dictpath)
    return &SegoTokenizer{segmenter: segmenter}
}
func (x *JiebaTokenizer) Free() {
    x.handle.Free()
}
// Analyze([]byte) TokenStream
func (x *JiebaTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
func (st *SegoTokenizer) Tokenize(sentence []byte) analysis.TokenStream {
    result := make(analysis.TokenStream, 0)
    pos := 1
    words := x.handle.Tokenize(string(sentence), gojieba.SearchMode, true)
    for _, word := range words {
    segments := st.segmenter.Segment(sentence)
    for _, segment := range segments {
        token := analysis.Token{
            Term:     []byte(word.Str),
            Start:    word.Start,
            End:      word.End,
            Term:     []byte(segment.Token().Text()),
            Start:    segment.Start(),
            End:      segment.End(),
            Position: pos,
            Type:     analysis.Ideographic,
        }
@@ -47,25 +43,9 @@
    if !ok {
        return nil, errors.New("config dictpath not found")
    }
    hmmpath, ok := config["hmmpath"].(string)
    if !ok {
        return nil, errors.New("config hmmpath not found")
    }
    userdictpath, ok := config["userdictpath"].(string)
    if !ok {
        return nil, errors.New("config userdictpath not found")
    }
    idf, ok := config["idf"].(string)
    if !ok {
        return nil, errors.New("config idf not found")
    }
    stop_words, ok := config["stop_words"].(string)
    if !ok {
        return nil, errors.New("config stop_words not found")
    }
    return NewJiebaTokenizer(dictpath, hmmpath, userdictpath, idf, stop_words), nil
    return NewSegoTokenizer(dictpath), nil
}
func init() {
    registry.RegisterTokenizer("gojieba", tokenizerConstructor)
    registry.RegisterTokenizer("sego", tokenizerConstructor)
}