From a5b6a3c7a1fdf77d7e7c2cff0ed8deb9328e6dba Mon Sep 17 00:00:00 2001 From: Gea-Suan Lin Date: Fri, 9 Feb 2024 11:25:26 +0800 Subject: [PATCH] Rewrite splitter. Keep runs of English characters together (like "apple", not "ap" "pp" "pl" "le"), but keep splitting Chinese words into single characters. --- internal/ngram/ngram.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/internal/ngram/ngram.go b/internal/ngram/ngram.go index 21ce13e..7199a8f 100644 --- a/internal/ngram/ngram.go +++ b/internal/ngram/ngram.go @@ -4,11 +4,10 @@ import ( "regexp" ) -var re1, re2 *regexp.Regexp +var re1 *regexp.Regexp func init() { - re1 = regexp.MustCompile("\\PL+") - re2 = regexp.MustCompile("") + re1 = regexp.MustCompile("(\\w+|\\p{L})") } func Bigram(s string) []string { @@ -24,8 +23,8 @@ func Bigram(s string) []string { func split(s string) []string { bag := make([]string, 0) - for _, w := range re1.Split(s, -1) { - bag = append(bag, re2.Split(w, -1)...) + for _, w := range re1.FindAllStringSubmatch(s, -1) { + bag = append(bag, w[0]) } return bag }