Rewrite splitter.

Merge consecutive English characters into a single token (like "apple", not "ap" "pp" "pl" "le"),
but keep splitting on Chinese words.
This commit is contained in:
Gea-Suan Lin
2024-02-09 11:25:26 +08:00
parent 28c1df566d
commit a5b6a3c7a1

View File

@@ -4,11 +4,10 @@ import (
"regexp" "regexp"
) )
var re1, re2 *regexp.Regexp var re1 *regexp.Regexp
func init() { func init() {
re1 = regexp.MustCompile("\\PL+") re1 = regexp.MustCompile("(\\w+|\\p{L})")
re2 = regexp.MustCompile("")
} }
func Bigram(s string) []string { func Bigram(s string) []string {
@@ -24,8 +23,8 @@ func Bigram(s string) []string {
func split(s string) []string { func split(s string) []string {
bag := make([]string, 0) bag := make([]string, 0)
for _, w := range re1.Split(s, -1) { for _, w := range re1.FindAllStringSubmatch(s, -1) {
bag = append(bag, re2.Split(w, -1)...) bag = append(bag, w[0])
} }
return bag return bag
} }