Files
go-ir-playground/internal/ngram/ngram.go
Gea-Suan Lin a5b6a3c7a1 Rewrite splitter.
Merge all English characters (like "apple", not "ap" "pp" "pl" "le"),
but keep splitting on Chinese words.
2024-02-09 11:25:26 +08:00

31 lines
442 B
Go

package ngram
import (
"regexp"
)
// re1 matches a single token: either a run of one or more word characters
// (\w, which in Go's regexp syntax is the ASCII class [0-9A-Za-z_]) or, failing
// that, exactly one Unicode letter (\p{L}). Characters that fit neither
// alternative, such as whitespace and punctuation, are never part of a match.
//
// The pattern is compiled once in the variable's initializer, so re1 is ready
// before any init function in the package runs.
var re1 = regexp.MustCompile("(\\w+|\\p{L})")
// Bigram splits s into tokens with split and returns every pair of adjacent
// tokens concatenated together, in their original order. For n tokens the
// result has n-1 entries; when s yields fewer than two tokens the result is an
// empty, non-nil slice.
func Bigram(s string) []string {
	tokens := split(s)
	if len(tokens) < 2 {
		// No adjacent pair exists. Return early so the capacity below is
		// never negative, and keep the result non-nil.
		return []string{}
	}

	// The number of pairs is known up front, so allocate exactly once.
	pairs := make([]string, 0, len(tokens)-1)
	for i := 1; i < len(tokens); i++ {
		pairs = append(pairs, tokens[i-1]+tokens[i])
	}
	return pairs
}
// split returns, in order, every non-overlapping match of re1 in s. Parts of s
// that re1 does not match are dropped. The result is an empty, non-nil slice
// when nothing matches.
func split(s string) []string {
	// Only the full text of each match is needed, so ask for the matches
	// directly instead of building a submatch slice per token.
	tokens := re1.FindAllString(s, -1)
	if tokens == nil {
		// FindAllString reports "no match" as nil; callers have always
		// received an empty non-nil slice here.
		return []string{}
	}
	return tokens
}