Rewrite splitter.
Keep each run of English characters together as a single token (like "apple", not "ap" "pp" "pl" "le"), but keep splitting Chinese text into individual characters.
This commit is contained in:
@@ -4,11 +4,10 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
)
|
)
|
||||||
|
|
||||||
var re1, re2 *regexp.Regexp
|
var re1 *regexp.Regexp
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
re1 = regexp.MustCompile("\\PL+")
|
re1 = regexp.MustCompile("(\\w+|\\p{L})")
|
||||||
re2 = regexp.MustCompile("")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func Bigram(s string) []string {
|
func Bigram(s string) []string {
|
||||||
@@ -24,8 +23,8 @@ func Bigram(s string) []string {
|
|||||||
|
|
||||||
func split(s string) []string {
|
func split(s string) []string {
|
||||||
bag := make([]string, 0)
|
bag := make([]string, 0)
|
||||||
for _, w := range re1.Split(s, -1) {
|
for _, w := range re1.FindAllStringSubmatch(s, -1) {
|
||||||
bag = append(bag, re2.Split(w, -1)...)
|
bag = append(bag, w[0])
|
||||||
}
|
}
|
||||||
return bag
|
return bag
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user