Merge consecutive English characters into a single token (like "apple", not "ap" "pp" "pl" "le"), but keep splitting Chinese text into individual characters.
31 lines
442 B
Go
31 lines
442 B
Go
package ngram
|
|
|
|
import (
|
|
"regexp"
|
|
)
|
|
|
|
// re1 is the tokenizer pattern. Each match is either a maximal run of ASCII
// word characters (\w: letters, digits and underscore), so a word such as
// "apple" stays in one piece, or a single Unicode letter (\p{L}), so text in
// scripts outside ASCII, such as Chinese, is matched one character at a time.
//
// The pattern is compiled in the variable initializer rather than in an
// init function: package-level variables are initialized before any init
// function runs, so re1 is ready for every other piece of package code.
var re1 = regexp.MustCompile(`(\w+|\p{L})`)
|
|
|
|
func Bigram(s string) []string {
|
|
bag := split(s)
|
|
|
|
r := make([]string, 0)
|
|
for i := 0; i < len(bag) - 1; i++ {
|
|
r = append(r, bag[i] + bag[i + 1])
|
|
}
|
|
|
|
return r
|
|
}
|
|
|
|
func split(s string) []string {
|
|
bag := make([]string, 0)
|
|
for _, w := range re1.FindAllStringSubmatch(s, -1) {
|
|
bag = append(bag, w[0])
|
|
}
|
|
return bag
|
|
}
|