Implement tokenize().

Gea-Suan Lin
2024-02-09 11:46:19 +08:00
parent b133273065
commit ce79d2b245
3 changed files with 21 additions and 3 deletions


@@ -4,15 +4,15 @@ import (
 	"fmt"
 	"github.com/gslin/go-ir-playground/internal/artifact"
-	"github.com/gslin/go-ir-playground/internal/ngram"
+	"github.com/gslin/go-ir-playground/internal/tokenize"
 )

 func main() {
 	articles := artifact.Read("data/articles.json")

 	for _, article := range articles {
-		title_bag := ngram.Bigram(article.Title)
-		body_bag := ngram.Bigram(article.Body)
+		title_bag := tokenize.Tokenize(article.Title)
+		body_bag := tokenize.Tokenize(article.Body)
 		fmt.Printf("title_bag = %v\n", title_bag)
 		fmt.Printf("body_bag = %v\n", body_bag)


@@ -21,6 +21,10 @@ func Bigram(s string) []string {
 	return r
 }

+func Unigram(s string) []string {
+	return split(s)
+}
+
 func split(s string) []string {
 	bag := make([]string, 0)
 	for _, w := range re1.FindAllStringSubmatch(s, -1) {
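
The new Unigram() just hands back the raw token list from split(), so the two token bags can be compared directly. A minimal usage sketch (the sample string is hypothetical, and the exact tokens depend on the unexported re1 regexp and on how Bigram composes them):

	package main

	import (
		"fmt"

		"github.com/gslin/go-ir-playground/internal/ngram"
	)

	func main() {
		s := "information retrieval playground"
		// Print both bags for comparison: Unigram is the plain token list,
		// Bigram is the bigram bag from the same package.
		fmt.Printf("unigrams = %v\n", ngram.Unigram(s))
		fmt.Printf("bigrams  = %v\n", ngram.Bigram(s))
	}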


@@ -0,0 +1,14 @@
+package tokenize
+
+import (
+	"slices"
+
+	"github.com/gslin/go-ir-playground/internal/ngram"
+)
+
+func Tokenize(s string) []string {
+	bag := append(ngram.Unigram(s), ngram.Bigram(s)...)
+	slices.Sort(bag)
+	slices.Compact[[]string, string](bag)
+	return bag
+}
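
One detail worth noting in the new file: slices.Compact returns the de-duplicated slice rather than shortening its argument, so the call's result is discarded here and the returned bag keeps its original length, including leftover entries past the compacted prefix. A minimal sketch of the same function with the return value reassigned (the usual idiom, not what the commit itself does):

	package tokenize

	import (
		"slices"

		"github.com/gslin/go-ir-playground/internal/ngram"
	)

	// Tokenize builds a bag of unigrams plus bigrams, sorted and de-duplicated.
	func Tokenize(s string) []string {
		bag := append(ngram.Unigram(s), ngram.Bigram(s)...)
		slices.Sort(bag)
		// Compact drops adjacent duplicates (the slice is already sorted) and
		// returns the shortened slice, so the result must be reassigned.
		bag = slices.Compact(bag)
		return bag
	}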