diff --git a/cmd/ir-tfidf/main.go b/cmd/ir-tfidf/main.go
index 5858182..a0ef35c 100644
--- a/cmd/ir-tfidf/main.go
+++ b/cmd/ir-tfidf/main.go
@@ -4,15 +4,15 @@ import (
 	"fmt"
 
 	"github.com/gslin/go-ir-playground/internal/artifact"
-	"github.com/gslin/go-ir-playground/internal/ngram"
+	"github.com/gslin/go-ir-playground/internal/tokenize"
 )
 
 func main() {
 	articles := artifact.Read("data/articles.json")
 
 	for _, article := range articles {
-		title_bag := ngram.Bigram(article.Title)
-		body_bag := ngram.Bigram(article.Body)
+		title_bag := tokenize.Tokenize(article.Title)
+		body_bag := tokenize.Tokenize(article.Body)
 
 		fmt.Printf("title_bag = %v\n", title_bag)
 		fmt.Printf("body_bag = %v\n", body_bag)
diff --git a/internal/ngram/ngram.go b/internal/ngram/ngram.go
index 7199a8f..eec2e3b 100644
--- a/internal/ngram/ngram.go
+++ b/internal/ngram/ngram.go
@@ -21,6 +21,10 @@ func Bigram(s string) []string {
 	return r
 }
 
+func Unigram(s string) []string {
+	return split(s)
+}
+
 func split(s string) []string {
 	bag := make([]string, 0)
 	for _, w := range re1.FindAllStringSubmatch(s, -1) {
diff --git a/internal/tokenize/tokenize.go b/internal/tokenize/tokenize.go
new file mode 100644
index 0000000..e7d2750
--- /dev/null
+++ b/internal/tokenize/tokenize.go
@@ -0,0 +1,14 @@
+package tokenize
+
+import (
+	"slices"
+
+	"github.com/gslin/go-ir-playground/internal/ngram"
+)
+
+func Tokenize(s string) []string {
+	bag := append(ngram.Unigram(s), ngram.Bigram(s)...)
+	slices.Sort(bag)
+	bag = slices.Compact(bag)
+	return bag
+}
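
For reference, below is a minimal, self-contained sketch of the bag that the new tokenize.Tokenize is meant to build: unigrams and bigrams merged into one slice, sorted, and de-duplicated. The lower-case unigrams/bigrams helpers are hypothetical stand-ins that split on whitespace; the repo's actual split() in internal/ngram is regexp-based (re1) and not shown in this diff, so only the combine/sort/compact shape matches.

package main

import (
	"fmt"
	"slices"
	"strings"
)

// unigrams is a stand-in for ngram.Unigram: it splits on whitespace,
// whereas the repo's split() uses a regexp (re1).
func unigrams(s string) []string {
	return strings.Fields(s)
}

// bigrams is a stand-in for ngram.Bigram: it joins each adjacent pair
// of unigrams into one term.
func bigrams(s string) []string {
	u := unigrams(s)
	r := make([]string, 0, len(u))
	for i := 0; i+1 < len(u); i++ {
		r = append(r, u[i]+" "+u[i+1])
	}
	return r
}

// tokenize mirrors the shape of tokenize.Tokenize in the diff: merge
// unigrams and bigrams, sort, then de-duplicate. slices.Compact only
// drops adjacent duplicates and returns the shortened slice, so its
// result has to be reassigned.
func tokenize(s string) []string {
	bag := append(unigrams(s), bigrams(s)...)
	slices.Sort(bag)
	return slices.Compact(bag)
}

func main() {
	fmt.Printf("%q\n", tokenize("to be or not to be"))
	// ["be" "be or" "not" "not to" "or" "or not" "to" "to be"]
}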