Implement tokenize().
This commit is contained in:
@@ -4,15 +4,15 @@ import (
|
||||
"fmt"
|
||||
|
||||
"github.com/gslin/go-ir-playground/internal/artifact"
|
||||
"github.com/gslin/go-ir-playground/internal/ngram"
|
||||
"github.com/gslin/go-ir-playground/internal/tokenize"
|
||||
)
|
||||
|
||||
func main() {
|
||||
articles := artifact.Read("data/articles.json")
|
||||
|
||||
for _, article := range articles {
|
||||
title_bag := ngram.Bigram(article.Title)
|
||||
body_bag := ngram.Bigram(article.Body)
|
||||
title_bag := tokenize.Tokenize(article.Title)
|
||||
body_bag := tokenize.Tokenize(article.Body)
|
||||
|
||||
fmt.Printf("title_bag = %v\n", title_bag)
|
||||
fmt.Printf("body_bag = %v\n", body_bag)
|
||||
|
||||
@@ -21,6 +21,10 @@ func Bigram(s string) []string {
|
||||
return r
|
||||
}
|
||||
|
||||
func Unigram(s string) []string {
|
||||
return split(s)
|
||||
}
|
||||
|
||||
func split(s string) []string {
|
||||
bag := make([]string, 0)
|
||||
for _, w := range re1.FindAllStringSubmatch(s, -1) {
|
||||
|
||||
14
internal/tokenize/tokenize.go
Normal file
14
internal/tokenize/tokenize.go
Normal file
@@ -0,0 +1,14 @@
|
||||
package tokenize
|
||||
|
||||
import (
|
||||
"slices"
|
||||
|
||||
"github.com/gslin/go-ir-playground/internal/ngram"
|
||||
)
|
||||
|
||||
func Tokenize(s string) []string {
|
||||
bag := append(ngram.Unigram(s), ngram.Bigram(s)...)
|
||||
slices.Sort(bag)
|
||||
slices.Compact[[]string, string](bag)
|
||||
return bag
|
||||
}
|
||||
Reference in New Issue
Block a user