Implement tokenize().
This commit is contained in:
@@ -4,15 +4,15 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"github.com/gslin/go-ir-playground/internal/artifact"
|
"github.com/gslin/go-ir-playground/internal/artifact"
|
||||||
"github.com/gslin/go-ir-playground/internal/ngram"
|
"github.com/gslin/go-ir-playground/internal/tokenize"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
articles := artifact.Read("data/articles.json")
|
articles := artifact.Read("data/articles.json")
|
||||||
|
|
||||||
for _, article := range articles {
|
for _, article := range articles {
|
||||||
title_bag := ngram.Bigram(article.Title)
|
title_bag := tokenize.Tokenize(article.Title)
|
||||||
body_bag := ngram.Bigram(article.Body)
|
body_bag := tokenize.Tokenize(article.Body)
|
||||||
|
|
||||||
fmt.Printf("title_bag = %v\n", title_bag)
|
fmt.Printf("title_bag = %v\n", title_bag)
|
||||||
fmt.Printf("body_bag = %v\n", body_bag)
|
fmt.Printf("body_bag = %v\n", body_bag)
|
||||||
|
|||||||
@@ -21,6 +21,10 @@ func Bigram(s string) []string {
|
|||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Unigram returns the single-word tokens of s, delegating to the
// package-internal split() helper (regexp-based word extraction).
func Unigram(s string) []string {
	return split(s)
}
|
||||||
|
|
||||||
func split(s string) []string {
|
func split(s string) []string {
|
||||||
bag := make([]string, 0)
|
bag := make([]string, 0)
|
||||||
for _, w := range re1.FindAllStringSubmatch(s, -1) {
|
for _, w := range re1.FindAllStringSubmatch(s, -1) {
|
||||||
|
|||||||
14
internal/tokenize/tokenize.go
Normal file
14
internal/tokenize/tokenize.go
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
package tokenize
|
||||||
|
|
||||||
|
import (
|
||||||
|
"slices"
|
||||||
|
|
||||||
|
"github.com/gslin/go-ir-playground/internal/ngram"
|
||||||
|
)
|
||||||
|
|
||||||
|
func Tokenize(s string) []string {
|
||||||
|
bag := append(ngram.Unigram(s), ngram.Bigram(s)...)
|
||||||
|
slices.Sort(bag)
|
||||||
|
slices.Compact[[]string, string](bag)
|
||||||
|
return bag
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user