cmd/ir-tfidf: build term-frequency and document-frequency tables

Replaces the per-article printing of title/body tokens in main() with three
in-memory tables built from tokenizer.Tokenize(Title + "\n" + Body):

  tokens  article ID -> token sequence
  tf      article ID -> term -> occurrences of the term in that article
  df      term -> number of articles containing the term

TF is tallied from the token sequence returned by the tokenizer, and DF is
incremented once per distinct term per article. No new imports are required.

diff --git a/cmd/ir-tfidf/main.go b/cmd/ir-tfidf/main.go
--- a/cmd/ir-tfidf/main.go
+++ b/cmd/ir-tfidf/main.go
@@ -10,11 +10,32 @@ import (
 func main() {
 	articles := artifact.Read("data/articles.json")
 
-	for _, article := range articles {
-		title_bag := tokenizer.Tokenize(article.Title)
-		body_bag := tokenizer.Tokenize(article.Body)
+	// tokens: article ID -> token sequence of its title and body.
+	tokens := make(map[string][]string)
+	// tf: article ID -> term -> number of occurrences in that article.
+	tf := make(map[string]map[string]int)
+	// df: term -> number of articles that contain the term.
+	df := make(map[string]int)
 
-		fmt.Printf("title_bag = %v\n", title_bag)
-		fmt.Printf("body_bag = %v\n", body_bag)
+	for _, article := range articles {
+		bag := tokenizer.Tokenize(article.Title + "\n" + article.Body)
+		tokens[article.Id] = bag
+
+		// TF: tally the tokens themselves. Substring-counting the raw text
+		// would also match a term inside longer words and would miss any
+		// normalization the tokenizer applies.
+		counts := make(map[string]int)
+		for _, w := range bag {
+			counts[w]++
+		}
+		tf[article.Id] = counts
+
+		// DF: iterate the distinct terms so that a term repeated within one
+		// article still contributes exactly one document.
+		for w := range counts {
+			df[w]++
+		}
 	}
+
+	fmt.Println("TF & DF Built")
 }