1package tokenize 2 3import ( 4 "encoding/json" 5 "fmt" 6 "path/filepath" 7 "testing" 8 9 "github.com/jdkato/prose/internal/util" 10) 11 12type goldenRule struct { 13 Name string 14 Input string 15 Output []string 16} 17 18func TestPragmaticRulesEn(t *testing.T) { testLang("en", t) } 19func TestPragmaticRulesFr(t *testing.T) { testLang("fr", t) } 20func TestPragmaticRulesEs(t *testing.T) { testLang("es", t) } 21 22func BenchmarkPragmaticRulesEn(b *testing.B) { benchmarkLang("en", b) } 23 24func benchmarkLang(lang string, b *testing.B) { 25 tests := make([]goldenRule, 0) 26 f := fmt.Sprintf("golden_rules_%s.json", lang) 27 cases := util.ReadDataFile(filepath.Join(testdata, f)) 28 29 tok, err := NewPragmaticSegmenter(lang) 30 util.CheckError(err) 31 32 util.CheckError(json.Unmarshal(cases, &tests)) 33 for n := 0; n < b.N; n++ { 34 for _, test := range tests { 35 tok.Tokenize(test.Input) 36 } 37 } 38} 39 40func testLang(lang string, t *testing.T) { 41 tests := make([]goldenRule, 0) 42 f := fmt.Sprintf("golden_rules_%s.json", lang) 43 cases := util.ReadDataFile(filepath.Join(testdata, f)) 44 45 tok, err := NewPragmaticSegmenter(lang) 46 util.CheckError(err) 47 48 util.CheckError(json.Unmarshal(cases, &tests)) 49 for _, test := range tests { 50 compare(t, test.Name, test.Input, test.Output, tok) 51 } 52} 53 54func compare(t *testing.T, test, actualText string, expected []string, tok *PragmaticSegmenter) bool { 55 actual := tok.Tokenize(actualText) 56 if len(actual) != len(expected) { 57 t.Log(test) 58 t.Logf("Actual: %v\n", actual) 59 t.Errorf("Actual: %d, Expected: %d\n", len(actual), len(expected)) 60 t.Log("===") 61 return false 62 } 63 for index, sent := range actual { 64 if sent != expected[index] { 65 t.Log(test) 66 t.Errorf("Actual: [%s] Expected: [%s]\n", sent, expected[index]) 67 t.Log("===") 68 return false 69 } 70 } 71 return true 72} 73