1 /*
2  nlp_test.cpp     MindForger application test
3 
4  Copyright (C) 2016-2020 Martin Dvorak <martin.dvorak@mindforger.com>
5 
6  This program is free software; you can redistribute it and/or
7  modify it under the terms of the GNU General Public License
8  as published by the Free Software Foundation; either version 2
9  of the License, or (at your option) any later version.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License
17  along with this program. If not, see <http://www.gnu.org/licenses/>.
18  */
19 
#include <chrono>
#include <cstdio>
#include <iostream>
#include <map>
#include <string>
#include <thread>
#include <vector>
25 
26 #include "../../../src/config/configuration.h"
27 #include "../../../src/mind/mind.h"
28 #include "../../../src/mind/ai/ai.h"
29 #include "../../../src/mind/ai/nlp/stemmer/stemmer.h"
30 #include "../../../src/mind/ai/nlp/string_char_provider.h"
31 #include "../../../src/mind/ai/nlp/note_char_provider.h"
32 #include "../../../src/mind/ai/nlp/markdown_tokenizer.h"
33 #include "../../../src/mind/ai/nlp/lexicon.h"
34 #include "../../../src/mind/ai/nlp/word_frequency_list.h"
35 #include "../../../src/mind/ai/nlp/bag_of_words.h"
36 
37 #include <gtest/gtest.h>
38 
39 extern char* getMindforgerGitHomePath();
40 
41 using namespace std;
42 
// DISABLED test because the 3rd party stemmer has memory leaks
TEST(AiNlpTestCase, DISABLED_Stemmer)
{
    // Smoke test: runs the English stemmer over a handful of sample words and
    // prints every word next to its stem. There are no assertions - the output
    // is meant to be inspected manually.
    m8r::Stemmer englishStemmer{};
    englishStemmer.setLanguage(m8r::Stemmer::ENGLISH);

    vector<string> samples{
        "informational", "Eclipse", "Martin", "AI", "machine", "learning"
    };

    for(size_t i=0; i<samples.size(); i++) {
        string stemmed = englishStemmer.stem(samples[i]);
        cout << "Before: " << samples[i] << endl;
        cout << "After : " << stemmed << endl;
    }
}
66 
TEST(AiNlpTestCase, Lexicon)
{
    // Checks m8r::Lexicon bookkeeping: adding the same word repeatedly - as a
    // string literal, as a string and as a pointer to a string - must raise
    // its frequency while the lexicon size stays at 1. Afterwards more words
    // are added and the recalculated weights are verified.
    m8r::Lexicon lex{};
    string word{"a5"};

    // word passed as string literal
    lex.add("a5");
    ASSERT_EQ(1, lex.size());
    ASSERT_EQ(1, lex.get("a5")->frequency);

    lex.add("a5");
    ASSERT_EQ(1, lex.size());
    ASSERT_EQ(2, lex.get("a5")->frequency);

    // word passed as string
    lex.add(word);
    ASSERT_EQ(1, lex.size());
    ASSERT_EQ(3, lex.get(word)->frequency);

    // word passed as pointer to string
    lex.add(&word);
    ASSERT_EQ(1, lex.size());
    ASSERT_EQ(4, lex.get(&word)->frequency);

    // adding more words for better weight calculation: frequencies 5/3/2
    for(const char* extra : {"a5", "a3", "a3", "a3", "a2", "a2"}) {
        lex.add(extra);
    }

    // weights
    lex.recalculateWeights();
    lex.print();

    ASSERT_FLOAT_EQ(0.01, lex.get("a5")->weight);
    ASSERT_FLOAT_EQ(0.4, lex.get("a3")->weight);
    ASSERT_FLOAT_EQ(0.6, lex.get("a2")->weight);

    // TODO weights: increase scale
}
106 
// DISABLED test because the 3rd party stemmer has memory leaks
TEST(AiNlpTestCase, DISABLED_BowOutline)
{
    // Tokenizes an in-memory Markdown document to a lexicon and a word
    // frequency list, then registers that list for a dummy outline in a bag
    // of words and checks the resulting sizes.

    // dummy outline the word frequency list gets associated with
    m8r::OutlineType outlineType{m8r::OutlineType::KeyOutline(),nullptr,m8r::Color::RED()};
    m8r::Outline outline{&outlineType};
    outline.setName("Outline Name");

    // the same dummy outline written as Markdown
    string markdown{
        "Outline Name\n"
        "========\n"
        "O text begin [LINK-LABEL](http://link-1.com) text end.\n"
        "\n"
        "First Section\n"
        "-------------\n"
        "N1 text `N1 inline code` N1 text end.\n"
        "\n"
        "## Second Section\n"
        "S2 intro text:\n"
        "```\n"
        "N2 text codeblock.\n"
        "```\n"
        "\n"
        "Note 3\n"
        "-------------\n"
        "N2 text.\n"
        "\n"
    };

    /*
     * STEP: parse O/N to Lexicon and BoW(matrix Things x frequencies)
     */

    m8r::Lexicon lexicon{};
    m8r::CommonWordsBlacklist commonWords{};
    commonWords.addWord("text");
    m8r::MarkdownTokenizer mdTokenizer{lexicon, commonWords};
    m8r::StringCharProvider mdChars{markdown};
    // NOTE(review): the list is never deleted in this test - presumably the
    // bag of words takes ownership in add() below; confirm.
    auto* frequencies = new m8r::WordFrequencyList{&lexicon};
    cout << "Tokenizing MD string to word frequency list..." << endl;
    mdTokenizer.tokenize(mdChars, *frequencies);
    frequencies->sort();

    // assert word frequency list
    frequencies->print();
    ASSERT_EQ(19, frequencies->size());
    // assert lexicon
    lexicon.print();
    ASSERT_EQ(19, lexicon.size());

    /*
     * STEP: build BoW i.e. matrix of Things x frequencies
     */

    m8r::BagOfWords bagOfWords{};
    bagOfWords.add(&outline, frequencies);

    bagOfWords.print();
    ASSERT_EQ(1, bagOfWords.size());
}
167 
168 /*
169  * AA: BoW
170  */
171 
TEST(AiNlpTestCase, Tokenizer)
{
    // Loads the basic test repository, looks up the outline matching
    // "Canonical Message" and streams its first note through
    // m8r::NoteCharProvider, checking the length of the produced text.
    string repositoryPath = string{getMindforgerGitHomePath()} + "/lib/test/resources/basic-repository";

    m8r::Configuration& config = m8r::Configuration::getInstance();
    config.clear();
    config.setConfigFilePath("/tmp/cfg-antc-t.md");
    config.setActiveRepository(
        config.addRepository(
            m8r::RepositoryIndexer::getRepositoryForPath(repositoryPath)));

    m8r::Mind mind(config);
    mind.learn();
    // block until the mind finished switching to thinking
    mind.think().get();

    cout << endl << "Statistics:"
         << endl << "  Outlines: " << mind.remind().getOutlinesCount()
         << endl << "  Bytes   : " << mind.remind().getOutlineMarkdownsSize();

    ASSERT_EQ(3, mind.remind().getOutlinesCount());

    // test N narrowing to string using char provider
    cout << endl << endl << "Testing M NARROWING using tokenizer:" << endl;
    unique_ptr<vector<m8r::Outline*>> matches = mind.findOutlineByNameFts("Canonical Message");
    ASSERT_EQ(1, matches->size());
    m8r::Note* note = matches->at(0)->getNotes()[0];
    cout << "- BEGIN original ---" << endl;
    cout << note->getName() << endl;
    cout << note->getDescriptionAsString() << endl;
    cout << "- END original >>>>> BEGIN char stream ---" << endl;

    // drain the char provider to a single string
    m8r::NoteCharProvider noteChars{note};
    string narrowed{};
    for(; noteChars.hasNext(); ) {
        narrowed += noteChars.next();
    }
    cout << narrowed << endl << "- END char stream --" << endl;
    ASSERT_EQ(146, narrowed.size());
}
204 
// IMPROVE disabled as AA API changed - it will be re-enabled once BoW becomes the main AA algorithm again
TEST(AiNlpTestCase, DISABLED_AaRepositoryBow)
{
    // Loads the universe test repository with the BoW association assessment
    // algorithm configured, lets the mind think and then asks for the notes
    // associated with the "Albert Einstein" note of the first outline.
    string repositoryPath{"/lib/test/resources/universe-repository"};
    repositoryPath.insert(0, getMindforgerGitHomePath());
    m8r::Configuration& config = m8r::Configuration::getInstance();
    config.clear();
    config.setConfigFilePath("/tmp/cfg-antc-r.md");
    config.setActiveRepository(config.addRepository(m8r::RepositoryIndexer::getRepositoryForPath(repositoryPath)));
    config.setAaAlgorithm(m8r::Configuration::AssociationAssessmentAlgorithm::BOW);

    m8r::Mind mind(config);
    bool learned = mind.learn();
    ASSERT_EQ(true, learned);
    cout << "Statistics:" << endl
    << "  Outlines: " << mind.remind().getOutlinesCount() << endl
    << "  Bytes   : " << mind.remind().getOutlineMarkdownsSize() << endl;
    ASSERT_LE(1, mind.remind().getOutlinesCount());

    shared_future<bool> readyToThink = mind.think();
    ASSERT_EQ(true, readyToThink.get()); // blocked
    ASSERT_EQ(m8r::Configuration::MindState::THINKING, config.getMindState());

    /*
     * Tokenize repository > make AI to think > find the most similar Notes pair
     */

    // get the best associations of N w/ given name
    m8r::Note* n=mind.remind().getOutlines()[0]->getNoteByName("Albert Einstein");
    ASSERT_NE(nullptr, n);

    m8r::AssociatedNotes associations{m8r::ResourceType::NOTE, n};
    cout << "BEFORE =========" << endl;
    auto lbFuture = mind.getAssociatedNotes(associations);
    lbFuture.get();  // blocked

    // Wait for the associations to show up, but only up to a deadline: an
    // unbounded busy loop would hang the test run forever (and flood stdout)
    // if no association is ever delivered. On timeout the size assertion
    // below fails instead. The 30s limit is an arbitrary, generous bound.
    const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(30);
    while(associations.getAssociations()->size() == 0
          && std::chrono::steady_clock::now() < deadline)
    {
        cout << "." << flush;
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
    cout << "AFTER =========" << endl;

    ASSERT_EQ(7, associations.getAssociations()->size());

    m8r::Ai::print(n,*associations.getAssociations());
}
247 
// IMPROVE disabled as AA API changed - it will be re-enabled once BoW becomes the main AA algorithm again
TEST(AiNlpTestCase, DISABLED_AaUniverseBow)
{
    // Loads the AA test repository with the BoW association assessment
    // algorithm configured, lets the mind think and verifies the leaderboard
    // of notes associated with the first note of the outline whose name does
    // not contain "Alternative".
    string repositoryPath{"/lib/test/resources/aa-repository"};
    repositoryPath.insert(0, getMindforgerGitHomePath());
    m8r::Configuration& config = m8r::Configuration::getInstance();
    config.clear();
    config.setConfigFilePath("/tmp/cfg-antc-aub.md");
    config.setActiveRepository(config.addRepository(m8r::RepositoryIndexer::getRepositoryForPath(repositoryPath)));
    config.setAaAlgorithm(m8r::Configuration::AssociationAssessmentAlgorithm::BOW);

    m8r::Mind mind(config);
    bool learned = mind.learn();
    ASSERT_EQ(true, learned);
    cout << "Statistics:" << endl
    << "  Outlines: " << mind.remind().getOutlinesCount() << endl
    << "  Bytes   : " << mind.remind().getOutlineMarkdownsSize() << endl;
    ASSERT_LE(1, mind.remind().getOutlinesCount());

    shared_future<bool> readyToThink = mind.think();
    ASSERT_EQ(true, readyToThink.get()); // blocked
    ASSERT_EQ(m8r::Configuration::MindState::THINKING, config.getMindState());

    // The outline selection below may index outlines [1] and the leaderboard
    // asserts expect notes from two different outlines, so make sure that two
    // outlines were loaded before indexing.
    ASSERT_LE(2, mind.remind().getOutlinesCount());

    // pick the outline which is NOT the "Alternative" one
    m8r::Outline* u;
    if(mind.remind().getOutlines()[0]->getName().find("Alternative") != string::npos) {
        u = mind.remind().getOutlines()[1];
    } else {
        u = mind.remind().getOutlines()[0];
    }

    // get the best associations of 'Albert Einstein'
    m8r::Note* n=u->getNotes()[0];
    m8r::AssociatedNotes associations{m8r::ResourceType::NOTE, n};
    auto lbFuture = mind.getAssociatedNotes(associations);
    lbFuture.get(); // blocked
    // print only after the future completed, so that the associations are not
    // read while they may still be being computed
    m8r::Ai::print(n,*associations.getAssociations());
    vector<pair<m8r::Note*,float>>* leaderboard = associations.getAssociations();

    // asserts
    ASSERT_EQ(9, leaderboard->size());
    ASSERT_EQ("Same Albert Einstein", (*leaderboard)[0].first->getName());
    ASSERT_EQ("Universe", (*leaderboard)[0].first->getOutline()->getName());
    ASSERT_FLOAT_EQ(0.9, (*leaderboard)[0].second);
    ASSERT_EQ("Same Albert Einstein", (*leaderboard)[1].first->getName());
    ASSERT_EQ("Alternative Universe", (*leaderboard)[1].first->getOutline()->getName());
}
296 
297 /*
298  * AA: FTS
299  */
300 
TEST(AiNlpTestCase, AaRepositoryFts)
{
    // TODO AaRepositoryFts - placeholder: the test body is not implemented
    // yet, so this test currently passes without checking anything
}
305 
TEST(AiNlpTestCase, AaUniverseFts)
{
    // TODO AaUniverseFts - placeholder: the test body is not implemented
    // yet, so this test currently passes without checking anything
}
310