1 /*
2 nlp_test.cpp MindForger application test
3
4 Copyright (C) 2016-2020 Martin Dvorak <martin.dvorak@mindforger.com>
5
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License
8 as published by the Free Software Foundation; either version 2
9 of the License, or (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <iostream>
21 #include <cstdio>
22 #include <vector>
23 #include <string>
24 #include <map>
25
26 #include "../../../src/config/configuration.h"
27 #include "../../../src/mind/mind.h"
28 #include "../../../src/mind/ai/ai.h"
29 #include "../../../src/mind/ai/nlp/stemmer/stemmer.h"
30 #include "../../../src/mind/ai/nlp/string_char_provider.h"
31 #include "../../../src/mind/ai/nlp/note_char_provider.h"
32 #include "../../../src/mind/ai/nlp/markdown_tokenizer.h"
33 #include "../../../src/mind/ai/nlp/lexicon.h"
34 #include "../../../src/mind/ai/nlp/word_frequency_list.h"
35 #include "../../../src/mind/ai/nlp/bag_of_words.h"
36
37 #include <gtest/gtest.h>
38
39 extern char* getMindforgerGitHomePath();
40
41 using namespace std;
42
43 // DISABLED test because 3rd party stemmer has memory leaks()
TEST(AiNlpTestCase,DISABLED_Stemmer)44 TEST(AiNlpTestCase, DISABLED_Stemmer)
45 {
46 m8r::Stemmer stemmer{};
47
48 const char *a[] = {
49 "informational",
50 "Eclipse",
51 "Martin",
52 "AI",
53 "machine",
54 "learning"
55 };
56 vector<string> words(a, end(a));
57
58 stemmer.setLanguage(m8r::Stemmer::ENGLISH);
59
60 for(string& w:words) {
61 string sW = stemmer.stem(w);
62 cout << "Before: " << w << endl;
63 cout << "After : " << sW << endl;
64 }
65 }
66
// Checks Lexicon bookkeeping: adding the same word repeatedly (as a literal,
// as a string and as a pointer to a string) keeps a single entry whose
// frequency grows by one per addition. Afterwards more words are added and
// the recalculated weights are asserted.
TEST(AiNlpTestCase, Lexicon)
{
    m8r::Lexicon lexicon{};

    // first occurrence of the word creates the entry
    lexicon.add("a5");
    ASSERT_EQ(1, lexicon.size());
    ASSERT_EQ(1, lexicon.get("a5")->frequency);

    // second occurrence only increases the frequency
    lexicon.add("a5");
    ASSERT_EQ(1, lexicon.size());
    ASSERT_EQ(2, lexicon.get("a5")->frequency);

    // the same word passed as string and as pointer to string
    string word{"a5"};
    lexicon.add(word);
    ASSERT_EQ(1, lexicon.size());
    ASSERT_EQ(3, lexicon.get(word)->frequency);
    lexicon.add(&word);
    ASSERT_EQ(1, lexicon.size());
    ASSERT_EQ(4, lexicon.get(&word)->frequency);

    // more words for better weight calculation - final frequencies are 5/3/2
    for(const char* extra : {"a5", "a3", "a3", "a3", "a2", "a2"}) {
        lexicon.add(extra);
    }

    // weights
    lexicon.recalculateWeights();
    lexicon.print();

    ASSERT_FLOAT_EQ(0.01, lexicon.get("a5")->weight);
    ASSERT_FLOAT_EQ(0.4, lexicon.get("a3")->weight);
    ASSERT_FLOAT_EQ(0.6, lexicon.get("a2")->weight);

    // TODO weights: increase scale
}
106
107 // DISABLED test because 3rd party stemmer has memory leaks()
TEST(AiNlpTestCase,DISABLED_BowOutline)108 TEST(AiNlpTestCase, DISABLED_BowOutline)
109 {
110 // FOO outline
111 m8r::OutlineType oType{m8r::OutlineType::KeyOutline(),nullptr,m8r::Color::RED()};
112 m8r::Outline o{&oType};
113 o.setName("Outline Name");
114 // FOO outline as MD
115 string markdown;
116 markdown.assign(
117 "Outline Name\n"
118 "========\n"
119 "O text begin [LINK-LABEL](http://link-1.com) text end.\n"
120 "\n"
121 "First Section\n"
122 "-------------\n"
123 "N1 text `N1 inline code` N1 text end.\n"
124 "\n"
125 "## Second Section\n"
126 "S2 intro text:\n"
127 "```\n"
128 "N2 text codeblock.\n"
129 "```\n"
130 "\n"
131 "Note 3\n"
132 "-------------\n"
133 "N2 text.\n"
134 "\n");
135
136 /*
137 * STEP: parse O/N to Lexicon and BoW(matrix Things x frequencies)
138 */
139
140 m8r::Lexicon lexicon{};
141 m8r::CommonWordsBlacklist wordBlaclist{};
142 wordBlaclist.addWord("text");
143 m8r::MarkdownTokenizer tokenizer{lexicon, wordBlaclist};
144 m8r::StringCharProvider chars{markdown};
145 m8r::WordFrequencyList* wfl = new m8r::WordFrequencyList{&lexicon};
146 cout << "Tokenizing MD string to word frequency list..." << endl;
147 tokenizer.tokenize(chars, *wfl);
148 wfl->sort();
149
150 // assert wfl
151 wfl->print();
152 ASSERT_EQ(19, wfl->size());
153 // assert lexicon
154 lexicon.print();
155 ASSERT_EQ(19, lexicon.size());
156
157 /*
158 * STEP: build BoW i.e. matrix of Things x frequencies
159 */
160
161 m8r::BagOfWords bow{};
162 bow.add(&o, wfl);
163
164 bow.print();
165 ASSERT_EQ(1, bow.size());
166 }
167
168 /*
169 * AA: BoW
170 */
171
// Loads the basic test repository, asserts it contains 3 outlines, finds the
// outline matching "Canonical Message" and asserts that NoteCharProvider
// streams its first note as 146 characters.
TEST(AiNlpTestCase, Tokenizer)
{
    // absolute path = Git home + repository location within the source tree
    string repositoryPath(getMindforgerGitHomePath());
    repositoryPath += "/lib/test/resources/basic-repository";

    m8r::Configuration& config = m8r::Configuration::getInstance();
    config.clear();
    config.setConfigFilePath("/tmp/cfg-antc-t.md");
    config.setActiveRepository(
        config.addRepository(m8r::RepositoryIndexer::getRepositoryForPath(repositoryPath)));

    m8r::Mind mind(config);
    mind.learn();
    // wait until the mind finished switching to thinking
    mind.think().get();

    cout << endl << "Statistics:"
         << endl << " Outlines: " << mind.remind().getOutlinesCount()
         << endl << " Bytes : " << mind.remind().getOutlineMarkdownsSize();

    ASSERT_EQ(3, mind.remind().getOutlinesCount());

    // test N narrowing to string using char provider
    cout << endl << endl << "Testing M NARROWING using tokenizer:" << endl;
    unique_ptr<vector<m8r::Outline*>> outlines = mind.findOutlineByNameFts("Canonical Message");
    ASSERT_EQ(1, outlines->size());

    m8r::Note* note = outlines->at(0)->getNotes()[0];
    cout << "- BEGIN original ---" << endl;
    cout << note->getName() << endl;
    cout << note->getDescriptionAsString() << endl;
    cout << "- END original >>>>> BEGIN char stream ---" << endl;

    // drain the provider character by character into a single string
    m8r::NoteCharProvider noteChars{note};
    string narrowed{};
    while(noteChars.hasNext()) {
        narrowed += noteChars.next();
    }

    cout << narrowed << endl;
    cout << "- END char stream --" << endl;
    ASSERT_EQ(146, narrowed.size());
}
204
205 // IMPROVE disabled as AA API changed - it will be re-enable once BoW becomes main AA algorithm again
TEST(AiNlpTestCase,DISABLED_AaRepositoryBow)206 TEST(AiNlpTestCase, DISABLED_AaRepositoryBow)
207 {
208 string repositoryPath{"/lib/test/resources/universe-repository"};
209 repositoryPath.insert(0, getMindforgerGitHomePath());
210 m8r::Configuration& config = m8r::Configuration::getInstance();
211 config.clear();
212 config.setConfigFilePath("/tmp/cfg-antc-r.md");
213 config.setActiveRepository(config.addRepository(m8r::RepositoryIndexer::getRepositoryForPath(repositoryPath)));
214 config.setAaAlgorithm(m8r::Configuration::AssociationAssessmentAlgorithm::BOW);
215
216 m8r::Mind mind(config);
217 bool learned = mind.learn();
218 ASSERT_EQ(true, learned);
219 cout << "Statistics:" << endl
220 << " Outlines: " << mind.remind().getOutlinesCount() << endl
221 << " Bytes : " << mind.remind().getOutlineMarkdownsSize() << endl;
222 ASSERT_LE(1, mind.remind().getOutlinesCount());
223
224 shared_future<bool> readyToThink = mind.think();
225 ASSERT_EQ(true, readyToThink.get()); // blocked
226 ASSERT_EQ(m8r::Configuration::MindState::THINKING, config.getMindState());
227
228 /*
229 * Tokenize repository > make AI to think > find the most similar Notes pair
230 */
231
232 // get the best associations of N w/ given name
233 m8r::Note* n=mind.remind().getOutlines()[0]->getNoteByName("Albert Einstein");
234 ASSERT_NE(nullptr, n);
235
236 m8r::AssociatedNotes associations{m8r::ResourceType::NOTE, n};
237 cout << "BEFORE =========" << endl;
238 auto lbFuture = mind.getAssociatedNotes(associations);
239 lbFuture.get(); // blocked
240 while(!associations.getAssociations()->size()) cout << ".";
241 cout << "AFTER =========" << endl;
242
243 ASSERT_EQ(7, associations.getAssociations()->size());
244
245 m8r::Ai::print(n,*associations.getAssociations());
246 }
247
248 // IMPROVE disabled as AA API changed - it will be re-enable once BoW becomes main AA algorithm again
TEST(AiNlpTestCase,DISABLED_AaUniverseBow)249 TEST(AiNlpTestCase, DISABLED_AaUniverseBow)
250 {
251 string repositoryPath{"/lib/test/resources/aa-repository"};
252 repositoryPath.insert(0, getMindforgerGitHomePath());
253 m8r::Configuration& config = m8r::Configuration::getInstance();
254 config.clear();
255 config.setConfigFilePath("/tmp/cfg-antc-aub.md");
256 config.setActiveRepository(config.addRepository(m8r::RepositoryIndexer::getRepositoryForPath(repositoryPath)));
257 config.setAaAlgorithm(m8r::Configuration::AssociationAssessmentAlgorithm::BOW);
258
259 m8r::Mind mind(config);
260 bool learned = mind.learn();
261 ASSERT_EQ(true, learned);
262 cout << "Statistics:" << endl
263 << " Outlines: " << mind.remind().getOutlinesCount() << endl
264 << " Bytes : " << mind.remind().getOutlineMarkdownsSize() << endl;
265 ASSERT_LE(1, mind.remind().getOutlinesCount());
266
267 shared_future<bool> readyToThink = mind.think();
268 ASSERT_EQ(true, readyToThink.get()); // blocked
269 ASSERT_EQ(m8r::Configuration::MindState::THINKING, config.getMindState());
270
271 // assert associations
272 m8r::Outline* u;
273 if(mind.remind().getOutlines()[0]->getName().find("Alternative") != string::npos) {
274 u = mind.remind().getOutlines()[1];
275 } else {
276 u = mind.remind().getOutlines()[0];
277 }
278
279 // get the best associations of 'Albert Einstein'
280 m8r::Note* n=u->getNotes()[0];
281 UNUSED_ARG(n);
282 m8r::AssociatedNotes associations{m8r::ResourceType::NOTE, n};
283 auto lbFuture = mind.getAssociatedNotes(associations);
284 m8r::Ai::print(n,*associations.getAssociations());
285 lbFuture.get(); // blocked
286 vector<pair<m8r::Note*,float>>* leaderboard = associations.getAssociations();
287
288 // asserts
289 ASSERT_EQ(9, leaderboard->size());
290 ASSERT_EQ("Same Albert Einstein", (*leaderboard)[0].first->getName());
291 ASSERT_EQ("Universe", (*leaderboard)[0].first->getOutline()->getName());
292 ASSERT_FLOAT_EQ(0.9, (*leaderboard)[0].second);
293 ASSERT_EQ("Same Albert Einstein", (*leaderboard)[1].first->getName());
294 ASSERT_EQ("Alternative Universe", (*leaderboard)[1].first->getOutline()->getName());
295 }
296
297 /*
298 * AA: FTS
299 */
300
// Placeholder test with an empty body - it performs no checks yet.
TEST(AiNlpTestCase, AaRepositoryFts)
{
    // TODO AaRepositoryFts
}
305
// Placeholder test with an empty body - it performs no checks yet.
TEST(AiNlpTestCase, AaUniverseFts)
{
    // TODO AaUniverseFts
}
310