<?php
// (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
//
// All Rights Reserved. See copyright.txt for details and a complete list of authors.
// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
// $Id$

/**
 * Tests for Multilingual_Aligner_SentenceSegmentor::segment().
 *
 * Each "internal" test feeds one piece of text to the segmentor and compares
 * the returned list of segments against a hand-written expectation. In every
 * expectation below, the separating whitespace stays attached to one of the
 * neighbouring segments (e.g. " world." keeps its leading space).
 *
 * @group unit
 *
 */

class Multilingual_Aligner_SentenceSegmentorTest extends TikiTestCase
{

    ////////////////////////////////////////////////////////////////
    // Documentation tests
    //    These tests illustrate how to use this class.
    ////////////////////////////////////////////////////////////////

    /**
     * Shows that the segmentor is built without constructor arguments.
     *
     * @group multilingual
     */
    public function test_This_is_how_you_create_a_SentenceSegmentor()
    {
        $segmentor = new Multilingual_Aligner_SentenceSegmentor();

        // Without an assertion PHPUnit reports this test as risky.
        $this->assertInstanceOf(
            'Multilingual_Aligner_SentenceSegmentor',
            $segmentor,
            "Could not create a SentenceSegmentor."
        );
    }

    /**
     * Shows that segment() takes a string and hands back the list of segments.
     *
     * @group multilingual
     */
    public function test_this_is_how_you_segment_text_into_sentences()
    {
        $segmentor = new Multilingual_Aligner_SentenceSegmentor();
        $text = "hello. world";
        $sentences = $segmentor->segment($text);

        // Only the return type is checked here; the exact segmentation is
        // covered by the internal tests below.
        $this->assertTrue(
            is_array($sentences),
            "segment() should return the sentences as an array."
        );
    }

    ////////////////////////////////////////////////////////////////
    // Internal tests
    //    These tests check the internal workings of the class.
    ////////////////////////////////////////////////////////////////


    /**
     * A period ends a sentence.
     *
     * @group multilingual
     */
    public function test_segmentation_deals_with_period()
    {
        $text = "hello brand new. world.";
        $expSentences = ["hello brand new.", " world."];
        $this->do_test_basic_segmentation(
            $text,
            $expSentences,
            "Segmentation did not deal properly with separation with period."
        );
    }

    /**
     * A question mark ends a sentence.
     *
     * @group multilingual
     */
    public function test_segmentation_deals_with_question_mark()
    {
        $text = "hello? Anybody home?";
        $expSentences = ["hello?", " Anybody home?"];
        $this->do_test_basic_segmentation(
            $text,
            $expSentences,
            "Segmentation did not deal properly with separation with question mark."
        );
    }

    /**
     * A run of question marks is kept together as one sentence terminator.
     *
     * @group multilingual
     */
    public function test_segmentation_deals_with_several_question_marks()
    {
        $text = "hello???? Anybody home?";
        $expSentences = ["hello????", " Anybody home?"];
        $this->do_test_basic_segmentation(
            $text,
            $expSentences,
            "Segmentation did not deal properly with separation with several question marks."
        );
    }

    /**
     * An exclamation mark ends a sentence.
     *
     * @group multilingual
     */
    public function test_segmentation_deals_with_exclamation_mark()
    {
        $text = "hello! Anybody home!";
        $expSentences = ["hello!", " Anybody home!"];
        $this->do_test_basic_segmentation(
            $text,
            $expSentences,
            "Segmentation did not deal properly with separation with exclamation mark."
        );
    }


    /**
     * A mixed run such as "?!?" is kept together as one sentence terminator.
     *
     * @group multilingual
     */
    public function test_segmentation_deals_with_mix_of_exclamation_and_question_marks()
    {
        $text = "hello?!? Anybody home!";
        $expSentences = ["hello?!?", " Anybody home!"];

        $this->do_test_basic_segmentation(
            $text,
            $expSentences,
            "Segmentation did not deal properly with separation with a mix of exclamation and question marks."
        );
    }


    /**
     * An empty string yields an empty list of sentences.
     *
     * @group multilingual
     */
    public function test_segmentation_deals_with_empty_string()
    {
        $text = "";
        $expSentences = [];
        $this->do_test_basic_segmentation(
            $text,
            $expSentences,
            "Segmentation did not deal properly with empty string."
        );
    }

    /**
     * A blank line (wiki paragraph break) ends a sentence even when the
     * sentence has no closing punctuation; a single newline does not.
     *
     * @group multilingual
     */
    public function test_segmentation_deals_with_wiki_paragraph_break()
    {
        $text = "This sentence ends with a period and a newline.\n" .
            "This sentence has no period, but ends with a wiki paragraph break\n\n" .
            "This is the start of a new paragraph.";

        $expSentences = [
            "This sentence ends with a period and a newline.",
            "\nThis sentence has no period, but ends with a wiki paragraph break\n\n",
            "This is the start of a new paragraph."
        ];

        $this->do_test_basic_segmentation(
            $text,
            $expSentences,
            "Segmentation did not deal properly with wiki paragraph break."
        );
    }

    /**
     * Each wiki bullet line ("*", "**") starts a new segment. The text after
     * the last bullet stays attached to it because no bullet marker or
     * sentence terminator separates them.
     *
     * @group multilingual
     */
    public function test_segmentation_deals_with_bullet_lists()
    {
        $text = "This sentence precedes a bullet list.\n" .
            "* Bullet 1\n" .
            "** Bullet 1-1\n" .
            "* Bullet 2\n" .
            "After bullet list";

        $expSentences = [
            "This sentence precedes a bullet list.",
            "\n",
            "* Bullet 1\n",
            "** Bullet 1-1\n",
            "* Bullet 2\nAfter bullet list"];

        $this->do_test_basic_segmentation(
            $text,
            $expSentences,
            "Segmentation did not deal properly with bullet list."
        );
    }

    ////////////////////////////////////////////////////////////////
    // Helper methods
    ////////////////////////////////////////////////////////////////

    /**
     * Segments $text with a fresh segmentor and asserts that the result
     * equals $expSentences.
     *
     * @param string   $text         Text handed to segment().
     * @param string[] $expSentences Segments expected back, in order.
     * @param string   $message      Scenario-specific text that prefixes the
     *                               failure message.
     */
    public function do_test_basic_segmentation($text, $expSentences, $message)
    {
        $segmentor = new Multilingual_Aligner_SentenceSegmentor();
        $sentences = $segmentor->segment($text);

        // Flattened copies of both lists, used only in the failure message.
        $got_sentences_as_string = implode(', ', $sentences);
        $exp_sentences_as_string = implode(', ', $expSentences);

        $this->assertEquals(
            $expSentences,
            $sentences,
            $message . "\n" .
            "Segmented sentences differed from expected.\n" .
            "Expected Sentences: $exp_sentences_as_string\n" .
            "Got Sentences: $got_sentences_as_string\n"
        );
    }
}