1<?php
2// (c) Copyright by authors of the Tiki Wiki CMS Groupware Project
3//
4// All Rights Reserved. See copyright.txt for details and a complete list of authors.
5// Licensed under the GNU LESSER GENERAL PUBLIC LICENSE. See license.txt for details.
6// $Id$
7
/**
 * Tests for Multilingual_Aligner_SentenceSegmentor, which splits a text
 * into an ordered list of sentence strings.
 *
 * @group unit
 *
 */

class Multilingual_Aligner_SentenceSegmentorTest extends TikiTestCase
{

	////////////////////////////////////////////////////////////////
	// Documentation tests
	//    These tests illustrate how to use this class.
	////////////////////////////////////////////////////////////////

	/**
	 * A segmentor is created with an argument-less constructor.
	 *
	 * @group multilingual
	 */
	public function test_This_is_how_you_create_a_SentenceSegmentor()
	{
		$segmentor = new Multilingual_Aligner_SentenceSegmentor();

		// Explicit assertion so the test verifies something and is not flagged as risky.
		$this->assertInstanceOf(
			'Multilingual_Aligner_SentenceSegmentor',
			$segmentor,
			"Could not create a sentence segmentor."
		);
	}

	/**
	 * Text is segmented by passing it to segment(), which hands back the sentences.
	 *
	 * @group multilingual
	 */
	public function test_this_is_how_you_segment_text_into_sentences()
	{
		$segmentor = new Multilingual_Aligner_SentenceSegmentor();
		$text = "hello. world";
		$sentences = $segmentor->segment($text);

		// Only the general shape of the result is checked here; the exact
		// segmentation rules are covered by the internal tests below.
		$this->assertTrue(
			is_array($sentences),
			"Segmenting a text should produce an array of sentences."
		);
		$this->assertNotEmpty(
			$sentences,
			"Segmenting a non-empty text should produce at least one sentence."
		);
	}

	////////////////////////////////////////////////////////////////
	// Internal tests
	//    These tests check the internal workings of the class.
	////////////////////////////////////////////////////////////////


	/**
	 * A period ends a sentence; the following whitespace starts the next one.
	 *
	 * @group multilingual
	 */
	public function test_segmentation_deals_with_period()
	{
		$text = "hello brand new. world.";
		$expSentences = ["hello brand new.", " world."];
		$this->do_test_basic_segmentation(
			$text,
			$expSentences,
			"Segmentation did not deal properly with separation with period."
		);
	}

	/**
	 * A question mark ends a sentence.
	 *
	 * @group multilingual
	 */
	public function test_segmentation_deals_with_question_mark()
	{
		$text = "hello? Anybody home?";
		$expSentences = ["hello?", " Anybody home?"];
		$this->do_test_basic_segmentation(
			$text,
			$expSentences,
			"Segmentation did not deal properly with separation with question mark."
		);
	}

	/**
	 * A run of question marks is kept together as a single sentence terminator.
	 *
	 * @group multilingual
	 */
	public function test_segmentation_deals_with_several_question_marks()
	{
		$text = "hello???? Anybody home?";
		$expSentences = ["hello????", " Anybody home?"];
		$this->do_test_basic_segmentation(
			$text,
			$expSentences,
			"Segmentation did not deal properly with separation with several question marks."
		);
	}

	/**
	 * An exclamation mark ends a sentence.
	 *
	 * @group multilingual
	 */
	public function test_segmentation_deals_with_exclamation_mark()
	{
		$text = "hello! Anybody home!";
		$expSentences = ["hello!", " Anybody home!"];
		$this->do_test_basic_segmentation(
			$text,
			$expSentences,
			"Segmentation did not deal properly with separation with exclamation mark."
		);
	}


	/**
	 * A mixed run of '?' and '!' is kept together as a single sentence terminator.
	 *
	 * @group multilingual
	 */
	public function test_segmentation_deals_with_mix_of_exclamation_and_question_marks()
	{
		$text = "hello?!? Anybody home!";
		$expSentences = ["hello?!?", " Anybody home!"];

		$this->do_test_basic_segmentation(
			$text,
			$expSentences,
			"Segmentation did not deal properly with separation with a mix of exclamation and question marks."
		);
	}


	/**
	 * An empty text yields no sentences at all.
	 *
	 * @group multilingual
	 */
	public function test_segmentation_deals_with_empty_string()
	{
		$text = "";
		$expSentences = [];
		$this->do_test_basic_segmentation(
			$text,
			$expSentences,
			"Segmentation did not deal properly with empty string."
		);
	}

	/**
	 * A blank line (wiki paragraph break) ends a sentence even without punctuation,
	 * whereas a single newline does not.
	 *
	 * @group multilingual
	 */
	public function test_segmentation_deals_with_wiki_paragraph_break()
	{
		$text = "This sentence ends with a period and a newline.\n" .
						"This sentence has no period, but ends with a wiki paragraph break\n\n" .
						"This is the start of a new paragraph.";

		$expSentences = [
						"This sentence ends with a period and a newline.",
						"\nThis sentence has no period, but ends with a wiki paragraph break\n\n",
						"This is the start of a new paragraph."
		];

		$this->do_test_basic_segmentation(
			$text,
			$expSentences,
			"Segmentation did not deal properly with wiki paragraph break."
		);
	}

	/**
	 * Each wiki bullet item becomes its own segment; text following the last
	 * bullet on a new line stays attached to that bullet.
	 *
	 * @group multilingual
	 */
	public function test_segmentation_deals_with_bullet_lists()
	{
		$text = "This sentence precedes a bullet list.\n" .
					"* Bullet 1\n" .
					"** Bullet 1-1\n" .
					"* Bullet 2\n" .
					"After bullet list";

		$expSentences = [
					"This sentence precedes a bullet list.",
					"\n",
					"* Bullet 1\n",
					"** Bullet 1-1\n",
					"* Bullet 2\nAfter bullet list"];

		$this->do_test_basic_segmentation(
			$text,
			$expSentences,
			"Segmentation did not deal properly with bullet list."
		);
	}

	////////////////////////////////////////////////////////////////
	// Helper methods
	////////////////////////////////////////////////////////////////

	/**
	 * Segments $text with a fresh segmentor and asserts that the result
	 * equals $expSentences.
	 *
	 * @param string   $text         Text handed to the segmentor.
	 * @param string[] $expSentences Sentences expected back, in order.
	 * @param string   $message      Test-specific message shown first on failure.
	 *
	 * @return void
	 */
	public function do_test_basic_segmentation($text, $expSentences, $message)
	{
		$segmentor = new Multilingual_Aligner_SentenceSegmentor();
		$sentences = $segmentor->segment($text);

		// Segments may contain newlines, leading blanks and commas, so they are
		// JSON-encoded for the diagnostic: every segment is quoted and control
		// characters are escaped, which keeps segment boundaries unambiguous.
		$exp_sentences_as_string = json_encode($expSentences);
		$got_sentences_as_string = json_encode($sentences);

		$this->assertEquals(
			$expSentences,
			$sentences,
			$message . "\n" .
			"Segmented sentences differed from expected.\n" .
			"Expected Sentences: $exp_sentences_as_string\n" .
			"Got      Sentences: $got_sentences_as_string\n"
		);
	}
}
200