1<?php
2/**
3 * ----------------------------------------------------------------------
4 *
5 * Copyright (c) 2006-2013 Khaled Al-Sham'aa.
6 *
7 * http://www.ar-php.org
8 *
9 * PHP Version 5
10 *
11 * ----------------------------------------------------------------------
12 *
13 * LICENSE
14 *
15 * This program is open source product; you can redistribute it and/or
16 * modify it under the terms of the GNU Lesser General Public License (LGPL)
17 * as published by the Free Software Foundation; either version 3
18 * of the License, or (at your option) any later version.
19 *
20 * This program is distributed in the hope that it will be useful,
21 * but WITHOUT ANY WARRANTY; without even the implied warranty of
22 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23 * GNU Lesser General Public License for more details.
24 *
25 * You should have received a copy of the GNU Lesser General Public License
26 * along with this program.  If not, see <http://www.gnu.org/licenses/lgpl.txt>.
27 *
28 * ----------------------------------------------------------------------
29 *
30 * Class Name: Arabic Auto Summarize Class
31 *
32 * Filename: AutoSummarize.php
33 *
34 * Original Author(s): Khaled Al-Sham'aa <khaled@ar-php.org>
35 *
36 * Purpose: Automatic keyphrase extraction to provide a quick mini-summary
37 *          for a long Arabic document.
38 *
39 * ----------------------------------------------------------------------
40 *
41 * Arabic Auto Summarize
42 *
43 * This class identifies the key points in an Arabic document for you to share with
44 * others or quickly scan. The class determines key points by analyzing an Arabic
45 * document and assigning a score to each sentence. Sentences that contain words
46 * used frequently in the document are given a higher score. You can then choose a
47 * percentage of the highest-scoring sentences to display in the summary.
48 * "ArAutoSummarize" class works best on well-structured documents such as reports,
49 * articles, and scientific papers.
50 *
51 * "ArAutoSummarize" class cuts wordy copy to the bone by counting words and ranking
52 * sentences. First, "ArAutoSummarize" class identifies the most common words in the
53 * document and assigns a "score" to each word--the more frequently a word is used,
54 * the higher the score.
55 *
56 * Then, it "averages" each sentence by adding the scores of its words and dividing
57 * the sum by the number of words in the sentence--the higher the average, the
58 * higher the rank of the sentence. "ArAutoSummarize" class can summarize texts to
59 * specific number of sentences or percentage of the original copy.
60 *
 * We use a statistical approach, with some attention apparently paid to:
62 *
63 * - Location: leading sentences of paragraph, title, introduction, and conclusion.
64 * - Fixed phrases: in-text summaries.
65 * - Frequencies of words, phrases, proper names
66 * - Contextual material: query, title, headline, initial paragraph
67 *
68 * The motivation for this class is the range of applications for key phrases:
69 *
 * - Mini-summary: Automatic key phrase extraction can provide a quick mini-summary
 *   for a long document. For example, it could be a feature of a web site; just
 *   click the summarize button when browsing a long web page.
73 *
74 * - Highlights: It can highlight key phrases in a long document, to facilitate
75 *   skimming the document.
76 *
77 * - Author Assistance: Automatic key phrase extraction can help an author or editor
78 *   who wants to supply a list of key phrases for a document. For example, the
79 *   administrator of a web site might want to have a key phrase list at the top of
80 *   each web page. The automatically extracted phrases can be a starting point for
81 *   further manual refinement by the author or editor.
82 *
83 * - Text Compression: On a device with limited display capacity or limited
84 *   bandwidth, key phrases can be a substitute for the full text. For example, an
85 *   email message could be reduced to a set of key phrases for display on a pager;
86 *   a web page could be reduced for display on a portable wireless web browser.
87 *
88 * This list is not intended to be exhaustive, and there may be some overlap in
89 * the items.
90 *
91 * Example:
92 * <code>
93 * include('./I18N/Arabic.php');
94 * $obj = new I18N_Arabic('AutoSummarize');
95 *
96 * $file = 'Examples/Articles/Ajax.txt';
97 * $r = 20;
98 *
99 * // get contents of a file into a string
100 * $fhandle = fopen($file, "r");
101 * $c = fread($fhandle, filesize($file));
102 * fclose($fhandle);
103 *
104 * $k = $obj->getMetaKeywords($c, $r);
105 * echo '<b><font color=#FFFF00>';
106 * echo 'Keywords:</font></b>';
107 * echo '<p dir="rtl" align="justify">';
108 * echo $k . '</p>';
109 *
110 * $s = $obj->doRateSummarize($c, $r);
111 * echo '<b><font color=#FFFF00>';
112 * echo 'Summary:</font></b>';
113 * echo '<p dir="rtl" align="justify">';
114 * echo $s . '</p>';
115 *
116 * echo '<b><font color=#FFFF00>';
117 * echo 'Full Text:</font></b>';
118 * echo '<p><a class=ar_link target=_blank ';
119 * echo 'href='.$file.'>Source File</a></p>';
120 * </code>
121 *
122 * @category  I18N
123 * @package   I18N_Arabic
124 * @author    Khaled Al-Sham'aa <khaled@ar-php.org>
125 * @copyright 2006-2013 Khaled Al-Sham'aa
126 *
127 * @license   LGPL <http://www.gnu.org/licenses/lgpl.txt>
128 * @link      http://www.ar-php.org
129 */
130
131// New in PHP V5.3: Namespaces
132// namespace I18N\Arabic;
133//
134// $obj = new I18N\Arabic\AutoSummarize();
135//
136// use I18N\Arabic;
137// $obj = new Arabic\AutoSummarize();
138//
139// use I18N\Arabic\AutoSummarize as AutoSummarize;
140// $obj = new AutoSummarize();
141
142
143/**
 * This PHP class performs automatic keyphrase extraction to provide a quick
 * mini-summary for a long Arabic document
146 *
147 * @category  I18N
148 * @package   I18N_Arabic
149 * @author    Khaled Al-Sham'aa <khaled@ar-php.org>
150 * @copyright 2006-2013 Khaled Al-Sham'aa
151 *
152 * @license   LGPL <http://www.gnu.org/licenses/lgpl.txt>
153 * @link      http://www.ar-php.org
154 */
155class I18N_Arabic_AutoSummarize
156{
157    private $_normalizeAlef       = array('أ','إ','آ');
158    private $_normalizeDiacritics = array('َ','ً','ُ','ٌ','ِ','ٍ','ْ','ّ');
159
160    private $_commonChars = array('ة','ه','ي','ن','و','ت','ل','ا','س','م',
161                                   'e', 't', 'a', 'o', 'i', 'n', 's');
162
163    private $_separators = array('.',"\n",'،','؛','(','[','{',')',']','}',',',';');
164
165    private $_commonWords    = array();
166    private $_importantWords = array();
167
168    /**
169     * Loads initialize values
170     *
171     * @ignore
172     */
173    public function __construct()
174    {
175        // This common words used in cleanCommon method
176        $words    = file(dirname(__FILE__).'/data/ar-stopwords.txt');
177        $en_words = file(dirname(__FILE__).'/data/en-stopwords.txt');
178
179        $words = array_merge($words, $en_words);
180        $words = array_map('trim', $words);
181
182        $this->_commonWords = $words;
183
184        // This important words used in rankSentences method
185        $words = file(dirname(__FILE__).'/data/important-words.txt');
186        $words = array_map('trim', $words);
187
188        $this->_importantWords = $words;
189    }
190
191    /**
192     * Load enhanced Arabic stop words list
193     *
194     * @return void
195     */
196    public function loadExtra()
197    {
198        $extra_words = file(dirname(__FILE__).'/data/ar-extra-stopwords.txt');
199        $extra_words = array_map('trim', $extra_words);
200
201        $this->_commonWords = array_merge($this->_commonWords, $extra_words);
202    }
203
204    /**
205     * Core summarize function that implement required steps in the algorithm
206     *
207     * @param string  $str      Input Arabic document as a string
208     * @param string  $keywords List of keywords higlited by search process
209     * @param integer $int      Sentences value (see $mode effect also)
210     * @param string  $mode     Mode of sentences count [number|rate]
211     * @param string  $output   Output mode [summary|highlight]
212     * @param string  $style    Name of the CSS class you would like to apply
213     *
214     * @return string Output summary requested
215     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
216     */
217    protected function summarize($str, $keywords, $int, $mode, $output, $style=null)
218    {
219        preg_match_all(
220            "/[^\.\n\،\؛\,\;](.+?)[\.\n\،\؛\,\;]/u",
221            $str,
222            $sentences
223        );
224        $_sentences = $sentences[0];
225
226        if ($mode == 'rate') {
227            $str            = preg_replace("/\s{2,}/u", ' ', $str);
228            $totalChars     = mb_strlen($str);
229            $totalSentences = count($_sentences);
230
231            $maxChars = round($int * $totalChars / 100);
232            $int      = round($int * $totalSentences / 100);
233        } else {
234            $maxChars = 99999;
235        }
236
237        $summary = '';
238
239        $str           = strip_tags($str);
240        $normalizedStr = $this->doNormalize($str);
241        $cleanedStr    = $this->cleanCommon($normalizedStr);
242        $stemStr       = $this->draftStem($cleanedStr);
243
244        preg_match_all(
245            "/[^\.\n\،\؛\,\;](.+?)[\.\n\،\؛\,\;]/u",
246            $stemStr,
247            $sentences
248        );
249        $_stemmedSentences = $sentences[0];
250
251        $wordRanks = $this->rankWords($stemStr);
252
253        if ($keywords) {
254            $keywords = $this->doNormalize($keywords);
255            $keywords = $this->draftStem($keywords);
256            $words    = explode(' ', $keywords);
257
258            foreach ($words as $word) {
259                $wordRanks[$word] = 1000;
260            }
261        }
262
263        $sentencesRanks = $this->rankSentences(
264            $_sentences,
265            $_stemmedSentences,
266            $wordRanks
267        );
268
269        list($sentences, $ranks) = $sentencesRanks;
270
271        $minRank = $this->minAcceptedRank($sentences, $ranks, $int, $maxChars);
272
273        $totalSentences = count($ranks);
274
275        for ($i = 0; $i < $totalSentences; $i++) {
276            if ($sentencesRanks[1][$i] >= $minRank) {
277                if ($output == 'summary') {
278                    $summary .= ' '.$sentencesRanks[0][$i];
279                } else {
280                    $summary .= '<span class="' . $style .'">' .
281                                $sentencesRanks[0][$i] . '</span>';
282                }
283            } else {
284                if ($output == 'highlight') {
285                    $summary .= $sentencesRanks[0][$i];
286                }
287            }
288        }
289
290        if ($output == 'highlight') {
291            $summary = str_replace("\n", '<br />', $summary);
292        }
293
294        return $summary;
295    }
296
297    /**
298     * Summarize input Arabic string (document content) into specific number of
299     * sentences in the output
300     *
301     * @param string  $str      Input Arabic document as a string
302     * @param integer $int      Number of sentences required in output summary
303     * @param string  $keywords List of keywords higlited by search process
304     *
305     * @return string Output summary requested
306     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
307     */
308    public function doSummarize($str, $int, $keywords)
309    {
310        $summary = $this->summarize(
311            $str, $keywords, $int, 'number', 'summary', $style
312        );
313
314        return $summary;
315    }
316
317    /**
318     * Summarize percentage of the input Arabic string (document content) into output
319     *
320     * @param string  $str      Input Arabic document as a string
321     * @param integer $rate     Rate of output summary sentence number as
322     *                          percentage of the input Arabic string
323     *                          (document content)
324     * @param string  $keywords List of keywords higlited by search process
325     *
326     * @return string Output summary requested
327     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
328     */
329    public function doRateSummarize($str, $rate, $keywords)
330    {
331        $summary = $this->summarize(
332            $str, $keywords, $rate, 'rate', 'summary', $style
333        );
334
335        return $summary;
336    }
337
338    /**
339     * Highlight key sentences (summary) of the input string (document content)
340     * using CSS and send the result back as an output
341     *
342     * @param string  $str      Input Arabic document as a string
343     * @param integer $int      Number of key sentences required to be
344     *                          highlighted in the input string
345     *                          (document content)
346     * @param string  $keywords List of keywords higlited by search process
347     * @param string  $style    Name of the CSS class you would like to apply
348     *
349     * @return string Output highlighted key sentences summary (using CSS)
350     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
351     */
352    public function highlightSummary($str, $int, $keywords, $style)
353    {
354        $summary = $this->summarize(
355            $str, $keywords, $int, 'number', 'highlight', $style
356        );
357
358        return $summary;
359    }
360
361    /**
362     * Highlight key sentences (summary) as percentage of the input string
363     * (document content) using CSS and send the result back as an output.
364     *
365     * @param string  $str      Input Arabic document as a string
366     * @param integer $rate     Rate of highlighted key sentences summary
367     *                          number as percentage of the input Arabic
368     *                          string (document content)
369     * @param string  $keywords List of keywords higlited by search process
370     * @param string  $style    Name of the CSS class you would like to apply
371     *
372     * @return string Output highlighted key sentences summary (using CSS)
373     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
374     */
375    public function highlightRateSummary($str, $rate, $keywords, $style)
376    {
377        $summary = $this->summarize(
378            $str, $keywords, $rate, 'rate', 'highlight', $style
379        );
380
381        return $summary;
382    }
383
384    /**
385     * Extract keywords from a given Arabic string (document content)
386     *
387     * @param string  $str Input Arabic document as a string
388     * @param integer $int Number of keywords required to be extracting
389     *                     from input string (document content)
390     *
391     * @return string List of the keywords extracting from input Arabic string
392     *               (document content)
393     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
394     */
395    public function getMetaKeywords($str, $int)
396    {
397        $patterns     = array();
398        $replacements = array();
399        $metaKeywords = '';
400
401        array_push($patterns, '/\.|\n|\،|\؛|\(|\[|\{|\)|\]|\}|\,|\;/u');
402        array_push($replacements, ' ');
403        $str = preg_replace($patterns, $replacements, $str);
404
405        $normalizedStr = $this->doNormalize($str);
406        $cleanedStr    = $this->cleanCommon($normalizedStr);
407
408        $str = preg_replace('/(\W)ال(\w{3,})/u', '\\1\\2', $cleanedStr);
409        $str = preg_replace('/(\W)وال(\w{3,})/u', '\\1\\2', $str);
410        $str = preg_replace('/(\w{3,})هما(\W)/u', '\\1\\2', $str);
411        $str = preg_replace('/(\w{3,})كما(\W)/u', '\\1\\2', $str);
412        $str = preg_replace('/(\w{3,})تين(\W)/u', '\\1\\2', $str);
413        $str = preg_replace('/(\w{3,})هم(\W)/u', '\\1\\2', $str);
414        $str = preg_replace('/(\w{3,})هن(\W)/u', '\\1\\2', $str);
415        $str = preg_replace('/(\w{3,})ها(\W)/u', '\\1\\2', $str);
416        $str = preg_replace('/(\w{3,})نا(\W)/u', '\\1\\2', $str);
417        $str = preg_replace('/(\w{3,})ني(\W)/u', '\\1\\2', $str);
418        $str = preg_replace('/(\w{3,})كم(\W)/u', '\\1\\2', $str);
419        $str = preg_replace('/(\w{3,})تم(\W)/u', '\\1\\2', $str);
420        $str = preg_replace('/(\w{3,})كن(\W)/u', '\\1\\2', $str);
421        $str = preg_replace('/(\w{3,})ات(\W)/u', '\\1\\2', $str);
422        $str = preg_replace('/(\w{3,})ين(\W)/u', '\\1\\2', $str);
423        $str = preg_replace('/(\w{3,})تن(\W)/u', '\\1\\2', $str);
424        $str = preg_replace('/(\w{3,})ون(\W)/u', '\\1\\2', $str);
425        $str = preg_replace('/(\w{3,})ان(\W)/u', '\\1\\2', $str);
426        $str = preg_replace('/(\w{3,})تا(\W)/u', '\\1\\2', $str);
427        $str = preg_replace('/(\w{3,})وا(\W)/u', '\\1\\2', $str);
428        $str = preg_replace('/(\w{3,})ة(\W)/u', '\\1\\2', $str);
429
430        $stemStr = preg_replace('/(\W)\w{1,3}(\W)/u', '\\2', $str);
431
432        $wordRanks = $this->rankWords($stemStr);
433
434        arsort($wordRanks, SORT_NUMERIC);
435
436        $i = 1;
437        foreach ($wordRanks as $key => $value) {
438            if ($this->acceptedWord($key)) {
439                $metaKeywords .= $key . '، ';
440                $i++;
441            }
442            if ($i > $int) {
443                break;
444            }
445        }
446
447        $metaKeywords = mb_substr($metaKeywords, 0, -2);
448
449        return $metaKeywords;
450    }
451
452    /**
453     * Normalized Arabic document
454     *
455     * @param string $str Input Arabic document as a string
456     *
457     * @return string Normalized Arabic document
458     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
459     */
460    protected function doNormalize($str)
461    {
462        $str = str_replace($this->_normalizeAlef, 'ا', $str);
463        $str = str_replace($this->_normalizeDiacritics, '', $str);
464        $str = strtr(
465            $str,
466            'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
467            'abcdefghijklmnopqrstuvwxyz'
468        );
469
470        return $str;
471    }
472
473    /**
474     * Extracting common Arabic words (roughly)
475     * from input Arabic string (document content)
476     *
477     * @param string $str Input normalized Arabic document as a string
478     *
479     * @return string Arabic document as a string free of common words (roughly)
480     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
481     */
482    public function cleanCommon($str)
483    {
484        $str = str_replace($this->_commonWords, ' ', $str);
485
486        return $str;
487    }
488
489    /**
490     * Remove less significant Arabic letter from given string (document content).
491     * Please note that output will not be human readable.
492     *
493     * @param string $str Input Arabic document as a string
494     *
495     * @return string Output string after removing less significant Arabic letter
496     *                (not human readable output)
497     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
498     */
499    protected function draftStem($str)
500    {
501        $str = str_replace($this->_commonChars, '', $str);
502        return $str;
503    }
504
505    /**
506     * Ranks words in a given Arabic string (document content). That rank refers
507     * to the frequency of that word appears in that given document.
508     *
509     * @param string $str Input Arabic document as a string
510     *
511     * @return hash Associated array where document words referred by index and
512     *              those words ranks referred by values of those array items.
513     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
514     */
515    protected function rankWords($str)
516    {
517        $wordsRanks = array();
518
519        $str   = str_replace($this->_separators, ' ', $str);
520        $words = preg_split("/[\s,]+/u", $str);
521
522        foreach ($words as $word) {
523            if (isset($wordsRanks[$word])) {
524                $wordsRanks[$word]++;
525            } else {
526                $wordsRanks[$word] = 1;
527            }
528        }
529
530        foreach ($wordsRanks as $wordRank => $total) {
531            if (mb_substr($wordRank, 0, 1) == 'و') {
532                $subWordRank = mb_substr($wordRank, 1, mb_strlen($wordRank) - 1);
533                if (isset($wordsRanks[$subWordRank])) {
534                    unset($wordsRanks[$wordRank]);
535                    $wordsRanks[$subWordRank] += $total;
536                }
537            }
538        }
539
540        return $wordsRanks;
541    }
542
543    /**
544     * Ranks sentences in a given Arabic string (document content).
545     *
546     * @param array $sentences        Sentences of the input Arabic document
547     *                                as an array
548     * @param array $stemmedSentences Stemmed sentences of the input Arabic
549     *                                document as an array
550     * @param array $arr              Words ranks array (word as an index and
551     *                                value refer to the word frequency)
552     *
553     * @return array Two dimension array, first item is an array of document
554     *               sentences, second item is an array of ranks of document
555     *               sentences.
556     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
557     */
558    protected function rankSentences($sentences, $stemmedSentences, $arr)
559    {
560        $sentenceArr = array();
561        $rankArr     = array();
562
563        $max = count($sentences);
564
565        for ($i = 0; $i < $max; $i++) {
566            $sentence = $sentences[$i];
567
568            $w     = 0;
569            $first = mb_substr($sentence, 0, 1);
570            $last  = mb_substr($sentence, -1, 1);
571
572            if ($first == "\n") {
573                $w += 3;
574            } elseif (in_array($first, $this->_separators)) {
575                $w += 2;
576            } else {
577                $w += 1;
578            }
579
580            if ($last == "\n") {
581                $w += 3;
582            } elseif (in_array($last, $this->_separators)) {
583                $w += 2;
584            } else {
585                $w += 1;
586            }
587
588            foreach ($this->_importantWords as $word) {
589                if ($word != '') {
590                    $w += mb_substr_count($sentence, $word);
591                }
592            }
593
594            $sentence = mb_substr(mb_substr($sentence, 0, -1), 1);
595            if (!in_array($first, $this->_separators)) {
596                $sentence = $first . $sentence;
597            }
598
599            $stemStr = $stemmedSentences[$i];
600            $stemStr = mb_substr($stemStr, 0, -1);
601
602            $words = preg_split("/[\s,]+/u", $stemStr);
603
604            $totalWords = count($words);
605            if ($totalWords > 4) {
606                $totalWordsRank = 0;
607
608                foreach ($words as $word) {
609                    if (isset($arr[$word])) {
610                        $totalWordsRank += $arr[$word];
611                    }
612                }
613
614                $wordsRank     = $totalWordsRank / $totalWords;
615                $sentenceRanks = $w * $wordsRank;
616
617                array_push($sentenceArr, $sentence . $last);
618                array_push($rankArr, $sentenceRanks);
619            }
620        }
621
622        $sentencesRanks = array($sentenceArr, $rankArr);
623
624        return $sentencesRanks;
625    }
626
627    /**
628     * Calculate minimum rank for sentences which will be including in the summary
629     *
630     * @param array   $str Document sentences
631     * @param array   $arr Sentences ranks
632     * @param integer $int Number of sentences you need to include in your summary
633     * @param integer $max Maximum number of characters accepted in your summary
634     *
635     * @return integer Minimum accepted sentence rank (sentences with rank more
636     *                 than this will be listed in the document summary)
637     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
638     */
639    protected function minAcceptedRank($str, $arr, $int, $max)
640    {
641        $len = array();
642
643        foreach ($str as $line) {
644            $len[] = mb_strlen($line);
645        }
646
647        rsort($arr, SORT_NUMERIC);
648
649        $totalChars = 0;
650
651        for ($i=0; $i<=$int; $i++) {
652
653            if (!isset($arr[$i])) {
654                $minRank = 0;
655                break;
656            }
657
658            $totalChars += $len[$i];
659
660            if ($totalChars >= $max) {
661                $minRank = $arr[$i];
662                break;
663            }
664
665            $minRank = $arr[$i];
666        }
667
668        return $minRank;
669    }
670
671    /**
672     * Check some conditions to know if a given string is a formal valid word or not
673     *
674     * @param string $word String to be checked if it is a valid word or not
675     *
676     * @return boolean True if passed string is accepted as a valid word else
677     *                 it will return False
678     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
679     */
680    protected function acceptedWord($word)
681    {
682        $accept = true;
683
684        if (mb_strlen($word) < 3) {
685            $accept = false;
686        }
687
688        return $accept;
689    }
690}
691
692