<?php
/**
 * ----------------------------------------------------------------------
 *
 * Copyright (c) 2006-2013 Khaled Al-Sham'aa.
 *
 * http://www.ar-php.org
 *
 * PHP Version 5
 *
 * ----------------------------------------------------------------------
 *
 * LICENSE
 *
 * This program is open source product; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License (LGPL)
 * as published by the Free Software Foundation; either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/lgpl.txt>.
 *
 * ----------------------------------------------------------------------
 *
 * Class Name: Arabic Auto Summarize Class
 *
 * Filename:   AutoSummarize.php
 *
 * Original    Author(s): Khaled Al-Sham'aa <khaled@ar-php.org>
 *
 * Purpose:    Automatic keyphrase extraction to provide a quick mini-summary
 *             for a long Arabic document.
 *
 * ----------------------------------------------------------------------
 *
 * Arabic Auto Summarize
 *
 * This class identifies the key points in an Arabic document for you to share with
 * others or quickly scan. The class determines key points by analyzing an Arabic
 * document and assigning a score to each sentence. Sentences that contain words
 * used frequently in the document are given a higher score. You can then choose a
 * percentage of the highest-scoring sentences to display in the summary.
 * "ArAutoSummarize" class works best on well-structured documents such as reports,
 * articles, and scientific papers.
 *
 * "ArAutoSummarize" class cuts wordy copy to the bone by counting words and ranking
 * sentences. First, "ArAutoSummarize" class identifies the most common words in the
 * document and assigns a "score" to each word--the more frequently a word is used,
 * the higher the score.
 *
 * Then, it "averages" each sentence by adding the scores of its words and dividing
 * the sum by the number of words in the sentence--the higher the average, the
 * higher the rank of the sentence. "ArAutoSummarize" class can summarize texts to
 * specific number of sentences or percentage of the original copy.
 *
 * We use statistical approach, with some attention apparently paid to:
 *
 * - Location: leading sentences of paragraph, title, introduction, and conclusion.
 * - Fixed phrases: in-text summaries.
 * - Frequencies of words, phrases, proper names
 * - Contextual material: query, title, headline, initial paragraph
 *
 * The motivation for this class is the range of applications for key phrases:
 *
 * - Mini-summary: Automatic key phrase extraction can provide a quick mini-summary
 *   for a long document. For example, it could be a feature in a web sites; just
 *   click the summarize button when browsing a long web page.
 *
 * - Highlights: It can highlight key phrases in a long document, to facilitate
 *   skimming the document.
 *
 * - Author Assistance: Automatic key phrase extraction can help an author or editor
 *   who wants to supply a list of key phrases for a document. For example, the
 *   administrator of a web site might want to have a key phrase list at the top of
 *   each web page. The automatically extracted phrases can be a starting point for
 *   further manual refinement by the author or editor.
 *
 * - Text Compression: On a device with limited display capacity or limited
 *   bandwidth, key phrases can be a substitute for the full text. For example, an
 *   email message could be reduced to a set of key phrases for display on a pager;
 *   a web page could be reduced for display on a portable wireless web browser.
 *
 * This list is not intended to be exhaustive, and there may be some overlap in
 * the items.
 *
 * Example:
 * <code>
 * include('./I18N/Arabic.php');
 * $obj = new I18N_Arabic('AutoSummarize');
 *
 * $file = 'Examples/Articles/Ajax.txt';
 * $r = 20;
 *
 * // get contents of a file into a string
 * $fhandle = fopen($file, "r");
 * $c = fread($fhandle, filesize($file));
 * fclose($fhandle);
 *
 * $k = $obj->getMetaKeywords($c, $r);
 * echo '<b><font color=#FFFF00>';
 * echo 'Keywords:</font></b>';
 * echo '<p dir="rtl" align="justify">';
 * echo $k . '</p>';
 *
 * // the third argument (search keywords) is required; pass '' for none
 * $s = $obj->doRateSummarize($c, $r, '');
 * echo '<b><font color=#FFFF00>';
 * echo 'Summary:</font></b>';
 * echo '<p dir="rtl" align="justify">';
 * echo $s . '</p>';
 *
 * echo '<b><font color=#FFFF00>';
 * echo 'Full Text:</font></b>';
 * echo '<p><a class=ar_link target=_blank ';
 * echo 'href='.$file.'>Source File</a></p>';
 * </code>
 *
 * @category  I18N
 * @package   I18N_Arabic
 * @author    Khaled Al-Sham'aa <khaled@ar-php.org>
 * @copyright 2006-2013 Khaled Al-Sham'aa
 *
 * @license   LGPL <http://www.gnu.org/licenses/lgpl.txt>
 * @link      http://www.ar-php.org
 */

// New in PHP V5.3: Namespaces
// namespace I18N\Arabic;
//
// $obj = new I18N\Arabic\AutoSummarize();
//
// use I18N\Arabic;
// $obj = new Arabic\AutoSummarize();
//
// use I18N\Arabic\AutoSummarize as AutoSummarize;
// $obj = new AutoSummarize();


/**
 * This PHP class does automatic keyphrase extraction to provide a quick
 * mini-summary for a long Arabic document
 *
 * @category  I18N
 * @package   I18N_Arabic
 * @author    Khaled Al-Sham'aa <khaled@ar-php.org>
 * @copyright 2006-2013 Khaled Al-Sham'aa
 *
 * @license   LGPL <http://www.gnu.org/licenses/lgpl.txt>
 * @link      http://www.ar-php.org
 */
class I18N_Arabic_AutoSummarize
{
    // Alef variants that doNormalize() folds into the bare Alef
    private $_normalizeAlef       = array('أ','إ','آ');

    // Diacritic marks that doNormalize() strips
    private $_normalizeDiacritics = array('َ','ً','ُ','ٌ','ِ','ٍ','ْ','ّ');

    // Frequent letters removed by draftStem() to build a rough stem
    private $_commonChars = array('ة','ه','ي','ن','و','ت','ل','ا','س','م',
                                  'e', 't', 'a', 'o', 'i', 'n', 's');

    // Characters treated as sentence/word separators
    private $_separators = array('.',"\n",'،','؛','(','[','{',')',']','}',',',';');

    // Stop words removed by cleanCommon()
    private $_commonWords    = array();

    // Words whose presence raises a sentence weight in rankSentences()
    private $_importantWords = array();

    /**
     * Loads initialize values
     *
     * Reads the Arabic and English stop-word lists and the important-words
     * list from the "data" directory next to this file.
     *
     * @ignore
     */
    public function __construct()
    {
        // These common words are used in the cleanCommon method
        $this->_commonWords = array_merge(
            $this->_loadWordList('ar-stopwords.txt'),
            $this->_loadWordList('en-stopwords.txt')
        );

        // These important words are used in the rankSentences method
        $this->_importantWords = $this->_loadWordList('important-words.txt');
    }

    /**
     * Read one word list from the "data" directory next to this file
     *
     * file() returns false (after raising its own warning) when the list
     * can not be read; that case is mapped to an empty list so the later
     * array_merge/str_replace calls always receive an array.
     *
     * @param string $name File name inside the data directory
     *
     * @return array Trimmed lines of the file, or an empty array on failure
     */
    private function _loadWordList($name)
    {
        $lines = file(dirname(__FILE__) . '/data/' . $name);

        if ($lines === false) {
            return array();
        }

        return array_map('trim', $lines);
    }

    /**
     * Load enhanced Arabic stop words list
     *
     * Appends the content of data/ar-extra-stopwords.txt to the stop words
     * already loaded by the constructor.
     *
     * @return void
     */
    public function loadExtra()
    {
        $this->_commonWords = array_merge(
            $this->_commonWords,
            $this->_loadWordList('ar-extra-stopwords.txt')
        );
    }

    /**
     * Core summarize function that implement required steps in the algorithm
     *
     * @param string  $str      Input Arabic document as a string
     * @param string  $keywords List of keywords highlighted by search process
     * @param integer $int      Sentences value (see $mode effect also)
     * @param string  $mode     Mode of sentences count [number|rate]
     * @param string  $output   Output mode [summary|highlight]
     * @param string  $style    Name of the CSS class you would like to apply
     *
     * @return string Output summary requested
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    protected function summarize($str, $keywords, $int, $mode, $output, $style=null)
    {
        // One "sentence" = a non-separator character, the shortest possible
        // run of characters, then one separator character
        $sentencePattern = "/[^\.\n\،\؛\,\;](.+?)[\.\n\،\؛\,\;]/u";

        preg_match_all($sentencePattern, $str, $sentences);
        $_sentences = $sentences[0];

        if ($mode == 'rate') {
            // In rate mode $int is a percentage: convert it into a character
            // budget and into a number of sentences
            $str            = preg_replace("/\s{2,}/u", ' ', $str);
            $totalChars     = mb_strlen($str);
            $totalSentences = count($_sentences);

            $maxChars = round($int * $totalChars / 100);
            $int      = round($int * $totalSentences / 100);
        } else {
            $maxChars = 99999;
        }

        $summary = '';

        $str           = strip_tags($str);
        $normalizedStr = $this->doNormalize($str);
        $cleanedStr    = $this->cleanCommon($normalizedStr);
        $stemStr       = $this->draftStem($cleanedStr);

        preg_match_all($sentencePattern, $stemStr, $sentences);
        $_stemmedSentences = $sentences[0];

        $wordRanks = $this->rankWords($stemStr);

        if ($keywords) {
            // Search keywords get a fixed, very high rank so sentences that
            // contain them are favoured
            $keywords = $this->doNormalize($keywords);
            $keywords = $this->draftStem($keywords);
            $words    = explode(' ', $keywords);

            foreach ($words as $word) {
                $wordRanks[$word] = 1000;
            }
        }

        $sentencesRanks = $this->rankSentences(
            $_sentences,
            $_stemmedSentences,
            $wordRanks
        );

        list($sentences, $ranks) = $sentencesRanks;

        $minRank = $this->minAcceptedRank($sentences, $ranks, $int, $maxChars);

        $totalSentences = count($ranks);

        for ($i = 0; $i < $totalSentences; $i++) {
            if ($ranks[$i] >= $minRank) {
                if ($output == 'summary') {
                    $summary .= ' ' . $sentences[$i];
                } else {
                    $summary .= '<span class="' . $style .'">' .
                                $sentences[$i] . '</span>';
                }
            } else {
                // Highlight mode keeps the low-ranked sentences as plain text
                if ($output == 'highlight') {
                    $summary .= $sentences[$i];
                }
            }
        }

        if ($output == 'highlight') {
            $summary = str_replace("\n", '<br />', $summary);
        }

        return $summary;
    }

    /**
     * Summarize input Arabic string (document content) into specific number of
     * sentences in the output
     *
     * @param string  $str      Input Arabic document as a string
     * @param integer $int      Number of sentences required in output summary
     * @param string  $keywords List of keywords highlighted by search process
     *
     * @return string Output summary requested
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    public function doSummarize($str, $int, $keywords)
    {
        // No CSS class applies to the plain 'summary' output
        $summary = $this->summarize(
            $str, $keywords, $int, 'number', 'summary', null
        );

        return $summary;
    }

    /**
     * Summarize percentage of the input Arabic string (document content) into output
     *
     * @param string  $str      Input Arabic document as a string
     * @param integer $rate     Rate of output summary sentence number as
     *                          percentage of the input Arabic string
     *                          (document content)
     * @param string  $keywords List of keywords highlighted by search process
     *
     * @return string Output summary requested
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    public function doRateSummarize($str, $rate, $keywords)
    {
        // No CSS class applies to the plain 'summary' output
        $summary = $this->summarize(
            $str, $keywords, $rate, 'rate', 'summary', null
        );

        return $summary;
    }

    /**
     * Highlight key sentences (summary) of the input string (document content)
     * using CSS and send the result back as an output
     *
     * @param string  $str      Input Arabic document as a string
     * @param integer $int      Number of key sentences required to be
     *                          highlighted in the input string
     *                          (document content)
     * @param string  $keywords List of keywords highlighted by search process
     * @param string  $style    Name of the CSS class you would like to apply
     *
     * @return string Output highlighted key sentences summary (using CSS)
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    public function highlightSummary($str, $int, $keywords, $style)
    {
        $summary = $this->summarize(
            $str, $keywords, $int, 'number', 'highlight', $style
        );

        return $summary;
    }

    /**
     * Highlight key sentences (summary) as percentage of the input string
     * (document content) using CSS and send the result back as an output.
     *
     * @param string  $str      Input Arabic document as a string
     * @param integer $rate     Rate of highlighted key sentences summary
     *                          number as percentage of the input Arabic
     *                          string (document content)
     * @param string  $keywords List of keywords highlighted by search process
     * @param string  $style    Name of the CSS class you would like to apply
     *
     * @return string Output highlighted key sentences summary (using CSS)
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    public function highlightRateSummary($str, $rate, $keywords, $style)
    {
        $summary = $this->summarize(
            $str, $keywords, $rate, 'rate', 'highlight', $style
        );

        return $summary;
    }

    /**
     * Extract keywords from a given Arabic string (document content)
     *
     * @param string  $str Input Arabic document as a string
     * @param integer $int Number of keywords required to be extracting
     *                     from input string (document content)
     *
     * @return string List of the keywords extracting from input Arabic string
     *                (document content), joined by an Arabic comma and a space
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    public function getMetaKeywords($str, $int)
    {
        $metaKeywords = '';

        // Turn every separator into a blank so only words remain
        $str = preg_replace(
            '/\.|\n|\،|\؛|\(|\[|\{|\)|\]|\}|\,|\;/u',
            ' ',
            $str
        );

        $normalizedStr = $this->doNormalize($str);
        $cleanedStr    = $this->cleanCommon($normalizedStr);

        // Strip common prefixes, then common suffixes, from words that keep
        // at least three word characters. The patterns are applied one after
        // the other, in this order.
        $affixPatterns = array(
            '/(\W)ال(\w{3,})/u',
            '/(\W)وال(\w{3,})/u',
            '/(\w{3,})هما(\W)/u',
            '/(\w{3,})كما(\W)/u',
            '/(\w{3,})تين(\W)/u',
            '/(\w{3,})هم(\W)/u',
            '/(\w{3,})هن(\W)/u',
            '/(\w{3,})ها(\W)/u',
            '/(\w{3,})نا(\W)/u',
            '/(\w{3,})ني(\W)/u',
            '/(\w{3,})كم(\W)/u',
            '/(\w{3,})تم(\W)/u',
            '/(\w{3,})كن(\W)/u',
            '/(\w{3,})ات(\W)/u',
            '/(\w{3,})ين(\W)/u',
            '/(\w{3,})تن(\W)/u',
            '/(\w{3,})ون(\W)/u',
            '/(\w{3,})ان(\W)/u',
            '/(\w{3,})تا(\W)/u',
            '/(\w{3,})وا(\W)/u',
            '/(\w{3,})ة(\W)/u'
        );

        $str = preg_replace($affixPatterns, '\\1\\2', $cleanedStr);

        // Drop words of one to three characters
        $stemStr = preg_replace('/(\W)\w{1,3}(\W)/u', '\\2', $str);

        $wordRanks = $this->rankWords($stemStr);

        arsort($wordRanks, SORT_NUMERIC);

        $i = 1;
        foreach ($wordRanks as $key => $value) {
            if ($this->acceptedWord($key)) {
                $metaKeywords .= $key . '، ';
                $i++;
            }
            if ($i > $int) {
                break;
            }
        }

        // Remove the trailing Arabic comma and space
        $metaKeywords = mb_substr($metaKeywords, 0, -2);

        return $metaKeywords;
    }

    /**
     * Normalized Arabic document
     *
     * Folds the Alef variants into the bare Alef, removes diacritics and
     * lower-cases the ASCII letters A-Z.
     *
     * @param string $str Input Arabic document as a string
     *
     * @return string Normalized Arabic document
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    protected function doNormalize($str)
    {
        $str = str_replace($this->_normalizeAlef, 'ا', $str);
        $str = str_replace($this->_normalizeDiacritics, '', $str);
        $str = strtr(
            $str,
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
            'abcdefghijklmnopqrstuvwxyz'
        );

        return $str;
    }

    /**
     * Extracting common Arabic words (roughly)
     * from input Arabic string (document content)
     *
     * Every occurrence of a loaded stop word is replaced by a blank using a
     * plain substring replacement (no word-boundary check).
     *
     * @param string $str Input normalized Arabic document as a string
     *
     * @return string Arabic document as a string free of common words (roughly)
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    public function cleanCommon($str)
    {
        $str = str_replace($this->_commonWords, ' ', $str);

        return $str;
    }

    /**
     * Remove less significant Arabic letter from given string (document content).
     * Please note that output will not be human readable.
     *
     * @param string $str Input Arabic document as a string
     *
     * @return string Output string after removing less significant Arabic letter
     *                (not human readable output)
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    protected function draftStem($str)
    {
        $str = str_replace($this->_commonChars, '', $str);
        return $str;
    }

    /**
     * Ranks words in a given Arabic string (document content). That rank refers
     * to the frequency of that word appears in that given document.
     *
     * @param string $str Input Arabic document as a string
     *
     * @return array Associated array where document words referred by index and
     *               those words ranks referred by values of those array items.
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    protected function rankWords($str)
    {
        $wordsRanks = array();

        $str   = str_replace($this->_separators, ' ', $str);
        $words = preg_split("/[\s,]+/u", $str);

        foreach ($words as $word) {
            if (isset($wordsRanks[$word])) {
                $wordsRanks[$word]++;
            } else {
                $wordsRanks[$word] = 1;
            }
        }

        // A word that starts with the letter Waw is merged into the same
        // word without that first letter, when the shorter form also occurs
        foreach ($wordsRanks as $wordRank => $total) {
            if (mb_substr($wordRank, 0, 1) == 'و') {
                $subWordRank = mb_substr($wordRank, 1, mb_strlen($wordRank) - 1);
                if (isset($wordsRanks[$subWordRank])) {
                    unset($wordsRanks[$wordRank]);
                    $wordsRanks[$subWordRank] += $total;
                }
            }
        }

        return $wordsRanks;
    }

    /**
     * Ranks sentences in a given Arabic string (document content).
     *
     * Sentences whose stemmed form has four words or fewer are left out of
     * the result.
     *
     * @param array $sentences        Sentences of the input Arabic document
     *                                as an array
     * @param array $stemmedSentences Stemmed sentences of the input Arabic
     *                                document as an array
     * @param array $arr              Words ranks array (word as an index and
     *                                value refer to the word frequency)
     *
     * @return array Two dimension array, first item is an array of document
     *               sentences, second item is an array of ranks of document
     *               sentences.
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    protected function rankSentences($sentences, $stemmedSentences, $arr)
    {
        $sentenceArr = array();
        $rankArr     = array();

        $max = count($sentences);

        for ($i = 0; $i < $max; $i++) {
            $sentence = $sentences[$i];

            // Weight from the first and last characters: a new line scores
            // highest, any other separator next, anything else lowest
            $w     = 0;
            $first = mb_substr($sentence, 0, 1);
            $last  = mb_substr($sentence, -1, 1);

            if ($first == "\n") {
                $w += 3;
            } elseif (in_array($first, $this->_separators, true)) {
                $w += 2;
            } else {
                $w += 1;
            }

            if ($last == "\n") {
                $w += 3;
            } elseif (in_array($last, $this->_separators, true)) {
                $w += 2;
            } else {
                $w += 1;
            }

            foreach ($this->_importantWords as $word) {
                if ($word != '') {
                    $w += mb_substr_count($sentence, $word);
                }
            }

            // Drop the first and last characters, then put the first one
            // back unless it is a separator
            $sentence = mb_substr(mb_substr($sentence, 0, -1), 1);
            if (!in_array($first, $this->_separators, true)) {
                $sentence = $first . $sentence;
            }

            // The stemmed list is split from a different string and may be
            // shorter than $sentences; a missing entry counts as empty
            $stemStr = isset($stemmedSentences[$i]) ? $stemmedSentences[$i] : '';
            $stemStr = mb_substr($stemStr, 0, -1);

            $words = preg_split("/[\s,]+/u", $stemStr);

            $totalWords = count($words);
            if ($totalWords > 4) {
                $totalWordsRank = 0;

                foreach ($words as $word) {
                    if (isset($arr[$word])) {
                        $totalWordsRank += $arr[$word];
                    }
                }

                $wordsRank     = $totalWordsRank / $totalWords;
                $sentenceRanks = $w * $wordsRank;

                $sentenceArr[] = $sentence . $last;
                $rankArr[]     = $sentenceRanks;
            }
        }

        $sentencesRanks = array($sentenceArr, $rankArr);

        return $sentencesRanks;
    }

    /**
     * Calculate minimum rank for sentences which will be including in the summary
     *
     * Sentences are visited from the highest rank downwards; the walk stops
     * once the accumulated length of the visited sentences reaches $max or
     * after the loop bound derived from $int.
     *
     * @param array   $str Document sentences
     * @param array   $arr Sentences ranks
     * @param integer $int Number of sentences you need to include in your summary
     * @param integer $max Maximum number of characters accepted in your summary
     *
     * @return integer Minimum accepted sentence rank (sentences with rank equal
     *                 to or more than this will be listed in the document
     *                 summary); 0 when fewer ranks exist than requested
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    protected function minAcceptedRank($str, $arr, $int, $max)
    {
        // Defined up front so a negative $int still returns a value
        $minRank = 0;

        $lengths = array();
        foreach (array_values($str) as $line) {
            $lengths[] = mb_strlen($line);
        }

        // arsort keeps the keys, so every rank stays paired with the length
        // of its own sentence while being visited in descending rank order
        $ranks = array_values($arr);
        arsort($ranks, SORT_NUMERIC);
        $order = array_keys($ranks);

        $totalChars = 0;

        // NOTE(review): the inclusive bound visits up to $int + 1 ranks, so
        // one sentence more than requested can be accepted; kept unchanged
        // for backward compatibility - confirm whether this is intended
        for ($i = 0; $i <= $int; $i++) {
            if (!isset($order[$i])) {
                // Fewer sentences than requested: accept all of them
                $minRank = 0;
                break;
            }

            $key     = $order[$i];
            $minRank = $ranks[$key];

            $totalChars += isset($lengths[$key]) ? $lengths[$key] : 0;

            if ($totalChars >= $max) {
                break;
            }
        }

        return $minRank;
    }

    /**
     * Check some conditions to know if a given string is a formal valid word or not
     *
     * The only condition checked is a minimum length of three characters.
     *
     * @param string $word String to be checked if it is a valid word or not
     *
     * @return boolean True if passed string is accepted as a valid word else
     *                 it will return False
     * @author Khaled Al-Sham'aa <khaled@ar-php.org>
     */
    protected function acceptedWord($word)
    {
        return mb_strlen($word) >= 3;
    }
}