<?php
/* vim: set expandtab sw=4 ts=4 sts=4: */
/**
 * MediaWiki import plugin for phpMyAdmin
 *
 * @package    PhpMyAdmin-Import
 * @subpackage MediaWiki
 */
namespace PhpMyAdmin\Plugins\Import;

use PhpMyAdmin\Import;
use PhpMyAdmin\Message;
use PhpMyAdmin\Plugins\ImportPlugin;
use PhpMyAdmin\Properties\Plugins\ImportPluginProperties;

/**
 * Handles the import for the MediaWiki format
 *
 * The parser reads the imported file chunk by chunk, splits it into lines and
 * recognises the MediaWiki table markup ("{|", "|-", "|+", "|}", "!" and "|"
 * cells) as well as HTML-style comments naming the table being imported.
 *
 * @package    PhpMyAdmin-Import
 * @subpackage MediaWiki
 */
class ImportMediawiki extends ImportPlugin
{
    /**
     * Whether to analyze tables
     *
     * @var bool
     */
    private $_analyze;

    /**
     * Constructor
     */
    public function __construct()
    {
        $this->setProperties();
    }

    /**
     * Sets the import plugin properties.
     * Called in the constructor.
     *
     * Tables are analyzed unless the global "plugin_param" equals 'table'.
     *
     * @return void
     */
    protected function setProperties()
    {
        // A missing "plugin_param" is treated like any value other than
        // 'table' (same outcome as before, but without the undefined index
        // notice)
        $this->_setAnalyze(
            !isset($GLOBALS['plugin_param'])
            || $GLOBALS['plugin_param'] !== 'table'
        );

        $importPluginProperties = new ImportPluginProperties();
        $importPluginProperties->setText(__('MediaWiki Table'));
        $importPluginProperties->setExtension('txt');
        $importPluginProperties->setMimeType('text/plain');
        $importPluginProperties->setOptions(array());
        $importPluginProperties->setOptionsText(__('Options'));

        $this->properties = $importPluginProperties;
    }

    /**
     * Handles the whole import logic
     *
     * Reads the file through Import::getNextChunk(), parses every complete
     * line and hands each finished table ("|}") to _importDataOneTable().
     * On a badly formatted line the global $error flag is raised and the
     * global $message is set to the corresponding error message.
     *
     * @param array &$sql_data 2-element array with sql data
     *
     * @return void
     */
    public function doImport(array &$sql_data = array())
    {
        global $error, $timeout_passed, $finished, $message;

        // Defaults for parser

        // Text read from the imported file that has not been parsed yet.
        // After each pass it only holds the trailing, possibly incomplete
        // line of the data read so far; the next chunk is appended to it.
        $buffer = '';

        // Remembers whether the current buffer line is part of a comment
        $inside_comment = false;
        // Remembers whether the current buffer line is part of a data comment
        $inside_data_comment = false;
        // Remembers whether the current buffer line is part of a structure
        // comment
        $inside_structure_comment = false;

        // MediaWiki only accepts "\n" as row terminator
        $mediawiki_new_line = "\n";

        // Initialize the name of the current table
        $cur_table_name = "";

        // Per-table parser state. It is (re)initialised on every "{|", but is
        // also defined here so that malformed input (e.g. "|}" without a
        // preceding "{|") never reads an undefined variable.
        $cur_temp_table = array();
        $cur_temp_table_headers = array();
        $cur_temp_line = array();
        $in_table_header = false;

        while (!$finished && !$error && !$timeout_passed) {
            $data = Import::getNextChunk();

            if ($data === false) {
                // Subtract data we didn't handle yet and stop processing
                $GLOBALS['offset'] -= mb_strlen($buffer);
                break;
            }

            // $data === true means there is nothing new to read: only the
            // rest of the buffer is handled
            if ($data !== true) {
                // Append new data to buffer
                $buffer .= $data;
                unset($data);
                // Don't parse string if we're not at the end
                // and don't have a new line inside
                if (!$finished
                    && mb_strpos($buffer, $mediawiki_new_line) === false
                ) {
                    continue;
                }
            }

            // Process the buffer line by line
            $buffer_lines = explode($mediawiki_new_line, $buffer);

            // If the reading is not finalised, the final line of the current
            // chunk may not be complete: keep it for the next iteration
            $buffer = $finished ? '' : array_pop($buffer_lines);

            foreach ($buffer_lines as $buffer_line) {
                $cur_buffer_line = trim($buffer_line);

                // If the line is empty, go to the next one
                if ($cur_buffer_line === '') {
                    continue;
                }

                $first_character = $cur_buffer_line[0];
                $line_start = mb_substr($cur_buffer_line, 0, 2);

                // Check beginning of comment
                if (mb_substr($cur_buffer_line, 0, 4) === "<!--") {
                    $inside_comment = true;
                    continue;
                }

                if ($inside_comment) {
                    // Check end of comment
                    if (mb_substr($cur_buffer_line, 0, 3) === "-->") {
                        // Only data comments are closed. The structure
                        // comments will be closed when a data comment begins
                        // (in order to skip structure tables)
                        if ($inside_data_comment) {
                            $inside_data_comment = false;
                        }

                        // End comments that are not related to table
                        // structure
                        if (!$inside_structure_comment) {
                            $inside_comment = false;
                        }
                    } else {
                        // Check table name
                        $match_table_name = array();
                        if (preg_match(
                            "/^Table data for `(.*)`$/",
                            $cur_buffer_line,
                            $match_table_name
                        )
                        ) {
                            $cur_table_name = $match_table_name[1];
                            $inside_data_comment = true;

                            $inside_structure_comment
                                = $this->_mngInsideStructComm(
                                    $inside_structure_comment
                                );
                        } elseif (preg_match(
                            "/^Table structure for `(.*)`$/",
                            $cur_buffer_line,
                            $match_table_name
                        )
                        ) {
                            // The structure comments will be ignored
                            $inside_structure_comment = true;
                        }
                    }
                    continue;
                }

                if (preg_match('/^\{\|(.*)$/', $cur_buffer_line)) {
                    // Check start of table

                    // This will store all the column info on all rows from
                    // the current table read from the buffer
                    $cur_temp_table = array();

                    // Header cells of the current table; reset here so that
                    // a table without a header row does not inherit the
                    // headers of the previous table
                    $cur_temp_table_headers = array();

                    // Will be used as storage for the current row in the
                    // buffer. Once all its columns are read, it will be added
                    // to $cur_temp_table and then it will be emptied
                    $cur_temp_line = array();

                    // Helps us differentiate the header columns
                    // from the normal columns
                    $in_table_header = false;
                    // End processing because the current line does not
                    // contain any column information
                } elseif ($line_start === '|-'
                    || $line_start === '|+'
                    || $line_start === '|}'
                ) {
                    // Check begin row or end table

                    // Add current line to the values storage
                    if (!empty($cur_temp_line)) {
                        // If the current line contains header cells
                        // ( marked with '!' ),
                        // it will be marked as table header
                        if ($in_table_header) {
                            // Set the header columns
                            $cur_temp_table_headers = $cur_temp_line;
                        } else {
                            // Normal line, add it to the table
                            $cur_temp_table[] = $cur_temp_line;
                        }
                    }

                    // Empty the temporary buffer
                    $cur_temp_line = array();

                    // No more processing required at the end of the table
                    if ($line_start === '|}') {
                        $current_table = array(
                            $cur_table_name,
                            $cur_temp_table_headers,
                            $cur_temp_table,
                        );

                        // Import the current table data into the database
                        $this->_importDataOneTable($current_table, $sql_data);

                        // Reset table name
                        $cur_table_name = "";
                    }
                    // What's after the row tag is now only attributes
                } elseif ($first_character === '|' || $first_character === '!') {
                    // Check cell elements

                    // Header cells
                    if ($first_character === '!') {
                        // Mark as table header, but treat as normal row
                        $cur_buffer_line = str_replace(
                            '!!',
                            '||',
                            $cur_buffer_line
                        );
                        // Will be used to set $cur_temp_line as table header
                        $in_table_header = true;
                    } else {
                        $in_table_header = false;
                    }

                    // Loop through each table cell
                    foreach ($this->_explodeMarkup($cur_buffer_line) as $cell) {
                        $cell = trim($this->_getCellData($cell));

                        // Delete the beginning of the column, if there is one
                        foreach (array("|", "!") as $col_start_char) {
                            $cell = $this->_getCellContent(
                                $cell,
                                $col_start_char
                            );
                        }

                        // Add the cell to the row
                        $cur_temp_line[] = $cell;
                    }
                } else {
                    // If it's none of the above, then the current line has a
                    // bad format. $message is the global declared above, so
                    // the error is no longer lost when this method returns.
                    $message = Message::error(
                        __('Invalid format of mediawiki input on line: <br />%s.')
                    );
                    $message->addParam($cur_buffer_line);
                    $error = true;
                }
            } // End treating full buffer lines
        } // while - finished parsing buffer
    }

    /**
     * Imports data from a single table
     *
     * The table is only built and imported when analysis is enabled; pending
     * queries are flushed through Import::runQuery() in every case.
     *
     * @param array $table     containing all table info:
     *                         <code>
     *                         $table[0] - string containing table name
     *                         $table[1] - array[] of table headers
     *                         $table[2] - array[][] of table content rows
     *                         </code>
     * @param array &$sql_data 2-element array with sql data
     *
     * @return void
     */
    private function _importDataOneTable(array $table, array &$sql_data)
    {
        if ($this->_getAnalyze()) {
            // Set the table name
            $this->_setTableName($table[0]);

            // Set generic names for table headers if they don't exist.
            // A table without content rows has no first row to count the
            // columns from, so an empty row is passed instead of reading an
            // undefined offset.
            $first_row = isset($table[2][0]) ? $table[2][0] : array();
            $this->_setTableHeaders($table[1], $first_row);

            // Create the tables array to be used in Import::buildSql()
            $tables = array(
                array($table[0], $table[1], $table[2]),
            );

            // Obtain the best-fit MySQL types for each column
            $analyses = array(
                Import::analyzeTable($tables[0]),
            );

            $this->_executeImportTables($tables, $analyses, $sql_data);
        }

        // Commit any possible data in buffers
        Import::runQuery('', '', $sql_data);
    }

    /**
     * Sets the table name
     *
     * An empty name is replaced by "TABLE n", where n is the number of rows
     * returned by SHOW TABLES plus one.
     *
     * @param string &$table_name reference to the name of the table
     *
     * @return void
     */
    private function _setTableName(&$table_name)
    {
        if (empty($table_name)) {
            $result = $GLOBALS['dbi']->fetchResult('SHOW TABLES');
            // todo check if the name below already exists
            $table_name = 'TABLE ' . (count($result) + 1);
        }
    }

    /**
     * Set generic names for table headers, if they don't exist
     *
     * @param array &$table_headers reference to the array containing the
     *                              headers of a table
     * @param array $table_row      array containing the first content row
     *
     * @return void
     */
    private function _setTableHeaders(array &$table_headers, array $table_row)
    {
        if (empty($table_headers)) {
            // The first table row should contain the number of columns
            // If they are not set, generic names will be given
            // (COL 1, COL 2, etc)
            $num_cols = count($table_row);
            for ($i = 0; $i < $num_cols; ++$i) {
                $table_headers[$i] = 'COL ' . ($i + 1);
            }
        }
    }

    /**
     * Sets the database name and additional options and calls
     * Import::buildSql()
     * Used in $this->_importDataOneTable()
     *
     * @param array &$tables   structure:
     *                         array(
     *                         array(table_name, array() column_names,
     *                         array()() rows)
     *                         )
     * @param array &$analyses structure:
     *                         $analyses = array(
     *                         array(array() column_types,
     *                         array() column_sizes)
     *                         )
     * @param array &$sql_data 2-element array with sql data
     *
     * @global string $db name of the database to import in
     *
     * @return void
     */
    private function _executeImportTables(array &$tables, array &$analyses, array &$sql_data)
    {
        global $db;

        // $db_name : The currently selected database name, if applicable
        //            No backquotes
        // $options : An associative array of options
        list($db_name, $options) = $this->getDbnameAndOptions($db, 'mediawiki_DB');

        // Array of SQL strings
        // Non-applicable parameters
        $create = null;

        // Create and execute necessary SQL statements from data
        Import::buildSql($db_name, $tables, $analyses, $create, $options, $sql_data);
    }

    /**
     * Replaces all instances of the '||' separator between delimiters
     * in a given string
     *
     * The text is scanned byte by byte. A '||' found inside a quoted tag
     * attribute is replaced by $replace; everything else is copied as is.
     *
     * @param string $replace the string to be replaced with
     * @param string $subject the text to be replaced
     *
     * @return string with replacements
     */
    private function _delimiterReplace($replace, $subject)
    {
        // String that will be returned
        $cleaned = "";
        // Possible states of current character
        $inside_tag = false;
        $inside_attribute = false;
        // Attributes can be declared with either " or '
        $start_attribute_character = false;

        // The full separator is "||";
        // This remembers if the previous character was '|'
        $partial_separator = false;

        // Parse text char by char
        $length = strlen($subject);
        for ($i = 0; $i < $length; $i++) {
            $cur_char = $subject[$i];

            // Check for separators
            if ($cur_char === '|') {
                if (!$inside_attribute) {
                    // Outside of an attribute this is part of a real
                    // separator, so we append it to the current segment
                    $cleaned .= $cur_char;
                    if ($partial_separator) {
                        // A full separator ends any open tag
                        $inside_tag = false;
                    }
                } elseif ($partial_separator) {
                    // Inside an attribute, a full "||" is replaced with the
                    // placeholder (the first '|' was held back, see below)
                    $cleaned .= $replace;
                }

                // If the previous character was also '|', then this ends a
                // full separator. If not, this may be the beginning of one
                $partial_separator = !$partial_separator;
                continue;
            }

            // If we're inside a tag attribute and the current character is
            // not '|', but the previous one was, it means that the single '|'
            // was not appended, so we append it now
            if ($partial_separator && $inside_attribute) {
                $cleaned .= "|";
            }
            // If the char is different from "|", no separator can be formed
            $partial_separator = false;

            // any other character should be appended to the current segment
            $cleaned .= $cur_char;

            if ($cur_char === '<' && !$inside_attribute) {
                // start of a tag
                $inside_tag = true;
            } elseif ($cur_char === '>' && !$inside_attribute) {
                // end of a tag
                $inside_tag = false;
            } elseif (($cur_char === '"' || $cur_char === "'") && $inside_tag) {
                // start or end of an attribute
                if (!$inside_attribute) {
                    $inside_attribute = true;
                    // remember the attribute's declaration character (" or ')
                    $start_attribute_character = $cur_char;
                } elseif ($cur_char === $start_attribute_character) {
                    $inside_attribute = false;
                    // unset attribute declaration character
                    $start_attribute_character = false;
                }
            }
        } // end for each character in $subject

        // A single '|' held back at the very end of an (unterminated)
        // attribute would otherwise be dropped from the result
        if ($partial_separator && $inside_attribute) {
            $cleaned .= "|";
        }

        return $cleaned;
    }

    /**
     * Separates a string into items, similarly to explode
     * Uses the '||' separator (which is standard in the mediawiki format)
     * and ignores any instances of it inside markup tags
     * Used in parsing buffer lines containing data cells
     *
     * @param string $text text to be split
     *
     * @return array
     */
    private function _explodeMarkup($text)
    {
        $separator = "||";
        $placeholder = "\x00";

        // Remove placeholder instances
        $text = str_replace($placeholder, '', $text);

        // Replace instances of the separator inside HTML-like
        // tags with the placeholder
        $cleaned = $this->_delimiterReplace($placeholder, $text);
        // Explode, then put the replaced separators back in
        $items = explode($separator, $cleaned);
        foreach ($items as $i => $str) {
            $items[$i] = str_replace($placeholder, $separator, $str);
        }

        return $items;
    }


    /* ~~~~~~~~~~~~~~~~~~~~ Getters and Setters ~~~~~~~~~~~~~~~~~~~~ */

    /**
     * Returns true if the table should be analyzed, false otherwise
     *
     * @return bool
     */
    private function _getAnalyze()
    {
        return $this->_analyze;
    }

    /**
     * Sets to true if the table should be analyzed, false otherwise
     *
     * @param bool $analyze status
     *
     * @return void
     */
    private function _setAnalyze($analyze)
    {
        $this->_analyze = $analyze;
    }

    /**
     * Get cell
     *
     * Strips the cell parameters (the part before the first '|') from a
     * cell, unless that part contains '[[', in which case the '|' is taken
     * to belong to a link and the cell is returned untouched.
     *
     * @param string $cell Cell
     *
     * @return mixed
     */
    private function _getCellData($cell)
    {
        // A cell could contain both parameters and data
        $cell_data = explode('|', $cell, 2);

        // A '|' inside an invalid link should not
        // be mistaken as delimiting cell parameters
        if (mb_strpos($cell_data[0], '[[') !== false) {
            return $cell;
        }

        if (count($cell_data) == 1) {
            return $cell_data[0];
        }

        return $cell_data[1];
    }

    /**
     * Manage $inside_structure_comment
     *
     * Called when a data comment begins, which ends the ignoring of
     * structure rows.
     *
     * @param boolean $inside_structure_comment Value to test
     *
     * @return bool
     */
    private function _mngInsideStructComm($inside_structure_comment)
    {
        // End ignoring structure rows
        if ($inside_structure_comment) {
            $inside_structure_comment = false;
        }

        return $inside_structure_comment;
    }

    /**
     * Get cell content
     *
     * Removes $col_start_char from the beginning of the cell, if the cell
     * starts with it, and trims the remainder.
     *
     * @param string $cell           Cell
     * @param string $col_start_char Start char
     *
     * @return string
     */
    private function _getCellContent($cell, $col_start_char)
    {
        if (mb_strpos($cell, $col_start_char) === 0) {
            $cell = trim(mb_substr($cell, 1));
        }

        return $cell;
    }
}