1<?php
2/* vim: set expandtab sw=4 ts=4 sts=4: */
3/**
4 * MediaWiki import plugin for phpMyAdmin
5 *
6 * @package    PhpMyAdmin-Import
7 * @subpackage MediaWiki
8 */
9namespace PhpMyAdmin\Plugins\Import;
10
11use PhpMyAdmin\Import;
12use PhpMyAdmin\Message;
13use PhpMyAdmin\Plugins\ImportPlugin;
14use PhpMyAdmin\Properties\Plugins\ImportPluginProperties;
15
16/**
17 * Handles the import for the MediaWiki format
18 *
19 * @package    PhpMyAdmin-Import
20 * @subpackage MediaWiki
21 */
22class ImportMediawiki extends ImportPlugin
23{
24    /**
25     * Whether to analyze tables
26     *
27     * @var bool
28     */
29    private $_analyze;
30
31    /**
32     * Constructor
33     */
34    public function __construct()
35    {
36        $this->setProperties();
37    }
38
39    /**
40     * Sets the import plugin properties.
41     * Called in the constructor.
42     *
43     * @return void
44     */
45    protected function setProperties()
46    {
47        $this->_setAnalyze(false);
48        if ($GLOBALS['plugin_param'] !== 'table') {
49            $this->_setAnalyze(true);
50        }
51
52        $importPluginProperties = new ImportPluginProperties();
53        $importPluginProperties->setText(__('MediaWiki Table'));
54        $importPluginProperties->setExtension('txt');
55        $importPluginProperties->setMimeType('text/plain');
56        $importPluginProperties->setOptions(array());
57        $importPluginProperties->setOptionsText(__('Options'));
58
59        $this->properties = $importPluginProperties;
60    }
61
62    /**
63     * Handles the whole import logic
64     *
65     * @param array &$sql_data 2-element array with sql data
66     *
67     * @return void
68     */
69    public function doImport(array &$sql_data = array())
70    {
71        global $error, $timeout_passed, $finished;
72
73        // Defaults for parser
74
75        // The buffer that will be used to store chunks read from the imported file
76        $buffer = '';
77
78        // Used as storage for the last part of the current chunk data
79        // Will be appended to the first line of the next chunk, if there is one
80        $last_chunk_line = '';
81
82        // Remembers whether the current buffer line is part of a comment
83        $inside_comment = false;
84        // Remembers whether the current buffer line is part of a data comment
85        $inside_data_comment = false;
86        // Remembers whether the current buffer line is part of a structure comment
87        $inside_structure_comment = false;
88
89        // MediaWiki only accepts "\n" as row terminator
90        $mediawiki_new_line = "\n";
91
92        // Initialize the name of the current table
93        $cur_table_name = "";
94
95        while (!$finished && !$error && !$timeout_passed) {
96            $data = Import::getNextChunk();
97
98            if ($data === false) {
99                // Subtract data we didn't handle yet and stop processing
100                $GLOBALS['offset'] -= mb_strlen($buffer);
101                break;
102            } elseif ($data === true) {
103                // Handle rest of buffer
104            } else {
105                // Append new data to buffer
106                $buffer = $data;
107                unset($data);
108                // Don't parse string if we're not at the end
109                // and don't have a new line inside
110                if (mb_strpos($buffer, $mediawiki_new_line) === false) {
111                    continue;
112                }
113            }
114
115            // Because of reading chunk by chunk, the first line from the buffer
116            // contains only a portion of an actual line from the imported file.
117            // Therefore, we have to append it to the last line from the previous
118            // chunk. If we are at the first chunk, $last_chunk_line should be empty.
119            $buffer = $last_chunk_line . $buffer;
120
121            // Process the buffer line by line
122            $buffer_lines = explode($mediawiki_new_line, $buffer);
123
124            $full_buffer_lines_count = count($buffer_lines);
125            // If the reading is not finalised, the final line of the current chunk
126            // will not be complete
127            if (! $finished) {
128                $last_chunk_line = $buffer_lines[--$full_buffer_lines_count];
129            }
130
131            for ($line_nr = 0; $line_nr < $full_buffer_lines_count; ++$line_nr) {
132                $cur_buffer_line = trim($buffer_lines[$line_nr]);
133
134                // If the line is empty, go to the next one
135                if ($cur_buffer_line === '') {
136                    continue;
137                }
138
139                $first_character = $cur_buffer_line[0];
140                $matches = array();
141
142                // Check beginning of comment
143                if (!strcmp(mb_substr($cur_buffer_line, 0, 4), "<!--")) {
144                    $inside_comment = true;
145                    continue;
146                } elseif ($inside_comment) {
147                    // Check end of comment
148                    if (!strcmp(mb_substr($cur_buffer_line, 0, 4), "-->")
149                    ) {
150                        // Only data comments are closed. The structure comments
151                        // will be closed when a data comment begins (in order to
152                        // skip structure tables)
153                        if ($inside_data_comment) {
154                            $inside_data_comment = false;
155                        }
156
157                        // End comments that are not related to table structure
158                        if (!$inside_structure_comment) {
159                            $inside_comment = false;
160                        }
161                    } else {
162                        // Check table name
163                        $match_table_name = array();
164                        if (preg_match(
165                            "/^Table data for `(.*)`$/",
166                            $cur_buffer_line,
167                            $match_table_name
168                        )
169                        ) {
170                            $cur_table_name = $match_table_name[1];
171                            $inside_data_comment = true;
172
173                            $inside_structure_comment
174                                = $this->_mngInsideStructComm(
175                                    $inside_structure_comment
176                                );
177                        } elseif (preg_match(
178                            "/^Table structure for `(.*)`$/",
179                            $cur_buffer_line,
180                            $match_table_name
181                        )
182                        ) {
183                            // The structure comments will be ignored
184                            $inside_structure_comment = true;
185                        }
186                    }
187                    continue;
188                } elseif (preg_match('/^\{\|(.*)$/', $cur_buffer_line, $matches)) {
189                    // Check start of table
190
191                    // This will store all the column info on all rows from
192                    // the current table read from the buffer
193                    $cur_temp_table = array();
194
195                    // Will be used as storage for the current row in the buffer
196                    // Once all its columns are read, it will be added to
197                    // $cur_temp_table and then it will be emptied
198                    $cur_temp_line = array();
199
200                    // Helps us differentiate the header columns
201                    // from the normal columns
202                    $in_table_header = false;
203                    // End processing because the current line does not
204                    // contain any column information
205                } elseif (mb_substr($cur_buffer_line, 0, 2) === '|-'
206                    || mb_substr($cur_buffer_line, 0, 2) === '|+'
207                    || mb_substr($cur_buffer_line, 0, 2) === '|}'
208                ) {
209                    // Check begin row or end table
210
211                    // Add current line to the values storage
212                    if (!empty($cur_temp_line)) {
213                        // If the current line contains header cells
214                        // ( marked with '!' ),
215                        // it will be marked as table header
216                        if ($in_table_header) {
217                            // Set the header columns
218                            $cur_temp_table_headers = $cur_temp_line;
219                        } else {
220                            // Normal line, add it to the table
221                            $cur_temp_table [] = $cur_temp_line;
222                        }
223                    }
224
225                    // Empty the temporary buffer
226                    $cur_temp_line = array();
227
228                    // No more processing required at the end of the table
229                    if (mb_substr($cur_buffer_line, 0, 2) === '|}') {
230                        $current_table = array(
231                            $cur_table_name,
232                            $cur_temp_table_headers,
233                            $cur_temp_table,
234                        );
235
236                        // Import the current table data into the database
237                        $this->_importDataOneTable($current_table, $sql_data);
238
239                        // Reset table name
240                        $cur_table_name = "";
241                    }
242                    // What's after the row tag is now only attributes
243
244                } elseif (($first_character === '|') || ($first_character === '!')) {
245                    // Check cell elements
246
247                    // Header cells
248                    if ($first_character === '!') {
249                        // Mark as table header, but treat as normal row
250                        $cur_buffer_line = str_replace('!!', '||', $cur_buffer_line);
251                        // Will be used to set $cur_temp_line as table header
252                        $in_table_header = true;
253                    } else {
254                        $in_table_header = false;
255                    }
256
257                    // Loop through each table cell
258                    $cells = $this->_explodeMarkup($cur_buffer_line);
259                    foreach ($cells as $cell) {
260                        $cell = $this->_getCellData($cell);
261
262                        // Delete the beginning of the column, if there is one
263                        $cell = trim($cell);
264                        $col_start_chars = array("|", "!");
265                        foreach ($col_start_chars as $col_start_char) {
266                            $cell = $this->_getCellContent($cell, $col_start_char);
267                        }
268
269                        // Add the cell to the row
270                        $cur_temp_line [] = $cell;
271                    } // foreach $cells
272                } else {
273                    // If it's none of the above, then the current line has a bad
274                    // format
275                    $message = Message::error(
276                        __('Invalid format of mediawiki input on line: <br />%s.')
277                    );
278                    $message->addParam($cur_buffer_line);
279                    $error = true;
280                }
281            } // End treating full buffer lines
282        } // while - finished parsing buffer
283    }
284
285    /**
286     * Imports data from a single table
287     *
288     * @param array $table     containing all table info:
289     *                         <code>
290     *                         $table[0] - string containing table name
291     *                         $table[1] - array[]   of table headers
292     *                         $table[2] - array[][] of table content rows
293     *                         </code>
294     *
295     * @param array &$sql_data 2-element array with sql data
296     *
297     * @global bool $analyze whether to scan for column types
298     *
299     * @return void
300     */
301    private function _importDataOneTable(array $table, array &$sql_data)
302    {
303        $analyze = $this->_getAnalyze();
304        if ($analyze) {
305            // Set the table name
306            $this->_setTableName($table[0]);
307
308            // Set generic names for table headers if they don't exist
309            $this->_setTableHeaders($table[1], $table[2][0]);
310
311            // Create the tables array to be used in Import::buildSql()
312            $tables = array();
313            $tables [] = array($table[0], $table[1], $table[2]);
314
315            // Obtain the best-fit MySQL types for each column
316            $analyses = array();
317            $analyses [] = Import::analyzeTable($tables[0]);
318
319            $this->_executeImportTables($tables, $analyses, $sql_data);
320        }
321
322        // Commit any possible data in buffers
323        Import::runQuery('', '', $sql_data);
324    }
325
326    /**
327     * Sets the table name
328     *
329     * @param string &$table_name reference to the name of the table
330     *
331     * @return void
332     */
333    private function _setTableName(&$table_name)
334    {
335        if (empty($table_name)) {
336            $result = $GLOBALS['dbi']->fetchResult('SHOW TABLES');
337            // todo check if the name below already exists
338            $table_name = 'TABLE ' . (count($result) + 1);
339        }
340    }
341
342    /**
343     * Set generic names for table headers, if they don't exist
344     *
345     * @param array &$table_headers reference to the array containing the headers
346     *                              of a table
347     * @param array $table_row      array containing the first content row
348     *
349     * @return void
350     */
351    private function _setTableHeaders(array &$table_headers, array $table_row)
352    {
353        if (empty($table_headers)) {
354            // The first table row should contain the number of columns
355            // If they are not set, generic names will be given (COL 1, COL 2, etc)
356            $num_cols = count($table_row);
357            for ($i = 0; $i < $num_cols; ++$i) {
358                $table_headers [$i] = 'COL ' . ($i + 1);
359            }
360        }
361    }
362
363    /**
364     * Sets the database name and additional options and calls Import::buildSql()
365     * Used in PMA_importDataAllTables() and $this->_importDataOneTable()
366     *
367     * @param array &$tables   structure:
368     *                         array(
369     *                         array(table_name, array() column_names, array()()
370     *                         rows)
371     *                         )
372     * @param array &$analyses structure:
373     *                         $analyses = array(
374     *                         array(array() column_types, array() column_sizes)
375     *                         )
376     * @param array &$sql_data 2-element array with sql data
377     *
378     * @global string $db      name of the database to import in
379     *
380     * @return void
381     */
382    private function _executeImportTables(array &$tables, array &$analyses, array &$sql_data)
383    {
384        global $db;
385
386        // $db_name : The currently selected database name, if applicable
387        //            No backquotes
388        // $options : An associative array of options
389        list($db_name, $options) = $this->getDbnameAndOptions($db, 'mediawiki_DB');
390
391        // Array of SQL strings
392        // Non-applicable parameters
393        $create = null;
394
395        // Create and execute necessary SQL statements from data
396        Import::buildSql($db_name, $tables, $analyses, $create, $options, $sql_data);
397
398        unset($tables);
399        unset($analyses);
400    }
401
402    /**
403     * Replaces all instances of the '||' separator between delimiters
404     * in a given string
405     *
406     * @param string $replace the string to be replaced with
407     * @param string $subject the text to be replaced
408     *
409     * @return string with replacements
410     */
411    private function _delimiterReplace($replace, $subject)
412    {
413        // String that will be returned
414        $cleaned = "";
415        // Possible states of current character
416        $inside_tag = false;
417        $inside_attribute = false;
418        // Attributes can be declared with either " or '
419        $start_attribute_character = false;
420
421        // The full separator is "||";
422        // This remembers if the previous character was '|'
423        $partial_separator = false;
424
425        // Parse text char by char
426        for ($i = 0; $i < strlen($subject); $i++) {
427            $cur_char = $subject[$i];
428            // Check for separators
429            if ($cur_char == '|') {
430                // If we're not inside a tag, then this is part of a real separator,
431                // so we append it to the current segment
432                if (!$inside_attribute) {
433                    $cleaned .= $cur_char;
434                    if ($partial_separator) {
435                        $inside_tag = false;
436                        $inside_attribute = false;
437                    }
438                } elseif ($partial_separator) {
439                    // If we are inside a tag, we replace the current char with
440                    // the placeholder and append that to the current segment
441                    $cleaned .= $replace;
442                }
443
444                // If the previous character was also '|', then this ends a
445                // full separator. If not, this may be the beginning of one
446                $partial_separator = !$partial_separator;
447            } else {
448                // If we're inside a tag attribute and the current character is
449                // not '|', but the previous one was, it means that the single '|'
450                // was not appended, so we append it now
451                if ($partial_separator && $inside_attribute) {
452                    $cleaned .= "|";
453                }
454                // If the char is different from "|", no separator can be formed
455                $partial_separator = false;
456
457                // any other character should be appended to the current segment
458                $cleaned .= $cur_char;
459
460                if ($cur_char == '<' && !$inside_attribute) {
461                    // start of a tag
462                    $inside_tag = true;
463                } elseif ($cur_char == '>' && !$inside_attribute) {
464                    // end of a tag
465                    $inside_tag = false;
466                } elseif (($cur_char == '"' || $cur_char == "'") && $inside_tag) {
467                    // start or end of an attribute
468                    if (!$inside_attribute) {
469                        $inside_attribute = true;
470                        // remember the attribute`s declaration character (" or ')
471                        $start_attribute_character = $cur_char;
472                    } else {
473                        if ($cur_char == $start_attribute_character) {
474                            $inside_attribute = false;
475                            // unset attribute declaration character
476                            $start_attribute_character = false;
477                        }
478                    }
479                }
480            }
481        } // end for each character in $subject
482
483        return $cleaned;
484    }
485
486    /**
487     * Separates a string into items, similarly to explode
488     * Uses the '||' separator (which is standard in the mediawiki format)
489     * and ignores any instances of it inside markup tags
490     * Used in parsing buffer lines containing data cells
491     *
492     * @param string $text text to be split
493     *
494     * @return array
495     */
496    private function _explodeMarkup($text)
497    {
498        $separator = "||";
499        $placeholder = "\x00";
500
501        // Remove placeholder instances
502        $text = str_replace($placeholder, '', $text);
503
504        // Replace instances of the separator inside HTML-like
505        // tags with the placeholder
506        $cleaned = $this->_delimiterReplace($placeholder, $text);
507        // Explode, then put the replaced separators back in
508        $items = explode($separator, $cleaned);
509        foreach ($items as $i => $str) {
510            $items[$i] = str_replace($placeholder, $separator, $str);
511        }
512
513        return $items;
514    }
515
516
517    /* ~~~~~~~~~~~~~~~~~~~~ Getters and Setters ~~~~~~~~~~~~~~~~~~~~ */
518
519    /**
520     * Returns true if the table should be analyzed, false otherwise
521     *
522     * @return bool
523     */
524    private function _getAnalyze()
525    {
526        return $this->_analyze;
527    }
528
529    /**
530     * Sets to true if the table should be analyzed, false otherwise
531     *
532     * @param bool $analyze status
533     *
534     * @return void
535     */
536    private function _setAnalyze($analyze)
537    {
538        $this->_analyze = $analyze;
539    }
540
541    /**
542     * Get cell
543     *
544     * @param string $cell Cell
545     *
546     * @return mixed
547     */
548    private function _getCellData($cell)
549    {
550        // A cell could contain both parameters and data
551        $cell_data = explode('|', $cell, 2);
552
553        // A '|' inside an invalid link should not
554        // be mistaken as delimiting cell parameters
555        if (mb_strpos($cell_data[0], '[[') === false) {
556            return $cell;
557        }
558
559        if (count($cell_data) == 1) {
560            return $cell_data[0];
561        }
562
563        return $cell_data[1];
564    }
565
566    /**
567     * Manage $inside_structure_comment
568     *
569     * @param boolean $inside_structure_comment Value to test
570     *
571     * @return bool
572     */
573    private function _mngInsideStructComm($inside_structure_comment)
574    {
575        // End ignoring structure rows
576        if ($inside_structure_comment) {
577            $inside_structure_comment = false;
578        }
579
580        return $inside_structure_comment;
581    }
582
583    /**
584     * Get cell content
585     *
586     * @param string $cell           Cell
587     * @param string $col_start_char Start char
588     *
589     * @return string
590     */
591    private function _getCellContent($cell, $col_start_char)
592    {
593        if (mb_strpos($cell, $col_start_char) === 0) {
594            $cell = trim(mb_substr($cell, 1));
595        }
596
597        return $cell;
598    }
599}
600