1 /*
2  * TexLogParser.cpp
3  *
4  * Copyright (C) 2021 by RStudio, PBC
5  *
6  * Unless you have received this program directly from RStudio pursuant
7  * to the terms of a commercial license agreement with RStudio, then
8  * this program is licensed to you under the terms of version 3 of the
9  * GNU Affero General Public License. This program is distributed WITHOUT
10  * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12  * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13  *
14  */
15 
16 #include <core/tex/TexLogParser.hpp>
17 
18 #include <gsl/gsl>
19 
20 #include <boost/regex.hpp>
21 #include <boost/lexical_cast.hpp>
22 #include <boost/algorithm/string.hpp>
23 
24 #include <shared_core/Error.hpp>
25 #include <shared_core/FilePath.hpp>
26 #include <core/FileSerializer.hpp>
27 #include <core/RegexUtils.hpp>
28 #include <shared_core/SafeConvert.hpp>
29 #include <core/system/System.hpp>
30 
31 namespace rstudio {
32 namespace core {
33 namespace tex {
34 
35 namespace {
36 
// Returns true if `str` starts with one of the supplied prefixes.
//
// `test1` is always checked. `test2`..`test4` are optional: the first empty
// one ends the list, so any prefix supplied after an empty one is never
// consulted.
bool beginsWith(const std::string& str,
                const std::string& test1,
                const std::string& test2=std::string(),
                const std::string& test3=std::string(),
                const std::string& test4=std::string())
{
   const std::string* const candidates[] = { &test1, &test2, &test3, &test4 };

   for (size_t i = 0; i < 4; ++i)
   {
      const std::string& prefix = *candidates[i];

      // An empty optional prefix marks the end of the list
      if (i > 0 && prefix.empty())
         return false;

      // Prefix comparison: equal only if the first prefix.size() characters
      // of str exist and match
      if (str.compare(0, prefix.size(), prefix) == 0)
         return true;
   }

   return false;
}
65 
// Scans `line` and appends to pParens an iterator for every paren that has no
// partner on the line. Entries may point at either ( or ). For a pParens that
// starts out empty the result is [zero or more ')'] followed by
// [zero or more '('].
void findUnmatchedParens(const std::string& line,
                         std::vector<std::string::const_iterator>* pParens)
{
   // An unmatched close paren only counts while nothing but whitespace and
   // other close parens has been seen on the line. Without this restriction,
   // sample.Rnw produces false positives from math errors such as:
   //
   // l.204 (x + (y^
   //               2))
   //
   // The first of those lines is skipped as part of an error message; the
   // second would otherwise be parsed and underflow the file stack.
   bool pastLeadingRegion = false;

   // NOTE: It is unclear whether (<filename> can appear anywhere other than
   // the beginning of a line (after whitespace and close parens), but the
   // Sublime Text 2 plugin code seemed to imply that it can.

   for (std::string::const_iterator cur = line.begin();
        cur != line.end();
        ++cur)
   {
      const char ch = *cur;

      // Whitespace never changes any state
      if (ch == ' ' || ch == '\t')
         continue;

      if (ch == ')')
      {
         const bool closesOpenParen =
               !pParens->empty() && *(pParens->back()) != ')';

         if (closesOpenParen)
            pParens->pop_back();
         else if (!pastLeadingRegion)
            pParens->push_back(cur);

         continue;
      }

      if (ch == '(')
         pParens->push_back(cur);

      // Any open paren or other non-blank character ends the leading region
      pastLeadingRegion = true;
   }
}
114 
// Turns a file name as it appears in a TeX log into a path under `rootDir`.
//
// Surrounding double quotes and a leading "./" are removed before the name is
// completed against rootDir. Returns an empty FilePath if the cleaned-up name
// is empty, or if the completed path does not exist or is a directory.
FilePath resolveFilename(const FilePath& rootDir,
                         const std::string& filename)
{
   std::string result = filename;

   // Remove quotes if necessary. Both the opening and the closing quote have
   // to go: if the opening quote stays, it becomes part of the name that is
   // looked up and the "./" check below no longer matches.
   if (result.size() > 2 &&
       boost::algorithm::starts_with(result, "\"") &&
       boost::algorithm::ends_with(result, "\""))
   {
      result = result.substr(1, result.size() - 2);
   }

   // Strip leading ./
   if (boost::algorithm::starts_with(result, "./"))
      result.erase(0, 2);

   if (result.empty())
      return FilePath();

   // Only report files that actually exist (directories don't count)
   FilePath file = rootDir.completePath(result);
   if (file.exists() && !file.isDirectory())
      return file;
   else
      return FilePath();
}
142 
143 // TeX wraps lines hard at 79 characters. We use heuristics as described in
144 // Sublime Text's TeX plugin to determine where these breaks are.
unwrapLines(std::vector<std::string> * pLines,std::vector<size_t> * pLinesUnwrapped=nullptr)145 void unwrapLines(std::vector<std::string>* pLines,
146                  std::vector<size_t>* pLinesUnwrapped=nullptr)
147 {
148    static boost::regex regexLine("^l\\.(\\d+)\\s");
149    static boost::regex regexAssignment("^\\\\.*?=");
150 
151    std::vector<std::string>::iterator pos = pLines->begin();
152 
153    for ( ; pos != pLines->end(); pos++)
154    {
155       // The first line is always long, and not artificially wrapped
156       if (pos == pLines->begin())
157          continue;
158 
159       if (pos->length() != 79)
160          continue;
161 
162       // The **<filename> line may be long, but we don't care about it
163       if (beginsWith(*pos, "**"))
164          continue;
165 
166       while (true)
167       {
168          std::vector<std::string>::iterator nextPos = pos + 1;
169          // No more lines to add
170          if (nextPos == pLines->end())
171             break;
172 
173          if (nextPos->empty())
174             break;
175 
176          // Underfull/Overfull terminator
177          if (*nextPos == " []")
178             break;
179 
180          // Common prefixes
181          if (beginsWith(*nextPos, "File:", "Package:", "Document Class:"))
182             break;
183 
184          // More prefixes
185          if (beginsWith(*nextPos, "LaTeX Warning:", "LaTeX Info:", "LaTeX2e <"))
186             break;
187 
188          if (regex_utils::search(*nextPos, regexAssignment))
189             break;
190 
191          if (regex_utils::search(*nextPos, regexLine))
192             break;
193 
194          bool breakAfterAppend = nextPos->length() != 79;
195 
196          pos->append(*nextPos);
197          // NOTE: Erase is a simple but inefficient way of handling this. Would
198          //    be way faster to maintain an output iterator that points to the
199          //    correct point in pLines, and when finished, truncate whatever
200          //    elements come after the final position of the output iterator.
201          pLines->erase(nextPos, nextPos+1);
202          if (pLinesUnwrapped)
203             pLinesUnwrapped->push_back(1 + (pos - pLines->begin()));
204 
205          if (breakAfterAppend)
206             break;
207       }
208    }
209 }
210 
211 class FileStack : public boost::noncopyable
212 {
213 public:
FileStack(FilePath rootDir)214    explicit FileStack(FilePath rootDir) : rootDir_(rootDir)
215    {
216    }
217 
currentFile()218    FilePath currentFile()
219    {
220       return currentFile_;
221    }
222 
processLine(const std::string & line)223    void processLine(const std::string& line)
224    {
225       typedef std::vector<std::string::const_iterator> Iterators;
226       Iterators parens;
227       findUnmatchedParens(line, &parens);
228       for (Iterators::const_iterator itParen = parens.begin();
229            itParen != parens.end();
230            itParen++)
231       {
232          std::string::const_iterator it = *itParen;
233 
234          if (*it == ')')
235          {
236             if (!fileStack_.empty())
237             {
238                fileStack_.pop_back();
239                updateCurrentFile();
240             }
241             else
242             {
243                LOG_WARNING_MESSAGE("File context stack underflow while parsing "
244                                    "TeX log");
245             }
246          }
247          else if (*it == '(')
248          {
249             std::string::const_iterator itFilenameEnd =
250                   // case: no other ( on this line
251                   (itParen + 1 == parens.end()) ? line.end() :
252                   // case: space before next paren, eat it
253                   *(*(itParen+1)-1) == ' ' ? *(itParen+1)-1 :
254                   // case: other
255                   *(itParen+1);
256 
257             std::string filename = std::string(it+1, itFilenameEnd);
258             fileStack_.push_back(resolveFilename(rootDir_, filename));
259 
260             updateCurrentFile();
261          }
262          else
263             BOOST_ASSERT(false);
264       }
265    }
266 
267 private:
268 
updateCurrentFile()269    void updateCurrentFile()
270    {
271       for (std::vector<FilePath>::reverse_iterator it = fileStack_.rbegin();
272            it != fileStack_.rend();
273            it++)
274       {
275          if (!it->isEmpty())
276          {
277             currentFile_ = *it;
278             return;
279          }
280       }
281       currentFile_ = FilePath();
282    }
283 
284    FilePath rootDir_;
285    FilePath currentFile_;
286    std::vector<FilePath> fileStack_;
287 };
288 
// Builds the path of a file mentioned in a log from `logPath` and the
// compilation directory.
//
// Some tex compilers report file names with absolute paths and some report
// them relative to the compilation directory, so the name is completed
// against compileDir and then passed through realPath to get a clean full
// path back. If realPath fails, the completed (unresolved) path is returned.
FilePath texFilePath(const std::string& logPath, const FilePath& compileDir)
{
   FilePath path = compileDir.completePath(logPath);

   FilePath realPath;
   Error error = core::system::realPath(path, &realPath);
   if (!error)
      return realPath;

   // A missing file is expected here; log anything else
   if (error != systemError(boost::system::errc::no_such_file_or_directory, ErrorLocation()))
   {
      LOG_ERROR(error);
   }

   return path;
}
313 
// Maps a line number in the unwrapped line list back to a line number in the
// log as it was read.
//
// The result is unwrappedLineNum plus the number of leading entries of
// unwrappedLines that are smaller than unwrappedLineNum; counting stops at
// the first entry that is greater than or equal to it.
size_t calculateWrappedLine(const std::vector<size_t>& unwrappedLines,
                            size_t unwrappedLineNum)
{
   size_t mergedBefore = 0;
   while (mergedBefore < unwrappedLines.size() &&
          unwrappedLines[mergedBefore] < unwrappedLineNum)
   {
      ++mergedBefore;
   }

   return unwrappedLineNum + mergedBefore;
}
329 
330 } // anonymous namespace
331 
parseLatexLog(const FilePath & logFilePath,LogEntries * pLogEntries)332 Error parseLatexLog(const FilePath& logFilePath, LogEntries* pLogEntries)
333 {
334    static boost::regex regexOverUnderfullLines(" at lines (\\d+)--(\\d+)\\s*(?:\\[])?$");
335    static boost::regex regexWarning("^(?:.*?) Warning: (.+)");
336    static boost::regex regexWarningEnd(" input line (\\d+)\\.$");
337    static boost::regex regexLnn("^l\\.(\\d+)\\s");
338    static boost::regex regexCStyleError("^(.+):(\\d+):\\s(.+)$");
339 
340    std::vector<std::string> lines;
341    Error error = readStringVectorFromFile(logFilePath, &lines, false);
342    if (error)
343       return error;
344 
345    std::vector<size_t> linesUnwrapped;
346    unwrapLines(&lines, &linesUnwrapped);
347 
348    FilePath rootDir = logFilePath.getParent();
349    FileStack fileStack(rootDir);
350 
351    for (std::vector<std::string>::const_iterator it = lines.begin();
352         it != lines.end();
353         it++)
354    {
355       const std::string& line = *it;
356       auto logLineNum = (it - lines.begin()) + 1;
357 
358       // We slurp overfull/underfull messages with no further processing
359       // (i.e. not manipulating the file stack)
360 
361       if (beginsWith(line, "Overfull ", "Underfull "))
362       {
363          std::string msg = line;
364          int lineNum = -1;
365 
366          // Parse lines, if present
367          boost::smatch overUnderfullLinesMatch;
368          if (regex_utils::search(line,
369                                  overUnderfullLinesMatch,
370                                  regexOverUnderfullLines))
371          {
372             lineNum = safe_convert::stringTo<int>(overUnderfullLinesMatch[1],
373                                                   -1);
374          }
375 
376          // Single line case
377          bool singleLine = boost::algorithm::ends_with(line, "[]");
378 
379          if (singleLine)
380          {
381             msg.erase(line.size()-2, 2);
382             boost::algorithm::trim_right(msg);
383          }
384 
385          pLogEntries->push_back(LogEntry(logFilePath,
386                                          gsl::narrow_cast<int>(calculateWrappedLine(
387                                                                   linesUnwrapped,
388                                                                   logLineNum)),
389                                          LogEntry::Box,
390                                          fileStack.currentFile(),
391                                          lineNum,
392                                          msg));
393 
394          if (singleLine)
395             continue;
396 
397          for (; it != lines.end(); it++)
398          {
399             // For multi-line case, we're looking for " []" on a line by itself
400             if (*it == " []")
401                break;
402          }
403 
404          // The iterator would be incremented by the outer for loop, must not
405          // let it go past the end! (If we did get to the end, it would
406          // mean the log file was malformed, but we still can't crash in this
407          // situation.)
408          if (it == lines.end())
409             break;
410          else
411             continue;
412       }
413 
414       fileStack.processLine(line);
415 
416       // Now see if it's an error or warning
417 
418       if (beginsWith(line, "! "))
419       {
420          std::string errorMsg = line.substr(2);
421          int lineNum = -1;
422 
423          boost::smatch match;
424          for (it++; it != lines.end(); it++)
425          {
426             if (regex_utils::search(*it, match, regexLnn))
427             {
428                lineNum = safe_convert::stringTo<int>(match[1], -1);
429                break;
430             }
431          }
432 
433          pLogEntries->push_back(LogEntry(logFilePath,
434                                          gsl::narrow_cast<int>(calculateWrappedLine(
435                                                                   linesUnwrapped,
436                                                                   logLineNum)),
437                                          LogEntry::Error,
438                                          fileStack.currentFile(),
439                                          lineNum,
440                                          errorMsg));
441 
442          // The iterator would be incremented by the outer for loop, must not
443          // let it go past the end! (If we did get to the end, it would
444          // mean the log file was malformed, but we still can't crash in this
445          // situation.)
446          if (it == lines.end())
447             break;
448          else
449             continue;
450       }
451 
452       boost::smatch warningMatch;
453       if (regex_utils::search(line, warningMatch, regexWarning))
454       {
455          std::string warningMsg = warningMatch[1];
456          int lineNum = -1;
457          while (true)
458          {
459             if (boost::algorithm::ends_with(warningMsg, "."))
460             {
461                boost::smatch warningEndMatch;
462                if (regex_utils::search(*it, warningEndMatch, regexWarningEnd))
463                {
464                   lineNum = safe_convert::stringTo<int>(warningEndMatch[1], -1);
465                }
466                break;
467             }
468 
469             if (++it == lines.end())
470                break;
471             warningMsg.append(*it);
472          }
473 
474          pLogEntries->push_back(LogEntry(logFilePath,
475                                          gsl::narrow_cast<int>(calculateWrappedLine(
476                                                                   linesUnwrapped,
477                                                                   logLineNum)),
478                                          LogEntry::Warning,
479                                          fileStack.currentFile(),
480                                          lineNum,
481                                          warningMsg));
482 
483          // The iterator would be incremented by the outer for loop, must not
484          // let it go past the end! (If we did get to the end, it would
485          // mean the log file was malformed, but we still can't crash in this
486          // situation.)
487          if (it == lines.end())
488             break;
489          else
490             continue;
491       }
492 
493       boost::smatch cStyleErrorMatch;
494       if (regex_utils::search(line, cStyleErrorMatch, regexCStyleError))
495       {
496          FilePath cstyleFile = resolveFilename(rootDir, cStyleErrorMatch[1]);
497          if (cstyleFile.exists())
498          {
499             int lineNum = safe_convert::stringTo<int>(cStyleErrorMatch[2], -1);
500             pLogEntries->push_back(LogEntry(logFilePath,
501                                             gsl::narrow_cast<int>(calculateWrappedLine(
502                                                                      linesUnwrapped,
503                                                                      logLineNum)),
504                                             LogEntry::Error,
505                                             cstyleFile,
506                                             lineNum,
507                                             cStyleErrorMatch[3]));
508          }
509       }
510    }
511 
512    return Success();
513 }
514 
parseBibtexLog(const FilePath & logFilePath,LogEntries * pLogEntries)515 Error parseBibtexLog(const FilePath& logFilePath, LogEntries* pLogEntries)
516 {
517    boost::regex re("^(.*)---line ([0-9]+) of file (.*)$");
518 
519    // get the lines
520    std::vector<std::string> lines;
521    Error error = core::readStringVectorFromFile(logFilePath, &lines, false);
522    if (error)
523       return error;
524 
525    // look for error messages
526    for (std::vector<std::string>::const_iterator it = lines.begin();
527         it != lines.end();
528         it++)
529    {
530       boost::smatch match;
531       if (regex_utils::match(*it, match, re))
532       {
533          pLogEntries->push_back(
534                LogEntry(
535                      logFilePath,
536                      gsl::narrow_cast<int>((it - lines.begin()) + 1),
537                      LogEntry::Error,
538                      texFilePath(match[3], logFilePath.getParent()),
539                      boost::lexical_cast<int>(match[2]),
540                      match[1]));
541       }
542    }
543 
544    return Success();
545 }
546 
547 } // namespace tex
548 } // namespace core
549 } // namespace rstudio
550 
551 
552 
553