1 /*
2  * CsvParser.hpp
3  *
4  * Copyright (C) 2021 by RStudio, PBC
5  *
6  * Unless you have received this program directly from RStudio pursuant
7  * to the terms of a commercial license agreement with RStudio, then
8  * this program is licensed to you under the terms of version 3 of the
9  * GNU Affero General Public License. This program is distributed WITHOUT
10  * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11  * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12  * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13  *
14  */
15 
16 #ifndef CSV_PARSER_HPP
17 #define CSV_PARSER_HPP
18 
19 #include <string>
20 #include <vector>
21 #include <boost/algorithm/string/replace.hpp>
22 
23 namespace rstudio {
24 namespace core {
25 namespace text {
26 
/*
Parses up to one line (record) of CSV data. Empty lines will be skipped.

Parameters:
   begin, end        The range of characters to parse.
   allowMissingEOL   When false (the default), a record is only returned
                     once its terminating '\n' has been seen. When true,
                     any data remaining after the last line break is
                     returned as a final record, and the returned
                     iterator is end.

The value returned is a pair consisting of the fields of the line that
was successfully parsed and an iterator value that indicates where
parsing should begin next time.

If less than one line of CSV data is available, an empty vector
is returned.

Field handling (the quoting rules follow RFC4180):
   - Fields are separated by ','.
   - A '"' opens a quoted section in which ',', '\r' and '\n' are taken
     literally and "" stands for a single '"'. Any text that preceded
     the opening quote within the same field is discarded.
   - Outside of quotes '\r' is dropped and '\n' ends the record.
   - A line consisting solely of a quoted empty field ("") is a record
     with one empty field; it is not treated as a blank line.

Note that if parseCsvLine is called in a loop, the termination
condition should be that the returned vector is empty, NOT that
the returned iterator == end. (When allowMissingEOL is false and the
data are malformed or do not end with a line break, the returned
iterator will never move past the beginning of the last line.)
*/

template <typename InputIterator>
std::pair<std::vector<std::string>, InputIterator> parseCsvLine(
      InputIterator begin,
      InputIterator end,
      bool allowMissingEOL = false)
{
   typedef std::pair<std::vector<std::string>, InputIterator> Result;

   std::vector<std::string> line;
   std::string element;

   bool inQuote = false;

   // true once the field being accumulated has opened a quoted section;
   // this is what distinguishes a quoted empty field ("") from no field
   bool quotedField = false;

   InputIterator pos = begin;
   while (pos != end)
   {
      bool noIncrement = false;

      if (inQuote)
      {
         if (*pos == '"')
         {
            // look one character ahead: "" is an escaped quote, anything
            // else (including end of input) means the quote is closing
            if (++pos != end)
            {
               if (*pos == '"')
               {
                  element.push_back('"');
                  ++pos;
                  continue;
               }
            }

            // pos already points at the character following the closing
            // quote, so it must be examined on the next pass, not skipped
            noIncrement = true;
            inQuote = false;
         }
         else
         {
            element.push_back(*pos);
         }
      }
      else // not in quote
      {
         if (*pos == '"')
         {
            // starting a quote
            element.clear();
            inQuote = true;
            quotedField = true;
         }
         else if (*pos == ',')
         {
            line.push_back(element);
            element.clear();
            quotedField = false;
         }
         else if (*pos == '\r')
         {
            // ignore--expect a \n next
         }
         else if (*pos == '\n')
         {
            if (!element.empty() || !line.empty() || quotedField)
            {
               line.push_back(element);
               element.clear();
               quotedField = false;
            }

            // the next record (or the next parse attempt) starts just
            // past this line break
            begin = ++pos;
            noIncrement = true;

            // don't return blank lines
            if (!line.empty())
               return Result(line, begin);
         }
         else
         {
            element.push_back(*pos);
         }
      }

      if (!noIncrement)
         ++pos;
   }

   // if we got here, we failed to find a (terminating) newline
   if (allowMissingEOL)
   {
      // only emit a final record if there is one; otherwise return an
      // empty vector so that callers looping until the result is empty
      // terminate instead of receiving a spurious [""] record forever
      if (!element.empty() || !line.empty() || quotedField)
         line.push_back(element);

      return Result(line, end);
   }

   return Result(std::vector<std::string>(), begin);
}
140 
// Convenience overload of parseCsvLine that delivers the parsed fields
// through an out-parameter.
//
// begin, end and allowMissingEOL are forwarded unchanged to the
// three-argument parseCsvLine. The fields of the parsed line replace the
// previous contents of *pParsedCsv (pParsedCsv is dereferenced
// unconditionally, so it must point to a valid vector). The value returned
// is the iterator at which parsing should begin next time.
template <typename InputIterator>
InputIterator parseCsvLine(InputIterator begin,
                           InputIterator end,
                           bool allowMissingEOL,
                           std::vector<std::string>* pParsedCsv)
{
   typedef std::pair<std::vector<std::string>, InputIterator> ParseResult;

   const ParseResult result = parseCsvLine(begin, end, allowMissingEOL);

   // hand the fields to the caller, then report where to resume
   *pParsedCsv = result.first;
   return result.second;
}
152 
153 
154 // Encodes a vector of string values to a line of RFC4180 CSV.
155 //
156 // Note that it's the caller's responsibility to add a terminating newline.
encodeCsvLine(const std::vector<std::string> & values)157 inline std::string encodeCsvLine(const std::vector<std::string>& values)
158 {
159    std::string line;
160    for (unsigned i = 0; i < values.size(); i++)
161    {
162       // escape quotes if needed
163       std::string val(values[i]);
164       boost::algorithm::replace_all(val, "\"", "\"\"");
165 
166       // add to the line, with a comma if there are additional values
167       line.append("\"" + val + "\"");
168       if (i < values.size() - 1)
169          line.append(",");
170    }
171    return line;
172 }
173 
174 } // namespace text
175 } // namespace core
176 } // namespace rstudio
177 
178 #endif // CSV_PARSER_HPP
179