xref: /reactos/sdk/lib/ucrt/startup/argv_parsing.cpp (revision 04e0dc4a)
1 /***
2 *stdargv.c - standard & wildcard _setargv routine
3 *
4 *       Copyright (c) Microsoft Corporation. All rights reserved.
5 *
6 *Purpose:
7 *       processes program command line, with or without wildcard expansion
8 *
9 *******************************************************************************/
10 
11 #include <corecrt_internal.h>
12 #include <corecrt_internal_traits.h>
13 #include <limits.h>
14 #include <mbstring.h>
15 #include <stdlib.h>
16 
17 
18 
19 // In the function below, we need to ensure that we've initialized the mbc table
20 // before we start performing character transformations.
do_locale_initialization(char)21 static void do_locale_initialization(char)    throw() { __acrt_initialize_multibyte(); }
do_locale_initialization(wchar_t)22 static void do_locale_initialization(wchar_t) throw() { /* no-op */                    }
23 
get_command_line(char)24 static char*    get_command_line(char)    throw() { return _acmdln; }
get_command_line(wchar_t)25 static wchar_t* get_command_line(wchar_t) throw() { return _wcmdln; }
26 
get_argv(char)27 static char**&    get_argv(char)    throw() { return __argv;  }
get_argv(wchar_t)28 static wchar_t**& get_argv(wchar_t) throw() { return __wargv; }
29 
expand_argv_wildcards(_In_z_ char ** const argv,_Out_ _Deref_post_z_ char *** const expanded_argv)30 static errno_t expand_argv_wildcards(
31     _In_z_               char**  const argv,
32     _Out_ _Deref_post_z_ char*** const expanded_argv) throw()
33 {
34     return __acrt_expand_narrow_argv_wildcards(argv, expanded_argv);
35 }
36 
expand_argv_wildcards(_In_z_ wchar_t ** const argv,_Out_ _Deref_post_z_ wchar_t *** const expanded_argv)37 static errno_t expand_argv_wildcards(
38     _In_z_               wchar_t**  const argv,
39     _Out_ _Deref_post_z_ wchar_t*** const expanded_argv) throw()
40 {
41     return __acrt_expand_wide_argv_wildcards(argv, expanded_argv);
42 }
43 
44 
45 
/***
*static void parse_command_line(cmdstart, argv, args, argument_count, character_count)
*
*Purpose:
*       Parses the command line and sets up the argv[] array.
*       On entry, cmdstart should point to the command line,
*       argv should point to memory for the argv array, and args
*       should point to memory in which to place the text of the
*       arguments.  If argv and args are nullptr, then no storing
*       (only counting) is done.  On exit, *argument_count has the
*       number of arguments (plus one for a final nullptr argument),
*       and *character_count has the number of Character elements
*       (including null terminators) used in the buffer pointed to
*       by args.
*
*Entry:
*       Character *cmdstart - pointer to command line of the form
*           <progname><whitespace><args><nul>
*           (the program name may be enclosed in double quotes)
*       Character **argv - where to build argv array; nullptr means don't
*                       build array
*       Character *args - where to place argument text; nullptr means don't
*                       store text
*
*Exit:
*       no return value
*       size_t *argument_count - returns number of argv entries created
*       size_t *character_count - number of Character elements used in
*                       the args buffer
*
*Exceptions:
*
*******************************************************************************/
76 
77 
78 // should_copy_another_character helper functions
79 // should_copy_another_character is *ONLY* checking for DBCS lead bytes to see if there
80 // might be a following trail byte.  This works because the callers are only concerned
81 // about escaped quote sequences and other codepages aren't using those quotes.
should_copy_another_character(char const c)82 static bool __cdecl should_copy_another_character(char const c) throw()
83 {
84     // This is OK for UTF-8 as a quote is never a trail byte.
85     return _ismbblead(c) != 0;
86 }
87 
should_copy_another_character(wchar_t)88 static bool __cdecl should_copy_another_character(wchar_t) throw()
89 {
90     // This is OK for UTF-16 as a quote is never part of a surrogate pair.
91     return false;
92 }
93 
94 template <typename Character>
parse_command_line(Character * cmdstart,Character ** argv,Character * args,size_t * argument_count,size_t * character_count)95 static void __cdecl parse_command_line(
96     Character*  cmdstart,
97     Character** argv,
98     Character*  args,
99     size_t*     argument_count,
100     size_t*     character_count
101     ) throw()
102 {
103     *character_count = 0;
104     *argument_count  = 1; // We'll have at least the program name
105 
106     Character c;
107     int copy_character;                   /* 1 = copy char to *args */
108     unsigned numslash;              /* num of backslashes seen */
109 
110     /* first scan the program name, copy it, and count the bytes */
111     Character* p = cmdstart;
112     if (argv)
113         *argv++ = args;
114 
115     // A quoted program name is handled here. The handling is much
116     // simpler than for other arguments. Basically, whatever lies
117     // between the leading double-quote and next one, or a terminal null
118     // character is simply accepted. Fancier handling is not required
119     // because the program name must be a legal NTFS/HPFS file name.
120     // Note that the double-quote characters are not copied, nor do they
121     // contribute to character_count.
122     bool in_quotes = false;
123     do
124     {
125         if (*p == '"')
126         {
127             in_quotes = !in_quotes;
128             c = *p++;
129             continue;
130         }
131 
132         ++*character_count;
133         if (args)
134             *args++ = *p;
135 
136         c = *p++;
137 
138         if (should_copy_another_character(c))
139         {
140             ++*character_count;
141             if (args)
142                 *args++ = *p; // Copy 2nd byte too
143             ++p; // skip over trail byte
144         }
145     }
146     while (c != '\0' && (in_quotes || (c != ' ' && c != '\t')));
147 
148     if (c == '\0')
149     {
150         p--;
151     }
152     else
153     {
154         if (args)
155             *(args - 1) = '\0';
156     }
157 
158     in_quotes = false;
159 
160     // Loop on each argument
161     for (;;)
162     {
163         if (*p)
164         {
165             while (*p == ' ' || *p == '\t')
166                 ++p;
167         }
168 
169         if (*p == '\0')
170             break; // End of arguments
171 
172         // Scan an argument:
173         if (argv)
174             *argv++ = args;
175 
176         ++*argument_count;
177 
178         // Loop through scanning one argument:
179         for (;;)
180         {
181             copy_character = 1;
182 
183             // Rules:
184             // 2N     backslashes   + " ==> N backslashes and begin/end quote
185             // 2N + 1 backslashes   + " ==> N backslashes + literal "
186             // N      backslashes       ==> N backslashes
187             numslash = 0;
188 
189             while (*p == '\\')
190             {
191                 // Count number of backslashes for use below
192                 ++p;
193                 ++numslash;
194             }
195 
196             if (*p == '"')
197             {
198                 // if 2N backslashes before, start/end quote, otherwise
199                 // copy literally:
200                 if (numslash % 2 == 0)
201                 {
202                     if (in_quotes && p[1] == '"')
203                     {
204                         p++; // Double quote inside quoted string
205                     }
206                     else
207                     {
208                         // Skip first quote char and copy second:
209                         copy_character = 0; // Don't copy quote
210                         in_quotes = !in_quotes;
211                     }
212                 }
213 
214                 numslash /= 2;
215             }
216 
217             // Copy slashes:
218             while (numslash--)
219             {
220                 if (args)
221                     *args++ = '\\';
222                 ++*character_count;
223             }
224 
225             // If at end of arg, break loop:
226             if (*p == '\0' || (!in_quotes && (*p == ' ' || *p == '\t')))
227                 break;
228 
229             // Copy character into argument:
230             if (copy_character)
231             {
232                 if (args)
233                     *args++ = *p;
234 
235                 if (should_copy_another_character(*p))
236                 {
237                     ++p;
238                     ++*character_count;
239 
240                     if (args)
241                         *args++ = *p;
242                 }
243 
244                 ++*character_count;
245             }
246 
247             ++p;
248         }
249 
250         // Null-terminate the argument:
251         if (args)
252             *args++ = '\0'; // Terminate the string
253 
254         ++*character_count;
255     }
256 
257     // We put one last argument in -- a null pointer:
258     if (argv)
259         *argv++ = nullptr;
260 
261     ++*argument_count;
262 }
263 
264 
265 
__acrt_allocate_buffer_for_argv(size_t const argument_count,size_t const character_count,size_t const character_size)266 extern "C" unsigned char* __cdecl __acrt_allocate_buffer_for_argv(
267     size_t const argument_count,
268     size_t const character_count,
269     size_t const character_size
270     )
271 {
272     if (argument_count >= SIZE_MAX / sizeof(void*))
273         return nullptr;
274 
275     if (character_count >= SIZE_MAX / character_size)
276         return nullptr;
277 
278     size_t const argument_array_size  = argument_count  * sizeof(void*);
279     size_t const character_array_size = character_count * character_size;
280 
281     if (SIZE_MAX - argument_array_size <= character_array_size)
282         return nullptr;
283 
284     size_t const total_size = argument_array_size + character_array_size;
285     __crt_unique_heap_ptr<unsigned char> buffer(_calloc_crt_t(unsigned char, total_size));
286     if (!buffer)
287         return nullptr;
288 
289     return buffer.detach();
290 }
291 
292 
293 
294 /***
295 *_setargv, __setargv - set up "argc" and "argv" for C programs
296 *
297 *Purpose:
298 *       Read the command line and create the argv array for C
299 *       programs.
300 *
301 *Entry:
302 *       Arguments are retrieved from the program command line,
303 *       pointed to by _acmdln.
304 *
305 *Exit:
306 *       Returns 0 if successful, -1 if memory allocation failed.
307 *       "argv" points to a null-terminated list of pointers to ASCIZ
308 *       strings, each of which is an argument from the command line.
309 *       "argc" is the number of arguments.  The strings are copied from
310 *       the environment segment into space allocated on the heap/stack.
311 *       The list of pointers is also located on the heap or stack.
312 *       _pgmptr points to the program name.
313 *
314 *Exceptions:
315 *       Terminates with out of memory error if no memory to allocate.
316 *
317 *******************************************************************************/
318 template <typename Character>
common_configure_argv(_crt_argv_mode const mode)319 static errno_t __cdecl common_configure_argv(_crt_argv_mode const mode) throw()
320 {
321     typedef __crt_char_traits<Character> traits;
322 
323     if (mode == _crt_argv_no_arguments)
324     {
325         return 0;
326     }
327 
328     _VALIDATE_RETURN_ERRCODE(
329         mode == _crt_argv_expanded_arguments ||
330         mode == _crt_argv_unexpanded_arguments, EINVAL);
331 
332     do_locale_initialization(Character());
333 
334 
335     static Character program_name[MAX_PATH + 1];
336     traits::get_module_file_name(nullptr, program_name, MAX_PATH);
337     traits::set_program_name(&program_name[0]);
338 
339     // If there's no command line at all, then use the program name as the
340     // command line to parse, so that argv[0] is initialized with the program
341     // name.  (This won't happen when the program is run by cmd.exe, but it
342     // could happen if the program is spawned via some other means.)
343     Character* const raw_command_line = get_command_line(Character());
344     Character* const command_line = raw_command_line == nullptr || raw_command_line[0] == '\0'
345         ? program_name
346         : raw_command_line;
347 
348     size_t argument_count  = 0;
349     size_t character_count = 0;
350     parse_command_line(
351         command_line,
352         static_cast<Character**>(nullptr),
353         static_cast<Character*>(nullptr),
354         &argument_count,
355         &character_count);
356 
357     __crt_unique_heap_ptr<unsigned char> buffer(__acrt_allocate_buffer_for_argv(
358         argument_count,
359         character_count,
360         sizeof(Character)));
361 
362     _VALIDATE_RETURN_ERRCODE_NOEXC(buffer, ENOMEM);
363 
364     Character** const first_argument = reinterpret_cast<Character**>(buffer.get());
365     Character*  const first_string   = reinterpret_cast<Character*>(buffer.get() + argument_count * sizeof(Character*));
366 
367     parse_command_line(command_line, first_argument, first_string, &argument_count, &character_count);
368 
369     // If we are not expanding wildcards, then we are done...
370     if (mode == _crt_argv_unexpanded_arguments)
371     {
372         __argc = static_cast<int>(argument_count - 1);
373         get_argv(Character()) = reinterpret_cast<Character**>(buffer.detach());
374         return 0;
375     }
376 
377     // ... otherwise, we try to do the wildcard expansion:
378     __crt_unique_heap_ptr<Character*> expanded_argv;
379     errno_t const argv_expansion_status = expand_argv_wildcards(first_argument, expanded_argv.get_address_of());
380     if (argv_expansion_status != 0)
381         return argv_expansion_status;
382 
383     __argc = [&]()
384     {
385         size_t n = 0;
386         for (auto it = expanded_argv.get(); *it; ++it, ++n) { }
387         return static_cast<int>(n);
388     }();
389 
390     get_argv(Character()) = expanded_argv.detach();
391     return 0;
392 }
393 
394 
395 
_configure_narrow_argv(_crt_argv_mode const mode)396 extern "C" errno_t __cdecl _configure_narrow_argv(_crt_argv_mode const mode)
397 {
398     return common_configure_argv<char>(mode);
399 }
400 
_configure_wide_argv(_crt_argv_mode const mode)401 extern "C" errno_t __cdecl _configure_wide_argv(_crt_argv_mode const mode)
402 {
403     return common_configure_argv<wchar_t>(mode);
404 }
405