1 /**
2  * Copyright (c) 2017, Timothy Stack
3  *
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * * Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * * Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  * * Neither the name of Timothy Stack nor the names of its contributors
15  * may be used to endorse or promote products derived from this software
16  * without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28  *
29  * @file text_format.cc
30  */
31 
32 #include "config.h"
33 
34 #include "pcrepp/pcrepp.hh"
35 #include "yajl/api/yajl_parse.h"
36 
37 #include "text_format.hh"
38 
detect_text_format(const char * str,size_t len)39 text_format_t detect_text_format(const char *str, size_t len)
40 {
41     // XXX This is a pretty crude way of detecting format...
42     static pcrepp PYTHON_MATCHERS = pcrepp(
43         "(?:"
44             "^\\s*def\\s+\\w+\\([^)]*\\):[^\\n]*$|"
45             "^\\s*try:[^\\n]*$"
46             ")",
47         PCRE_MULTILINE);
48 
49     static pcrepp RUST_MATCHERS = pcrepp(R"(
50 (?:
51 ^\s*use\s+[\w+:\{\}]+;$|
52 ^\s*(?:pub)?\s+(?:const|enum|fn)\s+\w+.*$|
53 ^\s*impl\s+\w+.*$
54 )
55 )",
56         PCRE_MULTILINE);
57 
58     static pcrepp JAVA_MATCHERS = pcrepp(
59         "(?:"
60         "^package\\s+|"
61         "^import\\s+|"
62         "^\\s*(?:public)?\\s*class\\s*(\\w+\\s+)*\\s*{"
63         ")",
64         PCRE_MULTILINE);
65 
66     static pcrepp C_LIKE_MATCHERS = pcrepp(
67         "(?:"
68             "^#\\s*include\\s+|"
69             "^#\\s*define\\s+|"
70             "^\\s*if\\s+\\([^)]+\\)[^\\n]*$|"
71             "^\\s*(?:\\w+\\s+)*class \\w+ {"
72             ")",
73         PCRE_MULTILINE);
74 
75     static pcrepp SQL_MATCHERS = pcrepp(
76         "(?:"
77             "select\\s+.+\\s+from\\s+|"
78             "insert\\s+into\\s+.+\\s+values"
79             ")",
80         PCRE_MULTILINE|PCRE_CASELESS);
81 
82     static pcrepp XML_MATCHERS = pcrepp(
83         "(?:"
84         R"(<\?xml(\s+\w+\s*=\s*"[^"]*")*\?>|)"
85         R"(</?\w+(\s+\w+\s*=\s*"[^"]*")*\s*>)"
86         ")",
87         PCRE_MULTILINE|PCRE_CASELESS);
88 
89     text_format_t retval = text_format_t::TF_UNKNOWN;
90     pcre_input pi(str, 0, len);
91     pcre_context_static<30> pc;
92 
93     {
94         auto_mem<yajl_handle_t> jhandle(yajl_free);
95 
96         jhandle = yajl_alloc(nullptr, nullptr, nullptr);
97         if (yajl_parse(jhandle, (unsigned char *) str, len) == yajl_status_ok) {
98             return text_format_t::TF_JSON;
99         }
100     }
101 
102     if (PYTHON_MATCHERS.match(pc, pi)) {
103         return text_format_t::TF_PYTHON;
104     }
105 
106     if (RUST_MATCHERS.match(pc, pi)) {
107         return text_format_t::TF_RUST;
108     }
109 
110     if (JAVA_MATCHERS.match(pc, pi)) {
111         return text_format_t::TF_JAVA;
112     }
113 
114     if (C_LIKE_MATCHERS.match(pc, pi)) {
115         return text_format_t::TF_C_LIKE;
116     }
117 
118     if (SQL_MATCHERS.match(pc, pi)) {
119         return text_format_t::TF_SQL;
120     }
121 
122     if (XML_MATCHERS.match(pc, pi)) {
123         return text_format_t::TF_XML;
124     }
125 
126     return retval;
127 }
128