1 /**
2 * Copyright (c) 2017, Timothy Stack
3 *
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * * Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 * * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 * * Neither the name of Timothy Stack nor the names of its contributors
15 * may be used to endorse or promote products derived from this software
16 * without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ''AS IS'' AND ANY
19 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 *
29 * @file text_format.cc
30 */
31
32 #include "config.h"
33
34 #include "pcrepp/pcrepp.hh"
35 #include "yajl/api/yajl_parse.h"
36
37 #include "text_format.hh"
38
detect_text_format(const char * str,size_t len)39 text_format_t detect_text_format(const char *str, size_t len)
40 {
41 // XXX This is a pretty crude way of detecting format...
42 static pcrepp PYTHON_MATCHERS = pcrepp(
43 "(?:"
44 "^\\s*def\\s+\\w+\\([^)]*\\):[^\\n]*$|"
45 "^\\s*try:[^\\n]*$"
46 ")",
47 PCRE_MULTILINE);
48
49 static pcrepp RUST_MATCHERS = pcrepp(R"(
50 (?:
51 ^\s*use\s+[\w+:\{\}]+;$|
52 ^\s*(?:pub)?\s+(?:const|enum|fn)\s+\w+.*$|
53 ^\s*impl\s+\w+.*$
54 )
55 )",
56 PCRE_MULTILINE);
57
58 static pcrepp JAVA_MATCHERS = pcrepp(
59 "(?:"
60 "^package\\s+|"
61 "^import\\s+|"
62 "^\\s*(?:public)?\\s*class\\s*(\\w+\\s+)*\\s*{"
63 ")",
64 PCRE_MULTILINE);
65
66 static pcrepp C_LIKE_MATCHERS = pcrepp(
67 "(?:"
68 "^#\\s*include\\s+|"
69 "^#\\s*define\\s+|"
70 "^\\s*if\\s+\\([^)]+\\)[^\\n]*$|"
71 "^\\s*(?:\\w+\\s+)*class \\w+ {"
72 ")",
73 PCRE_MULTILINE);
74
75 static pcrepp SQL_MATCHERS = pcrepp(
76 "(?:"
77 "select\\s+.+\\s+from\\s+|"
78 "insert\\s+into\\s+.+\\s+values"
79 ")",
80 PCRE_MULTILINE|PCRE_CASELESS);
81
82 static pcrepp XML_MATCHERS = pcrepp(
83 "(?:"
84 R"(<\?xml(\s+\w+\s*=\s*"[^"]*")*\?>|)"
85 R"(</?\w+(\s+\w+\s*=\s*"[^"]*")*\s*>)"
86 ")",
87 PCRE_MULTILINE|PCRE_CASELESS);
88
89 text_format_t retval = text_format_t::TF_UNKNOWN;
90 pcre_input pi(str, 0, len);
91 pcre_context_static<30> pc;
92
93 {
94 auto_mem<yajl_handle_t> jhandle(yajl_free);
95
96 jhandle = yajl_alloc(nullptr, nullptr, nullptr);
97 if (yajl_parse(jhandle, (unsigned char *) str, len) == yajl_status_ok) {
98 return text_format_t::TF_JSON;
99 }
100 }
101
102 if (PYTHON_MATCHERS.match(pc, pi)) {
103 return text_format_t::TF_PYTHON;
104 }
105
106 if (RUST_MATCHERS.match(pc, pi)) {
107 return text_format_t::TF_RUST;
108 }
109
110 if (JAVA_MATCHERS.match(pc, pi)) {
111 return text_format_t::TF_JAVA;
112 }
113
114 if (C_LIKE_MATCHERS.match(pc, pi)) {
115 return text_format_t::TF_C_LIKE;
116 }
117
118 if (SQL_MATCHERS.match(pc, pi)) {
119 return text_format_t::TF_SQL;
120 }
121
122 if (XML_MATCHERS.match(pc, pi)) {
123 return text_format_t::TF_XML;
124 }
125
126 return retval;
127 }
128