1 
2 /******************************************************************************
3 * MODULE     : python_language.cpp
4 * DESCRIPTION: the python language
5 * COPYRIGHT  : (C) 2014  François Poulain
6 *******************************************************************************
7 * This software falls under the GNU general public license and comes WITHOUT
8 * ANY WARRANTY WHATSOEVER. See the file $TEXMACS_PATH/LICENSE for more details.
9 * If you don't have this file, write to the Free Software Foundation, Inc.,
10 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
11 ******************************************************************************/
12 
13 #include "analyze.hpp"
14 #include "impl_language.hpp"
15 #include "scheme.hpp"
16 
17 static void parse_escaped_char (string s, int& pos);
18 static void parse_number (string s, int& pos);
19 static void parse_various_number (string s, int& pos);
20 static void parse_alpha (string s, int& pos);
21 static inline bool belongs_to_identifier (char c);
22 
python_language_rep(string name)23 python_language_rep::python_language_rep (string name):
24   language_rep (name), colored ("") {}
25 
26 text_property
advance(tree t,int & pos)27 python_language_rep::advance (tree t, int& pos) {
28   string s= t->label;
29   if (pos==N(s))
30     return &tp_normal_rep;
31   char c= s[pos];
32   if (c == ' ') {
33     pos++;
34     return &tp_space_rep;
35   }
36   if (c == '\\') {
37     parse_escaped_char (s, pos);
38     return &tp_normal_rep;
39   }
40   if (pos+2 < N(s) && s[pos] == '0' &&
41        (s[pos+1] == 'x' || s[pos+1] == 'X' ||
42         s[pos+1] == 'o' || s[pos+1] == 'O' ||
43         s[pos+1] == 'b' || s[pos+1] == 'B')) {
44     parse_various_number (s, pos);
45     return &tp_normal_rep;
46   }
47   if ((c >= '0' && c <= '9') ||
48       (c == '.' && pos+1 < N(s) && s[pos+1] >= '0' && s[pos+1] <= '9')) {
49     parse_number (s, pos);
50     return &tp_normal_rep;
51   }
52   if (belongs_to_identifier (c)) {
53     parse_alpha (s, pos);
54     return &tp_normal_rep;
55   }
56   tm_char_forwards (s, pos);
57   return &tp_normal_rep;
58 }
59 
60 array<int>
get_hyphens(string s)61 python_language_rep::get_hyphens (string s) {
62   int i;
63   array<int> penalty (N(s)+1);
64   penalty[0]= HYPH_INVALID;
65   for (i=1; i<N(s); i++)
66     if (s[i-1] == '-' && is_alpha (s[i]))
67       penalty[i]= HYPH_STD;
68     else penalty[i]= HYPH_INVALID;
69   penalty[i]= HYPH_INVALID;
70   return penalty;
71 }
72 
73 void
hyphenate(string s,int after,string & left,string & right)74 python_language_rep::hyphenate (
75   string s, int after, string& left, string& right)
76 {
77   left = s (0, after);
78   right= s (after, N(s));
79 }
80 
81 static void
python_color_setup_operator_openclose(hashmap<string,string> & t)82 python_color_setup_operator_openclose (hashmap<string, string> & t) {
83   string c= "operator_openclose";
84   t ("{")= c;
85   t ("[")= c;
86   t ("(")= c;
87   t (")")= c;
88   t ("]")= c;
89   t ("}")= c;
90 }
91 
92 static void
python_color_setup_constants(hashmap<string,string> & t)93 python_color_setup_constants (hashmap<string, string> & t) {
94   string c= "constant";
95   t ("Ellipsis")= c;
96   t ("False")= c;
97   t ("None")= c;
98   t ("NotImplemented")= c;
99   t ("True")= c;
100   t ("__debug__")= c;
101   t ("__import__")= c;
102   t ("abs")= c;
103   t ("all")= c;
104   t ("any")= c;
105   t ("apply")= c;
106   t ("ascii")= c;
107   t ("basestring")= c;
108   t ("bin")= c;
109   t ("bool")= c;
110   t ("buffer")= c;
111   t ("bytearray")= c;
112   t ("bytes")= c;
113   t ("callable")= c;
114   t ("chr")= c;
115   t ("classmethod")= c;
116   t ("cmp")= c;
117   t ("coerce")= c;
118   t ("compile")= c;
119   t ("complex")= c;
120   t ("delattr")= c;
121   t ("dict")= c;
122   t ("dir")= c;
123   t ("divmod")= c;
124   t ("enumerate")= c;
125   t ("eval")= c;
126   t ("execfile")= c;
127   t ("file")= c;
128   t ("filter")= c;
129   t ("float")= c;
130   t ("format")= c;
131   t ("frozenset")= c;
132   t ("getattr")= c;
133   t ("globals")= c;
134   t ("hasattr")= c;
135   t ("hash")= c;
136   t ("help")= c;
137   t ("hex")= c;
138   t ("id")= c;
139   t ("input")= c;
140   t ("int")= c;
141   t ("intern")= c;
142   t ("isinstance")= c;
143   t ("issubclass")= c;
144   t ("iter")= c;
145   t ("len")= c;
146   t ("list")= c;
147   t ("locals")= c;
148   t ("long")= c;
149   t ("map")= c;
150   t ("max")= c;
151   t ("memoryview")= c;
152   t ("min")= c;
153   t ("next")= c;
154   t ("nonlocal")= c;
155   t ("object")= c;
156   t ("oct")= c;
157   t ("open")= c;
158   t ("ord")= c;
159   t ("pow")= c;
160   t ("property")= c;
161   t ("range")= c;
162   t ("raw_input")= c;
163   t ("reduce")= c;
164   t ("reload")= c;
165   t ("repr")= c;
166   t ("reversed")= c;
167   t ("round")= c;
168   t ("set")= c;
169   t ("setattr")= c;
170   t ("slice")= c;
171   t ("sorted")= c;
172   t ("staticmethod")= c;
173   t ("str")= c;
174   t ("sum")= c;
175   t ("super")= c;
176   t ("tuple")= c;
177   t ("type")= c;
178   t ("unichr")= c;
179   t ("unicode")= c;
180   t ("vars")= c;
181   t ("xrange")= c;
182   t ("zip")= c;
183 }
184 
185 static void
python_color_setup_constant_exceptions(hashmap<string,string> & t)186 python_color_setup_constant_exceptions (hashmap<string, string> & t) {
187   string c= "constant";
188   t ("BaseException")= c;
189   t ("Exception")= c;
190   t ("ArithmeticError")= c;
191   t ("EnvironmentError")= c;
192   t ("LookupError")= c;
193   t ("StandardError")= c;
194   t ("AssertionError")= c;
195   t ("AttributeError")= c;
196   t ("BufferError")= c;
197   t ("EOFError")= c;
198   t ("FloatingPointError")= c;
199   t ("GeneratorExit")= c;
200   t ("IOError")= c;
201   t ("ImportError")= c;
202   t ("IndentationError")= c;
203   t ("IndexError")= c;
204   t ("KeyError")= c;
205   t ("KeyboardInterrupt")= c;
206   t ("MemoryError")= c;
207   t ("NameError")= c;
208   t ("NotImplementedError")= c;
209   t ("OSError")= c;
210   t ("OverflowError")= c;
211   t ("ReferenceError")= c;
212   t ("RuntimeError")= c;
213   t ("StopIteration")= c;
214   t ("SyntaxError")= c;
215   t ("SystemError")= c;
216   t ("SystemExit")= c;
217   t ("TabError")= c;
218   t ("TypeError")= c;
219   t ("UnboundLocalError")= c;
220   t ("UnicodeError")= c;
221   t ("UnicodeDecodeError")= c;
222   t ("UnicodeEncodeError")= c;
223   t ("UnicodeTranslateError")= c;
224   t ("ValueError")= c;
225   t ("VMSError")= c;
226   t ("WindowsError")= c;
227   t ("ZeroDivisionError")= c;
228   t ("BytesWarning")= c;
229   t ("DeprecationWarning")= c;
230   t ("FutureWarning")= c;
231   t ("ImportWarning")= c;
232   t ("PendingDeprecationWarning")= c;
233   t ("RuntimeWarning")= c;
234   t ("SyntaxWarning")= c;
235   t ("UnicodeWarning")= c;
236   t ("UserWarning")= c;
237   t ("Warning")= c;
238 }
239 
240 static void
python_color_setup_declare_class(hashmap<string,string> & t)241 python_color_setup_declare_class (hashmap<string, string> & t) {
242   string c= "declare_type";
243   t ("class")= c;
244 }
245 
246 static void
python_color_setup_declare_function(hashmap<string,string> & t)247 python_color_setup_declare_function (hashmap<string, string> & t) {
248   string c= "declare_function";
249   t ("def")= c;
250   t ("lambda")= c;
251 }
252 
253 static void
python_color_setup_keywords(hashmap<string,string> & t)254 python_color_setup_keywords (hashmap<string, string> & t) {
255   string c= "keyword";
256   t ("as")= c;
257   t ("del")= c;
258   t ("finally")= c;
259   t ("from")= c;
260   t ("global")= c;
261   t ("import")= c;
262   t ("in")= c;
263   t ("is")= c;
264   t ("with")= c;
265 }
266 
267 static void
python_color_setup_keywords_conditional(hashmap<string,string> & t)268 python_color_setup_keywords_conditional (hashmap<string, string> & t) {
269   string c= "keyword_conditional";
270   t ("break")= c;
271   t ("continue")= c;
272   t ("elif")= c;
273   t ("else")= c;
274   t ("for")= c;
275   t ("if")= c;
276   t ("while")= c;
277 }
278 
279 static void
python_color_setup_keywords_control(hashmap<string,string> & t)280 python_color_setup_keywords_control (hashmap<string, string> & t) {
281   string c= "keyword_control";
282   t ("assert")= c;
283   t ("except")= c;
284   t ("exec")= c;
285   t ("pass")= c;
286   t ("print")= c;
287   t ("raise")= c;
288   t ("return")= c;
289   t ("try")= c;
290   t ("yield")= c;
291 }
292 
293 static void
python_color_setup_operator(hashmap<string,string> & t)294 python_color_setup_operator (hashmap<string, string>& t) {
295   string c= "operator";
296   t ("and")= c;
297   t ("not")= c;
298   t ("or")= c;
299 
300   t ("+")= c;
301   t ("-")= c;
302   t ("/")= c;
303   t ("*")= c;
304   t ("**")= c;
305   t ("//")= c;
306   t ("%")= c;
307   t ("|")= c;
308   t ("&")= c;
309   t ("^")= c;
310   t ("<less><less>")= c;
311   t ("<gtr><gtr>")= c;
312 
313   t ("==")= c;
314   t ("!=")= c;
315   t ("<less><gtr>")= c;
316   t ("<less>")= c;
317   t ("<gtr>")= c;
318   t ("<less>=")= c;
319   t ("<gtr>=")= c;
320 
321   t ("=")= c;
322 
323   t ("+=")= c;
324   t ("-=")= c;
325   t ("/=")= c;
326   t ("*=")= c;
327   t ("**=")= c;
328   t ("//=")= c;
329   t ("%=")= c;
330   t ("|=")= c;
331   t ("&=")= c;
332   t ("^=")= c;
333   t ("<less><less>=")= c;
334   t ("<gtr><gtr>=")= c;
335 
336   t ("~")= c;
337 }
338 
339 static void
python_color_setup_operator_special(hashmap<string,string> & t)340 python_color_setup_operator_special (hashmap<string, string> & t) {
341   string c= "operator_special";
342   t (":")= c;
343 }
344 
345 static void
python_color_setup_operator_decoration(hashmap<string,string> & t)346 python_color_setup_operator_decoration (hashmap<string, string> & t) {
347   string c= "operator_decoration";
348   t ("@")= c;
349 }
350 
351 static void
python_color_setup_operator_field(hashmap<string,string> & t)352 python_color_setup_operator_field (hashmap<string, string> & t) {
353   t (".")= "operator_field";
354 }
355 
// True when c may appear inside an identifier: an ASCII letter, an ASCII
// digit or the underscore.
static inline bool
belongs_to_identifier (char c) {
  if (c == '_') return true;
  if (c >= '0' && c <= '9') return true;
  if (c >= 'A' && c <= 'Z') return true;
  return c >= 'a' && c <= 'z';
}
363 
// True when c is a hexadecimal digit: 0-9, A-F or a-f.
static inline bool
is_hex_number (char c) {
  // The three ranges are tested from the highest character codes downwards.
  if (c >= 'a') return c <= 'f';
  if (c >= 'A') return c <= 'F';
  return c >= '0' && c <= '9';
}
368 
// True when c is a decimal digit.
static inline bool
is_number (char c) {
  if (c < '0') return false;
  return c <= '9';
}
373 
374 static void
parse_identifier(hashmap<string,string> & t,string s,int & pos)375 parse_identifier (hashmap<string, string>& t, string s, int& pos) {
376   int i=pos;
377   if (pos >= N(s)) return;
378   if (is_number (s[i])) return;
379   while (i<N(s) && belongs_to_identifier (s[i])) i++;
380   if (!(t->contains (s (pos, i)))) pos= i;
381 }
382 
383 static void
parse_alpha(string s,int & pos)384 parse_alpha (string s, int& pos) {
385   static hashmap<string,string> empty;
386   parse_identifier (empty, s, pos);
387 }
388 
389 static void
parse_blanks(string s,int & pos)390 parse_blanks (string s, int& pos) {
391   while (pos<N(s) && (s[pos] == ' ' || s[pos] == '\t')) pos++;
392 }
393 
394 static void
parse_escaped_char(string s,int & pos)395 parse_escaped_char (string s, int& pos) {
396   int n= N(s), i= pos++;
397   if (i+2 >= n) return;
398   if (s[i] != '\\')
399     return;
400   i++;
401   if (test (s, i, "newline"))
402     pos+= 7;
403   else if (s[i] == '\\' || s[i] == '\'' || s[i] == '\"' ||
404            s[i] == 'a'  || s[i] == 'b'  || s[i] == 'f'  ||
405            s[i] == 'n'  || s[i] == 'r'  || s[i] == 't'  ||
406            s[i] == 'N'  || s[i] == 'v')
407     pos+= 1;
408   else if (s[i] == 'o'  || s[i] == 'x')
409     pos+= 3;
410   else if (s[i] == 'u')
411     pos+= 5;
412   else if (s[i] == 'U')
413     pos+= 9;
414   return;
415 }
416 
417 static bool
parse_string(string s,int & pos,bool force)418 parse_string (string s, int& pos, bool force) {
419   int n= N(s);
420   static string delim;
421   if (pos >= n) return false;
422   if (test (s, pos, "\"\"\"") || test (s, pos, "\'\'\'")) {
423     delim= s(pos, pos+3);
424     pos+= N(delim);
425   }
426   else if (s[pos] == '\"' || s[pos] == '\'') {
427     delim= s(pos, pos+1);
428     pos+= N(delim);
429   }
430   else if (!force)
431     return false;
432   while (pos<n && !test (s, pos, delim)) {
433     if (s[pos] == '\\') {
434       return true;
435     }
436     else
437       pos++;
438   }
439   if (test (s, pos, delim))
440     pos+= N(delim);
441   return false;
442 }
443 
444 static string
parse_keywords(hashmap<string,string> & t,string s,int & pos)445 parse_keywords (hashmap<string,string>& t, string s, int& pos) {
446   int i= pos;
447   if (pos>=N(s)) return "";
448   if (is_number (s[i])) return "";
449   while ((i<N(s)) && belongs_to_identifier (s[i])) i++;
450   string r= s (pos, i);
451   if (t->contains (r)) {
452     string tr= t(r);
453     if (tr == "keyword_conditional" ||
454         tr == "keyword_control"      ||
455         tr == "keyword"              ||
456         tr == "declare_type"         ||
457         tr == "declare_function"     ||
458         tr == "constant") {
459       pos=i;
460       return tr;
461     }
462   }
463   return "";
464 }
465 
466 static string
parse_operators(hashmap<string,string> & t,string s,int & pos)467 parse_operators (hashmap<string,string>& t, string s, int& pos) {
468   int i;
469   for (i=12; i>=1; i--) {
470     string r=s(pos,pos+i);
471     if (t->contains (r)) {
472       string tr= t(r);
473       if (tr == "operator"          ||
474           tr == "operator_field"    ||
475           tr == "operator_special"  ||
476           tr == "operator_openclose") {
477         pos=pos+i;
478         return tr;
479       }
480       else if (t(r) == "operator_decoration") {
481         pos=pos+i;
482         while ((pos<N(s)) && belongs_to_identifier (s[pos])) pos++;
483         return "operator_special";
484       }
485     }
486   }
487   return "";
488 }
489 
490 static void
parse_various_number(string s,int & pos)491 parse_various_number (string s, int& pos) {
492   if (!(pos+2 < N(s) && s[pos] == '0' &&
493        (s[pos+1] == 'x' || s[pos+1] == 'X' ||
494         s[pos+1] == 'o' || s[pos+1] == 'O' ||
495         s[pos+1] == 'b' || s[pos+1] == 'B')))
496     return;
497   pos+= 2;
498   while (pos<N(s) && is_hex_number (s[pos])) pos++;
499   if (pos<N(s) && (s[pos] == 'l' || s[pos] == 'L')) pos++;
500 }
501 
502 static void
parse_number(string s,int & pos)503 parse_number (string s, int& pos) {
504   int i= pos;
505   if (pos>=N(s)) return;
506   while (i<N(s) && (is_number (s[i]) || s[i] == '.'))
507     i++;
508   if (i == pos) return;
509   if (i<N(s) && (s[i] == 'e' || s[i] == 'E')) {
510     i++;
511     if (i<N(s) && s[i] == '-') i++;
512     while (i<N(s) && (is_number (s[i]) || s[i] == '.')) i++;
513     if (i<N(s) && (s[i] == 'j')) i++;
514   }
515   else if (i<N(s) && (s[i] == 'l' || s[i] == 'L')) i++;
516   else if (i<N(s) && (s[i] == 'j')) i++;
517   pos= i;
518 }
519 
520 static void
parse_comment_single_line(string s,int & pos)521 parse_comment_single_line (string s, int& pos) {
522   if (pos>=N(s)) return;
523   if (s[pos]!='#') return;
524   pos=N(s);
525 }
526 
527 string
get_color(tree t,int start,int end)528 python_language_rep::get_color (tree t, int start, int end) {
529   static bool setup_done= false;
530   if (!setup_done) {
531     /*
532      * NOTE: it seems there is no way to take into account multiline
533      * dependencies. Then such weird syntax like
534      *
535      * str= """some string beginning ...
536      * some string end"""
537      *
538      * will not be correctly typeset.
539      *
540      */
541 
542     python_color_setup_constants (colored);
543     python_color_setup_constant_exceptions (colored);
544     python_color_setup_declare_class (colored);
545     python_color_setup_declare_function (colored);
546     python_color_setup_keywords (colored);
547     python_color_setup_keywords_conditional (colored);
548     python_color_setup_keywords_control (colored);
549     python_color_setup_operator (colored);
550     python_color_setup_operator_special (colored);
551     python_color_setup_operator_decoration (colored);
552     python_color_setup_operator_openclose (colored);
553     python_color_setup_operator_field (colored);
554     setup_done= true;
555   }
556 
557   static string none= "";
558   if (start >= end) return none;
559   string s= t->label;
560   int pos= 0;
561   int opos=0;
562   string type;
563   bool in_str= false;
564   bool in_esc= false;
565   do {
566     type= none;
567     do {
568       opos= pos;
569       if (in_str) {
570         in_esc= parse_string (s, pos, true);
571         in_str= false;
572         if (opos < pos) {
573           type= "constant_string";
574           break;
575         }
576       }
577       else if (in_esc) {
578         parse_escaped_char (s, pos);
579         in_esc= false;
580         in_str= true;
581         if (opos < pos) {
582           type= "constant_char";
583           break;
584         }
585       }
586       else {
587         parse_blanks (s, pos);
588         if (opos < pos){
589           break;
590         }
591         parse_comment_single_line (s, pos);
592         if (opos < pos) {
593           type= "comment";
594           break;
595         }
596         in_esc= parse_string (s, pos, false);
597         if (opos < pos) {
598           type= "constant_string";
599           break;
600         }
601         type= parse_keywords (colored, s, pos);
602         if (opos < pos) {
603           break;
604         }
605         parse_various_number (s, pos);
606         if (opos < pos) {
607           type= "constant_number";
608           break;
609         }
610         parse_number (s, pos);
611         if (opos < pos) {
612           type= "constant_number";
613           break;
614         }
615         type= parse_operators (colored, s, pos);
616         if (opos < pos) {
617           break;
618         }
619         parse_identifier (colored, s, pos);
620         if (opos < pos) {
621           type= none;
622           break;
623         }
624       }
625       pos= opos;
626       pos++;
627     }
628     while (false);
629   }
630   while (pos <= start);
631   if (type == none) return none;
632   return decode_color ("python", encode_color (type));
633 }
634