1;;; make-regexp.el --- generate efficient regexps to match strings. 2 3;; Copyright (C) 1994, 1995 Simon Marshall. 4 5;; Author: Simon Marshall <simon@gnu.ai.mit.edu> 6;; Keywords: lisp, matching 7;; Version: 1.02 8 9;; LCD Archive Entry: 10;; make-regexp|Simon Marshall|simon@gnu.ai.mit.edu| 11;; Generate efficient regexps to match strings.| 12;; 11-Jul-1995|1.02|~/functions/make-regexp.el.gz| 13 14;; The archive is archive.cis.ohio-state.edu in /pub/gnu/emacs/elisp-archive. 15 16;;; This file is not part of GNU Emacs. 17 18;;; This program is free software; you can redistribute it and/or modify 19;;; it under the terms of the GNU General Public License as published by 20;;; the Free Software Foundation; either version 2, or (at your option) 21;;; any later version. 22 23;;; This program is distributed in the hope that it will be useful, 24;;; but WITHOUT ANY WARRANTY; without even the implied warranty of 25;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 26;;; GNU General Public License for more details. 27 28;; A copy of the GNU General Public License is available at 29;; https://www.r-project.org/Licenses/ 30 31;;; Commentary: 32 33;; Purpose: 34;; 35;; To make efficient regexps from lists of strings. 36 37;; For example: 38;; 39;; (let ((strings '("cond" "if" "while" "let\\*?" "prog1" "prog2" "progn" 40;; "catch" "throw" "save-restriction" "save-excursion" 41;; "save-window-excursion" "save-match-data" 42;; "unwind-protect" "condition-case" "track-mouse"))) 43;; (concat "(" (make-regexp strings t))) 44;; 45;; => "(\\(c\\(atch\\|ond\\(\\|ition-case\\)\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(excursion\\|match-data\\|restriction\\|window-excursion\\)\\|t\\(hrow\\|rack-mouse\\)\\|unwind-protect\\|while\\)" 46;; 47;; To search for the above regexp takes about 70% of the time as for the simple 48;; (concat "(\\(" (mapconcat 'identity strings "\\|") "\\)") regexp. 49;; 50;; Obviously, the more the similarity between strings, the faster the regexp: 51;; 52;; (make-regexp '("abort" "abs" "accept" "access" "array" "begin" "body" "case" 53;; "constant" "declare" "delay" "delta" "digits" "else" "elsif" 54;; "entry" "exception" "exit" "function" "generic" "goto" "if" 55;; "others" "limited" "loop" "mod" "new" "null" "out" "subtype" 56;; "package" "pragma" "private" "procedure" "raise" "range" 57;; "record" "rem" "renames" "return" "reverse" "select" 58;; "separate" "task" "terminate" "then" "type" "when" "while" 59;; "with" "xor")) 60;; 61;; => "a\\(b\\(ort\\|s\\)\\|cce\\(pt\\|ss\\)\\|rray\\)\\|b\\(egin\\|ody\\)\\|c\\(ase\\|onstant\\)\\|d\\(e\\(clare\\|l\\(ay\\|ta\\)\\)\\|igits\\)\\|e\\(ls\\(e\\|if\\)\\|ntry\\|x\\(ception\\|it\\)\\)\\|function\\|g\\(eneric\\|oto\\)\\|if\\|l\\(imited\\|oop\\)\\|mod\\|n\\(ew\\|ull\\)\\|o\\(thers\\|ut\\)\\|p\\(ackage\\|r\\(agma\\|ivate\\|ocedure\\)\\)\\|r\\(a\\(ise\\|nge\\)\\|e\\(cord\\|m\\|names\\|turn\\|verse\\)\\)\\|s\\(e\\(lect\\|parate\\)\\|ubtype\\)\\|t\\(ask\\|erminate\\|hen\\|ype\\)\\|w\\(h\\(en\\|ile\\)\\|ith\\)\\|xor" 62;; 63;; To search for the above regexp takes less than 60% of the time of the simple 64;; mapconcat equivalent. 65;; 66;; But even small regexps may be worth it: 67;; 68;; (make-regexp '("and" "at" "do" "end" "for" "in" "is" "not" "of" "or" "use")) 69;; => "a\\(nd\\|t\\)\\|do\\|end\\|for\\|i[ns]\\|not\\|o[fr]\\|use" 70;; 71;; as this is 10% faster than the mapconcat equivalent. 72 73;; Installation: 74;; 75;; (autoload 'make-regexp "make-regexp" 76;; "Return a regexp to match a string item in STRINGS.") 77;; 78;; (autoload 'make-regexps "make-regexp" 79;; "Return a regexp to REGEXPS.") 80;; 81;; Since these functions were written to produce efficient regexps, not regexps 82;; efficiently, it is probably not a good idea to in-line too many calls in 83;; your code, unless you use the following neat trick with `eval-when-compile': 84;; 85;; (defvar definition-regexp 86;; (let ((regexp (eval-when-compile 87;; (make-regexp '("defun" "defsubst" "defmacro" "defalias" 88;; "defvar" "defconst" "defadvice") t)))) 89;; (concat "^(" regexp))) 90;; 91;; The `byte-compile' code will be as if you had defined the variable thus: 92;; 93;; (defvar definition-regexp 94;; "^(\\(def\\(a\\(dvice\\|lias\\)\\|const\\|macro\\|subst\\|un\\|var\\)\\)") 95 96;; Feedback: 97;; 98;; Originally written for font-lock, from an idea from Stig's hl319. 99;; Please don't tell me that it doesn't produce optimal regexps; I know that 100;; already. But (ideas or) code to improve things (are) is welcome. Please 101;; test your code and tell me the speed up in searching an appropriate buffer. 102;; 103;; Please send me bug reports, bug fixes, and extensions, etc. 104;; Simon Marshall <simon@gnu.ai.mit.edu> 105 106;; History: 107;; 108;; 1.00--1.01: 109;; - Made `make-regexp' take `lax' to force top-level parentheses. 110;; - Fixed `make-regexps' for MATCH bug and new `font-lock-keywords'. 111;; - Added `unfontify' to user timing functions. 112;; 1.01--1.02: 113;; - Made `make-regexp' `let' a big `max-lisp-eval-depth'. 114 115;; The basic idea is to find the shortest common non-"" prefix each time, and 116;; squirrel it out. If there is no such prefix, we divide the list into two so 117;; that (at least) one half will have at least a one-character common prefix. 118 119;; In addition, we (a) delay the addition of () parenthesis as long as possible 120;; (until we're sure we need them), and (b) try to squirrel out one-character 121;; sequences (so we can use [] rather than ()). 122 123;;; Code: 124 125(defun make-regexp (strings &optional paren lax) 126 "Return a regexp to match a string item in STRINGS. 127If optional PAREN non-nil, output regexp parentheses around returned regexp. 128If optional LAX non-nil, don't output parentheses if it doesn't require them. 129Merges keywords to avoid backtracking in Emacs' regexp matcher." 130 (let* ((max-lisp-eval-depth (* 1024 1024)) 131 (strings (let ((l strings)) ; Paranoia---make strings unique! 132 (while l (setq l (setcdr l (delete (car l) (cdr l))))) 133 (sort strings 'string-lessp))) 134 (open-paren (if paren "\\(" "")) (close-paren (if paren "\\)" "")) 135 (open-lax (if lax "" open-paren)) (close-lax (if lax "" close-paren)) 136 (completion-ignore-case nil)) 137 (cond 138 ;; If there's only one string, just return it. 139 ((= (length strings) 1) 140 (concat open-lax (car strings) close-lax)) 141 ;; If there's an empty string, pull it out. 142 ((string= (car strings) "") 143 (if (and (= (length strings) 2) (= (length (nth 1 strings)) 1)) 144 (concat open-lax (nth 1 strings) "?" close-lax) 145 (concat open-paren "\\|" (make-regexp (cdr strings)) close-paren))) 146 ;; If there are only one-character strings, make a [] list instead. 147 ((= (length strings) (apply '+ (mapcar 'length strings))) 148 (concat open-lax "[" (mapconcat 'identity strings "") "]" close-lax)) 149 (t 150 ;; We have a list of strings. Is there a common prefix? 151 (let ((prefix (try-completion "" (mapcar 'list strings)))) 152 (if (> (length prefix) 0) 153 ;; Common prefix! Squirrel it out and recurse with the suffixes. 154 (let* ((len (length prefix)) 155 (sufs (mapcar (lambda (str) (substring str len)) strings))) 156 (concat open-paren prefix (make-regexp sufs t t) close-paren)) 157 ;; No common prefix. Is there a one-character sequence? 158 (let ((letters (let ((completion-regexp-list '("^.$"))) 159 (all-completions "" (mapcar 'list strings))))) 160 (if (> (length letters) 1) 161 ;; Do the one-character sequences, then recurse on the rest. 162 (let ((rest (let ((completion-regexp-list '("^..+$"))) 163 (all-completions "" (mapcar 'list strings))))) 164 (concat open-paren 165 (make-regexp letters) "\\|" (make-regexp rest) 166 close-paren)) 167 ;; No one-character sequence, so divide the list into two by 168 ;; dividing into those that start with a particular letter, and 169 ;; those that do not. 170 (let* ((char (substring (car strings) 0 1)) 171 (half1 (all-completions char (mapcar 'list strings))) 172 (half2 (nthcdr (length half1) strings))) 173 (concat open-paren 174 (make-regexp half1) "\\|" (make-regexp half2) 175 close-paren)))))))))) 176 177;; This stuff is realy for font-lock... 178 179;; Ahhh, the wonders of lisp... 180(defun regexp-span (regexp &optional start) 181 "Return the span or depth of REGEXP. 182This means the number of \"\\\\(...\\\\)\" pairs in REGEXP, optionally from START." 183 (let ((match (string-match (regexp-quote "\\(") regexp (or start 0)))) 184 (if (not match) 0 (1+ (regexp-span regexp (match-end 0)))))) 185 186;; The basic idea is to concat the regexps together, keeping count of the span 187;; of the regexps so that we can get the correct match for hilighting. 188(defun make-regexps (&rest regexps) 189 "Return a regexp to match REGEXPS 190Each item of REGEXPS should be of the form: 191 192 STRING ; A STRING to be used literally. 193 (STRING MATCH FACE DATA) ; Match STRING at depth MATCH with FACE 194 ; and highlight according to DATA. 195 (STRINGS FACE DATA) ; STRINGS is a list of strings FACE is 196 ; to highlight according to DATA. 197 198Returns a list of the form: 199 200 (REGEXP (MATCH FACE DATA) ...) 201 202For example: 203 204 (make-regexps \"^(\" 205 '((\"defun\" \"defalias\" \"defsubst\" \"defadvice\") keyword) 206 \"[ \t]*\" 207 '(\"\\\\([a-zA-Z-]+\\\\)?\" 1 function-name nil t)) 208 209 => 210 211 (\"^(\\\\(def\\\\(a\\\\(dvice\\\\|lias\\\\)\\\\|subst\\\\|un\\\\)\\\\)[ ]*\\\\([a-zA-Z-]+\\\\)?\" 212 (1 keyword) (4 function-name nil t)) 213 214Uses `make-regexp' to make efficient regexps." 215 (let ((regexp "") (data ())) 216 (while regexps 217 (cond ((stringp (car regexps)) 218 (setq regexp (concat regexp (car regexps)))) 219 ((stringp (nth 0 (car regexps))) 220 (setq data (cons (cons (+ (regexp-span regexp) 221 (nth 1 (car regexps))) 222 (nthcdr 2 (car regexps))) 223 data) 224 regexp (concat regexp (nth 0 (car regexps))))) 225 (t 226 (setq data (cons (cons (1+ (regexp-span regexp)) 227 (cdr (car regexps))) 228 data) 229 regexp (concat regexp (make-regexp (nth 0 (car regexps)) 230 t))))) 231 (setq regexps (cdr regexps))) 232 (cons regexp (nreverse data)))) 233 234;; timing functions removed due to name collisions with Gnus 235 236(provide 'make-regexp) 237 238;;; make-regexp.el ends here 239