1"======================================================================
2|
3|   Regular expressions
4|
5|
6 ======================================================================"
7
8
9"======================================================================
10|
11| Copyright 1999 Free Software Foundation, Inc.
12| Written by Paolo Bonzini.
13|
14| This file is part of GNU Smalltalk.
15|
16| GNU Smalltalk is free software; you can redistribute it and/or modify it
17| under the terms of the GNU General Public License as published by the Free
18| Software Foundation; either version 2, or (at your option) any later version.
19|
20| GNU Smalltalk is distributed in the hope that it will be useful, but WITHOUT
21| ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
22| FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
23| details.
24|
25| You should have received a copy of the GNU General Public License along with
26| GNU Smalltalk; see the file COPYING.  If not, write to the Free Software
27| Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
28|
29 ======================================================================"
30
31Object subclass: #RegularExpression
32    instanceVariableNames: 'selectors params noDollar string'
33    classVariableNames: ''
34    poolDictionaries: ''
35    category: 'Examples-Useful'!
36
37!RegularExpression class methodsFor: 'instance creation'!
38
39new
40    self shouldNotImplement
41!
42
43fromString: aString
44    ^super new parseString: aString
45! !
46
47!RegularExpression class methodsFor: 'parsing'!
48
49match: aString to: regExp
50    ^(self fromString: regExp) match: aString
51! !
52
53!RegularExpression methodsFor: 'parsing'!
54
55match: aString
56
57    ^self matchStream: (ReadStream on: aString) from: 1
58!
59
60matchStream: aStream
61
62    ^self matchStream: aStream from: 1
63! !
64
65!RegularExpression methodsFor: 'private'!
66
67parseString: aString
68    "Private - Convert a regular expression to its internal representation"
69
70    | regexp endPos |
71
72    params := OrderedCollection new.
73    selectors := OrderedCollection new.
74
75    "Zero-length aString is a special case"
76    aString size = 0
77	ifTrue: [
78	    noDollar := true.
79	    self addDotAsterisk.
80	    ^self
81	].
82
83    regexp := ReadStream on: aString.
84    noDollar := aString last ~= $$.
85
86    (regexp peekFor: $^)
87	ifFalse: [ self addDotAsterisk ]
88
89    endPos := noDollar
90	ifTrue: [ aString size ]
91	ifFalse: [ aString size - 1 ].
92
93    [ regexp position > endPos ] whileFalse: [ self parseAtom: regexp ]
94!
95
96addDotAsterisk
97
98    "Add an implicit .* sequence"
99    params addLast: nil.
100    selectors addLast: #wild0any:index:
101!
102
103parseAtom: regexp
104    "Private - Parse an 'atom' of the regular expression. Add to selectors the
105     selector to be called to match it, and add to params the first parameter that
106     will be passed to this selector"
107
108    | next |
109    (next := regexp next) = $\
110	ifTrue: [
111	    params addLast: regexp next.
112	    ^selectors addLast: #char:index:
113	].
114
115    (next = $+) & selectors notEmpty
116	ifTrue: [
117	    ^selectors
118		at: selectors size
119		put: ('wild1', selectors last) asSymbol
120	].
121
122    (next = $*) & selectors notEmpty
123	ifTrue: [
124	    ^selectors
125		at: selectors size
126		put: ('wild0', selectors last) asSymbol
127	].
128
129    next = $.
130	ifTrue: [
131	    params addLast: nil.
132	    ^selectors addLast: #any:index:
133	].
134
135    next = $[
136	ifTrue: [
137	    (regexp peekFor: $^)
138		ifTrue: [ selectors addLast: #notRange:index: ]
139		ifFalse: [ selectors addLast: #range:index: ].
140
141	    params addLast: (self parseRange: regexp).
142	    ^selectors last
143	].
144
145    next = ${
146	ifTrue: [
147	    params addLast: (self parseOptional: regexp).
148	    ^selectors addLast: #optional:index:
149	].
150
151    params addLast: next.
152    ^selectors addLast: #char:index:
153!
154
155parseRange: regexp
156
157    "Private - Parse a 'range atom', that is an atom that can match to seve-
158     ral characters."
159
160    | next answerStream |
161    answerStream := WriteStream on: (String new: 8).  "Number out of a hat"
162
163    [   (next := regexp next) = $] ] whileFalse: [
164	answerStream nextPut: next.
165	regexp atEnd ifTrue: [ self errorBadRegexp ].
166
167	(regexp peekFor: $-) ifTrue: [
168	    regexp atEnd ifTrue: [ self errorBadRegexp ].
169
170	    next asciiValue + 1 to: regexp next asciiValue do: [:i |
171		answerStream nextPut: i asCharacter
172	    ]
173	]
174    ].
175    ^answerStream contents!
176
177parseOptional: regexp
178
179    "Private - Parse an 'optional atom', that is an atom that can match to
180     several regular expressions."
181
182    | pos next result |
183    pos := regexp position.
184    result := OrderedCollection new.
185    [
186	(next := regexp next) = $\
187	    ifTrue: [regexp next]
188	    ifFalse: [
189		next = $| ifTrue: [
190		    result addLast: (self class fromString:
191			'^', (regexp copyFrom: pos to: regexp position - 2)).
192		    pos := regexp position
193		].
194		next = $} ifTrue: [
195		    result addLast: (self class fromString:
196			'^', (regexp copyFrom: pos to: regexp position - 2)).
197		    ^result
198		]
199	    ].
200	regexp atEnd
201    ]   whileFalse: [ ].
202
203    "If we come here, we have found no } : bad regular expression"
204    self errorBadRegexp
205!
206
207errorBadRegexp
208
209    "Seems like we had some problems parsing the regular expression"
210    self error: 'Bad regular expression'
211!
212
213char: aCharacter index: dummy
214
215    "Private - Check if the next character matchs to aCharacter"
216
217    ^string atEnd
218	ifTrue: [ false ]
219	ifFalse: [ aCharacter = string next ]
220!
221
222any: dummy index: dummy2
223
224    "Private - If we aren't at the end of the stream, skip a character, else
225     answer false"
226
227    ^string atEnd
228	ifTrue: [ false ]
229	ifFalse: [ string next. true ]
230!
231
232range: aString index: dummy
233
234    "Private - Check if the next character is included in aString"
235
236    ^string atEnd
237	ifTrue: [ false ]
238	ifFalse: [ aString includes: string next ]
239!
240
241notRange: aString index: dummy
242
243    "Private - Check if the next character is not included in aString"
244
245    ^string atEnd
246	ifTrue: [ false]
247	ifFalse: [ (aString includes: string next) not ]
248!
249
250optional: listOfRegexp index: dummy
251
252    "Private - Check if the next characters match to any of the RegularExpression
253     objects in listOfRegexp"
254
255    | pos |
256
257    string atEnd ifTrue: [^false].
258
259    pos := string position.
260    listOfRegexp do: [ :re |
261	(re matchStream: string from: 1) ifTrue: [^true].
262	string position: pos.
263    ].
264    ^false
265!
266
267wild0any: atLeast1 index: index
268
269    "Private - Match a .* sequence"
270
271    ^self matchWild: #any:index: with: nil following:
272	[ self matchStream: string from: index + 1 ]
273!
274
275wild1any: atLeast1 index: index
276
277    "Private - Match a .+ sequence"
278
279    (self any: nil index: index) ifFalse: [^false].
280
281    ^self matchWild: #any:index: with: nil following:
282	[ self matchStream: string from: index + 1 ]
283!
284
285wild0range: aString index: index
286
287    "Private - Match a [...]* sequence"
288
289    ^self matchWild: #range:index: with: aString following:
290	[ self matchStream: string from: index + 1 ]
291!
292
293wild1range: aString index: index
294
295    "Private - Match a [...]+ sequence"
296
297    (self range: aString index: index) ifFalse: [^false].
298    ^self matchWild: #range:index: with: aString following:
299	[ self matchStream: string from: index + 1 ]
300!
301
302wild0notRange: aString index: index
303
304    "Private - Match a [...]* sequence"
305
306    ^self matchWild: #notRange:index: with: aString following:
307	[ self matchStream: string from: index + 1 ]
308!
309
310wild1notRange: aString index: index
311
312    "Private - Match a [...]+ sequence"
313
314    (self notRange: aString index: index) ifFalse: [^false].
315    ^self matchWild: #notRange:index: with: aString following:
316	[ self matchStream: string from: index + 1 ]
317!
318
319wild0char: aCharacter index: index
320
321    "Private - Match a x* sequence"
322
323    ^self matchWild: #char:index: with: aCharacter following:
324	[ self matchStream: string from: index + 1 ]
325!
326
327wild1char: aCharacter index: index
328
329    "Private - Match a x+ sequence"
330
331    (self char: aCharacter index: index) ifFalse: [^false].
332    ^self matchWild: #char:index: with: aCharacter following:
333	[ self matchStream: string from: index + 1 ]
334!
335
336matchWild: aSymbol with: arg following: aBlock
337
338    "Private - Helper function for * sequences (+ sequences are parsed by
339     checking for a match and then treating them as * sequences: for example,
340     x+ becomes xx*). Try to match as many characters as possible and then
341     look if the remaining part of the string matches the rest of the regular
342     expression (to do so, aBlock is evaluated): if yes, answer nil; if no,
343     try again with one character less. For example, matching [ABC]*AC to the
344     string BAC works in this way:
345	  - try with the longest run of As, Bs or Cs (BAC). The rest of the
346	    string (that is, nothing) doesn't match the regular expression AC, so...
347	  - ...try with BA. The rest of the string (that is, C) doesn't match
348	    the regular expression AC, so...
349	  - ...try with B. The rest of the string (that is, AC) does match the
350	    regular expression AC, so we answer nil"
351
352    | first last |
353    first := string position.
354    last := self findLastWild: aSymbol with: arg.
355
356    last to: first by: -1 do: [ :i |
357	(aBlock value == false)
358	    ifFalse: [ ^nil ].
359
360	i > 1 ifTrue: [self position: i - 1].
361    ].
362    ^false
363!
364
365findLastWild: aSymbol with: arg
366    "Send aSymbol with arg and nil as its parameter until it answers false and
367     answer the position of the last character for which aSymbol answered true"
368
369    [   string atEnd ifTrue: [ ^string position ].
370	self perform: aSymbol with: arg with: nil ] whileTrue: [ ].
371
372    string skip: -1.
373    ^string position
374!
375
376matchStream: aStream from: firstIndex
377
378    "Private - Match all the atoms from the firstIndex-th to the string on
379     which aStream is streaming. Answer true or false"
380
381    | result oldString |
382
383    oldString := string.
384    self string: aStream.
385
386    firstIndex to: self numberOfAtoms do: [ :i |
387	" result = true ---> go on; result = false ---> answer false;
388	  result = nil ---> answer true"
389	result := self matchAtom: i.
390	result == true ifFalse: [ string := oldString. ^result isNil ].
391    ].
392    result := self checkIfAtEnd.
393    string := oldString.
394    ^result
395!
396
397string: aStream
398
399    "Private - Tell the other methods which string is being parsed"
400
401    string := aStream
402!
403
404checkIfAtEnd
405
406    "Private - Answer true if there is no $ or if we're at the end of the
407     parsed string"
408
409    ^noDollar or: [string atEnd]
410!
411
412numberOfAtoms
413
414    "Private - Answer the number of atoms in the receiver"
415
416    ^selectors size
417!
418
419matchAtom: index
420
421    "Private - Try to match an atom to string"
422
423    | result |
424
425    "index print. $  print.
426     (selectors at: index) print. $  print.
427     (params at: index) print. $  print.
428     string peek print. $  printNl."
429
430    ^self perform: (selectors at: index) with: (params at: index) with: index
431! !
432