1 /*************************************************************************/
2 /* Copyright (c) 2005 Sampo Pyysalo */
3 /* */
4 /* Use of the link grammar parsing system is subject to the terms of the */
5 /* license set forth in the LICENSE file included with this software. */
6 /* This license allows free redistribution and use in source and binary */
7 /* forms, with or without modification, subject to certain conditions. */
8 /* */
9 /*************************************************************************/
10
11 #include <string.h>
12 #include "link-includes.h"
13 #include "dict-common/dict-common.h"
14 #include "dict-common/file-utils.h"
15 #include "read-regex.h"
16
17 /*
18 Function for reading regular expression name:pattern combinations
19 into the Dictionary from a given file.
20
21 The format of the regex file is as follows:
22
23 Lines starting with "%" are comments and are ignored.
24 All other nonempty lines must follow the following format:
25
26 REGEX_NAME: /pattern/
27
  Here REGEX_NAME is a unique name identifying the regex.
  This name is used to determine the disjuncts that will be assigned to
  tokens matching the pattern, so in the dictionary file (e.g. 4.0.dict)
  you must have something like

  REGEX_NAME: ({@MX+} & (JG- or <noun-main-s>)) or YS+ or AN+ or G+;

  using the same name. The pattern itself must be surrounded by slashes.
  Extra whitespace is ignored.
37 */
38
/* Sizes of the fixed buffers that read_regex_file() uses for the regex
 * name and for the pattern text. Each size includes the terminating NUL. */
enum
{
	MAX_REGEX_NAME_LENGTH = 50,
	MAX_REGEX_LENGTH = 255
};
41
/**
 * Read regex definitions from the file `file_name` (opened with
 * dictopen()) and build a linked list of Regex_node entries, in file
 * order, starting at dict->regex_root. The first entry read is stored
 * directly into dict->regex_root, replacing whatever value it held.
 *
 * Each entry has the form
 *
 *     NAME: /pattern/        or        NAME: !/pattern/
 *
 * Where an entry is expected, '%' starts a comment that runs to the end
 * of the line. A '!' before the opening slash sets the `neg` field of
 * the resulting node. Inside the pattern, "\/" is stored as a plain "/"
 * and does not terminate the pattern. Only the pattern text is stored;
 * the `re` field of every node is left NULL.
 *
 * Names may be at most MAX_REGEX_NAME_LENGTH-1 characters and patterns
 * at most MAX_REGEX_LENGTH-1 characters long.
 *
 * @param dict       Dictionary whose regex list is filled in.
 * @param file_name  Name of the regex file.
 * @return 0 on success; 1 if the file cannot be opened, an entry is
 *         malformed or too long, or memory allocation fails. Nodes
 *         linked in before a failure are left in the list.
 */
int read_regex_file(Dictionary dict, const char *file_name)
{
	Regex_node **tail = &dict->regex_root; /* Last Regex_node * in list */
	Regex_node *new_re;
	char *name_copy;
	char *pattern_copy;
	char name[MAX_REGEX_NAME_LENGTH];
	char regex[MAX_REGEX_LENGTH];
	int c, prev, i, line = 1;
	FILE *fp;

	fp = dictopen(file_name, "r");
	if (fp == NULL)
	{
		prt_error("Error: cannot open regex file %s\n", file_name);
		return 1;
	}

	/* Read in regexs. Loop broken on EOF. */
	while (1)
	{
		bool neg = false;

		/* Skip whitespace and comments. */
		do
		{
			do
			{
				c = fgetc(fp);
				if (c == '\n') { line++; }
			}
			while (lg_isspace(c));

			if (c == '%')
			{
				/* Discard the rest of the comment line. */
				while ((c != EOF) && (c != '\n')) { c = fgetc(fp); }
				line++;
			}
		}
		while (lg_isspace(c));

		if (c == EOF) { break; } /* done. */

		/* Read in the name of the regex. One byte of name[] is kept
		 * in reserve for the terminating NUL. */
		i = 0;
		do
		{
			if (i >= MAX_REGEX_NAME_LENGTH-1)
			{
				prt_error("Error: Regex name too long on line %d\n", line);
				goto failure;
			}
			name[i++] = c;
			c = fgetc(fp);
		}
		while ((!lg_isspace(c)) && (c != ':') && (c != EOF));
		name[i] = '\0';

		/* Skip possible whitespace after name, expect colon. */
		while (lg_isspace(c))
		{
			if (c == '\n') { line++; }
			c = fgetc(fp);
		}
		if (c != ':')
		{
			prt_error("Error: Regex missing colon on line %d\n", line);
			goto failure;
		}

		/* Skip whitespace after colon, expect slash. */
		do
		{
			if (c == '\n') { line++; }
			c = fgetc(fp);
		}
		while (lg_isspace(c));

		/* An optional '!' before the slash marks the entry as negated. */
		if (c == '!')
		{
			neg = true;
			do
			{
				if (c == '\n') { line++; }
				c = fgetc(fp);
			}
			while (lg_isspace(c));
		}
		if (c != '/')
		{
			prt_error("Error: Regex missing leading slash on line %d\n", line);
			goto failure;
		}

		/* Read in the regex. The terminating slash (or EOF) is stored
		 * as well and is overwritten by the NUL after the loop, so the
		 * length check keeps every write inside regex[]. */
		i = 0;
		do
		{
			if (i > MAX_REGEX_LENGTH-1)
			{
				prt_error("Error: Regex too long on line %d\n", line);
				goto failure;
			}
			prev = c;
			c = fgetc(fp);
			if ((c == '/') && (prev == '\\'))
				regex[i-1] = '/'; /* \/ is undefined; store it as a plain / */
			else
				regex[i++] = c;
		}
		while ((c != '/' || prev == '\\') && (c != EOF));
		regex[i-1] = '\0';

		/* Expect termination by a slash. */
		if (c != '/')
		{
			prt_error("Error: Regex missing trailing slash on line %d\n", line);
			goto failure;
		}

		/* Create new Regex_node. Nothing is linked into the list until
		 * every allocation has succeeded, so the list never holds a
		 * partially built node. */
		new_re = (Regex_node *) malloc(sizeof(Regex_node));
		name_copy = strdup(name);
		pattern_copy = strdup(regex);
		if ((new_re == NULL) || (name_copy == NULL) || (pattern_copy == NULL))
		{
			free(pattern_copy);
			free(name_copy);
			free(new_re);
			prt_error("Error: Out of memory while reading regex on line %d\n", line);
			goto failure;
		}

		new_re->name = name_copy;
		new_re->pattern = pattern_copy;
		new_re->neg = neg;
		new_re->re = NULL;
		new_re->next = NULL;

		/* Add to dict list. */
		*tail = new_re;
		tail = &new_re->next;
	}

	fclose(fp);
	return 0;

failure:
	fclose(fp);
	return 1;
}
176
177