1 /*************************************************************************/
2 /* Copyright (c) 2005 Sampo Pyysalo                                      */
3 /*                                                                       */
4 /* Use of the link grammar parsing system is subject to the terms of the */
5 /* license set forth in the LICENSE file included with this software.    */
6 /* This license allows free redistribution and use in source and binary  */
7 /* forms, with or without modification, subject to certain conditions.   */
8 /*                                                                       */
9 /*************************************************************************/
10 
11 #include <string.h>
12 #include "link-includes.h"
13 #include "dict-common/dict-common.h"
14 #include "dict-common/file-utils.h"
15 #include "read-regex.h"
16 
17 /*
18   Function for reading regular expression name:pattern combinations
19   into the Dictionary from a given file.
20 
21   The format of the regex file is as follows:
22 
23   Lines starting with "%" are comments and are ignored.
24   All other nonempty lines must follow the following format:
25 
26       REGEX_NAME:  /pattern/
27 
  Here REGEX_NAME is an identifying unique name for the regex.
29   This name is used to determine the disjuncts that will be assigned to
30   tokens matching the pattern, so in the dictionary file (e.g. 4.0.dict)
31   you must have something like
32 
     REGEX_NAME:  (({@MX+} & (JG- or <noun-main-s>)) or YS+) or AN+ or G+;
34 
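  using the same name. The pattern itself must be surrounded by slashes;
  a slash inside the pattern is written as "\/". An optional "!" placed
  before the leading slash sets the negation flag of that regex entry.
  Extra whitespace is ignored.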
37 */
38 
39 #define MAX_REGEX_NAME_LENGTH 50
40 #define MAX_REGEX_LENGTH      255
41 
read_regex_file(Dictionary dict,const char * file_name)42 int read_regex_file(Dictionary dict, const char *file_name)
43 {
44 	Regex_node **tail = &dict->regex_root; /* Last Regex_node * in list */
45 	Regex_node *new_re;
46 	char name[MAX_REGEX_NAME_LENGTH];
47 	char regex[MAX_REGEX_LENGTH];
48 	int c,prev,i,line=1;
49 	FILE *fp;
50 
51 	fp = dictopen(file_name, "r");
52 	if (fp == NULL)
53 	{
54 		prt_error("Error: cannot open regex file %s\n", file_name);
55 		return 1;
56 	}
57 
58 	/* read in regexs. loop broken on EOF. */
59 	while (1)
60 	{
61 		bool neg = false;
62 
63 		/* skip whitespace and comments. */
64 		do
65 		{
66 			do
67 			{
68 				c = fgetc(fp);
69 				if (c == '\n') { line++; }
70 			}
71 			while (lg_isspace(c));
72 
73 			if (c == '%')
74 			{
75 				while ((c != EOF) && (c != '\n')) { c = fgetc(fp); }
76 				line++;
77 			}
78 		}
79 		while (lg_isspace(c));
80 
81 		if (c == EOF) { break; } /* done. */
82 
83 		/* read in the name of the regex. */
84 		i = 0;
85 		do
86 		{
87 			if (i >= MAX_REGEX_NAME_LENGTH-1)
88 			{
89 				prt_error("Error: Regex name too long on line %d\n", line);
90 				goto failure;
91 			}
92 			name[i++] = c;
93 			c = fgetc(fp);
94 		}
95 		while ((!lg_isspace(c)) && (c != ':') && (c != EOF));
96 		name[i] = '\0';
97 
98 		/* Skip possible whitespace after name, expect colon. */
99 		while (lg_isspace(c))
100 		{
101 			if (c == '\n') { line++; }
102 			c = fgetc(fp);
103 		}
104 		if (c != ':')
105 		{
106 			prt_error("Error: Regex missing colon on line %d\n", line);
107 			goto failure;
108 		}
109 
110 		/* Skip whitespace after colon, expect slash. */
111 		do
112 		{
113 			if (c == '\n') { line++; }
114 			c = fgetc(fp);
115 		}
116 		while (lg_isspace(c));
117 		if (c == '!')
118 		{
119 			neg = true;
120 			do
121 			{
122 				if (c == '\n') { line++; }
123 				c = fgetc(fp);
124 			}
125 			while (lg_isspace(c));
126 		}
127 		if (c != '/')
128 		{
129 			prt_error("Error: Regex missing leading slash on line %d\n", line);
130 			goto failure;
131 		}
132 
133 		/* Read in the regex. */
134 		i = 0;
135 		do
136 		{
137 			if (i > MAX_REGEX_LENGTH-1)
138 			{
139 				prt_error("Error: Regex too long on line %d\n", line);
140 				goto failure;
141 			}
142 			prev = c;
143 			c = fgetc(fp);
144 			if ((c == '/') && (prev == '\\'))
145 				regex[i-1] = '/'; /* \/ is undefined */
146 			else
147 				regex[i++] = c;
148 		}
149 		while ((c != '/' || prev == '\\') && (c != EOF));
150 		regex[i-1] = '\0';
151 
152 		/* Expect termination by a slash. */
153 		if (c != '/')
154 		{
155 			prt_error("Error: Regex missing trailing slash on line %d\n", line);
156 			goto failure;
157 		}
158 
159 		/* Create new Regex_node and add to dict list. */
160 		new_re = (Regex_node *) malloc(sizeof(Regex_node));
161 		new_re->name    = strdup(name);
162 		new_re->pattern = strdup(regex);
163 		new_re->neg     = neg;
164 		new_re->re      = NULL;
165 		new_re->next    = NULL;
166 		*tail = new_re;
167 		tail = &new_re->next;
168 	}
169 
170 	fclose(fp);
171 	return 0;
172 failure:
173 	fclose(fp);
174 	return 1;
175 }
176 
177