1 #include <stdio.h>
2 #include <string.h>
3 #include "html_import.h"
4 #include "selection_store.h"
5 #include "utility.h"
6 
7 /*
8 static void html_import_start_element (GMarkupParseContext* context,
9                                        const gchar*         element_name,
10                                        const gchar**        attr_names,
11                                        const gchar**        attr_values,
12                                        HtmlImport*          hi,
13                                        GError**             error)
14 */
/*
 * Process one start tag.
 *
 *  hi           : importer holding the base address and the list of
 *                 registered (grabbed) elements.
 *  element_name : tag name, compared case-insensitively.
 *  attr_names   : NULL-terminated array of attribute names.
 *  attr_values  : values, index-parallel to attr_names.
 *
 * A <base href="..."> tag replaces hi->base_href.  Independently of that,
 * the first registered element whose name matches the tag receives the
 * value of the first attribute whose name matches its attr_name.
 */
void html_import_start_element (HtmlImport*   hi,
                                const gchar*  element_name,
                                const gchar** attr_names,
                                const gchar** attr_values)
{
	GList*             link;
	HtmlImportElement* grabbed = NULL;
	gint               n;

	// <base href="some.base.address"> : remember the base address
	if (g_ascii_strcasecmp (element_name, "base") == 0) {
		for (n = 0; attr_names[n] != NULL; n++) {
			if (g_ascii_strcasecmp (attr_names[n], "href") != 0)
				continue;
			g_free (hi->base_href);
			hi->base_href = g_strdup (attr_values[n]);
		}
	}

	// stop at the first registered element with the same tag name
	for (link = hi->element_list; link != NULL; link = link->next) {
		grabbed = link->data;
		if (g_ascii_strcasecmp (element_name, grabbed->element_name) == 0)
			break;
	}
	if (link == NULL)
		return;

	// hand over the value of the first attribute with the wanted name
	for (n = 0; attr_names[n] != NULL; n++) {
		if (g_ascii_strcasecmp (attr_names[n], grabbed->attr_name) == 0) {
			html_import_element_add_url (grabbed, attr_values[n]);
			return;
		}
	}
}
54 
html_import_new()55 HtmlImport* html_import_new()
56 {
57 	HtmlImport* hi = g_malloc (sizeof (HtmlImport));
58 
59 	hi->base_href = NULL;
60 
61 	hi->name_array = g_ptr_array_sized_new (16);
62 	hi->value_array = g_ptr_array_sized_new (16);
63 
64 	hi->element_list = NULL;
65 	hi->gstring = g_string_sized_new (80);
66 
67 	return hi;
68 }
69 
/*
 * Release an importer created by html_import_new(): the base address,
 * both attribute arrays, every registered element, the element list,
 * the scratch string and finally the structure itself.
 */
void html_import_destroy (HtmlImport* hi)
{
	GList* link = hi->element_list;

	g_free (hi->base_href);

	g_ptr_array_free (hi->name_array, TRUE);
	g_ptr_array_free (hi->value_array, TRUE);

	// destroy the elements first, then the list cells that held them
	while (link != NULL) {
		html_import_element_destroy (link->data);
		link = link->next;
	}
	g_list_free (hi->element_list);

	g_string_free (hi->gstring, TRUE);
	g_free (hi);
}
86 
/*
 * Register a tag/attribute pair to collect while reading a file.
 * A new HtmlImportElement is built from the three strings and appended
 * to the end of html_import->element_list.
 */
void html_import_add_grabbed_element (HtmlImport* html_import,
                                      gchar* element_name,
                                      gchar* attr_name,
                                      gchar* label_name)
{
	html_import->element_list =
		g_list_append (html_import->element_list,
		               html_import_element_new (element_name,
		                                        attr_name,
		                                        label_name));
}
99 
html_import_get_element(HtmlImport * html_import,gint index)100 HtmlImportElement* html_import_get_element (HtmlImport* html_import,
101                                             gint index)
102 {
103 	GList* node = g_list_nth (html_import->element_list, index);
104 
105 	return node->data;
106 }
107 
/*
 * Split the text of one tag (without the surrounding '<' and '>') into an
 * element name plus attribute name/value pairs and pass them to
 * html_import_start_element().
 *
 *  hi      : importer; its name_array / value_array are reset and refilled.
 *  tag_str : tag text.  It is modified in place: separators are overwritten
 *            with NUL bytes and the collected pointers point into it.
 *
 * Only the blank character ' ' is treated as a separator.  A tag without
 * any blank (i.e. without attributes) is ignored.  Attributes that have
 * no '=' before the next blank are skipped.
 *
 * Fix: after terminating an attribute value the cursor is now moved past
 * the NUL that was just written.  Previously it stayed on that NUL, so
 * the outer loop stopped and every attribute after the first one with a
 * value was silently dropped (e.g. href in <a class="x" href="y">).
 */
void html_import_parse_tag (HtmlImport* hi, gchar* tag_str)
{
	gchar*   current      = tag_str;
	gchar*   element_name = NULL;
	gchar*   attr_name    = NULL;
	gchar*   attr_value   = NULL;
	gint     inside_level;
	gchar    inside_chr;

	g_ptr_array_set_size (hi->name_array, 0);
	g_ptr_array_set_size (hi->value_array, 0);

	// element name: everything before the first blank
	while (*current) {
		if (*current == ' ') {
			*current++ = 0;
			element_name = tag_str;
			break;
		}
		current++;
	}

	// no blank found (no attributes) or empty name: nothing to report
	if (element_name==NULL || *element_name==0)
		return;

	while (*current) {
		// skip space
		while (*current == ' ')
			current++;

		// only trailing blanks were left: do not record an empty attribute
		if (*current == 0)
			break;

		// attribute name: ends at '='; a blank first means "no value"
		attr_name  = current;
		while (*current) {
			if (*current == '=') {
				*current++ = 0;
				break;
			}
			else if (*current == ' ') {
				attr_name = NULL;
				break;
			}
			current++;
		}

		if (attr_name==NULL)
			continue;

		// attribute value: either a bare word ended by a blank, or a
		// quoted string.  inside_level counts open quotes and inside_chr
		// is the quote character that would close the current level, so
		// a different quote character nested inside does not end the value.
		attr_value = current;
		inside_level = 0;
		inside_chr   = 0;
		while (*current) {
			if (*current == '"' || *current == '\'') {
				if (inside_chr != *current) {
					// opening quote; the value starts after the outermost one
					if (inside_level==0)
						attr_value = current+1;
					inside_chr = *current;
					inside_level++;
				}
				else if (inside_chr == *current) {
					// closing quote; the enclosing level uses the other character
					inside_chr = (inside_chr=='"') ? '\'' : '"';
					inside_level--;
				}

				if (current[1] == 0 || current[1] == ' ' || inside_level==0) {
					// terminate the value and step past the terminator so
					// the remaining attributes are still parsed
					*current++ = 0;
					break;
				}
			}
			else if (*current == ' ' && inside_level == 0) {
				*current++ = 0;
				break;
			}
			current++;
		}
		// add attribute
		g_ptr_array_add (hi->name_array, attr_name);
		g_ptr_array_add (hi->value_array, attr_value);
	}
	// both arrays are handed over NULL-terminated
	g_ptr_array_add (hi->name_array, NULL);
	g_ptr_array_add (hi->value_array, NULL);

	// call
	html_import_start_element (hi, element_name,
	                           (const gchar**)hi->name_array->pdata,
	                           (const gchar**)hi->value_array->pdata);

}
196 
html_import_read_file(HtmlImport * hi,gchar * filename)197 gboolean html_import_read_file (HtmlImport* hi, gchar* filename)
198 {
199 	FILE*    file;
200 	guint8  *buffer, *data_end, *current;
201 
202 	guint    read_size;
203 	gboolean inside_tag = FALSE;
204 	GString* gstring;
205 
206 	file = fopen (filename, "r");
207 
208 	if (file==NULL)
209 		return FALSE;
210 
211 	buffer = g_malloc (4096);
212 	gstring = g_string_sized_new (4096);
213 
214 	do {
215 		read_size = fread (buffer, 1, 4096, file);
216 		data_end = buffer + read_size;
217 
218 		for (current=buffer; current!=data_end; current++) {
219 			switch (*current) {
220 			case '<':
221 				inside_tag = TRUE;
222 				g_string_truncate (gstring, 0);
223 				break;
224 			case '>':
225 				if (inside_tag)
226 					html_import_parse_tag (hi, gstring->str);
227 				inside_tag = FALSE;
228 				g_string_truncate (gstring, 0);
229 				break;
230 			case '\r':
231 			case '\n':
232 				// skip
233 				break;
234 			default:
235 				if (inside_tag)
236 					g_string_append_c (gstring, *current);
237 			}
238 			// check <tag> length
239 			if( gstring->len >= 4096 ) {
240 				inside_tag=FALSE;
241 				g_string_truncate(gstring, 0);
242 			}
243 		}
244 	} while (read_size);
245 
246 	g_string_free (gstring, TRUE);
247 	g_free (buffer);
248 
249 	fclose (file);
250 
251 	return TRUE;
252 }
253 
254 /*
255 gboolean html_import_read_file (HtmlImport* hi,
256                                 gchar* filename)
257 {
258 	GMarkupParseContext* parser_context;
259 	GMarkupParser        parser;
260 
261 	FILE*    file;
262 	guint    size;
263 	guint8*  buffer;
264 	GError*  error = NULL;
265 	gboolean parse_ok = TRUE;
266 
267 	parser.start_element = html_import_start_element;
268 	parser.end_element   = NULL;
269 	parser.text          = NULL;
270 	parser.passthrough   = NULL;
271 	parser.error         = NULL;
272 	parser_context = g_markup_parse_context_new (&parser, 0,
273 	                                             hi, NULL);
274 
275 	file = fopen (filename, "r");
276 	if (file==NULL)
277 		return FALSE;
278 
279 	buffer = g_malloc (4096);
280 
281 	while ( (size=fread (buffer, 1, 4096, file)) && parse_ok) {
282 		parse_ok = g_markup_parse_context_parse (parser_context,
283 		                                         buffer, size, &error);
284 	}
285 
286 	if (error)
287 		g_error_free (error);
288 
289 	g_markup_parse_context_end_parse (parser_context, NULL);
290 	g_markup_parse_context_free (parser_context);
291 
292 	g_free (buffer);
293 	fclose (file);
294 
295 	return parse_ok;
296 }
297 */
298 
html_import_get_url_first(HtmlImport * hi,HtmlImportIter * iter,const gchar ** string)299 gboolean html_import_get_url_first (HtmlImport*     hi,
300                                     HtmlImportIter* iter,
301                                     const gchar**   string)
302 {
303 	iter->first = TRUE;
304 	iter->element_list = hi->element_list;
305 
306 	return html_import_get_url_next (hi, iter, string);
307 }
308 
html_import_get_url_next(HtmlImport * hi,HtmlImportIter * iter,const gchar ** string)309 gboolean html_import_get_url_next (HtmlImport*     hi,
310                                    HtmlImportIter* iter,
311                                    const gchar**   string)
312 {
313 	HtmlImportElement* hie;
314 	gboolean    valid;
315 	gboolean    selected;
316 	GString*    gstring = hi->gstring;
317 	UrlPart     urlp;
318 
319 	for (; iter->element_list; iter->element_list=iter->element_list->next) {
320 		hie = iter->element_list->data;
321 
322 		// check if first time search in this HtmlImportElement
323 		if (iter->first) {
324 			iter->first = FALSE;
325 			valid = gtk_tree_model_get_iter_first (GTK_TREE_MODEL (hie->url_store),
326 			                                       &iter->tree_iter);
327 		}
328 		else {
329 			valid = gtk_tree_model_iter_next (GTK_TREE_MODEL (hie->url_store),
330 			                                  &iter->tree_iter);
331 		}
332 
333 		// search selected URL from HtmlImportElement
334 		while (valid) {
335 			gtk_tree_model_get (GTK_TREE_MODEL (hie->url_store),
336 			                    &iter->tree_iter,
337 			                    SELECTION_STORE_SELECTED, &selected,
338 			                    SELECTION_STORE_STRING,   string,
339 			                    -1);
340 			if (selected) {
341 				url_part (&urlp, *string);
342 				g_string_truncate (gstring, 0);
343 
344 				if (urlp.protocol_beg==NULL) {
345 					url_part (&urlp, hi->base_href);
346 					g_string_append_len (gstring, urlp.protocol_beg,
347 					                     urlp.folder_end - urlp.protocol_beg);
348 				}
349 				g_string_append (gstring, *string);
350 				*string = gstring->str;
351 				return TRUE;
352 			}
353 
354 			valid = gtk_tree_model_iter_next (GTK_TREE_MODEL (hie->url_store),
355 			                                  &iter->tree_iter);
356 		}
357 		iter->first = TRUE;
358 	}
359 	return FALSE;
360 }
361 
362 // =================================================================
363 
/*
 * Initialise an (already allocated) HtmlImportClass:
 *  - hash_table  : string-keyed table whose keys are released with g_free
 *  - index_array : array of gint with room reserved for 100 entries
 *  - list_store  : fresh store from selection_store_new()
 * Undo with html_import_class_clear().
 */
void html_import_class_init (HtmlImportClass* hic)
{
	hic->list_store  = selection_store_new ();
	hic->index_array = g_array_sized_new (FALSE, FALSE, sizeof (gint), 100);
	hic->hash_table  = g_hash_table_new_full (g_str_hash,
	                                          g_str_equal,
	                                          (GDestroyNotify) g_free,
	                                          NULL);
}
372 
/*
 * Release everything html_import_class_init() created.  The
 * HtmlImportClass structure itself is not freed.
 */
void html_import_class_clear (HtmlImportClass* hic)
{
	g_object_unref (hic->list_store);
	g_array_free (hic->index_array, TRUE);
	g_hash_table_destroy (hic->hash_table);
}
379 
/*
 * Record one more entry belonging to class `class_name`.
 *
 * A NULL or empty name is filed under "(null)".  A name seen for the
 * first time gets a row in hic->list_store.  The hash table always maps
 * a name to the position of its most recent entry, and the value
 * appended to hic->index_array is the position of the previous entry of
 * the same class (-1 for the first one), so entries of one class form a
 * backwards chain.
 */
void html_import_class_add (HtmlImportClass* hic,
                            const gchar* class_name)
{
	GtkTreeIter row;
	gpointer    found_key, found_val;
	gint        prev_index = -1;

	if (class_name == NULL || *class_name == 0)
		class_name = "(null)";

	if (g_hash_table_lookup_extended (hic->hash_table, class_name,
	                                  &found_key, &found_val))
	{
		prev_index = GPOINTER_TO_INT (found_val);
	}
	else {
		// first entry of this class: list it
		printf ("--- class:%s\n", class_name);
		gtk_list_store_append (hic->list_store, &row);
		gtk_list_store_set (hic->list_store, &row,
		                    SELECTION_STORE_STRING, class_name,
		                    -1);
	}

	// the entry about to be appended becomes the head of the chain
	g_hash_table_insert (hic->hash_table,
	                     (gpointer) g_strdup (class_name),
	                     GINT_TO_POINTER (hic->index_array->len));
	g_array_append_val (hic->index_array, prev_index);
}
409 
/*
 * For every selected class in hic->list_store, walk the chain of entries
 * of that class (hash table -> newest index, index_array -> previous
 * index, -1 ends the chain) and update the matching rows of
 * hie->url_store: a row becomes selected when its counter was already
 * non-zero, and its counter is then incremented.
 *
 * Fixes:
 *  - the class name returned by gtk_tree_model_get() is a newly
 *    allocated copy and was leaked on every row; it is now freed.
 *  - the result of g_hash_table_lookup_extended() is checked, so an
 *    unknown name no longer leads to reading an uninitialised value.
 *  - a row index that does not exist in hie->url_store ends the walk
 *    instead of using an invalid iterator.
 */
void html_import_class_apply_selection_to_element (HtmlImportClass*   hic,
                                                   HtmlImportElement* hie)
{
	GtkTreeModel* classes = GTK_TREE_MODEL (hic->list_store);
	GtkTreeModel* urls    = GTK_TREE_MODEL (hie->url_store);
	gboolean valid;
	gboolean selected;
	gchar* string;
	GtkTreeIter iter;
	GtkTreeIter iter_list;
	gpointer key, val;
	gint index;

	valid = gtk_tree_model_get_iter_first (classes, &iter);
	while(valid) {
		string = NULL;
		gtk_tree_model_get (classes, &iter,
		                    SELECTION_STORE_SELECTED, &selected,
		                    SELECTION_STORE_STRING,   &string,
		                    -1);
		if( selected && string != NULL &&
		    g_hash_table_lookup_extended (hic->hash_table, string,
		                                  &key, &val) )
		{
			index = GPOINTER_TO_INT(val);
			while( index != -1 ) {
				gint count;
				gboolean was_counted;

				if( !gtk_tree_model_iter_nth_child (urls, &iter_list,
				                                    NULL, index) )
					break;
				gtk_tree_model_get (urls, &iter_list,
				                    SELECTION_STORE_COUNT, &count,
				                    -1);

				// selected only if an earlier pass already counted this row
				was_counted = (count) ? TRUE : FALSE;
				count++;

				gtk_list_store_set (hie->url_store, &iter_list,
				                    SELECTION_STORE_SELECTED, was_counted,
				                    SELECTION_STORE_COUNT,    count,
				                    -1);

				// follow the chain to the previous entry of this class
				index = g_array_index (hic->index_array, gint, index);
			}
		}
		g_free (string);   // copy handed out by gtk_tree_model_get()

		valid = gtk_tree_model_iter_next (classes, &iter);
	}
}
458 
459 // =================================================================
460 
html_import_element_new(gchar * element_name,gchar * attr_name,gchar * label_name)461 HtmlImportElement* html_import_element_new (gchar* element_name,
462                                             gchar* attr_name,
463                                             gchar* label_name)
464 {
465 	HtmlImportElement* he = g_malloc (sizeof (HtmlImportElement));
466 
467 	he->element_name = g_strdup (element_name);
468 	he->attr_name    = g_strdup (attr_name);
469 	he->label_name   = g_strdup (label_name);
470 
471 	he->url_store = selection_store_new ();
472 
473 	html_import_class_init (&he->extension);
474 	html_import_class_init (&he->address);
475 
476 	return he;
477 }
478 
/*
 * Release an element created by html_import_element_new(): its URL
 * store, both class tables, the three copied strings and the structure.
 */
void html_import_element_destroy (HtmlImportElement* he)
{
	g_object_unref (he->url_store);

	html_import_class_clear (&he->extension);
	html_import_class_clear (&he->address);

	g_free (he->label_name);
	g_free (he->attr_name);
	g_free (he->element_name);

	g_free (he);
}
492 
/*
 * Add one URL to the element.
 *
 * The URL is split with url_part() on a private copy.  Its extension
 * (when url_part reports one) and its address (cut off at address_end)
 * are filed in he->extension and he->address, and the unmodified URL is
 * appended as a new row of he->url_store.
 */
void html_import_element_add_url (HtmlImportElement* he,
                                  const gchar* url)
{
	GtkTreeIter row;
	UrlPart     parts;
	gchar*      scratch = g_strdup (url);
	gchar*      extension;
	gchar*      address;

	url_part (&parts, scratch);

	extension = parts.ext_end ? parts.ext_beg : NULL;

	if (parts.address_end) {
		// terminate the copy right after the address
		*parts.address_end = 0;
		address = parts.address_beg;
	}
	else {
		address = NULL;
	}

	html_import_class_add (&he->extension, extension);
	html_import_class_add (&he->address, address);

	gtk_list_store_append (he->url_store, &row);
	gtk_list_store_set (he->url_store, &row,
	                    SELECTION_STORE_STRING, url,
	                    -1);

	g_free (scratch);
}
522 
/*
 * Apply the class selections of this element to its own URL store:
 * first the address classes, then the extension classes.
 */
void html_import_element_apply_selection (HtmlImportElement* hie)
{
	HtmlImportClass* by_address   = &hie->address;
	HtmlImportClass* by_extension = &hie->extension;

	html_import_class_apply_selection_to_element (by_address, hie);
	html_import_class_apply_selection_to_element (by_extension, hie);
}
528 
529