1 #include <stdio.h>
2 #include <string.h>
3 #include "html_import.h"
4 #include "selection_store.h"
5 #include "utility.h"
6
7 /*
8 static void html_import_start_element (GMarkupParseContext* context,
9 const gchar* element_name,
10 const gchar** attr_names,
11 const gchar** attr_values,
12 HtmlImport* hi,
13 GError** error)
14 */
/*
 * Handle one start tag that has already been split into its element name
 * and NULL-terminated, parallel attribute name/value arrays.
 *
 *  - <base href="..."> replaces hi->base_href with a copy of the href value.
 *  - The first entry of hi->element_list whose element name matches the tag
 *    (ASCII case-insensitive) receives the value of the first attribute
 *    whose name matches that entry's attr_name.  Later entries with the
 *    same element name are not consulted.
 */
void html_import_start_element (HtmlImport*   hi,
                                const gchar*  element_name,
                                const gchar** attr_names,
                                const gchar** attr_values)
{
	GList*             link;
	HtmlImportElement* grabbed = NULL;
	gint               nth;

	// <base href="some.base.address"> : remember the base address
	if (g_ascii_strcasecmp (element_name, "base") == 0) {
		for (nth = 0; attr_names[nth] != NULL; nth++) {
			if (g_ascii_strcasecmp (attr_names[nth], "href") != 0)
				continue;
			g_free (hi->base_href);
			hi->base_href = g_strdup (attr_values[nth]);
		}
	}

	// locate the first registered element that has this tag name
	link = hi->element_list;
	while (link != NULL) {
		grabbed = link->data;
		if (g_ascii_strcasecmp (element_name, grabbed->element_name) == 0)
			break;
		link = link->next;
	}
	if (link == NULL)
		return;

	// hand the first attribute with the wanted name to that element
	for (nth = 0; attr_names[nth] != NULL; nth++) {
		if (g_ascii_strcasecmp (attr_names[nth], grabbed->attr_name) == 0) {
			html_import_element_add_url (grabbed, attr_values[nth]);
			return;
		}
	}
}
54
html_import_new()55 HtmlImport* html_import_new()
56 {
57 HtmlImport* hi = g_malloc (sizeof (HtmlImport));
58
59 hi->base_href = NULL;
60
61 hi->name_array = g_ptr_array_sized_new (16);
62 hi->value_array = g_ptr_array_sized_new (16);
63
64 hi->element_list = NULL;
65 hi->gstring = g_string_sized_new (80);
66
67 return hi;
68 }
69
/*
 * Free an HtmlImport together with everything it owns: the base address,
 * both attribute pointer arrays, every HtmlImportElement in element_list,
 * the list itself and the scratch GString.
 */
void html_import_destroy (HtmlImport* hi)
{
	GList* link = hi->element_list;

	g_free (hi->base_href);

	g_ptr_array_free (hi->name_array, TRUE);
	g_ptr_array_free (hi->value_array, TRUE);

	// destroy the elements first, then the list nodes that held them
	while (link != NULL) {
		html_import_element_destroy (link->data);
		link = link->next;
	}
	g_list_free (hi->element_list);

	g_string_free (hi->gstring, TRUE);
	g_free (hi);
}
86
/*
 * Register an element/attribute pair to collect while parsing.
 * A new HtmlImportElement is created from the three strings and appended
 * to the end of html_import->element_list.
 */
void html_import_add_grabbed_element (HtmlImport* html_import,
                                      gchar*      element_name,
                                      gchar*      attr_name,
                                      gchar*      label_name)
{
	html_import->element_list =
		g_list_append (html_import->element_list,
		               html_import_element_new (element_name,
		                                        attr_name,
		                                        label_name));
}
99
html_import_get_element(HtmlImport * html_import,gint index)100 HtmlImportElement* html_import_get_element (HtmlImport* html_import,
101 gint index)
102 {
103 GList* node = g_list_nth (html_import->element_list, index);
104
105 return node->data;
106 }
107
html_import_parse_tag(HtmlImport * hi,gchar * tag_str)108 void html_import_parse_tag (HtmlImport* hi, gchar* tag_str)
109 {
110 gchar* current = tag_str;
111 gchar* element_name = NULL;
112 gchar* attr_name = NULL;
113 gchar* attr_value = NULL;
114 gint inside_level;
115 gchar inside_chr;
116
117 g_ptr_array_set_size (hi->name_array, 0);
118 g_ptr_array_set_size (hi->value_array, 0);
119
120 // element
121 while (*current) {
122 if (*current == ' ') {
123 *current++ = 0;
124 element_name = tag_str;
125 break;
126 }
127 current++;
128 }
129
130 if (element_name==NULL || *element_name==0)
131 return;
132
133 while (*current) {
134 // skip space
135 while (*current == ' ')
136 current++;
137
138 // attribute name
139 attr_name = current;
140 while (*current) {
141 if (*current == '=') {
142 *current++ = 0;
143 break;
144 }
145 else if (*current == ' ') {
146 attr_name = NULL;
147 break;
148 }
149 current++;
150 }
151
152 if (attr_name==NULL)
153 continue;
154
155 // attribute value
156 attr_value = current;
157 inside_level = 0;
158 inside_chr = 0;
159 while (*current) {
160 if (*current == '"' || *current == '\'') {
161 if (inside_chr != *current) {
162 if (inside_level==0)
163 attr_value = current+1;
164 inside_chr = *current;
165 inside_level++;
166 }
167 else if (inside_chr == *current) {
168 inside_chr = (inside_chr=='"') ? '\'' : '"';
169 inside_level--;
170 }
171
172 if (current[1] == 0 || current[1] == ' ' || inside_level==0) {
173 *current = 0;
174 break;
175 }
176 }
177 else if (*current == ' ' && inside_level == 0) {
178 *current = 0;
179 break;
180 }
181 current++;
182 }
183 // add attribute
184 g_ptr_array_add (hi->name_array, attr_name);
185 g_ptr_array_add (hi->value_array, attr_value);
186 }
187 g_ptr_array_add (hi->name_array, NULL);
188 g_ptr_array_add (hi->value_array, NULL);
189
190 // call
191 html_import_start_element (hi, element_name,
192 (const gchar**)hi->name_array->pdata,
193 (const gchar**)hi->value_array->pdata);
194
195 }
196
html_import_read_file(HtmlImport * hi,gchar * filename)197 gboolean html_import_read_file (HtmlImport* hi, gchar* filename)
198 {
199 FILE* file;
200 guint8 *buffer, *data_end, *current;
201
202 guint read_size;
203 gboolean inside_tag = FALSE;
204 GString* gstring;
205
206 file = fopen (filename, "r");
207
208 if (file==NULL)
209 return FALSE;
210
211 buffer = g_malloc (4096);
212 gstring = g_string_sized_new (4096);
213
214 do {
215 read_size = fread (buffer, 1, 4096, file);
216 data_end = buffer + read_size;
217
218 for (current=buffer; current!=data_end; current++) {
219 switch (*current) {
220 case '<':
221 inside_tag = TRUE;
222 g_string_truncate (gstring, 0);
223 break;
224 case '>':
225 if (inside_tag)
226 html_import_parse_tag (hi, gstring->str);
227 inside_tag = FALSE;
228 g_string_truncate (gstring, 0);
229 break;
230 case '\r':
231 case '\n':
232 // skip
233 break;
234 default:
235 if (inside_tag)
236 g_string_append_c (gstring, *current);
237 }
238 // check <tag> length
239 if( gstring->len >= 4096 ) {
240 inside_tag=FALSE;
241 g_string_truncate(gstring, 0);
242 }
243 }
244 } while (read_size);
245
246 g_string_free (gstring, TRUE);
247 g_free (buffer);
248
249 fclose (file);
250
251 return TRUE;
252 }
253
254 /*
255 gboolean html_import_read_file (HtmlImport* hi,
256 gchar* filename)
257 {
258 GMarkupParseContext* parser_context;
259 GMarkupParser parser;
260
261 FILE* file;
262 guint size;
263 guint8* buffer;
264 GError* error = NULL;
265 gboolean parse_ok = TRUE;
266
267 parser.start_element = html_import_start_element;
268 parser.end_element = NULL;
269 parser.text = NULL;
270 parser.passthrough = NULL;
271 parser.error = NULL;
272 parser_context = g_markup_parse_context_new (&parser, 0,
273 hi, NULL);
274
275 file = fopen (filename, "r");
276 if (file==NULL)
277 return FALSE;
278
279 buffer = g_malloc (4096);
280
281 while ( (size=fread (buffer, 1, 4096, file)) && parse_ok) {
282 parse_ok = g_markup_parse_context_parse (parser_context,
283 buffer, size, &error);
284 }
285
286 if (error)
287 g_error_free (error);
288
289 g_markup_parse_context_end_parse (parser_context, NULL);
290 g_markup_parse_context_free (parser_context);
291
292 g_free (buffer);
293 fclose (file);
294
295 return parse_ok;
296 }
297 */
298
html_import_get_url_first(HtmlImport * hi,HtmlImportIter * iter,const gchar ** string)299 gboolean html_import_get_url_first (HtmlImport* hi,
300 HtmlImportIter* iter,
301 const gchar** string)
302 {
303 iter->first = TRUE;
304 iter->element_list = hi->element_list;
305
306 return html_import_get_url_next (hi, iter, string);
307 }
308
html_import_get_url_next(HtmlImport * hi,HtmlImportIter * iter,const gchar ** string)309 gboolean html_import_get_url_next (HtmlImport* hi,
310 HtmlImportIter* iter,
311 const gchar** string)
312 {
313 HtmlImportElement* hie;
314 gboolean valid;
315 gboolean selected;
316 GString* gstring = hi->gstring;
317 UrlPart urlp;
318
319 for (; iter->element_list; iter->element_list=iter->element_list->next) {
320 hie = iter->element_list->data;
321
322 // check if first time search in this HtmlImportElement
323 if (iter->first) {
324 iter->first = FALSE;
325 valid = gtk_tree_model_get_iter_first (GTK_TREE_MODEL (hie->url_store),
326 &iter->tree_iter);
327 }
328 else {
329 valid = gtk_tree_model_iter_next (GTK_TREE_MODEL (hie->url_store),
330 &iter->tree_iter);
331 }
332
333 // search selected URL from HtmlImportElement
334 while (valid) {
335 gtk_tree_model_get (GTK_TREE_MODEL (hie->url_store),
336 &iter->tree_iter,
337 SELECTION_STORE_SELECTED, &selected,
338 SELECTION_STORE_STRING, string,
339 -1);
340 if (selected) {
341 url_part (&urlp, *string);
342 g_string_truncate (gstring, 0);
343
344 if (urlp.protocol_beg==NULL) {
345 url_part (&urlp, hi->base_href);
346 g_string_append_len (gstring, urlp.protocol_beg,
347 urlp.folder_end - urlp.protocol_beg);
348 }
349 g_string_append (gstring, *string);
350 *string = gstring->str;
351 return TRUE;
352 }
353
354 valid = gtk_tree_model_iter_next (GTK_TREE_MODEL (hie->url_store),
355 &iter->tree_iter);
356 }
357 iter->first = TRUE;
358 }
359 return FALSE;
360 }
361
362 // =================================================================
363
/*
 * Initialise an HtmlImportClass in place:
 *  - hash_table : string keys (freed with g_free), values are not freed
 *  - index_array: gint array with room reserved for 100 entries
 *  - list_store : a fresh selection store
 * Undo with html_import_class_clear().
 */
void html_import_class_init (HtmlImportClass* hic)
{
	hic->hash_table  = g_hash_table_new_full (g_str_hash,
	                                          g_str_equal,
	                                          (GDestroyNotify) g_free,
	                                          NULL);
	hic->index_array = g_array_sized_new (FALSE, FALSE, sizeof (gint), 100);
	hic->list_store  = selection_store_new ();
}
372
/*
 * Release what html_import_class_init() created: the hash table, the
 * index array (including its element storage) and the reference held on
 * the list store.  The HtmlImportClass structure itself is not freed.
 */
void html_import_class_clear (HtmlImportClass* hic)
{
	g_hash_table_destroy (hic->hash_table);
	g_array_free (hic->index_array, TRUE);
	g_object_unref (hic->list_store);
}
379
/*
 * Record that the next URL (position index_array->len) belongs to
 * class_name.  A NULL or empty name is filed under "(null)".
 *
 * hash_table maps each class name to the position of its most recent
 * URL; index_array[position] holds the position of the previous URL of
 * the same class, or -1 for the first one.  A class seen for the first
 * time also gets a row in list_store.
 */
void html_import_class_add (HtmlImportClass* hic,
                            const gchar*     class_name)
{
	GtkTreeIter row;
	gpointer    stored_key;
	gpointer    stored_val;
	gint        previous = -1;

	if (class_name == NULL || *class_name == 0)
		class_name = "(null)";

	if (g_hash_table_lookup_extended (hic->hash_table, class_name,
	                                  &stored_key, &stored_val))
	{
		// known class: chain to the URL that was its latest until now
		previous = GPOINTER_TO_INT (stored_val);
	}
	else {
		// new class: make it visible in the class list
		printf ("--- class:%s\n", class_name);
		gtk_list_store_append (hic->list_store, &row);
		gtk_list_store_set (hic->list_store, &row,
		                    SELECTION_STORE_STRING, class_name,
		                    -1);
	}

	// the class now points at the position about to be appended
	g_hash_table_insert (hic->hash_table,
	                     (gpointer) g_strdup (class_name),
	                     GINT_TO_POINTER (hic->index_array->len));
	g_array_append_val (hic->index_array, previous);
}
409
/*
 * For every selected row of hic->list_store, visit each URL of hie that
 * belongs to that class (following the chain kept in hic->index_array)
 * and update the URL row:
 *   SELECTION_STORE_SELECTED := TRUE if its count was already non-zero,
 *                               FALSE otherwise
 *   SELECTION_STORE_COUNT    := count + 1
 *
 * Classes missing from the hash table and rows that cannot be located in
 * hie->url_store are skipped.
 */
void html_import_class_apply_selection_to_element (HtmlImportClass*   hic,
                                                   HtmlImportElement* hie)
{
	GtkTreeModel* class_model = GTK_TREE_MODEL (hic->list_store);
	GtkTreeModel* url_model   = GTK_TREE_MODEL (hie->url_store);
	GtkTreeIter   class_iter;
	GtkTreeIter   url_iter;
	gboolean      valid;
	gboolean      class_selected;
	gchar*        class_name;
	gpointer      key, val;
	gint          index;
	gint          count;

	for (valid = gtk_tree_model_get_iter_first (class_model, &class_iter);
	     valid;
	     valid = gtk_tree_model_iter_next (class_model, &class_iter))
	{
		class_name     = NULL;
		class_selected = FALSE;
		// NOTE(review): assumes SELECTION_STORE_STRING is a G_TYPE_STRING
		// column, so class_name is a copy that must be freed -- confirm
		// in selection_store.
		gtk_tree_model_get (class_model, &class_iter,
		                    SELECTION_STORE_SELECTED, &class_selected,
		                    SELECTION_STORE_STRING, &class_name,
		                    -1);

		if (class_selected && class_name != NULL &&
		    g_hash_table_lookup_extended (hic->hash_table, class_name,
		                                  &key, &val))
		{
			index = GPOINTER_TO_INT (val);
			// -1 ends the chain; the range check guards the array access
			while (index >= 0 && (guint) index < hic->index_array->len) {
				if (!gtk_tree_model_iter_nth_child (url_model, &url_iter,
				                                    NULL, index))
					break;

				count = 0;
				gtk_tree_model_get (url_model, &url_iter,
				                    SELECTION_STORE_COUNT, &count,
				                    -1);

				gtk_list_store_set (hie->url_store, &url_iter,
				                    SELECTION_STORE_SELECTED, count ? TRUE : FALSE,
				                    SELECTION_STORE_COUNT, count + 1,
				                    -1);

				index = g_array_index (hic->index_array, gint, index);
			}
		}
		g_free (class_name);
	}
}
458
459 // =================================================================
460
html_import_element_new(gchar * element_name,gchar * attr_name,gchar * label_name)461 HtmlImportElement* html_import_element_new (gchar* element_name,
462 gchar* attr_name,
463 gchar* label_name)
464 {
465 HtmlImportElement* he = g_malloc (sizeof (HtmlImportElement));
466
467 he->element_name = g_strdup (element_name);
468 he->attr_name = g_strdup (attr_name);
469 he->label_name = g_strdup (label_name);
470
471 he->url_store = selection_store_new ();
472
473 html_import_class_init (&he->extension);
474 html_import_class_init (&he->address);
475
476 return he;
477 }
478
/*
 * Free an HtmlImportElement: its three strings, its reference on the URL
 * store, both classifiers, and finally the structure itself.
 */
void html_import_element_destroy (HtmlImportElement* he)
{
	// copied strings
	g_free (he->element_name);
	g_free (he->attr_name);
	g_free (he->label_name);

	// collected URLs
	g_object_unref (he->url_store);

	// classifiers
	html_import_class_clear (&he->extension);
	html_import_class_clear (&he->address);

	g_free (he);
}
492
/*
 * Add one URL to the element: classify it by extension and by address,
 * then append the unmodified URL to he->url_store.
 *
 * Classification works on a temporary copy of the URL because the address
 * part is NUL-terminated in place.  A URL without an extension or without
 * an address is passed to the classifier as NULL.
 */
void html_import_element_add_url (HtmlImportElement* he,
                                  const gchar*       url)
{
	UrlPart     parts;
	GtkTreeIter row;
	gchar*      scratch    = g_strdup (url);
	gchar*      ext_class  = NULL;
	gchar*      addr_class = NULL;

	url_part (&parts, scratch);

	// NOTE(review): unlike the address, the extension is not cut at
	// ext_end, so it runs to the end of the URL -- confirm this is intended.
	if (parts.ext_end != NULL)
		ext_class = parts.ext_beg;

	if (parts.address_end != NULL) {
		*parts.address_end = 0;
		addr_class = parts.address_beg;
	}

	html_import_class_add (&he->extension, ext_class);
	html_import_class_add (&he->address, addr_class);

	gtk_list_store_append (he->url_store, &row);
	gtk_list_store_set (he->url_store, &row,
	                    SELECTION_STORE_STRING, url,
	                    -1);

	g_free (scratch);
}
522
/*
 * Apply the current class selections to the element's URL rows:
 * first the address classifier, then the extension classifier.
 */
void html_import_element_apply_selection (HtmlImportElement* hie)
{
	HtmlImportClass* by_address   = &hie->address;
	HtmlImportClass* by_extension = &hie->extension;

	html_import_class_apply_selection_to_element (by_address, hie);
	html_import_class_apply_selection_to_element (by_extension, hie);
}
528
529