1 /*
2 * Copyright (C) 2009, Nokia <ivan.frade@nokia.com>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 * Boston, MA 02110-1301, USA.
18 */
19
20 #include "config-miners.h"
21
22 #define _XOPEN_SOURCE
23 #define _XOPEN_SOURCE_EXTENDED 1 /* strptime is XPG4v2 */
24
25 #include <time.h>
26 #include <string.h>
27 #include <stdio.h>
28
29 #include <libtracker-miners-common/tracker-utils.h>
30 #include <libtracker-miners-common/tracker-date-time.h>
31
32 #include "tracker-utils.h"
33
34 #ifndef HAVE_GETLINE
35
36 #include <stddef.h>
37 #include <stdlib.h>
38 #include <limits.h>
39 #include <errno.h>
40
41 #undef getdelim
42 #undef getline
43
44 #define GROW_BY 80
45
46 #endif /* HAVE_GETLINE */
47
48 #define DATE_FORMAT_ISO8601 "%Y-%m-%dT%H:%M:%S%z"
49
50 /**
51 * SECTION:tracker-utils
52 * @title: Data utilities
53 * @short_description: Functions for coalescing, merging, date
54 * handling and normalizing
55 * @stability: Stable
56 * @include: libtracker-extract/tracker-extract.h
57 *
 * This API provides common, general-purpose helper functions which
 * extractors may find useful. These functions are also used quite
 * frequently by the in-house extractors.
61 **/
62
/* Abbreviated English month names; index 0 is January, index 11 is
 * December. */
static const char *months[] = {
	"Jan",
	"Feb",
	"Mar",
	"Apr",
	"May",
	"Jun",
	"Jul",
	"Aug",
	"Sep",
	"Oct",
	"Nov",
	"Dec"
};

/* Last decimal digit of the 1-based month number, indexed like
 * months[]: entries 0..8 hold '1'..'9', entries 9..11 hold the units
 * digit of 10, 11 and 12. */
static const char imonths[] = {
	'1', '2', '3', '4',
	'5', '6', '7', '8',
	'9', '0', '1', '2'
};
72
73
74 /**
75 * tracker_coalesce_strip:
76 * @n_values: the number of @... supplied
77 * @...: the string pointers to coalesce
78 *
79 * This function iterates through a series of string pointers passed
80 * using @... and returns the first which is not %NULL, not empty
81 * (i.e. "") and not comprised of one or more spaces (i.e. " ").
82 *
83 * The returned value is stripped using g_strstrip(). It is MOST
84 * important NOT to pass constant string pointers to this function!
85 *
86 * Returns: the first string pointer from those provided which
87 * matches, otherwise %NULL.
88 *
89 * Since: 0.10
90 **/
91 const gchar *
tracker_coalesce_strip(gint n_values,...)92 tracker_coalesce_strip (gint n_values,
93 ...)
94 {
95 va_list args;
96 gint i;
97 const gchar *result = NULL;
98
99 va_start (args, n_values);
100
101 for (i = 0; i < n_values; i++) {
102 gchar *value;
103
104 value = va_arg (args, gchar *);
105 if (!result && !tracker_is_blank_string (value)) {
106 result = (const gchar *) g_strstrip (value);
107 break;
108 }
109 }
110
111 va_end (args);
112
113 return result;
114 }
115
116 // LCOV_EXCL_START
117
118 /**
119 * tracker_coalesce:
120 * @n_values: the number of @Varargs supplied
121 * @...: the string pointers to coalesce
122 *
123 * This function iterates through a series of string pointers passed
124 * using @... and returns the first which is not %NULL, not empty
125 * (i.e. "") and not comprised of one or more spaces (i.e. " ").
126 *
127 * The returned value is stripped using g_strstrip(). All other values
128 * supplied are freed. It is MOST important NOT to pass constant
129 * string pointers to this function!
130 *
131 * Returns: the first string pointer from those provided which
132 * matches, otherwise %NULL.
133 *
134 * Since: 0.8
135 *
136 * Deprecated: 0.10: Use tracker_coalesce_strip() instead.
137 *
138 **/
139 gchar *
tracker_coalesce(gint n_values,...)140 tracker_coalesce (gint n_values,
141 ...)
142 {
143 va_list args;
144 gint i;
145 gchar *result = NULL;
146
147 va_start (args, n_values);
148
149 for (i = 0; i < n_values; i++) {
150 gchar *value;
151
152 value = va_arg (args, gchar *);
153 if (!result && !tracker_is_blank_string (value)) {
154 result = g_strstrip (value);
155 } else {
156 g_free (value);
157 }
158 }
159
160 va_end (args);
161
162 return result;
163 }
164 // LCOV_EXCL_STOP
165
166 /**
167 * tracker_merge_const:
168 * @delimiter: the delimiter to use when merging
169 * @n_values: the number of @... supplied
170 * @...: the string pointers to merge
171 *
172 * This function iterates through a series of string pointers passed
173 * using @... and returns a newly allocated string of the merged
174 * strings.
175 *
176 * The @delimiter can be %NULL. If specified, it will be used in
177 * between each merged string in the result.
178 *
179 * Returns: a newly-allocated string holding the result which should
180 * be freed with g_free() when finished with, otherwise %NULL.
181 *
182 * Since: 0.10
183 **/
184 gchar *
tracker_merge_const(const gchar * delimiter,gint n_values,...)185 tracker_merge_const (const gchar *delimiter,
186 gint n_values,
187 ...)
188 {
189 va_list args;
190 gint i;
191 GString *str = NULL;
192
193 va_start (args, n_values);
194
195 for (i = 0; i < n_values; i++) {
196 gchar *value;
197
198 value = va_arg (args, gchar *);
199 if (value) {
200 if (!str) {
201 str = g_string_new (value);
202 } else {
203 if (delimiter) {
204 g_string_append (str, delimiter);
205 }
206 g_string_append (str, value);
207 }
208 }
209 }
210
211 va_end (args);
212
213 if (!str) {
214 return NULL;
215 }
216
217 return g_string_free (str, FALSE);
218 }
219
220 // LCOV_EXCL_START
221
222 /**
223 * tracker_merge:
224 * @delimiter: the delimiter to use when merging
225 * @n_values: the number of @... supplied
226 * @...: the string pointers to merge
227 *
228 * This function iterates through a series of string pointers passed
229 * using @... and returns a newly allocated string of the merged
230 * strings. All passed strings are freed (don't pass const values)/
231 *
232 * The @delimiter can be %NULL. If specified, it will be used in
233 * between each merged string in the result.
234 *
235 * Returns: a newly-allocated string holding the result which should
236 * be freed with g_free() when finished with, otherwise %NULL.
237 *
238 * Since: 0.8
239 *
240 * Deprecated: 0.10: Use tracker_merge_const() instead.
241 **/
242 gchar *
tracker_merge(const gchar * delimiter,gint n_values,...)243 tracker_merge (const gchar *delimiter,
244 gint n_values,
245 ...)
246 {
247 va_list args;
248 gint i;
249 GString *str = NULL;
250
251 va_start (args, n_values);
252
253 for (i = 0; i < n_values; i++) {
254 gchar *value;
255
256 value = va_arg (args, gchar *);
257 if (value) {
258 if (!str) {
259 str = g_string_new (value);
260 } else {
261 if (delimiter) {
262 g_string_append (str, delimiter);
263 }
264 g_string_append (str, value);
265 }
266 g_free (value);
267 }
268 }
269
270 va_end (args);
271
272 if (!str) {
273 return NULL;
274 }
275
276 return g_string_free (str, FALSE);
277 }
278
279 /**
280 * tracker_text_normalize:
281 * @text: the text to normalize
282 * @max_words: the maximum words of @text to normalize
283 * @n_words: the number of words actually normalized
284 *
285 * This function iterates through @text checking for UTF-8 validity
286 * using g_utf8_get_char_validated(). For each character found, the
287 * %GUnicodeType is checked to make sure it is one fo the following
288 * values:
289 * <itemizedlist>
290 * <listitem><para>%G_UNICODE_LOWERCASE_LETTER</para></listitem>
291 * <listitem><para>%G_UNICODE_MODIFIER_LETTER</para></listitem>
292 * <listitem><para>%G_UNICODE_OTHER_LETTER</para></listitem>
293 * <listitem><para>%G_UNICODE_TITLECASE_LETTER</para></listitem>
294 * <listitem><para>%G_UNICODE_UPPERCASE_LETTER</para></listitem>
295 * </itemizedlist>
296 *
297 * All other symbols, punctuation, marks, numbers and separators are
298 * stripped. A regular space (i.e. " ") is used to separate the words
299 * in the returned string.
300 *
301 * The @n_words can be %NULL. If specified, it will be populated with
302 * the number of words that were normalized in the result.
303 *
304 * Returns: a newly-allocated string holding the result which should
305 * be freed with g_free() when finished with, otherwise %NULL.
306 *
307 * Since: 0.8
308 *
309 * Deprecated: 0.10: Use tracker_text_validate_utf8() instead.
310 **/
311 gchar *
tracker_text_normalize(const gchar * text,guint max_words,guint * n_words)312 tracker_text_normalize (const gchar *text,
313 guint max_words,
314 guint *n_words)
315 {
316 GString *string;
317 gboolean in_break = TRUE;
318 gunichar ch;
319 gint words = 0;
320
321 string = g_string_new (NULL);
322
323 while ((ch = g_utf8_get_char_validated (text, -1)) > 0) {
324 GUnicodeType type;
325
326 type = g_unichar_type (ch);
327
328 if (type == G_UNICODE_LOWERCASE_LETTER ||
329 type == G_UNICODE_MODIFIER_LETTER ||
330 type == G_UNICODE_OTHER_LETTER ||
331 type == G_UNICODE_TITLECASE_LETTER ||
332 type == G_UNICODE_UPPERCASE_LETTER) {
333 /* Append regular chars */
334 g_string_append_unichar (string, ch);
335 in_break = FALSE;
336 } else if (!in_break) {
337 /* Non-regular char found, treat as word break */
338 g_string_append_c (string, ' ');
339 in_break = TRUE;
340 words++;
341
342 if (words > max_words) {
343 break;
344 }
345 }
346
347 text = g_utf8_find_next_char (text, NULL);
348 }
349
350 if (n_words) {
351 if (!in_break) {
352 /* Count the last word */
353 words += 1;
354 }
355 *n_words = words;
356 }
357
358 return g_string_free (string, FALSE);
359 }
360
361 // LCOV_EXCL_STOP
362
363 /**
364 * tracker_text_validate_utf8:
365 * @text: the text to validate
366 * @text_len: length of @text, or -1 if NUL-terminated
367 * @str: the string where to place the validated UTF-8 characters, or %NULL if
368 * not needed.
369 * @valid_len: Output number of valid UTF-8 bytes found, or %NULL if not needed
370 *
371 * This function iterates through @text checking for UTF-8 validity
372 * using g_utf8_validate(), appends the first chunk of valid characters
373 * to @str, and gives the number of valid UTF-8 bytes in @valid_len.
374 *
375 * Returns: %TRUE if some bytes were found to be valid, %FALSE otherwise.
376 *
377 * Since: 0.10
378 **/
379 gboolean
tracker_text_validate_utf8(const gchar * text,gssize text_len,GString ** str,gsize * valid_len)380 tracker_text_validate_utf8 (const gchar *text,
381 gssize text_len,
382 GString **str,
383 gsize *valid_len)
384 {
385 gsize len_to_validate;
386
387 g_return_val_if_fail (text, FALSE);
388
389 len_to_validate = text_len >= 0 ? text_len : strlen (text);
390
391 if (len_to_validate > 0) {
392 const gchar *end = text;
393
394 /* Validate string, getting the pointer to first non-valid character
395 * (if any) or to the end of the string. */
396 g_utf8_validate (text, len_to_validate, &end);
397 if (end > text) {
398 /* If str output required... */
399 if (str) {
400 /* Create string to output if not already as input */
401 *str = (*str == NULL ?
402 g_string_new_len (text, end - text) :
403 g_string_append_len (*str, text, end - text));
404 }
405
406 /* If utf8 len output required... */
407 if (valid_len) {
408 *valid_len = end - text;
409 }
410
411 return TRUE;
412 }
413 }
414
415 return FALSE;
416 }
417
418 /**
419 * tracker_date_format_to_iso8601:
420 * @date_string: the date in a string pointer
421 * @format: the format of the @date_string
422 *
423 * This function uses strptime() to create a time tm structure using
424 * @date_string and @format.
425 *
426 * Returns: a newly-allocated string with the time represented in
427 * ISO8601 date format which should be freed with g_free() when
428 * finished with, otherwise %NULL.
429 *
430 * Since: 0.8
431 **/
432 gchar *
tracker_date_format_to_iso8601(const gchar * date_string,const gchar * format)433 tracker_date_format_to_iso8601 (const gchar *date_string,
434 const gchar *format)
435 {
436 gchar *result;
437 struct tm date_tm = { 0 };
438
439 g_return_val_if_fail (date_string != NULL, NULL);
440 g_return_val_if_fail (format != NULL, NULL);
441
442 if (strptime (date_string, format, &date_tm) == 0) {
443 return NULL;
444 }
445
446 /* If the input format string doesn't parse timezone information with
447 * either %z or %Z, strptime() won't set the tm_gmtoff member in the
448 * broken-down time, and the value during initialization (0) will be
449 * left. This effectively means that every broken-down time obtained
450 * with strptime() without parsing timezone information will be based
451 * on UTC, instead of being treated as localtime. In order to fix this
452 * and set the correct value for the offset w.r.t gmt, we can just
453 * use mktime() to fill in the daylight saving flag as well as the
454 * gmt offset value. */
455 if (!strstr (format, "%z") && !strstr (format, "%Z")) {
456 /* tm_isdst not set by strptime(), we set -1 on it in order to ask
457 * mktime to 'normalize' its contents and fill in the gmt offset
458 * and daylight saving time information */
459 date_tm.tm_isdst = -1;
460
461 /* Note: no real problem if mktime() fails. In this case, tm_isdst
462 * will be -1, and therefore strftime() will not write the timezone
463 * information, which is equally right to represent localtime. */
464 mktime (&date_tm);
465 }
466
467 result = g_malloc (sizeof (char) * 25);
468 strftime (result, 25, DATE_FORMAT_ISO8601 , &date_tm);
469 return result;
470 }
471
472 static gboolean
is_int(const gchar * str)473 is_int (const gchar *str)
474 {
475 gint i, len;
476
477 if (!str || str[0] == '\0') {
478 return FALSE;
479 }
480
481 len = strlen (str);
482
483 for (i = 0; i < len; i++) {
484 if (!g_ascii_isdigit (str[i])) {
485 return FALSE;
486 }
487 }
488
489 return TRUE ;
490 }
491
492 static gint
parse_month(const gchar * month)493 parse_month (const gchar *month)
494 {
495 gint i;
496
497 for (i = 0; i < 12; i++) {
498 if (!strncmp (month, months[i], 3)) {
499 return i;
500 }
501 }
502
503 return -1;
504 }
505
506 /* Determine date format and convert to ISO 8601 format */
507 /* FIXME We should handle all the fractions here (see ISO 8601), as well as YYYY:DDD etc */
508
509 /**
510 * tracker_date_guess:
511 * @date_string: the date in a string pointer
512 *
513 * This function uses a number of methods to try and guess the date
514 * held in @date_string. The @date_string must be at least 5
515 * characters in length or longer for any guessing to be attempted.
516 * Some of the string formats guessed include:
517 *
518 * <itemizedlist>
519 * <listitem><para>"YYYY-MM-DD" (Simple format)</para></listitem>
520 * <listitem><para>"20050315113224-08'00'" (PDF format)</para></listitem>
521 * <listitem><para>"20050216111533Z" (PDF format)</para></listitem>
522 * <listitem><para>"Mon Feb 9 10:10:00 2004" (Microsoft Office format)</para></listitem>
523 * <listitem><para>"2005:04:29 14:56:54" (Exif format)</para></listitem>
524 * <listitem><para>"YYYY-MM-DDThh:mm:ss.ff+zz:zz</para></listitem>
525 * </itemizedlist>
526 *
527 * Returns: a newly-allocated string with the time represented in
528 * ISO8601 date format which should be freed with g_free() when
529 * finished with, otherwise %NULL.
530 *
531 * Since: 0.8
532 **/
533 gchar *
tracker_date_guess(const gchar * date_string)534 tracker_date_guess (const gchar *date_string)
535 {
536 gchar buf[30];
537 gint len;
538 GError *error = NULL;
539
540 if (!date_string) {
541 return NULL;
542 }
543
544 len = strlen (date_string);
545
546 /* We cannot format a date without at least a four digit
547 * year.
548 */
549 if (len < 4) {
550 return NULL;
551 }
552
553 /* Check for year only dates (EG ID3 music tags might have
554 * Audio.ReleaseDate as 4 digit year)
555 */
556 if (len == 4) {
557 if (is_int (date_string)) {
558 buf[0] = date_string[0];
559 buf[1] = date_string[1];
560 buf[2] = date_string[2];
561 buf[3] = date_string[3];
562 buf[4] = '-';
563 buf[5] = '0';
564 buf[6] = '1';
565 buf[7] = '-';
566 buf[8] = '0';
567 buf[9] = '1';
568 buf[10] = 'T';
569 buf[11] = '0';
570 buf[12] = '0';
571 buf[13] = ':';
572 buf[14] = '0';
573 buf[15] = '0';
574 buf[16] = ':';
575 buf[17] = '0';
576 buf[18] = '0';
577 buf[19] = 'Z';
578 buf[20] = '\0';
579
580 tracker_string_to_date (buf, NULL, &error);
581
582 if (error != NULL) {
583 g_error_free (error);
584 return NULL;
585 }
586
587 return g_strdup (buf);
588 } else {
589 return NULL;
590 }
591 } else if (len == 10) {
592 /* Check for date part only YYYY-MM-DD */
593 buf[0] = date_string[0];
594 buf[1] = date_string[1];
595 buf[2] = date_string[2];
596 buf[3] = date_string[3];
597 buf[4] = '-';
598 buf[5] = date_string[5];
599 buf[6] = date_string[6];
600 buf[7] = '-';
601 buf[8] = date_string[8];
602 buf[9] = date_string[9];
603 buf[10] = 'T';
604 buf[11] = '0';
605 buf[12] = '0';
606 buf[13] = ':';
607 buf[14] = '0';
608 buf[15] = '0';
609 buf[16] = ':';
610 buf[17] = '0';
611 buf[18] = '0';
612 buf[19] = '\0';
613
614 tracker_string_to_date (buf, NULL, &error);
615
616 if (error != NULL) {
617 g_error_free (error);
618 return NULL;
619 }
620
621 return g_strdup (buf);
622 } else if (len == 14) {
623 /* Check for pdf format EG 20050315113224-08'00' or
624 * 20050216111533Z
625 */
626 buf[0] = date_string[0];
627 buf[1] = date_string[1];
628 buf[2] = date_string[2];
629 buf[3] = date_string[3];
630 buf[4] = '-';
631 buf[5] = date_string[4];
632 buf[6] = date_string[5];
633 buf[7] = '-';
634 buf[8] = date_string[6];
635 buf[9] = date_string[7];
636 buf[10] = 'T';
637 buf[11] = date_string[8];
638 buf[12] = date_string[9];
639 buf[13] = ':';
640 buf[14] = date_string[10];
641 buf[15] = date_string[11];
642 buf[16] = ':';
643 buf[17] = date_string[12];
644 buf[18] = date_string[13];
645 buf[19] = '\0';
646
647 tracker_string_to_date (buf, NULL, &error);
648
649 if (error != NULL) {
650 g_error_free (error);
651 return NULL;
652 }
653
654 return g_strdup (buf);
655 } else if (len == 15 && date_string[14] == 'Z') {
656 buf[0] = date_string[0];
657 buf[1] = date_string[1];
658 buf[2] = date_string[2];
659 buf[3] = date_string[3];
660 buf[4] = '-';
661 buf[5] = date_string[4];
662 buf[6] = date_string[5];
663 buf[7] = '-';
664 buf[8] = date_string[6];
665 buf[9] = date_string[7];
666 buf[10] = 'T';
667 buf[11] = date_string[8];
668 buf[12] = date_string[9];
669 buf[13] = ':';
670 buf[14] = date_string[10];
671 buf[15] = date_string[11];
672 buf[16] = ':';
673 buf[17] = date_string[12];
674 buf[18] = date_string[13];
675 buf[19] = 'Z';
676 buf[20] = '\0';
677
678 tracker_string_to_date (buf, NULL, &error);
679
680 if (error != NULL) {
681 g_error_free (error);
682 return NULL;
683 }
684
685 return g_strdup (buf);
686 } else if (len == 21 && (date_string[14] == '-' || date_string[14] == '+' )) {
687 buf[0] = date_string[0];
688 buf[1] = date_string[1];
689 buf[2] = date_string[2];
690 buf[3] = date_string[3];
691 buf[4] = '-';
692 buf[5] = date_string[4];
693 buf[6] = date_string[5];
694 buf[7] = '-';
695 buf[8] = date_string[6];
696 buf[9] = date_string[7];
697 buf[10] = 'T';
698 buf[11] = date_string[8];
699 buf[12] = date_string[9];
700 buf[13] = ':';
701 buf[14] = date_string[10];
702 buf[15] = date_string[11];
703 buf[16] = ':';
704 buf[17] = date_string[12];
705 buf[18] = date_string[13];
706 buf[19] = date_string[14];
707 buf[20] = date_string[15];
708 buf[21] = date_string[16];
709 buf[22] = ':';
710 buf[23] = date_string[18];
711 buf[24] = date_string[19];
712 buf[25] = '\0';
713
714 tracker_string_to_date (buf, NULL, &error);
715
716 if (error != NULL) {
717 g_error_free (error);
718 return NULL;
719 }
720
721 return g_strdup (buf);
722 } else if ((len == 24) && (date_string[3] == ' ')) {
723 /* Check for msoffice date format "Mon Feb 9 10:10:00 2004" */
724 gint num_month;
725 gchar mon1;
726 gchar day1;
727
728 num_month = parse_month (date_string + 4);
729
730 if (num_month < 0) {
731 return NULL;
732 }
733
734 mon1 = imonths[num_month];
735
736 if (date_string[8] == ' ') {
737 day1 = '0';
738 } else {
739 day1 = date_string[8];
740 }
741
742 buf[0] = date_string[20];
743 buf[1] = date_string[21];
744 buf[2] = date_string[22];
745 buf[3] = date_string[23];
746 buf[4] = '-';
747
748 if (num_month < 10) {
749 buf[5] = '0';
750 buf[6] = mon1;
751 } else {
752 buf[5] = '1';
753 buf[6] = mon1;
754 }
755
756 buf[7] = '-';
757 buf[8] = day1;
758 buf[9] = date_string[9];
759 buf[10] = 'T';
760 buf[11] = date_string[11];
761 buf[12] = date_string[12];
762 buf[13] = ':';
763 buf[14] = date_string[14];
764 buf[15] = date_string[15];
765 buf[16] = ':';
766 buf[17] = date_string[17];
767 buf[18] = date_string[18];
768 buf[19] = '\0';
769
770 tracker_string_to_date (buf, NULL, &error);
771
772 if (error != NULL) {
773 g_error_free (error);
774 return NULL;
775 }
776
777 return g_strdup (buf);
778 } else if ((len == 19) && (date_string[4] == ':') && (date_string[7] == ':')) {
779 /* Check for Exif date format "2005:04:29 14:56:54" */
780 buf[0] = date_string[0];
781 buf[1] = date_string[1];
782 buf[2] = date_string[2];
783 buf[3] = date_string[3];
784 buf[4] = '-';
785 buf[5] = date_string[5];
786 buf[6] = date_string[6];
787 buf[7] = '-';
788 buf[8] = date_string[8];
789 buf[9] = date_string[9];
790 buf[10] = 'T';
791 buf[11] = date_string[11];
792 buf[12] = date_string[12];
793 buf[13] = ':';
794 buf[14] = date_string[14];
795 buf[15] = date_string[15];
796 buf[16] = ':';
797 buf[17] = date_string[17];
798 buf[18] = date_string[18];
799 buf[19] = '\0';
800
801 tracker_string_to_date (buf, NULL, &error);
802
803 if (error != NULL) {
804 g_error_free (error);
805 return NULL;
806 }
807
808 return g_strdup (buf);
809 }
810
811 tracker_string_to_date (date_string, NULL, &error);
812
813 if (error != NULL) {
814 g_error_free (error);
815 return NULL;
816 }
817
818 return g_strdup (date_string);
819 }
820
821 #ifndef HAVE_GETLINE
822
823 static gint
my_igetdelim(gchar ** linebuf,gsize * linebufsz,gint delimiter,FILE * file)824 my_igetdelim (gchar **linebuf,
825 gsize *linebufsz,
826 gint delimiter,
827 FILE *file)
828 {
829 gint ch;
830 gint idx;
831
832 if ((file == NULL || linebuf == NULL || *linebuf == NULL || *linebufsz == 0) &&
833 !(*linebuf == NULL && *linebufsz == 0)) {
834 errno = EINVAL;
835 return -1;
836 }
837
838 if (*linebuf == NULL && *linebufsz == 0) {
839 *linebuf = g_malloc (GROW_BY);
840
841 if (!*linebuf) {
842 errno = ENOMEM;
843 return -1;
844 }
845
846 *linebufsz += GROW_BY;
847 }
848
849 idx = 0;
850
851 while ((ch = fgetc (file)) != EOF) {
852 /* Grow the line buffer as necessary */
853 while (idx > *linebufsz - 2) {
854 *linebuf = g_realloc (*linebuf, *linebufsz += GROW_BY);
855
856 if (!*linebuf) {
857 errno = ENOMEM;
858 return -1;
859 }
860 }
861 (*linebuf)[idx++] = (gchar) ch;
862
863 if ((gchar) ch == delimiter) {
864 break;
865 }
866 }
867
868 if (idx != 0) {
869 (*linebuf)[idx] = 0;
870 } else if ( ch == EOF ) {
871 return -1;
872 }
873
874 return idx;
875 }
876
877 #endif /* HAVE_GETLINE */
878
879 /**
880 * tracker_getline:
881 * @lineptr: Buffer to write into
882 * @n: Max bytes of linebuf
883 * @stream: Filestream to read from
884 *
885 * Reads an entire line from stream, storing the address of the buffer
886 * containing the text into *lineptr. The buffer is null-terminated
887 * and includes the newline character, if one was found.
888 *
889 * Read GNU getline()'s manpage for more information
890 *
891 * Returns: the number of characters read, including the delimiter
892 * character, but not including the terminating %NULL byte. This value
893 * can be used to handle embedded %NULL bytes in the line read. Upon
894 * failure, -1 is returned.
895 *
896 * Since: 0.10
897 **/
898 gssize
tracker_getline(gchar ** lineptr,gsize * n,FILE * stream)899 tracker_getline (gchar **lineptr,
900 gsize *n,
901 FILE *stream)
902 {
903 #ifndef HAVE_GETLINE
904 return my_igetdelim (lineptr, n, '\n', stream);
905 #else /* HAVE_GETLINE */
906 return getline (lineptr, n, stream);
907 #endif /* HAVE_GETLINE */
908 }
909
910 /**
911 * tracker_keywords_parse:
912 * @store: Array where to store the keywords
913 * @keywords: Keywords line to parse
914 *
915 * Parses a keywords line into store, avoiding duplicates and stripping leading
916 * and trailing spaces from keywords. Allowed delimiters are , and ;
917 *
918 * Since: 0.10
919 **/
920 void
tracker_keywords_parse(GPtrArray * store,const gchar * keywords)921 tracker_keywords_parse (GPtrArray *store,
922 const gchar *keywords)
923 {
924 gchar *orig, *keywords_d;
925 char *saveptr, *p;
926 size_t len;
927
928 keywords_d = orig = g_strdup (keywords);
929 p = keywords_d;
930 keywords_d = strchr (keywords_d, '"');
931
932 if (keywords_d) {
933 keywords_d++;
934 } else {
935 keywords_d = p;
936 }
937
938 len = strlen (keywords_d);
939 if (len > 0 && keywords_d[len - 1] == '"') {
940 keywords_d[len - 1] = '\0';
941 }
942
943 for (p = strtok_r (keywords_d, ",;", &saveptr); p;
944 p = strtok_r (NULL, ",;", &saveptr)) {
945 guint i;
946 gboolean found = FALSE;
947 gchar *p_do = g_strdup (p);
948 gchar *p_dup = p_do;
949 guint len = strlen (p_dup);
950
951 if (*p_dup == ' ')
952 p_dup++;
953
954 if (p_dup[len-1] == ' ')
955 p_dup[len-1] = '\0';
956
957 /* ignore keywords containing invalid UTF-8 */
958 if (!g_utf8_validate (p_dup, -1, NULL)) {
959 g_free (p_do);
960 continue;
961 }
962
963 for (i = 0; i < store->len; i++) {
964 const gchar *earlier = g_ptr_array_index (store, i);
965 if (g_strcmp0 (earlier, p_dup) == 0) {
966 found = TRUE;
967 break;
968 }
969 }
970
971 if (!found) {
972 g_ptr_array_add (store, g_strdup (p_dup));
973 }
974
975 g_free (p_do);
976 }
977
978 g_free (orig);
979 }
980