1 /*
2  * Copyright (C) 2009, Nokia <ivan.frade@nokia.com>
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with this library; if not, write to the
16  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17  * Boston, MA  02110-1301, USA.
18  */
19 
20 #include "config-miners.h"
21 
22 #define _XOPEN_SOURCE
23 #define _XOPEN_SOURCE_EXTENDED 1	/* strptime is XPG4v2 */
24 
25 #include <time.h>
26 #include <string.h>
27 #include <stdio.h>
28 
29 #include <libtracker-miners-common/tracker-utils.h>
30 #include <libtracker-miners-common/tracker-date-time.h>
31 
32 #include "tracker-utils.h"
33 
34 #ifndef HAVE_GETLINE
35 
36 #include <stddef.h>
37 #include <stdlib.h>
38 #include <limits.h>
39 #include <errno.h>
40 
41 #undef getdelim
42 #undef getline
43 
44 #define GROW_BY 80
45 
46 #endif /* HAVE_GETLINE */
47 
48 #define DATE_FORMAT_ISO8601 "%Y-%m-%dT%H:%M:%S%z"
49 
50 /**
51  * SECTION:tracker-utils
52  * @title: Data utilities
53  * @short_description: Functions for coalescing, merging, date
54  * handling and normalizing
55  * @stability: Stable
56  * @include: libtracker-extract/tracker-extract.h
57  *
 * This API provides common, general-purpose helper functions which
 * extractors may find useful. These functions are also used quite
 * frequently by the in-house extractors.
61  **/
62 
/* English three-letter month abbreviations, indexed by zero-based month
 * number. parse_month() compares its input against these entries.
 */
static const char *months[] = {
	"Jan", "Feb", "Mar", "Apr", "May", "Jun",
	"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
};
67 
/* Ones digit of the one-based month number, indexed by the zero-based
 * month returned by parse_month(): entries 0-8 hold '1'-'9' (January to
 * September), entries 9-11 hold '0', '1', '2' (October to December).
 * The tens digit is chosen separately by tracker_date_guess().
 */
static const char imonths[] = {
	'1', '2', '3', '4', '5',
	'6', '7', '8', '9', '0', '1', '2'
};
72 
73 
74 /**
75  * tracker_coalesce_strip:
76  * @n_values: the number of @... supplied
77  * @...: the string pointers to coalesce
78  *
79  * This function iterates through a series of string pointers passed
80  * using @... and returns the first which is not %NULL, not empty
81  * (i.e. "") and not comprised of one or more spaces (i.e. " ").
82  *
83  * The returned value is stripped using g_strstrip(). It is MOST
84  * important NOT to pass constant string pointers to this function!
85  *
86  * Returns: the first string pointer from those provided which
87  * matches, otherwise %NULL.
88  *
89  * Since: 0.10
90  **/
91 const gchar *
tracker_coalesce_strip(gint n_values,...)92 tracker_coalesce_strip (gint n_values,
93                         ...)
94 {
95 	va_list args;
96 	gint    i;
97 	const gchar *result = NULL;
98 
99 	va_start (args, n_values);
100 
101 	for (i = 0; i < n_values; i++) {
102 		gchar *value;
103 
104 		value = va_arg (args, gchar *);
105 		if (!result && !tracker_is_blank_string (value)) {
106 			result = (const gchar *) g_strstrip (value);
107 			break;
108 		}
109 	}
110 
111 	va_end (args);
112 
113 	return result;
114 }
115 
116 // LCOV_EXCL_START
117 
118 /**
119  * tracker_coalesce:
120  * @n_values: the number of @Varargs supplied
121  * @...: the string pointers to coalesce
122  *
123  * This function iterates through a series of string pointers passed
124  * using @... and returns the first which is not %NULL, not empty
125  * (i.e. "") and not comprised of one or more spaces (i.e. " ").
126  *
127  * The returned value is stripped using g_strstrip(). All other values
128  * supplied are freed. It is MOST important NOT to pass constant
129  * string pointers to this function!
130  *
131  * Returns: the first string pointer from those provided which
132  * matches, otherwise %NULL.
133  *
134  * Since: 0.8
135  *
136  * Deprecated: 0.10: Use tracker_coalesce_strip() instead.
137  *
138  **/
139 gchar *
tracker_coalesce(gint n_values,...)140 tracker_coalesce (gint n_values,
141                   ...)
142 {
143 	va_list args;
144 	gint    i;
145 	gchar *result = NULL;
146 
147 	va_start (args, n_values);
148 
149 	for (i = 0; i < n_values; i++) {
150 		gchar *value;
151 
152 		value = va_arg (args, gchar *);
153 		if (!result && !tracker_is_blank_string (value)) {
154 			result = g_strstrip (value);
155 		} else {
156 			g_free (value);
157 		}
158 	}
159 
160 	va_end (args);
161 
162 	return result;
163 }
164 // LCOV_EXCL_STOP
165 
166 /**
167  * tracker_merge_const:
168  * @delimiter: the delimiter to use when merging
169  * @n_values: the number of @... supplied
170  * @...: the string pointers to merge
171  *
172  * This function iterates through a series of string pointers passed
173  * using @... and returns a newly allocated string of the merged
174  * strings.
175  *
176  * The @delimiter can be %NULL. If specified, it will be used in
177  * between each merged string in the result.
178  *
179  * Returns: a newly-allocated string holding the result which should
180  * be freed with g_free() when finished with, otherwise %NULL.
181  *
182  * Since: 0.10
183  **/
184 gchar *
tracker_merge_const(const gchar * delimiter,gint n_values,...)185 tracker_merge_const (const gchar *delimiter,
186                      gint         n_values,
187                      ...)
188 {
189 	va_list args;
190 	gint    i;
191 	GString *str = NULL;
192 
193 	va_start (args, n_values);
194 
195 	for (i = 0; i < n_values; i++) {
196 		gchar *value;
197 
198 		value = va_arg (args, gchar *);
199 		if (value) {
200 			if (!str) {
201 				str = g_string_new (value);
202 			} else {
203 				if (delimiter) {
204 					g_string_append (str, delimiter);
205 				}
206 				g_string_append (str, value);
207 			}
208 		}
209 	}
210 
211 	va_end (args);
212 
213 	if (!str) {
214 		return NULL;
215 	}
216 
217 	return g_string_free (str, FALSE);
218 }
219 
220 // LCOV_EXCL_START
221 
222 /**
223  * tracker_merge:
224  * @delimiter: the delimiter to use when merging
225  * @n_values: the number of @... supplied
226  * @...: the string pointers to merge
227  *
228  * This function iterates through a series of string pointers passed
229  * using @... and returns a newly allocated string of the merged
230  * strings. All passed strings are freed (don't pass const values)/
231  *
232  * The @delimiter can be %NULL. If specified, it will be used in
233  * between each merged string in the result.
234  *
235  * Returns: a newly-allocated string holding the result which should
236  * be freed with g_free() when finished with, otherwise %NULL.
237  *
238  * Since: 0.8
239  *
240  * Deprecated: 0.10: Use tracker_merge_const() instead.
241  **/
242 gchar *
tracker_merge(const gchar * delimiter,gint n_values,...)243 tracker_merge (const gchar *delimiter,
244                gint         n_values,
245                ...)
246 {
247 	va_list args;
248 	gint    i;
249 	GString *str = NULL;
250 
251 	va_start (args, n_values);
252 
253 	for (i = 0; i < n_values; i++) {
254 		gchar *value;
255 
256 		value = va_arg (args, gchar *);
257 		if (value) {
258 			if (!str) {
259 				str = g_string_new (value);
260 			} else {
261 				if (delimiter) {
262 					g_string_append (str, delimiter);
263 				}
264 				g_string_append (str, value);
265 			}
266 			g_free (value);
267 		}
268 	}
269 
270 	va_end (args);
271 
272 	if (!str) {
273 		return NULL;
274 	}
275 
276 	return g_string_free (str, FALSE);
277 }
278 
279 /**
280  * tracker_text_normalize:
281  * @text: the text to normalize
282  * @max_words: the maximum words of @text to normalize
283  * @n_words: the number of words actually normalized
284  *
285  * This function iterates through @text checking for UTF-8 validity
286  * using g_utf8_get_char_validated(). For each character found, the
287  * %GUnicodeType is checked to make sure it is one fo the following
288  * values:
289  * <itemizedlist>
290  *  <listitem><para>%G_UNICODE_LOWERCASE_LETTER</para></listitem>
291  *  <listitem><para>%G_UNICODE_MODIFIER_LETTER</para></listitem>
292  *  <listitem><para>%G_UNICODE_OTHER_LETTER</para></listitem>
293  *  <listitem><para>%G_UNICODE_TITLECASE_LETTER</para></listitem>
294  *  <listitem><para>%G_UNICODE_UPPERCASE_LETTER</para></listitem>
295  * </itemizedlist>
296  *
297  * All other symbols, punctuation, marks, numbers and separators are
298  * stripped. A regular space (i.e. " ") is used to separate the words
299  * in the returned string.
300  *
301  * The @n_words can be %NULL. If specified, it will be populated with
302  * the number of words that were normalized in the result.
303  *
304  * Returns: a newly-allocated string holding the result which should
305  * be freed with g_free() when finished with, otherwise %NULL.
306  *
307  * Since: 0.8
308  *
309  * Deprecated: 0.10: Use tracker_text_validate_utf8() instead.
310  **/
311 gchar *
tracker_text_normalize(const gchar * text,guint max_words,guint * n_words)312 tracker_text_normalize (const gchar *text,
313                         guint        max_words,
314                         guint       *n_words)
315 {
316 	GString *string;
317 	gboolean in_break = TRUE;
318 	gunichar ch;
319 	gint words = 0;
320 
321 	string = g_string_new (NULL);
322 
323 	while ((ch = g_utf8_get_char_validated (text, -1)) > 0) {
324 		GUnicodeType type;
325 
326 		type = g_unichar_type (ch);
327 
328 		if (type == G_UNICODE_LOWERCASE_LETTER ||
329 		    type == G_UNICODE_MODIFIER_LETTER ||
330 		    type == G_UNICODE_OTHER_LETTER ||
331 		    type == G_UNICODE_TITLECASE_LETTER ||
332 		    type == G_UNICODE_UPPERCASE_LETTER) {
333 			/* Append regular chars */
334 			g_string_append_unichar (string, ch);
335 			in_break = FALSE;
336 		} else if (!in_break) {
337 			/* Non-regular char found, treat as word break */
338 			g_string_append_c (string, ' ');
339 			in_break = TRUE;
340 			words++;
341 
342 			if (words > max_words) {
343 				break;
344 			}
345 		}
346 
347 		text = g_utf8_find_next_char (text, NULL);
348 	}
349 
350 	if (n_words) {
351 		if (!in_break) {
352 			/* Count the last word */
353 			words += 1;
354 		}
355 		*n_words = words;
356 	}
357 
358 	return g_string_free (string, FALSE);
359 }
360 
361 // LCOV_EXCL_STOP
362 
363 /**
364  * tracker_text_validate_utf8:
365  * @text: the text to validate
366  * @text_len: length of @text, or -1 if NUL-terminated
367  * @str: the string where to place the validated UTF-8 characters, or %NULL if
368  *  not needed.
369  * @valid_len: Output number of valid UTF-8 bytes found, or %NULL if not needed
370  *
371  * This function iterates through @text checking for UTF-8 validity
372  * using g_utf8_validate(), appends the first chunk of valid characters
373  * to @str, and gives the number of valid UTF-8 bytes in @valid_len.
374  *
375  * Returns: %TRUE if some bytes were found to be valid, %FALSE otherwise.
376  *
377  * Since: 0.10
378  **/
379 gboolean
tracker_text_validate_utf8(const gchar * text,gssize text_len,GString ** str,gsize * valid_len)380 tracker_text_validate_utf8 (const gchar  *text,
381                             gssize        text_len,
382                             GString     **str,
383                             gsize        *valid_len)
384 {
385 	gsize len_to_validate;
386 
387 	g_return_val_if_fail (text, FALSE);
388 
389 	len_to_validate = text_len >= 0 ? text_len : strlen (text);
390 
391 	if (len_to_validate > 0) {
392 		const gchar *end = text;
393 
394 		/* Validate string, getting the pointer to first non-valid character
395 		 *  (if any) or to the end of the string. */
396 		g_utf8_validate (text, len_to_validate, &end);
397 		if (end > text) {
398 			/* If str output required... */
399 			if (str) {
400 				/* Create string to output if not already as input */
401 				*str = (*str == NULL ?
402 				        g_string_new_len (text, end - text) :
403 				        g_string_append_len (*str, text, end - text));
404 			}
405 
406 			/* If utf8 len output required... */
407 			if (valid_len) {
408 				*valid_len = end - text;
409 			}
410 
411 			return TRUE;
412 		}
413 	}
414 
415 	return FALSE;
416 }
417 
418 /**
419  * tracker_date_format_to_iso8601:
420  * @date_string: the date in a string pointer
421  * @format: the format of the @date_string
422  *
423  * This function uses strptime() to create a time tm structure using
424  * @date_string and @format.
425  *
426  * Returns: a newly-allocated string with the time represented in
427  * ISO8601 date format which should be freed with g_free() when
428  * finished with, otherwise %NULL.
429  *
430  * Since: 0.8
431  **/
432 gchar *
tracker_date_format_to_iso8601(const gchar * date_string,const gchar * format)433 tracker_date_format_to_iso8601 (const gchar *date_string,
434                                 const gchar *format)
435 {
436 	gchar *result;
437 	struct tm date_tm = { 0 };
438 
439 	g_return_val_if_fail (date_string != NULL, NULL);
440 	g_return_val_if_fail (format != NULL, NULL);
441 
442 	if (strptime (date_string, format, &date_tm) == 0) {
443 		return NULL;
444 	}
445 
446 	/* If the input format string doesn't parse timezone information with
447 	 * either %z or %Z, strptime() won't set the tm_gmtoff member in the
448 	 * broken-down time, and the value during initialization (0) will be
449 	 * left. This effectively means that every broken-down time obtained
450 	 * with strptime() without parsing timezone information will be based
451 	 * on UTC, instead of being treated as localtime. In order to fix this
452 	 * and set the correct value for the offset w.r.t gmt, we can just
453 	 * use mktime() to fill in the daylight saving flag as well as the
454 	 * gmt offset value. */
455 	if (!strstr (format, "%z") && !strstr (format, "%Z")) {
456 		/* tm_isdst not set by strptime(), we set -1 on it in order to ask
457 		 * mktime to 'normalize' its contents and fill in the gmt offset
458 		 * and daylight saving time information */
459 		date_tm.tm_isdst = -1;
460 
461 		/* Note: no real problem if mktime() fails. In this case, tm_isdst
462 		 * will be -1, and therefore strftime() will not write the timezone
463 		 * information, which is equally right to represent localtime. */
464 		mktime (&date_tm);
465 	}
466 
467 	result = g_malloc (sizeof (char) * 25);
468 	strftime (result, 25, DATE_FORMAT_ISO8601 , &date_tm);
469 	return result;
470 }
471 
472 static gboolean
is_int(const gchar * str)473 is_int (const gchar *str)
474 {
475 	gint i, len;
476 
477 	if (!str || str[0] == '\0') {
478 		return FALSE;
479 	}
480 
481 	len = strlen (str);
482 
483 	for (i = 0; i < len; i++) {
484 		if (!g_ascii_isdigit (str[i])) {
485 			return FALSE;
486 		}
487 	}
488 
489 	return TRUE ;
490 }
491 
492 static gint
parse_month(const gchar * month)493 parse_month (const gchar *month)
494 {
495 	gint i;
496 
497 	for (i = 0; i < 12; i++) {
498 		if (!strncmp (month, months[i], 3)) {
499 			return i;
500 		}
501 	}
502 
503 	return -1;
504 }
505 
506 /* Determine date format and convert to ISO 8601 format */
507 /* FIXME We should handle all the fractions here (see ISO 8601), as well as YYYY:DDD etc */
508 
509 /**
510  * tracker_date_guess:
511  * @date_string: the date in a string pointer
512  *
513  * This function uses a number of methods to try and guess the date
514  * held in @date_string. The @date_string must be at least 5
515  * characters in length or longer for any guessing to be attempted.
516  * Some of the string formats guessed include:
517  *
518  * <itemizedlist>
519  *  <listitem><para>"YYYY-MM-DD" (Simple format)</para></listitem>
520  *  <listitem><para>"20050315113224-08'00'" (PDF format)</para></listitem>
521  *  <listitem><para>"20050216111533Z" (PDF format)</para></listitem>
522  *  <listitem><para>"Mon Feb  9 10:10:00 2004" (Microsoft Office format)</para></listitem>
523  *  <listitem><para>"2005:04:29 14:56:54" (Exif format)</para></listitem>
524  *  <listitem><para>"YYYY-MM-DDThh:mm:ss.ff+zz:zz</para></listitem>
525  * </itemizedlist>
526  *
527  * Returns: a newly-allocated string with the time represented in
528  * ISO8601 date format which should be freed with g_free() when
529  * finished with, otherwise %NULL.
530  *
531  * Since: 0.8
532  **/
533 gchar *
tracker_date_guess(const gchar * date_string)534 tracker_date_guess (const gchar *date_string)
535 {
536 	gchar buf[30];
537 	gint  len;
538 	GError *error = NULL;
539 
540 	if (!date_string) {
541 		return NULL;
542 	}
543 
544 	len = strlen (date_string);
545 
546 	/* We cannot format a date without at least a four digit
547 	 * year.
548 	 */
549 	if (len < 4) {
550 		return NULL;
551 	}
552 
553 	/* Check for year only dates (EG ID3 music tags might have
554 	 * Audio.ReleaseDate as 4 digit year)
555 	 */
556 	if (len == 4) {
557 		if (is_int (date_string)) {
558 			buf[0] = date_string[0];
559 			buf[1] = date_string[1];
560 			buf[2] = date_string[2];
561 			buf[3] = date_string[3];
562 			buf[4] = '-';
563 			buf[5] = '0';
564 			buf[6] = '1';
565 			buf[7] = '-';
566 			buf[8] = '0';
567 			buf[9] = '1';
568 			buf[10] = 'T';
569 			buf[11] = '0';
570 			buf[12] = '0';
571 			buf[13] = ':';
572 			buf[14] = '0';
573 			buf[15] = '0';
574 			buf[16] = ':';
575 			buf[17] = '0';
576 			buf[18] = '0';
577 			buf[19] = 'Z';
578 			buf[20] = '\0';
579 
580 			tracker_string_to_date (buf, NULL, &error);
581 
582 			if (error != NULL) {
583 				g_error_free (error);
584 				return NULL;
585 			}
586 
587 			return g_strdup (buf);
588 		} else {
589 			return NULL;
590 		}
591 	} else if (len == 10)  {
592 		/* Check for date part only YYYY-MM-DD */
593 		buf[0] = date_string[0];
594 		buf[1] = date_string[1];
595 		buf[2] = date_string[2];
596 		buf[3] = date_string[3];
597 		buf[4] = '-';
598 		buf[5] = date_string[5];
599 		buf[6] = date_string[6];
600 		buf[7] = '-';
601 		buf[8] = date_string[8];
602 		buf[9] = date_string[9];
603 		buf[10] = 'T';
604 		buf[11] = '0';
605 		buf[12] = '0';
606 		buf[13] = ':';
607 		buf[14] = '0';
608 		buf[15] = '0';
609 		buf[16] = ':';
610 		buf[17] = '0';
611 		buf[18] = '0';
612 		buf[19] = '\0';
613 
614 		tracker_string_to_date (buf, NULL, &error);
615 
616 		if (error != NULL) {
617 			g_error_free (error);
618 			return NULL;
619 		}
620 
621 		return g_strdup (buf);
622 	} else if (len == 14) {
623 		/* Check for pdf format EG 20050315113224-08'00' or
624 		 * 20050216111533Z
625 		 */
626 		buf[0] = date_string[0];
627 		buf[1] = date_string[1];
628 		buf[2] = date_string[2];
629 		buf[3] = date_string[3];
630 		buf[4] = '-';
631 		buf[5] = date_string[4];
632 		buf[6] = date_string[5];
633 		buf[7] = '-';
634 		buf[8] = date_string[6];
635 		buf[9] = date_string[7];
636 		buf[10] = 'T';
637 		buf[11] = date_string[8];
638 		buf[12] = date_string[9];
639 		buf[13] = ':';
640 		buf[14] = date_string[10];
641 		buf[15] = date_string[11];
642 		buf[16] = ':';
643 		buf[17] = date_string[12];
644 		buf[18] = date_string[13];
645 		buf[19] = '\0';
646 
647 		tracker_string_to_date (buf, NULL, &error);
648 
649 		if (error != NULL) {
650 			g_error_free (error);
651 			return NULL;
652 		}
653 
654 		return g_strdup (buf);
655 	} else if (len == 15 && date_string[14] == 'Z') {
656 		buf[0] = date_string[0];
657 		buf[1] = date_string[1];
658 		buf[2] = date_string[2];
659 		buf[3] = date_string[3];
660 		buf[4] = '-';
661 		buf[5] = date_string[4];
662 		buf[6] = date_string[5];
663 		buf[7] = '-';
664 		buf[8] = date_string[6];
665 		buf[9] = date_string[7];
666 		buf[10] = 'T';
667 		buf[11] = date_string[8];
668 		buf[12] = date_string[9];
669 		buf[13] = ':';
670 		buf[14] = date_string[10];
671 		buf[15] = date_string[11];
672 		buf[16] = ':';
673 		buf[17] = date_string[12];
674 		buf[18] = date_string[13];
675 		buf[19] = 'Z';
676 		buf[20] = '\0';
677 
678 		tracker_string_to_date (buf, NULL, &error);
679 
680 		if (error != NULL) {
681 			g_error_free (error);
682 			return NULL;
683 		}
684 
685 		return g_strdup (buf);
686 	} else if (len == 21 && (date_string[14] == '-' || date_string[14] == '+' )) {
687 		buf[0] = date_string[0];
688 		buf[1] = date_string[1];
689 		buf[2] = date_string[2];
690 		buf[3] = date_string[3];
691 		buf[4] = '-';
692 		buf[5] = date_string[4];
693 		buf[6] = date_string[5];
694 		buf[7] = '-';
695 		buf[8] = date_string[6];
696 		buf[9] = date_string[7];
697 		buf[10] = 'T';
698 		buf[11] = date_string[8];
699 		buf[12] = date_string[9];
700 		buf[13] = ':';
701 		buf[14] = date_string[10];
702 		buf[15] = date_string[11];
703 		buf[16] = ':';
704 		buf[17] = date_string[12];
705 		buf[18] = date_string[13];
706 		buf[19] = date_string[14];
707 		buf[20] = date_string[15];
708 		buf[21] = date_string[16];
709 		buf[22] =  ':';
710 		buf[23] = date_string[18];
711 		buf[24] = date_string[19];
712 		buf[25] = '\0';
713 
714 		tracker_string_to_date (buf, NULL, &error);
715 
716 		if (error != NULL) {
717 			g_error_free (error);
718 			return NULL;
719 		}
720 
721 		return g_strdup (buf);
722 	} else if ((len == 24) && (date_string[3] == ' ')) {
723 		/* Check for msoffice date format "Mon Feb  9 10:10:00 2004" */
724 		gint  num_month;
725 		gchar mon1;
726 		gchar day1;
727 
728 		num_month = parse_month (date_string + 4);
729 
730 		if (num_month < 0) {
731 			return NULL;
732 		}
733 
734 		mon1 = imonths[num_month];
735 
736 		if (date_string[8] == ' ') {
737 			day1 = '0';
738 		} else {
739 			day1 = date_string[8];
740 		}
741 
742 		buf[0] = date_string[20];
743 		buf[1] = date_string[21];
744 		buf[2] = date_string[22];
745 		buf[3] = date_string[23];
746 		buf[4] = '-';
747 
748 		if (num_month < 10) {
749 			buf[5] = '0';
750 			buf[6] = mon1;
751 		} else {
752 			buf[5] = '1';
753 			buf[6] = mon1;
754 		}
755 
756 		buf[7] = '-';
757 		buf[8] = day1;
758 		buf[9] = date_string[9];
759 		buf[10] = 'T';
760 		buf[11] = date_string[11];
761 		buf[12] = date_string[12];
762 		buf[13] = ':';
763 		buf[14] = date_string[14];
764 		buf[15] = date_string[15];
765 		buf[16] = ':';
766 		buf[17] = date_string[17];
767 		buf[18] = date_string[18];
768 		buf[19] = '\0';
769 
770 		tracker_string_to_date (buf, NULL, &error);
771 
772 		if (error != NULL) {
773 			g_error_free (error);
774 			return NULL;
775 		}
776 
777 		return g_strdup (buf);
778 	} else if ((len == 19) && (date_string[4] == ':') && (date_string[7] == ':')) {
779 		/* Check for Exif date format "2005:04:29 14:56:54" */
780 		buf[0] = date_string[0];
781 		buf[1] = date_string[1];
782 		buf[2] = date_string[2];
783 		buf[3] = date_string[3];
784 		buf[4] = '-';
785 		buf[5] = date_string[5];
786 		buf[6] = date_string[6];
787 		buf[7] = '-';
788 		buf[8] = date_string[8];
789 		buf[9] = date_string[9];
790 		buf[10] = 'T';
791 		buf[11] = date_string[11];
792 		buf[12] = date_string[12];
793 		buf[13] = ':';
794 		buf[14] = date_string[14];
795 		buf[15] = date_string[15];
796 		buf[16] = ':';
797 		buf[17] = date_string[17];
798 		buf[18] = date_string[18];
799 		buf[19] = '\0';
800 
801 		tracker_string_to_date (buf, NULL, &error);
802 
803 		if (error != NULL) {
804 			g_error_free (error);
805 			return NULL;
806 		}
807 
808 		return g_strdup (buf);
809 	}
810 
811 	tracker_string_to_date (date_string, NULL, &error);
812 
813 	if (error != NULL) {
814 		g_error_free (error);
815 		return NULL;
816 	}
817 
818 	return g_strdup (date_string);
819 }
820 
821 #ifndef HAVE_GETLINE
822 
823 static gint
my_igetdelim(gchar ** linebuf,gsize * linebufsz,gint delimiter,FILE * file)824 my_igetdelim (gchar  **linebuf,
825               gsize   *linebufsz,
826               gint     delimiter,
827               FILE    *file)
828 {
829 	gint ch;
830 	gint idx;
831 
832 	if ((file == NULL || linebuf == NULL || *linebuf == NULL || *linebufsz == 0) &&
833 	    !(*linebuf == NULL && *linebufsz == 0)) {
834 		errno = EINVAL;
835 		return -1;
836 	}
837 
838 	if (*linebuf == NULL && *linebufsz == 0) {
839 		*linebuf = g_malloc (GROW_BY);
840 
841 		if (!*linebuf) {
842 			errno = ENOMEM;
843 			return -1;
844 		}
845 
846 		*linebufsz += GROW_BY;
847 	}
848 
849 	idx = 0;
850 
851 	while ((ch = fgetc (file)) != EOF) {
852 		/* Grow the line buffer as necessary */
853 		while (idx > *linebufsz - 2) {
854 			*linebuf = g_realloc (*linebuf, *linebufsz += GROW_BY);
855 
856 			if (!*linebuf) {
857 				errno = ENOMEM;
858 				return -1;
859 			}
860 		}
861 		(*linebuf)[idx++] = (gchar) ch;
862 
863 		if ((gchar) ch == delimiter) {
864 			break;
865 		}
866 	}
867 
868 	if (idx != 0) {
869 		(*linebuf)[idx] = 0;
870 	} else if ( ch == EOF ) {
871 		return -1;
872 	}
873 
874 	return idx;
875 }
876 
877 #endif /* HAVE_GETLINE */
878 
879 /**
880  * tracker_getline:
881  * @lineptr: Buffer to write into
882  * @n: Max bytes of linebuf
883  * @stream: Filestream to read from
884  *
885  * Reads an entire line from stream, storing the address of the buffer
886  * containing  the  text into *lineptr.  The buffer is null-terminated
887  * and includes the newline character, if one was found.
888  *
889  * Read GNU getline()'s manpage for more information
890  *
891  * Returns: the number of characters read, including the delimiter
892  * character, but not including the terminating %NULL byte. This value
893  * can be used to handle embedded %NULL bytes in the line read. Upon
894  * failure, -1 is returned.
895  *
896  * Since: 0.10
897  **/
898 gssize
tracker_getline(gchar ** lineptr,gsize * n,FILE * stream)899 tracker_getline (gchar **lineptr,
900                  gsize  *n,
901                  FILE *stream)
902 {
903 #ifndef HAVE_GETLINE
904 	return my_igetdelim (lineptr, n, '\n', stream);
905 #else  /* HAVE_GETLINE */
906 	return getline (lineptr, n, stream);
907 #endif /* HAVE_GETLINE */
908 }
909 
910 /**
911  * tracker_keywords_parse:
912  * @store: Array where to store the keywords
913  * @keywords: Keywords line to parse
914  *
915  * Parses a keywords line into store, avoiding duplicates and stripping leading
916  * and trailing spaces from keywords. Allowed delimiters are , and ;
917  *
918  * Since: 0.10
919  **/
920 void
tracker_keywords_parse(GPtrArray * store,const gchar * keywords)921 tracker_keywords_parse (GPtrArray   *store,
922                         const gchar *keywords)
923 {
924 	gchar *orig, *keywords_d;
925 	char *saveptr, *p;
926 	size_t len;
927 
928 	keywords_d = orig = g_strdup (keywords);
929 	p = keywords_d;
930 	keywords_d = strchr (keywords_d, '"');
931 
932 	if (keywords_d) {
933 		keywords_d++;
934 	} else {
935 		keywords_d = p;
936 	}
937 
938 	len = strlen (keywords_d);
939 	if (len > 0 && keywords_d[len - 1] == '"') {
940 		keywords_d[len - 1] = '\0';
941 	}
942 
943 	for (p = strtok_r (keywords_d, ",;", &saveptr); p;
944 	     p = strtok_r (NULL, ",;", &saveptr)) {
945 		guint i;
946 		gboolean found = FALSE;
947 		gchar *p_do = g_strdup (p);
948 		gchar *p_dup = p_do;
949 		guint len = strlen (p_dup);
950 
951 		if (*p_dup == ' ')
952 			p_dup++;
953 
954 		if (p_dup[len-1] == ' ')
955 			p_dup[len-1] = '\0';
956 
957 		/* ignore keywords containing invalid UTF-8 */
958 		if (!g_utf8_validate (p_dup, -1, NULL)) {
959 			g_free (p_do);
960 			continue;
961 		}
962 
963 		for (i = 0; i < store->len; i++) {
964 			const gchar *earlier = g_ptr_array_index (store, i);
965 			if (g_strcmp0 (earlier, p_dup) == 0) {
966 				found = TRUE;
967 				break;
968 			}
969 		}
970 
971 		if (!found) {
972 			g_ptr_array_add (store, g_strdup (p_dup));
973 		}
974 
975 		g_free (p_do);
976 	}
977 
978 	g_free (orig);
979 }
980