tracker-utils.c - OpenGrok cross reference for /dports/sysutils/tracker-miners/tracker-miners-2.3.5/src/libtracker-extract/tracker-utils.c

/*
 * Copyright (C) 2009, Nokia <ivan.frade@nokia.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA  02110-1301, USA.
 */

#include "config-miners.h"

#define _XOPEN_SOURCE
#define _XOPEN_SOURCE_EXTENDED 1	/* strptime is XPG4v2 */

#include <time.h>
#include <string.h>
#include <stdio.h>

#include <libtracker-miners-common/tracker-utils.h>
#include <libtracker-miners-common/tracker-date-time.h>

#include "tracker-utils.h"

#ifndef HAVE_GETLINE

#include <stddef.h>
#include <stdlib.h>
#include <limits.h>
#include <errno.h>

#undef getdelim
#undef getline

#define GROW_BY 80

#endif /* HAVE_GETLINE */

#define DATE_FORMAT_ISO8601 "%Y-%m-%dT%H:%M:%S%z"

/**
 * SECTION:tracker-utils
 * @title: Data utilities
 * @short_description: Functions for coalescing, merging, date
 * handling and normalizing
 * @stability: Stable
 * @include: libtracker-extract/tracker-extract.h
 *
 * This API is provided to facilitate common more general functions
 * which extractors may find useful. These functions are also used by
 * the in-house extractors quite frequently.
 **/

/* Three-letter English month abbreviations, January first. parse_month()
 * compares the first three characters of its input against these entries
 * and returns the index of the one that matches. */
static const char *months[] = {
	"Jan", "Feb", "Mar", "Apr", "May", "Jun",
	"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
};

/* Last decimal digit of the 1-based month number, indexed by the 0-based
 * month index that parse_month() returns ('1'..'9' for January..September,
 * then '0', '1', '2' for October..December). tracker_date_guess() writes
 * the leading digit of the month itself. */
static const char imonths[] = {
	'1', '2', '3', '4', '5',
	'6', '7', '8', '9', '0', '1', '2'
};


/**
 * tracker_coalesce_strip:
 * @n_values: the number of @... supplied
 * @...: the string pointers to coalesce
 *
 * Walks the @n_values string pointers given in @... in order and
 * picks the first one for which tracker_is_blank_string() is false.
 * The remaining arguments are not looked at once a match is found.
 *
 * The selected string is stripped in place with g_strstrip(), so it
 * is MOST important NOT to pass constant string pointers to this
 * function!
 *
 * Returns: the first string pointer from those provided which
 * matches, otherwise %NULL.
 *
 * Since: 0.10
 **/
const gchar *
tracker_coalesce_strip (gint n_values,
                        ...)
{
	const gchar *first_usable = NULL;
	va_list ap;
	gint remaining;

	va_start (ap, n_values);

	/* Stop as soon as a usable string has been picked */
	for (remaining = n_values; remaining > 0 && first_usable == NULL; remaining--) {
		gchar *candidate = va_arg (ap, gchar *);

		if (tracker_is_blank_string (candidate)) {
			continue;
		}

		/* g_strstrip() works in place and hands back its argument */
		first_usable = (const gchar *) g_strstrip (candidate);
	}

	va_end (ap);

	return first_usable;
}

// LCOV_EXCL_START

/**
 * tracker_coalesce:
 * @n_values: the number of @... supplied
 * @...: the string pointers to coalesce
 *
 * Walks the @n_values string pointers given in @... in order and
 * keeps the first one for which tracker_is_blank_string() is false.
 *
 * The kept string is stripped in place with g_strstrip(). Every other
 * argument, whether it comes before or after the kept one, is released
 * with g_free(). It is MOST important NOT to pass constant string
 * pointers to this function!
 *
 * Returns: the first string pointer from those provided which
 * matches, otherwise %NULL.
 *
 * Since: 0.8
 *
 * Deprecated: 0.10: Use tracker_coalesce_strip() instead.
 *
 **/
gchar *
tracker_coalesce (gint n_values,
                  ...)
{
	gchar *winner = NULL;
	va_list ap;
	gint idx;

	va_start (ap, n_values);

	for (idx = 0; idx < n_values; idx++) {
		gchar *candidate = va_arg (ap, gchar *);

		/* Anything that is not the chosen string gets freed,
		 * including later non-blank strings.
		 */
		if (winner != NULL || tracker_is_blank_string (candidate)) {
			g_free (candidate);
			continue;
		}

		winner = g_strstrip (candidate);
	}

	va_end (ap);

	return winner;
}
// LCOV_EXCL_STOP

/**
 * tracker_merge_const:
 * @delimiter: the delimiter to use when merging
 * @n_values: the number of @... supplied
 * @...: the string pointers to merge
 *
 * Concatenates, in order, every non-%NULL string pointer passed in
 * @... into a newly allocated string. %NULL arguments are skipped and
 * the arguments themselves are left untouched.
 *
 * The @delimiter can be %NULL. If specified, it is inserted before
 * every merged string except the first one.
 *
 * Returns: a newly-allocated string holding the result which should
 * be freed with g_free() when finished with, or %NULL when no
 * non-%NULL string was supplied.
 *
 * Since: 0.10
 **/
gchar *
tracker_merge_const (const gchar *delimiter,
                     gint         n_values,
                     ...)
{
	GString *merged = NULL;
	va_list ap;
	gint idx;

	va_start (ap, n_values);

	for (idx = 0; idx < n_values; idx++) {
		gchar *piece = va_arg (ap, gchar *);

		if (piece == NULL) {
			continue;
		}

		/* The first piece creates the buffer, no delimiter needed */
		if (merged == NULL) {
			merged = g_string_new (piece);
			continue;
		}

		if (delimiter != NULL) {
			g_string_append (merged, delimiter);
		}

		g_string_append (merged, piece);
	}

	va_end (ap);

	return merged != NULL ? g_string_free (merged, FALSE) : NULL;
}

// LCOV_EXCL_START

/**
 * tracker_merge:
 * @delimiter: the delimiter to use when merging
 * @n_values: the number of @... supplied
 * @...: the string pointers to merge
 *
 * Concatenates, in order, every non-%NULL string pointer passed in
 * @... into a newly allocated string. Each non-%NULL argument is
 * released with g_free() after it has been copied, so do not pass
 * constant strings.
 *
 * The @delimiter can be %NULL. If specified, it is inserted before
 * every merged string except the first one.
 *
 * Returns: a newly-allocated string holding the result which should
 * be freed with g_free() when finished with, or %NULL when no
 * non-%NULL string was supplied.
 *
 * Since: 0.8
 *
 * Deprecated: 0.10: Use tracker_merge_const() instead.
 **/
gchar *
tracker_merge (const gchar *delimiter,
               gint         n_values,
               ...)
{
	GString *merged = NULL;
	va_list ap;
	gint idx;

	va_start (ap, n_values);

	for (idx = 0; idx < n_values; idx++) {
		gchar *piece = va_arg (ap, gchar *);

		if (piece == NULL) {
			continue;
		}

		if (merged != NULL) {
			if (delimiter != NULL) {
				g_string_append (merged, delimiter);
			}

			g_string_append (merged, piece);
		} else {
			/* The first piece creates the buffer */
			merged = g_string_new (piece);
		}

		/* The content has been copied, the argument is ours to free */
		g_free (piece);
	}

	va_end (ap);

	return merged != NULL ? g_string_free (merged, FALSE) : NULL;
}

/**
 * tracker_text_normalize:
 * @text: the text to normalize
 * @max_words: the word limit used to stop the scan (see below)
 * @n_words: the number of words actually normalized
 *
 * This function iterates through @text decoding one character at a
 * time with g_utf8_get_char_validated(). For each character found,
 * the %GUnicodeType is checked to make sure it is one of the following
 * values:
 * <itemizedlist>
 *  <listitem><para>%G_UNICODE_LOWERCASE_LETTER</para></listitem>
 *  <listitem><para>%G_UNICODE_MODIFIER_LETTER</para></listitem>
 *  <listitem><para>%G_UNICODE_OTHER_LETTER</para></listitem>
 *  <listitem><para>%G_UNICODE_TITLECASE_LETTER</para></listitem>
 *  <listitem><para>%G_UNICODE_UPPERCASE_LETTER</para></listitem>
 * </itemizedlist>
 *
 * All other symbols, punctuation, marks, numbers and separators are
 * stripped. A regular space (i.e. " ") is appended after each word
 * that is followed by such a character.
 *
 * The scan stops once the number of completed words exceeds
 * @max_words, which means up to @max_words + 1 words can end up in
 * the result.
 *
 * The @n_words can be %NULL. If specified, it will be populated with
 * the number of words that were normalized in the result. It is left
 * untouched when @text is %NULL.
 *
 * Returns: a newly-allocated string holding the result which should
 * be freed with g_free() when finished with, or %NULL if @text is
 * %NULL.
 *
 * Since: 0.8
 *
 * Deprecated: 0.10: Use tracker_text_validate_utf8() instead.
 **/
gchar *
tracker_text_normalize (const gchar *text,
                        guint        max_words,
                        guint       *n_words)
{
	GString *string;
	gboolean in_break = TRUE;
	gunichar ch;
	/* Unsigned like @max_words and @n_words, so the comparison
	 * below does not mix signed and unsigned operands.
	 */
	guint words = 0;

	/* g_utf8_get_char_validated() would dereference a NULL pointer */
	g_return_val_if_fail (text != NULL, NULL);

	string = g_string_new (NULL);

	/* NOTE(review): gunichar is unsigned, so the (gunichar) -1 / -2
	 * error values of g_utf8_get_char_validated() also satisfy "> 0".
	 * Invalid sequences therefore seem to be handled as word breaks
	 * instead of ending the loop; only the NUL terminator ends it.
	 * Confirm this is the intended behaviour.
	 */
	while ((ch = g_utf8_get_char_validated (text, -1)) > 0) {
		GUnicodeType type;

		type = g_unichar_type (ch);

		if (type == G_UNICODE_LOWERCASE_LETTER ||
		    type == G_UNICODE_MODIFIER_LETTER ||
		    type == G_UNICODE_OTHER_LETTER ||
		    type == G_UNICODE_TITLECASE_LETTER ||
		    type == G_UNICODE_UPPERCASE_LETTER) {
			/* Append regular chars */
			g_string_append_unichar (string, ch);
			in_break = FALSE;
		} else if (!in_break) {
			/* Non-regular char found, treat as word break */
			g_string_append_c (string, ' ');
			in_break = TRUE;
			words++;

			if (words > max_words) {
				break;
			}
		}

		text = g_utf8_find_next_char (text, NULL);
	}

	if (n_words) {
		if (!in_break) {
			/* Count the last word, it had no trailing break */
			words += 1;
		}
		*n_words = words;
	}

	return g_string_free (string, FALSE);
}

// LCOV_EXCL_STOP

/**
 * tracker_text_validate_utf8:
 * @text: the text to validate
 * @text_len: length of @text, or -1 if NUL-terminated
 * @str: the string where to place the validated UTF-8 characters, or %NULL if
 *  not needed.
 * @valid_len: Output number of valid UTF-8 bytes found, or %NULL if not needed
 *
 * Runs g_utf8_validate() over @text and looks at how far the valid
 * prefix extends. When that prefix is not empty it is appended to
 * *@str (a new #GString is created if *@str is %NULL) and its size in
 * bytes is stored in @valid_len.
 *
 * When no valid byte is found neither @str nor @valid_len is written.
 *
 * Returns: %TRUE if some bytes were found to be valid, %FALSE otherwise.
 *
 * Since: 0.10
 **/
gboolean
tracker_text_validate_utf8 (const gchar  *text,
                            gssize        text_len,
                            GString     **str,
                            gsize        *valid_len)
{
	const gchar *valid_end;
	gsize n_bytes;
	gsize n_valid;

	g_return_val_if_fail (text, FALSE);

	if (text_len < 0) {
		n_bytes = strlen (text);
	} else {
		n_bytes = text_len;
	}

	if (n_bytes == 0) {
		return FALSE;
	}

	/* valid_end is moved to the first invalid byte, or to the end of
	 * the checked range when everything is valid.
	 */
	valid_end = text;
	g_utf8_validate (text, n_bytes, &valid_end);

	if (valid_end <= text) {
		return FALSE;
	}

	n_valid = valid_end - text;

	if (str != NULL) {
		if (*str == NULL) {
			/* No string supplied by the caller, create it */
			*str = g_string_new_len (text, n_valid);
		} else {
			*str = g_string_append_len (*str, text, n_valid);
		}
	}

	if (valid_len != NULL) {
		*valid_len = n_valid;
	}

	return TRUE;
}

/**
 * tracker_date_format_to_iso8601:
 * @date_string: the date in a string pointer
 * @format: the format of the @date_string
 *
 * This function uses strptime() to create a time tm structure using
 * @date_string and @format, and then formats it with strftime().
 *
 * Returns: a newly-allocated string with the time represented in
 * ISO8601 date format which should be freed with g_free() when
 * finished with, otherwise %NULL. %NULL is returned when
 * @date_string cannot be parsed with @format or when the formatted
 * date does not fit the ISO8601 representation used here.
 *
 * Since: 0.8
 **/
gchar *
tracker_date_format_to_iso8601 (const gchar *date_string,
                                const gchar *format)
{
	/* "YYYY-MM-DDThh:mm:ss+zzzz" is 24 characters plus the NUL */
	gchar buf[25];
	struct tm date_tm = { 0 };

	g_return_val_if_fail (date_string != NULL, NULL);
	g_return_val_if_fail (format != NULL, NULL);

	if (strptime (date_string, format, &date_tm) == NULL) {
		return NULL;
	}

	/* If the input format string doesn't parse timezone information with
	 * either %z or %Z, strptime() won't set the tm_gmtoff member in the
	 * broken-down time, and the value during initialization (0) will be
	 * left. This effectively means that every broken-down time obtained
	 * with strptime() without parsing timezone information will be based
	 * on UTC, instead of being treated as localtime. In order to fix this
	 * and set the correct value for the offset w.r.t gmt, we can just
	 * use mktime() to fill in the daylight saving flag as well as the
	 * gmt offset value. */
	if (!strstr (format, "%z") && !strstr (format, "%Z")) {
		/* tm_isdst not set by strptime(), we set -1 on it in order to ask
		 * mktime to 'normalize' its contents and fill in the gmt offset
		 * and daylight saving time information */
		date_tm.tm_isdst = -1;

		/* Note: no real problem if mktime() fails. In this case, tm_isdst
		 * will be -1, and therefore strftime() will not write the timezone
		 * information, which is equally right to represent localtime. */
		mktime (&date_tm);
	}

	/* strftime() returns 0 when the result does not fit, and the buffer
	 * contents are then unspecified: never hand that to the caller. */
	if (strftime (buf, sizeof (buf), DATE_FORMAT_ISO8601, &date_tm) == 0) {
		return NULL;
	}

	return g_strdup (buf);
}

/* Returns TRUE when @str is a non-empty string made only of ASCII
 * digits, FALSE otherwise (including for NULL and ""). */
static gboolean
is_int (const gchar *str)
{
	const gchar *cursor;

	if (str == NULL || *str == '\0') {
		return FALSE;
	}

	/* Walk up to the terminator, bailing out on the first non-digit */
	for (cursor = str; *cursor != '\0'; cursor++) {
		if (!g_ascii_isdigit (*cursor)) {
			return FALSE;
		}
	}

	return TRUE;
}

/* Matches the first three characters of @month against the entries of
 * months[] and returns the 0-based index of the match (0 for "Jan"),
 * or -1 when none matches. The comparison is case sensitive. */
static gint
parse_month (const gchar *month)
{
	gint idx = 0;

	while (idx < 12) {
		if (strncmp (month, months[idx], 3) == 0) {
			return idx;
		}

		idx++;
	}

	return -1;
}

/* Determine date format and convert to ISO 8601 format */
/* FIXME We should handle all the fractions here (see ISO 8601), as well as YYYY:DDD etc */

/**
 * tracker_date_guess:
 * @date_string: the date in a string pointer
 *
 * This function uses a number of methods to try and guess the date
 * held in @date_string. The @date_string must be at least 5
 * characters in length or longer for any guessing to be attempted.
 * Some of the string formats guessed include:
 *
 * <itemizedlist>
 *  <listitem><para>"YYYY-MM-DD" (Simple format)</para></listitem>
 *  <listitem><para>"20050315113224-08'00'" (PDF format)</para></listitem>
 *  <listitem><para>"20050216111533Z" (PDF format)</para></listitem>
 *  <listitem><para>"Mon Feb  9 10:10:00 2004" (Microsoft Office format)</para></listitem>
 *  <listitem><para>"2005:04:29 14:56:54" (Exif format)</para></listitem>
 *  <listitem><para>"YYYY-MM-DDThh:mm:ss.ff+zz:zz</para></listitem>
 * </itemizedlist>
 *
 * Returns: a newly-allocated string with the time represented in
 * ISO8601 date format which should be freed with g_free() when
 * finished with, otherwise %NULL.
 *
 * Since: 0.8
 **/
gchar *
tracker_date_guess (const gchar *date_string)
{
	gchar buf[30];
	gint  len;
	GError *error = NULL;

	if (!date_string) {
		return NULL;
	}

	len = strlen (date_string);

	/* We cannot format a date without at least a four digit
	 * year.
	 */
	if (len < 4) {
		return NULL;
	}

	/* Check for year only dates (EG ID3 music tags might have
	 * Audio.ReleaseDate as 4 digit year)
	 */
	if (len == 4) {
		if (is_int (date_string)) {
			buf[0] = date_string[0];
			buf[1] = date_string[1];
			buf[2] = date_string[2];
			buf[3] = date_string[3];
			buf[4] = '-';
			buf[5] = '0';
			buf[6] = '1';
			buf[7] = '-';
			buf[8] = '0';
			buf[9] = '1';
			buf[10] = 'T';
			buf[11] = '0';
			buf[12] = '0';
			buf[13] = ':';
			buf[14] = '0';
			buf[15] = '0';
			buf[16] = ':';
			buf[17] = '0';
			buf[18] = '0';
			buf[19] = 'Z';
			buf[20] = '\0';

			tracker_string_to_date (buf, NULL, &error);

			if (error != NULL) {
				g_error_free (error);
				return NULL;
			}

			return g_strdup (buf);
		} else {
			return NULL;
		}
	} else if (len == 10)  {
		/* Check for date part only YYYY-MM-DD */
		buf[0] = date_string[0];
		buf[1] = date_string[1];
		buf[2] = date_string[2];
		buf[3] = date_string[3];
		buf[4] = '-';
		buf[5] = date_string[5];
		buf[6] = date_string[6];
		buf[7] = '-';
		buf[8] = date_string[8];
		buf[9] = date_string[9];
		buf[10] = 'T';
		buf[11] = '0';
		buf[12] = '0';
		buf[13] = ':';
		buf[14] = '0';
		buf[15] = '0';
		buf[16] = ':';
		buf[17] = '0';
		buf[18] = '0';
		buf[19] = '\0';

		tracker_string_to_date (buf, NULL, &error);

		if (error != NULL) {
			g_error_free (error);
			return NULL;
		}

		return g_strdup (buf);
	} else if (len == 14) {
		/* Check for pdf format EG 20050315113224-08'00' or
		 * 20050216111533Z
		 */
		buf[0] = date_string[0];
		buf[1] = date_string[1];
		buf[2] = date_string[2];
		buf[3] = date_string[3];
		buf[4] = '-';
		buf[5] = date_string[4];
		buf[6] = date_string[5];
		buf[7] = '-';
		buf[8] = date_string[6];
		buf[9] = date_string[7];
		buf[10] = 'T';
		buf[11] = date_string[8];
		buf[12] = date_string[9];
		buf[13] = ':';
		buf[14] = date_string[10];
		buf[15] = date_string[11];
		buf[16] = ':';
		buf[17] = date_string[12];
		buf[18] = date_string[13];
		buf[19] = '\0';

		tracker_string_to_date (buf, NULL, &error);

		if (error != NULL) {
			g_error_free (error);
			return NULL;
		}

		return g_strdup (buf);
	} else if (len == 15 && date_string[14] == 'Z') {
		buf[0] = date_string[0];
		buf[1] = date_string[1];
		buf[2] = date_string[2];
		buf[3] = date_string[3];
		buf[4] = '-';
		buf[5] = date_string[4];
		buf[6] = date_string[5];
		buf[7] = '-';
		buf[8] = date_string[6];
		buf[9] = date_string[7];
		buf[10] = 'T';
		buf[11] = date_string[8];
		buf[12] = date_string[9];
		buf[13] = ':';
		buf[14] = date_string[10];
		buf[15] = date_string[11];
		buf[16] = ':';
		buf[17] = date_string[12];
		buf[18] = date_string[13];
		buf[19] = 'Z';
		buf[20] = '\0';

		tracker_string_to_date (buf, NULL, &error);

		if (error != NULL) {
			g_error_free (error);
			return NULL;
		}

		return g_strdup (buf);
	} else if (len == 21 && (date_string[14] == '-' || date_string[14] == '+' )) {
		buf[0] = date_string[0];
		buf[1] = date_string[1];
		buf[2] = date_string[2];
		buf[3] = date_string[3];
		buf[4] = '-';
		buf[5] = date_string[4];
		buf[6] = date_string[5];
		buf[7] = '-';
		buf[8] = date_string[6];
		buf[9] = date_string[7];
		buf[10] = 'T';
		buf[11] = date_string[8];
		buf[12] = date_string[9];
		buf[13] = ':';
		buf[14] = date_string[10];
		buf[15] = date_string[11];
		buf[16] = ':';
		buf[17] = date_string[12];
		buf[18] = date_string[13];
		buf[19] = date_string[14];
		buf[20] = date_string[15];
		buf[21] = date_string[16];
		buf[22] =  ':';
		buf[23] = date_string[18];
		buf[24] = date_string[19];
		buf[25] = '\0';

		tracker_string_to_date (buf, NULL, &error);

		if (error != NULL) {
			g_error_free (error);
			return NULL;
		}

		return g_strdup (buf);
	} else if ((len == 24) && (date_string[3] == ' ')) {
		/* Check for msoffice date format "Mon Feb  9 10:10:00 2004" */
		gint  num_month;
		gchar mon1;
		gchar day1;

		num_month = parse_month (date_string + 4);

		if (num_month < 0) {
			return NULL;
		}

		mon1 = imonths[num_month];

		if (date_string[8] == ' ') {
			day1 = '0';
		} else {
			day1 = date_string[8];
		}

		buf[0] = date_string[20];
		buf[1] = date_string[21];
		buf[2] = date_string[22];
		buf[3] = date_string[23];
		buf[4] = '-';

		if (num_month < 10) {
			buf[5] = '0';
			buf[6] = mon1;
		} else {
			buf[5] = '1';
			buf[6] = mon1;
		}

		buf[7] = '-';
		buf[8] = day1;
		buf[9] = date_string[9];
		buf[10] = 'T';
		buf[11] = date_string[11];
		buf[12] = date_string[12];
		buf[13] = ':';
		buf[14] = date_string[14];
		buf[15] = date_string[15];
		buf[16] = ':';
		buf[17] = date_string[17];
		buf[18] = date_string[18];
		buf[19] = '\0';

		tracker_string_to_date (buf, NULL, &error);

		if (error != NULL) {
			g_error_free (error);
			return NULL;
		}

		return g_strdup (buf);
	} else if ((len == 19) && (date_string[4] == ':') && (date_string[7] == ':')) {
		/* Check for Exif date format "2005:04:29 14:56:54" */
		buf[0] = date_string[0];
		buf[1] = date_string[1];
		buf[2] = date_string[2];
		buf[3] = date_string[3];
		buf[4] = '-';
		buf[5] = date_string[5];
		buf[6] = date_string[6];
		buf[7] = '-';
		buf[8] = date_string[8];
		buf[9] = date_string[9];
		buf[10] = 'T';
		buf[11] = date_string[11];
		buf[12] = date_string[12];
		buf[13] = ':';
		buf[14] = date_string[14];
		buf[15] = date_string[15];
		buf[16] = ':';
		buf[17] = date_string[17];
		buf[18] = date_string[18];
		buf[19] = '\0';

		tracker_string_to_date (buf, NULL, &error);

		if (error != NULL) {
			g_error_free (error);
			return NULL;
		}

		return g_strdup (buf);
	}

	tracker_string_to_date (date_string, NULL, &error);

	if (error != NULL) {
		g_error_free (error);
		return NULL;
	}

	return g_strdup (date_string);
}

#ifndef HAVE_GETLINE

/* Fallback used when getline() is not available.
 *
 * Reads bytes from @file up to and including the first occurrence of
 * @delimiter (or until EOF) and stores them, NUL-terminated, in
 * *@linebuf. *@linebuf and *@linebufsz must either be NULL and 0, in
 * which case a buffer is allocated, or describe an existing buffer
 * allocated with the GLib allocator; the buffer is enlarged in GROW_BY
 * steps as needed and *@linebufsz is updated accordingly.
 *
 * Returns the number of bytes stored, not counting the terminating
 * NUL, or -1 if nothing could be read before EOF. -1 is also returned,
 * with errno set to EINVAL, when an argument is missing or when only
 * one of *@linebuf and *@linebufsz is set.
 */
static gint
my_igetdelim (gchar  **linebuf,
              gsize   *linebufsz,
              gint     delimiter,
              FILE    *file)
{
	gint ch;
	gint idx;

	/* Check the pointers themselves before dereferencing any of them */
	if (file == NULL || linebuf == NULL || linebufsz == NULL) {
		errno = EINVAL;
		return -1;
	}

	/* A buffer without a size, or a size without a buffer, is invalid */
	if ((*linebuf == NULL) != (*linebufsz == 0)) {
		errno = EINVAL;
		return -1;
	}

	if (*linebuf == NULL) {
		/* g_malloc() aborts on failure, no NULL check needed */
		*linebuf = g_malloc (GROW_BY);
		*linebufsz = GROW_BY;
	}

	idx = 0;

	while ((ch = fgetc (file)) != EOF) {
		/* Grow the line buffer as necessary: there must be room
		 * for this byte and for the terminating NUL. The test is
		 * written as an addition because *linebufsz is unsigned
		 * and "*linebufsz - 2" would wrap around for tiny buffers.
		 */
		while ((gsize) idx + 2 > *linebufsz) {
			*linebufsz += GROW_BY;
			*linebuf = g_realloc (*linebuf, *linebufsz);
		}

		(*linebuf)[idx++] = (gchar) ch;

		/* Compare as bytes so that delimiters above 0x7f match
		 * regardless of the signedness of char.
		 */
		if ((guchar) ch == (guchar) delimiter) {
			break;
		}
	}

	if (idx == 0) {
		/* Nothing stored: the very first read hit EOF or failed */
		return -1;
	}

	(*linebuf)[idx] = 0;

	return idx;
}

#endif /* HAVE_GETLINE */

/**
 * tracker_getline:
 * @lineptr: Address of the pointer to the line buffer
 * @n: Address of the size of the buffer pointed to by *@lineptr
 * @stream: Filestream to read from
 *
 * Reads an entire line from @stream, storing the address of the buffer
 * containing the text into *@lineptr. The buffer is NUL-terminated
 * and includes the newline character, if one was found.
 *
 * The work is delegated to getline() when it is available at build
 * time (HAVE_GETLINE), and to an internal replacement reading up to
 * the next '\n' otherwise.
 *
 * Read GNU getline()'s manpage for more information
 *
 * Returns: the number of characters read, including the delimiter
 * character, but not including the terminating NUL byte. This value
 * can be used to handle embedded NUL bytes in the line read. Upon
 * failure, -1 is returned.
 *
 * Since: 0.10
 **/
gssize
tracker_getline (gchar **lineptr,
                 gsize  *n,
                 FILE *stream)
{
	gssize n_read;

#ifdef HAVE_GETLINE
	n_read = getline (lineptr, n, stream);
#else  /* !HAVE_GETLINE */
	n_read = my_igetdelim (lineptr, n, '\n', stream);
#endif /* HAVE_GETLINE */

	return n_read;
}

/**
 * tracker_keywords_parse:
 * @store: Array where to store the keywords
 * @keywords: Keywords line to parse
 *
 * Parses a keywords line into store, avoiding duplicates and stripping leading
 * and trailing whitespace from keywords. Allowed delimiters are , and ;
 *
 * If @keywords contains a double quote, everything up to and including
 * the first one is ignored, and a double quote ending the line is
 * dropped. Keywords that are empty once stripped, or that are not
 * valid UTF-8, are skipped. Each keyword added to @store is a newly
 * allocated copy.
 *
 * Since: 0.10
 **/
void
tracker_keywords_parse (GPtrArray   *store,
                        const gchar *keywords)
{
	gchar *orig, *keywords_d, *quote;
	char *saveptr, *p;
	size_t len;

	g_return_if_fail (store != NULL);
	g_return_if_fail (keywords != NULL);

	/* Work on a private copy, it is modified while tokenizing */
	orig = g_strdup (keywords);

	/* Start right after the first double quote, if there is one */
	quote = strchr (orig, '"');
	keywords_d = quote ? quote + 1 : orig;

	len = strlen (keywords_d);
	if (len > 0 && keywords_d[len - 1] == '"') {
		keywords_d[len - 1] = '\0';
	}

	for (p = strtok_r (keywords_d, ",;", &saveptr); p;
	     p = strtok_r (NULL, ",;", &saveptr)) {
		guint i;
		gboolean found = FALSE;
		gchar *keyword;

		/* The token lives in our private copy and is already
		 * NUL-terminated by strtok_r(), so it can be stripped in
		 * place without disturbing the following tokens.
		 */
		keyword = g_strstrip (p);

		/* nothing left, e.g. the token was only whitespace */
		if (*keyword == '\0') {
			continue;
		}

		/* ignore keywords containing invalid UTF-8 */
		if (!g_utf8_validate (keyword, -1, NULL)) {
			continue;
		}

		for (i = 0; i < store->len; i++) {
			const gchar *earlier = g_ptr_array_index (store, i);

			if (g_strcmp0 (earlier, keyword) == 0) {
				found = TRUE;
				break;
			}
		}

		if (!found) {
			g_ptr_array_add (store, g_strdup (keyword));
		}
	}

	g_free (orig);
}