1 /* EasyTAG - tag editor for audio files
2  * Copyright (C) 2014 David King <amigadave@amigadave.com>
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms of the GNU General Public License as published by the Free
6  * Software Foundation; either version 2 of the License, or (at your option)
7  * any later version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; if not, write to the Free Software Foundation, Inc., 51
16  * Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18 
19 #include "scan.h"
20 
21 #include <string.h>
22 
23 /*
24  * Function to replace underscore '_' by a space. No need to reallocate.
25  */
26 void
Scan_Convert_Underscore_Into_Space(gchar * string)27 Scan_Convert_Underscore_Into_Space (gchar *string)
28 {
29     gchar *tmp = string;
30 
31     while ((tmp = strchr (tmp, '_')) != NULL)
32     {
33         *tmp = ' ';
34     }
35 }
36 
37 /*
38  * Function to replace %20 by a space. No need to reallocate.
39  */
40 void
Scan_Convert_P20_Into_Space(gchar * string)41 Scan_Convert_P20_Into_Space (gchar *string)
42 {
43     gchar *tmp, *tmp1;
44 
45     while ((tmp = strstr (string, "%20")) != NULL)
46     {
47         tmp1 = tmp + 3;
48         *(tmp++) = ' ';
49         while (*tmp1)
50             *(tmp++) = *(tmp1++);
51         *tmp = '\0';
52     }
53 }
54 
55 /*
56  * Function to replace space by '_'. No need to reallocate.
57  */
58 void
Scan_Convert_Space_Into_Underscore(gchar * string)59 Scan_Convert_Space_Into_Underscore (gchar *string)
60 {
61     gchar *tmp = string;
62 
63     while ((tmp = strchr (tmp, ' ')) != NULL)
64     {
65         *tmp = '_';
66     }
67 }
68 
69 void
Scan_Process_Fields_Remove_Space(gchar * string)70 Scan_Process_Fields_Remove_Space (gchar *string)
71 {
72     gchar *tmp, *tmp1;
73 
74     tmp = tmp1 = string;
75 
76     while (*tmp)
77     {
78         while (*tmp == ' ')
79             tmp++;
80         if (*tmp)
81             *(tmp1++) = *(tmp++);
82     }
83     *tmp1 = '\0';
84 }
85 
86 /*
87  * Scan_Process_Fields_Insert_Space:
88  * @string: Input string
89  *
90  * This function will insert space before every uppercase character.
91  *
92  * Returns: A newly allocated string.
93  */
94 gchar *
Scan_Process_Fields_Insert_Space(const gchar * string)95 Scan_Process_Fields_Insert_Space (const gchar *string)
96 {
97     gchar *iter;
98     gunichar c;
99     GString *string1;
100 
101     string1 = g_string_new ("");
102     g_string_append_c (string1, *string);
103 
104     for (iter = g_utf8_next_char (string); *iter; iter = g_utf8_next_char (iter))
105     {
106         c = g_utf8_get_char (iter);
107 
108         if (g_unichar_isupper (c))
109         {
110             g_string_append_c (string1, ' ');
111         }
112 
113         g_string_append_unichar (string1, c);
114     }
115 
116     return g_string_free (string1, FALSE);
117 }
118 
119 /*
120  * The function removes the duplicated spaces. No need to reallocate.
121  */
122 void
Scan_Process_Fields_Keep_One_Space(gchar * string)123 Scan_Process_Fields_Keep_One_Space (gchar *string)
124 {
125     gchar *tmp, *tmp1;
126 
127     tmp = tmp1 = string;
128 
129     // Remove multiple consecutive underscores and spaces.
130     while (*tmp1)
131     {
132         while (*tmp1 && *tmp1 != ' ' && *tmp1 != '_')
133             *(tmp++) = *(tmp1++);
134         if (!*tmp1)
135             break;
136         *(tmp++) = *(tmp1++);
137         while (*tmp1 == ' ' || *tmp1 == '_')
138             tmp1++;
139     }
140     *tmp = '\0';
141 }
142 
143 /*
144  * Function to remove spaces
145  * No need to reallocate
146  */
147 void
Scan_Remove_Spaces(gchar * string)148 Scan_Remove_Spaces (gchar *string)
149 {
150   int nextnotspace = 0, pos = 0;
151 
152   while(string[pos] != '\0')
153   {
154     if(string[pos] == ' ')
155     {
156       nextnotspace = pos;
157       while(string[++nextnotspace] == ' ');
158       string[pos] = string[nextnotspace];
159       string[nextnotspace] = ' ';
160       continue;
161     }
162     pos++;
163   }
164 }
165 
166 /* Returns a newly-allocated string. */
167 gchar *
Scan_Process_Fields_All_Uppercase(const gchar * string)168 Scan_Process_Fields_All_Uppercase (const gchar *string)
169 {
170     return g_utf8_strup (string, -1);
171 }
172 
173 /* Returns a newly-allocated string. */
174 gchar *
Scan_Process_Fields_All_Downcase(const gchar * string)175 Scan_Process_Fields_All_Downcase (const gchar *string)
176 {
177     return g_utf8_strdown (string, -1);
178 }
179 
180 /* Returns a newly-allocated string. */
181 gchar *
Scan_Process_Fields_Letter_Uppercase(const gchar * string)182 Scan_Process_Fields_Letter_Uppercase (const gchar *string)
183 {
184     const gchar *temp;
185     gchar temp2[6];
186     gboolean set_to_upper_case = TRUE;
187     gunichar c;
188     GString *string1;
189 
190     string1 = g_string_new ("");
191 
192     for (temp = string; *temp; temp = g_utf8_next_char (temp))
193     {
194         gchar *temp3;
195         int l;
196 
197         c = g_utf8_get_char (temp);
198         l = g_unichar_to_utf8 (c, temp2);
199 
200         if (set_to_upper_case && g_unichar_islower(c))
201         {
202             temp3 = g_utf8_strup (temp2, l);
203             g_string_append (string1, temp3);
204             g_free (temp3);
205         }
206         else if (!set_to_upper_case && g_unichar_isupper(c))
207         {
208             temp3 = g_utf8_strdown (temp2, l);
209             g_string_append (string1, temp3);
210             g_free (temp3);
211         }
212         else
213         {
214             g_string_append_len (string1, temp2, l);
215         }
216 
217         /* Uppercase the word 'I' in english */
218         if (!set_to_upper_case &&
219             (*(temp - 1) == ' ' || *(temp - 1) == '_') &&
220             (*temp == 'i' || *temp == 'I') &&
221             (*(temp + 1) == ' ' || *(temp + 1) == '_'))
222         {
223             string1->str [string1->len - 1] = 'I';
224         }
225 
226         /* After the first time, all will be lower case. */
227         set_to_upper_case = FALSE;
228     }
229 
230     return g_string_free (string1, FALSE);
231 }
232 
233 static gint
Scan_Word_Is_Roman_Numeral(const gchar * text)234 Scan_Word_Is_Roman_Numeral (const gchar *text)
235 {
236     /* No need for caseless strchr. */
237     static const gchar romans[] = "MmDdCcLlXxVvIi";
238 
239     gsize next_allowed = 0;
240     gsize prev = 0;
241     gsize count = 0;
242     const gchar *i;
243 
244     for (i = text; *i; i++)
245     {
246         const char *s = strchr (romans, *i);
247 
248         if (s)
249         {
250             gsize c = (s - romans) / 2;
251 
252             if (c < next_allowed)
253             {
254                 return 0;
255             }
256 
257             if (c < prev)
258             {
259                 /* After subtraction, no more subtracted chars allowed. */
260                 next_allowed = prev + 1;
261             }
262             else if (c == prev)
263             {
264                 /* Allow indefinite repetition for m; three for c, x and i; and
265                  * none for d, l and v. */
266                 if ((c && ++count > 3) || (c & 1))
267                 {
268                     return 0;
269                 }
270 
271                 /* No more subtraction. */
272                 next_allowed = c;
273             }
274             else if (c && !(c & 1))
275             {
276                 /* For first occurrence of c, x and i, allow "subtraction" from
277                  * 10 and 5 times self, reset counting. */
278                 next_allowed = c - 2;
279                 count = 1;
280             }
281 
282             prev = c;
283         }
284         else
285         {
286             if (g_unichar_isalnum (g_utf8_get_char (i)))
287             {
288                 return 0;
289             }
290 
291             break;
292         }
293     }
294 
295     /* Return length of found Roman numeral. */
296     return i - text;
297 }
298 
299 /*
300  * Function to set the first letter of each word to uppercase, according the "Chicago Manual of Style" (http://www.docstyles.com/cmscrib.htm#Note2)
301  * No needed to reallocate
302  */
303 void
Scan_Process_Fields_First_Letters_Uppercase(gchar ** str,gboolean uppercase_preps,gboolean handle_roman)304 Scan_Process_Fields_First_Letters_Uppercase (gchar **str,
305                                              gboolean uppercase_preps,
306                                              gboolean handle_roman)
307 {
308 /**** DANIEL TEST *****
309     gchar *iter;
310     gchar utf8_character[6];
311     gboolean set_to_upper_case = TRUE;
312     gunichar c;
313 
314     for (iter = text; *iter; iter = g_utf8_next_char(iter))
315     {
316         c = g_utf8_get_char(iter);
317         if (set_to_upper_case && g_unichar_islower(c))
318             strncpy(iter, utf8_character, g_unichar_to_utf8(g_unichar_toupper(c), utf8_character));
319         else if (!set_to_upper_case && g_unichar_isupper(c))
320             strncpy(iter, utf8_character, g_unichar_to_utf8(g_unichar_tolower(c), utf8_character));
321 
322         set_to_upper_case = (g_unichar_isalpha(c)
323                             || c == (gunichar)'.'
324                             || c == (gunichar)'\''
325                             || c == (gunichar)'`') ? FALSE : TRUE;
326     }
327 ****/
328 /**** Barış Çiçek version ****/
329     gchar *string = *str;
330     gchar *word, *word1, *word2, *temp;
331     gint i, len;
332     gchar utf8_character[6];
333     gunichar c;
334     gboolean set_to_upper_case, set_to_upper_case_tmp;
335     // There have to be space at the end of words to seperate them from prefix
336     // Chicago Manual of Style "Heading caps" Capitalization Rules (CMS 1993, 282) (http://www.docstyles.com/cmscrib.htm#Note2)
337     const gchar * exempt[] =
338     {
339         "a ",       "a_",
340         "against ", "against_",
341         "an ",      "an_",
342         "and ",     "and_",
343         "at ",      "at_",
344         "between ", "between_",
345         "but ",     "but_",
346         "feat. ",   "feat._",
347         "for ",     "for_",
348         "in ",      "in_",
349         "nor ",     "nor_",
350         "of ",      "of_",
351         //"off ",     "off_",   // Removed by Slash Bunny
352         "on ",      "on_",
353         "or ",      "or_",
354         //"over ",    "over_",  // Removed by Slash Bunny
355         "so ",      "so_",
356         "the ",     "the_",
357         "to ",      "to_",
358         "with ",    "with_",
359         "yet ",     "yet_",
360         NULL
361     };
362 
363     temp = Scan_Process_Fields_All_Downcase (string);
364     g_free (*str);
365     *str = string = temp;
366 
367     if (!g_utf8_validate(string,-1,NULL))
368     {
369         /* FIXME: Translatable string. */
370         g_warning ("%s",
371                    "Scan_Process_Fields_First_Letters_Uppercase: Not valid UTF-8!");
372         return;
373     }
374     /* Removes trailing whitespace. */
375     string = g_strchomp(string);
376 
377     temp = string;
378 
379     /* If the word is a roman numeral, capitalize all of it. */
380     if (handle_roman && (len = Scan_Word_Is_Roman_Numeral (temp)))
381     {
382         gchar *tmp = g_utf8_strup (temp, len);
383         strncpy (string, tmp, len);
384         g_free (tmp);
385     }
386     else
387     {
388         // Set first character to uppercase
389         c = g_utf8_get_char(temp);
390         strncpy(string, utf8_character, g_unichar_to_utf8(g_unichar_toupper(c), utf8_character));
391     }
392 
393     // Uppercase first character of each word, except for 'exempt[]' words lists
394     while ( temp )
395     {
396         word = temp; // Needed if there is only one word
397         word1 = strchr (temp, ' ');
398         word2 = strchr (temp, '_');
399 
400         // Take the first string found (near beginning of string)
401         if (word1 && word2)
402             word = MIN(word1,word2);
403         else if (word1)
404             word = word1;
405         else if (word2)
406             word = word2;
407         else
408         {
409             // Last word of the string: the first letter is always uppercase,
410             // even if it's in the exempt list. This is a Chicago Manual of Style rule.
411             // Last Word In String - Should Capitalize Regardless of Word (Chicago Manual of Style)
412             c = g_utf8_get_char(word);
413             strncpy(word, utf8_character, g_unichar_to_utf8(g_unichar_toupper(c), utf8_character));
414             break;
415         }
416 
417         // Go to first character of the word (char. after ' ' or '_')
418         word = word+1;
419 
420         // If the word is a roman numeral, capitalize all of it
421         if (handle_roman && (len = Scan_Word_Is_Roman_Numeral (word)))
422         {
423             gchar *tmp = g_utf8_strup (word, len);
424             strncpy (word, tmp, len);
425             g_free (tmp);
426         }
427         else
428         {
429             // Set uppercase the first character of this word
430             c = g_utf8_get_char(word);
431             strncpy(word, utf8_character, g_unichar_to_utf8(g_unichar_toupper(c), utf8_character));
432 
433             if (uppercase_preps)
434             {
435                 goto increment;
436             }
437 
438             /* Lowercase the first character of this word if found in the
439              * exempt words list. */
440             for (i=0; exempt[i]!=NULL; i++)
441             {
442                 if (g_ascii_strncasecmp(exempt[i], word, strlen(exempt[i])) == 0)
443                 {
444                     c = g_utf8_get_char(word);
445                     strncpy(word, utf8_character, g_unichar_to_utf8(g_unichar_tolower(c), utf8_character));
446                     break;
447                 }
448             }
449         }
450 
451 increment:
452         temp = word;
453     }
454 
455     // Uppercase letter placed after some characters like '(', '[', '{'
456     set_to_upper_case = FALSE;
457     for (temp = string; *temp; temp = g_utf8_next_char(temp))
458     {
459         c = g_utf8_get_char(temp);
460         set_to_upper_case_tmp = (  c == (gunichar)'('
461                                 || c == (gunichar)'['
462                                 || c == (gunichar)'{'
463                                 || c == (gunichar)'"'
464                                 || c == (gunichar)':'
465                                 || c == (gunichar)'.'
466                                 || c == (gunichar)'`'
467                                 || c == (gunichar)'-'
468                                 ) ? TRUE : FALSE;
469 
470         if (set_to_upper_case && g_unichar_islower(c))
471             strncpy(temp, utf8_character, g_unichar_to_utf8(g_unichar_toupper(c), utf8_character));
472 
473         set_to_upper_case = set_to_upper_case_tmp;
474     }
475 }
476