1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3  * License, v. 2.0. If a copy of the MPL was not distributed with this
4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 
6 #include "nsCharSetProber.h"
7 #include "prmem.h"
8 
9 //This filter applies to all scripts which do not use English characters
FilterWithoutEnglishLetters(const char * aBuf,uint32_t aLen,char ** newBuf,uint32_t & newLen)10 bool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen)
11 {
12   char *newptr;
13   char *prevPtr, *curPtr;
14 
15   bool meetMSB = false;
16   newptr = *newBuf = (char*)PR_Malloc(aLen);
17   if (!newptr)
18     return false;
19 
20   for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
21   {
22     if (*curPtr & 0x80)
23     {
24       meetMSB = true;
25     }
26     else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')
27     {
28       //current char is a symbol, most likely a punctuation. we treat it as segment delimiter
29       if (meetMSB && curPtr > prevPtr)
30       //this segment contains more than single symbol, and it has upper ASCII, we need to keep it
31       {
32         while (prevPtr < curPtr) *newptr++ = *prevPtr++;
33         prevPtr++;
34         *newptr++ = ' ';
35         meetMSB = false;
36       }
37       else //ignore current segment. (either because it is just a symbol or just an English word)
38         prevPtr = curPtr+1;
39     }
40   }
41   if (meetMSB && curPtr > prevPtr)
42     while (prevPtr < curPtr) *newptr++ = *prevPtr++;
43 
44   newLen = newptr - *newBuf;
45 
46   return true;
47 }
48 
49 //This filter applies to all scripts which contain both English characters and upper ASCII characters.
FilterWithEnglishLetters(const char * aBuf,uint32_t aLen,char ** newBuf,uint32_t & newLen)50 bool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen)
51 {
52   //do filtering to reduce load to probers
53   char *newptr;
54   char *prevPtr, *curPtr;
55   bool isInTag = false;
56 
57   newptr = *newBuf = (char*)PR_Malloc(aLen);
58   if (!newptr)
59     return false;
60 
61   for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
62   {
63     if (*curPtr == '>')
64       isInTag = false;
65     else if (*curPtr == '<')
66       isInTag = true;
67 
68     if (!(*curPtr & 0x80) &&
69         (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
70     {
71       if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol
72                                         // and it is not inside a tag, keep it.
73       {
74         while (prevPtr < curPtr) *newptr++ = *prevPtr++;
75         prevPtr++;
76         *newptr++ = ' ';
77       }
78       else
79         prevPtr = curPtr+1;
80     }
81   }
82 
83   // If the current segment contains more than just a symbol
84   // and it is not inside a tag then keep it.
85   if (!isInTag)
86     while (prevPtr < curPtr)
87       *newptr++ = *prevPtr++;
88 
89   newLen = newptr - *newBuf;
90 
91   return true;
92 }
93