1 /**
2  * scan_ccns2:
3  * additional filters for scanning credit card numbers.
4  * used by the scan_accts.flex system.
5  */
6 
#include <cassert>
#include <cctype>
8 
9 
10 #include "config.h"
11 #include "scan_ccns2.h"
12 
13 #include "be13_api/utils.h"
14 #include "dfxml_cpp/src/hash_t.h"
15 
16 #include "be13_api/scanner_params.h"
17 
18 int scan_ccns2_debug=0;
19 
20 
21 /* credit2.cpp:
22  * A filter to scan stdin to stdout, pass through only the lines
23  * that have valid credit-card numbers by our feature detector.
24  */
25 
/** Convert an ASCII digit character to its numeric value ('0' -> 0 ... '9' -> 9).
 * No range check is made: any other character yields its offset from '0'.
 */
inline int digit_val(char cc)
{
    const int ascii_zero = '0';
    return cc - ascii_zero;
}
30 
31 
/** Extract the digits from a buffer of a given length into a
 * null-terminated array (which must hold at least len+1 bytes).
 * Scanning stops early if a NUL byte is found in buf.
 *
 * Return 0 if the count of non-digit characters is
 *   - 0, or
 *   - 3 and the first digit is 4 or 5 (Visa / MasterCard grouping), or
 *   - 2 and the first digit is 3 (American Express grouping).
 * Return -1 otherwise. The digits array is filled in either case.
 */
static int extract_digits_and_test(const char *buf,int len,char *digits)
{
    char *out = digits;         // write cursor; `digits` keeps pointing at the first digit
    int nondigit_count = 0;

    /* Check the remaining length before dereferencing buf. */
    for(; len > 0 && *buf; buf++, len--){
        /* Cast: passing a negative char value to isdigit() is undefined. */
        if(isdigit((unsigned char)*buf)){
            *out++ = *buf;
        } else {
            nondigit_count++;
        }
    }
    *out = 0;                   // null-terminate

    if(nondigit_count==0) return 0;

    /* Test the FIRST extracted digit, not the terminator under the write cursor. */
    const char first = digits[0];
    if((first=='4' || first=='5') && nondigit_count==3){
        return 0;               // visa or mastercard
    }
    if(first=='3' && nondigit_count==2){
        return 0;               // american express
    }
    return -1;
}
58 
/* Return 1 if every character examined is a hexadecimal digit, 0 otherwise.
 * At most len characters are examined and the scan stops early at a NUL
 * byte, so an empty range (or one that starts with NUL) returns 1.
 */
static int only_hex_digits(const char *buf,int len)
{
    for(; len > 0 && *buf; buf++, len--){
        /* isxdigit() is the standard spelling of the BSD-only ishexnumber().
         * Cast: the bytes may be >= 0x80, and a negative argument is undefined.
         */
        if(!isxdigit((unsigned char)*buf)) return 0;
    }
    return 1;
}
69 
/* Return 1 if every character examined is a decimal digit, 0 otherwise.
 * At most len characters are examined and the scan stops early at a NUL
 * byte, so an empty range (or one that starts with NUL) returns 1.
 */
static int only_dec_digits(const char *buf,int len)
{
    for(; len > 0 && *buf; buf++, len--){
        /* Cast: the bytes may be >= 0x80, and a negative argument to isdigit() is undefined. */
        if(!isdigit((unsigned char)*buf)) return 0;
    }
    return 1;
}
79 
80 
/****************************************************************
 *** The tests. Note that the sense is reversed:
 *** each test returns 0 when the number passes and -1 when it fails.
 ****************************************************************/
84 
85 
86 
87 /* int ccv1(const char *str,int len)
88  * Return 0 if a number follows the
89  * Credit Card Number Validation Algorithm Version #1, -1 if it fails
90  * (Version 2 is a pure database lookup based on the 3 digits on the back panel.)
91  */
92 
ccv1_test(const char * digits)93 static int ccv1_test(const char *digits)
94 {
95     int chk=0;
96     int double_flag=0;			// is number doubled?
97     int len = strlen(digits);
98     int i;
99     int doubled[] = { 0,2,4,6,8,1,3,5,7,9 };	/* what are number when "doubled" */
100 
101     for(i=len-1;i>=0;i--){
102 	int val = digit_val(digits[i]);
103 	if(double_flag==0){
104 	    chk += val;
105 	    double_flag = 1;
106 	} else {
107 	    chk += doubled[val];
108 	    double_flag = 0;
109 	}
110     }
111 
112     if ( (chk%10) == 0 ) {
113 	return 0;			// passed alg
114     }
115     return -1;
116 }
117 
118 /* histogram_test:
119  * Compute the historgram of the number.
120  * If one digit is repeated more than 7 times, it is not valid.
121  * If two sets of digits are repeated more than 5 times, it is not valid.
122  */
histogram_test(const char * digits)123 static int histogram_test(const char *digits)
124 {
125     int cntscore = 0;
126     int digit_counts[10];			// count of each character
127 
128     memset((void*)digit_counts,0,sizeof(digit_counts));
129     while(*digits){
130 	digit_counts[digit_val(*digits)]++;
131 	digits++;
132     }
133 
134     /* If we have more than 7 of one digit,
135      * or two digits with more than 5,
136      * this isn't a valid number.
137      */
138     for(int i=0; i<10; i++) {
139 	if (digit_counts[i]>7) { return -1;}
140 	if (digit_counts[i]>4) { cntscore ++;}
141     }
142     if(cntscore >=2) return -1;
143     return 0;				// passed histogram test
144 }
145 
146 
147 /*
148  * Called to display strings. The first character is not part of the number.
149  */
150 
/** Return the value of the first 4 digits of a buffer, as an integer.
 * If the string holds fewer than 4 characters, only the characters that
 * are present are converted (an empty string yields 0).
 */
static int int4(const char *cc)
{
    char buf[5];
    int n = 0;

    while(n < 4 && cc[n]){
        buf[n] = cc[n];
        n++;
    }
    /* Terminate right after the copied characters; terminating only at
     * buf[4] would leave uninitialized bytes in front of the NUL for short input.
     */
    buf[n] = 0;
    return atoi(buf);
}
161 
/** Return the value of the first 6 digits of a buffer, as an integer.
 * If the string holds fewer than 6 characters, only the characters that
 * are present are converted (an empty string yields 0).
 */
static int int6(const char *cc)
{
    char buf[7];
    int n = 0;

    while(n < 6 && cc[n]){
        buf[n] = cc[n];
        n++;
    }
    /* Terminate right after the copied characters; terminating only at
     * buf[6] would leave uninitialized bytes in front of the NUL for short input.
     */
    buf[n] = 0;
    return atoi(buf);
}
172 
pattern_test(const char * digits)173 static int pattern_test(const char *digits)
174 {
175     int a = int4(digits);
176     int b = int4(digits+4);
177     int c = int4(digits+8);
178     int d = int4(digits+12);
179 
180     if(b-a == c-d) return -1;		/* something fishy going on... */
181     return 0;
182 }
183 
184 /**
185  * return 0 if prefix is okay, -1 if it is not.
186  *
187  * revised prefix test based on Wikipedia bank card number table
188  * http://en.wikipedia.org/wiki/Bank_card_number
189  */
190 
prefix_test(const char * digits)191 static int prefix_test(const char *digits)
192 {
193     int len = strlen(digits);
194     int a = int4(digits);
195     int b = int6(digits);
196 
197     switch(len){
198     case 13:
199         if(digits[0]=='4') return 0; // Legacy as all 13-digits are deprecated
200 	return -1;
201     case 14:
202 	if(a>=3000 && a<=3050) return 0; // Diners Club Carte Blanche (DC-CB)
203 	if(a>=3600 && a<=3999) return 0; // Diners Club International (DC-Int)
204 	return -1;
205     case 15:
206 	if(a==2014) return 0; // Diners Club enRoute (DC-eR)
207 	if(a==2149) return 0; // Diners Club enRoute (DC-eR)
208 	if(a>=3400 && a<=3499) return 0; // American Express (AmEx)
209 	if(a>=3700 && a<=3799) return 0; // American Express (AmEx)
210 	return -1;
211     case 16:
212 	if(a>=3528 && a<=3589) return 0; // JCB (JCB)
213 	if(a>=4000 && a<=4999) return 0; // Visa (Visa)
214 	if(b==417500) return 0; // Visa (Visa)
215 	if(a>=5100 && a<=5999) return 0; // MasterCard (MC)
216 	if(b>=560221 && b<=560225) return 0; // BankCard (BC)
217 	if(a==5610) return 0; // BankCard (BC)
218 	if(a==6011) return 0; // Discovery (Disc)
219 	if(b>=622126 && b<=622925) return 0; // China UnionPay (CUP)
220 	if(b>=624000 && b<=626999) return 0; // China UnionPay (CUP)
221 	if(b>=628200 && b<=628899) return 0; // China UnionPay (CUP)
222 	if(a==6304) return 0; // Laser (Lasr)
223 	if(a==6334) return 0; // Solo (Solo)
224 	if(a==6706) return 0; // Laser (Lasr)
225 	if(a==6709) return 0; // Laser (Lasr)
226 	if(a==6767) return 0; // Solo (Solo)
227 	if(a==6771) return 0; // Laser (Lasr)
228 	if(a>=6440 && a<=6499) return 0; // Discovery (Disc)
229 	if(a>=6500 && a<=6599) return 0; // Discovery (Disc)
230 	return -1;
231     case 17:
232 	if(b>=622126 && b<=622925) return 0; // China UnionPay (CUP)
233 	if(b>=624000 && b<=626999) return 0; // China UnionPay (CUP)
234 	if(b>=628200 && b<=628899) return 0; // China UnionPay (CUP)
235 	if(a==6304) return 0; // Laser (Lasr)
236 	if(a==6706) return 0; // Laser (Lasr)
237 	if(a==6709) return 0; // Laser (Lasr)
238 	if(a==6771) return 0; // Laser (Lasr)
239       	return -1;
240     case 18:
241 	if(b>=622126 && b<=622925) return 0; // China UnionPay (CUP)
242 	if(b>=624000 && b<=626999) return 0; // China UnionPay (CUP)
243 	if(b>=628200 && b<=628899) return 0; // China UnionPay (CUP)
244 	if(a==6304) return 0; // Laser (Lasr)
245 	if(a==6334) return 0; // Solo (Solo)
246 	if(a==6706) return 0; // Laser (Lasr)
247 	if(a==6709) return 0; // Laser (Lasr)
248 	if(a==6767) return 0; // Solo (Solo)
249 	if(a==6771) return 0; // Laser (Lasr)
250       	return -1;
251     case 19:
252 	if(b>=622126 && b<=622925) return 0; // China UnionPay (CUP)
253 	if(b>=624000 && b<=626999) return 0; // China UnionPay (CUP)
254 	if(b>=628200 && b<=628899) return 0; // China UnionPay (CUP)
255 	if(a==6304) return 0; // Laser (Lasr)
256 	if(a==6334) return 0; // Solo (Solo)
257 	if(a==6706) return 0; // Laser (Lasr)
258 	if(a==6709) return 0; // Laser (Lasr)
259 	if(a==6767) return 0; // Solo (Solo)
260 	if(a==6771) return 0; // Laser (Lasr)
261       	return -1;
262     }
263     return -1;
264 }
265 
266 #define RETURN(code,reason) {if(scan_ccns2_debug){std::cerr << reason << "\n";} return code;}
267 /**
268  * Determine if this is or is not a credit card number.
269  * Return 1 if it is, 0 if it is not.
270  * buf[-WINDOW_MARGIN] must be accessible.
271  * buf[len+WINDOW_MARGIN] must be accessible
272  */
valid_ccn(const char * buf,int buflen)273 bool valid_ccn(const char *buf,int buflen)
274 {
275     /* Make the digits array */
276     if(buflen>19) RETURN(0,"Too long");
277 
278     char digits[20];			// just the digits
279 
280     memset(digits,0,sizeof(digits));
281     if(extract_digits_and_test(buf,buflen,digits)) RETURN(0,"failed nondigit count");
282     if(prefix_test(digits))    RETURN(0,"failed prefix test");
283     if(ccv1_test(digits))      RETURN(0,"failed ccv1 test");
284     if(pattern_test(digits))   RETURN(0,"failed pattern test");
285     if(histogram_test(digits)) RETURN(0,"failed histogram test");
286 
287     int before_window = 4;		// what we care about before
288     int after_window = 4;		// what we care about before
289 
290     /* If the 4 characters before or after are hex digits but not decimal digits,
291      * then this is probably not a credit card number.
292      * We're probably instead in a sea of hex. So abort.
293      */
294     if(only_hex_digits(buf-before_window,before_window) && !only_dec_digits(buf-before_window,before_window)){
295 	RETURN(0,"failed before hex test");
296     }
297     if(only_hex_digits(buf+buflen,after_window) && !only_dec_digits(buf+buflen,after_window)){
298 	RETURN(0,"failed after hex test");
299     }
300 
301     return 1;
302 }
303 
304 
/**
 * Return true if ch is a character that may surround a run of numbers that is
 * not really a phone number: a digit, whitespace, '[', ']', '<', 'Z', '.',
 * 'l' or 'j'.
 *
 * Used to throw out phone numbers that are preceded or followed with only
 * numbers and spaces or brackets. These are commonly seen in PDF files
 * when they are decompressed.
 */
inline bool valid_char(char ch)
{
    /* Cast: the input is raw data that may be >= 0x80, and passing a
     * negative value to isdigit()/isspace() is undefined.
     */
    const unsigned char uc = (unsigned char)ch;
    if(isdigit(uc) || isspace(uc)) return true;

    switch(ch){
    case '[': case ']': case '<': case 'Z': case '.': case 'l': case 'j':
        return true;
    default:
        return false;
    }
}
314 
valid_phone(const sbuf_t & sbuf,size_t pos,size_t len)315 bool  valid_phone(const sbuf_t &sbuf,size_t pos,size_t len)
316 {
317     /* We want invalid characters before and after (assuming there is a before and after */
318     int invalid_before = 0;
319     int invalid_after = 0;
320     if(pos>8){
321 	for(size_t i=pos-8;i<pos;i++){
322 	    if(!valid_char(sbuf[i])) invalid_before = 1;
323 	}
324     } else {
325 	invalid_before = 1;
326     }
327 
328     if(sbuf.bufsize < pos+len+8){
329 	for(size_t i=pos+len;i<pos+len+8;i++){
330 	    if(!valid_char(sbuf[i])) invalid_after = 1;
331 	}
332     } else {
333 	invalid_after = 1;
334     }
335 
336     /*
337      * 2013-05-28: if followed by ' #{1,5} ' then it's not a phone either!
338      */
339     if(pos+len+5 < sbuf.bufsize){
340         if(sbuf[pos+len]==' ' && isdigit(sbuf[pos+len+1])){
341             for(size_t i = pos+len+1 ; (i+1<sbuf.bufsize) && (i<pos+len+8);i++){
342                 if(isdigit(sbuf[i]) && sbuf[i+1]==' ') return false; // not valid
343             }
344         }
345     }
346 
347     /* If it is followed by a dash and a number, it's not a phone number */
348     if(pos+len+2 < sbuf.bufsize){
349         if(sbuf[pos+len]=='-' && isdigit(sbuf[pos+len+1])) return false;
350     }
351 
352     return invalid_before!=0 && invalid_after!=0;
353 }
354 
// http://rosettacode.org/wiki/Bitcoin/address_validation#C
// The 58 characters of the base58 alphabet, in value order (index == value).
static const char *base58_chars =
    "123456789"
    "ABCDEFGHJKLMNPQRSTUVWXYZ"
    "abcdefghijkmnopqrstuvwxyz";
// Reverse lookup: base58_vals[byte] is the base58 value of that byte,
// or -1 if the byte is not in the alphabet. Filled in by build_unbase58().
static int base58_vals[256];
static bool unbase58_built = false;

/**
 * Fill in the base58 reverse-lookup table and mark it as built.
 * Must be called before unbase58(), which asserts that the table exists.
 */
void build_unbase58()
{
    const size_t table_size = sizeof(base58_vals)/sizeof(base58_vals[0]);
    for(size_t b=0; b<table_size; b++){
        base58_vals[b] = -1;            // not a base58 character
    }

    size_t value = 0;
    for(const char *p = base58_chars; *p; p++, value++){
        base58_vals[(unsigned char)(*p)] = value;
    }
    unbase58_built = true;
}
370 
unbase58(const char * s,uint8_t * out,size_t len)371 bool unbase58(const char *s,uint8_t *out,size_t len)
372 {
373     assert(unbase58_built==true);
374     memset(out,0,25);
375     for(size_t i=0;s[i] && i<len;i++){
376         int c = base58_vals[(u_char)(s[i])];
377         if (c==-1) return false; // invalid character
378         for (int j = 25; j--; ) {
379             c += 58 * out[j];
380             out[j] = c % 256;
381             c /= 256;
382         }
383         if (c!=0) return false; // address too long
384     }
385     return true;
386 }
387 
388 // A bitcoin address uses a base58 encoding, which uses an alphabet of the characters 0 .. 9, A ..Z, a .. z,
389 // but without the four characters 0, O, I and l.
valid_bitcoin_address(const char * s,size_t len)390 bool valid_bitcoin_address(const char *s,size_t len){
391     uint8_t dec[32];
392     if (unbase58(s,dec,len)==false) return false;
393     dfxml::sha256_t d1 = dfxml::sha256_generator::hash_buf(dec,21);
394     dfxml::sha256_t d2 = dfxml::sha256_generator::hash_buf(d1.digest,d1.size());
395     if (memcmp(dec+21, d2.digest, 4)!=0){
396         return false;
397     }
398     return true;  /* validates */
399 };
400 
401 
402 
403 
404 
405 
406 #ifdef DEBUG
validate_ccn_debug(const char * buf,int buflen)407 static int validate_ccn_debug(const char *buf,int buflen)
408 {
409     char digits[64];
410 
411     printf("running tests. 0 means passed, -1 means failed.\n\n");
412     printf("nondigit_test(%s) = %d\n",buf,extract_digits_and_test(buf,buflen,digits));
413     printf("prefix_test(%s) = %d \n",digits,prefix_test(digits));
414     printf("ccv1_test(%s) = %d \n",digits,ccv1_test(digits));
415     printf("histogram_test(%s) = %d \n",digits,histogram_test(digits));
416     printf("pattern_test(%s) = %d \n",digits,pattern_test(digits));
417     printf("only_hex_digits(%s) = %d\n",buf,only_hex_digits(buf,strlen(buf)));
418     printf("only_dec_digits(%s) = %d\n",buf,only_dec_digits(buf,strlen(buf)));
419     return validate_ccn(buf,buflen);
420 }
421 #endif
422