1 /*
2 # C-Code for hashing and matching
3 # S3 atomic 64bit integers for R
4 # (c) 2011 Jens Oehlschägel
5 # Licence: GPL2
6 # Provided 'as is', use at your own risk
7 # Created: 2011-12-11
8 # Last changed: 2012-10-22
9 #*/
10
11 /* for speed (should not really matter in this case as most time is spent in the hashing) */
12 // #define USE_RINTERNALS 1
13 #include <Rinternals.h>
14 #include <R.h>
15
16 //#include "timing.h"
17
18 // This multiplicator was used in Simon Urbanek's package fastmatch for 32-bit integers
19 //#define HASH64(X, SHIFT) (314159265358979323ULL * ((unsigned long long)(X)) >> (SHIFT))
20 // This multiplicator seems to work fine with 64bit integers
21 #define HASH64(X, SHIFT) (0x9e3779b97f4a7c13ULL * ((unsigned long long)(X)) >> (SHIFT))
22
hashfun_integer64(SEXP x_,SEXP bits_,SEXP ret_)23 SEXP hashfun_integer64(SEXP x_, SEXP bits_, SEXP ret_){
24 int i, n = LENGTH(x_);
25 long long * x = (long long *) REAL(x_);
26 unsigned int * ret = (unsigned int *) INTEGER(ret_);
27 int shift = 64 - asInteger(bits_);
28 for(i=0; i<n; i++){
29 ret[i] = (unsigned int) HASH64(x[i], shift);
30 }
31 return ret_;
32 }
33
34 // this function is loosely following Simon Urbanek's package 'fastmatch'
hashmap_integer64(SEXP x_,SEXP bits_,SEXP hashpos_,SEXP nunique_)35 SEXP hashmap_integer64(SEXP x_, SEXP bits_, SEXP hashpos_, SEXP nunique_){
36 int i, nx = LENGTH(x_);
37 int h, nh = LENGTH(hashpos_);
38 long long * x = (long long *) REAL(x_);
39 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
40 int bits = asInteger(bits_);
41 int shift = 64 - bits;
42 long long v;
43 int nunique = 0;
44 for(i=0; i<nx; ){
45 v = x[i++];
46 h = HASH64(v, shift);
47 while (hashpos[h] && x[hashpos[h] - 1] != v){
48 h++;
49 if (h == nh)
50 h = 0;
51 }
52 if (!hashpos[h]){
53 hashpos[h] = i;
54 nunique++;
55 }
56 }
57 INTEGER(nunique_)[0] = nunique;
58 return hashpos_;
59 }
60
hashpos_integer64(SEXP x_,SEXP hashdat_,SEXP bits_,SEXP hashpos_,SEXP nomatch_,SEXP ret_)61 SEXP hashpos_integer64(SEXP x_, SEXP hashdat_, SEXP bits_, SEXP hashpos_, SEXP nomatch_, SEXP ret_){
62 int i, nx = LENGTH(x_);
63 int h, nh = LENGTH(hashpos_);
64 long long * x = (long long *) REAL(x_);
65 long long * hashdat = (long long *) REAL(hashdat_);
66 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
67 int * ret = INTEGER(ret_);
68 int bits = asInteger(bits_);
69 int shift = 64 - bits;
70 int nomatch = asInteger(nomatch_);
71 long long v;
72 for(i=0; i<nx; i++){
73 v = x[i];
74 h = HASH64(v, shift);
75 for(;;){
76 if (hashpos[h]){ // this is mostly while(hashpos[h]) but we want to catch failure for the nomatch assignment
77 if (hashdat[hashpos[h] - 1] == v){
78 ret[i] = hashpos[h];
79 break;
80 }
81 h++;
82 if (h == nh)
83 h = 0;
84 }else{
85 ret[i] = nomatch;
86 break;
87 }
88 }
89 }
90 return ret_;
91 }
92
hashrev_integer64(SEXP x_,SEXP hashdat_,SEXP bits_,SEXP hashpos_,SEXP nunique_,SEXP nomatch_,SEXP ret_)93 SEXP hashrev_integer64(SEXP x_, SEXP hashdat_, SEXP bits_, SEXP hashpos_, SEXP nunique_, SEXP nomatch_, SEXP ret_){
94 int i, nx = LENGTH(x_);
95 int h, nh = LENGTH(hashpos_);
96 int nd = LENGTH(hashdat_);
97 long long * x = (long long *) REAL(x_);
98 long long * hashdat = (long long *) REAL(hashdat_);
99 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
100 int * ret = INTEGER(ret_);
101 int bits = asInteger(bits_);
102 int shift = 64 - bits;
103 int nomatch = asInteger(nomatch_);
104 int nunique = asInteger(nunique_);
105 int iunique=0;
106 long long v;
107 for(i=0; i<nx; ){
108 v = x[i++];
109 h = HASH64(v, shift);
110 while(hashpos[h]){
111 if (hashdat[hashpos[h] - 1] == v){
112 h = hashpos[h] - 1;
113 if (!ret[h]){
114 ret[h] = i;
115 if (++iunique==nunique)
116 i=nx; // break out of for as well
117 }
118 break;
119 }
120 h++;
121 if (h == nh)
122 h = 0;
123 }
124 }
125 if (iunique<nd){
126 if (nunique<nd){ // some gaps are duplicates
127 for(i=0; i<nd; i++){
128 if (!ret[i]){
129 v = hashdat[i];
130 h = HASH64(v, shift);
131 while(hashpos[h]){ // this is mostly while(hashpos[h]) but we want to catch failure for the nomatch assignment
132 if (hashdat[hashpos[h] - 1] == v){
133 h = ret[hashpos[h] - 1];
134 if (h)
135 ret[i] = h;
136 else
137 ret[i] = nomatch;
138 break;
139 }
140 h++;
141 if (h == nh)
142 h = 0;
143 }
144 }
145 }
146 }else{ // no duplicates: all gaps are nomatches
147 for(i=0; i<nd; i++)
148 if (!ret[i])
149 ret[i] = nomatch;
150 }
151 }
152 return ret_;
153 }
154
hashrin_integer64(SEXP x_,SEXP hashdat_,SEXP bits_,SEXP hashpos_,SEXP nunique_,SEXP ret_)155 SEXP hashrin_integer64(SEXP x_, SEXP hashdat_, SEXP bits_, SEXP hashpos_, SEXP nunique_, SEXP ret_){
156 int i, nx = LENGTH(x_);
157 int h, nh = LENGTH(hashpos_);
158 int nd = LENGTH(hashdat_);
159 long long * x = (long long *) REAL(x_);
160 long long * hashdat = (long long *) REAL(hashdat_);
161 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
162 int * ret = INTEGER(ret_);
163 int bits = asInteger(bits_);
164 int shift = 64 - bits;
165 int nunique = asInteger(nunique_);
166 int iunique=0;
167 long long v;
168 for(i=0; i<nx; ){
169 v = x[i++];
170 h = HASH64(v, shift);
171 while(hashpos[h]){
172 if (hashdat[hashpos[h] - 1] == v){
173 h = hashpos[h] - 1;
174 if (!ret[h]){
175 ret[h] = TRUE;
176 if (++iunique==nunique)
177 i=nx; // break out of for as well
178 }
179 break;
180 }
181 h++;
182 if (h == nh)
183 h = 0;
184 }
185 }
186 if (nunique<nd){ // some gaps are duplicates
187 for(i=0; i<nd; i++){
188 if (!ret[i]){
189 v = hashdat[i];
190 h = HASH64(v, shift);
191 while(hashpos[h]){ // this is mostly while(hashpos[h]) but we want to catch failure for the nomatch assignment
192 if (hashdat[hashpos[h] - 1] == v){
193 h = ret[hashpos[h] - 1];
194 if (h)
195 ret[i] = TRUE;
196 break;
197 }
198 h++;
199 if (h == nh)
200 h = 0;
201 }
202 }
203 }
204 }
205 return ret_;
206 }
207
hashfin_integer64(SEXP x_,SEXP hashdat_,SEXP bits_,SEXP hashpos_,SEXP ret_)208 SEXP hashfin_integer64(SEXP x_, SEXP hashdat_, SEXP bits_, SEXP hashpos_, SEXP ret_){
209 int i, nx = LENGTH(x_);
210 int h, nh = LENGTH(hashpos_);
211 long long * x = (long long *) REAL(x_);
212 long long * hashdat = (long long *) REAL(hashdat_);
213 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
214 int * ret = LOGICAL(ret_);
215 int bits = asInteger(bits_);
216 int shift = 64 - bits;
217 long long v;
218 for(i=0; i<nx; i++){
219 v = x[i];
220 h = HASH64(v, shift);
221 for(;;){
222 if (hashpos[h]){ // this is mostly while(hashpos[h]) but we want to catch failure for the nomatch assignment
223 if (hashdat[hashpos[h] - 1] == v){
224 ret[i] = TRUE;
225 break;
226 }
227 h++;
228 if (h == nh)
229 h = 0;
230 }else{
231 ret[i] = FALSE;
232 break;
233 }
234 }
235 }
236 return ret_;
237 }
238
hashdup_integer64(SEXP hashdat_,SEXP bits_,SEXP hashpos_,SEXP nunique_,SEXP ret_)239 SEXP hashdup_integer64(SEXP hashdat_, SEXP bits_, SEXP hashpos_, SEXP nunique_, SEXP ret_){
240 int nu = LENGTH(ret_);
241 int h, nh = LENGTH(hashpos_);
242 //long long * hashdat = (long long *) REAL(hashdat_);
243 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
244 int * ret = LOGICAL(ret_);
245 int nunique = asInteger(nunique_);
246 for(h=0; h<nu; h++)
247 ret[h] = TRUE;
248 for(h=0; h<nh; h++)
249 if (hashpos[h]>0){
250 ret[hashpos[h]-1] = FALSE;
251 nunique--;
252 if (nunique<1)
253 break;
254 }
255 return ret_;
256 }
257
hashuni_integer64(SEXP hashdat_,SEXP bits_,SEXP hashpos_,SEXP keep_order_,SEXP ret_)258 SEXP hashuni_integer64(SEXP hashdat_, SEXP bits_, SEXP hashpos_, SEXP keep_order_, SEXP ret_){
259 int h, nh = LENGTH(hashpos_);
260 int u, nu = LENGTH(ret_);
261 long long * hashdat = (long long *) REAL(hashdat_);
262 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
263 long long * ret = (long long *) REAL(ret_);
264 if (asLogical(keep_order_)){
265 int i;
266 // int nx = LENGTH(hashdat_);
267 int bits = asInteger(bits_);
268 int shift = 64 - bits;
269 long long v;
270 for(u=0,i=0; u<nu; i++){
271 v = hashdat[i];
272 h = HASH64(v, shift);
273 while(hashpos[h] && hashdat[hashpos[h] - 1] != v){ // this is mostly while(hashpos[h]) but we want to catch failure for the nomatch assignment
274 h++;
275 if (h == nh)
276 h = 0;
277 }
278 if (i == (hashpos[h] - 1)){
279 ret[u++] = v; /* unique */
280 }
281 }
282 }else{
283 for(u=0,h=0; u<nu; h++)
284 if (hashpos[h]>0){
285 ret[u++] = hashdat[hashpos[h]-1];
286 }
287 }
288 return ret_;
289 }
290
hashmapuni_integer64(SEXP x_,SEXP bits_,SEXP hashpos_,SEXP nunique_)291 SEXP hashmapuni_integer64(SEXP x_, SEXP bits_, SEXP hashpos_, SEXP nunique_){
292 int i, nx = LENGTH(x_);
293 int h, nh = LENGTH(hashpos_);
294 int nu = 0;
295 SEXP ret_;
296 PROTECT_INDEX idx;
297 PROTECT_WITH_INDEX(ret_ = allocVector(REALSXP, nx), &idx);
298 long long * ret = (long long *) REAL(ret_);
299 long long * x = (long long *) REAL(x_);
300 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
301 int bits = asInteger(bits_);
302 int shift = 64 - bits;
303 long long v;
304 for(i=0; i<nx; ){
305 v = x[i++];
306 h = HASH64(v, shift);
307 while(hashpos[h] && x[hashpos[h] - 1] != v){
308 h++;
309 if (h == nh)
310 h = 0;
311 }
312 if (!hashpos[h]){
313 hashpos[h] = i;
314 ret[nu++] = v;
315 }
316 }
317 INTEGER(nunique_)[0] = nu;
318 REPROTECT(ret_ = lengthgets(ret_, nu), idx);
319 UNPROTECT(1);
320 return ret_;
321 }
322
323
hashupo_integer64(SEXP hashdat_,SEXP bits_,SEXP hashpos_,SEXP keep_order_,SEXP ret_)324 SEXP hashupo_integer64(SEXP hashdat_, SEXP bits_, SEXP hashpos_, SEXP keep_order_, SEXP ret_){
325 int h, nh = LENGTH(hashpos_);
326 int u, nu = LENGTH(ret_);
327 long long * hashdat = (long long *) REAL(hashdat_);
328 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
329 int * ret = INTEGER(ret_);
330 if (asLogical(keep_order_)){
331 int i;
332 // int nx = LENGTH(hashdat_);
333 int bits = asInteger(bits_);
334 int shift = 64 - bits;
335 long long v;
336 for(u=0,i=0; u<nu; i++){
337 v = hashdat[i];
338 h = HASH64(v, shift);
339 while(hashpos[h] && hashdat[hashpos[h] - 1] != v){ // this is mostly while(hashpos[h]) but we want to catch failure for the nomatch assignment
340 h++;
341 if (h == nh)
342 h = 0;
343 }
344 if (i == (hashpos[h] - 1)){
345 ret[u++] = hashpos[h]; /* unique */
346 }
347 }
348 }else{
349 for(u=0,h=0; u<nu; h++)
350 if (hashpos[h]>0){
351 ret[u++] = hashpos[h];
352 }
353 }
354 return ret_;
355 }
356
hashmapupo_integer64(SEXP x_,SEXP bits_,SEXP hashpos_,SEXP nunique_)357 SEXP hashmapupo_integer64(SEXP x_, SEXP bits_, SEXP hashpos_, SEXP nunique_){
358 int i, nx = LENGTH(x_);
359 int h, nh = LENGTH(hashpos_);
360 int nu = 0;
361 SEXP ret_;
362 PROTECT_INDEX idx;
363 PROTECT_WITH_INDEX(ret_ = allocVector(INTSXP, nx), &idx);
364 int * ret = INTEGER(ret_);
365 long long * x = (long long *) REAL(x_);
366 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
367 int bits = asInteger(bits_);
368 int shift = 64 - bits;
369 long long v;
370 for(i=0; i<nx; ){
371 v = x[i++];
372 h = HASH64(v, shift);
373 while(hashpos[h] && x[hashpos[h] - 1] != v){
374 h++;
375 if (h == nh)
376 h = 0;
377 }
378 if (!hashpos[h]){
379 hashpos[h] = i;
380 ret[nu++] = hashpos[h];
381 }
382 }
383 INTEGER(nunique_)[0] = nu;
384 REPROTECT(ret_ = lengthgets(ret_, nu), idx);
385 UNPROTECT(1);
386 return ret_;
387 }
388
389
390
hashtab_integer641(SEXP hashdat_,SEXP bits_,SEXP hashpos_,SEXP nunique_)391 SEXP hashtab_integer641(SEXP hashdat_, SEXP bits_, SEXP hashpos_, SEXP nunique_){
392 int i, nx = LENGTH(hashdat_);
393 int h, nh = LENGTH(hashpos_);
394 int u;
395 long long * hashdat = (long long *) REAL(hashdat_);
396 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
397 //int * pos = INTEGER(pos_);
398 SEXP ret_;
399 PROTECT_INDEX idx;
400 PROTECT_WITH_INDEX(ret_ = allocVector(INTSXP, nh), &idx);
401 int * ret = INTEGER(ret_);
402 int bits = asInteger(bits_);
403 int shift = 64 - bits;
404 long long v;
405 for(i=0; i<nh; i++)
406 ret[i]=0;
407 for(i=0; i<nx; i++){
408 v = hashdat[i];
409 h = HASH64(v, shift);
410 while(hashpos[h]){ // this is mostly while(hashpos[h]) but we want to catch failure for the nomatch assignment
411 if (hashdat[hashpos[h] - 1] == v){
412 ret[h]++;
413 break;
414 }
415 h++;
416 if (h == nh)
417 h = 0;
418 }
419 }
420 for (u=0,h=0;h<nh;h++){
421 if (hashpos[h]){
422 //pos[u]=hashpos[h];
423 ret[u++]=ret[h];
424 }
425 }
426 REPROTECT(ret_ = lengthgets(ret_, u), idx);
427 UNPROTECT(1);
428 return ret_;
429 }
430
431
432
hashtab_integer64(SEXP x_,SEXP bits_,SEXP hashpos_,SEXP nunique_)433 SEXP hashtab_integer64(SEXP x_, SEXP bits_, SEXP hashpos_, SEXP nunique_){
434 int i, nx = LENGTH(x_);
435 int h, nh = LENGTH(hashpos_);
436 long long * x = (long long *) REAL(x_);
437 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
438 SEXP hashtab_;
439 PROTECT_INDEX idx;
440 PROTECT_WITH_INDEX(hashtab_ = allocVector(INTSXP, nh), &idx);
441 int *hashtab = INTEGER(hashtab_);
442 int bits = asInteger(bits_);
443 int shift = 64 - bits;
444 long long v;
445 int u, nu = INTEGER(nunique_)[0];
446
447 for(i=0; i<nh; i++)
448 hashtab[i]=0;
449 for(i=0; i<nx; ){
450 v = x[i++];
451 h = HASH64(v, shift);
452 while (hashpos[h] && x[hashpos[h] - 1] != v){
453 h++;
454 if (h == nh)
455 h = 0;
456 }
457 hashtab[h]++;
458 }
459 SEXP tabval_;
460 PROTECT(tabval_ = allocVector(REALSXP, nu));
461 long long * tabval = (long long *) REAL(tabval_);
462 for (u=0,h=0;u<nu;h++){
463 if (hashpos[h]){
464 tabval[u] = x[hashpos[h]-1];
465 hashtab[u]=hashtab[h];
466 u++;
467 }
468 }
469 REPROTECT(hashtab_ = lengthgets(hashtab_, nu), idx);
470
471 SEXP class;
472 PROTECT(class = allocVector(STRSXP, 1));
473 SET_STRING_ELT(class, 0, mkChar("integer64"));
474 classgets(tabval_, class);
475
476 SEXP ret_;
477 PROTECT(ret_ = allocVector(VECSXP, 2));
478 SET_VECTOR_ELT(ret_, 0, tabval_);
479 SET_VECTOR_ELT(ret_, 1, hashtab_);
480
481 UNPROTECT(4);
482 return ret_;
483 }
484
485
486
hashmaptab_integer64(SEXP x_,SEXP bits_,SEXP hashpos_,SEXP nunique_)487 SEXP hashmaptab_integer64(SEXP x_, SEXP bits_, SEXP hashpos_, SEXP nunique_){
488 int i, nx = LENGTH(x_);
489 int h, nh = LENGTH(hashpos_);
490 long long * x = (long long *) REAL(x_);
491 unsigned int * hashpos = (unsigned int *) INTEGER(hashpos_);
492 SEXP hashtab_;
493 PROTECT_INDEX idx;
494 PROTECT_WITH_INDEX(hashtab_ = allocVector(INTSXP, nh), &idx);
495 int *hashtab = INTEGER(hashtab_);
496 int bits = asInteger(bits_);
497 int shift = 64 - bits;
498 long long v;
499 int u, nu=0;
500 for(i=0; i<nh; i++)
501 hashtab[i]=0;
502 for(i=0; i<nx; ){
503 v = x[i++];
504 h = HASH64(v, shift);
505 while (hashpos[h] && x[hashpos[h] - 1] != v){
506 h++;
507 if (h == nh)
508 h = 0;
509 }
510 if (!hashpos[h]){
511 hashpos[h] = i;
512 nu++;
513 }
514 hashtab[h]++;
515 }
516 SEXP tabval_;
517 PROTECT(tabval_ = allocVector(REALSXP, nu));
518 long long * tabval = (long long *) REAL(tabval_);
519 for (u=0,h=0;u<nu;h++){
520 if (hashpos[h]){
521 tabval[u] = x[hashpos[h]-1];
522 hashtab[u]=hashtab[h];
523 u++;
524 }
525 }
526 INTEGER(nunique_)[0] = nu;
527 REPROTECT(hashtab_ = lengthgets(hashtab_, nu), idx);
528
529 SEXP class;
530 PROTECT(class = allocVector(STRSXP, 1));
531 SET_STRING_ELT(class, 0, mkChar("integer64"));
532 classgets(tabval_, class);
533
534 SEXP ret_;
535 PROTECT(ret_ = allocVector(VECSXP, 2));
536 SET_VECTOR_ELT(ret_, 0, tabval_);
537 SET_VECTOR_ELT(ret_, 1, hashtab_);
538
539 UNPROTECT(4);
540 return ret_;
541 }
542