/* Copyright (C) 1995-2008 Edward Der-Hua Liu, Hsin-Chu, Taiwan */ /* Overview (from the visible code): key comparison helpers for fixed-width keys, qsort and bsearch comparators over packed records, add_one_line() which appends one packed record to bf, the start of a tree writer (gen_tree) and a fragment of the driver that loads phrase text. NOTE(review): this text is collapsed onto a few physical lines, so every line comment below swallows the remainder of its physical line, and the first three #include directives carry no header name; both look like extraction damage and should be restored from the upstream file. */ #include #include #include #include "gcin.h" #include "pho.h" #include "tsin.h" #include "gtab.h" #include "gst.h" #include "gtab-db.h" /* Globals as used below: bf and ofs are the record buffer and current write offset of add_one_line(); phidx, phcount, phidxsize are the record start offsets, the used count and the allocated count; sf and sidx are the record buffer and offset array read by qcmp_usecount() and prefix_eq(); phsz is the size of one key in bytes (set to 4 or 8 further down); key_cmp is the key comparator in use; textArr, textArrN, textArrN_a are the text array, its used count and its allocated count. */ static char *bf; static int bfN_a = 0, ofs=0; static gboolean b_pinyin; int *phidx, *sidx, phcount; int bfsize, phidxsize; u_char *sf; gboolean is_gtab, gtabkey64; int phsz, hash_shift; int (*key_cmp)(char *a, char *b, char len); char **textArr; int textArrN = 0, textArrN_a; int *textPhyOfs; FILE *fw; /* Compare len consecutive 4-byte keys of a and b. Each key is copied into a native u_int with memcpy, so the buffers need not be aligned. Returns 1 or -1 at the first differing key (a greater or a smaller), 0 when all len keys are equal. */ int key_cmp32(char *a, char *b, char len) { u_char i; for(i=0; i < len; i++) { u_int ka,kb; memcpy(&ka, a, 4); memcpy(&kb, b, 4); if (ka > kb) return 1; if (kb > ka) return -1; a+=4; b+=4; } return 0; } /* Same as key_cmp32 but for 8-byte keys read as u_int64_t. */ int key_cmp64(char *a, char *b, char len) { u_char i; for(i=0; i < len; i++) { u_int64_t ka,kb; memcpy(&ka, a, 8); memcpy(&kb, b, 8); if (ka > kb) return 1; if (kb > ka) return -1; a+=8; b+=8; } return 0; } /* qsort comparator: a and b point to int offsets into bf. Each record is read as a 1-byte key count, a usecount_t, an int index into textArr, then the keys. Order: keys over the shorter count via key_cmp, then key count, then strlen of the text, then memcmp, then higher usecount first. NOTE(review): the advance of pa and pb past the keys is commented out, so the memcmp compares tlena bytes starting at the keys rather than at any text -- confirm intent. */ static int qcmp(const void *a, const void *b) { int idxa=*((int *)a); char *pa = (char *)&bf[idxa]; int idxb=*((int *)b); char *pb = (char *)&bf[idxb]; char lena,lenb, len; usecount_t usecounta, usecountb; int text_idxa, text_idxb; lena=*(pa++); memcpy(&usecounta, pa, sizeof(usecount_t)); pa+= sizeof(usecount_t);memcpy(&text_idxa, pa, sizeof(text_idxa)); pa+=sizeof(text_idxa); char *ka = pa; // pa += lena * phsz; lenb=*(pb++); memcpy(&usecountb, pb, sizeof(usecount_t)); pb+= sizeof(usecount_t);memcpy(&text_idxb, pb, sizeof(text_idxb)); pb+=sizeof(text_idxb); char *kb = pb; // pb += lenb * phsz; len=Min(lena,lenb); int d = (*key_cmp)(ka, kb, len); if (d) return d; if (lena > lenb) return 1; if (lena < lenb) return -1; int tlena = strlen(textArr[text_idxa]); int tlenb = strlen(textArr[text_idxb]); if (tlena > tlenb) return 1; if (tlena < tlenb) return -1; if ((d=memcmp(pa, pb, tlena))) return d; // large first, so large one will be kept after delete return usecountb - usecounta; } /* Comparator over int offsets into bf, like qcmp, except that it takes the absolute value of each count byte, skips usecount without reading it, and uses memcmp(pa, pb, tlena) as the final tie-break. NOTE(review): pa and pb still point at the keys here as well -- confirm. */ static int qcmp_eq(const void *a, const void *b) { int idxa=*((int *)a); char *pa = (char 
*)&bf[idxa]; int idxb=*((int *)b); char *pb = (char *)&bf[idxb]; char lena,lenb, len; int text_idxa, text_idxb; lena=*(pa++); if (lena < 0) lena = -lena; pa+= sizeof(usecount_t);memcpy(&text_idxa, pa, sizeof(text_idxa)); pa+=sizeof(text_idxa); char *ka = pa; // pa += lena * phsz; lenb=*(pb++); if (lenb < 0) lenb = -lenb; pb+= sizeof(usecount_t);memcpy(&text_idxb, pb, sizeof(text_idxb)); pb+=sizeof(text_idxb); char *kb = pb; // pb += lenb * phsz; len=Min(lena,lenb); int d = (*key_cmp)(ka, kb, len); if (d) return d; if (lena > lenb) return 1; if (lena < lenb) return -1; int tlena = strlen(textArr[text_idxa]); int tlenb = strlen(textArr[text_idxb]); if (tlena > tlenb) return 1; if (tlena < tlenb) return -1; return memcmp(pa, pb, tlena); } /* Comparator over int offsets into sf (not bf). Order: keys over the shorter count via key_cmp, then key count, then strlen of the text, then higher usecount first. */ static int qcmp_usecount(const void *a, const void *b) { int idxa=*((int *)a); char *pa = (char *)&sf[idxa]; int idxb=*((int *)b); char *pb = (char *)&sf[idxb]; char lena,lenb, len; usecount_t usecounta, usecountb; int text_idxa, text_idxb; lena=*(pa++); memcpy(&usecounta, pa, sizeof(usecount_t)); pa+= sizeof(usecount_t); memcpy(&text_idxa, pa, sizeof(text_idxa)); pa+=sizeof(text_idxa); lenb=*(pb++); memcpy(&usecountb, pb, sizeof(usecount_t)); pb+= sizeof(usecount_t); memcpy(&text_idxb, pb, sizeof(text_idxb)); pb+=sizeof(text_idxb); len=Min(lena,lenb); int d = (*key_cmp)(pa, pb, len); if (d) return d; #if 0 pa += len*phsz; pb += len*phsz; #endif if (lena > lenb) return 1; if (lena < lenb) return -1; // now lena == lenb int tlena = strlen(textArr[text_idxa]); int tlenb = strlen(textArr[text_idxb]); if (tlena > tlenb) return 1; if (tlena < tlenb) return -1; return usecountb - usecounta; } /* Prototypes of functions defined outside the visible text. */ void send_gcin_message(Display *dpy, char *s); #if WIN32 && 1 #pragma comment(linker, "/subsystem:\"windows\" /entry:\"mainCRTStartup\"") #endif void init_TableDir(); /* Comparator for arrays of char pointers: strcmp on the pointed-to strings. Used for both the qsort and the bsearch of textArr. */ static int qcmp_strcmp(const void *aa, const void *bb) { const char **a = (const char **)aa; const char **b = (const char **)bb; return strcmp(*a, *b); } /* Binary-search textArr for s using qcmp_strcmp, so textArr must already be sorted with that comparator. Returns the index of the match or -1 when there is none. */ static int find_text(char *s) { 
char **p = bsearch(&s, textArr, textArrN, sizeof(char *), qcmp_strcmp); if (!p) return -1; return p - textArr; } /* Append one record to bf. Stores the current ofs in phidx (grown by 1024 entries at a time; prints realloc err and calls exit(1) if that realloc fails), grows bf to the needed size plus 1024*1024 bytes, then writes the count byte clen, usecount, the index of chbuf in textArr (p_err when find_text fails) and clen * phsz bytes from cphbuf. chbufN is referenced only inside the disabled #if 0 section and b_en_need_str is not used in this body. NOTE(review): the result of the bf realloc is not checked. */ void add_one_line(char clen, usecount_t usecount, int chbufN, char *cphbuf, u_char *chbuf, gboolean b_en_need_str) { if (phcount >= phidxsize) { phidxsize+=1024; if (!(phidx=(int *)realloc(phidx, phidxsize*sizeof(phidx[0])))) { puts("realloc err"); exit(1); } } phidx[phcount++]=ofs; // dbg("phcount:%d clen:%d\n", phcount, clen); int new_bfN = ofs + 1 + sizeof(usecount)+ sizeof(int) + phsz * clen; if (bfsize < new_bfN) { bfsize = new_bfN + 1024*1024; bf = (char *)realloc(bf, bfsize); } // dbg("clen:%d\n", clen); char oclen = clen; memcpy(&bf[ofs++], &oclen,1); memcpy(&bf[ofs],&usecount, sizeof(usecount_t)); ofs+=sizeof(usecount_t); int text_idx = find_text(chbuf); if (text_idx < 0) p_err("not found '%s'", chbuf); memcpy(&bf[ofs], &text_idx, sizeof(text_idx)); ofs+=sizeof(text_idx); memcpy(&bf[ofs], cphbuf, clen * phsz); ofs+=clen * phsz; #if 0 memcpy(&bf[ofs], chbuf, chbufN); ofs+=chbufN; #endif } /* Returns nonzero when the records at sf[sidx[idxa]] and sf[sidx[idxb]] agree, according to key_cmp, on their first min(lena, lenb, preLen) keys. usecounta and usecountb are declared but never used. */ static int prefix_eq(int idxa, int idxb, int preLen) { char *pa = (char *)&sf[sidx[idxa]]; char *pb = (char *)&sf[sidx[idxb]]; char lena,lenb, len; usecount_t usecounta, usecountb; int text_idxa, text_idxb; lena=*(pa++); pa+= sizeof(usecount_t)+sizeof(text_idxa); char *ka = pa; pa += lena * phsz; lenb=*(pb++); pb+= sizeof(usecount_t)+sizeof(text_idxb); char *kb = pb; pb += lenb * phsz; len=Min(lena,lenb); if (len > preLen) len = preLen; return (*key_cmp)(ka, kb, len)==0; } /* Visible part only: rejects start >= end through p_err, seeks to the end of fw, remembers that offset, writes a zeroed BLOCK_HEAD there and zeroes a GNODE before looping from start. The remainder of the function is not present in this text. */ int gen_tree(int start, int end, int prelen) { // dbg("gen_tree %d %d %d\n", start, end, prelen); int prelen1 = prelen+1; if (start>=end) p_err("error found %d %d", start, end); // start is always included fseek(fw, 0, SEEK_END); int start_ofs = ftell(fw); BLOCK_HEAD bh; bzero(&bh, sizeof(bh)); fwrite(&bh, sizeof(bh), 1, fw); GNODE gn; bzero(&gn, sizeof(gn)); for(int i=start;i /* NOTE(review): source text is missing between here and the next token; the rest of gen_tree and the header of the following function are lost. */ 32) gtabkey64 = TRUE; } /* From here on: fragment of a later function whose header is not visible. For gtab input it selects 8-byte keys with key_cmp64 when gtabkey64 is set, otherwise 4-byte keys with key_cmp32, and derives last_k_bitn from the key width and keybits. */ INMD inmd, *cur_inmd = &inmd; char *cphbuf; if (is_gtab) { cur_inmd->keybits = keybits; if 
(gtabkey64) { cphbuf = (char *)phbuf64; phsz = 8; key_cmp = key_cmp64; hash_shift = TSIN_HASH_SHIFT_64; cur_inmd->key64 = TRUE; } else { cphbuf = (char *)phbuf32; phsz = 4; hash_shift = TSIN_HASH_SHIFT_32; key_cmp = key_cmp32; cur_inmd->key64 = FALSE; } cur_inmd->last_k_bitn = (((cur_inmd->key64 ? 64:32) / cur_inmd->keybits) - 1) * cur_inmd->keybits; dbg("cur_inmd->last_k_bitn %d\n", cur_inmd->last_k_bitn); } dbg("phsz: %d\n", phsz); /* Remember the current position of fp, then read fp line by line and collect the text before the first space of each usable line into textArr. */ fofs = ftell(fp); while (!feof(fp)) { usecount_t usecount=0; lineCnt++; myfgets((char *)s,sizeof(s),fp); len=strlen((char *)s); /* Skip lines starting with # and lines containing TSIN_GTAB_KEY or TSIN_EN_WORD_KEY. */ if (s[0]=='#') continue; if (strstr(s, TSIN_GTAB_KEY) || strstr(s, TSIN_EN_WORD_KEY)) continue; /* NOTE(review): s[len-1] is read before the len==0 check below; if myfgets can leave s empty this reads s[-1] -- confirm. */ if (s[len-1]=='\n') s[--len]=0; if (len==0) { dbg("len==0\n"); continue; } /* Lines without a space are ignored; otherwise the line is cut at its first space. */ char *p = strchr(s, ' '); if (!p) continue; *p = 0; /* Grow textArr by 1024 slots when full and store a strdup copy of the text. */ if (textArrN >= textArrN_a) { textArrN_a += 1024; textArr = trealloc(textArr, char *, textArrN_a); } textArr[textArrN++]=strdup(s); } dbg("textArrN %d\n", textArrN); /* Sort with qcmp_strcmp, the same comparator find_text() passes to bsearch. */ qsort(textArr, textArrN, sizeof(char *), qcmp_strcmp); int ntextArrN=1; for(int i=1;i