#include #include extern char **argvec; #include "str.h" #include "config.h" #include "random.h" #include "externs.h" #include "strings.h" #define MAXSHORT 127 /* max length for short-string code */ /* strings 0 long use (char *)0 */ /* strings 1 long use one of 255 statically-allocated strings */ /* strings 2-MAXSHORT long use a block that's the minimal number of grains */ /* strings over MAXSHORT long allocate in sizeof(HDR) grains from an arena, and are referred to by handles. */ /* strings matching those read at startup just point to them */ /* the low two bits of the returned pointer are flags: ...xxx00 means a non-duplicate string (<=MAXSHORT), possibly len=1 ...xxx01 means a non-duplicate handle (>MAXSHORT) ...xxx10 means a duplicate string, pointer points to the DUPSTR ...xxx11 is an error This, obviously, means this whole paradigm depends on pointers being "just funny ints". For machines where this isn't true, this file needs a rethink at the very least. Also, this assumes that any valid DUPSTR * has its low two bits clear. */ typedef struct lumps LUMPS; typedef struct hdr HDR; typedef struct strlist STRLIST; typedef struct dupstr DUPSTR; struct lumps { void *free; int size; } ; struct hdr { unsigned int size : 29; unsigned int free : 1; unsigned int first : 1; unsigned int last : 1; HDR *next; HDR *prev; } ; struct strlist { int height; int count; STRLIST *ptrs[1]; /* actually, [.height] */ } ; #define SL_LINK(slp,n) ((slp)->ptrs[n]) #define SL_STR(slp) ((char *)&(slp)->ptrs[(slp)->height]) #define MAXHEIGHT 16 #define RATIO 2 struct dupstr { DUPSTR *left; DUPSTR *right; unsigned long int hash; char *s; } ; #define MINHOLE (((MAXSHORT+1)/sizeof(hdr))+1) /* min # grains in free blk */ static unsigned int grain; static unsigned int grainshift; static char *_1_strings[255]; static int nshorts; static LUMPS *shorts; static void *malthread; static void *freehandles; static HDR arena[2]; static HDR *hand; static STRLIST *slhead[MAXHEIGHT]; static DUPSTR *duproot; static void *dsfree; static void *smalloc_(int nb, int *sp, const char *file, int line) #define smalloc(nb,sp) smalloc_((nb),(sp),__FILE__,__LINE__) { void *x; int rs; x = muck_malloc_roundup(nb+sizeof(void *),&rs,file,line); *(void **)x = malthread; malthread = x; *sp = rs - sizeof(void *); return(((char *)x)+sizeof(void *)); } static void tfreeall(void) { void *x; while (malthread) { x = malthread; malthread = *(void **)x; free(x); } } /* static unsigned int power_2(unsigned int n) { while (n & (n-1)) n += n & ~(n-1); return(n); } */ static void *pop_lump(LUMPS *l) { char *x; if (! l->free) { int nb; char *f; nb = l->size * 500; if (nb < 16000) nb = 16000; x = smalloc(nb,&nb); while (((int)x) & 3) { x ++; nb --; } f = l->free; while (nb >= l->size) { *(void **)x = f; f = x; x += l->size; nb -= l->size; } l->free = f; while (nb >= grain) { *(void **)x = freehandles; freehandles = x; x += grain; nb -= grain; } } x = l->free; l->free = *(void **)x; return(x); } static void push_lump(void *x, LUMPS *l) { *(void **)x = l->free; l->free = x; } static void *gethandle(void) { char *x; if (! freehandles) { int nb; x = smalloc(8000,&nb); while (((int)x) & 3) { x ++; nb --; } while (nb >= grain) { *(void **)x = freehandles; freehandles = x; x += grain; nb -= grain; } } x = freehandles; freehandles = *(void **)x; return(x); } static void puthandle(void *h) { *(void **)h = freehandles; freehandles = h; } static HDR *new_arena_block(int n) { char *hcp; HDR *h; n = (n + 2) * sizeof(HDR); hcp = smalloc(n,&n); while (((int)hcp) & 3) { hcp ++; n --; } h = (HDR *) hcp; n /= sizeof(HDR); h->size = n; h->free = 1; h->first = 1; h->last = 1; h->next = &arena[0]; h->prev = arena[0].prev; arena[0].prev = h; h->prev->next = h; h[1].next = &arena[0]; h[1].prev = arena[1].prev; arena[1].prev = h; h[1].prev[1].next = h; return(h); } static void *arena_alloc(int ng) { HDR *hand0; hand0 = hand; if (! hand) hand = new_arena_block(ng); while (1) { if (hand->size > ng) { int ngleft; HDR *h; ngleft = hand->size - (ng+1); if (ngleft < 2) { h = hand; hand = hand[1].next; hand[1].prev = h[1].prev; h[1].prev[1].next = hand; } else { h = hand; hand += ng + 1; hand->next = h->next; hand->prev = h; h->next = hand; hand->next->prev = hand; hand->size = ngleft; hand->free = 1; hand->first = 0; hand->last = h->last; h->last = 0; hand[1].next = h[1].next; hand[1].prev = h[1].prev; hand[1].next[1].prev = hand; hand[1].prev[1].next = hand; h->size = ng + 1; h->last = 0; } h->free = 0; return(h+1); } hand = (hand+1)->next; if (hand == hand0) hand = new_arena_block(ng); } } static void arena_free(void *x) { HDR *h; HDR *p; HDR *n; h = ((HDR *)x) - 1; p = h->prev; n = h->next; if (!h->first && p->free) { if (!h->last && n->free) { p->size += h->size + n->size; p->next = n->next; n->next->prev = p; p->last = n->last; p[1].next = n[1].next; n[1].next[1].prev = p; if (hand == n) hand = p; } else { p->size += h->size; p->next = h->next; h->next->prev = p; p->last = h->last; } } else { if (!h->last && n->free) { h->size += n->size; h->next = n->next; n->next->prev = h; h->free = 1; h->last = n->last; h[1].next = n[1].next; h[1].prev = n[1].prev; n[1].prev[1].next = h; n[1].next[1].prev = h; if (hand == n) hand = h; } else { for (n=h->next;!n->free;n=n->next) ; h[1].next = n; h[1].prev = n[1].prev; n[1].prev = h; h[1].prev[1].next = h; h->free = 1; if (hand == n) hand = h; } } } static int arena_memused(void *x) { return((((HDR *)x)-1)->size*sizeof(HDR)); } static void dump_arena_h(HDR *h) { fprintf(stderr,"%p: size=%u free=%u first=%u last=%u next=%p prev=%p",(void *)h,h->size,h->free,h->first,h->last,(void *)h->next,(void *)h->prev); if (h->free) { fprintf(stderr," fnext=%p fprev=%p",(void *)h[1].next,(void *)h[1].prev); } fprintf(stderr,"\n"); } /* This routine is largely for calling from debuggers */ void dump_arena(void); void dump_arena(void) { HDR *h; dump_arena_h(&arena[0]); for (h=arena[0].next;h!=&arena[0];h=h->next) dump_arena_h(h); fflush(stderr); } /* Vaguely CRCish hash function for strings */ /* (No, CRC it ain't - I XOR in eight bits, then do one bit of CRC) */ static unsigned long int compute_hash(const void *sarg) { register unsigned long int h; register const unsigned char *s; h = 0; s = sarg; while (*s) { h ^= *s++; h = (h >> 1) ^ ((h & 1) ? 0xedb88320 : 0); } return(h); } static int dscmp(DUPSTR *d, unsigned long int h, const char *s) { if (d->hash < h) return(-1); if (d->hash > h) return(1); return(strcmp(d->s,s)); } static DUPSTR *mergelist(DUPSTR *l1, DUPSTR *l2) { DUPSTR *list; DUPSTR **lt; lt = &list; while (l1 || l2) { if (!l2 || (l1 && (dscmp(l1,l2->hash,l2->s) < 0))) { *lt = l1; lt = &l1->left; l1 = l1->left; } else { *lt = l2; lt = &l2->left; l2 = l2->left; } } *lt = 0; return(list); } static DUPSTR *sortlist(DUPSTR *list) { DUPSTR *l[2]; DUPSTR *e; int flip; l[0] = 0; l[1] = 0; flip = 0; while (list) { e = list; list = e->left; e->left = l[flip]; l[flip] = e; } if (! l[1]) return(l[0]); return(mergelist(sortlist(l[0]),sortlist(l[1]))); } static DUPSTR *maketree(DUPSTR *list, int len) { DUPSTR *l_t; DUPSTR *pivot; int l_n; int r_n; int i; if (len < 1) return(0); r_n = len >> 1; l_n = len - r_n - 1; if (l_n == 0) { l_t = 0; pivot = list; } else { l_t = list; for (i=l_n-1;i>0;i--,list=list->left) ; pivot = list->left; list->left = 0; } pivot->right = maketree(pivot->left,r_n); pivot->left = maketree(l_t,l_n); return(pivot); } static int readfile(FILE *f, void *bytes, int nb) { if (fread(bytes,1,nb,f) != nb) return(1); if (getc(f) != EOF) return(1); return(0); } static int skipline(char **sp, int *nbp) { char *s; int n; s = *sp; n = *nbp; for (;(*s!='\n')&&(n>0);s++,n--) ; if (n <= 0) return(1); *s++ = '\0'; *sp = s; *nbp = n - 1; return(0); } static void load_strings(const char *fn) { FILE *f; DUPSTR *dsv; char *bytes; int dsvx; int fmtver; int n; int nb; int c; f = fopen(fn,"r"); if (f == 0) { fprintf(stderr,"%s: note: can't open strings file %s\n",argvec[0],fn); return; } if (fscanf(f,"%d",&fmtver) != 1) { fclose(f); fprintf(stderr,"%s: warning: invalid strings file %s (no format version)\n",argvec[0],fn); return; } switch (fmtver) { default: fclose(f); fprintf(stderr,"%s: warning: %s: unknown format version %d\n",argvec[0],fn,fmtver); return; break; case 1: if (fscanf(f,"%d%d",&n,&nb) != 2) { fclose(f); fprintf(stderr,"%s: warning: invalid strings file %s (no counts)\n",argvec[0],fn); return; } while (1) { c = getc(f); if (c == EOF) { fclose(f); fprintf(stderr,"%s: warning: invalid strings file %s (missing \\n after counts)\n",argvec[0],fn); return; } if (c == '\n') break; } if (n < 1) { fclose(f); return; } if (sizeof(DUPSTR) & 3) { fclose(f); fprintf(stderr,"%s: sorry, sizeof(DUPSTR) is not a multiple of 4, can't do duplicate string stuff\n",argvec[0]); return; } dsfree = malloc((n*sizeof(DUPSTR))+nb+3); bytes = dsfree; while (((int)bytes) & 3) bytes ++; dsv = (void *)bytes; bytes += n * sizeof(DUPSTR); if (readfile(f,bytes,nb)) { fclose(f); fprintf(stderr,"%s: warning: invalid strings file %s (byte count wrong)\n",argvec[0],fn); free(dsfree); return; } fclose(f); for (dsvx=0;dsvxs = bytes; if (skipline(&bytes,&nb)) { fprintf(stderr,"%s: warning: invalid strings file %s (file too short)\n",argvec[0],fn); free(dsfree); return; } d->hash = compute_hash(d->s); } if (nb != 0) { fprintf(stderr,"%s: warning: invalid strings file %s (file too long)\n",argvec[0],fn); free(dsfree); return; } for (dsvx=1;dsvx>grainshift)!=1;grainshift++) ; if (grain != (1<> grainshift) + 1; shorts = malloc(nshorts*sizeof(LUMPS)); if (liststrings) { for (i=0;i=0;i--) { shorts[i].free = 0; shorts[i].size = (i + 1) << grainshift; } j = ((unsigned long int)&_1_buf[0]) & 3; j = (4 - j) & 3; for (i=0;i<255;i++) { _1_buf[j] = i + 1; _1_buf[j+1] = '\0'; _1_strings[i] = &_1_buf[j]; j += 4; } freehandles = 0; arena[0].size = 2; arena[0].free = 1; arena[0].first = 1; arena[0].last = 1; arena[0].next = &arena[0]; arena[0].prev = &arena[0]; arena[1].next = &arena[0]; arena[1].prev = &arena[0]; hand = 0; duproot = 0; dsfree = 0; if (! liststrings) { load_strings("data/frequent-strings"); } } void close_str(void) { tfreeall(); free(dsfree); } static DUPSTR *dupstr_find(unsigned long int h, const char *s) { DUPSTR *d; int c; d = duproot; while (d) { c = dscmp(d,h,s); if (c == 0) break; d = (c < 0) ? d->right : d->left; } return(d); } STR store_str(const char *s) { unsigned long int h; unsigned int l; char *x; DUPSTR *d; if (!s || !*s) return(NOSTR); if (!s[1]) return(_1_strings[(255&s[0])-1]); if (duproot) { h = compute_hash(s); d = dupstr_find(h,s); if (d) return((STR)(2+(char *)d)); } l = strlen(s); if (l <= MAXSHORT) { x = pop_lump(&shorts[l>>grainshift]); strcpy(x,s); if (3 & (int)x) { fprintf(stderr,"false handle marker 1"); abort(); } return(x); } else { void *h; x = arena_alloc((l/sizeof(HDR))+1); strcpy(x,s); h = gethandle(); *(void **)h = x; if (3 & (int)h) { fprintf(stderr,"false handle marker 2"); abort(); } return((void *)(1+(char *)h)); } } char *fetch_str_(STR s, const char *file, int line) { if (s == NOSTR) return(0); switch (3 & (int)s) { case 0: return(dup_string_(s,file,line)); break; case 1: return(dup_string_(*(void **)(void *)(((char *)s)-1),file,line)); break; case 2: return(dup_string_(((DUPSTR *)(void *)(((char *)s)-2))->s,file,line)); break; } fprintf(stderr,"invalid flag bits\n"); abort(); } void free_str(STR s) { if (s == NOSTR) return; switch (3 & (int)s) { case 0: { int l; l = strlen((char *)s); if (l == 1) return; push_lump(s,&shorts[l>>grainshift]); } break; case 1: { void *h; h = ((char *)s) - 1; arena_free(*(void **)h); puthandle(h); } break; case 2: break; case 3: fprintf(stderr,"invalid flag bits\n"); abort(); break; } } void fputs_str(STR s, FILE *f) { if (s == NOSTR) { disk_write(s,f); return; } switch (3 & (int)s) { case 0: disk_write(s,f); break; case 1: { void *h; h = ((char *)s) - 1; disk_write(*(void **)h,f); } break; case 2: break; case 3: fprintf(stderr,"invalid flag bits\n"); abort(); break; } } STR copy_str(STR s) { if (s == NOSTR) return(s); switch (3 & (int)s) { case 0: return(store_str(s)); break; case 1: { void *h; h = ((char *)s) - 1; return(store_str(*(void **)h)); } break; case 2: return(s); break; } fprintf(stderr,"invalid flag bits\n"); abort(); } int str_memusage(STR s) { if (s == NOSTR) return(0); switch (3 & (int)s) { case 0: if (!((char *)s)[1]) return(0); return(shorts[strlen((char *)s)>>grainshift].size); break; case 1: { void *h; h = ((char *)s) - 1; return(grain+arena_memused(*(void **)h)); } case 2: return(0); /* ??? Not sure what's most useful. */ break; } fprintf(stderr,"invalid flag bits\n"); abort(); } int strs_match(STR s1, STR s2) { if (s1 == s2) return(1); if ((s1 == NOSTR) || (s2 == NOSTR)) return(0); if ((3 & (int)s1) != (3 & (int)s2)) return(0); switch (3 & (int)s1) { case 0: return(!strcmp(s1,s2)); break; case 1: { void *h1; void *h2; h1 = ((char *)s1) - 1; h2 = ((char *)s2) - 1; return(!strcmp(*(void **)h1,*(void **)h2)); } break; case 2: return(s1==s2); break; } fprintf(stderr,"invalid flag bits\n"); abort(); } static STRLIST *strlist_find(const char *s, STRLIST ***pvec) { STRLIST *sl; STRLIST *sl2; int h; int c; for (h=MAXHEIGHT;(h>=0)&&!slhead[h];h--) pvec[h] = &slhead[h]; while (h >= 0) { sl = slhead[h]; c = strcmp(SL_STR(sl),s); if (c == 0) return(sl); if (c < 0) break; pvec[h] = &slhead[h]; h --; } if (h < 0) return(0); while (h >= 0) { while (1) { sl2 = sl->ptrs[h]; if (! sl2) break; c = strcmp(SL_STR(sl2),s); if (c == 0) return(sl2); if (c > 0) break; sl = sl2; } pvec[h] = &sl->ptrs[h]; h --; } return(0); } void record_string(const char *s) { int h; STRLIST *sl; STRLIST *i; STRLIST **ptrs[MAXHEIGHT+1]; if (!s || !s[0] || !s[1]) return; i = strlist_find(s,&ptrs[0]); if (i) { i->count ++; return; } h = 1; while ((h < MAXHEIGHT) && !rnd(RATIO)) h ++; sl = malloc(sizeof(STRLIST)+((h-1)*sizeof(STRLIST *))+strlen(s)+1); sl->height = h; sl->count = 1; strcpy(SL_STR(sl),s); for (h--;h>=0;h--) { sl->ptrs[h] = *ptrs[h]; *ptrs[h] = sl; } } void dump_strings(void) { STRLIST *sl; STRLIST *list; STRLIST **slp; int l; int n; int nb; slp = &list; n = 0; for (sl=slhead[0];sl;sl=sl->ptrs[0]) { if (sl->count < 2) continue; l = strlen(SL_STR(sl)); if (l*(sl->count-1) > (int)sizeof(DUPSTR)) { n ++; nb += l + 1; *slp = sl; slp = &sl->ptrs[0]; } } *slp = 0; printf("1\n%d %d\n",n,nb); for (;list;list=list->ptrs[0]) printf("%s\n",SL_STR(list)); }