/* * Backing implementation which makes snapshotting possible. * * Multi-byte numbers are stored native-endian. * * Bitmaps store a vector of bits bit[N] in ceil(N/8) bytes, with * bit[N] being stored in the 1<<(N&7) bit of byte N>>3 * * A backing "file" is a directory, containing a subdirectory for each * serial number. For each serial number, N the directory named N * (represented as decimal, with no leading 0s), contains: * * admin: a file containing * 8@0: (ESIZE.N) ESIZE for this serial number * 8@8: (ASIZE.N) ASIZE for this serial number * 8@16: (BLOCKS.N) number of blocks in data * ESIZE is the effective size of the virtual backing * file. ASIZE is the amount of space actually stored, * which will always be >= ESIZE. * * present: a file containing ceil(ASIZE.N/(8*512)) blocks of * bitmap. A bit here is 1 if this serial contains data * for this block, 0 if this serial's data for this block * is stored in some lower serial number. Bits beyond the * bit for block ASIZE.N-1, if any, MBZ. * * map: a file containing ceil((ASIZE.N*8)/512) blocks, holding an * array of ASIZE.N 8-byte entries. If the present bit * for a block is nonexistent or clear, its entry here is * indeterminate and irrelevant; if set, its entry gives * the block number for the block's data. * * data: a file containing BLOCKS.N blocks of data. Every block * here must correspond to a real block of data, since * blocks are never deleted, only added. * * A backing "file" directory also contains an empty file, lock, which * serves as an fstat(2) target to tell when two BACKINGs are the same * thing, and as an flock(2) target. * * An uninitialized backing "file" is represented by an empty * directory. */ #include #include #include #include #include #include #include #include #include "stdio-util.h" #include "backing.h" typedef struct serial SERIAL; typedef struct vlist VLIST; typedef struct priv PRIV; struct vlist { VLIST *link; union { int i; unsigned int u; void *v; } ; } ; struct priv { char *basedir; int fd_lock; int nserial; SERIAL **serials; char *detail; int detail_valid; } ; struct serial { PRIV *p; unsigned int serial; int fd_admin; int fd_present; int fd_map; int fd_data; unsigned int esize; unsigned int asize; unsigned int blocks; unsigned char *map_present; } ; static unsigned int pagesize = 0; static unsigned char zblk[512] = { 0 }; /* * Unsigned int divide, rouding towards positive infinity. * * The name icd coems from "integer ceiling divide". */ static unsigned int icd(unsigned int a, unsigned int b) { return((a+(b-1))/b); } /* * Verify that certain assumptions the code makes are true. * * On failure, print a message to stderr and exit(1) - these failures * are not things that can be corrected by a non-coder. * * Ideally, we wouldn't need this. But that would mean either speccing * the data file formats in terms of native data sizes or contriving * the code to (needlessly, on most machines) convert between byte * streams and native integers. It's just a choice of uglinesses; * this is the ugliness I prefer. */ static void test_assumptions(void) { if (pagesize == 0) pagesize = getpagesize(); if (sizeof(unsigned int) != 4) { fprintf(stderr,"unsigned int isn't 4 bytes\n"); exit(1); } if (sizeof(unsigned long long int) != 8) { fprintf(stderr,"unsigned long long int isn't 8 bytes\n"); exit(1); } if ((int)(unsigned int)(unsigned char)~0U != 255) { fprintf(stderr,"unsigned char isn't 8 bits\n"); exit(1); } } /* * Given a directory pathname, returns a VLIST of its contents. * Deleted entries and . and .. are filtered out. If opendir() fails, * a nil pointer is returned (this is indistinguishable from an empty * directory). */ static VLIST *read_dir_contents(const char *dp) { DIR *d; struct dirent *e; VLIST *v; VLIST *l; l = 0; d = opendir(dp); if (! d) return(0); while ((e = readdir(d))) { if ( (e->d_fileno == 0) || ( (e->d_name[0] == '.') && ( (e->d_name[1] == '\0') || ( (e->d_name[1] == '.') && (e->d_name[2] == '\0') ) ) ) ) continue; v = malloc(sizeof(VLIST)); v->v = strdup(&e->d_name[0]); v->link = l; l = v; } closedir(d); return(l); } /* * Given a list of VLISTs using the v member, filter it based on an * int-returning function, witn an arbitrary handler for dropped * entries. */ static VLIST *filter_list_v_f(VLIST *l, int (*fn)(void *), void (*drop)(void *)) { VLIST **vp; VLIST *v; vp = &l; while ((v = *vp)) { if ((*fn)(v->v)) { vp = &v->link; } else { *vp = v->link; (*drop)(v->v); free(v); } } return(l); } /* * Given a list of VLISTs using the v member, filter it based on an * int-returning function, free()ing dropped entries. */ static VLIST *filter_list_v(VLIST *l, int (*fn)(void *)) { return(filter_list_v_f(l,fn,&free)); } /* * Given a list of VLISTs and a comparison function, sort the list. * * (*cmp)(a,b) is expected to return true if a should occur before b in * the resulting list. (If a and b compare equal, it doesn't matter * whether *cmp returns true or false.) */ static VLIST *sort_vlist(VLIST *l, int (*cmp)(VLIST *, VLIST *)) { VLIST *a; VLIST *b; VLIST *t; VLIST **lp; if (!l || !l->link) return(l); a = 0; b = 0; while (l) { t = l; l = l->link; t->link = a; a = b; b = t; } a = sort_vlist(a,cmp); b = sort_vlist(b,cmp); lp = &l; while (a || b) { if (a && (!b || (*cmp)(a,b))) { t = a; a = a->link; } else { t = b; b = b->link; } *lp = t; lp = &t->link; } *lp = 0; return(l); } /* * Shut down a SERIAL, freeing up resources. This means closing open * files and munmap()ping mapped memory. It does not include freeing * the SERIAL itself. */ static void close_serial(SERIAL *s) { if (s->fd_admin >= 0) close(s->fd_admin); if (s->fd_present >= 0) close(s->fd_present); if (s->fd_map >= 0) close(s->fd_map); if (s->fd_data >= 0) close(s->fd_data); if (s->map_present) munmap(s->map_present,icd(s->asize,4096)*512); s->fd_admin = -1; s->fd_present = -1; s->fd_map = -1; s->fd_data = -1; s->map_present = 0; } /* * Open the files for a SERIAL. Assumes the directory already exists. * Opens are performed using (*ofn); nothing beyond opening the files * is done. * * On success, 0 is returned, and s->fd_* are set; on error, (*err) is * called, any successful opens are closed and -1 is returned, with * s->fd_* all set to -1 (errno is not useful). * * If (*err) or (*ofn) throws out, this will leak memory. */ static int open_serial_with(SERIAL *s, int (*ofn)(const char *), void (*err)(const char *, ...)) { __label__ failure; int openit(const char *name) { int fd; char *p; asprintf(&p,"%s/%u/%s",s->p->basedir,s->serial,name); fd = (*ofn)(p); if (fd < 0) { (*err)("can't open %s: %s",p,strerror(errno)); free(p); goto failure; } free(p); return(fd); } s->fd_admin = -1; s->fd_present = -1; s->fd_map = -1; s->fd_data = -1; s->fd_admin = openit("admin"); s->fd_present = openit("present"); s->fd_map = openit("map"); s->fd_data = openit("data"); return(0); failure:; close_serial(s); return(-1); } /* * Load data from the files for an existing SERIAL. The SERIAL must be * new, except that the files must already have been opened. * * On success, returns 0, with the SERIAL set up. On failure, calls * (*err) and returns -1, with the SERIAL in a state suitable for * passing to close_serial(). */ static int load_serial(SERIAL *s, void (*err)(const char *, ...)) { __label__ failure; struct stat stb; struct stat stb2; unsigned int ui; unsigned long long int ulli; auto void fail(void) __attribute__((__noreturn__)); void fail(void) { goto failure; } void mustread(int fd, void *buf, int len, off_t at, const char *what, const char *where) { int n; n = pread(fd,buf,len,at); if (n == len) return; if (n < 0) { (*err)("can't read %s from %s/%u/%s: %s",what,s->p->basedir,s->serial,where,strerror(errno)); } else { (*err)("can't read %s from %s/%u/%s (wanted %d, got %d)",what,s->p->basedir,s->serial,where,len,n); } fail(); } void *mustmap(int fd, off_t at, unsigned int size, const char *what) { void *mmrv; mmrv = mmap(0,size,PROT_READ|PROT_WRITE,MAP_FILE|MAP_SHARED,fd,at); if (mmrv != MAP_FAILED) return(mmrv); (*err)("can't mmap %s/%u/%s (%u at %llu): %s",s->p->basedir,s->serial,what,size,(unsigned long long int)at,strerror(errno)); fail(); } fstat(s->fd_admin,&stb); if (stb.st_size != 24) { (*err)("%s/%u: admin size wrong",s->p->basedir,s->serial); return(-1); } mustread(s->fd_admin,&ulli,8,0,"effective size","admin"); s->esize = ulli; if (s->esize != ulli) { (*err)("%s/%u: effective size too large",s->p->basedir,s->serial); fail(); } mustread(s->fd_admin,&ulli,8,8,"actual size","admin"); s->asize = ulli; if (s->asize != ulli) { (*err)("%s/%u: actual size too large",s->p->basedir,s->serial); fail(); } mustread(s->fd_admin,&ulli,8,16,"data block count","admin"); s->blocks = ulli; if (s->blocks != ulli) { (*err)("%s/%u: data block count too large",s->p->basedir,s->serial); fail(); } if (s->blocks > s->asize) { (*err)("%s/%u: too many blocks for size",s->p->basedir,s->serial); fail(); } fstat(s->fd_present,&stb); fstat(s->fd_map,&stb2); ui = icd(s->asize,4096) * 512; ftruncate(s->fd_present,ui); if (s->asize > 0) s->map_present = mustmap(s->fd_present,0,ui,"present"); ulli = icd(s->asize,64) * 512ULL; if (stb2.st_size < ulli) { unsigned int o; stb2.st_size &= ~(off_t)511; ftruncate(s->fd_map,stb2.st_size); o = stb2.st_size >> (9 - 3); bzero(s->map_present+o,ui-o); } ftruncate(s->fd_map,ulli); return(0); failure:; return(-1); } /* * Open and set up a SERIAL. Assumes the directory already exists. * All the files must also already exist. * * Returns 0 on success or -1, with a call to (*err), on failure. */ static int open_serial(SERIAL *s, void (*err)(const char *, ...)) { int ofn(const char *s) { return(open(s,O_RDWR,0)); } if (open_serial_with(s,&ofn,err) < 0) return(-1); if (load_serial(s,err) < 0) { close_serial(s); return(-1); } return(0); } /* * Open and set up a SERIAL. Assumes the directory already exists. * Files that do not exist will be created; files that do exist will * have their data destroyed. * * Returns 0 on success or -1, with a call to (*err), on failure. */ static int create_serial(SERIAL *s, void (*err)(const char *, ...)) { unsigned long long int ulli; int ofn(const char *s) { return(open(s,O_RDWR|O_CREAT|O_TRUNC,0666)); } void loaderr(const char *fmt __attribute__((__unused__)), ...) { } if (open_serial_with(s,&ofn,err) < 0) return(-1); ftruncate(s->fd_admin,24); ulli = 1; pwrite(s->fd_admin,&ulli,8,0); ulli = 1; pwrite(s->fd_admin,&ulli,8,8); ulli = 1; pwrite(s->fd_admin,&ulli,8,16); ftruncate(s->fd_present,0); ftruncate(s->fd_present,512); ftruncate(s->fd_map,0); ftruncate(s->fd_map,512); ftruncate(s->fd_data,0); ftruncate(s->fd_data,512); if (load_serial(s,&loaderr) < 0) { (*err)("impossible create load failure"); close_serial(s); return(-1); } return(0); } /* * Find the next available serial number in p's directory, starting at * s->serial. Create the relevant directory and create_serial() it. * * On success, returns 0. On failure, calls (*err) and returns -1. */ static int new_next_serial(PRIV *p, SERIAL *s, void (*err)(const char *, ...)) { char *db; db = 0; while (1) { free(db); asprintf(&db,"%s/%u",p->basedir,s->serial); if (mkdir(db,0777) < 0) { if (errno == EEXIST) { s->serial ++; continue; } (*err)("can't create new serial %u in %s: %s",p->basedir,s->serial,strerror(errno)); free(db); return(-1); } free(db); if (create_serial(s,err) == 0) return(0); return(-1); } } /* * Return a new PRIV, ie, one with all its members initialized to * suitably empty values. */ static PRIV *new_priv(void) { PRIV *p; p = malloc(sizeof(PRIV)); p->basedir = 0; p->fd_lock = -1; p->nserial = 0; p->serials = 0; p->detail = 0; p->detail_valid = 0; return(p); } /* * Return a new SERIAL, ie, one with all its members initialized to * suitably empty values. */ static SERIAL *new_serial(void) { SERIAL *s; s = malloc(sizeof(SERIAL)); s->p = 0; s->fd_admin = -1; s->fd_present = -1; s->fd_map = -1; s->fd_data = -1; s->map_present = 0; return(s); } /* * Just like atoi() except for unsigned int. */ static unsigned int atou(const char *s) { return(strtoul(s,0,10)); } /* * A comparison function for sort_vlist to sort in ascending order by * the u member. */ static int vlist_cmp_u(VLIST *a, VLIST *b) { return(a->uu); } /* * Actually write a block of data (from data) to block blkno in SERIAL * s. This assumes size checking has already been taken care of. */ static int write_it(SERIAL *s, const void *data, unsigned int blkno) { unsigned char *bp; unsigned char bb; unsigned long long int m; int rv; bp = s->map_present + (blkno >> 3); bb = 1 << (blkno & 7); if (bb & *bp) { rv = pread(s->fd_map,&m,8,blkno*8ULL); if (rv != 8) { if (rv >= 0) errno = EIO; return(-1); } if (m >= s->blocks) { /* ??? How'd this happen? */ *bp &= ~bb; errno = EIO; return(-1); } } else { s->blocks ++; m = s->blocks; rv = pwrite(s->fd_admin,&m,8,8); if (rv != 8) { if (rv >= 0) errno = EIO; return(-1); } m --; rv = pwrite(s->fd_map,&m,8,blkno*8ULL); if (rv != 8) { if (rv >= 0) errno = EIO; return(-1); } } rv = pwrite(s->fd_data,data,512,m*512); if (rv == 512) return(512); if (rv >= 0) errno = EIO; return(-1); } /* * Grow s to include at least nblks blocks. This affects the effective * size. * * This means setting the size value in the admin file, plus making * sure the present and map files are big enough. This may involve * re-mmapping the present file. * * On success, returns 0. On failure, returns -1 with errno set * appropriately. */ static int grow_maps(SERIAL *s, unsigned int nblks) { unsigned long long int ulli; unsigned int oldpb; unsigned int newpb; void *mmrv; int rv; unsigned int b; oldpb = icd(s->asize,4096) * 512; newpb = icd(nblks,4096) * 512; if (newpb > oldpb) { msync(s->map_present,oldpb,MS_ASYNC); ftruncate(s->fd_present,newpb); mmrv = mmap(0,newpb,PROT_READ|PROT_WRITE,MAP_FILE|MAP_SHARED,s->fd_present,0); if (mmrv == MAP_FAILED) return(-1); munmap(s->map_present,oldpb); s->map_present = mmrv; } if (nblks > s->asize) { ftruncate(s->fd_map,icd(nblks,64)*512ULL); s->asize = nblks; } for (b=s->esize;besize = nblks; ulli = s->esize; rv = pwrite(s->fd_admin,&ulli,8,0); if (rv != 8) { if (rv >= 0) errno = EIO; return(-1); } ulli = s->asize; rv = pwrite(s->fd_admin,&ulli,8,8); if (rv != 8) { if (rv >= 0) errno = EIO; return(-1); } return(0); } /* * Open a backing "file". This is responsible for loading all existing * serial subdirs and figuring out the most recent serial number. If * the directory contains no openable serials at all, this creates a * new one with the lowest available number and uses it. */ static void *snapshot_open(const char *fn, void (*err)(const char *, ...)) { char *path; PRIV *p; int fd; VLIST *contents; VLIST *v; VLIST **vp; int n; SERIAL *s; int x; int drop_admin(void *s) { return(strcmp(s,"lock")); } test_assumptions(); asprintf(&path,"%s/lock",fn); fd = open(path,O_RDWR,0); if (fd < 0) { if (errno != ENOENT) { (*err)("can't open %s: %s",path,strerror(errno)); return(0); } fd = open(path,O_RDWR|O_CREAT|O_EXCL,0600); if (fd < 0) { (*err)("can't create %s: %s",path,strerror(errno)); return(0); } } contents = filter_list_v(read_dir_contents(fn),&drop_admin); n = 0; vp = &contents; while ((v = *vp)) { unsigned int s; char *ss; s = atou(v->v); asprintf(&ss,"%d",n); if (strcmp(ss,v->v) && (n > 0)) { *vp = v->link; free(v->v); free(v); } else { free(v->v); v->u = s; vp = &v->link; n ++; } free(ss); } contents = sort_vlist(contents,&vlist_cmp_u); p = new_priv(); p->basedir = strdup(fn); p->fd_lock = fd; p->serials = malloc(n*sizeof(SERIAL *)); for (x=n-1;x>=0;x--) p->serials[x] = 0; p->nserial = n; x = 0; while (contents) { v = contents; contents = contents->link; s = new_serial(); s->p = p; s->serial = v->u; free(v); if (open_serial(s,err) < 0) { free(s); n --; } else { p->serials[x++] = s; } } if (n < 1) { free(p->serials); p->nserial = 1; p->serials = malloc(sizeof(SERIAL *)); s = new_serial(); p->serials[0] = 0; s->p = p; s->serial = 1; if (new_next_serial(p,s,err) < 0) { free(s); (*backing_snapshot.done)(p); return(0); } p->serials[0] = s; n = 1; } p->nserial = n; return(p); } /* * Close down and free up a PRIV. */ static void snapshot_done(void *pv) { int i; PRIV *p; if (! pv) return; p = pv; for (i=p->nserial-1;i>=0;i--) { SERIAL *s; s = p->serials[i]; if (s) { close_serial(s); free(s); } } free(p->serials); close(p->fd_lock); free(p->basedir); free(p->detail); free(p); } /* * Lock a PRIV. Since this exists just to prevent two runs from * colliding over the same backing data, we just pick a distinguished * file (the admin file) and lock it. */ static int snapshot_flock(void *pv, int op) { return(flock(((PRIV *)pv)->fd_lock,op)); } /* * Write the data pointed to by data to block number blkno in p. * On success, returns 512; on failure, returns -1 and sets errno. */ static int snapshot_write(void *pv, unsigned int blkno, const void *data) { PRIV *p; SERIAL *s; p = pv; s = p->serials[p->nserial-1]; if (blkno >= s->esize) { if (grow_maps(s,blkno+1) < 0) return(-1); } return(write_it(s,data,blkno)); } /* * Read the data for blocks starting at blkno, for a total of nblks * blocks, into data. * * On success, returns nblks*512. On failure, returns -1 with errno * set appropriately. */ static int snapshot_read(void *pv, unsigned int blkno, unsigned int nblks, void *data) { PRIV *p; unsigned char *dp; int sx; SERIAL *s; unsigned long long int m; int rv; unsigned int nb; p = pv; dp = data; for <"blocks"> (nb=nblks;nb>0;blkno++,nb--,dp+=512) { for (sx=p->nserial-1;sx>=0;sx--) { s = p->serials[sx]; if ((blkno < s->esize) && (s->map_present[blkno>>3] & (1<<(blkno&7)))) { rv = pread(s->fd_map,&m,8,blkno*8ULL); if (rv != 8) { if (rv >= 0) errno = EIO; return(-1); } rv = pread(s->fd_data,dp,512,m*512); if (rv != 512) { if (rv >= 0) errno = EIO; return(-1); } continue <"blocks">; } } fprintf(stderr,"warning: synthesizing block of NULs for %s block %u\n",p->basedir,blkno); bzero(dp,512); } return(nblks*512); } /* * Set the PRIV's current size to nblks blocks. */ static void snapshot_set_size(void *pv, unsigned int nblks) { PRIV *p; SERIAL *s; p = pv; s = p->serials[p->nserial-1]; if (nblks > s->esize) grow_maps(s,nblks); } /* * Return a string giving internal data of potential value to debuggers * looking at live processes. */ static const char *snapshot_detail(void *pv) { PRIV *p; p = pv; if (! p->detail_valid) { char *str; FILE *f; int sx; SERIAL *s; f = fopenalloc(&str,0); fprintf(f,"%d",p->fd_lock); for (sx=p->nserial-1;sx>=0;sx--) { s = p->serials[sx]; fprintf(f,"-%u[%d/%d/%d/%d]",s->serial,s->fd_admin,s->fd_present,s->fd_map,s->fd_data); } fclose(f); free(p->detail); p->detail = str; p->detail_valid = 1; } return(p->detail); } /* * Return a struct stat which is suitable for comparing whether two * backing "file"s are really the same. * * Return semantics are as for fstat(2). */ static int snapshot_fstat(void *pv, struct stat *stb) { return(fstat(((PRIV *)pv)->fd_lock,stb)); } /* * Move a PRIV from *fp to *tp. */ static void snapshot_move(void **fvp, void **tvp) { (*backing_snapshot.done)(*tvp); *tvp = *fvp; *fvp = 0; } const BACKING backing_snapshot = BACKING_INIT(snapshot);