/* * Resize a filesystem. Is capable of both growing and shrinking. * * Usage: fsresize filesystem newsize * * Example: fsresize /dev/rsd1e 29574 * * newsize is in DEV_BSIZE units (ie, disk sectors, usually 512 bytes * each). * * Note: this currently requires gcc to build, since it is written * depending on gcc-specific features, notably nested function * definitions (which in at least a few cases depend on the lexical * scoping gcc provides, so they can't be trivially moved outside). * * It will not do anything useful with filesystems in other than * host-native byte order. This really should be fixed (it's largely * a historical accident; the original version of this program is * older than bi-endian support in FFS). * * Many thanks go to John Kohl for finding bugs: the * one responsible for the "realloccgblk: can't find blk in cyl" * problem and a more minor one which left fs_dsize wrong when * shrinking. (These actually indicate bugs in fsck too - it should * have caught and fixed them.) * * As its sole author, I explicitly place this code in the public * domain. Anyone may use it for any purpose (though I would * appreciate credit where it is due). 
* * der Mouse * * mouse@rodents.montreal.qc.ca * 7D C8 61 52 5D E7 2D 39 4E F1 31 3E E8 B3 27 4B */ #include #include #include #include #include #include #include #include #include #include #include #include #include extern const char *__progname; /* Patchup for systems that don't yet have __progname */ #ifdef NO_PROGNAME const char *__progname; int main(int, char **); int main_(int, char **); int main(int ac, char **av) #define main main_ { __progname = av[0]; return(main(ac,av)); } #endif /* Suppress warnings about unused arguments */ #if defined(__GNUC__) && \ ( (__GNUC__ > 2) || \ ( (__GNUC__ == 2) && \ defined(__GNUC_MINOR__) && \ (__GNUC_MINOR__ >= 7) ) ) #define UNUSED_ARG(x) x __attribute__((__unused__)) #else #define UNUSED_ARG(x) x #endif /* new size of filesystem, in sectors */ static int newsize; /* fd open onto disk device */ static int fd; /* must we break up big I/O operations - see checksmallio() */ static int smallio; /* size of a cg, in bytes, rounded up to a frag boundary */ static int cgblksz; /* Superblocks. */ static struct fs *oldsb; /* before we started */ static struct fs *newsb; /* copy to work with */ /* Buffer to hold the above. Make sure it's aligned correctly. */ static char sbbuf[2*SBSIZE] __attribute__((__aligned__(__alignof__(struct fs)))); /* a cg's worth of brand new squeaky-clean inodes */ static struct dinode *zinodes; /* pointers to the in-core cgs, read off disk and possibly modified */ static struct cg **cgs; /* pointer to csum array - the stuff pointed to on-disk by fs_csaddr */ static struct csum *csums; /* per-cg flags, indexed by cg number */ static unsigned char *cgflags; #define CGF_DIRTY 0x01 /* needs to be written to disk */ #define CGF_BLKMAPS 0x02 /* block bitmaps need rebuilding */ #define CGF_INOMAPS 0x04 /* inode bitmaps need rebuilding */ /* when shrinking, these two arrays record how we want blocks to move. */ /* if blkmove[i] is j, the frag that started out as frag #i should end */ /* up as frag #j. 
inomove[i]=j means, similarly, that the inode that */ /* started out as inode i should end up as inode j. */ static unsigned int *blkmove; static unsigned int *inomove; /* We'd *like* to keep all inodes in-core. If there are few enough to */ /* make this practical, we do it, in this vector: */ static struct dinode *inodev; /* However, on a big filesystem, the inodes may occupy more memory than */ /* our data size limit. When this happens, we fork subprocesses to */ /* keep the inodes in core and swap them in and out of a LRU cache. */ /* This is done with these variables. */ static int icachen; /* cache size */ static unsigned int *icachenums; /* #s if inodes in cache */ static struct dinode *icache; /* cache inodes */ static unsigned int *icacheuse; /* cache last-use values */ static unsigned int icacheusecnt; /* last last-use value */ static int nprocs; /* number of holder procs */ static int *procfds; /* fds of pipes to holders */ static int iperproc; /* inodes per process */ /* Commands to inode subprocs. */ #define IPROC_CMD_PUSH 1 #define IPROC_CMD_PULL 2 /* Per-inode flags, indexed by inumber. Arguably we ought to do the */ /* same trick as above for inums with these, but don't since we */ /* haven't run into that problem yet. We need one dirty bit per inode */ /* and one dirty bit per block of inodes. We keep them separate so we */ /* can avoid wasting flag bits that never get used. */ static unsigned char *idirty; /* per-inode */ static unsigned char *ibdirty; /* per-block */ #define GETFLAG(vec,inx) (((vec)[(inx)>>3]>>((inx)&7))&1) #define SETFLAG(vec,inx) ((vec)[(inx)>>3]|=1<<((inx)&7)) #define CLRFLAG(vec,inx) ((vec)[(inx)>>3]&=~(1<<((inx)&7))) /* * See if we need to break up large I/O operations. This should never * be needed, but under at least one combination, * large enough disk transfers to the raw device hang. 
So if we're * talking to a character special device, play it safe; in this case, * readat() and writeat() break everything up into pieces no larger * than 8K, doing multiple syscalls for larger operations. */ static void checksmallio(void) { struct stat stb; fstat(fd,&stb); smallio = ((stb.st_mode & S_IFMT) == S_IFCHR); } /* * Read size bytes starting at blkno into buf. blkno is in DEV_BSIZE * units, ie, after fsbtodb(); size is in bytes. */ static void readat(off_t blkno, void *buf, int size) { /* Seek to the correct place. */ if (lseek(fd,blkno*DEV_BSIZE,L_SET) < 0) { fprintf(stderr,"%s: lseek: %s\n",__progname,strerror(errno)); exit(1); } /* See if we have to break up the transfer... */ if (smallio) { char *bp; /* pointer into buf */ int left; /* bytes left to go */ int n; /* number to do this time around */ int rv; /* syscall return value */ bp = buf; left = size; while (left > 0) { n = (left > 8192) ? 8192 : left; rv = read(fd,bp,n); if (rv < 0) { fprintf(stderr,"%s: read: %s\n",__progname,strerror(errno)); exit(1); } if (rv != n) { fprintf(stderr,"%s: read: wanted %d, got %d\n",__progname,n,rv); exit(1); } bp += n; left -= n; } } else { int rv; rv = read(fd,buf,size); if (rv < 0) { fprintf(stderr,"%s: read: %s\n",__progname,strerror(errno)); exit(1); } if (rv != size) { fprintf(stderr,"%s: read: wanted %d, got %d\n",__progname,size,rv); exit(1); } } } /* * Write size bytes from buf starting at blkno. blkno is in DEV_BSIZE * units, ie, after fsbtodb(); size is in bytes. */ static void writeat(off_t blkno, const void *buf, int size) { /* Seek to the correct place. */ if (lseek(fd,blkno*DEV_BSIZE,L_SET) < 0) { fprintf(stderr,"%s: lseek: %s\n",__progname,strerror(errno)); exit(1); } /* See if we have to break up the transfer... */ if (smallio) { const char *bp; /* pointer into buf */ int left; /* bytes left to go */ int n; /* number to do this time around */ int rv; /* syscall return value */ bp = buf; left = size; while (left > 0) { n = (left > 8192) ? 
8192 : left; rv = write(fd,bp,n); if (rv < 0) { fprintf(stderr,"%s: write: %s\n",__progname,strerror(errno)); exit(1); } if (rv != n) { fprintf(stderr,"%s: write: wanted %d, got %d\n",__progname,n,rv); exit(1); } bp += n; left -= n; } } else { int rv; rv = write(fd,buf,size); if (rv < 0) { fprintf(stderr,"%s: write: %s\n",__progname,strerror(errno)); exit(1); } if (rv != size) { fprintf(stderr,"%s: write: wanted %d, got %d\n",__progname,size,rv); exit(1); } } } /* * Never-fail versions of malloc() and realloc(), and an allocation * routine (which also never fails) for allocating memory that will * never be freed until exit. */ /* * Never-fail malloc. */ static void *nfmalloc(size_t nb, const char *tag) { void *rv; rv = malloc(nb); if (rv) return(rv); fprintf(stderr,"%s: can't allocate %lu bytes for %s\n",__progname,(unsigned long int)nb,tag); exit(1); } /* * Never-fail realloc. */ static void *nfrealloc(void *blk, size_t nb, const char *tag) { void *rv; rv = realloc(blk,nb); if (rv) return(rv); fprintf(stderr,"%s: can't reallocate to %lu bytes for %s\n",__progname,(unsigned long int)nb,tag); exit(1); } /* * Allocate memory that will never be freed or reallocated. Arguably * this routine should handle small allocations by chopping up pages, * but that's not worth the bother; it's not called more than a * handful of times per run, and if the allocations are that small the * waste in giving each one its own page is ignorable. */ static void *alloconce(size_t nb, const char *tag, int canfail) { void *rv; rv = mmap(0,nb,PROT_READ|PROT_WRITE,MAP_ANON|MAP_PRIVATE,-1,0); if (rv != MAP_FAILED) return(rv); if (canfail) return(0); fprintf(stderr,"%s: can't allocate %lu bytes for %s\n",__progname,(unsigned long int)nb,tag); exit(1); } /* * Allocate memory for sharing with subprocs. 
*/ static void *allocshared(size_t nb) { void *rv; rv = mmap(0,nb,PROT_READ|PROT_WRITE,MAP_ANON|MAP_SHARED,-1,0); if (rv != MAP_FAILED) return(rv); else return(0); } /* * Load the cgs and csums off disk. Also allocates the space to load * them into and initializes the per-cg flags. */ static void loadcgs(void) { int cg; char *cgp; cgblksz = roundup(oldsb->fs_cgsize,oldsb->fs_fsize); cgs = nfmalloc(oldsb->fs_ncg*sizeof(struct cg *),"cg pointers"); cgp = alloconce(oldsb->fs_ncg*cgblksz,"cgs",0); cgflags = nfmalloc(oldsb->fs_ncg,"cg flags"); csums = nfmalloc(oldsb->fs_cssize,"cg summary"); for (cg=0;cgfs_ncg;cg++) { cgs[cg] = (struct cg *) cgp; readat(fsbtodb(oldsb,cgtod(oldsb,cg)),cgp,cgblksz); cgflags[cg] = 0; cgp += cgblksz; } readat(fsbtodb(oldsb,oldsb->fs_csaddr),csums,oldsb->fs_cssize); } /* * Set n bits, starting with bit #base, in the bitmap pointed to by * bitvec (which is assumed to be large enough to include bits base * through base+n-1). */ static void set_bits(unsigned char *bitvec, unsigned int base, unsigned int n) { if (n < 1) return; /* nothing to do */ if (base & 7) /* partial byte at beginning */ { if (n <= 8-(base&7)) /* entirely within one byte */ { bitvec[base>>3] |= (~((~0U)<>3] |= (~0U) << (base & 7); n -= 8 - (base & 7); base = (base & ~7) + 8; } if (n >= 8) /* do full bytes */ { memset(bitvec+(base>>3),0xff,n>>3); base += n & ~7; n &= 7; } if (n) /* partial byte at end */ { bitvec[base>>3] |= ~((~0U)<>3] &= ~((~((~0U)<>3] &= ~ ((~0U) << (base & 7)); n -= 8 - (base & 7); base = (base & ~7) + 8; } if (n >= 8) { bzero(bitvec+(base>>3),n>>3); base += n & ~7; n &= 7; } if (n) { bitvec[base>>3] &= (~0U) << n; } } /* * Test whether bit #bit is set in the bitmap pointed to by bitvec. */ inline static int bit_is_set(unsigned char *bitvec, int bit) { return(bitvec[bit>>3]&(1<<(bit&7))); } /* * Test whether bit #bit is clear in the bitmap pointed to by bitvec. 
*/ inline static int bit_is_clr(unsigned char *bitvec, int bit) { return(!bit_is_set(bitvec,bit)); } /* * Test whether a whole block of bits is set in a bitmap. This is * designed for testing (aligned) disk blocks in a bit-per-frag * bitmap; it has assumptions wired into it based on that, essentially * that the entire block fits into a single byte. This returns true * iff _all_ the bits are set; it is not just the complement of * blk_is_clr on the same arguments (unless blkfrags==1). */ inline static int blk_is_set(unsigned char *bitvec, int blkbase, int blkfrags) { unsigned int mask; mask = (~((~0U)<>3]&mask)==mask); } /* * Test whether a whole block of bits is clear in a bitmap. See * blk_is_set (above) for assumptions. This returns true iff _all_ * the bits are clear; it is not just the complement of blk_is_set on * the same arguments (unless blkfrags==1). */ inline static int blk_is_clr(unsigned char *bitvec, int blkbase, int blkfrags) { unsigned int mask; mask = (~((~0U)<>3]&mask)==0); } /* * Initialize a new cg. Called when growing. Assumes memory has been * allocated but not otherwise set up. This code sets the fields of * the cg, initializes the bitmaps (and cluster summaries, if * applicable), updates both per-cylinder summary info and the global * summary info in newsb; it also writes out new inodes for the cg. * * This code knows it can never be called for cg 0, which makes it a * bit simpler than it would otherwise be. 
*/ static void initcg(int cgn) { struct cg *cg; /* The in-core cg, of course */ int base; /* Disk address of cg base */ int dlow; /* Size of pre-cg data area */ int dhigh; /* Offset of post-inode data area, from base */ int dmax; /* Offset of end of post-inode data area */ int i; /* Generic loop index */ int n; /* Generic count */ cg = cgs[cgn]; /* Place the data areas */ base = cgbase(newsb,cgn); dlow = cgsblock(newsb,cgn) - base; dhigh = cgdmin(newsb,cgn) - base; dmax = newsb->fs_size - base; if (dmax > newsb->fs_fpg) dmax = newsb->fs_fpg; /* Clear out the cg - assumes all-0-bytes is the correct way to */ /* initialize fields we don't otherwise touch, which is perhaps not */ /* the right thing to do, but it's what fsck and mkfs do. */ bzero(cg,newsb->fs_cgsize); cg->cg_time = newsb->fs_time; cg->cg_magic = CG_MAGIC; cg->cg_cgx = cgn; cg->cg_ncyl = newsb->fs_cpg; /* fsck whines if the cg->cg_ncyl value in the last cg is fs_cpg */ /* instead of zero, when fs_cpg is the correct value. */ /* XXX fix once fsck is fixed */ if ((cgn == newsb->fs_ncg-1) /* && (newsb->fs_ncyl % newsb->fs_cpg) */) { cg->cg_ncyl = newsb->fs_ncyl % newsb->fs_cpg; } cg->cg_niblk = newsb->fs_ipg; cg->cg_ndblk = dmax; /* Set up the bitmap pointers. We have to be careful to lay out the */ /* cg _exactly_ the way mkfs and fsck do it, since fsck compares the */ /* _entire_ cg against a recomputed cg, and whines if there is any */ /* mismatch, including the bitmap offsets. 
*/ /* XXX update this comment when fsck is fixed */ cg->cg_btotoff = &cg->cg_space[0] - (unsigned char *)cg; cg->cg_boff = cg->cg_btotoff + (newsb->fs_cpg * sizeof(int32_t)); cg->cg_iusedoff = cg->cg_boff + (newsb->fs_cpg * newsb->fs_nrpos * sizeof(int16_t)); cg->cg_freeoff = cg->cg_iusedoff + howmany(newsb->fs_ipg,NBBY); if (newsb->fs_contigsumsize > 0) { cg->cg_nclusterblks = cg->cg_ndblk / newsb->fs_frag; cg->cg_clustersumoff = cg->cg_freeoff + howmany(newsb->fs_cpg*newsb->fs_spc/NSPF(newsb),NBBY) - sizeof(int32_t); cg->cg_clustersumoff = roundup(cg->cg_clustersumoff,sizeof(int32_t)); cg->cg_clusteroff = cg->cg_clustersumoff + ((newsb->fs_contigsumsize+1) * sizeof(int32_t)); cg->cg_nextfreeoff = cg->cg_clusteroff + howmany(newsb->fs_cpg*newsb->fs_spc/NSPB(newsb),NBBY); n = dlow / newsb->fs_frag; if (n > 0) { set_bits(cg_clustersfree(cg,0),0,n); cg_clustersum(cg,0)[(n>newsb->fs_contigsumsize)?newsb->fs_contigsumsize:n] ++; } } else { cg->cg_nextfreeoff = cg->cg_freeoff + howmany(newsb->fs_cpg*newsb->fs_spc/NSPF(newsb),NBBY); } /* Mark the data areas as free; everything else is marked busy by the */ /* bzero up at the top. */ set_bits(cg_blksfree(cg,0),0,dlow); set_bits(cg_blksfree(cg,0),dhigh,dmax-dhigh); /* Initialize summary info */ cg->cg_cs.cs_ndir = 0; cg->cg_cs.cs_nifree = newsb->fs_ipg; cg->cg_cs.cs_nbfree = dlow / newsb->fs_frag; cg->cg_cs.cs_nffree = 0; /* This is the simplest way of doing this; we perhaps could compute */ /* the correct cg_blktot()[] and cg_blks()[] values other ways, but */ /* it would be complicated and hardly seems worth the effort. (The */ /* reason there isn't frag-at-beginning and frag-at-end code here, */ /* like the code below for the post-inode data area, is that the */ /* pre-sb data area always starts at 0, and thus is block-aligned, */ /* and always ends at the sb, which is block-aligned.) 
*/ for (i=0;ifs_frag) { cg_blktot(cg,0)[cbtocylno(newsb,i)] ++; cg_blks(newsb,cg,cbtocylno(newsb,i),0)[cbtorpos(newsb,i)] ++; } /* Deal with a partial block at the beginning of the post-inode area. */ /* I'm not convinced this can happen - I think the inodes are always */ /* block-aligned and always an integral number of blocks - but it's */ /* cheap to do the right thing just in case. */ if (dhigh % newsb->fs_frag) { n = newsb->fs_frag - (dhigh % newsb->fs_frag); cg->cg_frsum[n] ++; cg->cg_cs.cs_nffree += n; dhigh += n; } n = (dmax-dhigh) / newsb->fs_frag; /* We have n full-size blocks in the post-inode data area. */ if (n > 0) { cg->cg_cs.cs_nbfree += n; if (newsb->fs_contigsumsize > 0) { i = dhigh / newsb->fs_frag; set_bits(cg_clustersfree(cg,0),i,n); cg_clustersum(cg,0)[(n>newsb->fs_contigsumsize)?newsb->fs_contigsumsize:n] ++; } for (i=n;i>0;i--) { cg_blktot(cg,0)[cbtocylno(newsb,dhigh)] ++; cg_blks(newsb,cg,cbtocylno(newsb,dhigh),0)[cbtorpos(newsb,dhigh)] ++; dhigh += newsb->fs_frag; } } /* Deal with any leftover frag at the end of the cg. */ i = dmax - dhigh; if (i) { cg->cg_frsum[i] ++; cg->cg_cs.cs_nffree += i; } /* Update the csum info. */ csums[cgn] = cg->cg_cs; newsb->fs_cstotal.cs_nffree += cg->cg_cs.cs_nffree; newsb->fs_cstotal.cs_nbfree += cg->cg_cs.cs_nbfree; newsb->fs_cstotal.cs_nifree += cg->cg_cs.cs_nifree; /* Write out the cleared inodes. */ writeat(fsbtodb(newsb,cgimin(newsb,cgn)),zinodes,newsb->fs_ipg*sizeof(struct dinode)); /* Dirty the cg. */ cgflags[cgn] |= CGF_DIRTY; } /* * Find free space, at least nfrags consecutive frags of it. Pays no * attention to block boundaries, but refuses to straddle cg * boundaries, even if the disk blocks involved are in fact * consecutive. Return value is the frag number of the first frag of * the block, or -1 if no space was found. Uses newsb for sb values, * and assumes the cgs[] structures correctly describe the area to be * searched. 
* * If we wrap off the end of the filesystem back to the beginning, we * can end up searching the end of the filesystem twice. I ignore * this inefficiency, since if that happens we're going to croak with * a no-space error anyway, so it happens at most once. */ static int find_freespace_unrestricted(unsigned int nfrags) { static int hand = 0; /* hand rotates through all frags in the fs */ int cgsize; /* size of the cg hand currently points into */ int cgn; /* number of cg hand currently points into */ int fwc; /* frag-within-cg number of frag hand points to */ int run; /* length of run of free frags seen so far */ int secondpass; /* have we wrapped from end of fs to beginning? */ unsigned char *bits; /* cg_blksfree()[] for cg hand points into */ cgn = dtog(newsb,hand); fwc = dtogd(newsb,hand); secondpass = (hand == 0); run = 0; bits = cg_blksfree(cgs[cgn],0); cgsize = cgs[cgn]->cg_ndblk; while (1) { if (bit_is_set(bits,fwc)) { run ++; if (run >= nfrags) return(hand+1-run); } else { run = 0; } hand ++; fwc ++; if (fwc >= cgsize) { fwc = 0; cgn ++; if (cgn >= newsb->fs_ncg) { hand = 0; if (secondpass) return(-1); secondpass = 1; cgn = 0; } bits = cg_blksfree(cgs[cgn],0); cgsize = cgs[cgn]->cg_ndblk; run = 0; } } } /* * Find free space, at least nfrags consecutive frags of it. Refuses * to straddle either block or cg boundaries, even if the disk blocks * involved are in fact consecutive. Return value is the frag number * of the first frag of the block, or -1 if no space was found. Uses * newsb for sb values, and assumes the cgs[] structures correctly * describe the area to be searched. * * This refuses to straddle block boundaries because an end-of-file * less-than-full-block may not do so; see the "blkfree: bad size" * panic in ffs_blkfree(). * * This differs from find_freespace_unrestricted in that it refuses to * straddle block boundaries. See the comment on that function for * some wrapping considerations. 
 * (NOTE(review): the three searchers below each keep a static "hand"
 * between calls, so every search resumes where the previous one
 * stopped; the hand-reset/secondpass ordering at the wrap point is
 * load-bearing -- left byte-identical.)
 */ static int find_freespace_block(unsigned int nfrags) { static int hand = 0; /* hand rotates through all frags in the fs */ int cgsize; /* size of the cg hand currently points into */ int cgn; /* number of cg hand currently points into */ int fwc; /* frag-within-cg number of frag hand points to */ int fwb; /* frag-within-block number of frag hand points to */ int run; /* length of run of free frags seen so far */ int secondpass; /* have we wrapped from end of fs to beginning? */ unsigned char *bits; /* cg_blksfree()[] for cg hand points into */ if (nfrags > newsb->fs_frag) { fprintf(stderr,"find_freespace_block: impossible nfrags %u > %u\n",nfrags,newsb->fs_frag); abort(); } cgn = dtog(newsb,hand); fwc = dtogd(newsb,hand); fwb = hand % newsb->fs_frag; secondpass = (hand == 0); run = 0; bits = cg_blksfree(cgs[cgn],0); cgsize = cgs[cgn]->cg_ndblk; while (1) { if (bit_is_set(bits,fwc)) { run ++; if (run >= nfrags) return(hand+1-run); } else { run = 0; } hand ++; fwc ++; if (fwc >= cgsize) { fwc = 0; fwb = 0; cgn ++; if (cgn >= newsb->fs_ncg) { hand = 0; if (secondpass) return(-1); secondpass = 1; cgn = 0; } bits = cg_blksfree(cgs[cgn],0); cgsize = cgs[cgn]->cg_ndblk; run = 0; } else { fwb ++; if (fwb >= newsb->fs_frag) { fwb = 0; run = 0; } } } } /* * Find a free block of disk space. Finds an entire block of frags, * all of which are free. Return value is the frag number of the * first frag of the block, or -1 if no space was found. Uses newsb * for sb values, and assumes the cgs[] structures correctly describe * the area to be searched. * * See the find_freespace_*() functions, above, for remarks about hand * wrapping around. */ static int find_freeblock(void) { static int hand = 0; /* hand rotates through all frags in fs */ int cgn; /* cg number of cg hand points into */ int fwc; /* frag-within-cg number of frag hand points to */ int cgsize; /* size of cg hand points into */ int secondpass; /* have we wrapped from end to beginning? 
 */ unsigned char *bits; /* cg_blksfree()[] for cg hand points into */ cgn = dtog(newsb,hand); fwc = dtogd(newsb,hand); secondpass = (hand == 0); bits = cg_blksfree(cgs[cgn],0); cgsize = blknum(newsb,cgs[cgn]->cg_ndblk); while (1) { if (blk_is_set(bits,fwc,newsb->fs_frag)) return(hand); fwc += newsb->fs_frag; hand += newsb->fs_frag; if (fwc >= cgsize) { fwc = 0; cgn ++; if (cgn >= newsb->fs_ncg) { hand = 0; if (secondpass) return(-1); secondpass = 1; cgn = 0; } bits = cg_blksfree(cgs[cgn],0); cgsize = blknum(newsb,cgs[cgn]->cg_ndblk); } } } /* * Find a free inode, returning its inumber or -1 if none was found. * Uses newsb for sb values, and assumes the cgs[] structures * correctly describe the area to be searched. * * See the find_freespace_*() functions, above, for remarks about hand * wrapping around. */ static int find_freeinode(void) { static int hand = 0; /* hand rotates through all inodes in fs */ int cgn; /* cg number of cg hand points into */ int iwc; /* inode-within-cg number of inode hand points to */ int secondpass; /* have we wrapped from end to beginning? */ unsigned char *bits; /* cg_inosused()[] for cg hand points into */ cgn = hand / newsb->fs_ipg; iwc = hand % newsb->fs_ipg; secondpass = (hand == 0); bits = cg_inosused(cgs[cgn],0); while (1) { if (bit_is_clr(bits,iwc)) return(hand); hand ++; iwc ++; if (iwc >= newsb->fs_ipg) { iwc = 0; cgn ++; if (cgn >= newsb->fs_ncg) { hand = 0; if (secondpass) return(-1); secondpass = 1; cgn = 0; } bits = cg_inosused(cgs[cgn],0); } } } /* * Mark a frag as free. Sets the frag's bit in the cg_blksfree bitmap * for the appropriate cg, and marks the cg as dirty. */ static void free_frag(int fno) { int cgn; cgn = dtog(newsb,fno); set_bits(cg_blksfree(cgs[cgn],0),dtogd(newsb,fno),1); cgflags[cgn] |= CGF_DIRTY | CGF_BLKMAPS; } /* * Allocate a frag. Clears the frag's bit in the cg_blksfree bitmap * for the appropriate cg, and marks the cg as dirty. 
*/ static void alloc_frag(int fno) { int cgn; cgn = dtog(newsb,fno); clr_bits(cg_blksfree(cgs[cgn],0),dtogd(newsb,fno),1); cgflags[cgn] |= CGF_DIRTY | CGF_BLKMAPS; } /* * Fix up the csum array. If shrinking, this involves freeing zero or * more frags; if growing, it involves allocating them, or if the * frags being grown into aren't free, finding space elsewhere for the * csum info. (If the number of occupied frags doesn't change, * nothing happens here.) */ static void csum_fixup(void) { int nold; /* # frags in old csum info */ int ntot; /* # frags in new csum info */ int nnew; /* ntot-nold */ int newloc; /* new location for csum info, if necessary */ int i; /* generic loop index */ int j; /* generic loop index */ int f; /* "from" frag number, if moving */ int t; /* "to" frag number, if moving */ int cgn; /* cg number, used when shrinking */ ntot = howmany(newsb->fs_cssize,newsb->fs_fsize); nold = howmany(oldsb->fs_cssize,newsb->fs_fsize); nnew = ntot - nold; /* First, if there's no change in frag counts, it's easy. */ if (nnew == 0) return; /* Next, if we're shrinking, it's almost as easy. Just free up any */ /* frags in the old area we no longer need. */ if (nnew < 0) { for ((i=newsb->fs_csaddr+ntot-1),(j=nnew);j<0;i--,j++) { free_frag(i); } return; } /* We must be growing. Check to see that the new csum area fits */ /* within the filesystem. I think this can never happen, since for */ /* the csum area to grow, we must be adding at least one cg, so the */ /* old csum area can't be this close to the end of the new */ /* filesystem. But it's a cheap check. */ /* XXX what if csum info is at end of cg and grows into next cg, what */ /* if it spills over onto the next cg's backup superblock? Can this */ /* happen? */ if (newsb->fs_csaddr+ntot <= newsb->fs_size) { /* Okay, it fits - now, see if the space we want is free. 
*/ for ((i=newsb->fs_csaddr+nold),(j=nnew);j>0;i++,j--) { cgn = dtog(newsb,i); if (bit_is_clr(cg_blksfree(cgs[cgn],0),dtogd(newsb,i))) break; } if (j <= 0) { /* Win win - all the frags we want are free. Allocate 'em and */ /* we're all done. */ for ((i=newsb->fs_csaddr+ntot-nnew),(j=nnew);j>0;i++,j--) { alloc_frag(i); } return; } } /* We have to move the csum info, sigh. Look for new space, free old */ /* space, and allocate new. Update fs_csaddr. We don't copy */ /* anything on disk at this point; the csum info will be written to */ /* the then-current fs_csaddr as part of the final flush. */ newloc = find_freespace_unrestricted(ntot); if (newloc < 0) { printf("Sorry, no space available for new csums\n"); exit(1); } for (i=0,f=newsb->fs_csaddr,t=newloc;ifs_csaddr = newloc; } /* * Recompute newsb->fs_dsize. Just scans all cgs, adding the number of * data blocks in that cg to the total. */ static void recompute_fs_dsize(void) { int i; newsb->fs_dsize = 0; for (i=0;ifs_ncg;i++) { int dlow; /* size of before-sb data area */ int dhigh; /* offset of post-inode data area */ int dmax; /* total size of cg */ int base; /* base of cg, since cgsblock() etc add it in */ base = cgbase(newsb,i); dlow = cgsblock(newsb,i) - base; dhigh = cgdmin(newsb,i) - base; dmax = newsb->fs_size - base; if (dmax > newsb->fs_fpg) dmax = newsb->fs_fpg; newsb->fs_dsize += dlow + dmax - dhigh; } /* Space in cg 0 before cgsblock is boot area, not free space! */ newsb->fs_dsize -= cgsblock(newsb,0) - cgbase(newsb,0); /* And of course the csum info takes up space. */ newsb->fs_dsize -= howmany(newsb->fs_cssize,newsb->fs_fsize); } /* * Return the current time. We call this and assign, rather than * calling time() directly, as insulation against OSes where fs_time * is not a time_t. */ static time_t timestamp(void) { time_t t; time(&t); return(t); } /* * Grow the filesystem. */ static void grow(void) { int i; /* Update the timestamp. 
*/ newsb->fs_time = timestamp(); /* Allocate and clear the new-inode area, in case we add any cgs. */ zinodes = alloconce(newsb->fs_ipg*sizeof(struct dinode),"zeroed inodes",0); bzero(zinodes,newsb->fs_ipg*sizeof(struct dinode)); /* Update the size. */ newsb->fs_size = dbtofsb(newsb,newsize); /* Did we actually not grow? (This can happen if newsize is less than */ /* a frag larger than the old size - unlikely, but no excuse to */ /* misbehave if it happens.) */ if (newsb->fs_size == oldsb->fs_size) return; /* Check that the new last sector (frag, actually) is writable. Since */ /* it's at least one frag larger than it used to be, we know we */ /* aren't overwriting anything important by this. (The choice of */ /* sbbuf as what to write is irrelevant; it's just something handy */ /* that's known to be at least one frag in size.) */ writeat(newsb->fs_size-1,&sbbuf,newsb->fs_fsize); /* Update fs_ncyl and fs_ncg. */ newsb->fs_ncyl = (newsb->fs_size * NSPF(newsb)) / newsb->fs_spc; newsb->fs_ncg = howmany(newsb->fs_ncyl,newsb->fs_cpg); /* Does the last cg end before the end of its inode area? There is no */ /* reason why this couldn't be handled, but it would complicate a lot */ /* of code (in all filesystem code - fsck, kernel, etc) because of */ /* the potential partial inode area, and the gain in space would be */ /* minimal, at most the pre-sb data area. */ if (cgdmin(newsb,newsb->fs_ncg-1) > newsb->fs_size) { newsb->fs_ncg --; newsb->fs_ncyl = newsb->fs_ncg * newsb->fs_cpg; newsb->fs_size = (newsb->fs_ncyl * newsb->fs_spc) / NSPF(newsb); printf("Warning: last cylinder group is too small;\n"); printf(" dropping it. New size = %lu.\n",(unsigned long int)fsbtodb(newsb,newsb->fs_size)); } /* Find out how big the csum area is, and realloc csums if bigger. 
*/ newsb->fs_cssize = fragroundup(newsb,newsb->fs_ncg*sizeof(struct csum)); if (newsb->fs_cssize > oldsb->fs_cssize) csums = nfrealloc(csums,newsb->fs_cssize,"new cg summary"); /* If we're adding any cgs, realloc structures and set up the new cgs. */ if (newsb->fs_ncg > oldsb->fs_ncg) { char *cgp; cgs = nfrealloc(cgs,newsb->fs_ncg*sizeof(struct cg *),"cg pointers"); cgflags = nfrealloc(cgflags,newsb->fs_ncg,"cg flags"); bzero(cgflags+oldsb->fs_ncg,newsb->fs_ncg-oldsb->fs_ncg); cgp = alloconce((newsb->fs_ncg-oldsb->fs_ncg)*cgblksz,"cgs",0); for (i=oldsb->fs_ncg;ifs_ncg;i++) { cgs[i] = (struct cg *) cgp; initcg(i); cgp += cgblksz; } cgs[oldsb->fs_ncg-1]->cg_ncyl = oldsb->fs_cpg; cgflags[oldsb->fs_ncg-1] |= CGF_DIRTY; } /* If the old fs ended partway through a cg, we have to update the old */ /* last cg (though possibly not to a full cg!). */ if (oldsb->fs_size % oldsb->fs_fpg) { struct cg *cg; int newcgsize; int prevcgtop; int oldcgsize; cg = cgs[oldsb->fs_ncg-1]; cgflags[oldsb->fs_ncg-1] |= CGF_DIRTY | CGF_BLKMAPS; prevcgtop = oldsb->fs_fpg * (oldsb->fs_ncg-1); newcgsize = newsb->fs_size - prevcgtop; if (newcgsize > newsb->fs_fpg) newcgsize = newsb->fs_fpg; oldcgsize = oldsb->fs_size % oldsb->fs_fpg; set_bits(cg_blksfree(cg,0),oldcgsize,newcgsize-oldcgsize); cg->cg_ncyl = howmany(newcgsize*NSPF(newsb),newsb->fs_spc); cg->cg_ndblk = newcgsize; } /* Fix up the csum info, if necessary. */ csum_fixup(); /* Make fs_dsize match the new reality. */ recompute_fs_dsize(); } /* * Send a message to an inode subprocess and wait for an ack. 
 * NOTE(review): this region appears damaged by markup stripping -- in
 * several places the text between a '<' and the next '>' has been
 * deleted.  In particular, map_inodes()'s loop body, the MDB_* value
 * definitions, and the opening of map_inode_data_blocks()/markblk()
 * are missing ("for (i=0;i= di->di_size) return(0)" below fuses two
 * functions together), as are the NINDIR/NDADDR/NIADDR loop bounds.
 * Do not expect this to compile; recover the missing text from the
 * original source.  Also NOTE(review): iserver_msg()'s second error
 * message prints (int)sizeof(msg) -- the size of a pointer -- where
 * 3*sizeof(int), the amount actually written, is presumably meant.
 */ static void iserver_msg(int p, const void *msg) { int n; char ack; n = write(procfds[p],msg,3*sizeof(int)); if (n < 0) { fprintf(stderr,"%s: inode server write: %s\n",__progname,strerror(errno)); exit(1); } if (n != 3*sizeof(int)) { fprintf(stderr,"%s: inode server write: wanted %d did %d\n",__progname,(int)sizeof(msg),n); exit(1); } n = read(procfds[p],&ack,1); if (n < 0) { fprintf(stderr,"%s: inode server ack read: %s\n",__progname,strerror(errno)); exit(1); } if (n != 1) { fprintf(stderr,"%s: inode server ack read: wanted %d did %d\n",__progname,1,n); exit(1); } } /* * Push the icache entry in slot s to its process. */ static void icache_pushslot(int s) { unsigned int msg[3]; msg[0] = IPROC_CMD_PUSH; msg[1] = s; msg[2] = icachenums[s] % iperproc; iserver_msg(icachenums[s]/iperproc,&msg); } /* * Fill the icache entry in slot s with inode i. */ static void icache_getslot(int s, int i) { unsigned int msg[3]; msg[0] = IPROC_CMD_PULL; msg[1] = s; msg[2] = i % iperproc; iserver_msg(i/iperproc,&msg); icachenums[s] = i; } /* * Locate and return a pointer to inode #inum. */ static struct dinode *getinode(unsigned int inum) { int i; int lru; if (inodev) return(inodev+inum); lru = icachen - 1; for (i=icachen-1;i>=0;i--) { if (icachenums[i] == inum) { icacheuse[i] = icacheusecnt ++; return(icache+i); } if (icacheuse[i] < icacheuse[lru]) lru = i; } icache_pushslot(lru); icache_getslot(lru,inum); icacheuse[lru] = icacheusecnt ++; return(icache+lru); } /* * Call (*fn)() for each inode, passing the inode and its inumber. The * number of cylinder groups is pased in, so this can be used to map * over either the old or the new filesystem's set of inodes. */ static void map_inodes(void (*fn)(struct dinode *di, unsigned int), int ncg) { int i; int ni; ni = oldsb->fs_ipg * ncg; for (i=0;i= di->di_size) return(0); sz = dblksize(newsb,di,lblkno(newsb,o)); nb = (sz > di->di_size-o) ? 
 di->di_size-o : sz; if (bn) (*fn)(bn,numfrags(newsb,sz),nb,MDB_DATA); return(sz); } /* Helper function - handles an indirect block. Makes the */ /* MDB_INDIR_PRE callback for the indirect block, loops over the */ /* pointers and recurses, and makes the MDB_INDIR_POST callback. */ /* Returns the number of bytes occupied in file, as does markblk(). */ /* For the sake of update_for_data_move(), we read the indirect block */ /* _after_ making the _PRE callback. The name is historical. */ static int markiblk(struct dinode *di, int bn, off_t o, int lev) { int i; int j; int tot; static daddr_t indirblk1[howmany(MAXBSIZE,sizeof(daddr_t))]; static daddr_t indirblk2[howmany(MAXBSIZE,sizeof(daddr_t))]; static daddr_t indirblk3[howmany(MAXBSIZE,sizeof(daddr_t))]; static daddr_t *indirblks[3] = { &indirblk1[0], &indirblk2[0], &indirblk3[0] }; if (lev < 0) return(markblk(di,bn,o)); if (bn == 0) { for (i=newsb->fs_bsize;lev>=0;i*=NINDIR(newsb),lev--) ; return(i); } (*fn)(bn,newsb->fs_frag,newsb->fs_bsize,MDB_INDIR_PRE); readat(fsbtodb(newsb,bn),indirblks[lev],newsb->fs_bsize); tot = 0; for (i=0;ifs_frag,newsb->fs_bsize,MDB_INDIR_POST); return(tot); } /* Scan the direct blocks... */ o = 0; for (b=0;bdi_db[b],o); if (inc == 0) break; o += inc; } /* ...and the indirect blocks. */ if (inc) { for (b=0;bdi_ib[b],o,b); if (inc == 0) return; o += inc; } } } /* * Make a callback call, a la map_inode_data_blocks, for all data * blocks in the entire fs. This is used only once, in * update_for_data_move, but it's out at top level because the complex * downward-funarg nesting that would otherwise result seems to give * gcc gastric distress. 
*/ static void map_data_blocks(void (*fn)(unsigned int, unsigned int, unsigned int, int), int ncg) { map_inodes(({ static void foo(struct dinode *di, UNUSED_ARG(unsigned int inum)) { switch (di->di_mode & IFMT) { case IFLNK: if (di->di_size > newsb->fs_maxsymlinklen) { case IFDIR: case IFREG: map_inode_data_blocks(di,fn); } break; } } &foo; }),ncg); } /* * Initialize the blkmove array. */ static void blkmove_init(void) { int i; blkmove = alloconce(oldsb->fs_size*sizeof(*blkmove),"blkmove",0); for (i=0;ifs_size;i++) blkmove[i] = i; } /* * Load inodes off disk, for inodes-in-our-core scheme. */ static void own_inodes(void) { int cg; struct dinode *iptr; iptr = inodev; for (cg=0;cgfs_ncg;cg++) { readat(fsbtodb(oldsb,cgimin(oldsb,cg)),iptr,oldsb->fs_ipg*sizeof(struct dinode)); iptr += oldsb->fs_ipg; } } /* * Run the inode-server subprocess with parent pipe fd and memory block * buf. The parent process deals with offsetting inode numbers by the * base of our block. */ static void inode_subproc(int fd, struct dinode *buf) { unsigned int cmd[3]; int r; while (1) { r = recv(fd,&cmd,sizeof(cmd),MSG_WAITALL); if (r < 0) { fprintf(stderr,"%s: inode subproc recv: %s\n",__progname,strerror(errno)); exit(1); } if (r == 0) exit(0); if (r != sizeof(cmd)) { fprintf(stderr,"%s: inode subproc recv: wanted %d, got %d\n",__progname,(int)sizeof(cmd),r); exit(1); } switch (cmd[0]) { case IPROC_CMD_PUSH: bcopy(icache+cmd[1],buf+cmd[2],sizeof(struct dinode)); write(fd,"",1); break; case IPROC_CMD_PULL: bcopy(buf+cmd[2],icache+cmd[1],sizeof(struct dinode)); write(fd,"",1); break; } } } /* * Set up for inodes-in-subprocs scheme, and load inodes off disk. 
 * NOTE(review): the region below appears to have suffered text loss in
 *  transit: the fork()==0 child branch, the cg preload loop bounds, and
 *  the icacheuse reset loop are missing; "ni" is passed to howmany()
 *  before ever being assigned, and "r" is read (r*sizeof(struct dinode))
 *  before being set.  Code is left byte-for-byte as found so the damage
 *  is not papered over - restore from pristine sources before building.
*/ static void auxproc_inodes(void) { unsigned int ni; void *blk; int i; int cg; int ix; icachen = (256 / INOPB(oldsb)) * INOPB(oldsb); icache = allocshared(icachen*sizeof(struct dinode)); iperproc = oldsb->fs_ncg * oldsb->fs_ipg; if (iperproc > ((~0U)>>1)/sizeof(struct dinode)) { iperproc = ((~0U)>>1) / sizeof(struct dinode); } while (1) { blk = alloconce(iperproc*sizeof(struct dinode),0,1); if (blk) break; iperproc >>= 1; if (iperproc < 2) { fprintf(stderr,"%s: can't allocate even two inodes of space\n",__progname); exit(1); } } nprocs = howmany(ni,iperproc); procfds = nfmalloc(nprocs*sizeof(*procfds),"inode process pipes"); fprintf(stderr,"%s: note: using %d subprocs for inodes, %d each\n",__progname,nprocs,iperproc); for (i=nprocs-1;i>=0;i--) { pid_t kid; int fds[2]; if (socketpair(AF_LOCAL,SOCK_STREAM,0,&fds[0]) < 0) { fprintf(stderr,"%s: inode proc socketpair: %s\n",__progname,strerror(errno)); exit(1); } kid = fork(); if (kid == 0) { for (i++;ifs_ncg;cg++) { int left; int r; int j; left = oldsb->fs_ipg; while (left > 0) { i = (left > icachen) ? icachen : left; if (i % INOPB(oldsb)) abort(); readat(fsbtodb(oldsb,ino_to_fsba(oldsb,ix)),icache,r*sizeof(struct dinode)); for (j=0;j=0;i--) icacheuse[i] = 0; icacheusecnt = 1; } /* * Load the inodes off disk. Allocates the structures and initializes * them - the inodes from disk, the flags to zero. */ static void loadinodes(void) { unsigned long long int ibytes; int n; ibytes = oldsb->fs_ncg * oldsb->fs_ipg * sizeof(struct dinode); if (ibytes != (unsigned long long int)(size_t)ibytes) { auxproc_inodes(); } else { inodev = alloconce(ibytes,"inodes",1); if ((inodev == 0) && (errno == ENOMEM)) { auxproc_inodes(); } else { own_inodes(); } } n = howmany(oldsb->fs_ncg*oldsb->fs_ipg,8); idirty = alloconce(n,"inode dirty flags",0); bzero(idirty,n); n = howmany(oldsb->fs_ncg*oldsb->fs_ipg/INOPB(oldsb),8); ibdirty = alloconce(n,"inode block dirty flags",0); bzero(ibdirty,n); } /* * Report a filesystem-too-full problem. 
*/ static void toofull(void) { printf("Sorry, would run out of data blocks\n"); exit(1); } /* * Record a desire to move "n" frags from "from" to "to". */ static void mark_move(unsigned int from, unsigned int to, unsigned int n) { for (;n>0;n--) blkmove[from++] = to++; } /* * Evict all data blocks from the given cg, starting at minfrag (based * at the beginning of the cg), for length nfrag. The eviction is * assumed to be entirely data-area; this should not be called with a * range overlapping the metadata structures in the cg. It also * assumes minfrag points into the given cg; it will misbehave if this * is not true. */ static void evict_data(struct cg *cg, unsigned int minfrag, unsigned int nfrag) { int base; /* base of cg (in frags from beginning of fs) */ /* Helper function - evict n frags, starting with start (cg-relative). */ /* The free bitmap is scanned, unallocated frags are ignored, and */ /* each block of consecutive allocated frags is moved as a unit. */ static void fragmove(unsigned int start, unsigned int n) { int i; int run; run = 0; for (i=0;i<=n;i++) { if ((i < n) && bit_is_clr(cg_blksfree(cg,0),start+i)) { run ++; } else { if (run > 0) { int off; off = find_freespace_block(run); if (off < 0) toofull(); mark_move(base+start+i-run,off,run); set_bits(cg_blksfree(cg,0),start+i-run,run); clr_bits(cg_blksfree(cgs[dtog(oldsb,off)],0),dtogd(oldsb,off),run); } run = 0; } } } base = cgbase(oldsb,cg->cg_cgx); /* Does the boundary fall in the middle of a block? To avoid breaking */ /* between frags allocated as consecutive, we always evict the whole */ /* block in this case, though one could argue we should check to see */ /* if the frag before or after the break is unallocated. */ if (minfrag % oldsb->fs_frag) { int n; n = minfrag % oldsb->fs_frag; minfrag -= n; nfrag += n; } /* Do whole blocks. If a block is wholly free, skip it; if wholly */ /* allocated, move it in toto. If neither, call fragmove() to move */ /* the frags to new locations. 
*/ while (nfrag >= oldsb->fs_frag) { if (! blk_is_set(cg_blksfree(cg,0),minfrag,oldsb->fs_frag)) { if (blk_is_clr(cg_blksfree(cg,0),minfrag,oldsb->fs_frag)) { int off; off = find_freeblock(); if (off < 0) toofull(); mark_move(base+minfrag,off,oldsb->fs_frag); set_bits(cg_blksfree(cg,0),minfrag,oldsb->fs_frag); clr_bits(cg_blksfree(cgs[dtog(oldsb,off)],0),dtogd(oldsb,off),oldsb->fs_frag); } else { fragmove(minfrag,oldsb->fs_frag); } } minfrag += oldsb->fs_frag; nfrag -= oldsb->fs_frag; } /* Clean up any sub-block amount left over. */ if (nfrag) { fragmove(minfrag,nfrag); } } /* * Move all data blocks according to blkmove. We have to be careful, * because we may be updating indirect blocks that will themselves be * getting moved, or inode daddr_t arrays that point to indirect * blocks that will be moved. We call this before * update_for_data_move, and update_for_data_move does inodes first, * then indirect blocks in preorder, so as to make sure that the * filesystem is self-consistent at all points, for better crash * tolerance. (We can get away with this only because all the writes * done by perform_data_move() are writing into space that's not used * by the old filesystem.) If we crash, some things may point to the * old data and some to the new, but both copies are the same. The * only wrong things should be csum info and free bitmaps, which fsck * is entirely capable of cleaning up. * * Since blkmove_init() initializes all blocks to move to their current * locations, we can have two blocks marked as wanting to move to the * same location, but only two and only when one of them is the one * that was already there. So if blkmove[i]==i, we ignore that entry * entirely - for unallocated blocks, we don't want it (and may be * putting something else there), and for allocated blocks, we don't * want to copy it anywhere. 
*/
static void perform_data_move(void)
{
    int i;
    int run;
    int maxrun;
    char buf[65536];

    /* Batch consecutive moving frags into one read/write, up to buf's worth. */
    maxrun = sizeof(buf) / newsb->fs_fsize;
    run = 0;
    for (i=0;i<oldsb->fs_size;i++) {
        /* flush the pending run when this frag stays put, the buffer is
           full, or the destination is no longer contiguous */
        if ( (blkmove[i] == i) ||
             (run >= maxrun) ||
             ( (run > 0) && (blkmove[i] != blkmove[i-1]+1) ) ) {
            if (run > 0) {
                /* run frags = run<<fs_fshift bytes (shift was lost in
                   the mangled text; reconstructed) */
                readat(fsbtodb(oldsb,i-run),&buf[0],run<<oldsb->fs_fshift);
                writeat(fsbtodb(oldsb,blkmove[i-run]),&buf[0],run<<oldsb->fs_fshift);
            }
            run = 0;
        }
        if (blkmove[i] != i) run ++;
    }
    /* flush any run still pending at end-of-fs */
    if (run > 0) {
        readat(fsbtodb(oldsb,i-run),&buf[0],run<<oldsb->fs_fshift);
        writeat(fsbtodb(oldsb,blkmove[i-run]),&buf[0],run<<oldsb->fs_fshift);
    }
}

/*
 * This modifies an array of daddr_t, according to blkmove.  This is
 *  used to update inode block arrays and indirect blocks to point to
 *  the new locations of data blocks.
 *
 * Return value is the number of daddr_ts that needed updating; in
 *  particular, the return value is zero iff nothing was modified.
 */
static int movemap_blocks(daddr_t *vec, int n)
{
    int rv;

    rv = 0;
    for (;n>0;n--,vec++) {
        if (blkmove[*vec] != *vec) {
            *vec = blkmove[*vec];
            rv ++;
        }
    }
    return(rv);
}

/*
 * Update all inode data arrays and indirect blocks to point to the new
 *  locations of data blocks.  See the comment header on
 *  perform_data_move for some ordering considerations.
*/ static void update_for_data_move(void) { map_inodes(({ static void foo(struct dinode *di, unsigned int inum) { switch (di->di_mode & IFMT) { case IFLNK: if (di->di_size > oldsb->fs_maxsymlinklen) { case IFDIR: case IFREG: /* don't || these two calls; we need their side-effects */ if (movemap_blocks(&di->di_db[0],NDADDR)) { SETFLAG(idirty,inum); } if (movemap_blocks(&di->di_ib[0],NIADDR)) { SETFLAG(idirty,inum); } } break; } } &foo; }),oldsb->fs_ncg); map_data_blocks(({ static void foo(unsigned int off, UNUSED_ARG(unsigned int nfrag), UNUSED_ARG(unsigned int nbytes), int kind) { if (kind == MDB_INDIR_PRE) { daddr_t blk[howmany(MAXBSIZE,sizeof(daddr_t))]; readat(fsbtodb(oldsb,off),&blk[0],oldsb->fs_bsize); if (movemap_blocks(&blk[0],NINDIR(oldsb))) { writeat(fsbtodb(oldsb,off),&blk[0],oldsb->fs_bsize); } } } &foo; }),oldsb->fs_ncg); } /* * Initialize the inomove array. */ static void inomove_init(void) { int i; inomove = alloconce(oldsb->fs_ipg*oldsb->fs_ncg*sizeof(*inomove),"inomove",0); for (i=(oldsb->fs_ipg*oldsb->fs_ncg)-1;i>=0;i--) inomove[i] = i; } /* * Flush all dirtied inodes to disk. Scans the inode flags array; for * each dirty inode, it sets the BDIRTY bit on the first inode in the * block containing the dirty inode. Then it scans by blocks, and for * each marked block, writes it. */ static void flush_inodes(void) { int i; int ni; ni = newsb->fs_ipg * newsb->fs_ncg; for (i=0;ifs_bsize); } } } /* * Evict all inodes from the specified cg. shrink() already checked * that there were enough free inodes, so the no-free-inodes check is * a can't-happen. If it does trip, the filesystem should be in good * enough shape for fsck to fix; see the comment on perform_data_move * for the considerations in question. 
*/ static void evict_inodes(struct cg *cg) { int inum; int i; int fi; inum = newsb->fs_ipg * cg->cg_cgx; for (i=0;ifs_ipg;i++,inum++) { if (getinode(inum)->di_mode != 0) { fi = find_freeinode(); if (fi < 0) { printf("Sorry, inodes evaporated - filesystem probably needs fsck\n"); exit(1); } inomove[inum] = fi; clr_bits(cg_inosused(cg,0),i,1); set_bits(cg_inosused(cgs[ino_to_cg(newsb,fi)],0),fi%newsb->fs_ipg,1); } } } /* * Move inodes from old locations to new. Does not actually write * anything to disk; just copies in-core and sets dirty bits. * * We have to be careful here for reasons similar to those mentioned in * the comment header on perform_data_move, above: for the sake of * crash tolerance, we want to make sure everything is present at both * old and new locations before we update pointers. So we call this * first, then flush_inodes() to get them out on disk, then update * directories to match. */ static void perform_inode_move(void) { int i; int ni; ni = oldsb->fs_ipg * oldsb->fs_ncg; for (i=0;i 0) { if (inomove[d->d_ino] != d->d_ino) { rv ++; d->d_ino = inomove[d->d_ino]; } nb -= d->d_reclen; buf += d->d_reclen; } return(rv); #undef d } /* * Callback function for map_inode_data_blocks, for updating a * directory to point to new inode locations. */ static void update_dir_data(unsigned int bn, unsigned int size, unsigned int nb, int kind) { if (kind == MDB_DATA) { union { struct direct d; char ch[MAXBSIZE]; } buf; readat(fsbtodb(oldsb,bn),&buf,size<fs_fshift); if (update_dirents((char *)&buf,nb)) { writeat(fsbtodb(oldsb,bn),&buf,size<fs_fshift); } } } /* * Update directory entries to point to new inode locations. */ static void update_for_inode_move(void) { map_inodes(({ static void foo(struct dinode *di, UNUSED_ARG(unsigned int inum)) { switch (di->di_mode & IFMT) { case IFDIR: map_inode_data_blocks(di,update_dir_data); break; } } &foo; }),newsb->fs_ncg); } /* * Shrink the filesystem. 
*/ static void shrink(void) { int i; /* Load the inodes off disk - we'll need 'em. */ loadinodes(); /* Update the timestamp. */ newsb->fs_time = timestamp(); /* Update the size figures. */ newsb->fs_size = dbtofsb(newsb,newsize); newsb->fs_ncyl = (newsb->fs_size * NSPF(newsb)) / newsb->fs_spc; newsb->fs_ncg = howmany(newsb->fs_ncyl,newsb->fs_cpg); /* Does the (new) last cg end before the end of its inode area? See */ /* the similar code in grow() for more on this. */ if (cgdmin(newsb,newsb->fs_ncg-1) > newsb->fs_size) { newsb->fs_ncg --; newsb->fs_ncyl = newsb->fs_ncg * newsb->fs_cpg; newsb->fs_size = (newsb->fs_ncyl * newsb->fs_spc) / NSPF(newsb); printf("Warning: last cylinder group is too small;\n"); printf(" dropping it. New size = %lu.\n",(unsigned long int)fsbtodb(newsb,newsb->fs_size)); } /* Let's make sure we're not being shrunk into oblivion. */ if (newsb->fs_ncg < 1) { printf("Size too small - filesystem would have no cylinders\n"); exit(1); } /* Initialize for block motion. */ blkmove_init(); /* Update csum size, then fix up for the new size */ newsb->fs_cssize = fragroundup(newsb,newsb->fs_ncg*sizeof(struct csum)); csum_fixup(); /* Evict data from any cgs being wholly eliminiated */ for (i=newsb->fs_ncg;ifs_ncg;i++) { int base; int dlow; int dhigh; int dmax; base = cgbase(oldsb,i); dlow = cgsblock(oldsb,i) - base; dhigh = cgdmin(oldsb,i) - base; dmax = oldsb->fs_size - base; if (dmax > cgs[i]->cg_ndblk) dmax = cgs[i]->cg_ndblk; evict_data(cgs[i],0,dlow); evict_data(cgs[i],dhigh,dmax-dhigh); newsb->fs_cstotal.cs_ndir -= cgs[i]->cg_cs.cs_ndir; newsb->fs_cstotal.cs_nifree -= cgs[i]->cg_cs.cs_nifree; newsb->fs_cstotal.cs_nffree -= cgs[i]->cg_cs.cs_nffree; newsb->fs_cstotal.cs_nbfree -= cgs[i]->cg_cs.cs_nbfree; } /* Update the new last cg. */ cgs[newsb->fs_ncg-1]->cg_ndblk = newsb->fs_size - ((newsb->fs_ncg-1) * newsb->fs_fpg); /* Is the new last cg partial? If so, evict any data from the part */ /* being shrunken away. 
*/ if (newsb->fs_size % newsb->fs_fpg) { struct cg *cg; int oldcgsize; int newcgsize; cg = cgs[newsb->fs_ncg-1]; newcgsize = newsb->fs_size % newsb->fs_fpg; oldcgsize = oldsb->fs_size - ((newsb->fs_ncg-1) & oldsb->fs_fpg); if (oldcgsize > oldsb->fs_fpg) oldcgsize = oldsb->fs_fpg; evict_data(cg,newcgsize,oldcgsize-newcgsize); clr_bits(cg_blksfree(cg,0),newcgsize,oldcgsize-newcgsize); } /* Find out whether we would run out of inodes. (Note we haven't */ /* actually done anything to the filesystem yet; all those evict_data */ /* calls just update blkmove.) */ { int slop; slop = 0; for (i=0;ifs_ncg;i++) slop += cgs[i]->cg_cs.cs_nifree; for (;ifs_ncg;i++) slop -= oldsb->fs_ipg - cgs[i]->cg_cs.cs_nifree; if (slop < 0) { printf("Sorry, would run out of inodes\n"); exit(1); } } /* Copy data, then update pointers to data. See the comment header on */ /* perform_data_move for ordering considerations. */ perform_data_move(); update_for_data_move(); /* Now do inodes. Initialize, evict, move, update - see the comment */ /* header on perform_inode_move. */ inomove_init(); for (i=newsb->fs_ncg;ifs_ncg;i++) evict_inodes(cgs[i]); perform_inode_move(); flush_inodes(); update_for_inode_move(); /* Recompute all the bitmaps; most of them probably need it anyway, */ /* the rest are just paranoia and not wanting to have to bother */ /* keeping track of exactly which ones require it. */ for (i=0;ifs_ncg;i++) cgflags[i] |= CGF_DIRTY | CGF_BLKMAPS | CGF_INOMAPS; /* Update the cg_ncyl value for the last cylinder. The condition is */ /* commented out because fsck whines if not - see the similar */ /* condition in grow() for more. */ /* XXX fix once fsck is fixed */ /* if (newsb->fs_ncyl % newsb->fs_cpg) XXX */ cgs[newsb->fs_ncg-1]->cg_ncyl = newsb->fs_ncyl % newsb->fs_cpg; /* Make fs_dsize match the new reality. 
*/ recompute_fs_dsize(); } /* * Recompute the block totals, block cluster summaries, and rotational * position summaries, for a given cg (specified by number), based on * its free-frag bitmap (cg_blksfree()[]). */ static void rescan_blkmaps(int cgn) { struct cg *cg; int f; int b; int blkfree; int blkrun; int fragrun; int fwb; cg = cgs[cgn]; /* Subtract off the current totals from the sb's summary info */ newsb->fs_cstotal.cs_nffree -= cg->cg_cs.cs_nffree; newsb->fs_cstotal.cs_nbfree -= cg->cg_cs.cs_nbfree; /* Clear counters and bitmaps. */ cg->cg_cs.cs_nffree = 0; cg->cg_cs.cs_nbfree = 0; bzero(&cg->cg_frsum[0],MAXFRAG*sizeof(cg->cg_frsum[0])); bzero(&cg_blktot(cg,0)[0],newsb->fs_cpg*sizeof(cg_blktot(cg,0)[0])); bzero(&cg_blks(newsb,cg,0,0)[0],newsb->fs_cpg*newsb->fs_nrpos*sizeof(cg_blks(newsb,cg,0,0)[0])); if (newsb->fs_contigsumsize > 0) { cg->cg_nclusterblks = cg->cg_ndblk / newsb->fs_frag; bzero(&cg_clustersum(cg,0)[1],newsb->fs_contigsumsize*sizeof(cg_clustersum(cg,0)[1])); bzero(&cg_clustersfree(cg,0)[0],howmany((newsb->fs_cpg*newsb->fs_spc)/NSPB(newsb),NBBY)); } /* Scan the free-frag bitmap. Runs of free frags are kept track of */ /* with fragrun, and recorded into cg_frsum[] and cg_cs.cs_nffree; on */ /* each block boundary, entire free blocks are recorded as well. 
*/ blkfree = 1; blkrun = 0; fragrun = 0; f = 0; b = 0; fwb = 0; while (f < cg->cg_ndblk) { if (bit_is_set(cg_blksfree(cg,0),f)) { fragrun ++; } else { blkfree = 0; if (fragrun > 0) { cg->cg_frsum[fragrun] ++; cg->cg_cs.cs_nffree += fragrun; } fragrun = 0; } f ++; fwb ++; if (fwb >= newsb->fs_frag) { if (blkfree) { cg->cg_cs.cs_nbfree ++; if (newsb->fs_contigsumsize > 0) set_bits(cg_clustersfree(cg,0),b,1); cg_blktot(cg,0)[cbtocylno(newsb,f-newsb->fs_frag)] ++; cg_blks(newsb,cg,cbtocylno(newsb,f-newsb->fs_frag),0)[cbtorpos(newsb,f-newsb->fs_frag)] ++; blkrun ++; } else { if (fragrun > 0) { cg->cg_frsum[fragrun] ++; cg->cg_cs.cs_nffree += fragrun; } if (newsb->fs_contigsumsize > 0) { if (blkrun > 0) { cg_clustersum(cg,0)[(blkrun>newsb->fs_contigsumsize)?newsb->fs_contigsumsize:blkrun] ++; } } blkrun = 0; } fwb = 0; b ++; blkfree = 1; fragrun = 0; } } if (fragrun > 0) { cg->cg_frsum[fragrun] ++; cg->cg_cs.cs_nffree += fragrun; } if ((blkrun > 0) && (newsb->fs_contigsumsize > 0)) { cg_clustersum(cg,0)[(blkrun>newsb->fs_contigsumsize)?newsb->fs_contigsumsize:blkrun] ++; } /* Put the updated summary info back into csums, and add it back into */ /* the sb's summary info. Then mark the cg dirty. */ csums[cgn] = cg->cg_cs; newsb->fs_cstotal.cs_nffree += cg->cg_cs.cs_nffree; newsb->fs_cstotal.cs_nbfree += cg->cg_cs.cs_nbfree; cgflags[cgn] |= CGF_DIRTY; } /* * Recompute the cg_inosused()[] bitmap, and the cs_nifree and cs_ndir * values, for a cg, based on the in-core inodes for that cg. 
*/ static void rescan_inomaps(int cgn) { struct cg *cg; int inum; int iwc; cg = cgs[cgn]; newsb->fs_cstotal.cs_ndir -= cg->cg_cs.cs_ndir; newsb->fs_cstotal.cs_nifree -= cg->cg_cs.cs_nifree; cg->cg_cs.cs_ndir = 0; cg->cg_cs.cs_nifree = 0; bzero(&cg_inosused(cg,0)[0],howmany(newsb->fs_ipg,NBBY)); inum = cgn * newsb->fs_ipg; if (cgn == 0) { set_bits(cg_inosused(cg,0),0,2); iwc = 2; inum += 2; } else { iwc = 0; } for (;iwcfs_ipg;iwc++,inum++) { switch (getinode(inum)->di_mode & IFMT) { case 0: cg->cg_cs.cs_nifree ++; break; case IFDIR: cg->cg_cs.cs_ndir ++; /* fall through */ default: set_bits(cg_inosused(cg,0),iwc,1); break; } } csums[cgn] = cg->cg_cs; newsb->fs_cstotal.cs_ndir += cg->cg_cs.cs_ndir; newsb->fs_cstotal.cs_nifree += cg->cg_cs.cs_nifree; cgflags[cgn] |= CGF_DIRTY; } /* * Flush cgs to disk, recomputing anything they're marked as needing. */ static void flush_cgs(void) { int i; for (i=0;ifs_ncg;i++) { if (cgflags[i] & CGF_BLKMAPS) { rescan_blkmaps(i); } if (cgflags[i] & CGF_INOMAPS) { rescan_inomaps(i); } if (cgflags[i] & CGF_DIRTY) { cgs[i]->cg_rotor = 0; cgs[i]->cg_frotor = 0; cgs[i]->cg_irotor = 0; writeat(fsbtodb(newsb,cgtod(newsb,i)),cgs[i],cgblksz); } } writeat(fsbtodb(newsb,newsb->fs_csaddr),csums,newsb->fs_cssize); } /* * Write the superblock, both to the main superblock and to each cg's * alternative superblock. */ static void write_sbs(void) { int i; writeat(SBLOCK,newsb,SBSIZE); for (i=0;ifs_ncg;i++) { writeat(fsbtodb(newsb,cgsblock(newsb,i)),newsb,SBSIZE); } } /* * Set our soft datasize limit to our hard datasize limit. When * working with largeish filesystems we need this. */ static void raisedsize(void) { struct rlimit rl; if (getrlimit(RLIMIT_DATA,&rl) < 0) { fprintf(stderr,"%s: getrlimit(RLIM_DATA): %s\n",__progname,strerror(errno)); exit(1); } rl.rlim_cur = rl.rlim_max; if (setrlimit(RLIMIT_DATA,&rl) < 0) { fprintf(stderr,"%s: setrlimit(RLIM_DATA): %s\n",__progname,strerror(errno)); exit(1); } } /* * main(). 
*/ int main(int, char **); int main(int ac, char **av) { if (ac != 3) { fprintf(stderr,"Usage: %s filesystem new-size\n",__progname); exit(1); } fd = open(av[1],O_RDWR,0); if (fd < 0) { fprintf(stderr,"%s: %s: %s\n",__progname,av[1],strerror(errno)); exit(1); } checksmallio(); raisedsize(); newsize = atoi(av[2]); oldsb = (struct fs *)&sbbuf; newsb = (struct fs *)(SBSIZE+(char *)&sbbuf); readat(SBLOCK,oldsb,SBSIZE); if (oldsb->fs_magic != FS_MAGIC) { fprintf(stderr,"%s: %s: bad magic number\n",__progname,av[1]); exit(1); } oldsb->fs_qbmask = ~(int64_t)oldsb->fs_bmask; oldsb->fs_qfmask = ~(int64_t)oldsb->fs_fmask; if (oldsb->fs_ipg % INOPB(oldsb)) { printf("ipg[%d] %% INOPB[%d] != 0\n",(int)oldsb->fs_ipg,(int)INOPB(oldsb)); exit(1); } /* The superblock is bigger than struct fs (there are trailing tables, */ /* of non-fixed size); make sure we copy the whole thing. SBSIZE may */ /* be an over-estimate, but we do this just once, so being generous */ /* is cheap. */ bcopy(oldsb,newsb,SBSIZE); loadcgs(); if (newsize > fsbtodb(oldsb,oldsb->fs_size)) { grow(); } else if (newsize < fsbtodb(oldsb,oldsb->fs_size)) { shrink(); } flush_cgs(); write_sbs(); exit(0); }