/* * Knit together partial sequences to form a complete (circular) * sequence. * * First line of input lists valid symbols. Every line after that is a * subsequence. This program finds a complete sequence by merging * together the subsequences. * * This finds a sequence that includes all the given subsequences. For * example, if symbols are A B C D E F G H and the input is * * A B C D E F * B C D E F G * D E F G H A * G H A B * * then the resulting sequence (as of this writing) is * * D E F G H A B C D E F G * * The list of valid symbols is taken from the command-line arguments; * it is an error for a symbol to contain whitespace. * * Input lines are taken from stdin and are broken into symbols at * whitespace (which is why symbols must not contain whitespace). */ #include #include #include #include extern const char *__progname; typedef struct seq SEQ; struct seq { SEQ *link; const char *tag; int inx; int len; int *syms; } ; static int nsyms; static char **symnames; static int *symnamelens; static int nseqs; static SEQ *seqs; static int *seqb; static int seqa; static int seql; static unsigned char *usev; static int unused; #define Cisspace(x) isspace((unsigned char)(x)) static void panic(void) __attribute__((__noreturn__)); static void panic(void) { (void)*(volatile char *)0; abort(); } static char *read_line(void) { static char *b = 0; static int a = 0; int l; int c; void addc(unsigned char ch) { if (l >= a) b = realloc(b,a=l+8); b[l++] = ch; } l = 0; while (1) { c = getchar(); if (c == EOF) { if (l > 0) { fprintf(stderr,"%s: input ends with partial line\n",__progname); exit(1); } return(0); } if (c == '\n') { addc('\0'); return(b); } addc(c); } } static const char *skipwhite(const char *s) { while (*s && Cisspace(*s)) s ++; return(s); } static const char *skipnonwhite(const char *s) { while (*s && !Cisspace(*s)) s ++; return(s); } static int lookup_sym(const char *s, int l) { int i; for (i=nsyms-1;i>=0;i--) if ((symnamelens[i] == l) && !bcmp(s,symnames[i],l)) return(i); return(-1); } static void setup_symbols(int ac, char **av) { int i; char *t; int n; if (ac < 3) { fprintf(stderr,"Usage: %s sym sym [sym ...]\n",__progname); exit(1); } nsyms = ac - 1; symnames = malloc(nsyms*sizeof(*symnames)); symnamelens = malloc(nsyms*sizeof(*symnamelens)); for (i=nsyms-1;i>=0;i--) { t = strdup(av[i+1]); for (n=0;t[n];n++) { if (Cisspace(t[n])) { fprintf(stderr,"%s: symbol names must not contain whitespace\n",__progname); exit(1); } } symnames[i] = t; symnamelens[i] = strlen(t); } } static void add_seq(int *syms, int nsyms) { SEQ *s; char *tag; int i; s = malloc(sizeof(SEQ)); asprintf(&tag,"%d",nseqs); s->tag = tag; s->inx = nseqs; s->len = nsyms; s->syms = malloc(nsyms*sizeof(int)); bcopy(syms,s->syms,nsyms*sizeof(int)); s->link = seqs; seqs = s; nseqs ++; printf("add_seq %s:",s->tag); for (i=0;ilen;i++) printf(" %s",symnames[s->syms[i]]); printf("\n"); } static void add_onesym_lines(void) { int i; for (i=nsyms-1;i>=0;i--) add_seq(&i,1); } static void add_line(const char *line) { static int *sb = 0; static int sa = 0; int sl; const char *f0; const char *f1; int f; sl = 0; f1 = line; while (1) { f0 = skipwhite(f1); if (! *f0) break; f1 = skipnonwhite(f0); f = lookup_sym(f0,f1-f0); if (f < 0) { fprintf(stderr,"%s: input line includes unrecognized symbol `%.*s': %s\n",__progname,(int)(f1-f0),f0,line); exit(1); } if (sl >= sa) sb = realloc(sb,(sa=sl+8)*sizeof(*sb)); sb[sl++] = f; } if (sl < 1) { fprintf(stderr,"%s: input line contains no fields\n",__progname); exit(1); } add_seq(sb,sl); } static void consume_input(void) { char *l; while (1) { l = read_line(); if (! l) break; add_line(l); } } static void seqroom(int len) { if (len > seqa) seqb = realloc(seqb,(seqa=len)*sizeof(*seqb)); } static void copytoseq(const int *vec, int to, int n) { if (to+n > seqa) panic(); bcopy(vec,seqb+to,n*sizeof(int)); } static int vec_equal(const int *v1, const int *v2, int n) { for (;n>0;n--,v1++,v2++) if (*v1 != *v2) return(0); return(1); } static void setused(int inx, int u) { if ((inx < 0) || (inx >= nseqs)) panic(); if (usev[inx] && !u) { usev[inx] = 0; unused ++; } else if (u && !usev[inx]) { usev[inx] = 1; unused --; } } static void include_subsumed(void) { SEQ *s; int i; for (s=seqs;s;s=s->link) { if (usev[s->inx]) continue; if (s->len > seql) continue; for (i=seql-s->len;i>=0;i--) { if (vec_equal(s->syms,seqb+i,s->len)) { printf("%s subsumed at offset %d\n",s->tag,i); setused(s->inx,1); break; } } } } static void solve(void) { SEQ *s; SEQ *choice; int o; int overlap; int i; seqb = 0; seqa = 0; seql = 0; usev = malloc(nseqs); unused = 0; for (s=seqs;s;s=s->link) { if ((s->inx < 0) || (s->inx >= nseqs)) panic(); usev[s->inx] = 0; unused ++; } if (unused != nseqs) panic(); choice = 0; for (s=seqs;s;s=s->link) if (!choice || (s->len > choice->len)) choice = s; if (! choice) panic(); printf("starting with %s\n",choice->tag); seqroom(choice->len); copytoseq(choice->syms,0,choice->len); seql = choice->len; setused(choice->inx,1); while (1) { printf("vector %d:",seql); for (i=0;ilink) { if (usev[s->inx]) continue; for (o=s->len-1;o>=0;o--) { if (vec_equal(s->syms,seqb+seql-o,o)) { printf("%s overlaps %d (at %d)\n",s->tag,o,seql-o); if (!choice || (o > overlap)) { choice = s; overlap = o; printf("saving, new best\n"); } break; } } } if (! choice) panic(); printf("chose %s, overlap %d, new %d\n",choice->tag,overlap,choice->len-overlap); seqroom(seql-overlap+choice->len); copytoseq(choice->syms+overlap,seql,choice->len-overlap); seql += choice->len - overlap; setused(choice->inx,1); } } int main(int, char **); int main(int ac, char **av) { setup_symbols(ac,av); seqs = 0; nseqs = 0; add_onesym_lines(); consume_input(); solve(); return(0); }